1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 *
 * Forward declarations for the static vnode-operation implementations
 * wired into the vop_ops tables below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

/*
 * Vnode operations vector for regular HAMMER files and directories.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};

/*
 * Vnode operations vector for special files (character/block devices).
 * Data reads and writes are rejected; attribute and lifecycle operations
 * are shared with the regular-file implementations above.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             vop_stdnoread,
        .vop_write =            vop_stdnowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_close,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

/*
 * Vnode operations vector for FIFOs.  Data operations are delegated to
 * the fifofs layer (via the hammer_vop_fifo* wrappers and fifo_vnoperate
 * default); attribute operations remain HAMMER's.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};

/*
 * Post a kqueue notification on the vnode, but only when there is
 * actually something to report (flags != 0).
 */
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                        struct vnode *dvp, struct ucred *cred,
                        int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
/*
 * NOTE(review): this compiled-out stub omits the "ap" parameter name in
 * its signature yet references "ap" in the body; it would need the name
 * restored before being re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log.
 *       A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it ain't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
         *
         * Only fsync()s issued via the system call honor the relaxed
         * hammer_fsync_mode sysctl; internal callers always get the
         * full flush below.
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily.  The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         *
         * The vnode lock is dropped around hammer_wait_inode() so the
         * flusher is not stalled against us while we wait.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (for the cache safe does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_fstoken;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
336 */ 337 blksize = hammer_blocksize(uio->uio_offset); 338 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 339 ioseqcount = (ap->a_ioflag >> 16); 340 if (seqcount < ioseqcount) 341 seqcount = ioseqcount; 342 343 /* 344 * If reading or writing a huge amount of data we have to break 345 * atomicy and allow the operation to be interrupted by a signal 346 * or it can DOS the machine. 347 */ 348 bigread = (uio->uio_resid > 100 * 1024 * 1024); 349 got_fstoken = 0; 350 351 /* 352 * Access the data typically in HAMMER_BUFSIZE blocks via the 353 * buffer cache, but HAMMER may use a variable block size based 354 * on the offset. 355 * 356 * XXX Temporary hack, delay the start transaction while we remain 357 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 358 * locked-shared. 359 */ 360 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 361 int64_t base_offset; 362 int64_t file_limit; 363 364 blksize = hammer_blocksize(uio->uio_offset); 365 offset = (int)uio->uio_offset & (blksize - 1); 366 base_offset = uio->uio_offset - offset; 367 368 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 369 break; 370 371 /* 372 * MPSAFE 373 */ 374 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 375 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 376 bp->b_flags &= ~B_AGE; 377 error = 0; 378 goto skip; 379 } 380 if (ap->a_ioflag & IO_NRDELAY) { 381 bqrelse(bp); 382 return (EWOULDBLOCK); 383 } 384 385 /* 386 * MPUNSAFE 387 */ 388 if (got_fstoken == 0) { 389 lwkt_gettoken(&hmp->fs_token); 390 got_fstoken = 1; 391 hammer_start_transaction(&trans, ip->hmp); 392 } 393 394 /* 395 * NOTE: A valid bp has already been acquired, but was not 396 * B_CACHE. 397 */ 398 if (hammer_cluster_enable) { 399 /* 400 * Use file_limit to prevent cluster_read() from 401 * creating buffers of the wrong block size past 402 * the demarc. 
403 */ 404 file_limit = ip->ino_data.size; 405 if (base_offset < HAMMER_XDEMARC && 406 file_limit > HAMMER_XDEMARC) { 407 file_limit = HAMMER_XDEMARC; 408 } 409 error = cluster_readx(ap->a_vp, 410 file_limit, base_offset, 411 blksize, uio->uio_resid, 412 seqcount * BKVASIZE, &bp); 413 } else { 414 error = breadnx(ap->a_vp, base_offset, blksize, 415 NULL, NULL, 0, &bp); 416 } 417 if (error) { 418 brelse(bp); 419 break; 420 } 421 skip: 422 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 423 kprintf("doff %016jx read file %016jx@%016jx\n", 424 (intmax_t)bp->b_bio2.bio_offset, 425 (intmax_t)ip->obj_id, 426 (intmax_t)bp->b_loffset); 427 } 428 bp->b_flags &= ~B_IODEBUG; 429 430 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 431 n = blksize - offset; 432 if (n > uio->uio_resid) 433 n = uio->uio_resid; 434 if (n > ip->ino_data.size - uio->uio_offset) 435 n = (int)(ip->ino_data.size - uio->uio_offset); 436 if (got_fstoken) 437 lwkt_reltoken(&hmp->fs_token); 438 439 /* 440 * Set B_AGE, data has a lower priority than meta-data. 441 * 442 * Use a hold/unlock/drop sequence to run the uiomove 443 * with the buffer unlocked, avoiding deadlocks against 444 * read()s on mmap()'d spaces. 445 */ 446 bp->b_flags |= B_AGE; 447 bqhold(bp); 448 bqrelse(bp); 449 error = uiomove((char *)bp->b_data + offset, n, uio); 450 bqdrop(bp); 451 452 if (got_fstoken) 453 lwkt_gettoken(&hmp->fs_token); 454 455 if (error) 456 break; 457 hammer_stats_file_read += n; 458 } 459 460 /* 461 * Try to update the atime with just the inode lock for maximum 462 * concurrency. If we can't shortcut it we have to get the full 463 * blown transaction. 
         */
        if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
                lwkt_gettoken(&hmp->fs_token);
                got_fstoken = 1;
                hammer_start_transaction(&trans, ip->hmp);
        }

        /*
         * Common exit: finish the transaction (updating the atime inside
         * it unless the filesystem is read-only or mounted noatime) and
         * release fs_token if this call acquired it.
         */
        if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        thread_t td;
        struct uio *uio;
        int offset;
        off_t base_offset;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         *       I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                /* offset + resid wrapped past 2^63-1 */
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }

        /*
         * Enforce the process file-size resource limit, signalling the
         * process per POSIX semantics when it is exceeded.
         */
        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Control the number of pending records associated with
                 * this inode.  If too many have accumulated start a
                 * flush.  Try to maintain a pipeline with the flusher.
                 *
                 * NOTE: It is possible for other sources to grow the
                 *       records but not necessarily issue another flush,
                 *       so use a timeout and ensure that a re-flush occurs.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                                ip->flags |= HAMMER_INODE_RECSW;
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        }
                }

#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
#endif

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        /*
                         * Extending the file.  "trivial" means there is
                         * no gap between the current EOF and the start
                         * of this write.
                         */
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }

                /*
                 * Copy the user data in with fs_token temporarily
                 * released so the (possibly faulting) uiomove does not
                 * stall other filesystem operations.
                 */
                if (error == 0) {
                        lwkt_reltoken(&hmp->fs_token);
                        error = uiomove(bp->b_data + offset, n, uio);
                        lwkt_gettoken(&hmp->fs_token);
                }

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO.  This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked.  The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads.  So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING!  blksize is variable.  cluster_write() is
                 *           expected to not blow up if it encounters
                 *           buffers that do not match the passed blksize.
                 *
                 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
                 *        The ip->rsv_recs check should burst-flush the data.
                 *        If we queue it immediately the buf could be left
                 *        locked on the device queue for a very long time.
                 *
                 *        However, failing to flush a dirty buffer out when
                 *        issued from the pageout daemon can result in a low
                 *        memory deadlock against bio_page_alloc(), so we
                 *        have to bawrite() on IO_ASYNC as well.
                 *
                 * NOTE!  To avoid degenerate stalls due to mismatched block
                 *        sizes we only honor IO_DIRECT on the write which
                 *        abuts the end of the buffer.
                 *        However, we must
                 *        honor IO_SYNC in case someone is silly enough to
                 *        configure a HAMMER file as swap, or when HAMMER
                 *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ap->a_ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else {
#if 0
                if (offset + n == blksize) {
                        if (hammer_cluster_enable == 0 ||
                            (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                                bawrite(bp);
                        } else {
                                cluster_write(bp, ip->ino_data.size,
                                              blksize, seqcount);
                        }
                } else {
#endif
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * Check access permissions against the inode's uid/gid/mode/uflags
 * using the generic vop_helper_access().
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * Advisory (posix/flock) locking, delegated to the generic lockf code.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.  On success resolve the namecache entry against the
         * new vnode and post a NOTE_WRITE on the directory; on error
         * just drop our reference on the new inode.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
         * or for MAX_TID is "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                        vap->va_size = 26;
                else
                        vap->va_size = 10;
        }

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;

        /*
         * Report va_bytes rounded up to the block size actually used for
         * the file's size class (big-blocks past the demarc, regular
         * buffers in between, 16-byte granularity for tiny files).
         */
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for a "@@" as-of extension.  NOTE(review): the nc_name[i+1]
	 * peek at the last position assumes nc_name is NUL-terminated --
	 * standard for the namecache, but confirm.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* name length excluding the @@ extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* negative-cache the miss */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.
 * A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != hmp->asof) {
			/*
			 * As-of root: re-lookup the same directory at the
			 * mount's asof and synthesize a fake ".." name.
			 * 19 bytes == "0x" + 16 hex digits + NUL.
			 */
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	/* hard links cannot cross PFS (localization) boundaries */
	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 *
	 * NOTE(review): the knotes below fire even when the link failed,
	 * unlike nmkdir/nmknod which gate on error == 0 -- confirm this
	 * asymmetry is intended.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * Reject write opens of read-only (historical/as-of) inodes, otherwise
 * fall through to the standard open.
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 *
 * Not implemented.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */

static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	/* size the cookie array from the request, capped at 1024 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries ("." and "..")
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less than that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/* ENOENT from the scan means end-of-directory, not failure */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, hmp);
			/* extract the 5-digit PFS id following "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		lwkt_reltoken(&hmp->fs_token);
		return(error);
	}

	/*
	 * Long version: symlink target stored as a separate FIX record.
	 */
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	/*
	 * NOTE(review): final argument 0 here vs 1 in nrmdir -- presumably
	 * a "directory expected" selector; confirm against hammer_dounlink().
	 */
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0,
				0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	hmp = ip->hmp;

	/* renames cannot cross PFS (localization) boundaries */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the purposes
	 * of crash recovery.  On EDEADLK we drop the cursor and restart the
	 * directory scan from scratch.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 *
	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
	 *	 without formally acquiring the vp since the vp might
	 *	 have zero refs on it, or in the middle of a reclaim,
	 *	 etc.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		while (ip->vp) {
			struct vnode *vp;

			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_markatime { vp, cred }
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	hmp = ip->hmp;
	/* noatime mounts: silently skip the update */
	if (hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);

	++hammer_stats_file_iopsw;

	ip->ino_data.atime = trans.time;
	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * while (not if) so 'break' inside the switch re-tests the
	 * condition; once ino_data.size has been set to va_size the
	 * loop terminates.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations is turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size));
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n",
							(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
2369 */ 2370 aligned_size = (vap->va_size + (blksize - 1)) & 2371 ~(int64_t)(blksize - 1); 2372 if (truncating && vap->va_size < aligned_size) { 2373 aligned_size -= blksize; 2374 hammer_ip_frontend_trunc(ip, aligned_size); 2375 } 2376 #endif 2377 break; 2378 case VDATABASE: 2379 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2380 ip->flags |= HAMMER_INODE_TRUNCATED; 2381 ip->trunc_off = vap->va_size; 2382 } else if (ip->trunc_off > vap->va_size) { 2383 ip->trunc_off = vap->va_size; 2384 } 2385 hammer_ip_frontend_trunc(ip, vap->va_size); 2386 ip->ino_data.size = vap->va_size; 2387 ip->ino_data.mtime = trans.time; 2388 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2389 kflags |= NOTE_ATTRIB; 2390 break; 2391 default: 2392 error = EINVAL; 2393 goto done; 2394 } 2395 break; 2396 } 2397 if (vap->va_atime.tv_sec != VNOVAL) { 2398 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2399 modflags |= HAMMER_INODE_ATIME; 2400 kflags |= NOTE_ATTRIB; 2401 } 2402 if (vap->va_mtime.tv_sec != VNOVAL) { 2403 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2404 modflags |= HAMMER_INODE_MTIME; 2405 kflags |= NOTE_ATTRIB; 2406 } 2407 if (vap->va_mode != (mode_t)VNOVAL) { 2408 mode_t cur_mode = ip->ino_data.mode; 2409 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2410 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2411 2412 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2413 cur_uid, cur_gid, &cur_mode); 2414 if (error == 0 && ip->ino_data.mode != cur_mode) { 2415 ip->ino_data.mode = cur_mode; 2416 ip->ino_data.ctime = trans.time; 2417 modflags |= HAMMER_INODE_DDIRTY; 2418 kflags |= NOTE_ATTRIB; 2419 } 2420 } 2421 done: 2422 if (error == 0) 2423 hammer_modify_inode(&trans, ip, modflags); 2424 hammer_done_transaction(&trans); 2425 hammer_knote(ap->a_vp, kflags); 2426 lwkt_reltoken(&hmp->fs_token); 2427 return (error); 2428 } 2429 2430 /* 2431 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2432 */ 2433 static 
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created symlink inode */
	hammer_record_t record;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;
	int bytes;			/* length of the link target */

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 *
	 * Short targets fit inline in the inode's extended data area;
	 * longer ones get a separate HAMMER_RECTYPE_FIX record.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
 *
 * Create or destroy a whiteout entry; delegated to hammer_dounlink()
 * with isdir=-1.
 */
static
int
hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
				ap->a_cred, ap->a_flags, -1);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	return (error);
}

/*
 * hammer_vop_ioctl { vp, command, data, fflag, cred }
 *
 * Thin wrapper dispatching to hammer_ioctl() under the fs token.
 */
static
int
hammer_vop_ioctl(struct vop_ioctl_args *ap)
{
	struct hammer_inode *ip = ap->a_vp->v_data;
	hammer_mount_t hmp = ip->hmp;
	int error;

	++hammer_stats_file_iopsr;
	lwkt_gettoken(&hmp->fs_token);
	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
			     ap->a_fflag, ap->a_cred);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_mountctl
 *
 * Handle mount-control operations.  SET_EXPORT and MOUNTFLAGS are
 * handled specially (the latter appends HAMMER-specific flag strings
 * after the standard ones); everything else falls through to
 * vop_stdmountctl().
 */
static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	/* HAMMER-specific mount flags reported alongside standard ones */
	static const struct mountctl_opt extraopt[] = {
		{ HMNT_NOHISTORY,	"nohistory" },
		{ HMNT_MASTERID,	"master" },
		{ 0, NULL}

	};
	struct hammer_mount *hmp;
	struct mount *mp;
	int usedbytes;
	int error;

	error = 0;
	usedbytes = 0;
	mp = ap->a_head.a_ops->head.vv_mount;
	KKASSERT(mp->mnt_data != NULL);
	hmp = (struct hammer_mount *)mp->mnt_data;

	lwkt_gettoken(&hmp->fs_token);

	switch(ap->a_op) {
	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
		break;
	case MOUNTCTL_MOUNTFLAGS:
	{
		/*
		 * Call standard mountctl VOP function
		 * so we get user mount flags.
		 */
		error = vop_stdmountctl(ap);
		if (error)
			break;

		usedbytes = *ap->a_res;

		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
						    ap->a_buf,
						    ap->a_buflen - usedbytes,
						    &error);
		}

		*ap->a_res += usedbytes;
		break;
	}
	default:
		error = vop_stdmountctl(ap);
		break;
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_strategy { vp, bio }
 *
 * Strategy call, used for regular file read & write only.  Note that the
 * bp may represent a cluster.
 *
 * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regards to buffer alignment
 * or size.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	/* Dispatch on the buffer command; only READ/WRITE are legal here. */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}

	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */

	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;	/* base file offset of the current record */
	int64_t ran_end;	/* file offset just past the request */
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within the buffer */
	int roff;		/* offset into the record's data */
	int n;			/* byte count for the current copy/zero */
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			return (error);
		}
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled or (if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled).
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record verses our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers may assume that
 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 * large, so return EOPNOTSUPP if it is not sufficiently large.
 */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;		/* file offset of current record */
	int64_t ran_end;		/* end of the scan window */
	int64_t tmp64;
	int64_t base_offset;		/* file offset of contiguous run */
	int64_t base_disk_offset;	/* disk offset of contiguous run */
	int64_t last_offset;		/* file offset past the run */
	hammer_off_t last_disk_offset;	/* disk offset past the run */
	hammer_off_t disk_offset;
	int rec_len;
	int error;
	int blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	/* when a backwards run is wanted, start the scan before loffset */
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx: %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s: %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.  Because this is a strategy call the OS is
 * trying to actually get data onto the media.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/* kick the flusher if too many reserved records pile up */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen,
error; 3398 3399 /* 3400 * Calculate the namekey and setup the key range for the scan. This 3401 * works kinda like a chained hash table where the lower 32 bits 3402 * of the namekey synthesize the chain. 3403 * 3404 * The key range is inclusive of both key_beg and key_end. 3405 */ 3406 dip = VTOI(dvp); 3407 ncp = nch->ncp; 3408 hmp = dip->hmp; 3409 3410 if (dip->flags & HAMMER_INODE_RO) 3411 return (EROFS); 3412 3413 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3414 &max_iterations); 3415 retry: 3416 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3417 cursor.key_beg.localization = dip->obj_localization + 3418 hammer_dir_localization(dip); 3419 cursor.key_beg.obj_id = dip->obj_id; 3420 cursor.key_beg.key = namekey; 3421 cursor.key_beg.create_tid = 0; 3422 cursor.key_beg.delete_tid = 0; 3423 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3424 cursor.key_beg.obj_type = 0; 3425 3426 cursor.key_end = cursor.key_beg; 3427 cursor.key_end.key += max_iterations; 3428 cursor.asof = dip->obj_asof; 3429 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3430 3431 /* 3432 * Scan all matching records (the chain), locate the one matching 3433 * the requested path component. info->last_error contains the 3434 * error code on search termination and could be 0, ENOENT, or 3435 * something else. 3436 * 3437 * The hammer_ip_*() functions merge in-memory records with on-disk 3438 * records for the purposes of the search. 3439 */ 3440 error = hammer_ip_first(&cursor); 3441 3442 while (error == 0) { 3443 error = hammer_ip_resolve_data(&cursor); 3444 if (error) 3445 break; 3446 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3447 KKASSERT(nlen > 0); 3448 if (ncp->nc_nlen == nlen && 3449 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3450 break; 3451 } 3452 error = hammer_ip_next(&cursor); 3453 } 3454 3455 /* 3456 * If all is ok we have to get the inode so we can adjust nlinks. 
3457 * To avoid a deadlock with the flusher we must release the inode 3458 * lock on the directory when acquiring the inode for the entry. 3459 * 3460 * If the target is a directory, it must be empty. 3461 */ 3462 if (error == 0) { 3463 hammer_unlock(&cursor.ip->lock); 3464 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3465 hmp->asof, 3466 cursor.data->entry.localization, 3467 0, &error); 3468 hammer_lock_sh(&cursor.ip->lock); 3469 if (error == ENOENT) { 3470 kprintf("HAMMER: WARNING: Removing " 3471 "dirent w/missing inode \"%s\"\n" 3472 "\tobj_id = %016llx\n", 3473 ncp->nc_name, 3474 (long long)cursor.data->entry.obj_id); 3475 error = 0; 3476 } 3477 3478 /* 3479 * If isdir >= 0 we validate that the entry is or is not a 3480 * directory. If isdir < 0 we don't care. 3481 */ 3482 if (error == 0 && isdir >= 0 && ip) { 3483 if (isdir && 3484 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3485 error = ENOTDIR; 3486 } else if (isdir == 0 && 3487 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3488 error = EISDIR; 3489 } 3490 } 3491 3492 /* 3493 * If we are trying to remove a directory the directory must 3494 * be empty. 3495 * 3496 * The check directory code can loop and deadlock/retry. Our 3497 * own cursor's node locks must be released to avoid a 3-way 3498 * deadlock with the flusher if the check directory code 3499 * blocks. 3500 * 3501 * If any changes whatsoever have been made to the cursor 3502 * set EDEADLK and retry. 3503 * 3504 * WARNING: See warnings in hammer_unlock_cursor() 3505 * function. 3506 */ 3507 if (error == 0 && ip && ip->ino_data.obj_type == 3508 HAMMER_OBJTYPE_DIRECTORY) { 3509 hammer_unlock_cursor(&cursor); 3510 error = hammer_ip_check_directory_empty(trans, ip); 3511 hammer_lock_cursor(&cursor); 3512 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3513 kprintf("HAMMER: Warning: avoided deadlock " 3514 "on rmdir '%s'\n", 3515 ncp->nc_name); 3516 error = EDEADLK; 3517 } 3518 } 3519 3520 /* 3521 * Delete the directory entry. 
3522 * 3523 * WARNING: hammer_ip_del_directory() may have to terminate 3524 * the cursor to avoid a deadlock. It is ok to call 3525 * hammer_done_cursor() twice. 3526 */ 3527 if (error == 0) { 3528 error = hammer_ip_del_directory(trans, &cursor, 3529 dip, ip); 3530 } 3531 hammer_done_cursor(&cursor); 3532 if (error == 0) { 3533 cache_setunresolved(nch); 3534 cache_setvp(nch, NULL); 3535 3536 /* 3537 * NOTE: ip->vp, if non-NULL, cannot be directly 3538 * referenced without formally acquiring the 3539 * vp since the vp might have zero refs on it, 3540 * or in the middle of a reclaim, etc. 3541 * 3542 * NOTE: The cache_setunresolved() can rip the vp 3543 * out from under us since the vp may not have 3544 * any refs, in which case ip->vp will be NULL 3545 * from the outset. 3546 */ 3547 while (ip && ip->vp) { 3548 struct vnode *vp; 3549 3550 error = hammer_get_vnode(ip, &vp); 3551 if (error == 0 && vp) { 3552 vn_unlock(vp); 3553 hammer_knote(ip->vp, NOTE_DELETE); 3554 cache_inval_vp(ip->vp, CINV_DESTROY); 3555 vrele(vp); 3556 break; 3557 } 3558 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3559 } 3560 } 3561 if (ip) 3562 hammer_rel_inode(ip, 0); 3563 } else { 3564 hammer_done_cursor(&cursor); 3565 } 3566 if (error == EDEADLK) 3567 goto retry; 3568 3569 return (error); 3570 } 3571 3572 /************************************************************************ 3573 * FIFO AND SPECFS OPS * 3574 ************************************************************************ 3575 * 3576 */ 3577 static int 3578 hammer_vop_fifoclose (struct vop_close_args *ap) 3579 { 3580 /* XXX update itimes */ 3581 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3582 } 3583 3584 static int 3585 hammer_vop_fiforead (struct vop_read_args *ap) 3586 { 3587 int error; 3588 3589 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3590 /* XXX update access time */ 3591 return (error); 3592 } 3593 3594 static int 3595 hammer_vop_fifowrite (struct vop_write_args *ap) 3596 { 3597 int error; 3598 3599 error = 
VOCALL(&fifo_vnode_vops, &ap->a_head); 3600 /* XXX update access time */ 3601 return (error); 3602 } 3603 3604 static 3605 int 3606 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3607 { 3608 int error; 3609 3610 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3611 if (error) 3612 error = hammer_vop_kqfilter(ap); 3613 return(error); 3614 } 3615 3616 /************************************************************************ 3617 * KQFILTER OPS * 3618 ************************************************************************ 3619 * 3620 */ 3621 static void filt_hammerdetach(struct knote *kn); 3622 static int filt_hammerread(struct knote *kn, long hint); 3623 static int filt_hammerwrite(struct knote *kn, long hint); 3624 static int filt_hammervnode(struct knote *kn, long hint); 3625 3626 static struct filterops hammerread_filtops = 3627 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3628 static struct filterops hammerwrite_filtops = 3629 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3630 static struct filterops hammervnode_filtops = 3631 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3632 3633 static 3634 int 3635 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3636 { 3637 struct vnode *vp = ap->a_vp; 3638 struct knote *kn = ap->a_kn; 3639 3640 switch (kn->kn_filter) { 3641 case EVFILT_READ: 3642 kn->kn_fop = &hammerread_filtops; 3643 break; 3644 case EVFILT_WRITE: 3645 kn->kn_fop = &hammerwrite_filtops; 3646 break; 3647 case EVFILT_VNODE: 3648 kn->kn_fop = &hammervnode_filtops; 3649 break; 3650 default: 3651 return (EOPNOTSUPP); 3652 } 3653 3654 kn->kn_hook = (caddr_t)vp; 3655 3656 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3657 3658 return(0); 3659 } 3660 3661 static void 3662 filt_hammerdetach(struct knote *kn) 3663 { 3664 struct vnode *vp = (void *)kn->kn_hook; 3665 3666 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3667 } 3668 3669 static int 3670 filt_hammerread(struct knote *kn, long hint) 3671 
{ 3672 struct vnode *vp = (void *)kn->kn_hook; 3673 hammer_inode_t ip = VTOI(vp); 3674 hammer_mount_t hmp = ip->hmp; 3675 off_t off; 3676 3677 if (hint == NOTE_REVOKE) { 3678 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3679 return(1); 3680 } 3681 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3682 off = ip->ino_data.size - kn->kn_fp->f_offset; 3683 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3684 lwkt_reltoken(&hmp->fs_token); 3685 if (kn->kn_sfflags & NOTE_OLDAPI) 3686 return(1); 3687 return (kn->kn_data != 0); 3688 } 3689 3690 static int 3691 filt_hammerwrite(struct knote *kn, long hint) 3692 { 3693 if (hint == NOTE_REVOKE) 3694 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3695 kn->kn_data = 0; 3696 return (1); 3697 } 3698 3699 static int 3700 filt_hammervnode(struct knote *kn, long hint) 3701 { 3702 if (kn->kn_sfflags & hint) 3703 kn->kn_fflags |= hint; 3704 if (hint == NOTE_REVOKE) { 3705 kn->kn_flags |= (EV_EOF | EV_NODATA); 3706 return (1); 3707 } 3708 return (kn->kn_fflags != 0); 3709 } 3710 3711