1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vm/swap_pager.h> 50 #include <vfs/fifofs/fifo.h> 51 52 #include "hammer.h" 53 54 /* 55 * USERFS VNOPS 56 */ 57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 58 static int hammer_vop_fsync(struct vop_fsync_args *); 59 static int hammer_vop_read(struct vop_read_args *); 60 static int hammer_vop_write(struct vop_write_args *); 61 static int hammer_vop_access(struct vop_access_args *); 62 static int hammer_vop_advlock(struct vop_advlock_args *); 63 static int hammer_vop_close(struct vop_close_args *); 64 static int hammer_vop_ncreate(struct vop_ncreate_args *); 65 static int hammer_vop_getattr(struct vop_getattr_args *); 66 static int hammer_vop_nresolve(struct vop_nresolve_args *); 67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 68 static int hammer_vop_nlink(struct vop_nlink_args *); 69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 70 static int hammer_vop_nmknod(struct vop_nmknod_args *); 71 static int hammer_vop_open(struct vop_open_args *); 72 static int hammer_vop_print(struct vop_print_args *); 73 static int hammer_vop_readdir(struct vop_readdir_args *); 74 static int hammer_vop_readlink(struct vop_readlink_args *); 75 static int hammer_vop_nremove(struct vop_nremove_args *); 76 static int hammer_vop_nrename(struct vop_nrename_args *); 77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 78 static int hammer_vop_markatime(struct vop_markatime_args *); 79 static int hammer_vop_setattr(struct vop_setattr_args *); 80 static int hammer_vop_strategy(struct vop_strategy_args *); 
81 static int hammer_vop_bmap(struct vop_bmap_args *ap); 82 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 83 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 84 static int hammer_vop_ioctl(struct vop_ioctl_args *); 85 static int hammer_vop_mountctl(struct vop_mountctl_args *); 86 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 87 88 static int hammer_vop_fifoclose (struct vop_close_args *); 89 static int hammer_vop_fiforead (struct vop_read_args *); 90 static int hammer_vop_fifowrite (struct vop_write_args *); 91 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 92 93 struct vop_ops hammer_vnode_vops = { 94 .vop_default = vop_defaultop, 95 .vop_fsync = hammer_vop_fsync, 96 .vop_getpages = vop_stdgetpages, 97 .vop_putpages = vop_stdputpages, 98 .vop_read = hammer_vop_read, 99 .vop_write = hammer_vop_write, 100 .vop_access = hammer_vop_access, 101 .vop_advlock = hammer_vop_advlock, 102 .vop_close = hammer_vop_close, 103 .vop_ncreate = hammer_vop_ncreate, 104 .vop_getattr = hammer_vop_getattr, 105 .vop_inactive = hammer_vop_inactive, 106 .vop_reclaim = hammer_vop_reclaim, 107 .vop_nresolve = hammer_vop_nresolve, 108 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 109 .vop_nlink = hammer_vop_nlink, 110 .vop_nmkdir = hammer_vop_nmkdir, 111 .vop_nmknod = hammer_vop_nmknod, 112 .vop_open = hammer_vop_open, 113 .vop_pathconf = vop_stdpathconf, 114 .vop_print = hammer_vop_print, 115 .vop_readdir = hammer_vop_readdir, 116 .vop_readlink = hammer_vop_readlink, 117 .vop_nremove = hammer_vop_nremove, 118 .vop_nrename = hammer_vop_nrename, 119 .vop_nrmdir = hammer_vop_nrmdir, 120 .vop_markatime = hammer_vop_markatime, 121 .vop_setattr = hammer_vop_setattr, 122 .vop_bmap = hammer_vop_bmap, 123 .vop_strategy = hammer_vop_strategy, 124 .vop_nsymlink = hammer_vop_nsymlink, 125 .vop_nwhiteout = hammer_vop_nwhiteout, 126 .vop_ioctl = hammer_vop_ioctl, 127 .vop_mountctl = hammer_vop_mountctl, 128 .vop_kqfilter = hammer_vop_kqfilter 
129 }; 130 131 struct vop_ops hammer_spec_vops = { 132 .vop_default = vop_defaultop, 133 .vop_fsync = hammer_vop_fsync, 134 .vop_read = vop_stdnoread, 135 .vop_write = vop_stdnowrite, 136 .vop_access = hammer_vop_access, 137 .vop_close = hammer_vop_close, 138 .vop_markatime = hammer_vop_markatime, 139 .vop_getattr = hammer_vop_getattr, 140 .vop_inactive = hammer_vop_inactive, 141 .vop_reclaim = hammer_vop_reclaim, 142 .vop_setattr = hammer_vop_setattr 143 }; 144 145 struct vop_ops hammer_fifo_vops = { 146 .vop_default = fifo_vnoperate, 147 .vop_fsync = hammer_vop_fsync, 148 .vop_read = hammer_vop_fiforead, 149 .vop_write = hammer_vop_fifowrite, 150 .vop_access = hammer_vop_access, 151 .vop_close = hammer_vop_fifoclose, 152 .vop_markatime = hammer_vop_markatime, 153 .vop_getattr = hammer_vop_getattr, 154 .vop_inactive = hammer_vop_inactive, 155 .vop_reclaim = hammer_vop_reclaim, 156 .vop_setattr = hammer_vop_setattr, 157 .vop_kqfilter = hammer_vop_fifokqfilter 158 }; 159 160 static __inline 161 void 162 hammer_knote(struct vnode *vp, int flags) 163 { 164 if (flags) 165 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 166 } 167 168 #ifdef DEBUG_TRUNCATE 169 struct hammer_inode *HammerTruncIp; 170 #endif 171 172 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 173 struct vnode *dvp, struct ucred *cred, 174 int flags, int isdir); 175 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 176 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 177 178 #if 0 179 static 180 int 181 hammer_vop_vnoperate(struct vop_generic_args *) 182 { 183 return (VOCALL(&hammer_vnode_vops, ap)); 184 } 185 #endif 186 187 /* 188 * hammer_vop_fsync { vp, waitfor } 189 * 190 * fsync() an inode to disk and wait for it to be completely committed 191 * such that the information would not be undone if a crash occured after 192 * return. 193 * 194 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 195 * a REDO log. 
A sysctl is provided to relax HAMMER's fsync() 196 * operation. 197 * 198 * Ultimately the combination of a REDO log and use of fast storage 199 * to front-end cluster caches will make fsync fast, but it aint 200 * here yet. And, in anycase, we need real transactional 201 * all-or-nothing features which are not restricted to a single file. 202 */ 203 static 204 int 205 hammer_vop_fsync(struct vop_fsync_args *ap) 206 { 207 hammer_inode_t ip = VTOI(ap->a_vp); 208 hammer_mount_t hmp = ip->hmp; 209 int waitfor = ap->a_waitfor; 210 int mode; 211 212 lwkt_gettoken(&hmp->fs_token); 213 214 /* 215 * Fsync rule relaxation (default is either full synchronous flush 216 * or REDO semantics with synchronous flush). 217 */ 218 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 219 switch(hammer_fsync_mode) { 220 case 0: 221 mode0: 222 /* no REDO, full synchronous flush */ 223 goto skip; 224 case 1: 225 mode1: 226 /* no REDO, full asynchronous flush */ 227 if (waitfor == MNT_WAIT) 228 waitfor = MNT_NOWAIT; 229 goto skip; 230 case 2: 231 /* REDO semantics, synchronous flush */ 232 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 233 goto mode0; 234 mode = HAMMER_FLUSH_UNDOS_AUTO; 235 break; 236 case 3: 237 /* REDO semantics, relaxed asynchronous flush */ 238 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 239 goto mode1; 240 mode = HAMMER_FLUSH_UNDOS_RELAXED; 241 if (waitfor == MNT_WAIT) 242 waitfor = MNT_NOWAIT; 243 break; 244 case 4: 245 /* ignore the fsync() system call */ 246 lwkt_reltoken(&hmp->fs_token); 247 return(0); 248 default: 249 /* we have to do something */ 250 mode = HAMMER_FLUSH_UNDOS_RELAXED; 251 if (waitfor == MNT_WAIT) 252 waitfor = MNT_NOWAIT; 253 break; 254 } 255 256 /* 257 * Fast fsync only needs to flush the UNDO/REDO fifo if 258 * HAMMER_INODE_REDO is non-zero and the only modifications 259 * made to the file are write or write-extends. 
260 */ 261 if ((ip->flags & HAMMER_INODE_REDO) && 262 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 263 ) { 264 ++hammer_count_fsyncs; 265 hammer_flusher_flush_undos(hmp, mode); 266 ip->redo_count = 0; 267 lwkt_reltoken(&hmp->fs_token); 268 return(0); 269 } 270 271 /* 272 * REDO is enabled by fsync(), the idea being we really only 273 * want to lay down REDO records when programs are using 274 * fsync() heavily. The first fsync() on the file starts 275 * the gravy train going and later fsync()s keep it hot by 276 * resetting the redo_count. 277 * 278 * We weren't running REDOs before now so we have to fall 279 * through and do a full fsync of what we have. 280 */ 281 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 282 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 283 ip->flags |= HAMMER_INODE_REDO; 284 ip->redo_count = 0; 285 } 286 } 287 skip: 288 289 /* 290 * Do a full flush sequence. 291 * 292 * Attempt to release the vnode while waiting for the inode to 293 * finish flushing. This can really mess up inactive->reclaim 294 * sequences so only do it if the vnode is active. 
295 */ 296 ++hammer_count_fsyncs; 297 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 298 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 299 if (waitfor == MNT_WAIT) { 300 if ((ap->a_vp->v_flag & VINACTIVE) == 0) 301 vn_unlock(ap->a_vp); 302 hammer_wait_inode(ip); 303 if ((ap->a_vp->v_flag & VINACTIVE) == 0) 304 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 305 } 306 lwkt_reltoken(&hmp->fs_token); 307 return (ip->error); 308 } 309 310 /* 311 * hammer_vop_read { vp, uio, ioflag, cred } 312 * 313 * MPSAFE (for the cache safe does not require fs_token) 314 */ 315 static 316 int 317 hammer_vop_read(struct vop_read_args *ap) 318 { 319 struct hammer_transaction trans; 320 hammer_inode_t ip; 321 hammer_mount_t hmp; 322 off_t offset; 323 struct buf *bp; 324 struct uio *uio; 325 int error; 326 int n; 327 int seqcount; 328 int ioseqcount; 329 int blksize; 330 int bigread; 331 int got_fstoken; 332 333 if (ap->a_vp->v_type != VREG) 334 return (EINVAL); 335 ip = VTOI(ap->a_vp); 336 hmp = ip->hmp; 337 error = 0; 338 uio = ap->a_uio; 339 340 /* 341 * Allow the UIO's size to override the sequential heuristic. 342 */ 343 blksize = hammer_blocksize(uio->uio_offset); 344 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 345 ioseqcount = (ap->a_ioflag >> 16); 346 if (seqcount < ioseqcount) 347 seqcount = ioseqcount; 348 349 /* 350 * If reading or writing a huge amount of data we have to break 351 * atomicy and allow the operation to be interrupted by a signal 352 * or it can DOS the machine. 353 */ 354 bigread = (uio->uio_resid > 100 * 1024 * 1024); 355 got_fstoken = 0; 356 357 /* 358 * Access the data typically in HAMMER_BUFSIZE blocks via the 359 * buffer cache, but HAMMER may use a variable block size based 360 * on the offset. 361 * 362 * XXX Temporary hack, delay the start transaction while we remain 363 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 364 * locked-shared. 
365 */ 366 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 367 int64_t base_offset; 368 int64_t file_limit; 369 370 blksize = hammer_blocksize(uio->uio_offset); 371 offset = (int)uio->uio_offset & (blksize - 1); 372 base_offset = uio->uio_offset - offset; 373 374 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 375 break; 376 377 /* 378 * MPSAFE 379 */ 380 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0); 381 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 382 bp->b_flags &= ~B_AGE; 383 error = 0; 384 goto skip; 385 } 386 if (ap->a_ioflag & IO_NRDELAY) { 387 bqrelse(bp); 388 return (EWOULDBLOCK); 389 } 390 391 /* 392 * MPUNSAFE 393 */ 394 if (got_fstoken == 0) { 395 lwkt_gettoken(&hmp->fs_token); 396 got_fstoken = 1; 397 hammer_start_transaction(&trans, ip->hmp); 398 } 399 400 /* 401 * NOTE: A valid bp has already been acquired, but was not 402 * B_CACHE. 403 */ 404 if (hammer_cluster_enable) { 405 /* 406 * Use file_limit to prevent cluster_read() from 407 * creating buffers of the wrong block size past 408 * the demarc. 
409 */ 410 file_limit = ip->ino_data.size; 411 if (base_offset < HAMMER_XDEMARC && 412 file_limit > HAMMER_XDEMARC) { 413 file_limit = HAMMER_XDEMARC; 414 } 415 error = cluster_readx(ap->a_vp, 416 file_limit, base_offset, 417 blksize, uio->uio_resid, 418 seqcount * BKVASIZE, &bp); 419 } else { 420 error = breadnx(ap->a_vp, base_offset, blksize, 421 NULL, NULL, 0, &bp); 422 } 423 if (error) { 424 brelse(bp); 425 break; 426 } 427 skip: 428 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 429 kprintf("doff %016jx read file %016jx@%016jx\n", 430 (intmax_t)bp->b_bio2.bio_offset, 431 (intmax_t)ip->obj_id, 432 (intmax_t)bp->b_loffset); 433 } 434 bp->b_flags &= ~B_IODEBUG; 435 if (blksize == HAMMER_XBUFSIZE) 436 bp->b_flags |= B_CLUSTEROK; 437 438 n = blksize - offset; 439 if (n > uio->uio_resid) 440 n = uio->uio_resid; 441 if (n > ip->ino_data.size - uio->uio_offset) 442 n = (int)(ip->ino_data.size - uio->uio_offset); 443 if (got_fstoken) 444 lwkt_reltoken(&hmp->fs_token); 445 446 /* 447 * Set B_AGE, data has a lower priority than meta-data. 448 * 449 * Use a hold/unlock/drop sequence to run the uiomove 450 * with the buffer unlocked, avoiding deadlocks against 451 * read()s on mmap()'d spaces. 452 */ 453 bp->b_flags |= B_AGE; 454 bqhold(bp); 455 bqrelse(bp); 456 error = uiomove((char *)bp->b_data + offset, n, uio); 457 bqdrop(bp); 458 459 if (got_fstoken) 460 lwkt_gettoken(&hmp->fs_token); 461 462 if (error) 463 break; 464 hammer_stats_file_read += n; 465 } 466 467 /* 468 * Try to update the atime with just the inode lock for maximum 469 * concurrency. If we can't shortcut it we have to get the full 470 * blown transaction. 
471 */ 472 if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) { 473 lwkt_gettoken(&hmp->fs_token); 474 got_fstoken = 1; 475 hammer_start_transaction(&trans, ip->hmp); 476 } 477 478 if (got_fstoken) { 479 if ((ip->flags & HAMMER_INODE_RO) == 0 && 480 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 481 ip->ino_data.atime = trans.time; 482 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 483 } 484 hammer_done_transaction(&trans); 485 lwkt_reltoken(&hmp->fs_token); 486 } 487 return (error); 488 } 489 490 /* 491 * hammer_vop_write { vp, uio, ioflag, cred } 492 */ 493 static 494 int 495 hammer_vop_write(struct vop_write_args *ap) 496 { 497 struct hammer_transaction trans; 498 struct hammer_inode *ip; 499 hammer_mount_t hmp; 500 thread_t td; 501 struct uio *uio; 502 int offset; 503 off_t base_offset; 504 int64_t cluster_eof; 505 struct buf *bp; 506 int kflags; 507 int error; 508 int n; 509 int flags; 510 int seqcount; 511 int bigwrite; 512 513 if (ap->a_vp->v_type != VREG) 514 return (EINVAL); 515 ip = VTOI(ap->a_vp); 516 hmp = ip->hmp; 517 error = 0; 518 kflags = 0; 519 seqcount = ap->a_ioflag >> 16; 520 521 if (ip->flags & HAMMER_INODE_RO) 522 return (EROFS); 523 524 /* 525 * Create a transaction to cover the operations we perform. 526 */ 527 lwkt_gettoken(&hmp->fs_token); 528 hammer_start_transaction(&trans, hmp); 529 uio = ap->a_uio; 530 531 /* 532 * Check append mode 533 */ 534 if (ap->a_ioflag & IO_APPEND) 535 uio->uio_offset = ip->ino_data.size; 536 537 /* 538 * Check for illegal write offsets. Valid range is 0...2^63-1. 539 * 540 * NOTE: the base_off assignment is required to work around what 541 * I consider to be a GCC-4 optimization bug. 
542 */ 543 if (uio->uio_offset < 0) { 544 hammer_done_transaction(&trans); 545 lwkt_reltoken(&hmp->fs_token); 546 return (EFBIG); 547 } 548 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 549 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 550 hammer_done_transaction(&trans); 551 lwkt_reltoken(&hmp->fs_token); 552 return (EFBIG); 553 } 554 555 if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc && 556 base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 557 hammer_done_transaction(&trans); 558 lwkt_reltoken(&hmp->fs_token); 559 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); 560 return (EFBIG); 561 } 562 563 /* 564 * If reading or writing a huge amount of data we have to break 565 * atomicy and allow the operation to be interrupted by a signal 566 * or it can DOS the machine. 567 * 568 * Preset redo_count so we stop generating REDOs earlier if the 569 * limit is exceeded. 570 */ 571 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 572 if ((ip->flags & HAMMER_INODE_REDO) && 573 ip->redo_count < hammer_limit_redo) { 574 ip->redo_count += uio->uio_resid; 575 } 576 577 /* 578 * Access the data typically in HAMMER_BUFSIZE blocks via the 579 * buffer cache, but HAMMER may use a variable block size based 580 * on the offset. 581 */ 582 while (uio->uio_resid > 0) { 583 int fixsize = 0; 584 int blksize; 585 int blkmask; 586 int trivial; 587 int endofblk; 588 off_t nsize; 589 590 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 591 break; 592 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 593 break; 594 595 blksize = hammer_blocksize(uio->uio_offset); 596 597 /* 598 * Do not allow HAMMER to blow out the buffer cache. Very 599 * large UIOs can lockout other processes due to bwillwrite() 600 * mechanics. 601 * 602 * The hammer inode is not locked during these operations. 
603 * The vnode is locked which can interfere with the pageout 604 * daemon for non-UIO_NOCOPY writes but should not interfere 605 * with the buffer cache. Even so, we cannot afford to 606 * allow the pageout daemon to build up too many dirty buffer 607 * cache buffers. 608 * 609 * Only call this if we aren't being recursively called from 610 * a virtual disk device (vn), else we may deadlock. 611 */ 612 if ((ap->a_ioflag & IO_RECURSE) == 0) 613 bwillwrite(blksize); 614 615 /* 616 * Control the number of pending records associated with 617 * this inode. If too many have accumulated start a 618 * flush. Try to maintain a pipeline with the flusher. 619 * 620 * NOTE: It is possible for other sources to grow the 621 * records but not necessarily issue another flush, 622 * so use a timeout and ensure that a re-flush occurs. 623 */ 624 if (ip->rsv_recs >= hammer_limit_inode_recs) { 625 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 626 while (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 627 ip->flags |= HAMMER_INODE_RECSW; 628 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 629 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 630 } 631 } 632 633 #if 0 634 /* 635 * Do not allow HAMMER to blow out system memory by 636 * accumulating too many records. Records are so well 637 * decoupled from the buffer cache that it is possible 638 * for userland to push data out to the media via 639 * direct-write, but build up the records queued to the 640 * backend faster then the backend can flush them out. 641 * HAMMER has hit its write limit but the frontend has 642 * no pushback to slow it down. 643 */ 644 if (hmp->rsv_recs > hammer_limit_recs / 2) { 645 /* 646 * Get the inode on the flush list 647 */ 648 if (ip->rsv_recs >= 64) 649 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 650 else if (ip->rsv_recs >= 16) 651 hammer_flush_inode(ip, 0); 652 653 /* 654 * Keep the flusher going if the system keeps 655 * queueing records. 
656 */ 657 delta = hmp->count_newrecords - 658 hmp->last_newrecords; 659 if (delta < 0 || delta > hammer_limit_recs / 2) { 660 hmp->last_newrecords = hmp->count_newrecords; 661 hammer_sync_hmp(hmp, MNT_NOWAIT); 662 } 663 664 /* 665 * If we have gotten behind start slowing 666 * down the writers. 667 */ 668 delta = (hmp->rsv_recs - hammer_limit_recs) * 669 hz / hammer_limit_recs; 670 if (delta > 0) 671 tsleep(&trans, 0, "hmrslo", delta); 672 } 673 #endif 674 675 /* 676 * Calculate the blocksize at the current offset and figure 677 * out how much we can actually write. 678 */ 679 blkmask = blksize - 1; 680 offset = (int)uio->uio_offset & blkmask; 681 base_offset = uio->uio_offset & ~(int64_t)blkmask; 682 n = blksize - offset; 683 if (n > uio->uio_resid) { 684 n = uio->uio_resid; 685 endofblk = 0; 686 } else { 687 endofblk = 1; 688 } 689 nsize = uio->uio_offset + n; 690 if (nsize > ip->ino_data.size) { 691 if (uio->uio_offset > ip->ino_data.size) 692 trivial = 0; 693 else 694 trivial = 1; 695 nvextendbuf(ap->a_vp, 696 ip->ino_data.size, 697 nsize, 698 hammer_blocksize(ip->ino_data.size), 699 hammer_blocksize(nsize), 700 hammer_blockoff(ip->ino_data.size), 701 hammer_blockoff(nsize), 702 trivial); 703 fixsize = 1; 704 kflags |= NOTE_EXTEND; 705 } 706 707 if (uio->uio_segflg == UIO_NOCOPY) { 708 /* 709 * Issuing a write with the same data backing the 710 * buffer. Instantiate the buffer to collect the 711 * backing vm pages, then read-in any missing bits. 712 * 713 * This case is used by vop_stdputpages(). 714 */ 715 bp = getblk(ap->a_vp, base_offset, 716 blksize, GETBLK_BHEAVY, 0); 717 if ((bp->b_flags & B_CACHE) == 0) { 718 bqrelse(bp); 719 error = bread(ap->a_vp, base_offset, 720 blksize, &bp); 721 } 722 } else if (offset == 0 && uio->uio_resid >= blksize) { 723 /* 724 * Even though we are entirely overwriting the buffer 725 * we may still have to zero it out to avoid a 726 * mmap/write visibility issue. 
727 */ 728 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 729 if ((bp->b_flags & B_CACHE) == 0) 730 vfs_bio_clrbuf(bp); 731 } else if (base_offset >= ip->ino_data.size) { 732 /* 733 * If the base offset of the buffer is beyond the 734 * file EOF, we don't have to issue a read. 735 */ 736 bp = getblk(ap->a_vp, base_offset, 737 blksize, GETBLK_BHEAVY, 0); 738 vfs_bio_clrbuf(bp); 739 } else { 740 /* 741 * Partial overwrite, read in any missing bits then 742 * replace the portion being written. 743 */ 744 error = bread(ap->a_vp, base_offset, blksize, &bp); 745 if (error == 0) 746 bheavy(bp); 747 } 748 if (error == 0) { 749 lwkt_reltoken(&hmp->fs_token); 750 error = uiomove(bp->b_data + offset, n, uio); 751 lwkt_gettoken(&hmp->fs_token); 752 } 753 754 /* 755 * Generate REDO records if enabled and redo_count will not 756 * exceeded the limit. 757 * 758 * If redo_count exceeds the limit we stop generating records 759 * and clear HAMMER_INODE_REDO. This will cause the next 760 * fsync() to do a full meta-data sync instead of just an 761 * UNDO/REDO fifo update. 762 * 763 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 764 * will still be tracked. The tracks will be terminated 765 * when the related meta-data (including possible data 766 * modifications which are not tracked via REDO) is 767 * flushed. 768 */ 769 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 770 if (ip->redo_count < hammer_limit_redo) { 771 bp->b_flags |= B_VFSFLAG1; 772 error = hammer_generate_redo(&trans, ip, 773 base_offset + offset, 774 HAMMER_REDO_WRITE, 775 bp->b_data + offset, 776 (size_t)n); 777 } else { 778 ip->flags &= ~HAMMER_INODE_REDO; 779 } 780 } 781 782 /* 783 * If we screwed up we have to undo any VM size changes we 784 * made. 
785 */ 786 if (error) { 787 brelse(bp); 788 if (fixsize) { 789 nvtruncbuf(ap->a_vp, ip->ino_data.size, 790 hammer_blocksize(ip->ino_data.size), 791 hammer_blockoff(ip->ino_data.size), 792 0); 793 } 794 break; 795 } 796 kflags |= NOTE_WRITE; 797 hammer_stats_file_write += n; 798 if (blksize == HAMMER_XBUFSIZE) 799 bp->b_flags |= B_CLUSTEROK; 800 if (ip->ino_data.size < uio->uio_offset) { 801 ip->ino_data.size = uio->uio_offset; 802 flags = HAMMER_INODE_SDIRTY; 803 } else { 804 flags = 0; 805 } 806 ip->ino_data.mtime = trans.time; 807 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 808 hammer_modify_inode(&trans, ip, flags); 809 810 /* 811 * Once we dirty the buffer any cached zone-X offset 812 * becomes invalid. HAMMER NOTE: no-history mode cannot 813 * allow overwriting over the same data sector unless 814 * we provide UNDOs for the old data, which we don't. 815 */ 816 bp->b_bio2.bio_offset = NOOFFSET; 817 818 /* 819 * Final buffer disposition. 820 * 821 * Because meta-data updates are deferred, HAMMER is 822 * especially sensitive to excessive bdwrite()s because 823 * the I/O stream is not broken up by disk reads. So the 824 * buffer cache simply cannot keep up. 825 * 826 * WARNING! blksize is variable. cluster_write() is 827 * expected to not blow up if it encounters 828 * buffers that do not match the passed blksize. 829 * 830 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 831 * The ip->rsv_recs check should burst-flush the data. 832 * If we queue it immediately the buf could be left 833 * locked on the device queue for a very long time. 834 * 835 * However, failing to flush a dirty buffer out when 836 * issued from the pageout daemon can result in a low 837 * memory deadlock against bio_page_alloc(), so we 838 * have to bawrite() on IO_ASYNC as well. 839 * 840 * NOTE! To avoid degenerate stalls due to mismatched block 841 * sizes we only honor IO_DIRECT on the write which 842 * abuts the end of the buffer. 
However, we must 843 * honor IO_SYNC in case someone is silly enough to 844 * configure a HAMMER file as swap, or when HAMMER 845 * is serving NFS (for commits). Ick ick. 846 */ 847 bp->b_flags |= B_AGE; 848 if (blksize == HAMMER_XBUFSIZE) 849 bp->b_flags |= B_CLUSTEROK; 850 851 if (ap->a_ioflag & IO_SYNC) { 852 bwrite(bp); 853 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 854 bawrite(bp); 855 } else if (ap->a_ioflag & IO_ASYNC) { 856 bawrite(bp); 857 } else if (hammer_cluster_enable && 858 !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 859 if (base_offset < HAMMER_XDEMARC) 860 cluster_eof = hammer_blockdemarc(base_offset, 861 ip->ino_data.size); 862 else 863 cluster_eof = ip->ino_data.size; 864 cluster_write(bp, cluster_eof, blksize, seqcount); 865 } else { 866 bdwrite(bp); 867 } 868 } 869 hammer_done_transaction(&trans); 870 hammer_knote(ap->a_vp, kflags); 871 lwkt_reltoken(&hmp->fs_token); 872 return (error); 873 } 874 875 /* 876 * hammer_vop_access { vp, mode, cred } 877 * 878 * MPSAFE - does not require fs_token 879 */ 880 static 881 int 882 hammer_vop_access(struct vop_access_args *ap) 883 { 884 struct hammer_inode *ip = VTOI(ap->a_vp); 885 uid_t uid; 886 gid_t gid; 887 int error; 888 889 ++hammer_stats_file_iopsr; 890 uid = hammer_to_unix_xid(&ip->ino_data.uid); 891 gid = hammer_to_unix_xid(&ip->ino_data.gid); 892 893 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 894 ip->ino_data.uflags); 895 return (error); 896 } 897 898 /* 899 * hammer_vop_advlock { vp, id, op, fl, flags } 900 * 901 * MPSAFE - does not require fs_token 902 */ 903 static 904 int 905 hammer_vop_advlock(struct vop_advlock_args *ap) 906 { 907 hammer_inode_t ip = VTOI(ap->a_vp); 908 909 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 910 } 911 912 /* 913 * hammer_vop_close { vp, fflag } 914 * 915 * We can only sync-on-close for normal closes. XXX disabled for now. 
916 */ 917 static 918 int 919 hammer_vop_close(struct vop_close_args *ap) 920 { 921 #if 0 922 struct vnode *vp = ap->a_vp; 923 hammer_inode_t ip = VTOI(vp); 924 int waitfor; 925 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 926 if (vn_islocked(vp) == LK_EXCLUSIVE && 927 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 928 if (ip->flags & HAMMER_INODE_CLOSESYNC) 929 waitfor = MNT_WAIT; 930 else 931 waitfor = MNT_NOWAIT; 932 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 933 HAMMER_INODE_CLOSEASYNC); 934 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 935 } 936 } 937 #endif 938 return (vop_stdclose(ap)); 939 } 940 941 /* 942 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 943 * 944 * The operating system has already ensured that the directory entry 945 * does not exist and done all appropriate namespace locking. 946 */ 947 static 948 int 949 hammer_vop_ncreate(struct vop_ncreate_args *ap) 950 { 951 struct hammer_transaction trans; 952 struct hammer_inode *dip; 953 struct hammer_inode *nip; 954 struct nchandle *nch; 955 hammer_mount_t hmp; 956 int error; 957 958 nch = ap->a_nch; 959 dip = VTOI(ap->a_dvp); 960 hmp = dip->hmp; 961 962 if (dip->flags & HAMMER_INODE_RO) 963 return (EROFS); 964 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 965 return (error); 966 967 /* 968 * Create a transaction to cover the operations we perform. 969 */ 970 lwkt_gettoken(&hmp->fs_token); 971 hammer_start_transaction(&trans, hmp); 972 ++hammer_stats_file_iopsw; 973 974 /* 975 * Create a new filesystem object of the requested type. The 976 * returned inode will be referenced and shared-locked to prevent 977 * it from being moved to the flusher. 
978 */ 979 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 980 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 981 NULL, &nip); 982 if (error) { 983 hkprintf("hammer_create_inode error %d\n", error); 984 hammer_done_transaction(&trans); 985 *ap->a_vpp = NULL; 986 lwkt_reltoken(&hmp->fs_token); 987 return (error); 988 } 989 990 /* 991 * Add the new filesystem object to the directory. This will also 992 * bump the inode's link count. 993 */ 994 error = hammer_ip_add_directory(&trans, dip, 995 nch->ncp->nc_name, nch->ncp->nc_nlen, 996 nip); 997 if (error) 998 hkprintf("hammer_ip_add_directory error %d\n", error); 999 1000 /* 1001 * Finish up. 1002 */ 1003 if (error) { 1004 hammer_rel_inode(nip, 0); 1005 hammer_done_transaction(&trans); 1006 *ap->a_vpp = NULL; 1007 } else { 1008 error = hammer_get_vnode(nip, ap->a_vpp); 1009 hammer_done_transaction(&trans); 1010 hammer_rel_inode(nip, 0); 1011 if (error == 0) { 1012 cache_setunresolved(ap->a_nch); 1013 cache_setvp(ap->a_nch, *ap->a_vpp); 1014 } 1015 hammer_knote(ap->a_dvp, NOTE_WRITE); 1016 } 1017 lwkt_reltoken(&hmp->fs_token); 1018 return (error); 1019 } 1020 1021 /* 1022 * hammer_vop_getattr { vp, vap } 1023 * 1024 * Retrieve an inode's attribute information. When accessing inodes 1025 * historically we fake the atime field to ensure consistent results. 1026 * The atime field is stored in the B-Tree element and allowed to be 1027 * updated without cycling the element. 1028 * 1029 * MPSAFE - does not require fs_token 1030 */ 1031 static 1032 int 1033 hammer_vop_getattr(struct vop_getattr_args *ap) 1034 { 1035 struct hammer_inode *ip = VTOI(ap->a_vp); 1036 struct vattr *vap = ap->a_vap; 1037 1038 /* 1039 * We want the fsid to be different when accessing a filesystem 1040 * with different as-of's so programs like diff don't think 1041 * the files are the same. 
1042 * 1043 * We also want the fsid to be the same when comparing snapshots, 1044 * or when comparing mirrors (which might be backed by different 1045 * physical devices). HAMMER fsids are based on the PFS's 1046 * shared_uuid field. 1047 * 1048 * XXX there is a chance of collision here. The va_fsid reported 1049 * by stat is different from the more involved fsid used in the 1050 * mount structure. 1051 */ 1052 ++hammer_stats_file_iopsr; 1053 hammer_lock_sh(&ip->lock); 1054 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1055 (u_int32_t)(ip->obj_asof >> 32); 1056 1057 vap->va_fileid = ip->ino_leaf.base.obj_id; 1058 vap->va_mode = ip->ino_data.mode; 1059 vap->va_nlink = ip->ino_data.nlinks; 1060 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1061 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1062 vap->va_rmajor = 0; 1063 vap->va_rminor = 0; 1064 vap->va_size = ip->ino_data.size; 1065 1066 /* 1067 * Special case for @@PFS softlinks. The actual size of the 1068 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1069 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1070 */ 1071 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1072 ip->ino_data.size == 10 && 1073 ip->obj_asof == HAMMER_MAX_TID && 1074 ip->obj_localization == 0 && 1075 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1076 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1077 vap->va_size = 26; 1078 else 1079 vap->va_size = 10; 1080 } 1081 1082 /* 1083 * We must provide a consistent atime and mtime for snapshots 1084 * so people can do a 'tar cf - ... | md5' on them and get 1085 * consistent results. 
1086 */ 1087 if (ip->flags & HAMMER_INODE_RO) { 1088 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1089 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1090 } else { 1091 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1092 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1093 } 1094 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1095 vap->va_flags = ip->ino_data.uflags; 1096 vap->va_gen = 1; /* hammer inums are unique for all time */ 1097 vap->va_blocksize = HAMMER_BUFSIZE; 1098 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1099 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1100 ~HAMMER_XBUFMASK64; 1101 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1102 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1103 ~HAMMER_BUFMASK64; 1104 } else { 1105 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1106 } 1107 1108 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1109 vap->va_filerev = 0; /* XXX */ 1110 vap->va_uid_uuid = ip->ino_data.uid; 1111 vap->va_gid_uuid = ip->ino_data.gid; 1112 vap->va_fsid_uuid = ip->hmp->fsid; 1113 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1114 VA_FSID_UUID_VALID; 1115 1116 switch (ip->ino_data.obj_type) { 1117 case HAMMER_OBJTYPE_CDEV: 1118 case HAMMER_OBJTYPE_BDEV: 1119 vap->va_rmajor = ip->ino_data.rmajor; 1120 vap->va_rminor = ip->ino_data.rminor; 1121 break; 1122 default: 1123 break; 1124 } 1125 hammer_unlock(&ip->lock); 1126 return(0); 1127 } 1128 1129 /* 1130 * hammer_vop_nresolve { nch, dvp, cred } 1131 * 1132 * Locate the requested directory entry. 
1133 */ 1134 static 1135 int 1136 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1137 { 1138 struct hammer_transaction trans; 1139 struct namecache *ncp; 1140 hammer_mount_t hmp; 1141 hammer_inode_t dip; 1142 hammer_inode_t ip; 1143 hammer_tid_t asof; 1144 struct hammer_cursor cursor; 1145 struct vnode *vp; 1146 int64_t namekey; 1147 int error; 1148 int i; 1149 int nlen; 1150 int flags; 1151 int ispfs; 1152 int64_t obj_id; 1153 u_int32_t localization; 1154 u_int32_t max_iterations; 1155 1156 /* 1157 * Misc initialization, plus handle as-of name extensions. Look for 1158 * the '@@' extension. Note that as-of files and directories cannot 1159 * be modified. 1160 */ 1161 dip = VTOI(ap->a_dvp); 1162 ncp = ap->a_nch->ncp; 1163 asof = dip->obj_asof; 1164 localization = dip->obj_localization; /* for code consistency */ 1165 nlen = ncp->nc_nlen; 1166 flags = dip->flags & HAMMER_INODE_RO; 1167 ispfs = 0; 1168 hmp = dip->hmp; 1169 1170 lwkt_gettoken(&hmp->fs_token); 1171 hammer_simple_transaction(&trans, hmp); 1172 ++hammer_stats_file_iopsr; 1173 1174 for (i = 0; i < nlen; ++i) { 1175 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1176 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1177 &ispfs, &asof, &localization); 1178 if (error != 0) { 1179 i = nlen; 1180 break; 1181 } 1182 if (asof != HAMMER_MAX_TID) 1183 flags |= HAMMER_INODE_RO; 1184 break; 1185 } 1186 } 1187 nlen = i; 1188 1189 /* 1190 * If this is a PFS softlink we dive into the PFS 1191 */ 1192 if (ispfs && nlen == 0) { 1193 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1194 asof, localization, 1195 flags, &error); 1196 if (error == 0) { 1197 error = hammer_get_vnode(ip, &vp); 1198 hammer_rel_inode(ip, 0); 1199 } else { 1200 vp = NULL; 1201 } 1202 if (error == 0) { 1203 vn_unlock(vp); 1204 cache_setvp(ap->a_nch, vp); 1205 vrele(vp); 1206 } 1207 goto done; 1208 } 1209 1210 /* 1211 * If there is no path component the time extension is relative to dip. 1212 * e.g. 
"fubar/@@<snapshot>" 1213 * 1214 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1215 * e.g. "fubar/.@@<snapshot>" 1216 * 1217 * ".." is handled by the kernel. We do not currently handle 1218 * "..@<snapshot>". 1219 */ 1220 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1221 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1222 asof, dip->obj_localization, 1223 flags, &error); 1224 if (error == 0) { 1225 error = hammer_get_vnode(ip, &vp); 1226 hammer_rel_inode(ip, 0); 1227 } else { 1228 vp = NULL; 1229 } 1230 if (error == 0) { 1231 vn_unlock(vp); 1232 cache_setvp(ap->a_nch, vp); 1233 vrele(vp); 1234 } 1235 goto done; 1236 } 1237 1238 /* 1239 * Calculate the namekey and setup the key range for the scan. This 1240 * works kinda like a chained hash table where the lower 32 bits 1241 * of the namekey synthesize the chain. 1242 * 1243 * The key range is inclusive of both key_beg and key_end. 1244 */ 1245 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1246 &max_iterations); 1247 1248 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1249 cursor.key_beg.localization = dip->obj_localization + 1250 hammer_dir_localization(dip); 1251 cursor.key_beg.obj_id = dip->obj_id; 1252 cursor.key_beg.key = namekey; 1253 cursor.key_beg.create_tid = 0; 1254 cursor.key_beg.delete_tid = 0; 1255 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1256 cursor.key_beg.obj_type = 0; 1257 1258 cursor.key_end = cursor.key_beg; 1259 cursor.key_end.key += max_iterations; 1260 cursor.asof = asof; 1261 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1262 1263 /* 1264 * Scan all matching records (the chain), locate the one matching 1265 * the requested path component. 1266 * 1267 * The hammer_ip_*() functions merge in-memory records with on-disk 1268 * records for the purposes of the search. 
1269 */ 1270 obj_id = 0; 1271 localization = HAMMER_DEF_LOCALIZATION; 1272 1273 if (error == 0) { 1274 error = hammer_ip_first(&cursor); 1275 while (error == 0) { 1276 error = hammer_ip_resolve_data(&cursor); 1277 if (error) 1278 break; 1279 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1280 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1281 obj_id = cursor.data->entry.obj_id; 1282 localization = cursor.data->entry.localization; 1283 break; 1284 } 1285 error = hammer_ip_next(&cursor); 1286 } 1287 } 1288 hammer_done_cursor(&cursor); 1289 1290 /* 1291 * Lookup the obj_id. This should always succeed. If it does not 1292 * the filesystem may be damaged and we return a dummy inode. 1293 */ 1294 if (error == 0) { 1295 ip = hammer_get_inode(&trans, dip, obj_id, 1296 asof, localization, 1297 flags, &error); 1298 if (error == ENOENT) { 1299 kprintf("HAMMER: WARNING: Missing " 1300 "inode for dirent \"%s\"\n" 1301 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1302 ncp->nc_name, 1303 (long long)obj_id, (long long)asof, 1304 localization); 1305 error = 0; 1306 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1307 asof, localization, 1308 flags, &error); 1309 } 1310 if (error == 0) { 1311 error = hammer_get_vnode(ip, &vp); 1312 hammer_rel_inode(ip, 0); 1313 } else { 1314 vp = NULL; 1315 } 1316 if (error == 0) { 1317 vn_unlock(vp); 1318 cache_setvp(ap->a_nch, vp); 1319 vrele(vp); 1320 } 1321 } else if (error == ENOENT) { 1322 cache_setvp(ap->a_nch, NULL); 1323 } 1324 done: 1325 hammer_done_transaction(&trans); 1326 lwkt_reltoken(&hmp->fs_token); 1327 return (error); 1328 } 1329 1330 /* 1331 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1332 * 1333 * Locate the parent directory of a directory vnode. 1334 * 1335 * dvp is referenced but not locked. *vpp must be returned referenced and 1336 * locked. 
A parent_obj_id of 0 does not necessarily indicate that we are 1337 * at the root, instead it could indicate that the directory we were in was 1338 * removed. 1339 * 1340 * NOTE: as-of sequences are not linked into the directory structure. If 1341 * we are at the root with a different asof then the mount point, reload 1342 * the same directory with the mount point's asof. I'm not sure what this 1343 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1344 * get confused, but it hasn't been tested. 1345 */ 1346 static 1347 int 1348 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1349 { 1350 struct hammer_transaction trans; 1351 struct hammer_inode *dip; 1352 struct hammer_inode *ip; 1353 hammer_mount_t hmp; 1354 int64_t parent_obj_id; 1355 u_int32_t parent_obj_localization; 1356 hammer_tid_t asof; 1357 int error; 1358 1359 dip = VTOI(ap->a_dvp); 1360 asof = dip->obj_asof; 1361 hmp = dip->hmp; 1362 1363 /* 1364 * Whos are parent? This could be the root of a pseudo-filesystem 1365 * whos parent is in another localization domain. 
1366 */ 1367 lwkt_gettoken(&hmp->fs_token); 1368 parent_obj_id = dip->ino_data.parent_obj_id; 1369 if (dip->obj_id == HAMMER_OBJID_ROOT) 1370 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1371 else 1372 parent_obj_localization = dip->obj_localization; 1373 1374 if (parent_obj_id == 0) { 1375 if (dip->obj_id == HAMMER_OBJID_ROOT && 1376 asof != hmp->asof) { 1377 parent_obj_id = dip->obj_id; 1378 asof = hmp->asof; 1379 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1380 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1381 (long long)dip->obj_asof); 1382 } else { 1383 *ap->a_vpp = NULL; 1384 lwkt_reltoken(&hmp->fs_token); 1385 return ENOENT; 1386 } 1387 } 1388 1389 hammer_simple_transaction(&trans, hmp); 1390 ++hammer_stats_file_iopsr; 1391 1392 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1393 asof, parent_obj_localization, 1394 dip->flags, &error); 1395 if (ip) { 1396 error = hammer_get_vnode(ip, ap->a_vpp); 1397 hammer_rel_inode(ip, 0); 1398 } else { 1399 *ap->a_vpp = NULL; 1400 } 1401 hammer_done_transaction(&trans); 1402 lwkt_reltoken(&hmp->fs_token); 1403 return (error); 1404 } 1405 1406 /* 1407 * hammer_vop_nlink { nch, dvp, vp, cred } 1408 */ 1409 static 1410 int 1411 hammer_vop_nlink(struct vop_nlink_args *ap) 1412 { 1413 struct hammer_transaction trans; 1414 struct hammer_inode *dip; 1415 struct hammer_inode *ip; 1416 struct nchandle *nch; 1417 hammer_mount_t hmp; 1418 int error; 1419 1420 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1421 return(EXDEV); 1422 1423 nch = ap->a_nch; 1424 dip = VTOI(ap->a_dvp); 1425 ip = VTOI(ap->a_vp); 1426 hmp = dip->hmp; 1427 1428 if (dip->obj_localization != ip->obj_localization) 1429 return(EXDEV); 1430 1431 if (dip->flags & HAMMER_INODE_RO) 1432 return (EROFS); 1433 if (ip->flags & HAMMER_INODE_RO) 1434 return (EROFS); 1435 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1436 return (error); 1437 1438 /* 1439 * Create a transaction to cover the operations we perform. 
1440 */ 1441 lwkt_gettoken(&hmp->fs_token); 1442 hammer_start_transaction(&trans, hmp); 1443 ++hammer_stats_file_iopsw; 1444 1445 /* 1446 * Add the filesystem object to the directory. Note that neither 1447 * dip nor ip are referenced or locked, but their vnodes are 1448 * referenced. This function will bump the inode's link count. 1449 */ 1450 error = hammer_ip_add_directory(&trans, dip, 1451 nch->ncp->nc_name, nch->ncp->nc_nlen, 1452 ip); 1453 1454 /* 1455 * Finish up. 1456 */ 1457 if (error == 0) { 1458 cache_setunresolved(nch); 1459 cache_setvp(nch, ap->a_vp); 1460 } 1461 hammer_done_transaction(&trans); 1462 hammer_knote(ap->a_vp, NOTE_LINK); 1463 hammer_knote(ap->a_dvp, NOTE_WRITE); 1464 lwkt_reltoken(&hmp->fs_token); 1465 return (error); 1466 } 1467 1468 /* 1469 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1470 * 1471 * The operating system has already ensured that the directory entry 1472 * does not exist and done all appropriate namespace locking. 1473 */ 1474 static 1475 int 1476 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1477 { 1478 struct hammer_transaction trans; 1479 struct hammer_inode *dip; 1480 struct hammer_inode *nip; 1481 struct nchandle *nch; 1482 hammer_mount_t hmp; 1483 int error; 1484 1485 nch = ap->a_nch; 1486 dip = VTOI(ap->a_dvp); 1487 hmp = dip->hmp; 1488 1489 if (dip->flags & HAMMER_INODE_RO) 1490 return (EROFS); 1491 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1492 return (error); 1493 1494 /* 1495 * Create a transaction to cover the operations we perform. 1496 */ 1497 lwkt_gettoken(&hmp->fs_token); 1498 hammer_start_transaction(&trans, hmp); 1499 ++hammer_stats_file_iopsw; 1500 1501 /* 1502 * Create a new filesystem object of the requested type. The 1503 * returned inode will be referenced but not locked. 
1504 */ 1505 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1506 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1507 NULL, &nip); 1508 if (error) { 1509 hkprintf("hammer_mkdir error %d\n", error); 1510 hammer_done_transaction(&trans); 1511 *ap->a_vpp = NULL; 1512 lwkt_reltoken(&hmp->fs_token); 1513 return (error); 1514 } 1515 /* 1516 * Add the new filesystem object to the directory. This will also 1517 * bump the inode's link count. 1518 */ 1519 error = hammer_ip_add_directory(&trans, dip, 1520 nch->ncp->nc_name, nch->ncp->nc_nlen, 1521 nip); 1522 if (error) 1523 hkprintf("hammer_mkdir (add) error %d\n", error); 1524 1525 /* 1526 * Finish up. 1527 */ 1528 if (error) { 1529 hammer_rel_inode(nip, 0); 1530 *ap->a_vpp = NULL; 1531 } else { 1532 error = hammer_get_vnode(nip, ap->a_vpp); 1533 hammer_rel_inode(nip, 0); 1534 if (error == 0) { 1535 cache_setunresolved(ap->a_nch); 1536 cache_setvp(ap->a_nch, *ap->a_vpp); 1537 } 1538 } 1539 hammer_done_transaction(&trans); 1540 if (error == 0) 1541 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1542 lwkt_reltoken(&hmp->fs_token); 1543 return (error); 1544 } 1545 1546 /* 1547 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1548 * 1549 * The operating system has already ensured that the directory entry 1550 * does not exist and done all appropriate namespace locking. 1551 */ 1552 static 1553 int 1554 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1555 { 1556 struct hammer_transaction trans; 1557 struct hammer_inode *dip; 1558 struct hammer_inode *nip; 1559 struct nchandle *nch; 1560 hammer_mount_t hmp; 1561 int error; 1562 1563 nch = ap->a_nch; 1564 dip = VTOI(ap->a_dvp); 1565 hmp = dip->hmp; 1566 1567 if (dip->flags & HAMMER_INODE_RO) 1568 return (EROFS); 1569 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1570 return (error); 1571 1572 /* 1573 * Create a transaction to cover the operations we perform. 
1574 */ 1575 lwkt_gettoken(&hmp->fs_token); 1576 hammer_start_transaction(&trans, hmp); 1577 ++hammer_stats_file_iopsw; 1578 1579 /* 1580 * Create a new filesystem object of the requested type. The 1581 * returned inode will be referenced but not locked. 1582 * 1583 * If mknod specifies a directory a pseudo-fs is created. 1584 */ 1585 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1586 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1587 NULL, &nip); 1588 if (error) { 1589 hammer_done_transaction(&trans); 1590 *ap->a_vpp = NULL; 1591 lwkt_reltoken(&hmp->fs_token); 1592 return (error); 1593 } 1594 1595 /* 1596 * Add the new filesystem object to the directory. This will also 1597 * bump the inode's link count. 1598 */ 1599 error = hammer_ip_add_directory(&trans, dip, 1600 nch->ncp->nc_name, nch->ncp->nc_nlen, 1601 nip); 1602 1603 /* 1604 * Finish up. 1605 */ 1606 if (error) { 1607 hammer_rel_inode(nip, 0); 1608 *ap->a_vpp = NULL; 1609 } else { 1610 error = hammer_get_vnode(nip, ap->a_vpp); 1611 hammer_rel_inode(nip, 0); 1612 if (error == 0) { 1613 cache_setunresolved(ap->a_nch); 1614 cache_setvp(ap->a_nch, *ap->a_vpp); 1615 } 1616 } 1617 hammer_done_transaction(&trans); 1618 if (error == 0) 1619 hammer_knote(ap->a_dvp, NOTE_WRITE); 1620 lwkt_reltoken(&hmp->fs_token); 1621 return (error); 1622 } 1623 1624 /* 1625 * hammer_vop_open { vp, mode, cred, fp } 1626 * 1627 * MPSAFE (does not require fs_token) 1628 */ 1629 static 1630 int 1631 hammer_vop_open(struct vop_open_args *ap) 1632 { 1633 hammer_inode_t ip; 1634 1635 ++hammer_stats_file_iopsr; 1636 ip = VTOI(ap->a_vp); 1637 1638 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1639 return (EROFS); 1640 return(vop_stdopen(ap)); 1641 } 1642 1643 /* 1644 * hammer_vop_print { vp } 1645 */ 1646 static 1647 int 1648 hammer_vop_print(struct vop_print_args *ap) 1649 { 1650 return EOPNOTSUPP; 1651 } 1652 1653 /* 1654 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1655 */ 1656 
static 1657 int 1658 hammer_vop_readdir(struct vop_readdir_args *ap) 1659 { 1660 struct hammer_transaction trans; 1661 struct hammer_cursor cursor; 1662 struct hammer_inode *ip; 1663 hammer_mount_t hmp; 1664 struct uio *uio; 1665 hammer_base_elm_t base; 1666 int error; 1667 int cookie_index; 1668 int ncookies; 1669 off_t *cookies; 1670 off_t saveoff; 1671 int r; 1672 int dtype; 1673 1674 ++hammer_stats_file_iopsr; 1675 ip = VTOI(ap->a_vp); 1676 uio = ap->a_uio; 1677 saveoff = uio->uio_offset; 1678 hmp = ip->hmp; 1679 1680 if (ap->a_ncookies) { 1681 ncookies = uio->uio_resid / 16 + 1; 1682 if (ncookies > 1024) 1683 ncookies = 1024; 1684 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1685 cookie_index = 0; 1686 } else { 1687 ncookies = -1; 1688 cookies = NULL; 1689 cookie_index = 0; 1690 } 1691 1692 lwkt_gettoken(&hmp->fs_token); 1693 hammer_simple_transaction(&trans, hmp); 1694 1695 /* 1696 * Handle artificial entries 1697 * 1698 * It should be noted that the minimum value for a directory 1699 * hash key on-media is 0x0000000100000000, so we can use anything 1700 * less then that to represent our 'special' key space. 1701 */ 1702 error = 0; 1703 if (saveoff == 0) { 1704 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1705 if (r) 1706 goto done; 1707 if (cookies) 1708 cookies[cookie_index] = saveoff; 1709 ++saveoff; 1710 ++cookie_index; 1711 if (cookie_index == ncookies) 1712 goto done; 1713 } 1714 if (saveoff == 1) { 1715 if (ip->ino_data.parent_obj_id) { 1716 r = vop_write_dirent(&error, uio, 1717 ip->ino_data.parent_obj_id, 1718 DT_DIR, 2, ".."); 1719 } else { 1720 r = vop_write_dirent(&error, uio, 1721 ip->obj_id, DT_DIR, 2, ".."); 1722 } 1723 if (r) 1724 goto done; 1725 if (cookies) 1726 cookies[cookie_index] = saveoff; 1727 ++saveoff; 1728 ++cookie_index; 1729 if (cookie_index == ncookies) 1730 goto done; 1731 } 1732 1733 /* 1734 * Key range (begin and end inclusive) to scan. 
Directory keys 1735 * directly translate to a 64 bit 'seek' position. 1736 */ 1737 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1738 cursor.key_beg.localization = ip->obj_localization + 1739 hammer_dir_localization(ip); 1740 cursor.key_beg.obj_id = ip->obj_id; 1741 cursor.key_beg.create_tid = 0; 1742 cursor.key_beg.delete_tid = 0; 1743 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1744 cursor.key_beg.obj_type = 0; 1745 cursor.key_beg.key = saveoff; 1746 1747 cursor.key_end = cursor.key_beg; 1748 cursor.key_end.key = HAMMER_MAX_KEY; 1749 cursor.asof = ip->obj_asof; 1750 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1751 1752 error = hammer_ip_first(&cursor); 1753 1754 while (error == 0) { 1755 error = hammer_ip_resolve_data(&cursor); 1756 if (error) 1757 break; 1758 base = &cursor.leaf->base; 1759 saveoff = base->key; 1760 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1761 1762 if (base->obj_id != ip->obj_id) 1763 panic("readdir: bad record at %p", cursor.node); 1764 1765 /* 1766 * Convert pseudo-filesystems into softlinks 1767 */ 1768 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1769 r = vop_write_dirent( 1770 &error, uio, cursor.data->entry.obj_id, 1771 dtype, 1772 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1773 (void *)cursor.data->entry.name); 1774 if (r) 1775 break; 1776 ++saveoff; 1777 if (cookies) 1778 cookies[cookie_index] = base->key; 1779 ++cookie_index; 1780 if (cookie_index == ncookies) 1781 break; 1782 error = hammer_ip_next(&cursor); 1783 } 1784 hammer_done_cursor(&cursor); 1785 1786 done: 1787 hammer_done_transaction(&trans); 1788 1789 if (ap->a_eofflag) 1790 *ap->a_eofflag = (error == ENOENT); 1791 uio->uio_offset = saveoff; 1792 if (error && cookie_index == 0) { 1793 if (error == ENOENT) 1794 error = 0; 1795 if (cookies) { 1796 kfree(cookies, M_TEMP); 1797 *ap->a_ncookies = 0; 1798 *ap->a_cookies = NULL; 1799 } 1800 } else { 1801 if (error == ENOENT) 1802 error = 0; 1803 if (cookies) { 
1804 *ap->a_ncookies = cookie_index; 1805 *ap->a_cookies = cookies; 1806 } 1807 } 1808 lwkt_reltoken(&hmp->fs_token); 1809 return(error); 1810 } 1811 1812 /* 1813 * hammer_vop_readlink { vp, uio, cred } 1814 */ 1815 static 1816 int 1817 hammer_vop_readlink(struct vop_readlink_args *ap) 1818 { 1819 struct hammer_transaction trans; 1820 struct hammer_cursor cursor; 1821 struct hammer_inode *ip; 1822 hammer_mount_t hmp; 1823 char buf[32]; 1824 u_int32_t localization; 1825 hammer_pseudofs_inmem_t pfsm; 1826 int error; 1827 1828 ip = VTOI(ap->a_vp); 1829 hmp = ip->hmp; 1830 1831 lwkt_gettoken(&hmp->fs_token); 1832 1833 /* 1834 * Shortcut if the symlink data was stuffed into ino_data. 1835 * 1836 * Also expand special "@@PFS%05d" softlinks (expansion only 1837 * occurs for non-historical (current) accesses made from the 1838 * primary filesystem). 1839 */ 1840 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1841 char *ptr; 1842 int bytes; 1843 1844 ptr = ip->ino_data.ext.symlink; 1845 bytes = (int)ip->ino_data.size; 1846 if (bytes == 10 && 1847 ip->obj_asof == HAMMER_MAX_TID && 1848 ip->obj_localization == 0 && 1849 strncmp(ptr, "@@PFS", 5) == 0) { 1850 hammer_simple_transaction(&trans, hmp); 1851 bcopy(ptr + 5, buf, 5); 1852 buf[5] = 0; 1853 localization = strtoul(buf, NULL, 10) << 16; 1854 pfsm = hammer_load_pseudofs(&trans, localization, 1855 &error); 1856 if (error == 0) { 1857 if (pfsm->pfsd.mirror_flags & 1858 HAMMER_PFSD_SLAVE) { 1859 /* vap->va_size == 26 */ 1860 ksnprintf(buf, sizeof(buf), 1861 "@@0x%016llx:%05d", 1862 (long long)pfsm->pfsd.sync_end_tid, 1863 localization >> 16); 1864 } else { 1865 /* vap->va_size == 10 */ 1866 ksnprintf(buf, sizeof(buf), 1867 "@@-1:%05d", 1868 localization >> 16); 1869 #if 0 1870 ksnprintf(buf, sizeof(buf), 1871 "@@0x%016llx:%05d", 1872 (long long)HAMMER_MAX_TID, 1873 localization >> 16); 1874 #endif 1875 } 1876 ptr = buf; 1877 bytes = strlen(buf); 1878 } 1879 if (pfsm) 1880 hammer_rel_pseudofs(hmp, pfsm); 1881 
hammer_done_transaction(&trans); 1882 } 1883 error = uiomove(ptr, bytes, ap->a_uio); 1884 lwkt_reltoken(&hmp->fs_token); 1885 return(error); 1886 } 1887 1888 /* 1889 * Long version 1890 */ 1891 hammer_simple_transaction(&trans, hmp); 1892 ++hammer_stats_file_iopsr; 1893 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1894 1895 /* 1896 * Key range (begin and end inclusive) to scan. Directory keys 1897 * directly translate to a 64 bit 'seek' position. 1898 */ 1899 cursor.key_beg.localization = ip->obj_localization + 1900 HAMMER_LOCALIZE_MISC; 1901 cursor.key_beg.obj_id = ip->obj_id; 1902 cursor.key_beg.create_tid = 0; 1903 cursor.key_beg.delete_tid = 0; 1904 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1905 cursor.key_beg.obj_type = 0; 1906 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1907 cursor.asof = ip->obj_asof; 1908 cursor.flags |= HAMMER_CURSOR_ASOF; 1909 1910 error = hammer_ip_lookup(&cursor); 1911 if (error == 0) { 1912 error = hammer_ip_resolve_data(&cursor); 1913 if (error == 0) { 1914 KKASSERT(cursor.leaf->data_len >= 1915 HAMMER_SYMLINK_NAME_OFF); 1916 error = uiomove(cursor.data->symlink.name, 1917 cursor.leaf->data_len - 1918 HAMMER_SYMLINK_NAME_OFF, 1919 ap->a_uio); 1920 } 1921 } 1922 hammer_done_cursor(&cursor); 1923 hammer_done_transaction(&trans); 1924 lwkt_reltoken(&hmp->fs_token); 1925 return(error); 1926 } 1927 1928 /* 1929 * hammer_vop_nremove { nch, dvp, cred } 1930 */ 1931 static 1932 int 1933 hammer_vop_nremove(struct vop_nremove_args *ap) 1934 { 1935 struct hammer_transaction trans; 1936 struct hammer_inode *dip; 1937 hammer_mount_t hmp; 1938 int error; 1939 1940 dip = VTOI(ap->a_dvp); 1941 hmp = dip->hmp; 1942 1943 if (hammer_nohistory(dip) == 0 && 1944 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1945 return (error); 1946 } 1947 1948 lwkt_gettoken(&hmp->fs_token); 1949 hammer_start_transaction(&trans, hmp); 1950 ++hammer_stats_file_iopsw; 1951 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 
0); 1952 hammer_done_transaction(&trans); 1953 if (error == 0) 1954 hammer_knote(ap->a_dvp, NOTE_WRITE); 1955 lwkt_reltoken(&hmp->fs_token); 1956 return (error); 1957 } 1958 1959 /* 1960 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1961 */ 1962 static 1963 int 1964 hammer_vop_nrename(struct vop_nrename_args *ap) 1965 { 1966 struct hammer_transaction trans; 1967 struct namecache *fncp; 1968 struct namecache *tncp; 1969 struct hammer_inode *fdip; 1970 struct hammer_inode *tdip; 1971 struct hammer_inode *ip; 1972 hammer_mount_t hmp; 1973 struct hammer_cursor cursor; 1974 int64_t namekey; 1975 u_int32_t max_iterations; 1976 int nlen, error; 1977 1978 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1979 return(EXDEV); 1980 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1981 return(EXDEV); 1982 1983 fdip = VTOI(ap->a_fdvp); 1984 tdip = VTOI(ap->a_tdvp); 1985 fncp = ap->a_fnch->ncp; 1986 tncp = ap->a_tnch->ncp; 1987 ip = VTOI(fncp->nc_vp); 1988 KKASSERT(ip != NULL); 1989 1990 hmp = ip->hmp; 1991 1992 if (fdip->obj_localization != tdip->obj_localization) 1993 return(EXDEV); 1994 if (fdip->obj_localization != ip->obj_localization) 1995 return(EXDEV); 1996 1997 if (fdip->flags & HAMMER_INODE_RO) 1998 return (EROFS); 1999 if (tdip->flags & HAMMER_INODE_RO) 2000 return (EROFS); 2001 if (ip->flags & HAMMER_INODE_RO) 2002 return (EROFS); 2003 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2004 return (error); 2005 2006 lwkt_gettoken(&hmp->fs_token); 2007 hammer_start_transaction(&trans, hmp); 2008 ++hammer_stats_file_iopsw; 2009 2010 /* 2011 * Remove tncp from the target directory and then link ip as 2012 * tncp. XXX pass trans to dounlink 2013 * 2014 * Force the inode sync-time to match the transaction so it is 2015 * in-sync with the creation of the target directory entry. 
2016 */ 2017 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 2018 ap->a_cred, 0, -1); 2019 if (error == 0 || error == ENOENT) { 2020 error = hammer_ip_add_directory(&trans, tdip, 2021 tncp->nc_name, tncp->nc_nlen, 2022 ip); 2023 if (error == 0) { 2024 ip->ino_data.parent_obj_id = tdip->obj_id; 2025 ip->ino_data.ctime = trans.time; 2026 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 2027 } 2028 } 2029 if (error) 2030 goto failed; /* XXX */ 2031 2032 /* 2033 * Locate the record in the originating directory and remove it. 2034 * 2035 * Calculate the namekey and setup the key range for the scan. This 2036 * works kinda like a chained hash table where the lower 32 bits 2037 * of the namekey synthesize the chain. 2038 * 2039 * The key range is inclusive of both key_beg and key_end. 2040 */ 2041 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 2042 &max_iterations); 2043 retry: 2044 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 2045 cursor.key_beg.localization = fdip->obj_localization + 2046 hammer_dir_localization(fdip); 2047 cursor.key_beg.obj_id = fdip->obj_id; 2048 cursor.key_beg.key = namekey; 2049 cursor.key_beg.create_tid = 0; 2050 cursor.key_beg.delete_tid = 0; 2051 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2052 cursor.key_beg.obj_type = 0; 2053 2054 cursor.key_end = cursor.key_beg; 2055 cursor.key_end.key += max_iterations; 2056 cursor.asof = fdip->obj_asof; 2057 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2058 2059 /* 2060 * Scan all matching records (the chain), locate the one matching 2061 * the requested path component. 2062 * 2063 * The hammer_ip_*() functions merge in-memory records with on-disk 2064 * records for the purposes of the search. 
2065 */ 2066 error = hammer_ip_first(&cursor); 2067 while (error == 0) { 2068 if (hammer_ip_resolve_data(&cursor) != 0) 2069 break; 2070 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2071 KKASSERT(nlen > 0); 2072 if (fncp->nc_nlen == nlen && 2073 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2074 break; 2075 } 2076 error = hammer_ip_next(&cursor); 2077 } 2078 2079 /* 2080 * If all is ok we have to get the inode so we can adjust nlinks. 2081 * 2082 * WARNING: hammer_ip_del_directory() may have to terminate the 2083 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2084 * twice. 2085 */ 2086 if (error == 0) 2087 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2088 2089 /* 2090 * XXX A deadlock here will break rename's atomicy for the purposes 2091 * of crash recovery. 2092 */ 2093 if (error == EDEADLK) { 2094 hammer_done_cursor(&cursor); 2095 goto retry; 2096 } 2097 2098 /* 2099 * Cleanup and tell the kernel that the rename succeeded. 2100 * 2101 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2102 * without formally acquiring the vp since the vp might 2103 * have zero refs on it, or in the middle of a reclaim, 2104 * etc. 
2105 */ 2106 hammer_done_cursor(&cursor); 2107 if (error == 0) { 2108 cache_rename(ap->a_fnch, ap->a_tnch); 2109 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2110 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2111 while (ip->vp) { 2112 struct vnode *vp; 2113 2114 error = hammer_get_vnode(ip, &vp); 2115 if (error == 0 && vp) { 2116 vn_unlock(vp); 2117 hammer_knote(ip->vp, NOTE_RENAME); 2118 vrele(vp); 2119 break; 2120 } 2121 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2122 } 2123 } 2124 2125 failed: 2126 hammer_done_transaction(&trans); 2127 lwkt_reltoken(&hmp->fs_token); 2128 return (error); 2129 } 2130 2131 /* 2132 * hammer_vop_nrmdir { nch, dvp, cred } 2133 */ 2134 static 2135 int 2136 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2137 { 2138 struct hammer_transaction trans; 2139 struct hammer_inode *dip; 2140 hammer_mount_t hmp; 2141 int error; 2142 2143 dip = VTOI(ap->a_dvp); 2144 hmp = dip->hmp; 2145 2146 if (hammer_nohistory(dip) == 0 && 2147 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2148 return (error); 2149 } 2150 2151 lwkt_gettoken(&hmp->fs_token); 2152 hammer_start_transaction(&trans, hmp); 2153 ++hammer_stats_file_iopsw; 2154 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2155 hammer_done_transaction(&trans); 2156 if (error == 0) 2157 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2158 lwkt_reltoken(&hmp->fs_token); 2159 return (error); 2160 } 2161 2162 /* 2163 * hammer_vop_markatime { vp, cred } 2164 */ 2165 static 2166 int 2167 hammer_vop_markatime(struct vop_markatime_args *ap) 2168 { 2169 struct hammer_transaction trans; 2170 struct hammer_inode *ip; 2171 hammer_mount_t hmp; 2172 2173 ip = VTOI(ap->a_vp); 2174 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2175 return (EROFS); 2176 if (ip->flags & HAMMER_INODE_RO) 2177 return (EROFS); 2178 hmp = ip->hmp; 2179 if (hmp->mp->mnt_flag & MNT_NOATIME) 2180 return (0); 2181 lwkt_gettoken(&hmp->fs_token); 2182 hammer_start_transaction(&trans, hmp); 2183 
++hammer_stats_file_iopsw; 2184 2185 ip->ino_data.atime = trans.time; 2186 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2187 hammer_done_transaction(&trans); 2188 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2189 lwkt_reltoken(&hmp->fs_token); 2190 return (0); 2191 } 2192 2193 /* 2194 * hammer_vop_setattr { vp, vap, cred } 2195 */ 2196 static 2197 int 2198 hammer_vop_setattr(struct vop_setattr_args *ap) 2199 { 2200 struct hammer_transaction trans; 2201 struct hammer_inode *ip; 2202 struct vattr *vap; 2203 hammer_mount_t hmp; 2204 int modflags; 2205 int error; 2206 int truncating; 2207 int blksize; 2208 int kflags; 2209 #if 0 2210 int64_t aligned_size; 2211 #endif 2212 u_int32_t flags; 2213 2214 vap = ap->a_vap; 2215 ip = ap->a_vp->v_data; 2216 modflags = 0; 2217 kflags = 0; 2218 hmp = ip->hmp; 2219 2220 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2221 return(EROFS); 2222 if (ip->flags & HAMMER_INODE_RO) 2223 return (EROFS); 2224 if (hammer_nohistory(ip) == 0 && 2225 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2226 return (error); 2227 } 2228 2229 lwkt_gettoken(&hmp->fs_token); 2230 hammer_start_transaction(&trans, hmp); 2231 ++hammer_stats_file_iopsw; 2232 error = 0; 2233 2234 if (vap->va_flags != VNOVAL) { 2235 flags = ip->ino_data.uflags; 2236 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2237 hammer_to_unix_xid(&ip->ino_data.uid), 2238 ap->a_cred); 2239 if (error == 0) { 2240 if (ip->ino_data.uflags != flags) { 2241 ip->ino_data.uflags = flags; 2242 ip->ino_data.ctime = trans.time; 2243 modflags |= HAMMER_INODE_DDIRTY; 2244 kflags |= NOTE_ATTRIB; 2245 } 2246 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2247 error = 0; 2248 goto done; 2249 } 2250 } 2251 goto done; 2252 } 2253 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2254 error = EPERM; 2255 goto done; 2256 } 2257 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2258 mode_t cur_mode = ip->ino_data.mode; 2259 uid_t cur_uid = 
hammer_to_unix_xid(&ip->ino_data.uid); 2260 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2261 uuid_t uuid_uid; 2262 uuid_t uuid_gid; 2263 2264 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2265 ap->a_cred, 2266 &cur_uid, &cur_gid, &cur_mode); 2267 if (error == 0) { 2268 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2269 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2270 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2271 sizeof(uuid_uid)) || 2272 bcmp(&uuid_gid, &ip->ino_data.gid, 2273 sizeof(uuid_gid)) || 2274 ip->ino_data.mode != cur_mode 2275 ) { 2276 ip->ino_data.uid = uuid_uid; 2277 ip->ino_data.gid = uuid_gid; 2278 ip->ino_data.mode = cur_mode; 2279 ip->ino_data.ctime = trans.time; 2280 modflags |= HAMMER_INODE_DDIRTY; 2281 } 2282 kflags |= NOTE_ATTRIB; 2283 } 2284 } 2285 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2286 switch(ap->a_vp->v_type) { 2287 case VREG: 2288 if (vap->va_size == ip->ino_data.size) 2289 break; 2290 2291 /* 2292 * Log the operation if in fast-fsync mode or if 2293 * there are unterminated redo write records present. 2294 * 2295 * The second check is needed so the recovery code 2296 * properly truncates write redos even if nominal 2297 * REDO operations is turned off due to excessive 2298 * writes, because the related records might be 2299 * destroyed and never lay down a TERM_WRITE. 2300 */ 2301 if ((ip->flags & HAMMER_INODE_REDO) || 2302 (ip->flags & HAMMER_INODE_RDIRTY)) { 2303 error = hammer_generate_redo(&trans, ip, 2304 vap->va_size, 2305 HAMMER_REDO_TRUNC, 2306 NULL, 0); 2307 } 2308 blksize = hammer_blocksize(vap->va_size); 2309 2310 /* 2311 * XXX break atomicy, we can deadlock the backend 2312 * if we do not release the lock. Probably not a 2313 * big deal here. 
2314 */ 2315 if (vap->va_size < ip->ino_data.size) { 2316 nvtruncbuf(ap->a_vp, vap->va_size, 2317 blksize, 2318 hammer_blockoff(vap->va_size), 2319 0); 2320 truncating = 1; 2321 kflags |= NOTE_WRITE; 2322 } else { 2323 nvextendbuf(ap->a_vp, 2324 ip->ino_data.size, 2325 vap->va_size, 2326 hammer_blocksize(ip->ino_data.size), 2327 hammer_blocksize(vap->va_size), 2328 hammer_blockoff(ip->ino_data.size), 2329 hammer_blockoff(vap->va_size), 2330 0); 2331 truncating = 0; 2332 kflags |= NOTE_WRITE | NOTE_EXTEND; 2333 } 2334 ip->ino_data.size = vap->va_size; 2335 ip->ino_data.mtime = trans.time; 2336 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2337 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2338 2339 /* 2340 * On-media truncation is cached in the inode until 2341 * the inode is synchronized. We must immediately 2342 * handle any frontend records. 2343 */ 2344 if (truncating) { 2345 hammer_ip_frontend_trunc(ip, vap->va_size); 2346 #ifdef DEBUG_TRUNCATE 2347 if (HammerTruncIp == NULL) 2348 HammerTruncIp = ip; 2349 #endif 2350 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2351 ip->flags |= HAMMER_INODE_TRUNCATED; 2352 ip->trunc_off = vap->va_size; 2353 #ifdef DEBUG_TRUNCATE 2354 if (ip == HammerTruncIp) 2355 kprintf("truncate1 %016llx\n", 2356 (long long)ip->trunc_off); 2357 #endif 2358 } else if (ip->trunc_off > vap->va_size) { 2359 ip->trunc_off = vap->va_size; 2360 #ifdef DEBUG_TRUNCATE 2361 if (ip == HammerTruncIp) 2362 kprintf("truncate2 %016llx\n", 2363 (long long)ip->trunc_off); 2364 #endif 2365 } else { 2366 #ifdef DEBUG_TRUNCATE 2367 if (ip == HammerTruncIp) 2368 kprintf("truncate3 %016llx (ignored)\n", 2369 (long long)vap->va_size); 2370 #endif 2371 } 2372 } 2373 2374 #if 0 2375 /* 2376 * When truncating, nvtruncbuf() may have cleaned out 2377 * a portion of the last block on-disk in the buffer 2378 * cache. We must clean out any frontend records 2379 * for blocks beyond the new last block. 
2380 */ 2381 aligned_size = (vap->va_size + (blksize - 1)) & 2382 ~(int64_t)(blksize - 1); 2383 if (truncating && vap->va_size < aligned_size) { 2384 aligned_size -= blksize; 2385 hammer_ip_frontend_trunc(ip, aligned_size); 2386 } 2387 #endif 2388 break; 2389 case VDATABASE: 2390 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2391 ip->flags |= HAMMER_INODE_TRUNCATED; 2392 ip->trunc_off = vap->va_size; 2393 } else if (ip->trunc_off > vap->va_size) { 2394 ip->trunc_off = vap->va_size; 2395 } 2396 hammer_ip_frontend_trunc(ip, vap->va_size); 2397 ip->ino_data.size = vap->va_size; 2398 ip->ino_data.mtime = trans.time; 2399 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2400 kflags |= NOTE_ATTRIB; 2401 break; 2402 default: 2403 error = EINVAL; 2404 goto done; 2405 } 2406 break; 2407 } 2408 if (vap->va_atime.tv_sec != VNOVAL) { 2409 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2410 modflags |= HAMMER_INODE_ATIME; 2411 kflags |= NOTE_ATTRIB; 2412 } 2413 if (vap->va_mtime.tv_sec != VNOVAL) { 2414 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2415 modflags |= HAMMER_INODE_MTIME; 2416 kflags |= NOTE_ATTRIB; 2417 } 2418 if (vap->va_mode != (mode_t)VNOVAL) { 2419 mode_t cur_mode = ip->ino_data.mode; 2420 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2421 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2422 2423 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2424 cur_uid, cur_gid, &cur_mode); 2425 if (error == 0 && ip->ino_data.mode != cur_mode) { 2426 ip->ino_data.mode = cur_mode; 2427 ip->ino_data.ctime = trans.time; 2428 modflags |= HAMMER_INODE_DDIRTY; 2429 kflags |= NOTE_ATTRIB; 2430 } 2431 } 2432 done: 2433 if (error == 0) 2434 hammer_modify_inode(&trans, ip, modflags); 2435 hammer_done_transaction(&trans); 2436 hammer_knote(ap->a_vp, kflags); 2437 lwkt_reltoken(&hmp->fs_token); 2438 return (error); 2439 } 2440 2441 /* 2442 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2443 */ 2444 static 
2445 int 2446 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2447 { 2448 struct hammer_transaction trans; 2449 struct hammer_inode *dip; 2450 struct hammer_inode *nip; 2451 hammer_record_t record; 2452 struct nchandle *nch; 2453 hammer_mount_t hmp; 2454 int error; 2455 int bytes; 2456 2457 ap->a_vap->va_type = VLNK; 2458 2459 nch = ap->a_nch; 2460 dip = VTOI(ap->a_dvp); 2461 hmp = dip->hmp; 2462 2463 if (dip->flags & HAMMER_INODE_RO) 2464 return (EROFS); 2465 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2466 return (error); 2467 2468 /* 2469 * Create a transaction to cover the operations we perform. 2470 */ 2471 lwkt_gettoken(&hmp->fs_token); 2472 hammer_start_transaction(&trans, hmp); 2473 ++hammer_stats_file_iopsw; 2474 2475 /* 2476 * Create a new filesystem object of the requested type. The 2477 * returned inode will be referenced but not locked. 2478 */ 2479 2480 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2481 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2482 NULL, &nip); 2483 if (error) { 2484 hammer_done_transaction(&trans); 2485 *ap->a_vpp = NULL; 2486 lwkt_reltoken(&hmp->fs_token); 2487 return (error); 2488 } 2489 2490 /* 2491 * Add a record representing the symlink. symlink stores the link 2492 * as pure data, not a string, and is no \0 terminated. 
2493 */ 2494 if (error == 0) { 2495 bytes = strlen(ap->a_target); 2496 2497 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2498 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2499 } else { 2500 record = hammer_alloc_mem_record(nip, bytes); 2501 record->type = HAMMER_MEM_RECORD_GENERAL; 2502 2503 record->leaf.base.localization = nip->obj_localization + 2504 HAMMER_LOCALIZE_MISC; 2505 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2506 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2507 record->leaf.data_len = bytes; 2508 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2509 bcopy(ap->a_target, record->data->symlink.name, bytes); 2510 error = hammer_ip_add_record(&trans, record); 2511 } 2512 2513 /* 2514 * Set the file size to the length of the link. 2515 */ 2516 if (error == 0) { 2517 nip->ino_data.size = bytes; 2518 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2519 } 2520 } 2521 if (error == 0) 2522 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2523 nch->ncp->nc_nlen, nip); 2524 2525 /* 2526 * Finish up. 
2527 */ 2528 if (error) { 2529 hammer_rel_inode(nip, 0); 2530 *ap->a_vpp = NULL; 2531 } else { 2532 error = hammer_get_vnode(nip, ap->a_vpp); 2533 hammer_rel_inode(nip, 0); 2534 if (error == 0) { 2535 cache_setunresolved(ap->a_nch); 2536 cache_setvp(ap->a_nch, *ap->a_vpp); 2537 hammer_knote(ap->a_dvp, NOTE_WRITE); 2538 } 2539 } 2540 hammer_done_transaction(&trans); 2541 lwkt_reltoken(&hmp->fs_token); 2542 return (error); 2543 } 2544 2545 /* 2546 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2547 */ 2548 static 2549 int 2550 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2551 { 2552 struct hammer_transaction trans; 2553 struct hammer_inode *dip; 2554 hammer_mount_t hmp; 2555 int error; 2556 2557 dip = VTOI(ap->a_dvp); 2558 hmp = dip->hmp; 2559 2560 if (hammer_nohistory(dip) == 0 && 2561 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2562 return (error); 2563 } 2564 2565 lwkt_gettoken(&hmp->fs_token); 2566 hammer_start_transaction(&trans, hmp); 2567 ++hammer_stats_file_iopsw; 2568 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2569 ap->a_cred, ap->a_flags, -1); 2570 hammer_done_transaction(&trans); 2571 lwkt_reltoken(&hmp->fs_token); 2572 2573 return (error); 2574 } 2575 2576 /* 2577 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2578 */ 2579 static 2580 int 2581 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2582 { 2583 struct hammer_inode *ip = ap->a_vp->v_data; 2584 hammer_mount_t hmp = ip->hmp; 2585 int error; 2586 2587 ++hammer_stats_file_iopsr; 2588 lwkt_gettoken(&hmp->fs_token); 2589 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2590 ap->a_fflag, ap->a_cred); 2591 lwkt_reltoken(&hmp->fs_token); 2592 return (error); 2593 } 2594 2595 static 2596 int 2597 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2598 { 2599 static const struct mountctl_opt extraopt[] = { 2600 { HMNT_NOHISTORY, "nohistory" }, 2601 { HMNT_MASTERID, "master" }, 2602 { 0, NULL} 2603 2604 }; 2605 struct hammer_mount *hmp; 2606 struct mount *mp; 2607 
int usedbytes; 2608 int error; 2609 2610 error = 0; 2611 usedbytes = 0; 2612 mp = ap->a_head.a_ops->head.vv_mount; 2613 KKASSERT(mp->mnt_data != NULL); 2614 hmp = (struct hammer_mount *)mp->mnt_data; 2615 2616 lwkt_gettoken(&hmp->fs_token); 2617 2618 switch(ap->a_op) { 2619 case MOUNTCTL_SET_EXPORT: 2620 if (ap->a_ctllen != sizeof(struct export_args)) 2621 error = EINVAL; 2622 else 2623 error = hammer_vfs_export(mp, ap->a_op, 2624 (const struct export_args *)ap->a_ctl); 2625 break; 2626 case MOUNTCTL_MOUNTFLAGS: 2627 { 2628 /* 2629 * Call standard mountctl VOP function 2630 * so we get user mount flags. 2631 */ 2632 error = vop_stdmountctl(ap); 2633 if (error) 2634 break; 2635 2636 usedbytes = *ap->a_res; 2637 2638 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2639 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2640 ap->a_buf, 2641 ap->a_buflen - usedbytes, 2642 &error); 2643 } 2644 2645 *ap->a_res += usedbytes; 2646 break; 2647 } 2648 default: 2649 error = vop_stdmountctl(ap); 2650 break; 2651 } 2652 lwkt_reltoken(&hmp->fs_token); 2653 return(error); 2654 } 2655 2656 /* 2657 * hammer_vop_strategy { vp, bio } 2658 * 2659 * Strategy call, used for regular file read & write only. Note that the 2660 * bp may represent a cluster. 2661 * 2662 * To simplify operation and allow better optimizations in the future, 2663 * this code does not make any assumptions with regards to buffer alignment 2664 * or size. 
2665 */ 2666 static 2667 int 2668 hammer_vop_strategy(struct vop_strategy_args *ap) 2669 { 2670 struct buf *bp; 2671 int error; 2672 2673 bp = ap->a_bio->bio_buf; 2674 2675 switch(bp->b_cmd) { 2676 case BUF_CMD_READ: 2677 error = hammer_vop_strategy_read(ap); 2678 break; 2679 case BUF_CMD_WRITE: 2680 error = hammer_vop_strategy_write(ap); 2681 break; 2682 default: 2683 bp->b_error = error = EINVAL; 2684 bp->b_flags |= B_ERROR; 2685 biodone(ap->a_bio); 2686 break; 2687 } 2688 2689 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2690 2691 return (error); 2692 } 2693 2694 /* 2695 * Read from a regular file. Iterate the related records and fill in the 2696 * BIO/BUF. Gaps are zero-filled. 2697 * 2698 * The support code in hammer_object.c should be used to deal with mixed 2699 * in-memory and on-disk records. 2700 * 2701 * NOTE: Can be called from the cluster code with an oversized buf. 2702 * 2703 * XXX atime update 2704 */ 2705 static 2706 int 2707 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2708 { 2709 struct hammer_transaction trans; 2710 struct hammer_inode *ip; 2711 struct hammer_inode *dip; 2712 hammer_mount_t hmp; 2713 struct hammer_cursor cursor; 2714 hammer_base_elm_t base; 2715 hammer_off_t disk_offset; 2716 struct bio *bio; 2717 struct bio *nbio; 2718 struct buf *bp; 2719 int64_t rec_offset; 2720 int64_t ran_end; 2721 int64_t tmp64; 2722 int error; 2723 int boff; 2724 int roff; 2725 int n; 2726 int isdedupable; 2727 2728 bio = ap->a_bio; 2729 bp = bio->bio_buf; 2730 ip = ap->a_vp->v_data; 2731 hmp = ip->hmp; 2732 2733 /* 2734 * The zone-2 disk offset may have been set by the cluster code via 2735 * a BMAP operation, or else should be NOOFFSET. 2736 * 2737 * Checking the high bits for a match against zone-2 should suffice. 2738 * 2739 * In cases where a lot of data duplication is present it may be 2740 * more beneficial to drop through and doubule-buffer through the 2741 * device. 
2742 */ 2743 nbio = push_bio(bio); 2744 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2745 HAMMER_ZONE_LARGE_DATA) { 2746 if (hammer_double_buffer == 0) { 2747 lwkt_gettoken(&hmp->fs_token); 2748 error = hammer_io_direct_read(hmp, nbio, NULL); 2749 lwkt_reltoken(&hmp->fs_token); 2750 return (error); 2751 } 2752 2753 /* 2754 * Try to shortcut requests for double_buffer mode too. 2755 * Since this mode runs through the device buffer cache 2756 * only compatible buffer sizes (meaning those generated 2757 * by normal filesystem buffers) are legal. 2758 */ 2759 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) { 2760 error = hammer_io_indirect_read(hmp, nbio, NULL); 2761 return (error); 2762 } 2763 } 2764 2765 /* 2766 * Well, that sucked. Do it the hard way. If all the stars are 2767 * aligned we may still be able to issue a direct-read. 2768 */ 2769 lwkt_gettoken(&hmp->fs_token); 2770 hammer_simple_transaction(&trans, hmp); 2771 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2772 2773 /* 2774 * Key range (begin and end inclusive) to scan. Note that the key's 2775 * stored in the actual records represent BASE+LEN, not BASE. The 2776 * first record containing bio_offset will have a key > bio_offset. 
2777 */ 2778 cursor.key_beg.localization = ip->obj_localization + 2779 HAMMER_LOCALIZE_MISC; 2780 cursor.key_beg.obj_id = ip->obj_id; 2781 cursor.key_beg.create_tid = 0; 2782 cursor.key_beg.delete_tid = 0; 2783 cursor.key_beg.obj_type = 0; 2784 cursor.key_beg.key = bio->bio_offset + 1; 2785 cursor.asof = ip->obj_asof; 2786 cursor.flags |= HAMMER_CURSOR_ASOF; 2787 2788 cursor.key_end = cursor.key_beg; 2789 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2790 #if 0 2791 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2792 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2793 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2794 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2795 } else 2796 #endif 2797 { 2798 ran_end = bio->bio_offset + bp->b_bufsize; 2799 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2800 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2801 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2802 if (tmp64 < ran_end) 2803 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2804 else 2805 cursor.key_end.key = ran_end + MAXPHYS + 1; 2806 } 2807 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2808 2809 /* 2810 * Set NOSWAPCACHE for cursor data extraction if double buffering 2811 * is disabled or (if the file is not marked cacheable via chflags 2812 * and vm.swapcache_use_chflags is enabled). 2813 */ 2814 if (hammer_double_buffer == 0 || 2815 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 && 2816 vm_swapcache_use_chflags)) { 2817 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE; 2818 } 2819 2820 error = hammer_ip_first(&cursor); 2821 boff = 0; 2822 2823 while (error == 0) { 2824 /* 2825 * Get the base file offset of the record. The key for 2826 * data records is (base + bytes) rather then (base). 2827 */ 2828 base = &cursor.leaf->base; 2829 rec_offset = base->key - cursor.leaf->data_len; 2830 2831 /* 2832 * Calculate the gap, if any, and zero-fill it. 2833 * 2834 * n is the offset of the start of the record verses our 2835 * current seek offset in the bio. 
2836 */ 2837 n = (int)(rec_offset - (bio->bio_offset + boff)); 2838 if (n > 0) { 2839 if (n > bp->b_bufsize - boff) 2840 n = bp->b_bufsize - boff; 2841 bzero((char *)bp->b_data + boff, n); 2842 boff += n; 2843 n = 0; 2844 } 2845 2846 /* 2847 * Calculate the data offset in the record and the number 2848 * of bytes we can copy. 2849 * 2850 * There are two degenerate cases. First, boff may already 2851 * be at bp->b_bufsize. Secondly, the data offset within 2852 * the record may exceed the record's size. 2853 */ 2854 roff = -n; 2855 rec_offset += roff; 2856 n = cursor.leaf->data_len - roff; 2857 if (n <= 0) { 2858 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2859 n = 0; 2860 } else if (n > bp->b_bufsize - boff) { 2861 n = bp->b_bufsize - boff; 2862 } 2863 2864 /* 2865 * Deal with cached truncations. This cool bit of code 2866 * allows truncate()/ftruncate() to avoid having to sync 2867 * the file. 2868 * 2869 * If the frontend is truncated then all backend records are 2870 * subject to the frontend's truncation. 2871 * 2872 * If the backend is truncated then backend records on-disk 2873 * (but not in-memory) are subject to the backend's 2874 * truncation. In-memory records owned by the backend 2875 * represent data written after the truncation point on the 2876 * backend and must not be truncated. 2877 * 2878 * Truncate operations deal with frontend buffer cache 2879 * buffers and frontend-owned in-memory records synchronously. 
2880 */ 2881 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2882 if (hammer_cursor_ondisk(&cursor)/* || 2883 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2884 if (ip->trunc_off <= rec_offset) 2885 n = 0; 2886 else if (ip->trunc_off < rec_offset + n) 2887 n = (int)(ip->trunc_off - rec_offset); 2888 } 2889 } 2890 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2891 if (hammer_cursor_ondisk(&cursor)) { 2892 if (ip->sync_trunc_off <= rec_offset) 2893 n = 0; 2894 else if (ip->sync_trunc_off < rec_offset + n) 2895 n = (int)(ip->sync_trunc_off - rec_offset); 2896 } 2897 } 2898 2899 /* 2900 * Try to issue a direct read into our bio if possible, 2901 * otherwise resolve the element data into a hammer_buffer 2902 * and copy. 2903 * 2904 * The buffer on-disk should be zerod past any real 2905 * truncation point, but may not be for any synthesized 2906 * truncation point from above. 2907 * 2908 * NOTE: disk_offset is only valid if the cursor data is 2909 * on-disk. 2910 */ 2911 disk_offset = cursor.leaf->data_offset + roff; 2912 isdedupable = (boff == 0 && n == bp->b_bufsize && 2913 hammer_cursor_ondisk(&cursor) && 2914 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2915 2916 if (isdedupable && hammer_double_buffer == 0) { 2917 /* 2918 * Direct read case 2919 */ 2920 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2921 HAMMER_ZONE_LARGE_DATA); 2922 nbio->bio_offset = disk_offset; 2923 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2924 if (hammer_live_dedup && error == 0) 2925 hammer_dedup_cache_add(ip, cursor.leaf); 2926 goto done; 2927 } else if (isdedupable) { 2928 /* 2929 * Async I/O case for reading from backing store 2930 * and copying the data to the filesystem buffer. 2931 * live-dedup has to verify the data anyway if it 2932 * gets a hit later so we can just add the entry 2933 * now. 
2934 */ 2935 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2936 HAMMER_ZONE_LARGE_DATA); 2937 nbio->bio_offset = disk_offset; 2938 if (hammer_live_dedup) 2939 hammer_dedup_cache_add(ip, cursor.leaf); 2940 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf); 2941 goto done; 2942 } else if (n) { 2943 error = hammer_ip_resolve_data(&cursor); 2944 if (error == 0) { 2945 if (hammer_live_dedup && isdedupable) 2946 hammer_dedup_cache_add(ip, cursor.leaf); 2947 bcopy((char *)cursor.data + roff, 2948 (char *)bp->b_data + boff, n); 2949 } 2950 } 2951 if (error) 2952 break; 2953 2954 /* 2955 * We have to be sure that the only elements added to the 2956 * dedup cache are those which are already on-media. 2957 */ 2958 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2959 hammer_dedup_cache_add(ip, cursor.leaf); 2960 2961 /* 2962 * Iterate until we have filled the request. 2963 */ 2964 boff += n; 2965 if (boff == bp->b_bufsize) 2966 break; 2967 error = hammer_ip_next(&cursor); 2968 } 2969 2970 /* 2971 * There may have been a gap after the last record 2972 */ 2973 if (error == ENOENT) 2974 error = 0; 2975 if (error == 0 && boff != bp->b_bufsize) { 2976 KKASSERT(boff < bp->b_bufsize); 2977 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2978 /* boff = bp->b_bufsize; */ 2979 } 2980 2981 /* 2982 * Disallow swapcache operation on the vnode buffer if double 2983 * buffering is enabled, the swapcache will get the data via 2984 * the block device buffer. 2985 */ 2986 if (hammer_double_buffer) 2987 bp->b_flags |= B_NOTMETA; 2988 2989 /* 2990 * Cleanup 2991 */ 2992 bp->b_resid = 0; 2993 bp->b_error = error; 2994 if (error) 2995 bp->b_flags |= B_ERROR; 2996 biodone(ap->a_bio); 2997 2998 done: 2999 /* 3000 * Cache the b-tree node for the last data read in cache[1]. 
3001 * 3002 * If we hit the file EOF then also cache the node in the 3003 * governing director's cache[3], it will be used to initialize 3004 * the inode's cache[1] for any inodes looked up via the directory. 3005 * 3006 * This doesn't reduce disk accesses since the B-Tree chain is 3007 * likely cached, but it does reduce cpu overhead when looking 3008 * up file offsets for cpdup/tar/cpio style iterations. 3009 */ 3010 if (cursor.node) 3011 hammer_cache_node(&ip->cache[1], cursor.node); 3012 if (ran_end >= ip->ino_data.size) { 3013 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 3014 ip->obj_asof, ip->obj_localization); 3015 if (dip) { 3016 hammer_cache_node(&dip->cache[3], cursor.node); 3017 hammer_rel_inode(dip, 0); 3018 } 3019 } 3020 hammer_done_cursor(&cursor); 3021 hammer_done_transaction(&trans); 3022 lwkt_reltoken(&hmp->fs_token); 3023 return(error); 3024 } 3025 3026 /* 3027 * BMAP operation - used to support cluster_read() only. 3028 * 3029 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 3030 * 3031 * This routine may return EOPNOTSUPP if the opration is not supported for 3032 * the specified offset. The contents of the pointer arguments do not 3033 * need to be initialized in that case. 3034 * 3035 * If a disk address is available and properly aligned return 0 with 3036 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 3037 * to the run-length relative to that offset. Callers may assume that 3038 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 3039 * large, so return EOPNOTSUPP if it is not sufficiently large. 
*/
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;
	int64_t base_disk_offset;
	int64_t last_offset;
	hammer_off_t last_disk_offset;
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * When a backward run is requested (a_runb != NULL) the scan
	 * starts up to MAXPHYS before the requested offset, clamped to
	 * key 0.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	/*
	 * The end key is clamped to the maximum key if adding the
	 * MAXPHYS slop overflows.
	 */
	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	/*
	 * [base_offset, last_offset) tracks the current contiguous run
	 * in file space; base_disk_offset / last_disk_offset track the
	 * matching range in disk space.
	 */
	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 *
		 * Only on-disk records contribute to the run.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}
		
		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	/*
	 * Remember the B-Tree node we ended on for the next lookup.
	 */
	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.  The run lengths are byte counts relative
		 * to a_loffset: *runb backwards to base_offset, *runp
		 * forwards to last_offset.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.   Because this is a strategy call the OS is
 * trying to actually get data onto the media.
*/
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The buffer must be exactly one HAMMER block for its offset.
	 */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Writes to a read-only inode are completed immediately with
	 * EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 *
	 * The buffer is completed successfully without writing anything.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 *
	 * In the small-file case the size is rounded up to a multiple
	 * of 16 bytes.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		/*
		 * A record flagged HAMMER_RECF_DEDUPED is completed here
		 * without issuing a direct write.
		 */
		if (record->flags & HAMMER_RECF_DEDUPED) {
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/*
		 * Kick off an inode flush once the mount-wide reserved
		 * record count exceeds hammer_limit_recs.
		 */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/*
		 * No record could be set up; fail the buffer with the
		 * error reported by hammer_ip_add_bulk().
		 */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, 
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
3468 * To avoid a deadlock with the flusher we must release the inode 3469 * lock on the directory when acquiring the inode for the entry. 3470 * 3471 * If the target is a directory, it must be empty. 3472 */ 3473 if (error == 0) { 3474 hammer_unlock(&cursor.ip->lock); 3475 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3476 hmp->asof, 3477 cursor.data->entry.localization, 3478 0, &error); 3479 hammer_lock_sh(&cursor.ip->lock); 3480 if (error == ENOENT) { 3481 kprintf("HAMMER: WARNING: Removing " 3482 "dirent w/missing inode \"%s\"\n" 3483 "\tobj_id = %016llx\n", 3484 ncp->nc_name, 3485 (long long)cursor.data->entry.obj_id); 3486 error = 0; 3487 } 3488 3489 /* 3490 * If isdir >= 0 we validate that the entry is or is not a 3491 * directory. If isdir < 0 we don't care. 3492 */ 3493 if (error == 0 && isdir >= 0 && ip) { 3494 if (isdir && 3495 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3496 error = ENOTDIR; 3497 } else if (isdir == 0 && 3498 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3499 error = EISDIR; 3500 } 3501 } 3502 3503 /* 3504 * If we are trying to remove a directory the directory must 3505 * be empty. 3506 * 3507 * The check directory code can loop and deadlock/retry. Our 3508 * own cursor's node locks must be released to avoid a 3-way 3509 * deadlock with the flusher if the check directory code 3510 * blocks. 3511 * 3512 * If any changes whatsoever have been made to the cursor 3513 * set EDEADLK and retry. 3514 * 3515 * WARNING: See warnings in hammer_unlock_cursor() 3516 * function. 3517 */ 3518 if (error == 0 && ip && ip->ino_data.obj_type == 3519 HAMMER_OBJTYPE_DIRECTORY) { 3520 hammer_unlock_cursor(&cursor); 3521 error = hammer_ip_check_directory_empty(trans, ip); 3522 hammer_lock_cursor(&cursor); 3523 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3524 kprintf("HAMMER: Warning: avoided deadlock " 3525 "on rmdir '%s'\n", 3526 ncp->nc_name); 3527 error = EDEADLK; 3528 } 3529 } 3530 3531 /* 3532 * Delete the directory entry. 
3533 * 3534 * WARNING: hammer_ip_del_directory() may have to terminate 3535 * the cursor to avoid a deadlock. It is ok to call 3536 * hammer_done_cursor() twice. 3537 */ 3538 if (error == 0) { 3539 error = hammer_ip_del_directory(trans, &cursor, 3540 dip, ip); 3541 } 3542 hammer_done_cursor(&cursor); 3543 if (error == 0) { 3544 cache_setunresolved(nch); 3545 cache_setvp(nch, NULL); 3546 3547 /* 3548 * NOTE: ip->vp, if non-NULL, cannot be directly 3549 * referenced without formally acquiring the 3550 * vp since the vp might have zero refs on it, 3551 * or in the middle of a reclaim, etc. 3552 * 3553 * NOTE: The cache_setunresolved() can rip the vp 3554 * out from under us since the vp may not have 3555 * any refs, in which case ip->vp will be NULL 3556 * from the outset. 3557 */ 3558 while (ip && ip->vp) { 3559 struct vnode *vp; 3560 3561 error = hammer_get_vnode(ip, &vp); 3562 if (error == 0 && vp) { 3563 vn_unlock(vp); 3564 hammer_knote(ip->vp, NOTE_DELETE); 3565 cache_inval_vp(ip->vp, CINV_DESTROY); 3566 vrele(vp); 3567 break; 3568 } 3569 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3570 } 3571 } 3572 if (ip) 3573 hammer_rel_inode(ip, 0); 3574 } else { 3575 hammer_done_cursor(&cursor); 3576 } 3577 if (error == EDEADLK) 3578 goto retry; 3579 3580 return (error); 3581 } 3582 3583 /************************************************************************ 3584 * FIFO AND SPECFS OPS * 3585 ************************************************************************ 3586 * 3587 */ 3588 static int 3589 hammer_vop_fifoclose (struct vop_close_args *ap) 3590 { 3591 /* XXX update itimes */ 3592 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3593 } 3594 3595 static int 3596 hammer_vop_fiforead (struct vop_read_args *ap) 3597 { 3598 int error; 3599 3600 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3601 /* XXX update access time */ 3602 return (error); 3603 } 3604 3605 static int 3606 hammer_vop_fifowrite (struct vop_write_args *ap) 3607 { 3608 int error; 3609 3610 error = 
VOCALL(&fifo_vnode_vops, &ap->a_head); 3611 /* XXX update access time */ 3612 return (error); 3613 } 3614 3615 static 3616 int 3617 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3618 { 3619 int error; 3620 3621 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3622 if (error) 3623 error = hammer_vop_kqfilter(ap); 3624 return(error); 3625 } 3626 3627 /************************************************************************ 3628 * KQFILTER OPS * 3629 ************************************************************************ 3630 * 3631 */ 3632 static void filt_hammerdetach(struct knote *kn); 3633 static int filt_hammerread(struct knote *kn, long hint); 3634 static int filt_hammerwrite(struct knote *kn, long hint); 3635 static int filt_hammervnode(struct knote *kn, long hint); 3636 3637 static struct filterops hammerread_filtops = 3638 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3639 static struct filterops hammerwrite_filtops = 3640 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3641 static struct filterops hammervnode_filtops = 3642 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3643 3644 static 3645 int 3646 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3647 { 3648 struct vnode *vp = ap->a_vp; 3649 struct knote *kn = ap->a_kn; 3650 3651 switch (kn->kn_filter) { 3652 case EVFILT_READ: 3653 kn->kn_fop = &hammerread_filtops; 3654 break; 3655 case EVFILT_WRITE: 3656 kn->kn_fop = &hammerwrite_filtops; 3657 break; 3658 case EVFILT_VNODE: 3659 kn->kn_fop = &hammervnode_filtops; 3660 break; 3661 default: 3662 return (EOPNOTSUPP); 3663 } 3664 3665 kn->kn_hook = (caddr_t)vp; 3666 3667 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3668 3669 return(0); 3670 } 3671 3672 static void 3673 filt_hammerdetach(struct knote *kn) 3674 { 3675 struct vnode *vp = (void *)kn->kn_hook; 3676 3677 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3678 } 3679 3680 static int 3681 filt_hammerread(struct knote *kn, long hint) 3682 
{ 3683 struct vnode *vp = (void *)kn->kn_hook; 3684 hammer_inode_t ip = VTOI(vp); 3685 hammer_mount_t hmp = ip->hmp; 3686 off_t off; 3687 3688 if (hint == NOTE_REVOKE) { 3689 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3690 return(1); 3691 } 3692 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3693 off = ip->ino_data.size - kn->kn_fp->f_offset; 3694 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3695 lwkt_reltoken(&hmp->fs_token); 3696 if (kn->kn_sfflags & NOTE_OLDAPI) 3697 return(1); 3698 return (kn->kn_data != 0); 3699 } 3700 3701 static int 3702 filt_hammerwrite(struct knote *kn, long hint) 3703 { 3704 if (hint == NOTE_REVOKE) 3705 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3706 kn->kn_data = 0; 3707 return (1); 3708 } 3709 3710 static int 3711 filt_hammervnode(struct knote *kn, long hint) 3712 { 3713 if (kn->kn_sfflags & hint) 3714 kn->kn_fflags |= hint; 3715 if (hint == NOTE_REVOKE) { 3716 kn->kn_flags |= (EV_EOF | EV_NODATA); 3717 return (1); 3718 } 3719 return (kn->kn_fflags != 0); 3720 } 3721 3722