1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

/* fifo (named pipe) variants which wrap the fifofs implementations */
static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

/*
 * Vnode operations vector for regular files and directories.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

/*
 * Vnode operations vector for special files (character/block devices).
 * Reads and writes are rejected; attribute ops go through HAMMER.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

/*
 * Vnode operations vector for fifos.  Data ops are forwarded to fifofs
 * wrappers, everything else defaults to fifo_vnoperate.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

/*
 * Post a kqueue notification on the vnode, but only if any event flags
 * are actually set (avoids a useless KNOTE call).
 */
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			struct vnode *dvp, struct ucred *cred,
			int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
/* NOTE(review): dead code — missing the 'ap' parameter name it uses below */
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occured after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it aint
 *	 here yet.  And, in anycase, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	/* fs_token is held across the whole operation */
	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 *
	 * hammer_fsync_mode selects the policy; modes 2/3 require the
	 * volume to be at least version 4 for REDO support and fall back
	 * to the non-REDO modes 0/1 otherwise.
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence.
	 *
	 * Attempt to release the vnode while waiting for the inode to
	 * finish flushing.  This can really mess up inactive->reclaim
	 * sequences so only do it if the vnode is active.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
			vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
			vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	lwkt_reltoken(&hmp->fs_token);
	/* report any error recorded on the inode during the flush */
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (for the cache safe does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_fstoken;	/* non-zero once fs_token/transaction acquired */
	size_t resid;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	got_fstoken = 0;
	uio = ap->a_uio;

	/*
	 * Attempt to shortcut directly to the VM object using lwbufs.
	 * This is much faster than instantiating buffer cache buffers.
	 */
	resid = uio->uio_resid;
	error = vop_helper_read_shortcut(ap);
	hammer_stats_file_read += resid - uio->uio_resid;
	if (error)
		return (error);
	if (uio->uio_resid == 0)
		goto finished;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
			/* fully cached buffer, no read required */
			bp->b_flags &= ~B_AGE;
			error = 0;
			goto skip;
		}
		if (ap->a_ioflag & IO_NRDELAY) {
			/* caller asked for non-blocking read; don't wait */
			bqrelse(bp);
			return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE
		 */
		if (got_fstoken == 0) {
			lwkt_gettoken(&hmp->fs_token);
			got_fstoken = 1;
			hammer_start_transaction(&trans, ip->hmp);
		}

		/*
		 * NOTE: A valid bp has already been acquired, but was not
		 *	 B_CACHE.
		 */
		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_readx(ap->a_vp,
					      file_limit, base_offset,
					      blksize, uio->uio_resid,
					      seqcount * BKVASIZE, &bp);
		} else {
			error = breadnx(ap->a_vp, base_offset, blksize,
					NULL, NULL, 0, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			kprintf("doff %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		/* clamp copy size to the block, the uio, and the file EOF */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		if (got_fstoken)
			lwkt_reltoken(&hmp->fs_token);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
		bqrelse(bp);

		if (got_fstoken)
			lwkt_gettoken(&hmp->fs_token);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

finished:

	/*
	 * Try to update the atime with just the inode lock for maximum
	 * concurrency.  If we can't shortcut it we have to get the full
	 * blown transaction.
	 */
	if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
		lwkt_gettoken(&hmp->fs_token);
		got_fstoken = 1;
		hammer_start_transaction(&trans, ip->hmp);
	}

	if (got_fstoken) {
		/* only record atime on read-write mounts without noatime */
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
	}
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	thread_t td;
	struct uio *uio;
	int offset;
	off_t base_offset;
	int64_t cluster_eof;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 *	 I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		/* offset + resid wrapped past 2^63-1 */
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}

	/* enforce the process file-size resource limit (SIGXFSZ) */
	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
		}

#if 0
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster then the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
#endif

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			/* extending the file; "trivial" if no hole is left */
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			/* drop fs_token across the user copy to avoid stalls */
			lwkt_reltoken(&hmp->fs_token);
			error = uiomovebp(bp, bp->b_data + offset, n, uio);
			lwkt_gettoken(&hmp->fs_token);
		}

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceed the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size),
					  0);
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 *	     expected to not blow up if it encounters
		 *	     buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 *	  However, failing to flush a dirty buffer out when
		 *	  issued from the pageout daemon can result in a low
		 *	  memory deadlock against bio_page_alloc(), so we
		 *	  have to bawrite() on IO_ASYNC as well.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.
		 *	  However, we must honor IO_SYNC in case someone is
		 *	  silly enough to configure a HAMMER file as swap, or
		 *	  when HAMMER is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		/* pick the write strategy based on the caller's ioflags */
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer_cluster_enable &&
		    !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
			if (base_offset < HAMMER_XDEMARC)
				cluster_eof = hammer_blockdemarc(base_offset,
							 ip->ino_data.size);
			else
				cluster_eof = ip->ino_data.size;
			cluster_write(bp, cluster_eof, blksize, seqcount);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * Check access permissions against the inode's uid/gid/mode/uflags.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * Advisory (posix/flock) locking, delegated to the generic lockf code.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	/* sync-on-close support, currently disabled (see comment above) */
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			/* resolve the namecache entry to the new vnode */
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
1053 * 1054 * We also want the fsid to be the same when comparing snapshots, 1055 * or when comparing mirrors (which might be backed by different 1056 * physical devices). HAMMER fsids are based on the PFS's 1057 * shared_uuid field. 1058 * 1059 * XXX there is a chance of collision here. The va_fsid reported 1060 * by stat is different from the more involved fsid used in the 1061 * mount structure. 1062 */ 1063 ++hammer_stats_file_iopsr; 1064 hammer_lock_sh(&ip->lock); 1065 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1066 (u_int32_t)(ip->obj_asof >> 32); 1067 1068 vap->va_fileid = ip->ino_leaf.base.obj_id; 1069 vap->va_mode = ip->ino_data.mode; 1070 vap->va_nlink = ip->ino_data.nlinks; 1071 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1072 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1073 vap->va_rmajor = 0; 1074 vap->va_rminor = 0; 1075 vap->va_size = ip->ino_data.size; 1076 1077 /* 1078 * Special case for @@PFS softlinks. The actual size of the 1079 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1080 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1081 */ 1082 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1083 ip->ino_data.size == 10 && 1084 ip->obj_asof == HAMMER_MAX_TID && 1085 ip->obj_localization == 0 && 1086 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1087 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1088 vap->va_size = 26; 1089 else 1090 vap->va_size = 10; 1091 } 1092 1093 /* 1094 * We must provide a consistent atime and mtime for snapshots 1095 * so people can do a 'tar cf - ... | md5' on them and get 1096 * consistent results. 
1097 */ 1098 if (ip->flags & HAMMER_INODE_RO) { 1099 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1100 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1101 } else { 1102 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1103 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1104 } 1105 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1106 vap->va_flags = ip->ino_data.uflags; 1107 vap->va_gen = 1; /* hammer inums are unique for all time */ 1108 vap->va_blocksize = HAMMER_BUFSIZE; 1109 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1110 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1111 ~HAMMER_XBUFMASK64; 1112 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1113 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1114 ~HAMMER_BUFMASK64; 1115 } else { 1116 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1117 } 1118 1119 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1120 vap->va_filerev = 0; /* XXX */ 1121 vap->va_uid_uuid = ip->ino_data.uid; 1122 vap->va_gid_uuid = ip->ino_data.gid; 1123 vap->va_fsid_uuid = ip->hmp->fsid; 1124 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1125 VA_FSID_UUID_VALID; 1126 1127 switch (ip->ino_data.obj_type) { 1128 case HAMMER_OBJTYPE_CDEV: 1129 case HAMMER_OBJTYPE_BDEV: 1130 vap->va_rmajor = ip->ino_data.rmajor; 1131 vap->va_rminor = ip->ino_data.rminor; 1132 break; 1133 default: 1134 break; 1135 } 1136 hammer_unlock(&ip->lock); 1137 return(0); 1138 } 1139 1140 /* 1141 * hammer_vop_nresolve { nch, dvp, cred } 1142 * 1143 * Locate the requested directory entry. 
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan the name for a "@@" as-of extension and, if found, strip
	 * it off (nlen is truncated to the component before the '@@').
	 *
	 * NOTE(review): at i == nlen - 1 the nc_name[i+1] access reads one
	 * byte past the component length; presumably nc_name is
	 * NUL-terminated so the read is safe and cannot match '@' --
	 * TODO confirm against the namecache contract.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* Negative-cache the missing entry */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != hmp->asof) {
			/*
			 * Root of an as-of view: re-resolve the same
			 * directory at the mount point's asof, synthesizing
			 * a fake "0x%016llx" name (18 chars + NUL == 19).
			 */
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 *
 * Create a hard link to an existing inode.  Cross-mount and
 * cross-localization (cross-PFS) links are rejected with EXDEV.
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * Reject write opens of read-only (historical/as-of) inodes, otherwise
 * defer to the standard open.
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	/*
	 * If the caller wants seek cookies, size the array from the
	 * request (roughly one cookie per 16 bytes of uio space), capped
	 * at 1024 entries.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less then that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* ".." falls back to "." at the filesystem root */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		/* Nothing transferred: free the cookie array on error */
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		lwkt_reltoken(&hmp->fs_token);
		return(error);
	}

	/*
	 * Long version: the symlink target lives in a separate
	 * HAMMER_RECTYPE_FIX record which must be looked up via a cursor.
	 */
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0,
				0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/* Cross-mount and cross-PFS renames are not supported */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	hmp = ip->hmp;

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 *
	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
	 *	 without formally acquiring the vp since the vp might
	 *	 have zero refs on it, or in the middle of a reclaim,
	 *	 etc.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		while (ip->vp) {
			struct vnode *vp;

			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_markatime { vp, cred }
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	hmp = ip->hmp;
	if (hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);

	++hammer_stats_file_iopsw;

	ip->ino_data.atime = trans.time;
	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attribute changes requested in vap to the inode: file
 * flags (chflags), ownership (chown), size (truncate/extend), times,
 * and mode (chmod).  VNOVAL fields in vap are left untouched.  All
 * changes are accumulated in modflags and applied via
 * hammer_modify_inode() at the end; kqueue notes are accumulated in
 * kflags and posted on the vnode.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags.  Note that a flags change is exclusive of the other
	 * attribute changes -- we go straight to done afterwards.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					hammer_to_unix_xid(&ip->ino_data.uid),
					ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}

	/*
	 * Other attribute changes are refused on immutable/append-only
	 * files.
	 */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}

	/*
	 * chown.  uid/gid are stored as uuids on-media, so convert the
	 * unix ids returned by the helper before comparing/storing.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}

	/*
	 * Resize.  The while() is really a one-shot construct used so
	 * the switch cases can break out of it; the trailing break
	 * guarantees a single iteration.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations is turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size),
					   0);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n",
							(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				aligned_size -= blksize;
				hammer_ip_frontend_trunc(ip, aligned_size);
			}
#endif
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}

	/*
	 * utimes / chmod.
	 */
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(&trans, ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	hammer_record_t record;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0-terminated.
	 *
	 * Short targets fit directly in the inode's embedded symlink
	 * area; longer targets require a separate FIX record.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
 *
 * Create or destroy a whiteout entry.  Delegates to hammer_dounlink()
 * with isdir = -1 and the caller-supplied flags.
 */
static
int
hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
				ap->a_cred, ap->a_flags, -1);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	return (error);
}

/*
 * hammer_vop_ioctl { vp, command, data, fflag, cred }
 *
 * Thin wrapper around hammer_ioctl(), run while holding the
 * filesystem token.
 */
static
int
hammer_vop_ioctl(struct vop_ioctl_args *ap)
{
	struct hammer_inode *ip = ap->a_vp->v_data;
	hammer_mount_t hmp = ip->hmp;
	int error;

	++hammer_stats_file_iopsr;
	lwkt_gettoken(&hmp->fs_token);
	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
			     ap->a_fflag, ap->a_cred);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_mountctl
 *
 * Handle mount control operations: export configuration and reporting
 * of HAMMER-specific mount flags; everything else falls through to
 * vop_stdmountctl().
 */
static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	static const struct mountctl_opt extraopt[] = {
		{ HMNT_NOHISTORY,	"nohistory" },
		{ HMNT_MASTERID,	"master" },
		{ 0, NULL}

	};
	struct hammer_mount *hmp;
	struct mount *mp;
	int usedbytes;
	int error;

	error = 0;
	usedbytes = 0;
	mp = ap->a_head.a_ops->head.vv_mount;
	KKASSERT(mp->mnt_data != NULL);
	hmp = (struct hammer_mount *)mp->mnt_data;

	lwkt_gettoken(&hmp->fs_token);

	switch(ap->a_op) {
	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
		break;
	case MOUNTCTL_MOUNTFLAGS:
	{
		/*
		 * Call standard mountctl VOP function
		 * so we get user mount flags.
		 */
		error = vop_stdmountctl(ap);
		if (error)
			break;

		usedbytes = *ap->a_res;

		/*
		 * Append the HAMMER-specific flag strings (extraopt) to
		 * whatever the standard handler already placed in the
		 * buffer, if there is room.
		 */
		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
						    ap->a_buf,
						    ap->a_buflen - usedbytes,
						    &error);
		}

		*ap->a_res += usedbytes;
		break;
	}
	default:
		error = vop_stdmountctl(ap);
		break;
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_strategy { vp, bio }
 *
 * Strategy call, used for regular file read & write only.  Note that the
 * bp may represent a cluster.
 *
 * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regards to buffer alignment
 * or size.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	/* dispatch on the buffer command; only read/write are valid */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}

	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */

	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;
	int roff;
	int n;
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			return (error);
		}
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled or (if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled).
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers may assume that
 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 * large, so return EOPNOTSUPP if it is not sufficiently large.
 */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;
	int64_t base_disk_offset;
	int64_t last_offset;
	hammer_off_t last_disk_offset;
	hammer_off_t disk_offset;
	int rec_len;
	int error;
	int blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * If a backwards run (runb) was requested, start the scan MAXPHYS
	 * before the requested offset so the run-length behind the offset
	 * can be computed as well.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.  Because this is a strategy call the OS is
 * trying to actually get data onto the media.
 *
 * A bulk in-memory record is installed to track the reserved storage
 * until the flusher commits it to the media.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The buffer handed to us must exactly match the block size
	 * for its file offset.
	 */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Refuse writes to read-only inodes; fail the BIO with EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			/*
			 * Deduplicated data needs no media write; just
			 * install the replacement record and complete
			 * the BIO.
			 */
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/*
		 * Reservation failed; fail the BIO with the returned
		 * error.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 *
 * trans - active transaction
 * nch	 - namecache handle for the entry being removed
 * dvp	 - vnode of the parent directory
 * cred	 - caller's credentials (unused in the visible logic)
 * flags - namecache operation flags (unused in the visible logic)
 * isdir - 1: target must be a directory, 0: must not be a directory,
 *	   <0: don't care.
 *
 * Returns 0 on success or an errno.
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen,
	    error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	/* Re-entered via goto when a deadlock (EDEADLK) forces a restart */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * A dirent whose inode is gone is still removed;
			 * warn and continue with error cleared.
			 */
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
				HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/*
			 * Tell the namecache that we are now unlinked.
			 */
			cache_unlink(nch);

			/*
			 * NOTE: ip->vp, if non-NULL, cannot be directly
			 *	 referenced without formally acquiring the
			 *	 vp since the vp might have zero refs on it,
			 *	 or in the middle of a reclaim, etc.
			 *
			 * NOTE: The cache_setunresolved() can rip the vp
			 *	 out from under us since the vp may not have
			 *	 any refs, in which case ip->vp will be NULL
			 *	 from the outset.
			 */
			while (ip && ip->vp) {
				struct vnode *vp;

				error = hammer_get_vnode(ip, &vp);
				if (error == 0 && vp) {
					vn_unlock(vp);
					hammer_knote(ip->vp, NOTE_DELETE);
					cache_inval_vp(ip->vp, CINV_DESTROY);
					vrele(vp);
					break;
				}
				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 * These simply pass the operation through to the shared fifofs vnode
 * operations vector.
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
int error; 3622 3623 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3624 /* XXX update access time */ 3625 return (error); 3626 } 3627 3628 static 3629 int 3630 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3631 { 3632 int error; 3633 3634 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3635 if (error) 3636 error = hammer_vop_kqfilter(ap); 3637 return(error); 3638 } 3639 3640 /************************************************************************ 3641 * KQFILTER OPS * 3642 ************************************************************************ 3643 * 3644 */ 3645 static void filt_hammerdetach(struct knote *kn); 3646 static int filt_hammerread(struct knote *kn, long hint); 3647 static int filt_hammerwrite(struct knote *kn, long hint); 3648 static int filt_hammervnode(struct knote *kn, long hint); 3649 3650 static struct filterops hammerread_filtops = 3651 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3652 static struct filterops hammerwrite_filtops = 3653 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3654 static struct filterops hammervnode_filtops = 3655 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3656 3657 static 3658 int 3659 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3660 { 3661 struct vnode *vp = ap->a_vp; 3662 struct knote *kn = ap->a_kn; 3663 3664 switch (kn->kn_filter) { 3665 case EVFILT_READ: 3666 kn->kn_fop = &hammerread_filtops; 3667 break; 3668 case EVFILT_WRITE: 3669 kn->kn_fop = &hammerwrite_filtops; 3670 break; 3671 case EVFILT_VNODE: 3672 kn->kn_fop = &hammervnode_filtops; 3673 break; 3674 default: 3675 return (EOPNOTSUPP); 3676 } 3677 3678 kn->kn_hook = (caddr_t)vp; 3679 3680 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3681 3682 return(0); 3683 } 3684 3685 static void 3686 filt_hammerdetach(struct knote *kn) 3687 { 3688 struct vnode *vp = (void *)kn->kn_hook; 3689 3690 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3691 } 3692 3693 static int 3694 
filt_hammerread(struct knote *kn, long hint) 3695 { 3696 struct vnode *vp = (void *)kn->kn_hook; 3697 hammer_inode_t ip = VTOI(vp); 3698 hammer_mount_t hmp = ip->hmp; 3699 off_t off; 3700 3701 if (hint == NOTE_REVOKE) { 3702 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3703 return(1); 3704 } 3705 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3706 off = ip->ino_data.size - kn->kn_fp->f_offset; 3707 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3708 lwkt_reltoken(&hmp->fs_token); 3709 if (kn->kn_sfflags & NOTE_OLDAPI) 3710 return(1); 3711 return (kn->kn_data != 0); 3712 } 3713 3714 static int 3715 filt_hammerwrite(struct knote *kn, long hint) 3716 { 3717 if (hint == NOTE_REVOKE) 3718 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); 3719 kn->kn_data = 0; 3720 return (1); 3721 } 3722 3723 static int 3724 filt_hammervnode(struct knote *kn, long hint) 3725 { 3726 if (kn->kn_sfflags & hint) 3727 kn->kn_fflags |= hint; 3728 if (hint == NOTE_REVOKE) { 3729 kn->kn_flags |= (EV_EOF | EV_NODATA); 3730 return (1); 3731 } 3732 return (kn->kn_fflags != 0); 3733 } 3734 3735