1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vfs/fifofs/fifo.h> 50 51 #include <sys/mplock2.h> 52 53 #include "hammer.h" 54 55 /* 56 * USERFS VNOPS 57 */ 58 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 59 static int hammer_vop_fsync(struct vop_fsync_args *); 60 static int hammer_vop_read(struct vop_read_args *); 61 static int hammer_vop_write(struct vop_write_args *); 62 static int hammer_vop_access(struct vop_access_args *); 63 static int hammer_vop_advlock(struct vop_advlock_args *); 64 static int hammer_vop_close(struct vop_close_args *); 65 static int hammer_vop_ncreate(struct vop_ncreate_args *); 66 static int hammer_vop_getattr(struct vop_getattr_args *); 67 static int hammer_vop_nresolve(struct vop_nresolve_args *); 68 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 69 static int hammer_vop_nlink(struct vop_nlink_args *); 70 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 71 static int hammer_vop_nmknod(struct vop_nmknod_args *); 72 static int hammer_vop_open(struct vop_open_args *); 73 static int hammer_vop_print(struct vop_print_args *); 74 static int hammer_vop_readdir(struct vop_readdir_args *); 75 static int hammer_vop_readlink(struct vop_readlink_args *); 76 static int hammer_vop_nremove(struct vop_nremove_args *); 77 static int hammer_vop_nrename(struct vop_nrename_args *); 78 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 79 static int hammer_vop_markatime(struct vop_markatime_args *); 80 static int hammer_vop_setattr(struct vop_setattr_args *); 81 static int hammer_vop_strategy(struct vop_strategy_args *); 82 static int hammer_vop_bmap(struct vop_bmap_args *ap); 83 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 84 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 85 static int hammer_vop_ioctl(struct vop_ioctl_args *); 86 static int hammer_vop_mountctl(struct vop_mountctl_args *); 87 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 88 89 static int hammer_vop_fifoclose (struct vop_close_args *); 90 static int hammer_vop_fiforead (struct vop_read_args *); 91 static int hammer_vop_fifowrite (struct vop_write_args *); 92 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 93 94 struct vop_ops hammer_vnode_vops = { 95 .vop_default = vop_defaultop, 96 .vop_fsync = hammer_vop_fsync, 97 .vop_getpages = vop_stdgetpages, 98 .vop_putpages = vop_stdputpages, 99 .vop_read = hammer_vop_read, 100 .vop_write = hammer_vop_write, 101 .vop_access = hammer_vop_access, 102 .vop_advlock = hammer_vop_advlock, 103 .vop_close = hammer_vop_close, 104 .vop_ncreate = hammer_vop_ncreate, 105 .vop_getattr = hammer_vop_getattr, 106 .vop_inactive = hammer_vop_inactive, 107 .vop_reclaim = hammer_vop_reclaim, 108 .vop_nresolve = hammer_vop_nresolve, 109 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 110 .vop_nlink = hammer_vop_nlink, 111 .vop_nmkdir = hammer_vop_nmkdir, 112 .vop_nmknod = hammer_vop_nmknod, 113 .vop_open = hammer_vop_open, 114 .vop_pathconf = vop_stdpathconf, 115 .vop_print = hammer_vop_print, 116 .vop_readdir = hammer_vop_readdir, 117 .vop_readlink = hammer_vop_readlink, 118 .vop_nremove = hammer_vop_nremove, 119 .vop_nrename = hammer_vop_nrename, 120 .vop_nrmdir = hammer_vop_nrmdir, 121 .vop_markatime = hammer_vop_markatime, 122 .vop_setattr = hammer_vop_setattr, 123 .vop_bmap = hammer_vop_bmap, 124 .vop_strategy = hammer_vop_strategy, 125 .vop_nsymlink = hammer_vop_nsymlink, 126 .vop_nwhiteout = hammer_vop_nwhiteout, 127 .vop_ioctl = hammer_vop_ioctl, 128 .vop_mountctl = hammer_vop_mountctl, 129 .vop_kqfilter = hammer_vop_kqfilter 130 }; 131 132 struct vop_ops hammer_spec_vops = { 133 .vop_default = vop_defaultop, 134 .vop_fsync = hammer_vop_fsync, 135 .vop_read = vop_stdnoread, 136 .vop_write = vop_stdnowrite, 137 .vop_access = hammer_vop_access, 138 .vop_close = hammer_vop_close, 139 .vop_markatime = hammer_vop_markatime, 140 .vop_getattr = hammer_vop_getattr, 141 .vop_inactive = hammer_vop_inactive, 142 .vop_reclaim = hammer_vop_reclaim, 143 .vop_setattr = hammer_vop_setattr 144 }; 145 146 struct vop_ops hammer_fifo_vops = { 147 .vop_default = fifo_vnoperate, 148 .vop_fsync = hammer_vop_fsync, 149 .vop_read = hammer_vop_fiforead, 150 .vop_write = hammer_vop_fifowrite, 151 .vop_access = hammer_vop_access, 152 .vop_close = hammer_vop_fifoclose, 153 .vop_markatime = hammer_vop_markatime, 154 .vop_getattr = hammer_vop_getattr, 155 .vop_inactive = hammer_vop_inactive, 156 .vop_reclaim = hammer_vop_reclaim, 157 .vop_setattr = hammer_vop_setattr, 158 .vop_kqfilter = hammer_vop_fifokqfilter 159 }; 160 161 static __inline 162 void 163 hammer_knote(struct vnode *vp, int flags) 164 { 165 if (flags) 166 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); 167 } 168 169 #ifdef DEBUG_TRUNCATE 170 struct hammer_inode *HammerTruncIp; 171 #endif 172 173 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 174 struct vnode *dvp, struct ucred *cred, 175 int flags, int isdir); 176 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 177 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 178 179 #if 0 180 static 181 int 182 hammer_vop_vnoperate(struct vop_generic_args *) 183 { 184 return (VOCALL(&hammer_vnode_vops, ap)); 185 } 186 #endif 187 188 /* 189 * hammer_vop_fsync { vp, waitfor } 190 * 191 * fsync() an inode to disk and wait for it to be completely committed 192 * such that the information would not be undone if a crash occured after 193 * return. 194 * 195 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement 196 * a REDO log. A sysctl is provided to relax HAMMER's fsync() 197 * operation. 198 * 199 * Ultimately the combination of a REDO log and use of fast storage 200 * to front-end cluster caches will make fsync fast, but it aint 201 * here yet. And, in anycase, we need real transactional 202 * all-or-nothing features which are not restricted to a single file. 203 */ 204 static 205 int 206 hammer_vop_fsync(struct vop_fsync_args *ap) 207 { 208 hammer_inode_t ip = VTOI(ap->a_vp); 209 hammer_mount_t hmp = ip->hmp; 210 int waitfor = ap->a_waitfor; 211 int mode; 212 213 lwkt_gettoken(&hmp->fs_token); 214 215 /* 216 * Fsync rule relaxation (default is either full synchronous flush 217 * or REDO semantics with synchronous flush). 218 */ 219 if (ap->a_flags & VOP_FSYNC_SYSCALL) { 220 switch(hammer_fsync_mode) { 221 case 0: 222 mode0: 223 /* no REDO, full synchronous flush */ 224 goto skip; 225 case 1: 226 mode1: 227 /* no REDO, full asynchronous flush */ 228 if (waitfor == MNT_WAIT) 229 waitfor = MNT_NOWAIT; 230 goto skip; 231 case 2: 232 /* REDO semantics, synchronous flush */ 233 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 234 goto mode0; 235 mode = HAMMER_FLUSH_UNDOS_AUTO; 236 break; 237 case 3: 238 /* REDO semantics, relaxed asynchronous flush */ 239 if (hmp->version < HAMMER_VOL_VERSION_FOUR) 240 goto mode1; 241 mode = HAMMER_FLUSH_UNDOS_RELAXED; 242 if (waitfor == MNT_WAIT) 243 waitfor = MNT_NOWAIT; 244 break; 245 case 4: 246 /* ignore the fsync() system call */ 247 lwkt_reltoken(&hmp->fs_token); 248 return(0); 249 default: 250 /* we have to do something */ 251 mode = HAMMER_FLUSH_UNDOS_RELAXED; 252 if (waitfor == MNT_WAIT) 253 waitfor = MNT_NOWAIT; 254 break; 255 } 256 257 /* 258 * Fast fsync only needs to flush the UNDO/REDO fifo if 259 * HAMMER_INODE_REDO is non-zero and the only modifications 260 * made to the file are write or write-extends. 261 */ 262 if ((ip->flags & HAMMER_INODE_REDO) && 263 (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0 264 ) { 265 ++hammer_count_fsyncs; 266 hammer_flusher_flush_undos(hmp, mode); 267 ip->redo_count = 0; 268 lwkt_reltoken(&hmp->fs_token); 269 return(0); 270 } 271 272 /* 273 * REDO is enabled by fsync(), the idea being we really only 274 * want to lay down REDO records when programs are using 275 * fsync() heavily. The first fsync() on the file starts 276 * the gravy train going and later fsync()s keep it hot by 277 * resetting the redo_count. 278 * 279 * We weren't running REDOs before now so we have to fall 280 * through and do a full fsync of what we have. 281 */ 282 if (hmp->version >= HAMMER_VOL_VERSION_FOUR && 283 (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) { 284 ip->flags |= HAMMER_INODE_REDO; 285 ip->redo_count = 0; 286 } 287 } 288 skip: 289 290 /* 291 * Do a full flush sequence. 292 */ 293 ++hammer_count_fsyncs; 294 vfsync(ap->a_vp, waitfor, 1, NULL, NULL); 295 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 296 if (waitfor == MNT_WAIT) { 297 vn_unlock(ap->a_vp); 298 hammer_wait_inode(ip); 299 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 300 } 301 lwkt_reltoken(&hmp->fs_token); 302 return (ip->error); 303 } 304 305 /* 306 * hammer_vop_read { vp, uio, ioflag, cred } 307 * 308 * MPSAFE (for the cache safe does not require fs_token) 309 */ 310 static 311 int 312 hammer_vop_read(struct vop_read_args *ap) 313 { 314 struct hammer_transaction trans; 315 hammer_inode_t ip; 316 hammer_mount_t hmp; 317 off_t offset; 318 struct buf *bp; 319 struct uio *uio; 320 int error; 321 int n; 322 int seqcount; 323 int ioseqcount; 324 int blksize; 325 int bigread; 326 int got_fstoken; 327 328 if (ap->a_vp->v_type != VREG) 329 return (EINVAL); 330 ip = VTOI(ap->a_vp); 331 hmp = ip->hmp; 332 error = 0; 333 uio = ap->a_uio; 334 335 /* 336 * Allow the UIO's size to override the sequential heuristic. 337 */ 338 blksize = hammer_blocksize(uio->uio_offset); 339 seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE; 340 ioseqcount = (ap->a_ioflag >> 16); 341 if (seqcount < ioseqcount) 342 seqcount = ioseqcount; 343 344 /* 345 * If reading or writing a huge amount of data we have to break 346 * atomicy and allow the operation to be interrupted by a signal 347 * or it can DOS the machine. 348 */ 349 bigread = (uio->uio_resid > 100 * 1024 * 1024); 350 got_fstoken = 0; 351 352 /* 353 * Access the data typically in HAMMER_BUFSIZE blocks via the 354 * buffer cache, but HAMMER may use a variable block size based 355 * on the offset. 356 * 357 * XXX Temporary hack, delay the start transaction while we remain 358 * MPSAFE. NOTE: ino_data.size cannot change while vnode is 359 * locked-shared. 360 */ 361 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 362 int64_t base_offset; 363 int64_t file_limit; 364 365 blksize = hammer_blocksize(uio->uio_offset); 366 offset = (int)uio->uio_offset & (blksize - 1); 367 base_offset = uio->uio_offset - offset; 368 369 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0) 370 break; 371 372 /* 373 * MPSAFE 374 */ 375 bp = getcacheblk(ap->a_vp, base_offset, blksize); 376 if (bp) { 377 error = 0; 378 goto skip; 379 } 380 381 /* 382 * MPUNSAFE 383 */ 384 if (got_fstoken == 0) { 385 lwkt_gettoken(&hmp->fs_token); 386 got_fstoken = 1; 387 hammer_start_transaction(&trans, ip->hmp); 388 } 389 390 if (hammer_cluster_enable) { 391 /* 392 * Use file_limit to prevent cluster_read() from 393 * creating buffers of the wrong block size past 394 * the demarc. 395 */ 396 file_limit = ip->ino_data.size; 397 if (base_offset < HAMMER_XDEMARC && 398 file_limit > HAMMER_XDEMARC) { 399 file_limit = HAMMER_XDEMARC; 400 } 401 error = cluster_read(ap->a_vp, 402 file_limit, base_offset, 403 blksize, uio->uio_resid, 404 seqcount * BKVASIZE, &bp); 405 } else { 406 error = bread(ap->a_vp, base_offset, blksize, &bp); 407 } 408 if (error) { 409 brelse(bp); 410 break; 411 } 412 skip: 413 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) { 414 kprintf("doff %016jx read file %016jx@%016jx\n", 415 (intmax_t)bp->b_bio2.bio_offset, 416 (intmax_t)ip->obj_id, 417 (intmax_t)bp->b_loffset); 418 } 419 bp->b_flags &= ~B_IODEBUG; 420 421 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 422 n = blksize - offset; 423 if (n > uio->uio_resid) 424 n = uio->uio_resid; 425 if (n > ip->ino_data.size - uio->uio_offset) 426 n = (int)(ip->ino_data.size - uio->uio_offset); 427 if (got_fstoken) 428 lwkt_reltoken(&hmp->fs_token); 429 error = uiomove((char *)bp->b_data + offset, n, uio); 430 if (got_fstoken) 431 lwkt_gettoken(&hmp->fs_token); 432 433 /* data has a lower priority then meta-data */ 434 bp->b_flags |= B_AGE; 435 bqrelse(bp); 436 if (error) 437 break; 438 hammer_stats_file_read += n; 439 } 440 441 /* 442 * XXX only update the atime if we had to get the MP lock. 443 * XXX hack hack hack, fixme. 444 */ 445 if (got_fstoken) { 446 if ((ip->flags & HAMMER_INODE_RO) == 0 && 447 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 448 ip->ino_data.atime = trans.time; 449 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 450 } 451 hammer_done_transaction(&trans); 452 lwkt_reltoken(&hmp->fs_token); 453 } 454 return (error); 455 } 456 457 /* 458 * hammer_vop_write { vp, uio, ioflag, cred } 459 */ 460 static 461 int 462 hammer_vop_write(struct vop_write_args *ap) 463 { 464 struct hammer_transaction trans; 465 struct hammer_inode *ip; 466 hammer_mount_t hmp; 467 struct uio *uio; 468 int offset; 469 off_t base_offset; 470 struct buf *bp; 471 int kflags; 472 int error; 473 int n; 474 int flags; 475 int seqcount; 476 int bigwrite; 477 478 if (ap->a_vp->v_type != VREG) 479 return (EINVAL); 480 ip = VTOI(ap->a_vp); 481 hmp = ip->hmp; 482 error = 0; 483 kflags = 0; 484 seqcount = ap->a_ioflag >> 16; 485 486 if (ip->flags & HAMMER_INODE_RO) 487 return (EROFS); 488 489 /* 490 * Create a transaction to cover the operations we perform. 491 */ 492 lwkt_gettoken(&hmp->fs_token); 493 hammer_start_transaction(&trans, hmp); 494 uio = ap->a_uio; 495 496 /* 497 * Check append mode 498 */ 499 if (ap->a_ioflag & IO_APPEND) 500 uio->uio_offset = ip->ino_data.size; 501 502 /* 503 * Check for illegal write offsets. Valid range is 0...2^63-1. 504 * 505 * NOTE: the base_off assignment is required to work around what 506 * I consider to be a GCC-4 optimization bug. 507 */ 508 if (uio->uio_offset < 0) { 509 hammer_done_transaction(&trans); 510 lwkt_reltoken(&hmp->fs_token); 511 return (EFBIG); 512 } 513 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 514 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { 515 hammer_done_transaction(&trans); 516 lwkt_reltoken(&hmp->fs_token); 517 return (EFBIG); 518 } 519 520 /* 521 * If reading or writing a huge amount of data we have to break 522 * atomicy and allow the operation to be interrupted by a signal 523 * or it can DOS the machine. 524 * 525 * Preset redo_count so we stop generating REDOs earlier if the 526 * limit is exceeded. 527 */ 528 bigwrite = (uio->uio_resid > 100 * 1024 * 1024); 529 if ((ip->flags & HAMMER_INODE_REDO) && 530 ip->redo_count < hammer_limit_redo) { 531 ip->redo_count += uio->uio_resid; 532 } 533 534 /* 535 * Access the data typically in HAMMER_BUFSIZE blocks via the 536 * buffer cache, but HAMMER may use a variable block size based 537 * on the offset. 538 */ 539 while (uio->uio_resid > 0) { 540 int fixsize = 0; 541 int blksize; 542 int blkmask; 543 int trivial; 544 int endofblk; 545 off_t nsize; 546 547 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 548 break; 549 if (bigwrite && (error = hammer_signal_check(hmp)) != 0) 550 break; 551 552 blksize = hammer_blocksize(uio->uio_offset); 553 554 /* 555 * Do not allow HAMMER to blow out the buffer cache. Very 556 * large UIOs can lockout other processes due to bwillwrite() 557 * mechanics. 558 * 559 * The hammer inode is not locked during these operations. 560 * The vnode is locked which can interfere with the pageout 561 * daemon for non-UIO_NOCOPY writes but should not interfere 562 * with the buffer cache. Even so, we cannot afford to 563 * allow the pageout daemon to build up too many dirty buffer 564 * cache buffers. 565 * 566 * Only call this if we aren't being recursively called from 567 * a virtual disk device (vn), else we may deadlock. 568 */ 569 if ((ap->a_ioflag & IO_RECURSE) == 0) 570 bwillwrite(blksize); 571 572 /* 573 * Control the number of pending records associated with 574 * this inode. If too many have accumulated start a 575 * flush. Try to maintain a pipeline with the flusher. 576 */ 577 if (ip->rsv_recs >= hammer_limit_inode_recs) { 578 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 579 } 580 if (ip->rsv_recs >= hammer_limit_inode_recs * 2) { 581 while (ip->rsv_recs >= hammer_limit_inode_recs) { 582 tsleep(&ip->rsv_recs, 0, "hmrwww", hz); 583 } 584 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 585 } 586 587 #if 0 588 /* 589 * Do not allow HAMMER to blow out system memory by 590 * accumulating too many records. Records are so well 591 * decoupled from the buffer cache that it is possible 592 * for userland to push data out to the media via 593 * direct-write, but build up the records queued to the 594 * backend faster then the backend can flush them out. 595 * HAMMER has hit its write limit but the frontend has 596 * no pushback to slow it down. 597 */ 598 if (hmp->rsv_recs > hammer_limit_recs / 2) { 599 /* 600 * Get the inode on the flush list 601 */ 602 if (ip->rsv_recs >= 64) 603 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 604 else if (ip->rsv_recs >= 16) 605 hammer_flush_inode(ip, 0); 606 607 /* 608 * Keep the flusher going if the system keeps 609 * queueing records. 610 */ 611 delta = hmp->count_newrecords - 612 hmp->last_newrecords; 613 if (delta < 0 || delta > hammer_limit_recs / 2) { 614 hmp->last_newrecords = hmp->count_newrecords; 615 hammer_sync_hmp(hmp, MNT_NOWAIT); 616 } 617 618 /* 619 * If we have gotten behind start slowing 620 * down the writers. 621 */ 622 delta = (hmp->rsv_recs - hammer_limit_recs) * 623 hz / hammer_limit_recs; 624 if (delta > 0) 625 tsleep(&trans, 0, "hmrslo", delta); 626 } 627 #endif 628 629 /* 630 * Calculate the blocksize at the current offset and figure 631 * out how much we can actually write. 632 */ 633 blkmask = blksize - 1; 634 offset = (int)uio->uio_offset & blkmask; 635 base_offset = uio->uio_offset & ~(int64_t)blkmask; 636 n = blksize - offset; 637 if (n > uio->uio_resid) { 638 n = uio->uio_resid; 639 endofblk = 0; 640 } else { 641 endofblk = 1; 642 } 643 nsize = uio->uio_offset + n; 644 if (nsize > ip->ino_data.size) { 645 if (uio->uio_offset > ip->ino_data.size) 646 trivial = 0; 647 else 648 trivial = 1; 649 nvextendbuf(ap->a_vp, 650 ip->ino_data.size, 651 nsize, 652 hammer_blocksize(ip->ino_data.size), 653 hammer_blocksize(nsize), 654 hammer_blockoff(ip->ino_data.size), 655 hammer_blockoff(nsize), 656 trivial); 657 fixsize = 1; 658 kflags |= NOTE_EXTEND; 659 } 660 661 if (uio->uio_segflg == UIO_NOCOPY) { 662 /* 663 * Issuing a write with the same data backing the 664 * buffer. Instantiate the buffer to collect the 665 * backing vm pages, then read-in any missing bits. 666 * 667 * This case is used by vop_stdputpages(). 668 */ 669 bp = getblk(ap->a_vp, base_offset, 670 blksize, GETBLK_BHEAVY, 0); 671 if ((bp->b_flags & B_CACHE) == 0) { 672 bqrelse(bp); 673 error = bread(ap->a_vp, base_offset, 674 blksize, &bp); 675 } 676 } else if (offset == 0 && uio->uio_resid >= blksize) { 677 /* 678 * Even though we are entirely overwriting the buffer 679 * we may still have to zero it out to avoid a 680 * mmap/write visibility issue. 681 */ 682 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 683 if ((bp->b_flags & B_CACHE) == 0) 684 vfs_bio_clrbuf(bp); 685 } else if (base_offset >= ip->ino_data.size) { 686 /* 687 * If the base offset of the buffer is beyond the 688 * file EOF, we don't have to issue a read. 689 */ 690 bp = getblk(ap->a_vp, base_offset, 691 blksize, GETBLK_BHEAVY, 0); 692 vfs_bio_clrbuf(bp); 693 } else { 694 /* 695 * Partial overwrite, read in any missing bits then 696 * replace the portion being written. 697 */ 698 error = bread(ap->a_vp, base_offset, blksize, &bp); 699 if (error == 0) 700 bheavy(bp); 701 } 702 if (error == 0) { 703 lwkt_reltoken(&hmp->fs_token); 704 error = uiomove(bp->b_data + offset, n, uio); 705 lwkt_gettoken(&hmp->fs_token); 706 } 707 708 /* 709 * Generate REDO records if enabled and redo_count will not 710 * exceeded the limit. 711 * 712 * If redo_count exceeds the limit we stop generating records 713 * and clear HAMMER_INODE_REDO. This will cause the next 714 * fsync() to do a full meta-data sync instead of just an 715 * UNDO/REDO fifo update. 716 * 717 * When clearing HAMMER_INODE_REDO any pre-existing REDOs 718 * will still be tracked. The tracks will be terminated 719 * when the related meta-data (including possible data 720 * modifications which are not tracked via REDO) is 721 * flushed. 722 */ 723 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) { 724 if (ip->redo_count < hammer_limit_redo) { 725 bp->b_flags |= B_VFSFLAG1; 726 error = hammer_generate_redo(&trans, ip, 727 base_offset + offset, 728 HAMMER_REDO_WRITE, 729 bp->b_data + offset, 730 (size_t)n); 731 } else { 732 ip->flags &= ~HAMMER_INODE_REDO; 733 } 734 } 735 736 /* 737 * If we screwed up we have to undo any VM size changes we 738 * made. 739 */ 740 if (error) { 741 brelse(bp); 742 if (fixsize) { 743 nvtruncbuf(ap->a_vp, ip->ino_data.size, 744 hammer_blocksize(ip->ino_data.size), 745 hammer_blockoff(ip->ino_data.size)); 746 } 747 break; 748 } 749 kflags |= NOTE_WRITE; 750 hammer_stats_file_write += n; 751 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 752 if (ip->ino_data.size < uio->uio_offset) { 753 ip->ino_data.size = uio->uio_offset; 754 flags = HAMMER_INODE_SDIRTY; 755 } else { 756 flags = 0; 757 } 758 ip->ino_data.mtime = trans.time; 759 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 760 hammer_modify_inode(&trans, ip, flags); 761 762 /* 763 * Once we dirty the buffer any cached zone-X offset 764 * becomes invalid. HAMMER NOTE: no-history mode cannot 765 * allow overwriting over the same data sector unless 766 * we provide UNDOs for the old data, which we don't. 767 */ 768 bp->b_bio2.bio_offset = NOOFFSET; 769 770 /* 771 * Final buffer disposition. 772 * 773 * Because meta-data updates are deferred, HAMMER is 774 * especially sensitive to excessive bdwrite()s because 775 * the I/O stream is not broken up by disk reads. So the 776 * buffer cache simply cannot keep up. 777 * 778 * WARNING! blksize is variable. cluster_write() is 779 * expected to not blow up if it encounters 780 * buffers that do not match the passed blksize. 781 * 782 * NOTE! Hammer shouldn't need to bawrite()/cluster_write(). 783 * The ip->rsv_recs check should burst-flush the data. 784 * If we queue it immediately the buf could be left 785 * locked on the device queue for a very long time. 786 * 787 * NOTE! To avoid degenerate stalls due to mismatched block 788 * sizes we only honor IO_DIRECT on the write which 789 * abuts the end of the buffer. However, we must 790 * honor IO_SYNC in case someone is silly enough to 791 * configure a HAMMER file as swap, or when HAMMER 792 * is serving NFS (for commits). Ick ick. 793 */ 794 bp->b_flags |= B_AGE; 795 if (ap->a_ioflag & IO_SYNC) { 796 bwrite(bp); 797 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) { 798 bawrite(bp); 799 } else { 800 #if 0 801 if (offset + n == blksize) { 802 if (hammer_cluster_enable == 0 || 803 (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) { 804 bawrite(bp); 805 } else { 806 cluster_write(bp, ip->ino_data.size, 807 blksize, seqcount); 808 } 809 } else { 810 #endif 811 bdwrite(bp); 812 } 813 } 814 hammer_done_transaction(&trans); 815 hammer_knote(ap->a_vp, kflags); 816 lwkt_reltoken(&hmp->fs_token); 817 return (error); 818 } 819 820 /* 821 * hammer_vop_access { vp, mode, cred } 822 * 823 * MPSAFE - does not require fs_token 824 */ 825 static 826 int 827 hammer_vop_access(struct vop_access_args *ap) 828 { 829 struct hammer_inode *ip = VTOI(ap->a_vp); 830 uid_t uid; 831 gid_t gid; 832 int error; 833 834 ++hammer_stats_file_iopsr; 835 uid = hammer_to_unix_xid(&ip->ino_data.uid); 836 gid = hammer_to_unix_xid(&ip->ino_data.gid); 837 838 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 839 ip->ino_data.uflags); 840 return (error); 841 } 842 843 /* 844 * hammer_vop_advlock { vp, id, op, fl, flags } 845 * 846 * MPSAFE - does not require fs_token 847 */ 848 static 849 int 850 hammer_vop_advlock(struct vop_advlock_args *ap) 851 { 852 hammer_inode_t ip = VTOI(ap->a_vp); 853 854 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 855 } 856 857 /* 858 * hammer_vop_close { vp, fflag } 859 * 860 * We can only sync-on-close for normal closes. XXX disabled for now. 861 */ 862 static 863 int 864 hammer_vop_close(struct vop_close_args *ap) 865 { 866 #if 0 867 struct vnode *vp = ap->a_vp; 868 hammer_inode_t ip = VTOI(vp); 869 int waitfor; 870 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) { 871 if (vn_islocked(vp) == LK_EXCLUSIVE && 872 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) { 873 if (ip->flags & HAMMER_INODE_CLOSESYNC) 874 waitfor = MNT_WAIT; 875 else 876 waitfor = MNT_NOWAIT; 877 ip->flags &= ~(HAMMER_INODE_CLOSESYNC | 878 HAMMER_INODE_CLOSEASYNC); 879 VOP_FSYNC(vp, MNT_NOWAIT, waitfor); 880 } 881 } 882 #endif 883 return (vop_stdclose(ap)); 884 } 885 886 /* 887 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 888 * 889 * The operating system has already ensured that the directory entry 890 * does not exist and done all appropriate namespace locking. 891 */ 892 static 893 int 894 hammer_vop_ncreate(struct vop_ncreate_args *ap) 895 { 896 struct hammer_transaction trans; 897 struct hammer_inode *dip; 898 struct hammer_inode *nip; 899 struct nchandle *nch; 900 hammer_mount_t hmp; 901 int error; 902 903 nch = ap->a_nch; 904 dip = VTOI(ap->a_dvp); 905 hmp = dip->hmp; 906 907 if (dip->flags & HAMMER_INODE_RO) 908 return (EROFS); 909 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 910 return (error); 911 912 /* 913 * Create a transaction to cover the operations we perform. 914 */ 915 lwkt_gettoken(&hmp->fs_token); 916 hammer_start_transaction(&trans, hmp); 917 ++hammer_stats_file_iopsw; 918 919 /* 920 * Create a new filesystem object of the requested type. The 921 * returned inode will be referenced and shared-locked to prevent 922 * it from being moved to the flusher. 923 */ 924 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 925 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 926 NULL, &nip); 927 if (error) { 928 hkprintf("hammer_create_inode error %d\n", error); 929 hammer_done_transaction(&trans); 930 *ap->a_vpp = NULL; 931 lwkt_reltoken(&hmp->fs_token); 932 return (error); 933 } 934 935 /* 936 * Add the new filesystem object to the directory. This will also 937 * bump the inode's link count. 938 */ 939 error = hammer_ip_add_directory(&trans, dip, 940 nch->ncp->nc_name, nch->ncp->nc_nlen, 941 nip); 942 if (error) 943 hkprintf("hammer_ip_add_directory error %d\n", error); 944 945 /* 946 * Finish up. 947 */ 948 if (error) { 949 hammer_rel_inode(nip, 0); 950 hammer_done_transaction(&trans); 951 *ap->a_vpp = NULL; 952 } else { 953 error = hammer_get_vnode(nip, ap->a_vpp); 954 hammer_done_transaction(&trans); 955 hammer_rel_inode(nip, 0); 956 if (error == 0) { 957 cache_setunresolved(ap->a_nch); 958 cache_setvp(ap->a_nch, *ap->a_vpp); 959 } 960 hammer_knote(ap->a_dvp, NOTE_WRITE); 961 } 962 lwkt_reltoken(&hmp->fs_token); 963 return (error); 964 } 965 966 /* 967 * hammer_vop_getattr { vp, vap } 968 * 969 * Retrieve an inode's attribute information. When accessing inodes 970 * historically we fake the atime field to ensure consistent results. 971 * The atime field is stored in the B-Tree element and allowed to be 972 * updated without cycling the element. 973 * 974 * MPSAFE - does not require fs_token 975 */ 976 static 977 int 978 hammer_vop_getattr(struct vop_getattr_args *ap) 979 { 980 struct hammer_inode *ip = VTOI(ap->a_vp); 981 struct vattr *vap = ap->a_vap; 982 983 /* 984 * We want the fsid to be different when accessing a filesystem 985 * with different as-of's so programs like diff don't think 986 * the files are the same. 987 * 988 * We also want the fsid to be the same when comparing snapshots, 989 * or when comparing mirrors (which might be backed by different 990 * physical devices). HAMMER fsids are based on the PFS's 991 * shared_uuid field. 992 * 993 * XXX there is a chance of collision here. The va_fsid reported 994 * by stat is different from the more involved fsid used in the 995 * mount structure. 996 */ 997 ++hammer_stats_file_iopsr; 998 hammer_lock_sh(&ip->lock); 999 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 1000 (u_int32_t)(ip->obj_asof >> 32); 1001 1002 vap->va_fileid = ip->ino_leaf.base.obj_id; 1003 vap->va_mode = ip->ino_data.mode; 1004 vap->va_nlink = ip->ino_data.nlinks; 1005 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1006 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1007 vap->va_rmajor = 0; 1008 vap->va_rminor = 0; 1009 vap->va_size = ip->ino_data.size; 1010 1011 /* 1012 * Special case for @@PFS softlinks. The actual size of the 1013 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 1014 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 1015 */ 1016 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 1017 ip->ino_data.size == 10 && 1018 ip->obj_asof == HAMMER_MAX_TID && 1019 ip->obj_localization == 0 && 1020 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 1021 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 1022 vap->va_size = 26; 1023 else 1024 vap->va_size = 10; 1025 } 1026 1027 /* 1028 * We must provide a consistent atime and mtime for snapshots 1029 * so people can do a 'tar cf - ... | md5' on them and get 1030 * consistent results. 1031 */ 1032 if (ip->flags & HAMMER_INODE_RO) { 1033 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 1034 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 1035 } else { 1036 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 1037 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 1038 } 1039 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 1040 vap->va_flags = ip->ino_data.uflags; 1041 vap->va_gen = 1; /* hammer inums are unique for all time */ 1042 vap->va_blocksize = HAMMER_BUFSIZE; 1043 if (ip->ino_data.size >= HAMMER_XDEMARC) { 1044 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 1045 ~HAMMER_XBUFMASK64; 1046 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 1047 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 1048 ~HAMMER_BUFMASK64; 1049 } else { 1050 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 1051 } 1052 1053 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 1054 vap->va_filerev = 0; /* XXX */ 1055 vap->va_uid_uuid = ip->ino_data.uid; 1056 vap->va_gid_uuid = ip->ino_data.gid; 1057 vap->va_fsid_uuid = ip->hmp->fsid; 1058 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 1059 VA_FSID_UUID_VALID; 1060 1061 switch (ip->ino_data.obj_type) { 1062 case HAMMER_OBJTYPE_CDEV: 1063 case HAMMER_OBJTYPE_BDEV: 1064 vap->va_rmajor = ip->ino_data.rmajor; 1065 vap->va_rminor = ip->ino_data.rminor; 1066 break; 1067 default: 1068 break; 1069 } 1070 hammer_unlock(&ip->lock); 1071 return(0); 1072 } 1073 1074 /* 1075 * hammer_vop_nresolve { nch, dvp, cred } 1076 * 1077 * Locate the requested directory entry. 1078 */ 1079 static 1080 int 1081 hammer_vop_nresolve(struct vop_nresolve_args *ap) 1082 { 1083 struct hammer_transaction trans; 1084 struct namecache *ncp; 1085 hammer_mount_t hmp; 1086 hammer_inode_t dip; 1087 hammer_inode_t ip; 1088 hammer_tid_t asof; 1089 struct hammer_cursor cursor; 1090 struct vnode *vp; 1091 int64_t namekey; 1092 int error; 1093 int i; 1094 int nlen; 1095 int flags; 1096 int ispfs; 1097 int64_t obj_id; 1098 u_int32_t localization; 1099 u_int32_t max_iterations; 1100 1101 /* 1102 * Misc initialization, plus handle as-of name extensions. Look for 1103 * the '@@' extension. Note that as-of files and directories cannot 1104 * be modified. 1105 */ 1106 dip = VTOI(ap->a_dvp); 1107 ncp = ap->a_nch->ncp; 1108 asof = dip->obj_asof; 1109 localization = dip->obj_localization; /* for code consistency */ 1110 nlen = ncp->nc_nlen; 1111 flags = dip->flags & HAMMER_INODE_RO; 1112 ispfs = 0; 1113 hmp = dip->hmp; 1114 1115 lwkt_gettoken(&hmp->fs_token); 1116 hammer_simple_transaction(&trans, hmp); 1117 ++hammer_stats_file_iopsr; 1118 1119 for (i = 0; i < nlen; ++i) { 1120 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 1121 error = hammer_str_to_tid(ncp->nc_name + i + 2, 1122 &ispfs, &asof, &localization); 1123 if (error != 0) { 1124 i = nlen; 1125 break; 1126 } 1127 if (asof != HAMMER_MAX_TID) 1128 flags |= HAMMER_INODE_RO; 1129 break; 1130 } 1131 } 1132 nlen = i; 1133 1134 /* 1135 * If this is a PFS softlink we dive into the PFS 1136 */ 1137 if (ispfs && nlen == 0) { 1138 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 1139 asof, localization, 1140 flags, &error); 1141 if (error == 0) { 1142 error = hammer_get_vnode(ip, &vp); 1143 hammer_rel_inode(ip, 0); 1144 } else { 1145 vp = NULL; 1146 } 1147 if (error == 0) { 1148 vn_unlock(vp); 1149 cache_setvp(ap->a_nch, vp); 1150 vrele(vp); 1151 } 1152 goto done; 1153 } 1154 1155 /* 1156 * If there is no path component the time extension is relative to dip. 1157 * e.g. "fubar/@@<snapshot>" 1158 * 1159 * "." is handled by the kernel, but ".@@<snapshot>" is not. 1160 * e.g. "fubar/.@@<snapshot>" 1161 * 1162 * ".." is handled by the kernel. We do not currently handle 1163 * "..@<snapshot>". 1164 */ 1165 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 1166 ip = hammer_get_inode(&trans, dip, dip->obj_id, 1167 asof, dip->obj_localization, 1168 flags, &error); 1169 if (error == 0) { 1170 error = hammer_get_vnode(ip, &vp); 1171 hammer_rel_inode(ip, 0); 1172 } else { 1173 vp = NULL; 1174 } 1175 if (error == 0) { 1176 vn_unlock(vp); 1177 cache_setvp(ap->a_nch, vp); 1178 vrele(vp); 1179 } 1180 goto done; 1181 } 1182 1183 /* 1184 * Calculate the namekey and setup the key range for the scan. This 1185 * works kinda like a chained hash table where the lower 32 bits 1186 * of the namekey synthesize the chain. 1187 * 1188 * The key range is inclusive of both key_beg and key_end. 1189 */ 1190 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 1191 &max_iterations); 1192 1193 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 1194 cursor.key_beg.localization = dip->obj_localization + 1195 hammer_dir_localization(dip); 1196 cursor.key_beg.obj_id = dip->obj_id; 1197 cursor.key_beg.key = namekey; 1198 cursor.key_beg.create_tid = 0; 1199 cursor.key_beg.delete_tid = 0; 1200 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1201 cursor.key_beg.obj_type = 0; 1202 1203 cursor.key_end = cursor.key_beg; 1204 cursor.key_end.key += max_iterations; 1205 cursor.asof = asof; 1206 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1207 1208 /* 1209 * Scan all matching records (the chain), locate the one matching 1210 * the requested path component. 1211 * 1212 * The hammer_ip_*() functions merge in-memory records with on-disk 1213 * records for the purposes of the search. 1214 */ 1215 obj_id = 0; 1216 localization = HAMMER_DEF_LOCALIZATION; 1217 1218 if (error == 0) { 1219 error = hammer_ip_first(&cursor); 1220 while (error == 0) { 1221 error = hammer_ip_resolve_data(&cursor); 1222 if (error) 1223 break; 1224 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 1225 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1226 obj_id = cursor.data->entry.obj_id; 1227 localization = cursor.data->entry.localization; 1228 break; 1229 } 1230 error = hammer_ip_next(&cursor); 1231 } 1232 } 1233 hammer_done_cursor(&cursor); 1234 1235 /* 1236 * Lookup the obj_id. This should always succeed. If it does not 1237 * the filesystem may be damaged and we return a dummy inode. 1238 */ 1239 if (error == 0) { 1240 ip = hammer_get_inode(&trans, dip, obj_id, 1241 asof, localization, 1242 flags, &error); 1243 if (error == ENOENT) { 1244 kprintf("HAMMER: WARNING: Missing " 1245 "inode for dirent \"%s\"\n" 1246 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n", 1247 ncp->nc_name, 1248 (long long)obj_id, (long long)asof, 1249 localization); 1250 error = 0; 1251 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 1252 asof, localization, 1253 flags, &error); 1254 } 1255 if (error == 0) { 1256 error = hammer_get_vnode(ip, &vp); 1257 hammer_rel_inode(ip, 0); 1258 } else { 1259 vp = NULL; 1260 } 1261 if (error == 0) { 1262 vn_unlock(vp); 1263 cache_setvp(ap->a_nch, vp); 1264 vrele(vp); 1265 } 1266 } else if (error == ENOENT) { 1267 cache_setvp(ap->a_nch, NULL); 1268 } 1269 done: 1270 hammer_done_transaction(&trans); 1271 lwkt_reltoken(&hmp->fs_token); 1272 return (error); 1273 } 1274 1275 /* 1276 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 1277 * 1278 * Locate the parent directory of a directory vnode. 1279 * 1280 * dvp is referenced but not locked. *vpp must be returned referenced and 1281 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 1282 * at the root, instead it could indicate that the directory we were in was 1283 * removed. 1284 * 1285 * NOTE: as-of sequences are not linked into the directory structure. If 1286 * we are at the root with a different asof then the mount point, reload 1287 * the same directory with the mount point's asof. I'm not sure what this 1288 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 1289 * get confused, but it hasn't been tested. 1290 */ 1291 static 1292 int 1293 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 1294 { 1295 struct hammer_transaction trans; 1296 struct hammer_inode *dip; 1297 struct hammer_inode *ip; 1298 hammer_mount_t hmp; 1299 int64_t parent_obj_id; 1300 u_int32_t parent_obj_localization; 1301 hammer_tid_t asof; 1302 int error; 1303 1304 dip = VTOI(ap->a_dvp); 1305 asof = dip->obj_asof; 1306 hmp = dip->hmp; 1307 1308 /* 1309 * Whos are parent? This could be the root of a pseudo-filesystem 1310 * whos parent is in another localization domain. 1311 */ 1312 lwkt_gettoken(&hmp->fs_token); 1313 parent_obj_id = dip->ino_data.parent_obj_id; 1314 if (dip->obj_id == HAMMER_OBJID_ROOT) 1315 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1316 else 1317 parent_obj_localization = dip->obj_localization; 1318 1319 if (parent_obj_id == 0) { 1320 if (dip->obj_id == HAMMER_OBJID_ROOT && 1321 asof != hmp->asof) { 1322 parent_obj_id = dip->obj_id; 1323 asof = hmp->asof; 1324 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1325 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1326 (long long)dip->obj_asof); 1327 } else { 1328 *ap->a_vpp = NULL; 1329 lwkt_reltoken(&hmp->fs_token); 1330 return ENOENT; 1331 } 1332 } 1333 1334 hammer_simple_transaction(&trans, hmp); 1335 ++hammer_stats_file_iopsr; 1336 1337 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1338 asof, parent_obj_localization, 1339 dip->flags, &error); 1340 if (ip) { 1341 error = hammer_get_vnode(ip, ap->a_vpp); 1342 hammer_rel_inode(ip, 0); 1343 } else { 1344 *ap->a_vpp = NULL; 1345 } 1346 hammer_done_transaction(&trans); 1347 lwkt_reltoken(&hmp->fs_token); 1348 return (error); 1349 } 1350 1351 /* 1352 * hammer_vop_nlink { nch, dvp, vp, cred } 1353 */ 1354 static 1355 int 1356 hammer_vop_nlink(struct vop_nlink_args *ap) 1357 { 1358 struct hammer_transaction trans; 1359 struct hammer_inode *dip; 1360 struct hammer_inode *ip; 1361 struct nchandle *nch; 1362 hammer_mount_t hmp; 1363 int error; 1364 1365 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1366 return(EXDEV); 1367 1368 nch = ap->a_nch; 1369 dip = VTOI(ap->a_dvp); 1370 ip = VTOI(ap->a_vp); 1371 hmp = dip->hmp; 1372 1373 if (dip->obj_localization != ip->obj_localization) 1374 return(EXDEV); 1375 1376 if (dip->flags & HAMMER_INODE_RO) 1377 return (EROFS); 1378 if (ip->flags & HAMMER_INODE_RO) 1379 return (EROFS); 1380 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1381 return (error); 1382 1383 /* 1384 * Create a transaction to cover the operations we perform. 1385 */ 1386 lwkt_gettoken(&hmp->fs_token); 1387 hammer_start_transaction(&trans, hmp); 1388 ++hammer_stats_file_iopsw; 1389 1390 /* 1391 * Add the filesystem object to the directory. Note that neither 1392 * dip nor ip are referenced or locked, but their vnodes are 1393 * referenced. This function will bump the inode's link count. 1394 */ 1395 error = hammer_ip_add_directory(&trans, dip, 1396 nch->ncp->nc_name, nch->ncp->nc_nlen, 1397 ip); 1398 1399 /* 1400 * Finish up. 1401 */ 1402 if (error == 0) { 1403 cache_setunresolved(nch); 1404 cache_setvp(nch, ap->a_vp); 1405 } 1406 hammer_done_transaction(&trans); 1407 hammer_knote(ap->a_vp, NOTE_LINK); 1408 hammer_knote(ap->a_dvp, NOTE_WRITE); 1409 lwkt_reltoken(&hmp->fs_token); 1410 return (error); 1411 } 1412 1413 /* 1414 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1415 * 1416 * The operating system has already ensured that the directory entry 1417 * does not exist and done all appropriate namespace locking. 1418 */ 1419 static 1420 int 1421 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1422 { 1423 struct hammer_transaction trans; 1424 struct hammer_inode *dip; 1425 struct hammer_inode *nip; 1426 struct nchandle *nch; 1427 hammer_mount_t hmp; 1428 int error; 1429 1430 nch = ap->a_nch; 1431 dip = VTOI(ap->a_dvp); 1432 hmp = dip->hmp; 1433 1434 if (dip->flags & HAMMER_INODE_RO) 1435 return (EROFS); 1436 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1437 return (error); 1438 1439 /* 1440 * Create a transaction to cover the operations we perform. 1441 */ 1442 lwkt_gettoken(&hmp->fs_token); 1443 hammer_start_transaction(&trans, hmp); 1444 ++hammer_stats_file_iopsw; 1445 1446 /* 1447 * Create a new filesystem object of the requested type. The 1448 * returned inode will be referenced but not locked. 1449 */ 1450 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1451 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1452 NULL, &nip); 1453 if (error) { 1454 hkprintf("hammer_mkdir error %d\n", error); 1455 hammer_done_transaction(&trans); 1456 *ap->a_vpp = NULL; 1457 lwkt_reltoken(&hmp->fs_token); 1458 return (error); 1459 } 1460 /* 1461 * Add the new filesystem object to the directory. This will also 1462 * bump the inode's link count. 1463 */ 1464 error = hammer_ip_add_directory(&trans, dip, 1465 nch->ncp->nc_name, nch->ncp->nc_nlen, 1466 nip); 1467 if (error) 1468 hkprintf("hammer_mkdir (add) error %d\n", error); 1469 1470 /* 1471 * Finish up. 1472 */ 1473 if (error) { 1474 hammer_rel_inode(nip, 0); 1475 *ap->a_vpp = NULL; 1476 } else { 1477 error = hammer_get_vnode(nip, ap->a_vpp); 1478 hammer_rel_inode(nip, 0); 1479 if (error == 0) { 1480 cache_setunresolved(ap->a_nch); 1481 cache_setvp(ap->a_nch, *ap->a_vpp); 1482 } 1483 } 1484 hammer_done_transaction(&trans); 1485 if (error == 0) 1486 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1487 lwkt_reltoken(&hmp->fs_token); 1488 return (error); 1489 } 1490 1491 /* 1492 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1493 * 1494 * The operating system has already ensured that the directory entry 1495 * does not exist and done all appropriate namespace locking. 1496 */ 1497 static 1498 int 1499 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1500 { 1501 struct hammer_transaction trans; 1502 struct hammer_inode *dip; 1503 struct hammer_inode *nip; 1504 struct nchandle *nch; 1505 hammer_mount_t hmp; 1506 int error; 1507 1508 nch = ap->a_nch; 1509 dip = VTOI(ap->a_dvp); 1510 hmp = dip->hmp; 1511 1512 if (dip->flags & HAMMER_INODE_RO) 1513 return (EROFS); 1514 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1515 return (error); 1516 1517 /* 1518 * Create a transaction to cover the operations we perform. 1519 */ 1520 lwkt_gettoken(&hmp->fs_token); 1521 hammer_start_transaction(&trans, hmp); 1522 ++hammer_stats_file_iopsw; 1523 1524 /* 1525 * Create a new filesystem object of the requested type. The 1526 * returned inode will be referenced but not locked. 1527 * 1528 * If mknod specifies a directory a pseudo-fs is created. 1529 */ 1530 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1531 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1532 NULL, &nip); 1533 if (error) { 1534 hammer_done_transaction(&trans); 1535 *ap->a_vpp = NULL; 1536 lwkt_reltoken(&hmp->fs_token); 1537 return (error); 1538 } 1539 1540 /* 1541 * Add the new filesystem object to the directory. This will also 1542 * bump the inode's link count. 1543 */ 1544 error = hammer_ip_add_directory(&trans, dip, 1545 nch->ncp->nc_name, nch->ncp->nc_nlen, 1546 nip); 1547 1548 /* 1549 * Finish up. 1550 */ 1551 if (error) { 1552 hammer_rel_inode(nip, 0); 1553 *ap->a_vpp = NULL; 1554 } else { 1555 error = hammer_get_vnode(nip, ap->a_vpp); 1556 hammer_rel_inode(nip, 0); 1557 if (error == 0) { 1558 cache_setunresolved(ap->a_nch); 1559 cache_setvp(ap->a_nch, *ap->a_vpp); 1560 } 1561 } 1562 hammer_done_transaction(&trans); 1563 if (error == 0) 1564 hammer_knote(ap->a_dvp, NOTE_WRITE); 1565 lwkt_reltoken(&hmp->fs_token); 1566 return (error); 1567 } 1568 1569 /* 1570 * hammer_vop_open { vp, mode, cred, fp } 1571 * 1572 * MPSAFE (does not require fs_token) 1573 */ 1574 static 1575 int 1576 hammer_vop_open(struct vop_open_args *ap) 1577 { 1578 hammer_inode_t ip; 1579 1580 ++hammer_stats_file_iopsr; 1581 ip = VTOI(ap->a_vp); 1582 1583 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1584 return (EROFS); 1585 return(vop_stdopen(ap)); 1586 } 1587 1588 /* 1589 * hammer_vop_print { vp } 1590 */ 1591 static 1592 int 1593 hammer_vop_print(struct vop_print_args *ap) 1594 { 1595 return EOPNOTSUPP; 1596 } 1597 1598 /* 1599 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1600 */ 1601 static 1602 int 1603 hammer_vop_readdir(struct vop_readdir_args *ap) 1604 { 1605 struct hammer_transaction trans; 1606 struct hammer_cursor cursor; 1607 struct hammer_inode *ip; 1608 hammer_mount_t hmp; 1609 struct uio *uio; 1610 hammer_base_elm_t base; 1611 int error; 1612 int cookie_index; 1613 int ncookies; 1614 off_t *cookies; 1615 off_t saveoff; 1616 int r; 1617 int dtype; 1618 1619 ++hammer_stats_file_iopsr; 1620 ip = VTOI(ap->a_vp); 1621 uio = ap->a_uio; 1622 saveoff = uio->uio_offset; 1623 hmp = ip->hmp; 1624 1625 if (ap->a_ncookies) { 1626 ncookies = uio->uio_resid / 16 + 1; 1627 if (ncookies > 1024) 1628 ncookies = 1024; 1629 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1630 cookie_index = 0; 1631 } else { 1632 ncookies = -1; 1633 cookies = NULL; 1634 cookie_index = 0; 1635 } 1636 1637 lwkt_gettoken(&hmp->fs_token); 1638 hammer_simple_transaction(&trans, hmp); 1639 1640 /* 1641 * Handle artificial entries 1642 * 1643 * It should be noted that the minimum value for a directory 1644 * hash key on-media is 0x0000000100000000, so we can use anything 1645 * less then that to represent our 'special' key space. 1646 */ 1647 error = 0; 1648 if (saveoff == 0) { 1649 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1650 if (r) 1651 goto done; 1652 if (cookies) 1653 cookies[cookie_index] = saveoff; 1654 ++saveoff; 1655 ++cookie_index; 1656 if (cookie_index == ncookies) 1657 goto done; 1658 } 1659 if (saveoff == 1) { 1660 if (ip->ino_data.parent_obj_id) { 1661 r = vop_write_dirent(&error, uio, 1662 ip->ino_data.parent_obj_id, 1663 DT_DIR, 2, ".."); 1664 } else { 1665 r = vop_write_dirent(&error, uio, 1666 ip->obj_id, DT_DIR, 2, ".."); 1667 } 1668 if (r) 1669 goto done; 1670 if (cookies) 1671 cookies[cookie_index] = saveoff; 1672 ++saveoff; 1673 ++cookie_index; 1674 if (cookie_index == ncookies) 1675 goto done; 1676 } 1677 1678 /* 1679 * Key range (begin and end inclusive) to scan. Directory keys 1680 * directly translate to a 64 bit 'seek' position. 1681 */ 1682 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1683 cursor.key_beg.localization = ip->obj_localization + 1684 hammer_dir_localization(ip); 1685 cursor.key_beg.obj_id = ip->obj_id; 1686 cursor.key_beg.create_tid = 0; 1687 cursor.key_beg.delete_tid = 0; 1688 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1689 cursor.key_beg.obj_type = 0; 1690 cursor.key_beg.key = saveoff; 1691 1692 cursor.key_end = cursor.key_beg; 1693 cursor.key_end.key = HAMMER_MAX_KEY; 1694 cursor.asof = ip->obj_asof; 1695 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1696 1697 error = hammer_ip_first(&cursor); 1698 1699 while (error == 0) { 1700 error = hammer_ip_resolve_data(&cursor); 1701 if (error) 1702 break; 1703 base = &cursor.leaf->base; 1704 saveoff = base->key; 1705 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1706 1707 if (base->obj_id != ip->obj_id) 1708 panic("readdir: bad record at %p", cursor.node); 1709 1710 /* 1711 * Convert pseudo-filesystems into softlinks 1712 */ 1713 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1714 r = vop_write_dirent( 1715 &error, uio, cursor.data->entry.obj_id, 1716 dtype, 1717 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1718 (void *)cursor.data->entry.name); 1719 if (r) 1720 break; 1721 ++saveoff; 1722 if (cookies) 1723 cookies[cookie_index] = base->key; 1724 ++cookie_index; 1725 if (cookie_index == ncookies) 1726 break; 1727 error = hammer_ip_next(&cursor); 1728 } 1729 hammer_done_cursor(&cursor); 1730 1731 done: 1732 hammer_done_transaction(&trans); 1733 1734 if (ap->a_eofflag) 1735 *ap->a_eofflag = (error == ENOENT); 1736 uio->uio_offset = saveoff; 1737 if (error && cookie_index == 0) { 1738 if (error == ENOENT) 1739 error = 0; 1740 if (cookies) { 1741 kfree(cookies, M_TEMP); 1742 *ap->a_ncookies = 0; 1743 *ap->a_cookies = NULL; 1744 } 1745 } else { 1746 if (error == ENOENT) 1747 error = 0; 1748 if (cookies) { 1749 *ap->a_ncookies = cookie_index; 1750 *ap->a_cookies = cookies; 1751 } 1752 } 1753 lwkt_reltoken(&hmp->fs_token); 1754 return(error); 1755 } 1756 1757 /* 1758 * hammer_vop_readlink { vp, uio, cred } 1759 */ 1760 static 1761 int 1762 hammer_vop_readlink(struct vop_readlink_args *ap) 1763 { 1764 struct hammer_transaction trans; 1765 struct hammer_cursor cursor; 1766 struct hammer_inode *ip; 1767 hammer_mount_t hmp; 1768 char buf[32]; 1769 u_int32_t localization; 1770 hammer_pseudofs_inmem_t pfsm; 1771 int error; 1772 1773 ip = VTOI(ap->a_vp); 1774 hmp = ip->hmp; 1775 1776 lwkt_gettoken(&hmp->fs_token); 1777 1778 /* 1779 * Shortcut if the symlink data was stuffed into ino_data. 1780 * 1781 * Also expand special "@@PFS%05d" softlinks (expansion only 1782 * occurs for non-historical (current) accesses made from the 1783 * primary filesystem). 1784 */ 1785 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1786 char *ptr; 1787 int bytes; 1788 1789 ptr = ip->ino_data.ext.symlink; 1790 bytes = (int)ip->ino_data.size; 1791 if (bytes == 10 && 1792 ip->obj_asof == HAMMER_MAX_TID && 1793 ip->obj_localization == 0 && 1794 strncmp(ptr, "@@PFS", 5) == 0) { 1795 hammer_simple_transaction(&trans, hmp); 1796 bcopy(ptr + 5, buf, 5); 1797 buf[5] = 0; 1798 localization = strtoul(buf, NULL, 10) << 16; 1799 pfsm = hammer_load_pseudofs(&trans, localization, 1800 &error); 1801 if (error == 0) { 1802 if (pfsm->pfsd.mirror_flags & 1803 HAMMER_PFSD_SLAVE) { 1804 /* vap->va_size == 26 */ 1805 ksnprintf(buf, sizeof(buf), 1806 "@@0x%016llx:%05d", 1807 (long long)pfsm->pfsd.sync_end_tid, 1808 localization >> 16); 1809 } else { 1810 /* vap->va_size == 10 */ 1811 ksnprintf(buf, sizeof(buf), 1812 "@@-1:%05d", 1813 localization >> 16); 1814 #if 0 1815 ksnprintf(buf, sizeof(buf), 1816 "@@0x%016llx:%05d", 1817 (long long)HAMMER_MAX_TID, 1818 localization >> 16); 1819 #endif 1820 } 1821 ptr = buf; 1822 bytes = strlen(buf); 1823 } 1824 if (pfsm) 1825 hammer_rel_pseudofs(hmp, pfsm); 1826 hammer_done_transaction(&trans); 1827 } 1828 error = uiomove(ptr, bytes, ap->a_uio); 1829 lwkt_reltoken(&hmp->fs_token); 1830 return(error); 1831 } 1832 1833 /* 1834 * Long version 1835 */ 1836 hammer_simple_transaction(&trans, hmp); 1837 ++hammer_stats_file_iopsr; 1838 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1839 1840 /* 1841 * Key range (begin and end inclusive) to scan. Directory keys 1842 * directly translate to a 64 bit 'seek' position. 1843 */ 1844 cursor.key_beg.localization = ip->obj_localization + 1845 HAMMER_LOCALIZE_MISC; 1846 cursor.key_beg.obj_id = ip->obj_id; 1847 cursor.key_beg.create_tid = 0; 1848 cursor.key_beg.delete_tid = 0; 1849 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1850 cursor.key_beg.obj_type = 0; 1851 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1852 cursor.asof = ip->obj_asof; 1853 cursor.flags |= HAMMER_CURSOR_ASOF; 1854 1855 error = hammer_ip_lookup(&cursor); 1856 if (error == 0) { 1857 error = hammer_ip_resolve_data(&cursor); 1858 if (error == 0) { 1859 KKASSERT(cursor.leaf->data_len >= 1860 HAMMER_SYMLINK_NAME_OFF); 1861 error = uiomove(cursor.data->symlink.name, 1862 cursor.leaf->data_len - 1863 HAMMER_SYMLINK_NAME_OFF, 1864 ap->a_uio); 1865 } 1866 } 1867 hammer_done_cursor(&cursor); 1868 hammer_done_transaction(&trans); 1869 lwkt_reltoken(&hmp->fs_token); 1870 return(error); 1871 } 1872 1873 /* 1874 * hammer_vop_nremove { nch, dvp, cred } 1875 */ 1876 static 1877 int 1878 hammer_vop_nremove(struct vop_nremove_args *ap) 1879 { 1880 struct hammer_transaction trans; 1881 struct hammer_inode *dip; 1882 hammer_mount_t hmp; 1883 int error; 1884 1885 dip = VTOI(ap->a_dvp); 1886 hmp = dip->hmp; 1887 1888 if (hammer_nohistory(dip) == 0 && 1889 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1890 return (error); 1891 } 1892 1893 lwkt_gettoken(&hmp->fs_token); 1894 hammer_start_transaction(&trans, hmp); 1895 ++hammer_stats_file_iopsw; 1896 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1897 hammer_done_transaction(&trans); 1898 if (error == 0) 1899 hammer_knote(ap->a_dvp, NOTE_WRITE); 1900 lwkt_reltoken(&hmp->fs_token); 1901 return (error); 1902 } 1903 1904 /* 1905 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1906 */ 1907 static 1908 int 1909 hammer_vop_nrename(struct vop_nrename_args *ap) 1910 { 1911 struct hammer_transaction trans; 1912 struct namecache *fncp; 1913 struct namecache *tncp; 1914 struct hammer_inode *fdip; 1915 struct hammer_inode *tdip; 1916 struct hammer_inode *ip; 1917 hammer_mount_t hmp; 1918 struct hammer_cursor cursor; 1919 int64_t namekey; 1920 u_int32_t max_iterations; 1921 int nlen, error; 1922 1923 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1924 return(EXDEV); 1925 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1926 return(EXDEV); 1927 1928 fdip = VTOI(ap->a_fdvp); 1929 tdip = VTOI(ap->a_tdvp); 1930 fncp = ap->a_fnch->ncp; 1931 tncp = ap->a_tnch->ncp; 1932 ip = VTOI(fncp->nc_vp); 1933 KKASSERT(ip != NULL); 1934 1935 hmp = ip->hmp; 1936 1937 if (fdip->obj_localization != tdip->obj_localization) 1938 return(EXDEV); 1939 if (fdip->obj_localization != ip->obj_localization) 1940 return(EXDEV); 1941 1942 if (fdip->flags & HAMMER_INODE_RO) 1943 return (EROFS); 1944 if (tdip->flags & HAMMER_INODE_RO) 1945 return (EROFS); 1946 if (ip->flags & HAMMER_INODE_RO) 1947 return (EROFS); 1948 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 1949 return (error); 1950 1951 lwkt_gettoken(&hmp->fs_token); 1952 hammer_start_transaction(&trans, hmp); 1953 ++hammer_stats_file_iopsw; 1954 1955 /* 1956 * Remove tncp from the target directory and then link ip as 1957 * tncp. XXX pass trans to dounlink 1958 * 1959 * Force the inode sync-time to match the transaction so it is 1960 * in-sync with the creation of the target directory entry. 1961 */ 1962 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1963 ap->a_cred, 0, -1); 1964 if (error == 0 || error == ENOENT) { 1965 error = hammer_ip_add_directory(&trans, tdip, 1966 tncp->nc_name, tncp->nc_nlen, 1967 ip); 1968 if (error == 0) { 1969 ip->ino_data.parent_obj_id = tdip->obj_id; 1970 ip->ino_data.ctime = trans.time; 1971 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); 1972 } 1973 } 1974 if (error) 1975 goto failed; /* XXX */ 1976 1977 /* 1978 * Locate the record in the originating directory and remove it. 1979 * 1980 * Calculate the namekey and setup the key range for the scan. This 1981 * works kinda like a chained hash table where the lower 32 bits 1982 * of the namekey synthesize the chain. 1983 * 1984 * The key range is inclusive of both key_beg and key_end. 1985 */ 1986 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1987 &max_iterations); 1988 retry: 1989 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1990 cursor.key_beg.localization = fdip->obj_localization + 1991 hammer_dir_localization(fdip); 1992 cursor.key_beg.obj_id = fdip->obj_id; 1993 cursor.key_beg.key = namekey; 1994 cursor.key_beg.create_tid = 0; 1995 cursor.key_beg.delete_tid = 0; 1996 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1997 cursor.key_beg.obj_type = 0; 1998 1999 cursor.key_end = cursor.key_beg; 2000 cursor.key_end.key += max_iterations; 2001 cursor.asof = fdip->obj_asof; 2002 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2003 2004 /* 2005 * Scan all matching records (the chain), locate the one matching 2006 * the requested path component. 2007 * 2008 * The hammer_ip_*() functions merge in-memory records with on-disk 2009 * records for the purposes of the search. 2010 */ 2011 error = hammer_ip_first(&cursor); 2012 while (error == 0) { 2013 if (hammer_ip_resolve_data(&cursor) != 0) 2014 break; 2015 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2016 KKASSERT(nlen > 0); 2017 if (fncp->nc_nlen == nlen && 2018 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2019 break; 2020 } 2021 error = hammer_ip_next(&cursor); 2022 } 2023 2024 /* 2025 * If all is ok we have to get the inode so we can adjust nlinks. 2026 * 2027 * WARNING: hammer_ip_del_directory() may have to terminate the 2028 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 2029 * twice. 2030 */ 2031 if (error == 0) 2032 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 2033 2034 /* 2035 * XXX A deadlock here will break rename's atomicy for the purposes 2036 * of crash recovery. 2037 */ 2038 if (error == EDEADLK) { 2039 hammer_done_cursor(&cursor); 2040 goto retry; 2041 } 2042 2043 /* 2044 * Cleanup and tell the kernel that the rename succeeded. 2045 * 2046 * NOTE: ip->vp, if non-NULL, cannot be directly referenced 2047 * without formally acquiring the vp since the vp might 2048 * have zero refs on it, or in the middle of a reclaim, 2049 * etc. 2050 */ 2051 hammer_done_cursor(&cursor); 2052 if (error == 0) { 2053 cache_rename(ap->a_fnch, ap->a_tnch); 2054 hammer_knote(ap->a_fdvp, NOTE_WRITE); 2055 hammer_knote(ap->a_tdvp, NOTE_WRITE); 2056 while (ip->vp) { 2057 struct vnode *vp; 2058 2059 error = hammer_get_vnode(ip, &vp); 2060 if (error == 0 && vp) { 2061 vn_unlock(vp); 2062 hammer_knote(ip->vp, NOTE_RENAME); 2063 vrele(vp); 2064 break; 2065 } 2066 kprintf("Debug: HAMMER ip/vp race2 avoided\n"); 2067 } 2068 } 2069 2070 failed: 2071 hammer_done_transaction(&trans); 2072 lwkt_reltoken(&hmp->fs_token); 2073 return (error); 2074 } 2075 2076 /* 2077 * hammer_vop_nrmdir { nch, dvp, cred } 2078 */ 2079 static 2080 int 2081 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 2082 { 2083 struct hammer_transaction trans; 2084 struct hammer_inode *dip; 2085 hammer_mount_t hmp; 2086 int error; 2087 2088 dip = VTOI(ap->a_dvp); 2089 hmp = dip->hmp; 2090 2091 if (hammer_nohistory(dip) == 0 && 2092 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2093 return (error); 2094 } 2095 2096 lwkt_gettoken(&hmp->fs_token); 2097 hammer_start_transaction(&trans, hmp); 2098 ++hammer_stats_file_iopsw; 2099 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 2100 hammer_done_transaction(&trans); 2101 if (error == 0) 2102 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 2103 lwkt_reltoken(&hmp->fs_token); 2104 return (error); 2105 } 2106 2107 /* 2108 * hammer_vop_markatime { vp, cred } 2109 */ 2110 static 2111 int 2112 hammer_vop_markatime(struct vop_markatime_args *ap) 2113 { 2114 struct hammer_transaction trans; 2115 struct hammer_inode *ip; 2116 hammer_mount_t hmp; 2117 2118 ip = VTOI(ap->a_vp); 2119 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2120 return (EROFS); 2121 if (ip->flags & HAMMER_INODE_RO) 2122 return (EROFS); 2123 hmp = ip->hmp; 2124 if (hmp->mp->mnt_flag & MNT_NOATIME) 2125 return (0); 2126 lwkt_gettoken(&hmp->fs_token); 2127 hammer_start_transaction(&trans, hmp); 2128 ++hammer_stats_file_iopsw; 2129 2130 ip->ino_data.atime = trans.time; 2131 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); 2132 hammer_done_transaction(&trans); 2133 hammer_knote(ap->a_vp, NOTE_ATTRIB); 2134 lwkt_reltoken(&hmp->fs_token); 2135 return (0); 2136 } 2137 2138 /* 2139 * hammer_vop_setattr { vp, vap, cred } 2140 */ 2141 static 2142 int 2143 hammer_vop_setattr(struct vop_setattr_args *ap) 2144 { 2145 struct hammer_transaction trans; 2146 struct hammer_inode *ip; 2147 struct vattr *vap; 2148 hammer_mount_t hmp; 2149 int modflags; 2150 int error; 2151 int truncating; 2152 int blksize; 2153 int kflags; 2154 #if 0 2155 int64_t aligned_size; 2156 #endif 2157 u_int32_t flags; 2158 2159 vap = ap->a_vap; 2160 ip = ap->a_vp->v_data; 2161 modflags = 0; 2162 kflags = 0; 2163 hmp = ip->hmp; 2164 2165 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 2166 return(EROFS); 2167 if (ip->flags & HAMMER_INODE_RO) 2168 return (EROFS); 2169 if (hammer_nohistory(ip) == 0 && 2170 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 2171 return (error); 2172 } 2173 2174 lwkt_gettoken(&hmp->fs_token); 2175 hammer_start_transaction(&trans, hmp); 2176 ++hammer_stats_file_iopsw; 2177 error = 0; 2178 2179 if (vap->va_flags != VNOVAL) { 2180 flags = ip->ino_data.uflags; 2181 error = vop_helper_setattr_flags(&flags, vap->va_flags, 2182 hammer_to_unix_xid(&ip->ino_data.uid), 2183 ap->a_cred); 2184 if (error == 0) { 2185 if (ip->ino_data.uflags != flags) { 2186 ip->ino_data.uflags = flags; 2187 ip->ino_data.ctime = trans.time; 2188 modflags |= HAMMER_INODE_DDIRTY; 2189 kflags |= NOTE_ATTRIB; 2190 } 2191 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2192 error = 0; 2193 goto done; 2194 } 2195 } 2196 goto done; 2197 } 2198 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 2199 error = EPERM; 2200 goto done; 2201 } 2202 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 2203 mode_t cur_mode = ip->ino_data.mode; 2204 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2205 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2206 uuid_t uuid_uid; 2207 uuid_t uuid_gid; 2208 2209 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 2210 ap->a_cred, 2211 &cur_uid, &cur_gid, &cur_mode); 2212 if (error == 0) { 2213 hammer_guid_to_uuid(&uuid_uid, cur_uid); 2214 hammer_guid_to_uuid(&uuid_gid, cur_gid); 2215 if (bcmp(&uuid_uid, &ip->ino_data.uid, 2216 sizeof(uuid_uid)) || 2217 bcmp(&uuid_gid, &ip->ino_data.gid, 2218 sizeof(uuid_gid)) || 2219 ip->ino_data.mode != cur_mode 2220 ) { 2221 ip->ino_data.uid = uuid_uid; 2222 ip->ino_data.gid = uuid_gid; 2223 ip->ino_data.mode = cur_mode; 2224 ip->ino_data.ctime = trans.time; 2225 modflags |= HAMMER_INODE_DDIRTY; 2226 } 2227 kflags |= NOTE_ATTRIB; 2228 } 2229 } 2230 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 2231 switch(ap->a_vp->v_type) { 2232 case VREG: 2233 if (vap->va_size == ip->ino_data.size) 2234 break; 2235 2236 /* 2237 * Log the operation if in fast-fsync mode or if 2238 * there are unterminated redo write records present. 2239 * 2240 * The second check is needed so the recovery code 2241 * properly truncates write redos even if nominal 2242 * REDO operations is turned off due to excessive 2243 * writes, because the related records might be 2244 * destroyed and never lay down a TERM_WRITE. 2245 */ 2246 if ((ip->flags & HAMMER_INODE_REDO) || 2247 (ip->flags & HAMMER_INODE_RDIRTY)) { 2248 error = hammer_generate_redo(&trans, ip, 2249 vap->va_size, 2250 HAMMER_REDO_TRUNC, 2251 NULL, 0); 2252 } 2253 blksize = hammer_blocksize(vap->va_size); 2254 2255 /* 2256 * XXX break atomicy, we can deadlock the backend 2257 * if we do not release the lock. Probably not a 2258 * big deal here. 2259 */ 2260 if (vap->va_size < ip->ino_data.size) { 2261 nvtruncbuf(ap->a_vp, vap->va_size, 2262 blksize, 2263 hammer_blockoff(vap->va_size)); 2264 truncating = 1; 2265 kflags |= NOTE_WRITE; 2266 } else { 2267 nvextendbuf(ap->a_vp, 2268 ip->ino_data.size, 2269 vap->va_size, 2270 hammer_blocksize(ip->ino_data.size), 2271 hammer_blocksize(vap->va_size), 2272 hammer_blockoff(ip->ino_data.size), 2273 hammer_blockoff(vap->va_size), 2274 0); 2275 truncating = 0; 2276 kflags |= NOTE_WRITE | NOTE_EXTEND; 2277 } 2278 ip->ino_data.size = vap->va_size; 2279 ip->ino_data.mtime = trans.time; 2280 /* XXX safe to use SDIRTY instead of DDIRTY here? */ 2281 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2282 2283 /* 2284 * On-media truncation is cached in the inode until 2285 * the inode is synchronized. We must immediately 2286 * handle any frontend records. 2287 */ 2288 if (truncating) { 2289 hammer_ip_frontend_trunc(ip, vap->va_size); 2290 #ifdef DEBUG_TRUNCATE 2291 if (HammerTruncIp == NULL) 2292 HammerTruncIp = ip; 2293 #endif 2294 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2295 ip->flags |= HAMMER_INODE_TRUNCATED; 2296 ip->trunc_off = vap->va_size; 2297 #ifdef DEBUG_TRUNCATE 2298 if (ip == HammerTruncIp) 2299 kprintf("truncate1 %016llx\n", 2300 (long long)ip->trunc_off); 2301 #endif 2302 } else if (ip->trunc_off > vap->va_size) { 2303 ip->trunc_off = vap->va_size; 2304 #ifdef DEBUG_TRUNCATE 2305 if (ip == HammerTruncIp) 2306 kprintf("truncate2 %016llx\n", 2307 (long long)ip->trunc_off); 2308 #endif 2309 } else { 2310 #ifdef DEBUG_TRUNCATE 2311 if (ip == HammerTruncIp) 2312 kprintf("truncate3 %016llx (ignored)\n", 2313 (long long)vap->va_size); 2314 #endif 2315 } 2316 } 2317 2318 #if 0 2319 /* 2320 * When truncating, nvtruncbuf() may have cleaned out 2321 * a portion of the last block on-disk in the buffer 2322 * cache. We must clean out any frontend records 2323 * for blocks beyond the new last block. 2324 */ 2325 aligned_size = (vap->va_size + (blksize - 1)) & 2326 ~(int64_t)(blksize - 1); 2327 if (truncating && vap->va_size < aligned_size) { 2328 aligned_size -= blksize; 2329 hammer_ip_frontend_trunc(ip, aligned_size); 2330 } 2331 #endif 2332 break; 2333 case VDATABASE: 2334 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 2335 ip->flags |= HAMMER_INODE_TRUNCATED; 2336 ip->trunc_off = vap->va_size; 2337 } else if (ip->trunc_off > vap->va_size) { 2338 ip->trunc_off = vap->va_size; 2339 } 2340 hammer_ip_frontend_trunc(ip, vap->va_size); 2341 ip->ino_data.size = vap->va_size; 2342 ip->ino_data.mtime = trans.time; 2343 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 2344 kflags |= NOTE_ATTRIB; 2345 break; 2346 default: 2347 error = EINVAL; 2348 goto done; 2349 } 2350 break; 2351 } 2352 if (vap->va_atime.tv_sec != VNOVAL) { 2353 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 2354 modflags |= HAMMER_INODE_ATIME; 2355 kflags |= NOTE_ATTRIB; 2356 } 2357 if (vap->va_mtime.tv_sec != VNOVAL) { 2358 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 2359 modflags |= HAMMER_INODE_MTIME; 2360 kflags |= NOTE_ATTRIB; 2361 } 2362 if (vap->va_mode != (mode_t)VNOVAL) { 2363 mode_t cur_mode = ip->ino_data.mode; 2364 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 2365 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 2366 2367 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 2368 cur_uid, cur_gid, &cur_mode); 2369 if (error == 0 && ip->ino_data.mode != cur_mode) { 2370 ip->ino_data.mode = cur_mode; 2371 ip->ino_data.ctime = trans.time; 2372 modflags |= HAMMER_INODE_DDIRTY; 2373 kflags |= NOTE_ATTRIB; 2374 } 2375 } 2376 done: 2377 if (error == 0) 2378 hammer_modify_inode(&trans, ip, modflags); 2379 hammer_done_transaction(&trans); 2380 hammer_knote(ap->a_vp, kflags); 2381 lwkt_reltoken(&hmp->fs_token); 2382 return (error); 2383 } 2384 2385 /* 2386 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2387 */ 2388 static 2389 int 2390 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2391 { 2392 struct hammer_transaction trans; 2393 struct hammer_inode *dip; 2394 struct hammer_inode *nip; 2395 hammer_record_t record; 2396 struct nchandle *nch; 2397 hammer_mount_t hmp; 2398 int error; 2399 int bytes; 2400 2401 ap->a_vap->va_type = VLNK; 2402 2403 nch = ap->a_nch; 2404 dip = VTOI(ap->a_dvp); 2405 hmp = dip->hmp; 2406 2407 if (dip->flags & HAMMER_INODE_RO) 2408 return (EROFS); 2409 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) 2410 return (error); 2411 2412 /* 2413 * Create a transaction to cover the operations we perform. 2414 */ 2415 lwkt_gettoken(&hmp->fs_token); 2416 hammer_start_transaction(&trans, hmp); 2417 ++hammer_stats_file_iopsw; 2418 2419 /* 2420 * Create a new filesystem object of the requested type. The 2421 * returned inode will be referenced but not locked. 2422 */ 2423 2424 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2425 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2426 NULL, &nip); 2427 if (error) { 2428 hammer_done_transaction(&trans); 2429 *ap->a_vpp = NULL; 2430 lwkt_reltoken(&hmp->fs_token); 2431 return (error); 2432 } 2433 2434 /* 2435 * Add a record representing the symlink. symlink stores the link 2436 * as pure data, not a string, and is no \0 terminated. 2437 */ 2438 if (error == 0) { 2439 bytes = strlen(ap->a_target); 2440 2441 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2442 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2443 } else { 2444 record = hammer_alloc_mem_record(nip, bytes); 2445 record->type = HAMMER_MEM_RECORD_GENERAL; 2446 2447 record->leaf.base.localization = nip->obj_localization + 2448 HAMMER_LOCALIZE_MISC; 2449 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2450 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2451 record->leaf.data_len = bytes; 2452 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2453 bcopy(ap->a_target, record->data->symlink.name, bytes); 2454 error = hammer_ip_add_record(&trans, record); 2455 } 2456 2457 /* 2458 * Set the file size to the length of the link. 2459 */ 2460 if (error == 0) { 2461 nip->ino_data.size = bytes; 2462 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); 2463 } 2464 } 2465 if (error == 0) 2466 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2467 nch->ncp->nc_nlen, nip); 2468 2469 /* 2470 * Finish up. 2471 */ 2472 if (error) { 2473 hammer_rel_inode(nip, 0); 2474 *ap->a_vpp = NULL; 2475 } else { 2476 error = hammer_get_vnode(nip, ap->a_vpp); 2477 hammer_rel_inode(nip, 0); 2478 if (error == 0) { 2479 cache_setunresolved(ap->a_nch); 2480 cache_setvp(ap->a_nch, *ap->a_vpp); 2481 hammer_knote(ap->a_dvp, NOTE_WRITE); 2482 } 2483 } 2484 hammer_done_transaction(&trans); 2485 lwkt_reltoken(&hmp->fs_token); 2486 return (error); 2487 } 2488 2489 /* 2490 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2491 */ 2492 static 2493 int 2494 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2495 { 2496 struct hammer_transaction trans; 2497 struct hammer_inode *dip; 2498 hammer_mount_t hmp; 2499 int error; 2500 2501 dip = VTOI(ap->a_dvp); 2502 hmp = dip->hmp; 2503 2504 if (hammer_nohistory(dip) == 0 && 2505 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2506 return (error); 2507 } 2508 2509 lwkt_gettoken(&hmp->fs_token); 2510 hammer_start_transaction(&trans, hmp); 2511 ++hammer_stats_file_iopsw; 2512 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2513 ap->a_cred, ap->a_flags, -1); 2514 hammer_done_transaction(&trans); 2515 lwkt_reltoken(&hmp->fs_token); 2516 2517 return (error); 2518 } 2519 2520 /* 2521 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2522 */ 2523 static 2524 int 2525 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2526 { 2527 struct hammer_inode *ip = ap->a_vp->v_data; 2528 hammer_mount_t hmp = ip->hmp; 2529 int error; 2530 2531 ++hammer_stats_file_iopsr; 2532 lwkt_gettoken(&hmp->fs_token); 2533 error = hammer_ioctl(ip, ap->a_command, ap->a_data, 2534 ap->a_fflag, ap->a_cred); 2535 lwkt_reltoken(&hmp->fs_token); 2536 return (error); 2537 } 2538 2539 static 2540 int 2541 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2542 { 2543 static const struct mountctl_opt extraopt[] = { 2544 { HMNT_NOHISTORY, "nohistory" }, 2545 { HMNT_MASTERID, "master" }, 2546 { 0, NULL} 2547 2548 }; 2549 struct hammer_mount *hmp; 2550 struct mount *mp; 2551 int usedbytes; 2552 int error; 2553 2554 error = 0; 2555 usedbytes = 0; 2556 mp = ap->a_head.a_ops->head.vv_mount; 2557 KKASSERT(mp->mnt_data != NULL); 2558 hmp = (struct hammer_mount *)mp->mnt_data; 2559 2560 lwkt_gettoken(&hmp->fs_token); 2561 2562 switch(ap->a_op) { 2563 case MOUNTCTL_SET_EXPORT: 2564 if (ap->a_ctllen != sizeof(struct export_args)) 2565 error = EINVAL; 2566 else 2567 error = hammer_vfs_export(mp, ap->a_op, 2568 (const struct export_args *)ap->a_ctl); 2569 break; 2570 case MOUNTCTL_MOUNTFLAGS: 2571 { 2572 /* 2573 * Call standard mountctl VOP function 2574 * so we get user mount flags. 2575 */ 2576 error = vop_stdmountctl(ap); 2577 if (error) 2578 break; 2579 2580 usedbytes = *ap->a_res; 2581 2582 if (usedbytes > 0 && usedbytes < ap->a_buflen) { 2583 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, 2584 ap->a_buf, 2585 ap->a_buflen - usedbytes, 2586 &error); 2587 } 2588 2589 *ap->a_res += usedbytes; 2590 break; 2591 } 2592 default: 2593 error = vop_stdmountctl(ap); 2594 break; 2595 } 2596 lwkt_reltoken(&hmp->fs_token); 2597 return(error); 2598 } 2599 2600 /* 2601 * hammer_vop_strategy { vp, bio } 2602 * 2603 * Strategy call, used for regular file read & write only. Note that the 2604 * bp may represent a cluster. 2605 * 2606 * To simplify operation and allow better optimizations in the future, 2607 * this code does not make any assumptions with regards to buffer alignment 2608 * or size. 2609 */ 2610 static 2611 int 2612 hammer_vop_strategy(struct vop_strategy_args *ap) 2613 { 2614 struct buf *bp; 2615 int error; 2616 2617 bp = ap->a_bio->bio_buf; 2618 2619 switch(bp->b_cmd) { 2620 case BUF_CMD_READ: 2621 error = hammer_vop_strategy_read(ap); 2622 break; 2623 case BUF_CMD_WRITE: 2624 error = hammer_vop_strategy_write(ap); 2625 break; 2626 default: 2627 bp->b_error = error = EINVAL; 2628 bp->b_flags |= B_ERROR; 2629 biodone(ap->a_bio); 2630 break; 2631 } 2632 2633 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */ 2634 2635 return (error); 2636 } 2637 2638 /* 2639 * Read from a regular file. Iterate the related records and fill in the 2640 * BIO/BUF. Gaps are zero-filled. 2641 * 2642 * The support code in hammer_object.c should be used to deal with mixed 2643 * in-memory and on-disk records. 2644 * 2645 * NOTE: Can be called from the cluster code with an oversized buf. 2646 * 2647 * XXX atime update 2648 */ 2649 static 2650 int 2651 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2652 { 2653 struct hammer_transaction trans; 2654 struct hammer_inode *ip; 2655 struct hammer_inode *dip; 2656 hammer_mount_t hmp; 2657 struct hammer_cursor cursor; 2658 hammer_base_elm_t base; 2659 hammer_off_t disk_offset; 2660 struct bio *bio; 2661 struct bio *nbio; 2662 struct buf *bp; 2663 int64_t rec_offset; 2664 int64_t ran_end; 2665 int64_t tmp64; 2666 int error; 2667 int boff; 2668 int roff; 2669 int n; 2670 int isdedupable; 2671 2672 bio = ap->a_bio; 2673 bp = bio->bio_buf; 2674 ip = ap->a_vp->v_data; 2675 hmp = ip->hmp; 2676 2677 /* 2678 * The zone-2 disk offset may have been set by the cluster code via 2679 * a BMAP operation, or else should be NOOFFSET. 2680 * 2681 * Checking the high bits for a match against zone-2 should suffice. 2682 * 2683 * In cases where a lot of data duplication is present it may be 2684 * more beneficial to drop through and doubule-buffer through the 2685 * device. 2686 */ 2687 nbio = push_bio(bio); 2688 if (hammer_double_buffer == 0 && 2689 (nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2690 HAMMER_ZONE_LARGE_DATA) { 2691 lwkt_gettoken(&hmp->fs_token); 2692 error = hammer_io_direct_read(hmp, nbio, NULL); 2693 lwkt_reltoken(&hmp->fs_token); 2694 return (error); 2695 } 2696 2697 /* 2698 * Well, that sucked. Do it the hard way. If all the stars are 2699 * aligned we may still be able to issue a direct-read. 2700 */ 2701 lwkt_gettoken(&hmp->fs_token); 2702 hammer_simple_transaction(&trans, hmp); 2703 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2704 2705 /* 2706 * Key range (begin and end inclusive) to scan. Note that the key's 2707 * stored in the actual records represent BASE+LEN, not BASE. The 2708 * first record containing bio_offset will have a key > bio_offset. 2709 */ 2710 cursor.key_beg.localization = ip->obj_localization + 2711 HAMMER_LOCALIZE_MISC; 2712 cursor.key_beg.obj_id = ip->obj_id; 2713 cursor.key_beg.create_tid = 0; 2714 cursor.key_beg.delete_tid = 0; 2715 cursor.key_beg.obj_type = 0; 2716 cursor.key_beg.key = bio->bio_offset + 1; 2717 cursor.asof = ip->obj_asof; 2718 cursor.flags |= HAMMER_CURSOR_ASOF; 2719 2720 cursor.key_end = cursor.key_beg; 2721 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2722 #if 0 2723 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2724 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2725 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2726 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2727 } else 2728 #endif 2729 { 2730 ran_end = bio->bio_offset + bp->b_bufsize; 2731 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2732 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2733 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2734 if (tmp64 < ran_end) 2735 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2736 else 2737 cursor.key_end.key = ran_end + MAXPHYS + 1; 2738 } 2739 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2740 2741 error = hammer_ip_first(&cursor); 2742 boff = 0; 2743 2744 while (error == 0) { 2745 /* 2746 * Get the base file offset of the record. The key for 2747 * data records is (base + bytes) rather then (base). 2748 */ 2749 base = &cursor.leaf->base; 2750 rec_offset = base->key - cursor.leaf->data_len; 2751 2752 /* 2753 * Calculate the gap, if any, and zero-fill it. 2754 * 2755 * n is the offset of the start of the record verses our 2756 * current seek offset in the bio. 2757 */ 2758 n = (int)(rec_offset - (bio->bio_offset + boff)); 2759 if (n > 0) { 2760 if (n > bp->b_bufsize - boff) 2761 n = bp->b_bufsize - boff; 2762 bzero((char *)bp->b_data + boff, n); 2763 boff += n; 2764 n = 0; 2765 } 2766 2767 /* 2768 * Calculate the data offset in the record and the number 2769 * of bytes we can copy. 2770 * 2771 * There are two degenerate cases. First, boff may already 2772 * be at bp->b_bufsize. Secondly, the data offset within 2773 * the record may exceed the record's size. 2774 */ 2775 roff = -n; 2776 rec_offset += roff; 2777 n = cursor.leaf->data_len - roff; 2778 if (n <= 0) { 2779 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2780 n = 0; 2781 } else if (n > bp->b_bufsize - boff) { 2782 n = bp->b_bufsize - boff; 2783 } 2784 2785 /* 2786 * Deal with cached truncations. This cool bit of code 2787 * allows truncate()/ftruncate() to avoid having to sync 2788 * the file. 2789 * 2790 * If the frontend is truncated then all backend records are 2791 * subject to the frontend's truncation. 2792 * 2793 * If the backend is truncated then backend records on-disk 2794 * (but not in-memory) are subject to the backend's 2795 * truncation. In-memory records owned by the backend 2796 * represent data written after the truncation point on the 2797 * backend and must not be truncated. 2798 * 2799 * Truncate operations deal with frontend buffer cache 2800 * buffers and frontend-owned in-memory records synchronously. 2801 */ 2802 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2803 if (hammer_cursor_ondisk(&cursor)/* || 2804 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) { 2805 if (ip->trunc_off <= rec_offset) 2806 n = 0; 2807 else if (ip->trunc_off < rec_offset + n) 2808 n = (int)(ip->trunc_off - rec_offset); 2809 } 2810 } 2811 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2812 if (hammer_cursor_ondisk(&cursor)) { 2813 if (ip->sync_trunc_off <= rec_offset) 2814 n = 0; 2815 else if (ip->sync_trunc_off < rec_offset + n) 2816 n = (int)(ip->sync_trunc_off - rec_offset); 2817 } 2818 } 2819 2820 /* 2821 * Try to issue a direct read into our bio if possible, 2822 * otherwise resolve the element data into a hammer_buffer 2823 * and copy. 2824 * 2825 * The buffer on-disk should be zerod past any real 2826 * truncation point, but may not be for any synthesized 2827 * truncation point from above. 2828 */ 2829 disk_offset = cursor.leaf->data_offset + roff; 2830 isdedupable = (boff == 0 && n == bp->b_bufsize && 2831 hammer_cursor_ondisk(&cursor) && 2832 ((int)disk_offset & HAMMER_BUFMASK) == 0); 2833 2834 if (isdedupable && hammer_double_buffer == 0) { 2835 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2836 HAMMER_ZONE_LARGE_DATA); 2837 nbio->bio_offset = disk_offset; 2838 error = hammer_io_direct_read(hmp, nbio, cursor.leaf); 2839 if (hammer_live_dedup && error == 0) 2840 hammer_dedup_cache_add(ip, cursor.leaf); 2841 goto done; 2842 } else if (n) { 2843 error = hammer_ip_resolve_data(&cursor); 2844 if (error == 0) { 2845 if (hammer_live_dedup && isdedupable) 2846 hammer_dedup_cache_add(ip, cursor.leaf); 2847 bcopy((char *)cursor.data + roff, 2848 (char *)bp->b_data + boff, n); 2849 } 2850 } 2851 if (error) 2852 break; 2853 2854 /* 2855 * We have to be sure that the only elements added to the 2856 * dedup cache are those which are already on-media. 2857 */ 2858 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor)) 2859 hammer_dedup_cache_add(ip, cursor.leaf); 2860 2861 /* 2862 * Iterate until we have filled the request. 2863 */ 2864 boff += n; 2865 if (boff == bp->b_bufsize) 2866 break; 2867 error = hammer_ip_next(&cursor); 2868 } 2869 2870 /* 2871 * There may have been a gap after the last record 2872 */ 2873 if (error == ENOENT) 2874 error = 0; 2875 if (error == 0 && boff != bp->b_bufsize) { 2876 KKASSERT(boff < bp->b_bufsize); 2877 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2878 /* boff = bp->b_bufsize; */ 2879 } 2880 bp->b_resid = 0; 2881 bp->b_error = error; 2882 if (error) 2883 bp->b_flags |= B_ERROR; 2884 biodone(ap->a_bio); 2885 2886 done: 2887 /* 2888 * Cache the b-tree node for the last data read in cache[1]. 2889 * 2890 * If we hit the file EOF then also cache the node in the 2891 * governing director's cache[3], it will be used to initialize 2892 * the inode's cache[1] for any inodes looked up via the directory. 2893 * 2894 * This doesn't reduce disk accesses since the B-Tree chain is 2895 * likely cached, but it does reduce cpu overhead when looking 2896 * up file offsets for cpdup/tar/cpio style iterations. 2897 */ 2898 if (cursor.node) 2899 hammer_cache_node(&ip->cache[1], cursor.node); 2900 if (ran_end >= ip->ino_data.size) { 2901 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2902 ip->obj_asof, ip->obj_localization); 2903 if (dip) { 2904 hammer_cache_node(&dip->cache[3], cursor.node); 2905 hammer_rel_inode(dip, 0); 2906 } 2907 } 2908 hammer_done_cursor(&cursor); 2909 hammer_done_transaction(&trans); 2910 lwkt_reltoken(&hmp->fs_token); 2911 return(error); 2912 } 2913 2914 /* 2915 * BMAP operation - used to support cluster_read() only. 2916 * 2917 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2918 * 2919 * This routine may return EOPNOTSUPP if the opration is not supported for 2920 * the specified offset. The contents of the pointer arguments do not 2921 * need to be initialized in that case. 2922 * 2923 * If a disk address is available and properly aligned return 0 with 2924 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2925 * to the run-length relative to that offset. Callers may assume that 2926 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2927 * large, so return EOPNOTSUPP if it is not sufficiently large. 2928 */ 2929 static 2930 int 2931 hammer_vop_bmap(struct vop_bmap_args *ap) 2932 { 2933 struct hammer_transaction trans; 2934 struct hammer_inode *ip; 2935 hammer_mount_t hmp; 2936 struct hammer_cursor cursor; 2937 hammer_base_elm_t base; 2938 int64_t rec_offset; 2939 int64_t ran_end; 2940 int64_t tmp64; 2941 int64_t base_offset; 2942 int64_t base_disk_offset; 2943 int64_t last_offset; 2944 hammer_off_t last_disk_offset; 2945 hammer_off_t disk_offset; 2946 int rec_len; 2947 int error; 2948 int blksize; 2949 2950 ++hammer_stats_file_iopsr; 2951 ip = ap->a_vp->v_data; 2952 hmp = ip->hmp; 2953 2954 /* 2955 * We can only BMAP regular files. We can't BMAP database files, 2956 * directories, etc. 2957 */ 2958 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2959 return(EOPNOTSUPP); 2960 2961 /* 2962 * bmap is typically called with runp/runb both NULL when used 2963 * for writing. We do not support BMAP for writing atm. 2964 */ 2965 if (ap->a_cmd != BUF_CMD_READ) 2966 return(EOPNOTSUPP); 2967 2968 /* 2969 * Scan the B-Tree to acquire blockmap addresses, then translate 2970 * to raw addresses. 2971 */ 2972 lwkt_gettoken(&hmp->fs_token); 2973 hammer_simple_transaction(&trans, hmp); 2974 #if 0 2975 kprintf("bmap_beg %016llx ip->cache %p\n", 2976 (long long)ap->a_loffset, ip->cache[1]); 2977 #endif 2978 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2979 2980 /* 2981 * Key range (begin and end inclusive) to scan. Note that the key's 2982 * stored in the actual records represent BASE+LEN, not BASE. The 2983 * first record containing bio_offset will have a key > bio_offset. 2984 */ 2985 cursor.key_beg.localization = ip->obj_localization + 2986 HAMMER_LOCALIZE_MISC; 2987 cursor.key_beg.obj_id = ip->obj_id; 2988 cursor.key_beg.create_tid = 0; 2989 cursor.key_beg.delete_tid = 0; 2990 cursor.key_beg.obj_type = 0; 2991 if (ap->a_runb) 2992 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 2993 else 2994 cursor.key_beg.key = ap->a_loffset + 1; 2995 if (cursor.key_beg.key < 0) 2996 cursor.key_beg.key = 0; 2997 cursor.asof = ip->obj_asof; 2998 cursor.flags |= HAMMER_CURSOR_ASOF; 2999 3000 cursor.key_end = cursor.key_beg; 3001 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 3002 3003 ran_end = ap->a_loffset + MAXPHYS; 3004 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 3005 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 3006 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 3007 if (tmp64 < ran_end) 3008 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 3009 else 3010 cursor.key_end.key = ran_end + MAXPHYS + 1; 3011 3012 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 3013 3014 error = hammer_ip_first(&cursor); 3015 base_offset = last_offset = 0; 3016 base_disk_offset = last_disk_offset = 0; 3017 3018 while (error == 0) { 3019 /* 3020 * Get the base file offset of the record. The key for 3021 * data records is (base + bytes) rather then (base). 3022 * 3023 * NOTE: rec_offset + rec_len may exceed the end-of-file. 3024 * The extra bytes should be zero on-disk and the BMAP op 3025 * should still be ok. 3026 */ 3027 base = &cursor.leaf->base; 3028 rec_offset = base->key - cursor.leaf->data_len; 3029 rec_len = cursor.leaf->data_len; 3030 3031 /* 3032 * Incorporate any cached truncation. 3033 * 3034 * NOTE: Modifications to rec_len based on synthesized 3035 * truncation points remove the guarantee that any extended 3036 * data on disk is zero (since the truncations may not have 3037 * taken place on-media yet). 3038 */ 3039 if (ip->flags & HAMMER_INODE_TRUNCATED) { 3040 if (hammer_cursor_ondisk(&cursor) || 3041 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 3042 if (ip->trunc_off <= rec_offset) 3043 rec_len = 0; 3044 else if (ip->trunc_off < rec_offset + rec_len) 3045 rec_len = (int)(ip->trunc_off - rec_offset); 3046 } 3047 } 3048 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 3049 if (hammer_cursor_ondisk(&cursor)) { 3050 if (ip->sync_trunc_off <= rec_offset) 3051 rec_len = 0; 3052 else if (ip->sync_trunc_off < rec_offset + rec_len) 3053 rec_len = (int)(ip->sync_trunc_off - rec_offset); 3054 } 3055 } 3056 3057 /* 3058 * Accumulate information. If we have hit a discontiguous 3059 * block reset base_offset unless we are already beyond the 3060 * requested offset. If we are, that's it, we stop. 3061 */ 3062 if (error) 3063 break; 3064 if (hammer_cursor_ondisk(&cursor)) { 3065 disk_offset = cursor.leaf->data_offset; 3066 if (rec_offset != last_offset || 3067 disk_offset != last_disk_offset) { 3068 if (rec_offset > ap->a_loffset) 3069 break; 3070 base_offset = rec_offset; 3071 base_disk_offset = disk_offset; 3072 } 3073 last_offset = rec_offset + rec_len; 3074 last_disk_offset = disk_offset + rec_len; 3075 3076 if (hammer_live_dedup) 3077 hammer_dedup_cache_add(ip, cursor.leaf); 3078 } 3079 3080 error = hammer_ip_next(&cursor); 3081 } 3082 3083 #if 0 3084 kprintf("BMAP %016llx: %016llx - %016llx\n", 3085 (long long)ap->a_loffset, 3086 (long long)base_offset, 3087 (long long)last_offset); 3088 kprintf("BMAP %16s: %016llx - %016llx\n", "", 3089 (long long)base_disk_offset, 3090 (long long)last_disk_offset); 3091 #endif 3092 3093 if (cursor.node) { 3094 hammer_cache_node(&ip->cache[1], cursor.node); 3095 #if 0 3096 kprintf("bmap_end2 %016llx ip->cache %p\n", 3097 (long long)ap->a_loffset, ip->cache[1]); 3098 #endif 3099 } 3100 hammer_done_cursor(&cursor); 3101 hammer_done_transaction(&trans); 3102 lwkt_reltoken(&hmp->fs_token); 3103 3104 /* 3105 * If we couldn't find any records or the records we did find were 3106 * all behind the requested offset, return failure. A forward 3107 * truncation can leave a hole w/ no on-disk records. 3108 */ 3109 if (last_offset == 0 || last_offset < ap->a_loffset) 3110 return (EOPNOTSUPP); 3111 3112 /* 3113 * Figure out the block size at the requested offset and adjust 3114 * our limits so the cluster_read() does not create inappropriately 3115 * sized buffer cache buffers. 3116 */ 3117 blksize = hammer_blocksize(ap->a_loffset); 3118 if (hammer_blocksize(base_offset) != blksize) { 3119 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 3120 } 3121 if (last_offset != ap->a_loffset && 3122 hammer_blocksize(last_offset - 1) != blksize) { 3123 last_offset = hammer_blockdemarc(ap->a_loffset, 3124 last_offset - 1); 3125 } 3126 3127 /* 3128 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 3129 * from occuring. 3130 */ 3131 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 3132 3133 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 3134 /* 3135 * Only large-data zones can be direct-IOd 3136 */ 3137 error = EOPNOTSUPP; 3138 } else if ((disk_offset & HAMMER_BUFMASK) || 3139 (last_offset - ap->a_loffset) < blksize) { 3140 /* 3141 * doffsetp is not aligned or the forward run size does 3142 * not cover a whole buffer, disallow the direct I/O. 3143 */ 3144 error = EOPNOTSUPP; 3145 } else { 3146 /* 3147 * We're good. 3148 */ 3149 *ap->a_doffsetp = disk_offset; 3150 if (ap->a_runb) { 3151 *ap->a_runb = ap->a_loffset - base_offset; 3152 KKASSERT(*ap->a_runb >= 0); 3153 } 3154 if (ap->a_runp) { 3155 *ap->a_runp = last_offset - ap->a_loffset; 3156 KKASSERT(*ap->a_runp >= 0); 3157 } 3158 error = 0; 3159 } 3160 return(error); 3161 } 3162 3163 /* 3164 * Write to a regular file. Because this is a strategy call the OS is 3165 * trying to actually get data onto the media. 3166 */ 3167 static 3168 int 3169 hammer_vop_strategy_write(struct vop_strategy_args *ap) 3170 { 3171 hammer_record_t record; 3172 hammer_mount_t hmp; 3173 hammer_inode_t ip; 3174 struct bio *bio; 3175 struct buf *bp; 3176 int blksize; 3177 int bytes; 3178 int error; 3179 3180 bio = ap->a_bio; 3181 bp = bio->bio_buf; 3182 ip = ap->a_vp->v_data; 3183 hmp = ip->hmp; 3184 3185 blksize = hammer_blocksize(bio->bio_offset); 3186 KKASSERT(bp->b_bufsize == blksize); 3187 3188 if (ip->flags & HAMMER_INODE_RO) { 3189 bp->b_error = EROFS; 3190 bp->b_flags |= B_ERROR; 3191 biodone(ap->a_bio); 3192 return(EROFS); 3193 } 3194 3195 lwkt_gettoken(&hmp->fs_token); 3196 3197 /* 3198 * Interlock with inode destruction (no in-kernel or directory 3199 * topology visibility). If we queue new IO while trying to 3200 * destroy the inode we can deadlock the vtrunc call in 3201 * hammer_inode_unloadable_check(). 3202 * 3203 * Besides, there's no point flushing a bp associated with an 3204 * inode that is being destroyed on-media and has no kernel 3205 * references. 3206 */ 3207 if ((ip->flags | ip->sync_flags) & 3208 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 3209 bp->b_resid = 0; 3210 biodone(ap->a_bio); 3211 lwkt_reltoken(&hmp->fs_token); 3212 return(0); 3213 } 3214 3215 /* 3216 * Reserve space and issue a direct-write from the front-end. 3217 * NOTE: The direct_io code will hammer_bread/bcopy smaller 3218 * allocations. 3219 * 3220 * An in-memory record will be installed to reference the storage 3221 * until the flusher can get to it. 3222 * 3223 * Since we own the high level bio the front-end will not try to 3224 * do a direct-read until the write completes. 3225 * 3226 * NOTE: The only time we do not reserve a full-sized buffers 3227 * worth of data is if the file is small. We do not try to 3228 * allocate a fragment (from the small-data zone) at the end of 3229 * an otherwise large file as this can lead to wildly separated 3230 * data. 3231 */ 3232 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 3233 KKASSERT(bio->bio_offset < ip->ino_data.size); 3234 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 3235 bytes = bp->b_bufsize; 3236 else 3237 bytes = ((int)ip->ino_data.size + 15) & ~15; 3238 3239 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 3240 bytes, &error); 3241 3242 /* 3243 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated 3244 * in hammer_vop_write(). We must flag the record so the proper 3245 * REDO_TERM_WRITE entry is generated during the flush. 3246 */ 3247 if (record) { 3248 if (bp->b_flags & B_VFSFLAG1) { 3249 record->flags |= HAMMER_RECF_REDO; 3250 bp->b_flags &= ~B_VFSFLAG1; 3251 } 3252 if (record->flags & HAMMER_RECF_DEDUPED) { 3253 bp->b_resid = 0; 3254 hammer_ip_replace_bulk(hmp, record); 3255 biodone(ap->a_bio); 3256 } else { 3257 hammer_io_direct_write(hmp, bio, record); 3258 } 3259 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 3260 hammer_flush_inode(ip, 0); 3261 } else { 3262 bp->b_bio2.bio_offset = NOOFFSET; 3263 bp->b_error = error; 3264 bp->b_flags |= B_ERROR; 3265 biodone(ap->a_bio); 3266 } 3267 lwkt_reltoken(&hmp->fs_token); 3268 return(error); 3269 } 3270 3271 /* 3272 * dounlink - disconnect a directory entry 3273 * 3274 * XXX whiteout support not really in yet 3275 */ 3276 static int 3277 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 3278 struct vnode *dvp, struct ucred *cred, 3279 int flags, int isdir) 3280 { 3281 struct namecache *ncp; 3282 hammer_inode_t dip; 3283 hammer_inode_t ip; 3284 hammer_mount_t hmp; 3285 struct hammer_cursor cursor; 3286 int64_t namekey; 3287 u_int32_t max_iterations; 3288 int nlen, error; 3289 3290 /* 3291 * Calculate the namekey and setup the key range for the scan. This 3292 * works kinda like a chained hash table where the lower 32 bits 3293 * of the namekey synthesize the chain. 3294 * 3295 * The key range is inclusive of both key_beg and key_end. 3296 */ 3297 dip = VTOI(dvp); 3298 ncp = nch->ncp; 3299 hmp = dip->hmp; 3300 3301 if (dip->flags & HAMMER_INODE_RO) 3302 return (EROFS); 3303 3304 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 3305 &max_iterations); 3306 retry: 3307 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 3308 cursor.key_beg.localization = dip->obj_localization + 3309 hammer_dir_localization(dip); 3310 cursor.key_beg.obj_id = dip->obj_id; 3311 cursor.key_beg.key = namekey; 3312 cursor.key_beg.create_tid = 0; 3313 cursor.key_beg.delete_tid = 0; 3314 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 3315 cursor.key_beg.obj_type = 0; 3316 3317 cursor.key_end = cursor.key_beg; 3318 cursor.key_end.key += max_iterations; 3319 cursor.asof = dip->obj_asof; 3320 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 3321 3322 /* 3323 * Scan all matching records (the chain), locate the one matching 3324 * the requested path component. info->last_error contains the 3325 * error code on search termination and could be 0, ENOENT, or 3326 * something else. 3327 * 3328 * The hammer_ip_*() functions merge in-memory records with on-disk 3329 * records for the purposes of the search. 3330 */ 3331 error = hammer_ip_first(&cursor); 3332 3333 while (error == 0) { 3334 error = hammer_ip_resolve_data(&cursor); 3335 if (error) 3336 break; 3337 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 3338 KKASSERT(nlen > 0); 3339 if (ncp->nc_nlen == nlen && 3340 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 3341 break; 3342 } 3343 error = hammer_ip_next(&cursor); 3344 } 3345 3346 /* 3347 * If all is ok we have to get the inode so we can adjust nlinks. 3348 * To avoid a deadlock with the flusher we must release the inode 3349 * lock on the directory when acquiring the inode for the entry. 3350 * 3351 * If the target is a directory, it must be empty. 3352 */ 3353 if (error == 0) { 3354 hammer_unlock(&cursor.ip->lock); 3355 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 3356 hmp->asof, 3357 cursor.data->entry.localization, 3358 0, &error); 3359 hammer_lock_sh(&cursor.ip->lock); 3360 if (error == ENOENT) { 3361 kprintf("HAMMER: WARNING: Removing " 3362 "dirent w/missing inode \"%s\"\n" 3363 "\tobj_id = %016llx\n", 3364 ncp->nc_name, 3365 (long long)cursor.data->entry.obj_id); 3366 error = 0; 3367 } 3368 3369 /* 3370 * If isdir >= 0 we validate that the entry is or is not a 3371 * directory. If isdir < 0 we don't care. 3372 */ 3373 if (error == 0 && isdir >= 0 && ip) { 3374 if (isdir && 3375 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 3376 error = ENOTDIR; 3377 } else if (isdir == 0 && 3378 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 3379 error = EISDIR; 3380 } 3381 } 3382 3383 /* 3384 * If we are trying to remove a directory the directory must 3385 * be empty. 3386 * 3387 * The check directory code can loop and deadlock/retry. Our 3388 * own cursor's node locks must be released to avoid a 3-way 3389 * deadlock with the flusher if the check directory code 3390 * blocks. 3391 * 3392 * If any changes whatsoever have been made to the cursor 3393 * set EDEADLK and retry. 3394 * 3395 * WARNING: See warnings in hammer_unlock_cursor() 3396 * function. 3397 */ 3398 if (error == 0 && ip && ip->ino_data.obj_type == 3399 HAMMER_OBJTYPE_DIRECTORY) { 3400 hammer_unlock_cursor(&cursor); 3401 error = hammer_ip_check_directory_empty(trans, ip); 3402 hammer_lock_cursor(&cursor); 3403 if (cursor.flags & HAMMER_CURSOR_RETEST) { 3404 kprintf("HAMMER: Warning: avoided deadlock " 3405 "on rmdir '%s'\n", 3406 ncp->nc_name); 3407 error = EDEADLK; 3408 } 3409 } 3410 3411 /* 3412 * Delete the directory entry. 3413 * 3414 * WARNING: hammer_ip_del_directory() may have to terminate 3415 * the cursor to avoid a deadlock. It is ok to call 3416 * hammer_done_cursor() twice. 3417 */ 3418 if (error == 0) { 3419 error = hammer_ip_del_directory(trans, &cursor, 3420 dip, ip); 3421 } 3422 hammer_done_cursor(&cursor); 3423 if (error == 0) { 3424 cache_setunresolved(nch); 3425 cache_setvp(nch, NULL); 3426 3427 /* 3428 * NOTE: ip->vp, if non-NULL, cannot be directly 3429 * referenced without formally acquiring the 3430 * vp since the vp might have zero refs on it, 3431 * or in the middle of a reclaim, etc. 3432 * 3433 * NOTE: The cache_setunresolved() can rip the vp 3434 * out from under us since the vp may not have 3435 * any refs, in which case ip->vp will be NULL 3436 * from the outset. 3437 */ 3438 while (ip && ip->vp) { 3439 struct vnode *vp; 3440 3441 error = hammer_get_vnode(ip, &vp); 3442 if (error == 0 && vp) { 3443 vn_unlock(vp); 3444 hammer_knote(ip->vp, NOTE_DELETE); 3445 cache_inval_vp(ip->vp, CINV_DESTROY); 3446 vrele(vp); 3447 break; 3448 } 3449 kprintf("Debug: HAMMER ip/vp race1 avoided\n"); 3450 } 3451 } 3452 if (ip) 3453 hammer_rel_inode(ip, 0); 3454 } else { 3455 hammer_done_cursor(&cursor); 3456 } 3457 if (error == EDEADLK) 3458 goto retry; 3459 3460 return (error); 3461 } 3462 3463 /************************************************************************ 3464 * FIFO AND SPECFS OPS * 3465 ************************************************************************ 3466 * 3467 */ 3468 static int 3469 hammer_vop_fifoclose (struct vop_close_args *ap) 3470 { 3471 /* XXX update itimes */ 3472 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 3473 } 3474 3475 static int 3476 hammer_vop_fiforead (struct vop_read_args *ap) 3477 { 3478 int error; 3479 3480 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3481 /* XXX update access time */ 3482 return (error); 3483 } 3484 3485 static int 3486 hammer_vop_fifowrite (struct vop_write_args *ap) 3487 { 3488 int error; 3489 3490 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3491 /* XXX update access time */ 3492 return (error); 3493 } 3494 3495 static 3496 int 3497 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 3498 { 3499 int error; 3500 3501 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 3502 if (error) 3503 error = hammer_vop_kqfilter(ap); 3504 return(error); 3505 } 3506 3507 /************************************************************************ 3508 * KQFILTER OPS * 3509 ************************************************************************ 3510 * 3511 */ 3512 static void filt_hammerdetach(struct knote *kn); 3513 static int filt_hammerread(struct knote *kn, long hint); 3514 static int filt_hammerwrite(struct knote *kn, long hint); 3515 static int filt_hammervnode(struct knote *kn, long hint); 3516 3517 static struct filterops hammerread_filtops = 3518 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }; 3519 static struct filterops hammerwrite_filtops = 3520 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite }; 3521 static struct filterops hammervnode_filtops = 3522 { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode }; 3523 3524 static 3525 int 3526 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3527 { 3528 struct vnode *vp = ap->a_vp; 3529 struct knote *kn = ap->a_kn; 3530 3531 switch (kn->kn_filter) { 3532 case EVFILT_READ: 3533 kn->kn_fop = &hammerread_filtops; 3534 break; 3535 case EVFILT_WRITE: 3536 kn->kn_fop = &hammerwrite_filtops; 3537 break; 3538 case EVFILT_VNODE: 3539 kn->kn_fop = &hammervnode_filtops; 3540 break; 3541 default: 3542 return (EOPNOTSUPP); 3543 } 3544 3545 kn->kn_hook = (caddr_t)vp; 3546 3547 knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3548 3549 return(0); 3550 } 3551 3552 static void 3553 filt_hammerdetach(struct knote *kn) 3554 { 3555 struct vnode *vp = (void *)kn->kn_hook; 3556 3557 knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); 3558 } 3559 3560 static int 3561 filt_hammerread(struct knote *kn, long hint) 3562 { 3563 struct vnode *vp = (void *)kn->kn_hook; 3564 hammer_inode_t ip = VTOI(vp); 3565 hammer_mount_t hmp = ip->hmp; 3566 off_t off; 3567 3568 if (hint == NOTE_REVOKE) { 3569 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3570 return(1); 3571 } 3572 lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ 3573 off = ip->ino_data.size - kn->kn_fp->f_offset; 3574 kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; 3575 lwkt_reltoken(&hmp->fs_token); 3576 if (kn->kn_sfflags & NOTE_OLDAPI) 3577 return(1); 3578 return (kn->kn_data != 0); 3579 } 3580 3581 static int 3582 filt_hammerwrite(struct knote *kn, long hint) 3583 { 3584 if (hint == NOTE_REVOKE) 3585 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3586 kn->kn_data = 0; 3587 return (1); 3588 } 3589 3590 static int 3591 filt_hammervnode(struct knote *kn, long hint) 3592 { 3593 if (kn->kn_sfflags & hint) 3594 kn->kn_fflags |= hint; 3595 if (hint == NOTE_REVOKE) { 3596 kn->kn_flags |= EV_EOF; 3597 return (1); 3598 } 3599 return (kn->kn_fflags != 0); 3600 } 3601 3602