1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vfs/fifofs/fifo.h> 50 #include "hammer.h" 51 52 /* 53 * USERFS VNOPS 54 */ 55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 56 static int hammer_vop_fsync(struct vop_fsync_args *); 57 static int hammer_vop_read(struct vop_read_args *); 58 static int hammer_vop_write(struct vop_write_args *); 59 static int hammer_vop_access(struct vop_access_args *); 60 static int hammer_vop_advlock(struct vop_advlock_args *); 61 static int hammer_vop_close(struct vop_close_args *); 62 static int hammer_vop_ncreate(struct vop_ncreate_args *); 63 static int hammer_vop_getattr(struct vop_getattr_args *); 64 static int hammer_vop_nresolve(struct vop_nresolve_args *); 65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 66 static int hammer_vop_nlink(struct vop_nlink_args *); 67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 68 static int hammer_vop_nmknod(struct vop_nmknod_args *); 69 static int hammer_vop_open(struct vop_open_args *); 70 static int hammer_vop_print(struct vop_print_args *); 71 static int hammer_vop_readdir(struct vop_readdir_args *); 72 static int hammer_vop_readlink(struct vop_readlink_args *); 73 static int hammer_vop_nremove(struct vop_nremove_args *); 74 static int hammer_vop_nrename(struct vop_nrename_args *); 75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 76 static int hammer_vop_markatime(struct vop_markatime_args *); 77 static int hammer_vop_setattr(struct vop_setattr_args *); 78 static int hammer_vop_strategy(struct vop_strategy_args *); 79 static int 
hammer_vop_bmap(struct vop_bmap_args *ap); 80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 82 static int hammer_vop_ioctl(struct vop_ioctl_args *); 83 static int hammer_vop_mountctl(struct vop_mountctl_args *); 84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 85 86 static int hammer_vop_fifoclose (struct vop_close_args *); 87 static int hammer_vop_fiforead (struct vop_read_args *); 88 static int hammer_vop_fifowrite (struct vop_write_args *); 89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 90 91 static int hammer_vop_specclose (struct vop_close_args *); 92 static int hammer_vop_specread (struct vop_read_args *); 93 static int hammer_vop_specwrite (struct vop_write_args *); 94 95 struct vop_ops hammer_vnode_vops = { 96 .vop_default = vop_defaultop, 97 .vop_fsync = hammer_vop_fsync, 98 .vop_getpages = vop_stdgetpages, 99 .vop_putpages = vop_stdputpages, 100 .vop_read = hammer_vop_read, 101 .vop_write = hammer_vop_write, 102 .vop_access = hammer_vop_access, 103 .vop_advlock = hammer_vop_advlock, 104 .vop_close = hammer_vop_close, 105 .vop_ncreate = hammer_vop_ncreate, 106 .vop_getattr = hammer_vop_getattr, 107 .vop_inactive = hammer_vop_inactive, 108 .vop_reclaim = hammer_vop_reclaim, 109 .vop_nresolve = hammer_vop_nresolve, 110 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 111 .vop_nlink = hammer_vop_nlink, 112 .vop_nmkdir = hammer_vop_nmkdir, 113 .vop_nmknod = hammer_vop_nmknod, 114 .vop_open = hammer_vop_open, 115 .vop_pathconf = vop_stdpathconf, 116 .vop_print = hammer_vop_print, 117 .vop_readdir = hammer_vop_readdir, 118 .vop_readlink = hammer_vop_readlink, 119 .vop_nremove = hammer_vop_nremove, 120 .vop_nrename = hammer_vop_nrename, 121 .vop_nrmdir = hammer_vop_nrmdir, 122 .vop_markatime = hammer_vop_markatime, 123 .vop_setattr = hammer_vop_setattr, 124 .vop_bmap = hammer_vop_bmap, 125 .vop_strategy = hammer_vop_strategy, 126 .vop_nsymlink = 
hammer_vop_nsymlink, 127 .vop_nwhiteout = hammer_vop_nwhiteout, 128 .vop_ioctl = hammer_vop_ioctl, 129 .vop_mountctl = hammer_vop_mountctl, 130 .vop_kqfilter = hammer_vop_kqfilter 131 }; 132 133 struct vop_ops hammer_spec_vops = { 134 .vop_default = spec_vnoperate, 135 .vop_fsync = hammer_vop_fsync, 136 .vop_read = hammer_vop_specread, 137 .vop_write = hammer_vop_specwrite, 138 .vop_access = hammer_vop_access, 139 .vop_close = hammer_vop_specclose, 140 .vop_markatime = hammer_vop_markatime, 141 .vop_getattr = hammer_vop_getattr, 142 .vop_inactive = hammer_vop_inactive, 143 .vop_reclaim = hammer_vop_reclaim, 144 .vop_setattr = hammer_vop_setattr 145 }; 146 147 struct vop_ops hammer_fifo_vops = { 148 .vop_default = fifo_vnoperate, 149 .vop_fsync = hammer_vop_fsync, 150 .vop_read = hammer_vop_fiforead, 151 .vop_write = hammer_vop_fifowrite, 152 .vop_access = hammer_vop_access, 153 .vop_close = hammer_vop_fifoclose, 154 .vop_markatime = hammer_vop_markatime, 155 .vop_getattr = hammer_vop_getattr, 156 .vop_inactive = hammer_vop_inactive, 157 .vop_reclaim = hammer_vop_reclaim, 158 .vop_setattr = hammer_vop_setattr, 159 .vop_kqfilter = hammer_vop_fifokqfilter 160 }; 161 162 static __inline 163 void 164 hammer_knote(struct vnode *vp, int flags) 165 { 166 if (flags) 167 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags); 168 } 169 170 #ifdef DEBUG_TRUNCATE 171 struct hammer_inode *HammerTruncIp; 172 #endif 173 174 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 175 struct vnode *dvp, struct ucred *cred, 176 int flags, int isdir); 177 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 178 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 179 180 #if 0 181 static 182 int 183 hammer_vop_vnoperate(struct vop_generic_args *) 184 { 185 return (VOCALL(&hammer_vnode_vops, ap)); 186 } 187 #endif 188 189 /* 190 * hammer_vop_fsync { vp, waitfor } 191 * 192 * fsync() an inode to disk and wait for it to be completely 
committed 193 * such that the information would not be undone if a crash occured after 194 * return. 195 */ 196 static 197 int 198 hammer_vop_fsync(struct vop_fsync_args *ap) 199 { 200 hammer_inode_t ip = VTOI(ap->a_vp); 201 202 ++hammer_count_fsyncs; 203 vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL); 204 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 205 if (ap->a_waitfor == MNT_WAIT) { 206 vn_unlock(ap->a_vp); 207 hammer_wait_inode(ip); 208 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 209 } 210 return (ip->error); 211 } 212 213 /* 214 * hammer_vop_read { vp, uio, ioflag, cred } 215 */ 216 static 217 int 218 hammer_vop_read(struct vop_read_args *ap) 219 { 220 struct hammer_transaction trans; 221 hammer_inode_t ip; 222 off_t offset; 223 struct buf *bp; 224 struct uio *uio; 225 int error; 226 int n; 227 int seqcount; 228 int ioseqcount; 229 int blksize; 230 231 if (ap->a_vp->v_type != VREG) 232 return (EINVAL); 233 ip = VTOI(ap->a_vp); 234 error = 0; 235 uio = ap->a_uio; 236 237 /* 238 * Allow the UIO's size to override the sequential heuristic. 239 */ 240 blksize = hammer_blocksize(uio->uio_offset); 241 seqcount = (uio->uio_resid + (blksize - 1)) / blksize; 242 ioseqcount = ap->a_ioflag >> 16; 243 if (seqcount < ioseqcount) 244 seqcount = ioseqcount; 245 246 hammer_start_transaction(&trans, ip->hmp); 247 248 /* 249 * Access the data typically in HAMMER_BUFSIZE blocks via the 250 * buffer cache, but HAMMER may use a variable block size based 251 * on the offset. 252 */ 253 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 254 int64_t base_offset; 255 int64_t file_limit; 256 257 blksize = hammer_blocksize(uio->uio_offset); 258 offset = (int)uio->uio_offset & (blksize - 1); 259 base_offset = uio->uio_offset - offset; 260 261 if (hammer_cluster_enable) { 262 /* 263 * Use file_limit to prevent cluster_read() from 264 * creating buffers of the wrong block size past 265 * the demarc. 
266 */ 267 file_limit = ip->ino_data.size; 268 if (base_offset < HAMMER_XDEMARC && 269 file_limit > HAMMER_XDEMARC) { 270 file_limit = HAMMER_XDEMARC; 271 } 272 error = cluster_read(ap->a_vp, 273 file_limit, base_offset, 274 blksize, MAXPHYS, 275 seqcount, &bp); 276 } else { 277 error = bread(ap->a_vp, base_offset, blksize, &bp); 278 } 279 if (error) { 280 kprintf("error %d\n", error); 281 brelse(bp); 282 break; 283 } 284 285 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 286 n = blksize - offset; 287 if (n > uio->uio_resid) 288 n = uio->uio_resid; 289 if (n > ip->ino_data.size - uio->uio_offset) 290 n = (int)(ip->ino_data.size - uio->uio_offset); 291 error = uiomove((char *)bp->b_data + offset, n, uio); 292 293 /* data has a lower priority then meta-data */ 294 bp->b_flags |= B_AGE; 295 bqrelse(bp); 296 if (error) 297 break; 298 hammer_stats_file_read += n; 299 } 300 if ((ip->flags & HAMMER_INODE_RO) == 0 && 301 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 302 ip->ino_data.atime = trans.time; 303 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 304 } 305 hammer_done_transaction(&trans); 306 return (error); 307 } 308 309 /* 310 * hammer_vop_write { vp, uio, ioflag, cred } 311 */ 312 static 313 int 314 hammer_vop_write(struct vop_write_args *ap) 315 { 316 struct hammer_transaction trans; 317 struct hammer_inode *ip; 318 hammer_mount_t hmp; 319 struct uio *uio; 320 int offset; 321 off_t base_offset; 322 struct buf *bp; 323 int kflags; 324 int error; 325 int n; 326 int flags; 327 int delta; 328 int seqcount; 329 330 if (ap->a_vp->v_type != VREG) 331 return (EINVAL); 332 ip = VTOI(ap->a_vp); 333 hmp = ip->hmp; 334 error = 0; 335 kflags = 0; 336 seqcount = ap->a_ioflag >> 16; 337 338 if (ip->flags & HAMMER_INODE_RO) 339 return (EROFS); 340 341 /* 342 * Create a transaction to cover the operations we perform. 
343 */ 344 hammer_start_transaction(&trans, hmp); 345 uio = ap->a_uio; 346 347 /* 348 * Check append mode 349 */ 350 if (ap->a_ioflag & IO_APPEND) 351 uio->uio_offset = ip->ino_data.size; 352 353 /* 354 * Check for illegal write offsets. Valid range is 0...2^63-1. 355 * 356 * NOTE: the base_off assignment is required to work around what 357 * I consider to be a GCC-4 optimization bug. 358 */ 359 if (uio->uio_offset < 0) { 360 hammer_done_transaction(&trans); 361 return (EFBIG); 362 } 363 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 364 if (uio->uio_resid > 0 && base_offset <= 0) { 365 hammer_done_transaction(&trans); 366 return (EFBIG); 367 } 368 369 /* 370 * Access the data typically in HAMMER_BUFSIZE blocks via the 371 * buffer cache, but HAMMER may use a variable block size based 372 * on the offset. 373 */ 374 while (uio->uio_resid > 0) { 375 int fixsize = 0; 376 int blksize; 377 int blkmask; 378 379 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 380 break; 381 382 blksize = hammer_blocksize(uio->uio_offset); 383 384 /* 385 * Do not allow HAMMER to blow out the buffer cache. Very 386 * large UIOs can lockout other processes due to bwillwrite() 387 * mechanics. 388 * 389 * The hammer inode is not locked during these operations. 390 * The vnode is locked which can interfere with the pageout 391 * daemon for non-UIO_NOCOPY writes but should not interfere 392 * with the buffer cache. Even so, we cannot afford to 393 * allow the pageout daemon to build up too many dirty buffer 394 * cache buffers. 395 * 396 * Only call this if we aren't being recursively called from 397 * a virtual disk device (vn), else we may deadlock. 398 */ 399 if ((ap->a_ioflag & IO_RECURSE) == 0) 400 bwillwrite(blksize); 401 402 /* 403 * Do not allow HAMMER to blow out system memory by 404 * accumulating too many records. 
Records are so well 405 * decoupled from the buffer cache that it is possible 406 * for userland to push data out to the media via 407 * direct-write, but build up the records queued to the 408 * backend faster then the backend can flush them out. 409 * HAMMER has hit its write limit but the frontend has 410 * no pushback to slow it down. 411 */ 412 if (hmp->rsv_recs > hammer_limit_recs / 2) { 413 /* 414 * Get the inode on the flush list 415 */ 416 if (ip->rsv_recs >= 64) 417 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 418 else if (ip->rsv_recs >= 16) 419 hammer_flush_inode(ip, 0); 420 421 /* 422 * Keep the flusher going if the system keeps 423 * queueing records. 424 */ 425 delta = hmp->count_newrecords - 426 hmp->last_newrecords; 427 if (delta < 0 || delta > hammer_limit_recs / 2) { 428 hmp->last_newrecords = hmp->count_newrecords; 429 hammer_sync_hmp(hmp, MNT_NOWAIT); 430 } 431 432 /* 433 * If we have gotten behind start slowing 434 * down the writers. 435 */ 436 delta = (hmp->rsv_recs - hammer_limit_recs) * 437 hz / hammer_limit_recs; 438 if (delta > 0) 439 tsleep(&trans, 0, "hmrslo", delta); 440 } 441 442 /* 443 * Calculate the blocksize at the current offset and figure 444 * out how much we can actually write. 445 */ 446 blkmask = blksize - 1; 447 offset = (int)uio->uio_offset & blkmask; 448 base_offset = uio->uio_offset & ~(int64_t)blkmask; 449 n = blksize - offset; 450 if (n > uio->uio_resid) 451 n = uio->uio_resid; 452 if (uio->uio_offset + n > ip->ino_data.size) { 453 vnode_pager_setsize(ap->a_vp, uio->uio_offset + n); 454 fixsize = 1; 455 kflags |= NOTE_EXTEND; 456 } 457 458 if (uio->uio_segflg == UIO_NOCOPY) { 459 /* 460 * Issuing a write with the same data backing the 461 * buffer. Instantiate the buffer to collect the 462 * backing vm pages, then read-in any missing bits. 463 * 464 * This case is used by vop_stdputpages(). 
465 */ 466 bp = getblk(ap->a_vp, base_offset, 467 blksize, GETBLK_BHEAVY, 0); 468 if ((bp->b_flags & B_CACHE) == 0) { 469 bqrelse(bp); 470 error = bread(ap->a_vp, base_offset, 471 blksize, &bp); 472 } 473 } else if (offset == 0 && uio->uio_resid >= blksize) { 474 /* 475 * Even though we are entirely overwriting the buffer 476 * we may still have to zero it out to avoid a 477 * mmap/write visibility issue. 478 */ 479 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 480 if ((bp->b_flags & B_CACHE) == 0) 481 vfs_bio_clrbuf(bp); 482 } else if (base_offset >= ip->ino_data.size) { 483 /* 484 * If the base offset of the buffer is beyond the 485 * file EOF, we don't have to issue a read. 486 */ 487 bp = getblk(ap->a_vp, base_offset, 488 blksize, GETBLK_BHEAVY, 0); 489 vfs_bio_clrbuf(bp); 490 } else { 491 /* 492 * Partial overwrite, read in any missing bits then 493 * replace the portion being written. 494 */ 495 error = bread(ap->a_vp, base_offset, blksize, &bp); 496 if (error == 0) 497 bheavy(bp); 498 } 499 if (error == 0) { 500 error = uiomove((char *)bp->b_data + offset, 501 n, uio); 502 } 503 504 /* 505 * If we screwed up we have to undo any VM size changes we 506 * made. 507 */ 508 if (error) { 509 brelse(bp); 510 if (fixsize) { 511 vtruncbuf(ap->a_vp, ip->ino_data.size, 512 hammer_blocksize(ip->ino_data.size)); 513 } 514 break; 515 } 516 kflags |= NOTE_WRITE; 517 hammer_stats_file_write += n; 518 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 519 if (ip->ino_data.size < uio->uio_offset) { 520 ip->ino_data.size = uio->uio_offset; 521 flags = HAMMER_INODE_DDIRTY; 522 vnode_pager_setsize(ap->a_vp, ip->ino_data.size); 523 } else { 524 flags = 0; 525 } 526 ip->ino_data.mtime = trans.time; 527 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 528 hammer_modify_inode(ip, flags); 529 530 /* 531 * Once we dirty the buffer any cached zone-X offset 532 * becomes invalid. 
HAMMER NOTE: no-history mode cannot 533 * allow overwriting over the same data sector unless 534 * we provide UNDOs for the old data, which we don't. 535 */ 536 bp->b_bio2.bio_offset = NOOFFSET; 537 538 /* 539 * Final buffer disposition. 540 */ 541 bp->b_flags |= B_AGE; 542 if (ap->a_ioflag & IO_SYNC) { 543 bwrite(bp); 544 } else if (ap->a_ioflag & IO_DIRECT) { 545 bawrite(bp); 546 } else { 547 bdwrite(bp); 548 } 549 } 550 hammer_done_transaction(&trans); 551 hammer_knote(ap->a_vp, kflags); 552 return (error); 553 } 554 555 /* 556 * hammer_vop_access { vp, mode, cred } 557 */ 558 static 559 int 560 hammer_vop_access(struct vop_access_args *ap) 561 { 562 struct hammer_inode *ip = VTOI(ap->a_vp); 563 uid_t uid; 564 gid_t gid; 565 int error; 566 567 ++hammer_stats_file_iopsr; 568 uid = hammer_to_unix_xid(&ip->ino_data.uid); 569 gid = hammer_to_unix_xid(&ip->ino_data.gid); 570 571 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 572 ip->ino_data.uflags); 573 return (error); 574 } 575 576 /* 577 * hammer_vop_advlock { vp, id, op, fl, flags } 578 */ 579 static 580 int 581 hammer_vop_advlock(struct vop_advlock_args *ap) 582 { 583 hammer_inode_t ip = VTOI(ap->a_vp); 584 585 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 586 } 587 588 /* 589 * hammer_vop_close { vp, fflag } 590 */ 591 static 592 int 593 hammer_vop_close(struct vop_close_args *ap) 594 { 595 /*hammer_inode_t ip = VTOI(ap->a_vp);*/ 596 return (vop_stdclose(ap)); 597 } 598 599 /* 600 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 601 * 602 * The operating system has already ensured that the directory entry 603 * does not exist and done all appropriate namespace locking. 
604 */ 605 static 606 int 607 hammer_vop_ncreate(struct vop_ncreate_args *ap) 608 { 609 struct hammer_transaction trans; 610 struct hammer_inode *dip; 611 struct hammer_inode *nip; 612 struct nchandle *nch; 613 int error; 614 615 nch = ap->a_nch; 616 dip = VTOI(ap->a_dvp); 617 618 if (dip->flags & HAMMER_INODE_RO) 619 return (EROFS); 620 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 621 return (error); 622 623 /* 624 * Create a transaction to cover the operations we perform. 625 */ 626 hammer_start_transaction(&trans, dip->hmp); 627 ++hammer_stats_file_iopsw; 628 629 /* 630 * Create a new filesystem object of the requested type. The 631 * returned inode will be referenced and shared-locked to prevent 632 * it from being moved to the flusher. 633 */ 634 635 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 636 dip, NULL, &nip); 637 if (error) { 638 hkprintf("hammer_create_inode error %d\n", error); 639 hammer_done_transaction(&trans); 640 *ap->a_vpp = NULL; 641 return (error); 642 } 643 644 /* 645 * Add the new filesystem object to the directory. This will also 646 * bump the inode's link count. 647 */ 648 error = hammer_ip_add_directory(&trans, dip, 649 nch->ncp->nc_name, nch->ncp->nc_nlen, 650 nip); 651 if (error) 652 hkprintf("hammer_ip_add_directory error %d\n", error); 653 654 /* 655 * Finish up. 656 */ 657 if (error) { 658 hammer_rel_inode(nip, 0); 659 hammer_done_transaction(&trans); 660 *ap->a_vpp = NULL; 661 } else { 662 error = hammer_get_vnode(nip, ap->a_vpp); 663 hammer_done_transaction(&trans); 664 hammer_rel_inode(nip, 0); 665 if (error == 0) { 666 cache_setunresolved(ap->a_nch); 667 cache_setvp(ap->a_nch, *ap->a_vpp); 668 } 669 hammer_knote(ap->a_dvp, NOTE_WRITE); 670 } 671 return (error); 672 } 673 674 /* 675 * hammer_vop_getattr { vp, vap } 676 * 677 * Retrieve an inode's attribute information. When accessing inodes 678 * historically we fake the atime field to ensure consistent results. 
679 * The atime field is stored in the B-Tree element and allowed to be 680 * updated without cycling the element. 681 */ 682 static 683 int 684 hammer_vop_getattr(struct vop_getattr_args *ap) 685 { 686 struct hammer_inode *ip = VTOI(ap->a_vp); 687 struct vattr *vap = ap->a_vap; 688 689 /* 690 * We want the fsid to be different when accessing a filesystem 691 * with different as-of's so programs like diff don't think 692 * the files are the same. 693 * 694 * We also want the fsid to be the same when comparing snapshots, 695 * or when comparing mirrors (which might be backed by different 696 * physical devices). HAMMER fsids are based on the PFS's 697 * shared_uuid field. 698 * 699 * XXX there is a chance of collision here. The va_fsid reported 700 * by stat is different from the more involved fsid used in the 701 * mount structure. 702 */ 703 ++hammer_stats_file_iopsr; 704 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 705 (u_int32_t)(ip->obj_asof >> 32); 706 707 vap->va_fileid = ip->ino_leaf.base.obj_id; 708 vap->va_mode = ip->ino_data.mode; 709 vap->va_nlink = ip->ino_data.nlinks; 710 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 711 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 712 vap->va_rmajor = 0; 713 vap->va_rminor = 0; 714 vap->va_size = ip->ino_data.size; 715 716 /* 717 * Special case for @@PFS softlinks. The actual size of the 718 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 719 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 720 */ 721 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 722 ip->ino_data.size == 10 && 723 ip->obj_asof == HAMMER_MAX_TID && 724 ip->obj_localization == 0 && 725 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 726 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 727 vap->va_size = 26; 728 else 729 vap->va_size = 10; 730 } 731 732 /* 733 * We must provide a consistent atime and mtime for snapshots 734 * so people can do a 'tar cf - ... 
| md5' on them and get 735 * consistent results. 736 */ 737 if (ip->flags & HAMMER_INODE_RO) { 738 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 739 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 740 } else { 741 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 742 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 743 } 744 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 745 vap->va_flags = ip->ino_data.uflags; 746 vap->va_gen = 1; /* hammer inums are unique for all time */ 747 vap->va_blocksize = HAMMER_BUFSIZE; 748 if (ip->ino_data.size >= HAMMER_XDEMARC) { 749 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 750 ~HAMMER_XBUFMASK64; 751 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 752 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 753 ~HAMMER_BUFMASK64; 754 } else { 755 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 756 } 757 758 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 759 vap->va_filerev = 0; /* XXX */ 760 /* mtime uniquely identifies any adjustments made to the file XXX */ 761 vap->va_fsmid = ip->ino_data.mtime; 762 vap->va_uid_uuid = ip->ino_data.uid; 763 vap->va_gid_uuid = ip->ino_data.gid; 764 vap->va_fsid_uuid = ip->hmp->fsid; 765 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 766 VA_FSID_UUID_VALID; 767 768 switch (ip->ino_data.obj_type) { 769 case HAMMER_OBJTYPE_CDEV: 770 case HAMMER_OBJTYPE_BDEV: 771 vap->va_rmajor = ip->ino_data.rmajor; 772 vap->va_rminor = ip->ino_data.rminor; 773 break; 774 default: 775 break; 776 } 777 return(0); 778 } 779 780 /* 781 * hammer_vop_nresolve { nch, dvp, cred } 782 * 783 * Locate the requested directory entry. 
784 */ 785 static 786 int 787 hammer_vop_nresolve(struct vop_nresolve_args *ap) 788 { 789 struct hammer_transaction trans; 790 struct namecache *ncp; 791 hammer_inode_t dip; 792 hammer_inode_t ip; 793 hammer_tid_t asof; 794 struct hammer_cursor cursor; 795 struct vnode *vp; 796 int64_t namekey; 797 int error; 798 int i; 799 int nlen; 800 int flags; 801 int ispfs; 802 int64_t obj_id; 803 u_int32_t localization; 804 u_int32_t max_iterations; 805 806 /* 807 * Misc initialization, plus handle as-of name extensions. Look for 808 * the '@@' extension. Note that as-of files and directories cannot 809 * be modified. 810 */ 811 dip = VTOI(ap->a_dvp); 812 ncp = ap->a_nch->ncp; 813 asof = dip->obj_asof; 814 localization = dip->obj_localization; /* for code consistency */ 815 nlen = ncp->nc_nlen; 816 flags = dip->flags & HAMMER_INODE_RO; 817 ispfs = 0; 818 819 hammer_simple_transaction(&trans, dip->hmp); 820 ++hammer_stats_file_iopsr; 821 822 for (i = 0; i < nlen; ++i) { 823 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 824 error = hammer_str_to_tid(ncp->nc_name + i + 2, 825 &ispfs, &asof, &localization); 826 if (error != 0) { 827 i = nlen; 828 break; 829 } 830 if (asof != HAMMER_MAX_TID) 831 flags |= HAMMER_INODE_RO; 832 break; 833 } 834 } 835 nlen = i; 836 837 /* 838 * If this is a PFS softlink we dive into the PFS 839 */ 840 if (ispfs && nlen == 0) { 841 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 842 asof, localization, 843 flags, &error); 844 if (error == 0) { 845 error = hammer_get_vnode(ip, &vp); 846 hammer_rel_inode(ip, 0); 847 } else { 848 vp = NULL; 849 } 850 if (error == 0) { 851 vn_unlock(vp); 852 cache_setvp(ap->a_nch, vp); 853 vrele(vp); 854 } 855 goto done; 856 } 857 858 /* 859 * If there is no path component the time extension is relative to dip. 860 * e.g. "fubar/@@<snapshot>" 861 * 862 * "." is handled by the kernel, but ".@@<snapshot>" is not. 863 * e.g. "fubar/.@@<snapshot>" 864 * 865 * ".." is handled by the kernel. 
We do not currently handle 866 * "..@<snapshot>". 867 */ 868 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 869 ip = hammer_get_inode(&trans, dip, dip->obj_id, 870 asof, dip->obj_localization, 871 flags, &error); 872 if (error == 0) { 873 error = hammer_get_vnode(ip, &vp); 874 hammer_rel_inode(ip, 0); 875 } else { 876 vp = NULL; 877 } 878 if (error == 0) { 879 vn_unlock(vp); 880 cache_setvp(ap->a_nch, vp); 881 vrele(vp); 882 } 883 goto done; 884 } 885 886 /* 887 * Calculate the namekey and setup the key range for the scan. This 888 * works kinda like a chained hash table where the lower 32 bits 889 * of the namekey synthesize the chain. 890 * 891 * The key range is inclusive of both key_beg and key_end. 892 */ 893 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 894 &max_iterations); 895 896 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 897 cursor.key_beg.localization = dip->obj_localization + 898 HAMMER_LOCALIZE_MISC; 899 cursor.key_beg.obj_id = dip->obj_id; 900 cursor.key_beg.key = namekey; 901 cursor.key_beg.create_tid = 0; 902 cursor.key_beg.delete_tid = 0; 903 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 904 cursor.key_beg.obj_type = 0; 905 906 cursor.key_end = cursor.key_beg; 907 cursor.key_end.key += max_iterations; 908 cursor.asof = asof; 909 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 910 911 /* 912 * Scan all matching records (the chain), locate the one matching 913 * the requested path component. 914 * 915 * The hammer_ip_*() functions merge in-memory records with on-disk 916 * records for the purposes of the search. 
917 */ 918 obj_id = 0; 919 localization = HAMMER_DEF_LOCALIZATION; 920 921 if (error == 0) { 922 error = hammer_ip_first(&cursor); 923 while (error == 0) { 924 error = hammer_ip_resolve_data(&cursor); 925 if (error) 926 break; 927 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 928 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 929 obj_id = cursor.data->entry.obj_id; 930 localization = cursor.data->entry.localization; 931 break; 932 } 933 error = hammer_ip_next(&cursor); 934 } 935 } 936 hammer_done_cursor(&cursor); 937 938 /* 939 * Lookup the obj_id. This should always succeed. If it does not 940 * the filesystem may be damaged and we return a dummy inode. 941 */ 942 if (error == 0) { 943 ip = hammer_get_inode(&trans, dip, obj_id, 944 asof, localization, 945 flags, &error); 946 if (error == ENOENT) { 947 kprintf("HAMMER: WARNING: Missing " 948 "inode for dirent \"%s\"\n" 949 "\tobj_id = %016llx\n", 950 ncp->nc_name, (long long)obj_id); 951 error = 0; 952 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 953 asof, localization, 954 flags, &error); 955 } 956 if (error == 0) { 957 error = hammer_get_vnode(ip, &vp); 958 hammer_rel_inode(ip, 0); 959 } else { 960 vp = NULL; 961 } 962 if (error == 0) { 963 vn_unlock(vp); 964 cache_setvp(ap->a_nch, vp); 965 vrele(vp); 966 } 967 } else if (error == ENOENT) { 968 cache_setvp(ap->a_nch, NULL); 969 } 970 done: 971 hammer_done_transaction(&trans); 972 return (error); 973 } 974 975 /* 976 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 977 * 978 * Locate the parent directory of a directory vnode. 979 * 980 * dvp is referenced but not locked. *vpp must be returned referenced and 981 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 982 * at the root, instead it could indicate that the directory we were in was 983 * removed. 984 * 985 * NOTE: as-of sequences are not linked into the directory structure. 
If 986 * we are at the root with a different asof then the mount point, reload 987 * the same directory with the mount point's asof. I'm not sure what this 988 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 989 * get confused, but it hasn't been tested. 990 */ 991 static 992 int 993 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 994 { 995 struct hammer_transaction trans; 996 struct hammer_inode *dip; 997 struct hammer_inode *ip; 998 int64_t parent_obj_id; 999 u_int32_t parent_obj_localization; 1000 hammer_tid_t asof; 1001 int error; 1002 1003 dip = VTOI(ap->a_dvp); 1004 asof = dip->obj_asof; 1005 1006 /* 1007 * Whos are parent? This could be the root of a pseudo-filesystem 1008 * whos parent is in another localization domain. 1009 */ 1010 parent_obj_id = dip->ino_data.parent_obj_id; 1011 if (dip->obj_id == HAMMER_OBJID_ROOT) 1012 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1013 else 1014 parent_obj_localization = dip->obj_localization; 1015 1016 if (parent_obj_id == 0) { 1017 if (dip->obj_id == HAMMER_OBJID_ROOT && 1018 asof != dip->hmp->asof) { 1019 parent_obj_id = dip->obj_id; 1020 asof = dip->hmp->asof; 1021 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1022 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1023 dip->obj_asof); 1024 } else { 1025 *ap->a_vpp = NULL; 1026 return ENOENT; 1027 } 1028 } 1029 1030 hammer_simple_transaction(&trans, dip->hmp); 1031 ++hammer_stats_file_iopsr; 1032 1033 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1034 asof, parent_obj_localization, 1035 dip->flags, &error); 1036 if (ip) { 1037 error = hammer_get_vnode(ip, ap->a_vpp); 1038 hammer_rel_inode(ip, 0); 1039 } else { 1040 *ap->a_vpp = NULL; 1041 } 1042 hammer_done_transaction(&trans); 1043 return (error); 1044 } 1045 1046 /* 1047 * hammer_vop_nlink { nch, dvp, vp, cred } 1048 */ 1049 static 1050 int 1051 hammer_vop_nlink(struct vop_nlink_args *ap) 1052 { 1053 struct hammer_transaction trans; 1054 struct 
hammer_inode *dip; 1055 struct hammer_inode *ip; 1056 struct nchandle *nch; 1057 int error; 1058 1059 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1060 return(EXDEV); 1061 1062 nch = ap->a_nch; 1063 dip = VTOI(ap->a_dvp); 1064 ip = VTOI(ap->a_vp); 1065 1066 if (dip->obj_localization != ip->obj_localization) 1067 return(EXDEV); 1068 1069 if (dip->flags & HAMMER_INODE_RO) 1070 return (EROFS); 1071 if (ip->flags & HAMMER_INODE_RO) 1072 return (EROFS); 1073 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1074 return (error); 1075 1076 /* 1077 * Create a transaction to cover the operations we perform. 1078 */ 1079 hammer_start_transaction(&trans, dip->hmp); 1080 ++hammer_stats_file_iopsw; 1081 1082 /* 1083 * Add the filesystem object to the directory. Note that neither 1084 * dip nor ip are referenced or locked, but their vnodes are 1085 * referenced. This function will bump the inode's link count. 1086 */ 1087 error = hammer_ip_add_directory(&trans, dip, 1088 nch->ncp->nc_name, nch->ncp->nc_nlen, 1089 ip); 1090 1091 /* 1092 * Finish up. 1093 */ 1094 if (error == 0) { 1095 cache_setunresolved(nch); 1096 cache_setvp(nch, ap->a_vp); 1097 } 1098 hammer_done_transaction(&trans); 1099 hammer_knote(ap->a_vp, NOTE_LINK); 1100 hammer_knote(ap->a_dvp, NOTE_WRITE); 1101 return (error); 1102 } 1103 1104 /* 1105 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1106 * 1107 * The operating system has already ensured that the directory entry 1108 * does not exist and done all appropriate namespace locking. 
1109 */ 1110 static 1111 int 1112 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1113 { 1114 struct hammer_transaction trans; 1115 struct hammer_inode *dip; 1116 struct hammer_inode *nip; 1117 struct nchandle *nch; 1118 int error; 1119 1120 nch = ap->a_nch; 1121 dip = VTOI(ap->a_dvp); 1122 1123 if (dip->flags & HAMMER_INODE_RO) 1124 return (EROFS); 1125 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1126 return (error); 1127 1128 /* 1129 * Create a transaction to cover the operations we perform. 1130 */ 1131 hammer_start_transaction(&trans, dip->hmp); 1132 ++hammer_stats_file_iopsw; 1133 1134 /* 1135 * Create a new filesystem object of the requested type. The 1136 * returned inode will be referenced but not locked. 1137 */ 1138 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1139 dip, NULL, &nip); 1140 if (error) { 1141 hkprintf("hammer_mkdir error %d\n", error); 1142 hammer_done_transaction(&trans); 1143 *ap->a_vpp = NULL; 1144 return (error); 1145 } 1146 /* 1147 * Add the new filesystem object to the directory. This will also 1148 * bump the inode's link count. 1149 */ 1150 error = hammer_ip_add_directory(&trans, dip, 1151 nch->ncp->nc_name, nch->ncp->nc_nlen, 1152 nip); 1153 if (error) 1154 hkprintf("hammer_mkdir (add) error %d\n", error); 1155 1156 /* 1157 * Finish up. 1158 */ 1159 if (error) { 1160 hammer_rel_inode(nip, 0); 1161 *ap->a_vpp = NULL; 1162 } else { 1163 error = hammer_get_vnode(nip, ap->a_vpp); 1164 hammer_rel_inode(nip, 0); 1165 if (error == 0) { 1166 cache_setunresolved(ap->a_nch); 1167 cache_setvp(ap->a_nch, *ap->a_vpp); 1168 } 1169 } 1170 hammer_done_transaction(&trans); 1171 if (error == 0) 1172 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1173 return (error); 1174 } 1175 1176 /* 1177 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1178 * 1179 * The operating system has already ensured that the directory entry 1180 * does not exist and done all appropriate namespace locking. 
1181 */ 1182 static 1183 int 1184 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1185 { 1186 struct hammer_transaction trans; 1187 struct hammer_inode *dip; 1188 struct hammer_inode *nip; 1189 struct nchandle *nch; 1190 int error; 1191 1192 nch = ap->a_nch; 1193 dip = VTOI(ap->a_dvp); 1194 1195 if (dip->flags & HAMMER_INODE_RO) 1196 return (EROFS); 1197 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1198 return (error); 1199 1200 /* 1201 * Create a transaction to cover the operations we perform. 1202 */ 1203 hammer_start_transaction(&trans, dip->hmp); 1204 ++hammer_stats_file_iopsw; 1205 1206 /* 1207 * Create a new filesystem object of the requested type. The 1208 * returned inode will be referenced but not locked. 1209 * 1210 * If mknod specifies a directory a pseudo-fs is created. 1211 */ 1212 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1213 dip, NULL, &nip); 1214 if (error) { 1215 hammer_done_transaction(&trans); 1216 *ap->a_vpp = NULL; 1217 return (error); 1218 } 1219 1220 /* 1221 * Add the new filesystem object to the directory. This will also 1222 * bump the inode's link count. 1223 */ 1224 error = hammer_ip_add_directory(&trans, dip, 1225 nch->ncp->nc_name, nch->ncp->nc_nlen, 1226 nip); 1227 1228 /* 1229 * Finish up. 
1230 */ 1231 if (error) { 1232 hammer_rel_inode(nip, 0); 1233 *ap->a_vpp = NULL; 1234 } else { 1235 error = hammer_get_vnode(nip, ap->a_vpp); 1236 hammer_rel_inode(nip, 0); 1237 if (error == 0) { 1238 cache_setunresolved(ap->a_nch); 1239 cache_setvp(ap->a_nch, *ap->a_vpp); 1240 } 1241 } 1242 hammer_done_transaction(&trans); 1243 if (error == 0) 1244 hammer_knote(ap->a_dvp, NOTE_WRITE); 1245 return (error); 1246 } 1247 1248 /* 1249 * hammer_vop_open { vp, mode, cred, fp } 1250 */ 1251 static 1252 int 1253 hammer_vop_open(struct vop_open_args *ap) 1254 { 1255 hammer_inode_t ip; 1256 1257 ++hammer_stats_file_iopsr; 1258 ip = VTOI(ap->a_vp); 1259 1260 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1261 return (EROFS); 1262 return(vop_stdopen(ap)); 1263 } 1264 1265 /* 1266 * hammer_vop_print { vp } 1267 */ 1268 static 1269 int 1270 hammer_vop_print(struct vop_print_args *ap) 1271 { 1272 return EOPNOTSUPP; 1273 } 1274 1275 /* 1276 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1277 */ 1278 static 1279 int 1280 hammer_vop_readdir(struct vop_readdir_args *ap) 1281 { 1282 struct hammer_transaction trans; 1283 struct hammer_cursor cursor; 1284 struct hammer_inode *ip; 1285 struct uio *uio; 1286 hammer_base_elm_t base; 1287 int error; 1288 int cookie_index; 1289 int ncookies; 1290 off_t *cookies; 1291 off_t saveoff; 1292 int r; 1293 int dtype; 1294 1295 ++hammer_stats_file_iopsr; 1296 ip = VTOI(ap->a_vp); 1297 uio = ap->a_uio; 1298 saveoff = uio->uio_offset; 1299 1300 if (ap->a_ncookies) { 1301 ncookies = uio->uio_resid / 16 + 1; 1302 if (ncookies > 1024) 1303 ncookies = 1024; 1304 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1305 cookie_index = 0; 1306 } else { 1307 ncookies = -1; 1308 cookies = NULL; 1309 cookie_index = 0; 1310 } 1311 1312 hammer_simple_transaction(&trans, ip->hmp); 1313 1314 /* 1315 * Handle artificial entries 1316 * 1317 * It should be noted that the minimum value for a directory 1318 * hash key 
on-media is 0x0000000100000000, so we can use anything 1319 * less then that to represent our 'special' key space. 1320 */ 1321 error = 0; 1322 if (saveoff == 0) { 1323 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1324 if (r) 1325 goto done; 1326 if (cookies) 1327 cookies[cookie_index] = saveoff; 1328 ++saveoff; 1329 ++cookie_index; 1330 if (cookie_index == ncookies) 1331 goto done; 1332 } 1333 if (saveoff == 1) { 1334 if (ip->ino_data.parent_obj_id) { 1335 r = vop_write_dirent(&error, uio, 1336 ip->ino_data.parent_obj_id, 1337 DT_DIR, 2, ".."); 1338 } else { 1339 r = vop_write_dirent(&error, uio, 1340 ip->obj_id, DT_DIR, 2, ".."); 1341 } 1342 if (r) 1343 goto done; 1344 if (cookies) 1345 cookies[cookie_index] = saveoff; 1346 ++saveoff; 1347 ++cookie_index; 1348 if (cookie_index == ncookies) 1349 goto done; 1350 } 1351 1352 /* 1353 * Key range (begin and end inclusive) to scan. Directory keys 1354 * directly translate to a 64 bit 'seek' position. 1355 */ 1356 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1357 cursor.key_beg.localization = ip->obj_localization + 1358 HAMMER_LOCALIZE_MISC; 1359 cursor.key_beg.obj_id = ip->obj_id; 1360 cursor.key_beg.create_tid = 0; 1361 cursor.key_beg.delete_tid = 0; 1362 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1363 cursor.key_beg.obj_type = 0; 1364 cursor.key_beg.key = saveoff; 1365 1366 cursor.key_end = cursor.key_beg; 1367 cursor.key_end.key = HAMMER_MAX_KEY; 1368 cursor.asof = ip->obj_asof; 1369 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1370 1371 error = hammer_ip_first(&cursor); 1372 1373 while (error == 0) { 1374 error = hammer_ip_resolve_data(&cursor); 1375 if (error) 1376 break; 1377 base = &cursor.leaf->base; 1378 saveoff = base->key; 1379 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1380 1381 if (base->obj_id != ip->obj_id) 1382 panic("readdir: bad record at %p", cursor.node); 1383 1384 /* 1385 * Convert pseudo-filesystems into softlinks 1386 */ 1387 
dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1388 r = vop_write_dirent( 1389 &error, uio, cursor.data->entry.obj_id, 1390 dtype, 1391 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1392 (void *)cursor.data->entry.name); 1393 if (r) 1394 break; 1395 ++saveoff; 1396 if (cookies) 1397 cookies[cookie_index] = base->key; 1398 ++cookie_index; 1399 if (cookie_index == ncookies) 1400 break; 1401 error = hammer_ip_next(&cursor); 1402 } 1403 hammer_done_cursor(&cursor); 1404 1405 done: 1406 hammer_done_transaction(&trans); 1407 1408 if (ap->a_eofflag) 1409 *ap->a_eofflag = (error == ENOENT); 1410 uio->uio_offset = saveoff; 1411 if (error && cookie_index == 0) { 1412 if (error == ENOENT) 1413 error = 0; 1414 if (cookies) { 1415 kfree(cookies, M_TEMP); 1416 *ap->a_ncookies = 0; 1417 *ap->a_cookies = NULL; 1418 } 1419 } else { 1420 if (error == ENOENT) 1421 error = 0; 1422 if (cookies) { 1423 *ap->a_ncookies = cookie_index; 1424 *ap->a_cookies = cookies; 1425 } 1426 } 1427 return(error); 1428 } 1429 1430 /* 1431 * hammer_vop_readlink { vp, uio, cred } 1432 */ 1433 static 1434 int 1435 hammer_vop_readlink(struct vop_readlink_args *ap) 1436 { 1437 struct hammer_transaction trans; 1438 struct hammer_cursor cursor; 1439 struct hammer_inode *ip; 1440 char buf[32]; 1441 u_int32_t localization; 1442 hammer_pseudofs_inmem_t pfsm; 1443 int error; 1444 1445 ip = VTOI(ap->a_vp); 1446 1447 /* 1448 * Shortcut if the symlink data was stuffed into ino_data. 1449 * 1450 * Also expand special "@@PFS%05d" softlinks (expansion only 1451 * occurs for non-historical (current) accesses made from the 1452 * primary filesystem). 
1453 */ 1454 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1455 char *ptr; 1456 int bytes; 1457 1458 ptr = ip->ino_data.ext.symlink; 1459 bytes = (int)ip->ino_data.size; 1460 if (bytes == 10 && 1461 ip->obj_asof == HAMMER_MAX_TID && 1462 ip->obj_localization == 0 && 1463 strncmp(ptr, "@@PFS", 5) == 0) { 1464 hammer_simple_transaction(&trans, ip->hmp); 1465 bcopy(ptr + 5, buf, 5); 1466 buf[5] = 0; 1467 localization = strtoul(buf, NULL, 10) << 16; 1468 pfsm = hammer_load_pseudofs(&trans, localization, 1469 &error); 1470 if (error == 0) { 1471 if (pfsm->pfsd.mirror_flags & 1472 HAMMER_PFSD_SLAVE) { 1473 /* vap->va_size == 26 */ 1474 ksnprintf(buf, sizeof(buf), 1475 "@@0x%016llx:%05d", 1476 pfsm->pfsd.sync_end_tid, 1477 localization >> 16); 1478 } else { 1479 /* vap->va_size == 10 */ 1480 ksnprintf(buf, sizeof(buf), 1481 "@@-1:%05d", 1482 localization >> 16); 1483 #if 0 1484 ksnprintf(buf, sizeof(buf), 1485 "@@0x%016llx:%05d", 1486 HAMMER_MAX_TID, 1487 localization >> 16); 1488 #endif 1489 } 1490 ptr = buf; 1491 bytes = strlen(buf); 1492 } 1493 if (pfsm) 1494 hammer_rel_pseudofs(trans.hmp, pfsm); 1495 hammer_done_transaction(&trans); 1496 } 1497 error = uiomove(ptr, bytes, ap->a_uio); 1498 return(error); 1499 } 1500 1501 /* 1502 * Long version 1503 */ 1504 hammer_simple_transaction(&trans, ip->hmp); 1505 ++hammer_stats_file_iopsr; 1506 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1507 1508 /* 1509 * Key range (begin and end inclusive) to scan. Directory keys 1510 * directly translate to a 64 bit 'seek' position. 
1511 */ 1512 cursor.key_beg.localization = ip->obj_localization + 1513 HAMMER_LOCALIZE_MISC; 1514 cursor.key_beg.obj_id = ip->obj_id; 1515 cursor.key_beg.create_tid = 0; 1516 cursor.key_beg.delete_tid = 0; 1517 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1518 cursor.key_beg.obj_type = 0; 1519 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1520 cursor.asof = ip->obj_asof; 1521 cursor.flags |= HAMMER_CURSOR_ASOF; 1522 1523 error = hammer_ip_lookup(&cursor); 1524 if (error == 0) { 1525 error = hammer_ip_resolve_data(&cursor); 1526 if (error == 0) { 1527 KKASSERT(cursor.leaf->data_len >= 1528 HAMMER_SYMLINK_NAME_OFF); 1529 error = uiomove(cursor.data->symlink.name, 1530 cursor.leaf->data_len - 1531 HAMMER_SYMLINK_NAME_OFF, 1532 ap->a_uio); 1533 } 1534 } 1535 hammer_done_cursor(&cursor); 1536 hammer_done_transaction(&trans); 1537 return(error); 1538 } 1539 1540 /* 1541 * hammer_vop_nremove { nch, dvp, cred } 1542 */ 1543 static 1544 int 1545 hammer_vop_nremove(struct vop_nremove_args *ap) 1546 { 1547 struct hammer_transaction trans; 1548 struct hammer_inode *dip; 1549 int error; 1550 1551 dip = VTOI(ap->a_dvp); 1552 1553 if (hammer_nohistory(dip) == 0 && 1554 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1555 return (error); 1556 } 1557 1558 hammer_start_transaction(&trans, dip->hmp); 1559 ++hammer_stats_file_iopsw; 1560 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1561 hammer_done_transaction(&trans); 1562 if (error == 0) 1563 hammer_knote(ap->a_dvp, NOTE_WRITE); 1564 return (error); 1565 } 1566 1567 /* 1568 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1569 */ 1570 static 1571 int 1572 hammer_vop_nrename(struct vop_nrename_args *ap) 1573 { 1574 struct hammer_transaction trans; 1575 struct namecache *fncp; 1576 struct namecache *tncp; 1577 struct hammer_inode *fdip; 1578 struct hammer_inode *tdip; 1579 struct hammer_inode *ip; 1580 struct hammer_cursor cursor; 1581 int64_t namekey; 1582 u_int32_t max_iterations; 
1583 int nlen, error; 1584 1585 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1586 return(EXDEV); 1587 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1588 return(EXDEV); 1589 1590 fdip = VTOI(ap->a_fdvp); 1591 tdip = VTOI(ap->a_tdvp); 1592 fncp = ap->a_fnch->ncp; 1593 tncp = ap->a_tnch->ncp; 1594 ip = VTOI(fncp->nc_vp); 1595 KKASSERT(ip != NULL); 1596 1597 if (fdip->obj_localization != tdip->obj_localization) 1598 return(EXDEV); 1599 if (fdip->obj_localization != ip->obj_localization) 1600 return(EXDEV); 1601 1602 if (fdip->flags & HAMMER_INODE_RO) 1603 return (EROFS); 1604 if (tdip->flags & HAMMER_INODE_RO) 1605 return (EROFS); 1606 if (ip->flags & HAMMER_INODE_RO) 1607 return (EROFS); 1608 if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1609 return (error); 1610 1611 hammer_start_transaction(&trans, fdip->hmp); 1612 ++hammer_stats_file_iopsw; 1613 1614 /* 1615 * Remove tncp from the target directory and then link ip as 1616 * tncp. XXX pass trans to dounlink 1617 * 1618 * Force the inode sync-time to match the transaction so it is 1619 * in-sync with the creation of the target directory entry. 1620 */ 1621 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1622 ap->a_cred, 0, -1); 1623 if (error == 0 || error == ENOENT) { 1624 error = hammer_ip_add_directory(&trans, tdip, 1625 tncp->nc_name, tncp->nc_nlen, 1626 ip); 1627 if (error == 0) { 1628 ip->ino_data.parent_obj_id = tdip->obj_id; 1629 ip->ino_data.ctime = trans.time; 1630 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); 1631 } 1632 } 1633 if (error) 1634 goto failed; /* XXX */ 1635 1636 /* 1637 * Locate the record in the originating directory and remove it. 1638 * 1639 * Calculate the namekey and setup the key range for the scan. This 1640 * works kinda like a chained hash table where the lower 32 bits 1641 * of the namekey synthesize the chain. 1642 * 1643 * The key range is inclusive of both key_beg and key_end. 
1644 */ 1645 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1646 &max_iterations); 1647 retry: 1648 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1649 cursor.key_beg.localization = fdip->obj_localization + 1650 HAMMER_LOCALIZE_MISC; 1651 cursor.key_beg.obj_id = fdip->obj_id; 1652 cursor.key_beg.key = namekey; 1653 cursor.key_beg.create_tid = 0; 1654 cursor.key_beg.delete_tid = 0; 1655 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1656 cursor.key_beg.obj_type = 0; 1657 1658 cursor.key_end = cursor.key_beg; 1659 cursor.key_end.key += max_iterations; 1660 cursor.asof = fdip->obj_asof; 1661 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1662 1663 /* 1664 * Scan all matching records (the chain), locate the one matching 1665 * the requested path component. 1666 * 1667 * The hammer_ip_*() functions merge in-memory records with on-disk 1668 * records for the purposes of the search. 1669 */ 1670 error = hammer_ip_first(&cursor); 1671 while (error == 0) { 1672 if (hammer_ip_resolve_data(&cursor) != 0) 1673 break; 1674 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 1675 KKASSERT(nlen > 0); 1676 if (fncp->nc_nlen == nlen && 1677 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1678 break; 1679 } 1680 error = hammer_ip_next(&cursor); 1681 } 1682 1683 /* 1684 * If all is ok we have to get the inode so we can adjust nlinks. 1685 * 1686 * WARNING: hammer_ip_del_directory() may have to terminate the 1687 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 1688 * twice. 1689 */ 1690 if (error == 0) 1691 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 1692 1693 /* 1694 * XXX A deadlock here will break rename's atomicy for the purposes 1695 * of crash recovery. 1696 */ 1697 if (error == EDEADLK) { 1698 hammer_done_cursor(&cursor); 1699 goto retry; 1700 } 1701 1702 /* 1703 * Cleanup and tell the kernel that the rename succeeded. 
1704 */ 1705 hammer_done_cursor(&cursor); 1706 if (error == 0) { 1707 cache_rename(ap->a_fnch, ap->a_tnch); 1708 hammer_knote(ap->a_fdvp, NOTE_WRITE); 1709 hammer_knote(ap->a_tdvp, NOTE_WRITE); 1710 if (ip->vp) 1711 hammer_knote(ip->vp, NOTE_RENAME); 1712 } 1713 1714 failed: 1715 hammer_done_transaction(&trans); 1716 return (error); 1717 } 1718 1719 /* 1720 * hammer_vop_nrmdir { nch, dvp, cred } 1721 */ 1722 static 1723 int 1724 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 1725 { 1726 struct hammer_transaction trans; 1727 struct hammer_inode *dip; 1728 int error; 1729 1730 dip = VTOI(ap->a_dvp); 1731 1732 if (hammer_nohistory(dip) == 0 && 1733 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1734 return (error); 1735 } 1736 1737 hammer_start_transaction(&trans, dip->hmp); 1738 ++hammer_stats_file_iopsw; 1739 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 1740 hammer_done_transaction(&trans); 1741 if (error == 0) 1742 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1743 return (error); 1744 } 1745 1746 /* 1747 * hammer_vop_markatime { vp, cred } 1748 */ 1749 static 1750 int 1751 hammer_vop_markatime(struct vop_markatime_args *ap) 1752 { 1753 struct hammer_transaction trans; 1754 struct hammer_inode *ip; 1755 1756 ip = VTOI(ap->a_vp); 1757 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1758 return (EROFS); 1759 if (ip->flags & HAMMER_INODE_RO) 1760 return (EROFS); 1761 if (ip->hmp->mp->mnt_flag & MNT_NOATIME) 1762 return (0); 1763 hammer_start_transaction(&trans, ip->hmp); 1764 ++hammer_stats_file_iopsw; 1765 1766 ip->ino_data.atime = trans.time; 1767 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 1768 hammer_done_transaction(&trans); 1769 hammer_knote(ap->a_vp, NOTE_ATTRIB); 1770 return (0); 1771 } 1772 1773 /* 1774 * hammer_vop_setattr { vp, vap, cred } 1775 */ 1776 static 1777 int 1778 hammer_vop_setattr(struct vop_setattr_args *ap) 1779 { 1780 struct hammer_transaction trans; 1781 struct vattr *vap; 1782 struct 
hammer_inode *ip; 1783 int modflags; 1784 int error; 1785 int truncating; 1786 int blksize; 1787 int kflags; 1788 int64_t aligned_size; 1789 u_int32_t flags; 1790 1791 vap = ap->a_vap; 1792 ip = ap->a_vp->v_data; 1793 modflags = 0; 1794 kflags = 0; 1795 1796 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1797 return(EROFS); 1798 if (ip->flags & HAMMER_INODE_RO) 1799 return (EROFS); 1800 if (hammer_nohistory(ip) == 0 && 1801 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1802 return (error); 1803 } 1804 1805 hammer_start_transaction(&trans, ip->hmp); 1806 ++hammer_stats_file_iopsw; 1807 error = 0; 1808 1809 if (vap->va_flags != VNOVAL) { 1810 flags = ip->ino_data.uflags; 1811 error = vop_helper_setattr_flags(&flags, vap->va_flags, 1812 hammer_to_unix_xid(&ip->ino_data.uid), 1813 ap->a_cred); 1814 if (error == 0) { 1815 if (ip->ino_data.uflags != flags) { 1816 ip->ino_data.uflags = flags; 1817 ip->ino_data.ctime = trans.time; 1818 modflags |= HAMMER_INODE_DDIRTY; 1819 kflags |= NOTE_ATTRIB; 1820 } 1821 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1822 error = 0; 1823 goto done; 1824 } 1825 } 1826 goto done; 1827 } 1828 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1829 error = EPERM; 1830 goto done; 1831 } 1832 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 1833 mode_t cur_mode = ip->ino_data.mode; 1834 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1835 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1836 uuid_t uuid_uid; 1837 uuid_t uuid_gid; 1838 1839 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 1840 ap->a_cred, 1841 &cur_uid, &cur_gid, &cur_mode); 1842 if (error == 0) { 1843 hammer_guid_to_uuid(&uuid_uid, cur_uid); 1844 hammer_guid_to_uuid(&uuid_gid, cur_gid); 1845 if (bcmp(&uuid_uid, &ip->ino_data.uid, 1846 sizeof(uuid_uid)) || 1847 bcmp(&uuid_gid, &ip->ino_data.gid, 1848 sizeof(uuid_gid)) || 1849 ip->ino_data.mode != cur_mode 1850 ) { 1851 ip->ino_data.uid = uuid_uid; 1852 
ip->ino_data.gid = uuid_gid; 1853 ip->ino_data.mode = cur_mode; 1854 ip->ino_data.ctime = trans.time; 1855 modflags |= HAMMER_INODE_DDIRTY; 1856 } 1857 kflags |= NOTE_ATTRIB; 1858 } 1859 } 1860 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 1861 switch(ap->a_vp->v_type) { 1862 case VREG: 1863 if (vap->va_size == ip->ino_data.size) 1864 break; 1865 /* 1866 * XXX break atomicy, we can deadlock the backend 1867 * if we do not release the lock. Probably not a 1868 * big deal here. 1869 */ 1870 blksize = hammer_blocksize(vap->va_size); 1871 if (vap->va_size < ip->ino_data.size) { 1872 vtruncbuf(ap->a_vp, vap->va_size, blksize); 1873 truncating = 1; 1874 kflags |= NOTE_WRITE; 1875 } else { 1876 vnode_pager_setsize(ap->a_vp, vap->va_size); 1877 truncating = 0; 1878 kflags |= NOTE_WRITE | NOTE_EXTEND; 1879 } 1880 ip->ino_data.size = vap->va_size; 1881 ip->ino_data.mtime = trans.time; 1882 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 1883 1884 /* 1885 * on-media truncation is cached in the inode until 1886 * the inode is synchronized. 1887 */ 1888 if (truncating) { 1889 hammer_ip_frontend_trunc(ip, vap->va_size); 1890 #ifdef DEBUG_TRUNCATE 1891 if (HammerTruncIp == NULL) 1892 HammerTruncIp = ip; 1893 #endif 1894 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1895 ip->flags |= HAMMER_INODE_TRUNCATED; 1896 ip->trunc_off = vap->va_size; 1897 #ifdef DEBUG_TRUNCATE 1898 if (ip == HammerTruncIp) 1899 kprintf("truncate1 %016llx\n", ip->trunc_off); 1900 #endif 1901 } else if (ip->trunc_off > vap->va_size) { 1902 ip->trunc_off = vap->va_size; 1903 #ifdef DEBUG_TRUNCATE 1904 if (ip == HammerTruncIp) 1905 kprintf("truncate2 %016llx\n", ip->trunc_off); 1906 #endif 1907 } else { 1908 #ifdef DEBUG_TRUNCATE 1909 if (ip == HammerTruncIp) 1910 kprintf("truncate3 %016llx (ignored)\n", vap->va_size); 1911 #endif 1912 } 1913 } 1914 1915 /* 1916 * If truncating we have to clean out a portion of 1917 * the last block on-disk. 
We do this in the 1918 * front-end buffer cache. 1919 */ 1920 aligned_size = (vap->va_size + (blksize - 1)) & 1921 ~(int64_t)(blksize - 1); 1922 if (truncating && vap->va_size < aligned_size) { 1923 struct buf *bp; 1924 int offset; 1925 1926 aligned_size -= blksize; 1927 1928 offset = (int)vap->va_size & (blksize - 1); 1929 error = bread(ap->a_vp, aligned_size, 1930 blksize, &bp); 1931 hammer_ip_frontend_trunc(ip, aligned_size); 1932 if (error == 0) { 1933 bzero(bp->b_data + offset, 1934 blksize - offset); 1935 /* must de-cache direct-io offset */ 1936 bp->b_bio2.bio_offset = NOOFFSET; 1937 bdwrite(bp); 1938 } else { 1939 kprintf("ERROR %d\n", error); 1940 brelse(bp); 1941 } 1942 } 1943 break; 1944 case VDATABASE: 1945 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1946 ip->flags |= HAMMER_INODE_TRUNCATED; 1947 ip->trunc_off = vap->va_size; 1948 } else if (ip->trunc_off > vap->va_size) { 1949 ip->trunc_off = vap->va_size; 1950 } 1951 hammer_ip_frontend_trunc(ip, vap->va_size); 1952 ip->ino_data.size = vap->va_size; 1953 ip->ino_data.mtime = trans.time; 1954 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 1955 kflags |= NOTE_ATTRIB; 1956 break; 1957 default: 1958 error = EINVAL; 1959 goto done; 1960 } 1961 break; 1962 } 1963 if (vap->va_atime.tv_sec != VNOVAL) { 1964 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 1965 modflags |= HAMMER_INODE_ATIME; 1966 kflags |= NOTE_ATTRIB; 1967 } 1968 if (vap->va_mtime.tv_sec != VNOVAL) { 1969 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 1970 modflags |= HAMMER_INODE_MTIME; 1971 kflags |= NOTE_ATTRIB; 1972 } 1973 if (vap->va_mode != (mode_t)VNOVAL) { 1974 mode_t cur_mode = ip->ino_data.mode; 1975 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1976 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1977 1978 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 1979 cur_uid, cur_gid, &cur_mode); 1980 if (error == 0 && ip->ino_data.mode != cur_mode) { 1981 
ip->ino_data.mode = cur_mode; 1982 ip->ino_data.ctime = trans.time; 1983 modflags |= HAMMER_INODE_DDIRTY; 1984 kflags |= NOTE_ATTRIB; 1985 } 1986 } 1987 done: 1988 if (error == 0) 1989 hammer_modify_inode(ip, modflags); 1990 hammer_done_transaction(&trans); 1991 hammer_knote(ap->a_vp, kflags); 1992 return (error); 1993 } 1994 1995 /* 1996 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 1997 */ 1998 static 1999 int 2000 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2001 { 2002 struct hammer_transaction trans; 2003 struct hammer_inode *dip; 2004 struct hammer_inode *nip; 2005 struct nchandle *nch; 2006 hammer_record_t record; 2007 int error; 2008 int bytes; 2009 2010 ap->a_vap->va_type = VLNK; 2011 2012 nch = ap->a_nch; 2013 dip = VTOI(ap->a_dvp); 2014 2015 if (dip->flags & HAMMER_INODE_RO) 2016 return (EROFS); 2017 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 2018 return (error); 2019 2020 /* 2021 * Create a transaction to cover the operations we perform. 2022 */ 2023 hammer_start_transaction(&trans, dip->hmp); 2024 ++hammer_stats_file_iopsw; 2025 2026 /* 2027 * Create a new filesystem object of the requested type. The 2028 * returned inode will be referenced but not locked. 2029 */ 2030 2031 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2032 dip, NULL, &nip); 2033 if (error) { 2034 hammer_done_transaction(&trans); 2035 *ap->a_vpp = NULL; 2036 return (error); 2037 } 2038 2039 /* 2040 * Add a record representing the symlink. symlink stores the link 2041 * as pure data, not a string, and is no \0 terminated. 
2042 */ 2043 if (error == 0) { 2044 bytes = strlen(ap->a_target); 2045 2046 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2047 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2048 } else { 2049 record = hammer_alloc_mem_record(nip, bytes); 2050 record->type = HAMMER_MEM_RECORD_GENERAL; 2051 2052 record->leaf.base.localization = nip->obj_localization + 2053 HAMMER_LOCALIZE_MISC; 2054 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2055 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2056 record->leaf.data_len = bytes; 2057 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2058 bcopy(ap->a_target, record->data->symlink.name, bytes); 2059 error = hammer_ip_add_record(&trans, record); 2060 } 2061 2062 /* 2063 * Set the file size to the length of the link. 2064 */ 2065 if (error == 0) { 2066 nip->ino_data.size = bytes; 2067 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY); 2068 } 2069 } 2070 if (error == 0) 2071 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2072 nch->ncp->nc_nlen, nip); 2073 2074 /* 2075 * Finish up. 
2076 */ 2077 if (error) { 2078 hammer_rel_inode(nip, 0); 2079 *ap->a_vpp = NULL; 2080 } else { 2081 error = hammer_get_vnode(nip, ap->a_vpp); 2082 hammer_rel_inode(nip, 0); 2083 if (error == 0) { 2084 cache_setunresolved(ap->a_nch); 2085 cache_setvp(ap->a_nch, *ap->a_vpp); 2086 hammer_knote(ap->a_dvp, NOTE_WRITE); 2087 } 2088 } 2089 hammer_done_transaction(&trans); 2090 return (error); 2091 } 2092 2093 /* 2094 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2095 */ 2096 static 2097 int 2098 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2099 { 2100 struct hammer_transaction trans; 2101 struct hammer_inode *dip; 2102 int error; 2103 2104 dip = VTOI(ap->a_dvp); 2105 2106 if (hammer_nohistory(dip) == 0 && 2107 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2108 return (error); 2109 } 2110 2111 hammer_start_transaction(&trans, dip->hmp); 2112 ++hammer_stats_file_iopsw; 2113 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2114 ap->a_cred, ap->a_flags, -1); 2115 hammer_done_transaction(&trans); 2116 2117 return (error); 2118 } 2119 2120 /* 2121 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2122 */ 2123 static 2124 int 2125 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2126 { 2127 struct hammer_inode *ip = ap->a_vp->v_data; 2128 2129 ++hammer_stats_file_iopsr; 2130 return(hammer_ioctl(ip, ap->a_command, ap->a_data, 2131 ap->a_fflag, ap->a_cred)); 2132 } 2133 2134 static 2135 int 2136 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2137 { 2138 struct mount *mp; 2139 int error; 2140 2141 mp = ap->a_head.a_ops->head.vv_mount; 2142 2143 switch(ap->a_op) { 2144 case MOUNTCTL_SET_EXPORT: 2145 if (ap->a_ctllen != sizeof(struct export_args)) 2146 error = EINVAL; 2147 else 2148 error = hammer_vfs_export(mp, ap->a_op, 2149 (const struct export_args *)ap->a_ctl); 2150 break; 2151 default: 2152 error = journal_mountctl(ap); 2153 break; 2154 } 2155 return(error); 2156 } 2157 2158 /* 2159 * hammer_vop_strategy { vp, bio } 2160 * 2161 * Strategy 
call, used for regular file read & write only. Note that the 2162 * bp may represent a cluster. 2163 * 2164 * To simplify operation and allow better optimizations in the future, 2165 * this code does not make any assumptions with regards to buffer alignment 2166 * or size. 2167 */ 2168 static 2169 int 2170 hammer_vop_strategy(struct vop_strategy_args *ap) 2171 { 2172 struct buf *bp; 2173 int error; 2174 2175 bp = ap->a_bio->bio_buf; 2176 2177 switch(bp->b_cmd) { 2178 case BUF_CMD_READ: 2179 error = hammer_vop_strategy_read(ap); 2180 break; 2181 case BUF_CMD_WRITE: 2182 error = hammer_vop_strategy_write(ap); 2183 break; 2184 default: 2185 bp->b_error = error = EINVAL; 2186 bp->b_flags |= B_ERROR; 2187 biodone(ap->a_bio); 2188 break; 2189 } 2190 return (error); 2191 } 2192 2193 /* 2194 * Read from a regular file. Iterate the related records and fill in the 2195 * BIO/BUF. Gaps are zero-filled. 2196 * 2197 * The support code in hammer_object.c should be used to deal with mixed 2198 * in-memory and on-disk records. 2199 * 2200 * NOTE: Can be called from the cluster code with an oversized buf. 2201 * 2202 * XXX atime update 2203 */ 2204 static 2205 int 2206 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2207 { 2208 struct hammer_transaction trans; 2209 struct hammer_inode *ip; 2210 struct hammer_cursor cursor; 2211 hammer_base_elm_t base; 2212 hammer_off_t disk_offset; 2213 struct bio *bio; 2214 struct bio *nbio; 2215 struct buf *bp; 2216 int64_t rec_offset; 2217 int64_t ran_end; 2218 int64_t tmp64; 2219 int error; 2220 int boff; 2221 int roff; 2222 int n; 2223 2224 bio = ap->a_bio; 2225 bp = bio->bio_buf; 2226 ip = ap->a_vp->v_data; 2227 2228 /* 2229 * The zone-2 disk offset may have been set by the cluster code via 2230 * a BMAP operation, or else should be NOOFFSET. 2231 * 2232 * Checking the high bits for a match against zone-2 should suffice. 
2233 */ 2234 nbio = push_bio(bio); 2235 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2236 HAMMER_ZONE_LARGE_DATA) { 2237 error = hammer_io_direct_read(ip->hmp, nbio, NULL); 2238 return (error); 2239 } 2240 2241 /* 2242 * Well, that sucked. Do it the hard way. If all the stars are 2243 * aligned we may still be able to issue a direct-read. 2244 */ 2245 hammer_simple_transaction(&trans, ip->hmp); 2246 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2247 2248 /* 2249 * Key range (begin and end inclusive) to scan. Note that the key's 2250 * stored in the actual records represent BASE+LEN, not BASE. The 2251 * first record containing bio_offset will have a key > bio_offset. 2252 */ 2253 cursor.key_beg.localization = ip->obj_localization + 2254 HAMMER_LOCALIZE_MISC; 2255 cursor.key_beg.obj_id = ip->obj_id; 2256 cursor.key_beg.create_tid = 0; 2257 cursor.key_beg.delete_tid = 0; 2258 cursor.key_beg.obj_type = 0; 2259 cursor.key_beg.key = bio->bio_offset + 1; 2260 cursor.asof = ip->obj_asof; 2261 cursor.flags |= HAMMER_CURSOR_ASOF; 2262 2263 cursor.key_end = cursor.key_beg; 2264 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2265 #if 0 2266 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2267 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2268 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2269 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2270 } else 2271 #endif 2272 { 2273 ran_end = bio->bio_offset + bp->b_bufsize; 2274 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2275 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2276 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2277 if (tmp64 < ran_end) 2278 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2279 else 2280 cursor.key_end.key = ran_end + MAXPHYS + 1; 2281 } 2282 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2283 2284 error = hammer_ip_first(&cursor); 2285 boff = 0; 2286 2287 while (error == 0) { 2288 /* 2289 * Get the base file offset of the record. 
The key for 2290 * data records is (base + bytes) rather then (base). 2291 */ 2292 base = &cursor.leaf->base; 2293 rec_offset = base->key - cursor.leaf->data_len; 2294 2295 /* 2296 * Calculate the gap, if any, and zero-fill it. 2297 * 2298 * n is the offset of the start of the record verses our 2299 * current seek offset in the bio. 2300 */ 2301 n = (int)(rec_offset - (bio->bio_offset + boff)); 2302 if (n > 0) { 2303 if (n > bp->b_bufsize - boff) 2304 n = bp->b_bufsize - boff; 2305 bzero((char *)bp->b_data + boff, n); 2306 boff += n; 2307 n = 0; 2308 } 2309 2310 /* 2311 * Calculate the data offset in the record and the number 2312 * of bytes we can copy. 2313 * 2314 * There are two degenerate cases. First, boff may already 2315 * be at bp->b_bufsize. Secondly, the data offset within 2316 * the record may exceed the record's size. 2317 */ 2318 roff = -n; 2319 rec_offset += roff; 2320 n = cursor.leaf->data_len - roff; 2321 if (n <= 0) { 2322 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2323 n = 0; 2324 } else if (n > bp->b_bufsize - boff) { 2325 n = bp->b_bufsize - boff; 2326 } 2327 2328 /* 2329 * Deal with cached truncations. This cool bit of code 2330 * allows truncate()/ftruncate() to avoid having to sync 2331 * the file. 2332 * 2333 * If the frontend is truncated then all backend records are 2334 * subject to the frontend's truncation. 2335 * 2336 * If the backend is truncated then backend records on-disk 2337 * (but not in-memory) are subject to the backend's 2338 * truncation. In-memory records owned by the backend 2339 * represent data written after the truncation point on the 2340 * backend and must not be truncated. 2341 * 2342 * Truncate operations deal with frontend buffer cache 2343 * buffers and frontend-owned in-memory records synchronously. 
2344 */ 2345 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2346 if (hammer_cursor_ondisk(&cursor) || 2347 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2348 if (ip->trunc_off <= rec_offset) 2349 n = 0; 2350 else if (ip->trunc_off < rec_offset + n) 2351 n = (int)(ip->trunc_off - rec_offset); 2352 } 2353 } 2354 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2355 if (hammer_cursor_ondisk(&cursor)) { 2356 if (ip->sync_trunc_off <= rec_offset) 2357 n = 0; 2358 else if (ip->sync_trunc_off < rec_offset + n) 2359 n = (int)(ip->sync_trunc_off - rec_offset); 2360 } 2361 } 2362 2363 /* 2364 * Try to issue a direct read into our bio if possible, 2365 * otherwise resolve the element data into a hammer_buffer 2366 * and copy. 2367 * 2368 * The buffer on-disk should be zerod past any real 2369 * truncation point, but may not be for any synthesized 2370 * truncation point from above. 2371 */ 2372 disk_offset = cursor.leaf->data_offset + roff; 2373 if (boff == 0 && n == bp->b_bufsize && 2374 hammer_cursor_ondisk(&cursor) && 2375 (disk_offset & HAMMER_BUFMASK) == 0) { 2376 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2377 HAMMER_ZONE_LARGE_DATA); 2378 nbio->bio_offset = disk_offset; 2379 error = hammer_io_direct_read(trans.hmp, nbio, 2380 cursor.leaf); 2381 goto done; 2382 } else if (n) { 2383 error = hammer_ip_resolve_data(&cursor); 2384 if (error == 0) { 2385 bcopy((char *)cursor.data + roff, 2386 (char *)bp->b_data + boff, n); 2387 } 2388 } 2389 if (error) 2390 break; 2391 2392 /* 2393 * Iterate until we have filled the request. 
2394 */ 2395 boff += n; 2396 if (boff == bp->b_bufsize) 2397 break; 2398 error = hammer_ip_next(&cursor); 2399 } 2400 2401 /* 2402 * There may have been a gap after the last record 2403 */ 2404 if (error == ENOENT) 2405 error = 0; 2406 if (error == 0 && boff != bp->b_bufsize) { 2407 KKASSERT(boff < bp->b_bufsize); 2408 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2409 /* boff = bp->b_bufsize; */ 2410 } 2411 bp->b_resid = 0; 2412 bp->b_error = error; 2413 if (error) 2414 bp->b_flags |= B_ERROR; 2415 biodone(ap->a_bio); 2416 2417 done: 2418 if (cursor.node) 2419 hammer_cache_node(&ip->cache[1], cursor.node); 2420 hammer_done_cursor(&cursor); 2421 hammer_done_transaction(&trans); 2422 return(error); 2423 } 2424 2425 /* 2426 * BMAP operation - used to support cluster_read() only. 2427 * 2428 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2429 * 2430 * This routine may return EOPNOTSUPP if the opration is not supported for 2431 * the specified offset. The contents of the pointer arguments do not 2432 * need to be initialized in that case. 2433 * 2434 * If a disk address is available and properly aligned return 0 with 2435 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2436 * to the run-length relative to that offset. Callers may assume that 2437 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2438 * large, so return EOPNOTSUPP if it is not sufficiently large. 
2439 */ 2440 static 2441 int 2442 hammer_vop_bmap(struct vop_bmap_args *ap) 2443 { 2444 struct hammer_transaction trans; 2445 struct hammer_inode *ip; 2446 struct hammer_cursor cursor; 2447 hammer_base_elm_t base; 2448 int64_t rec_offset; 2449 int64_t ran_end; 2450 int64_t tmp64; 2451 int64_t base_offset; 2452 int64_t base_disk_offset; 2453 int64_t last_offset; 2454 hammer_off_t last_disk_offset; 2455 hammer_off_t disk_offset; 2456 int rec_len; 2457 int error; 2458 int blksize; 2459 2460 ++hammer_stats_file_iopsr; 2461 ip = ap->a_vp->v_data; 2462 2463 /* 2464 * We can only BMAP regular files. We can't BMAP database files, 2465 * directories, etc. 2466 */ 2467 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2468 return(EOPNOTSUPP); 2469 2470 /* 2471 * bmap is typically called with runp/runb both NULL when used 2472 * for writing. We do not support BMAP for writing atm. 2473 */ 2474 if (ap->a_cmd != BUF_CMD_READ) 2475 return(EOPNOTSUPP); 2476 2477 /* 2478 * Scan the B-Tree to acquire blockmap addresses, then translate 2479 * to raw addresses. 2480 */ 2481 hammer_simple_transaction(&trans, ip->hmp); 2482 #if 0 2483 kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2484 #endif 2485 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2486 2487 /* 2488 * Key range (begin and end inclusive) to scan. Note that the key's 2489 * stored in the actual records represent BASE+LEN, not BASE. The 2490 * first record containing bio_offset will have a key > bio_offset. 
2491 */ 2492 cursor.key_beg.localization = ip->obj_localization + 2493 HAMMER_LOCALIZE_MISC; 2494 cursor.key_beg.obj_id = ip->obj_id; 2495 cursor.key_beg.create_tid = 0; 2496 cursor.key_beg.delete_tid = 0; 2497 cursor.key_beg.obj_type = 0; 2498 if (ap->a_runb) 2499 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 2500 else 2501 cursor.key_beg.key = ap->a_loffset + 1; 2502 if (cursor.key_beg.key < 0) 2503 cursor.key_beg.key = 0; 2504 cursor.asof = ip->obj_asof; 2505 cursor.flags |= HAMMER_CURSOR_ASOF; 2506 2507 cursor.key_end = cursor.key_beg; 2508 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2509 2510 ran_end = ap->a_loffset + MAXPHYS; 2511 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2512 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2513 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2514 if (tmp64 < ran_end) 2515 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2516 else 2517 cursor.key_end.key = ran_end + MAXPHYS + 1; 2518 2519 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2520 2521 error = hammer_ip_first(&cursor); 2522 base_offset = last_offset = 0; 2523 base_disk_offset = last_disk_offset = 0; 2524 2525 while (error == 0) { 2526 /* 2527 * Get the base file offset of the record. The key for 2528 * data records is (base + bytes) rather then (base). 2529 * 2530 * NOTE: rec_offset + rec_len may exceed the end-of-file. 2531 * The extra bytes should be zero on-disk and the BMAP op 2532 * should still be ok. 2533 */ 2534 base = &cursor.leaf->base; 2535 rec_offset = base->key - cursor.leaf->data_len; 2536 rec_len = cursor.leaf->data_len; 2537 2538 /* 2539 * Incorporate any cached truncation. 2540 * 2541 * NOTE: Modifications to rec_len based on synthesized 2542 * truncation points remove the guarantee that any extended 2543 * data on disk is zero (since the truncations may not have 2544 * taken place on-media yet). 
2545 */ 2546 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2547 if (hammer_cursor_ondisk(&cursor) || 2548 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2549 if (ip->trunc_off <= rec_offset) 2550 rec_len = 0; 2551 else if (ip->trunc_off < rec_offset + rec_len) 2552 rec_len = (int)(ip->trunc_off - rec_offset); 2553 } 2554 } 2555 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2556 if (hammer_cursor_ondisk(&cursor)) { 2557 if (ip->sync_trunc_off <= rec_offset) 2558 rec_len = 0; 2559 else if (ip->sync_trunc_off < rec_offset + rec_len) 2560 rec_len = (int)(ip->sync_trunc_off - rec_offset); 2561 } 2562 } 2563 2564 /* 2565 * Accumulate information. If we have hit a discontiguous 2566 * block reset base_offset unless we are already beyond the 2567 * requested offset. If we are, that's it, we stop. 2568 */ 2569 if (error) 2570 break; 2571 if (hammer_cursor_ondisk(&cursor)) { 2572 disk_offset = cursor.leaf->data_offset; 2573 if (rec_offset != last_offset || 2574 disk_offset != last_disk_offset) { 2575 if (rec_offset > ap->a_loffset) 2576 break; 2577 base_offset = rec_offset; 2578 base_disk_offset = disk_offset; 2579 } 2580 last_offset = rec_offset + rec_len; 2581 last_disk_offset = disk_offset + rec_len; 2582 } 2583 error = hammer_ip_next(&cursor); 2584 } 2585 2586 #if 0 2587 kprintf("BMAP %016llx: %016llx - %016llx\n", 2588 ap->a_loffset, base_offset, last_offset); 2589 kprintf("BMAP %16s: %016llx - %016llx\n", 2590 "", base_disk_offset, last_disk_offset); 2591 #endif 2592 2593 if (cursor.node) { 2594 hammer_cache_node(&ip->cache[1], cursor.node); 2595 #if 0 2596 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2597 #endif 2598 } 2599 hammer_done_cursor(&cursor); 2600 hammer_done_transaction(&trans); 2601 2602 /* 2603 * If we couldn't find any records or the records we did find were 2604 * all behind the requested offset, return failure. A forward 2605 * truncation can leave a hole w/ no on-disk records. 
2606 */ 2607 if (last_offset == 0 || last_offset < ap->a_loffset) 2608 return (EOPNOTSUPP); 2609 2610 /* 2611 * Figure out the block size at the requested offset and adjust 2612 * our limits so the cluster_read() does not create inappropriately 2613 * sized buffer cache buffers. 2614 */ 2615 blksize = hammer_blocksize(ap->a_loffset); 2616 if (hammer_blocksize(base_offset) != blksize) { 2617 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 2618 } 2619 if (last_offset != ap->a_loffset && 2620 hammer_blocksize(last_offset - 1) != blksize) { 2621 last_offset = hammer_blockdemarc(ap->a_loffset, 2622 last_offset - 1); 2623 } 2624 2625 /* 2626 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 2627 * from occuring. 2628 */ 2629 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 2630 2631 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 2632 /* 2633 * Only large-data zones can be direct-IOd 2634 */ 2635 error = EOPNOTSUPP; 2636 } else if ((disk_offset & HAMMER_BUFMASK) || 2637 (last_offset - ap->a_loffset) < blksize) { 2638 /* 2639 * doffsetp is not aligned or the forward run size does 2640 * not cover a whole buffer, disallow the direct I/O. 2641 */ 2642 error = EOPNOTSUPP; 2643 } else { 2644 /* 2645 * We're good. 2646 */ 2647 *ap->a_doffsetp = disk_offset; 2648 if (ap->a_runb) { 2649 *ap->a_runb = ap->a_loffset - base_offset; 2650 KKASSERT(*ap->a_runb >= 0); 2651 } 2652 if (ap->a_runp) { 2653 *ap->a_runp = last_offset - ap->a_loffset; 2654 KKASSERT(*ap->a_runp >= 0); 2655 } 2656 error = 0; 2657 } 2658 return(error); 2659 } 2660 2661 /* 2662 * Write to a regular file. Because this is a strategy call the OS is 2663 * trying to actually get data onto the media. 
 *
 * The bio is completed here (biodone) with EROFS when the inode is
 * read-only, with no error and no I/O when the inode is being or has
 * been deleted, and with the error from hammer_ip_add_bulk() when no
 * record could be created.  Otherwise the bio is handed to
 * hammer_io_direct_write() together with the new in-memory record.
 *
 * Returns EROFS, 0, or the error reported by hammer_ip_add_bulk().
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/* The buffer must be exactly one HAMMER block at this offset. */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;	/* round up to 16 */

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);
	if (record) {
		hammer_io_direct_write(hmp, record, bio);
		/* Kick off a flush once too many records are reserved. */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	/* NOTE(review): presumably error is 0 when a record was returned; confirm in hammer_ip_add_bulk() */
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * Looks up the name held by nch->ncp in directory dvp, optionally
 * validates the target's type against isdir (> 0: must be a directory,
 * 0: must not be a directory, < 0: don't care), verifies that a target
 * directory is empty, deletes the directory entry and invalidates the
 * related namecache / vnode state.  The whole scan is restarted whenever
 * EDEADLK is produced.
 *
 * Returns 0 on success, EROFS for a read-only directory inode, ENOTDIR /
 * EISDIR on a type mismatch, or the error from the lookup / delete code.
 *
 * cred and flags are not referenced by this function.
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, 
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	/* (Re)build the cursor from scratch; also the EDEADLK restart point. */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  error contains the error code
	 * on search termination and could be 0, ENOENT, or something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		/* Leave the loop with error == 0 on an exact name match. */
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * A dangling directory entry is still removed; the
			 * missing inode is only reported.
			 */
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
				        HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/* Drop the namecache association for the entry. */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip && ip->vp) {
				hammer_knote(ip->vp, NOTE_DELETE);
				cache_inval_vp(ip->vp, CINV_DESTROY);
			}
		}
		/* Release the reference obtained from hammer_get_inode(). */
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 * Thin wrappers that forward the operation to the fifo or spec vnode
 * operations vector via VOCALL().
 */

/*
 * Close a FIFO: forwarded to fifo_vnode_vops.
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

/*
 * Read from a FIFO: forwarded to fifo_vnode_vops.
 */
static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * Write to a FIFO: forwarded to fifo_vnode_vops.
 */
static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * kqfilter on a FIFO: try the fifo implementation first and fall back
 * to hammer_vop_kqfilter() if it returns non-zero.
 */
static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/*
 * Close a special file: forwarded to spec_vnode_vops.
 */
static int
hammer_vop_specclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

/*
 * Read from a special file: forwarded to spec_vnode_vops.
 */
static int
hammer_vop_specread (struct vop_read_args *ap)
{
	/* XXX update access time */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

/*
 * Write to a special file: forwarded to spec_vnode_vops.
 */
static int
hammer_vop_specwrite (struct vop_write_args *ap)
{
	/* XXX update last change time */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

/************************************************************************
 *			    KQFILTER OPS				*
 ************************************************************************
 *
 */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/*
 * Filter operation tables; all three share filt_hammerdetach.
 * NOTE(review): the leading 1 / NULL are presumably the "is fd" flag and
 * an unused attach hook of struct filterops - confirm in <sys/event.h>.
 */
static struct filterops hammerread_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammervnode };

/*
 * Attach a knote to the vnode.  Selects the filter ops from
 * kn->kn_filter, stores the vnode in kn->kn_hook and links the knote
 * onto the vnode's note list while holding the pollinfo token.
 *
 * Returns 0 on success or 1 for an unsupported filter type.
 */
static
int
hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;
	lwkt_tokref ilock;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &hammerread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &hammerwrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &hammervnode_filtops;
		break;
	default:
		return (1);
	}

	kn->kn_hook = (caddr_t)vp;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
	lwkt_reltoken(&ilock);

	return(0);
}

/*
 * Detach a knote: unlink it from the note list of the vnode stored in
 * kn->kn_hook, under the pollinfo token.
 */
static void
filt_hammerdetach(struct knote *kn)
{
	struct vnode *vp = (void *)kn->kn_hook;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
		     kn, knote, kn_selnext);
	lwkt_reltoken(&ilock);
}

/*
 * Read filter.  On NOTE_REVOKE flags EOF/one-shot and reports the event.
 * Otherwise kn_data is set to the distance between the inode's size and
 * the file's current offset, and the event fires when that is non-zero.
 */
static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);

	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return(1);
	}
	kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
	return (kn->kn_data != 0);
}

/*
 * Write filter.  Always reports the event with kn_data of 0; on
 * NOTE_REVOKE the knote is additionally flagged EOF/one-shot.
 */
static int
filt_hammerwrite(struct knote *kn, long hint)
{
	if (hint == NOTE_REVOKE)
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

/*
 * Vnode filter.  Records the hint in kn_fflags when it intersects the
 * requested kn_sfflags.  NOTE_REVOKE sets EOF and always fires;
 * otherwise the event fires once any flag has been recorded.
 */
static int
filt_hammervnode(struct knote *kn, long hint)
{
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_fflags != 0);
}