1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

/*
 * Vnode operations vector for regular HAMMER files and directories.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default = vop_defaultop,
	.vop_fsync = hammer_vop_fsync,
	.vop_getpages = vop_stdgetpages,
	.vop_putpages = vop_stdputpages,
	.vop_read = hammer_vop_read,
	.vop_write = hammer_vop_write,
	.vop_access = hammer_vop_access,
	.vop_advlock = hammer_vop_advlock,
	.vop_close = hammer_vop_close,
	.vop_ncreate = hammer_vop_ncreate,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_nresolve = hammer_vop_nresolve,
	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	.vop_nlink = hammer_vop_nlink,
	.vop_nmkdir = hammer_vop_nmkdir,
	.vop_nmknod = hammer_vop_nmknod,
	.vop_open = hammer_vop_open,
	.vop_pathconf = vop_stdpathconf,
	.vop_print = hammer_vop_print,
	.vop_readdir = hammer_vop_readdir,
	.vop_readlink = hammer_vop_readlink,
	.vop_nremove = hammer_vop_nremove,
	.vop_nrename = hammer_vop_nrename,
	.vop_nrmdir = hammer_vop_nrmdir,
	.vop_setattr = hammer_vop_setattr,
	.vop_bmap = hammer_vop_bmap,
	.vop_strategy = hammer_vop_strategy,
	.vop_nsymlink = hammer_vop_nsymlink,
	.vop_nwhiteout = hammer_vop_nwhiteout,
	.vop_ioctl = hammer_vop_ioctl,
	.vop_mountctl = hammer_vop_mountctl,
	.vop_kqfilter = hammer_vop_kqfilter
};

/*
 * Vnode operations vector for special (device) files; most operations
 * defer to spec_vnoperate, with HAMMER supplying attribute handling.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default = spec_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_specread,
	.vop_write = hammer_vop_specwrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_specclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

/*
 * Vnode operations vector for FIFOs; defers to fifo_vnoperate.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default = fifo_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_fiforead,
	.vop_write = hammer_vop_fifowrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_fifoclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr,
	.vop_kqfilter = hammer_vop_fifokqfilter
};

/*
 * Post kqueue events on the vnode's knote list, but only when there is
 * actually something to report (flags != 0).
 */
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			struct vnode *dvp, struct ucred *cred,
			int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
/*
 * NOTE(review): dead code (#if 0) -- the parameter name "ap" is missing
 * from the definition although the body references it; it would need to
 * be restored before this could ever be re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	++hammer_count_fsyncs;
	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT) {
		/*
		 * The vnode lock is dropped while waiting so the flusher
		 * is not stalled against us.
		 */
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}
	/*
	 * Update atime unless this is a read-only (as-of) inode or the
	 * mount disables atime updates.
	 */
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int delta;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	/* defer the actual permission check to the generic helper */
	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	/*hammer_inode_t ip = VTOI(ap->a_vp);*/
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
600 */ 601 static 602 int 603 hammer_vop_ncreate(struct vop_ncreate_args *ap) 604 { 605 struct hammer_transaction trans; 606 struct hammer_inode *dip; 607 struct hammer_inode *nip; 608 struct nchandle *nch; 609 int error; 610 611 nch = ap->a_nch; 612 dip = VTOI(ap->a_dvp); 613 614 if (dip->flags & HAMMER_INODE_RO) 615 return (EROFS); 616 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 617 return (error); 618 619 /* 620 * Create a transaction to cover the operations we perform. 621 */ 622 hammer_start_transaction(&trans, dip->hmp); 623 ++hammer_stats_file_iopsw; 624 625 /* 626 * Create a new filesystem object of the requested type. The 627 * returned inode will be referenced and shared-locked to prevent 628 * it from being moved to the flusher. 629 */ 630 631 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 632 dip, NULL, &nip); 633 if (error) { 634 hkprintf("hammer_create_inode error %d\n", error); 635 hammer_done_transaction(&trans); 636 *ap->a_vpp = NULL; 637 return (error); 638 } 639 640 /* 641 * Add the new filesystem object to the directory. This will also 642 * bump the inode's link count. 643 */ 644 error = hammer_ip_add_directory(&trans, dip, 645 nch->ncp->nc_name, nch->ncp->nc_nlen, 646 nip); 647 if (error) 648 hkprintf("hammer_ip_add_directory error %d\n", error); 649 650 /* 651 * Finish up. 652 */ 653 if (error) { 654 hammer_rel_inode(nip, 0); 655 hammer_done_transaction(&trans); 656 *ap->a_vpp = NULL; 657 } else { 658 error = hammer_get_vnode(nip, ap->a_vpp); 659 hammer_done_transaction(&trans); 660 hammer_rel_inode(nip, 0); 661 if (error == 0) { 662 cache_setunresolved(ap->a_nch); 663 cache_setvp(ap->a_nch, *ap->a_vpp); 664 } 665 hammer_knote(ap->a_dvp, NOTE_WRITE); 666 } 667 return (error); 668 } 669 670 /* 671 * hammer_vop_getattr { vp, vap } 672 * 673 * Retrieve an inode's attribute information. When accessing inodes 674 * historically we fake the atime field to ensure consistent results. 
675 * The atime field is stored in the B-Tree element and allowed to be 676 * updated without cycling the element. 677 */ 678 static 679 int 680 hammer_vop_getattr(struct vop_getattr_args *ap) 681 { 682 struct hammer_inode *ip = VTOI(ap->a_vp); 683 struct vattr *vap = ap->a_vap; 684 685 /* 686 * We want the fsid to be different when accessing a filesystem 687 * with different as-of's so programs like diff don't think 688 * the files are the same. 689 * 690 * We also want the fsid to be the same when comparing snapshots, 691 * or when comparing mirrors (which might be backed by different 692 * physical devices). HAMMER fsids are based on the PFS's 693 * shared_uuid field. 694 * 695 * XXX there is a chance of collision here. The va_fsid reported 696 * by stat is different from the more involved fsid used in the 697 * mount structure. 698 */ 699 ++hammer_stats_file_iopsr; 700 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 701 (u_int32_t)(ip->obj_asof >> 32); 702 703 vap->va_fileid = ip->ino_leaf.base.obj_id; 704 vap->va_mode = ip->ino_data.mode; 705 vap->va_nlink = ip->ino_data.nlinks; 706 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 707 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 708 vap->va_rmajor = 0; 709 vap->va_rminor = 0; 710 vap->va_size = ip->ino_data.size; 711 712 /* 713 * Special case for @@PFS softlinks. The actual size of the 714 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 715 */ 716 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 717 ip->ino_data.size == 10 && 718 ip->obj_asof == HAMMER_MAX_TID && 719 ip->obj_localization == 0 && 720 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 721 vap->va_size = 26; 722 } 723 724 /* 725 * We must provide a consistent atime and mtime for snapshots 726 * so people can do a 'tar cf - ... | md5' on them and get 727 * consistent results. 
728 */ 729 if (ip->flags & HAMMER_INODE_RO) { 730 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 731 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 732 } else { 733 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 734 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 735 } 736 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 737 vap->va_flags = ip->ino_data.uflags; 738 vap->va_gen = 1; /* hammer inums are unique for all time */ 739 vap->va_blocksize = HAMMER_BUFSIZE; 740 if (ip->ino_data.size >= HAMMER_XDEMARC) { 741 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 742 ~HAMMER_XBUFMASK64; 743 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 744 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 745 ~HAMMER_BUFMASK64; 746 } else { 747 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 748 } 749 750 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 751 vap->va_filerev = 0; /* XXX */ 752 /* mtime uniquely identifies any adjustments made to the file XXX */ 753 vap->va_fsmid = ip->ino_data.mtime; 754 vap->va_uid_uuid = ip->ino_data.uid; 755 vap->va_gid_uuid = ip->ino_data.gid; 756 vap->va_fsid_uuid = ip->hmp->fsid; 757 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 758 VA_FSID_UUID_VALID; 759 760 switch (ip->ino_data.obj_type) { 761 case HAMMER_OBJTYPE_CDEV: 762 case HAMMER_OBJTYPE_BDEV: 763 vap->va_rmajor = ip->ino_data.rmajor; 764 vap->va_rminor = ip->ino_data.rminor; 765 break; 766 default: 767 break; 768 } 769 return(0); 770 } 771 772 /* 773 * hammer_vop_nresolve { nch, dvp, cred } 774 * 775 * Locate the requested directory entry. 
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * NOTE(review): when i == nlen - 1 the nc_name[i+1] access reads
	 * one byte past the component length -- presumably safe because
	 * of how namecache names are stored, but verify against the
	 * namecache allocation.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* match on both name length and name bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* cache the negative hit */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
961 */ 962 static 963 int 964 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 965 { 966 struct hammer_transaction trans; 967 struct hammer_inode *dip; 968 struct hammer_inode *ip; 969 int64_t parent_obj_id; 970 u_int32_t parent_obj_localization; 971 hammer_tid_t asof; 972 int error; 973 974 dip = VTOI(ap->a_dvp); 975 asof = dip->obj_asof; 976 977 /* 978 * Whos are parent? This could be the root of a pseudo-filesystem 979 * whos parent is in another localization domain. 980 */ 981 parent_obj_id = dip->ino_data.parent_obj_id; 982 if (dip->obj_id == HAMMER_OBJID_ROOT) 983 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 984 else 985 parent_obj_localization = dip->obj_localization; 986 987 if (parent_obj_id == 0) { 988 if (dip->obj_id == HAMMER_OBJID_ROOT && 989 asof != dip->hmp->asof) { 990 parent_obj_id = dip->obj_id; 991 asof = dip->hmp->asof; 992 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 993 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 994 dip->obj_asof); 995 } else { 996 *ap->a_vpp = NULL; 997 return ENOENT; 998 } 999 } 1000 1001 hammer_simple_transaction(&trans, dip->hmp); 1002 ++hammer_stats_file_iopsr; 1003 1004 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1005 asof, parent_obj_localization, 1006 dip->flags, &error); 1007 if (ip) { 1008 error = hammer_get_vnode(ip, ap->a_vpp); 1009 hammer_rel_inode(ip, 0); 1010 } else { 1011 *ap->a_vpp = NULL; 1012 } 1013 hammer_done_transaction(&trans); 1014 return (error); 1015 } 1016 1017 /* 1018 * hammer_vop_nlink { nch, dvp, vp, cred } 1019 */ 1020 static 1021 int 1022 hammer_vop_nlink(struct vop_nlink_args *ap) 1023 { 1024 struct hammer_transaction trans; 1025 struct hammer_inode *dip; 1026 struct hammer_inode *ip; 1027 struct nchandle *nch; 1028 int error; 1029 1030 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1031 return(EXDEV); 1032 1033 nch = ap->a_nch; 1034 dip = VTOI(ap->a_dvp); 1035 ip = VTOI(ap->a_vp); 1036 1037 if (dip->obj_localization != ip->obj_localization) 1038 
return(EXDEV); 1039 1040 if (dip->flags & HAMMER_INODE_RO) 1041 return (EROFS); 1042 if (ip->flags & HAMMER_INODE_RO) 1043 return (EROFS); 1044 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1045 return (error); 1046 1047 /* 1048 * Create a transaction to cover the operations we perform. 1049 */ 1050 hammer_start_transaction(&trans, dip->hmp); 1051 ++hammer_stats_file_iopsw; 1052 1053 /* 1054 * Add the filesystem object to the directory. Note that neither 1055 * dip nor ip are referenced or locked, but their vnodes are 1056 * referenced. This function will bump the inode's link count. 1057 */ 1058 error = hammer_ip_add_directory(&trans, dip, 1059 nch->ncp->nc_name, nch->ncp->nc_nlen, 1060 ip); 1061 1062 /* 1063 * Finish up. 1064 */ 1065 if (error == 0) { 1066 cache_setunresolved(nch); 1067 cache_setvp(nch, ap->a_vp); 1068 } 1069 hammer_done_transaction(&trans); 1070 hammer_knote(ap->a_vp, NOTE_LINK); 1071 hammer_knote(ap->a_dvp, NOTE_WRITE); 1072 return (error); 1073 } 1074 1075 /* 1076 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1077 * 1078 * The operating system has already ensured that the directory entry 1079 * does not exist and done all appropriate namespace locking. 1080 */ 1081 static 1082 int 1083 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1084 { 1085 struct hammer_transaction trans; 1086 struct hammer_inode *dip; 1087 struct hammer_inode *nip; 1088 struct nchandle *nch; 1089 int error; 1090 1091 nch = ap->a_nch; 1092 dip = VTOI(ap->a_dvp); 1093 1094 if (dip->flags & HAMMER_INODE_RO) 1095 return (EROFS); 1096 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1097 return (error); 1098 1099 /* 1100 * Create a transaction to cover the operations we perform. 1101 */ 1102 hammer_start_transaction(&trans, dip->hmp); 1103 ++hammer_stats_file_iopsw; 1104 1105 /* 1106 * Create a new filesystem object of the requested type. The 1107 * returned inode will be referenced but not locked. 
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	/* disallow opening a read-only (e.g. historical) inode for write */
	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		/* rough estimate of how many entries fit, capped at 1024 */
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries ("." at offset 0, ".." at offset 1)
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			/* no parent: ".." refers to this directory itself */
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* slave: pin at last synced TID */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version: the link data is stored in a separate record.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.
	 * Directory keys directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct
	       hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/* renames may not cross localization (PFS) boundaries */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error =
			vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			/* short link fits inline in the inode data */
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
 */
static
int
hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
				ap->a_cred, ap->a_flags, -1);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_ioctl { vp, command, data, fflag, cred }
 */
static
int
hammer_vop_ioctl(struct vop_ioctl_args *ap)
{
	struct hammer_inode *ip = ap->a_vp->v_data;

	++hammer_stats_file_iopsr;
	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
			    ap->a_fflag, ap->a_cred));
}

static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	struct mount *mp;
	int error;

	mp = ap->a_head.a_ops->head.vv_mount;

	switch(ap->a_op) {
	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
		break;
	default:
		error = journal_mountctl(ap);
		break;
	}
	return(error);
}

/*
 * hammer_vop_strategy { vp, bio }
 *
 * Strategy
 * call, used for regular file read & write only.  Note that the
 * bp may represent a cluster.
 *
 * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regards to buffer alignment
 * or size.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}
	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;
	int roff;
	int n;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
		return (error);
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.
		 * The key for data records is (base + bytes) rather
		 * then (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record verses our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
2273 */ 2274 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2275 if (hammer_cursor_ondisk(&cursor) || 2276 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2277 if (ip->trunc_off <= rec_offset) 2278 n = 0; 2279 else if (ip->trunc_off < rec_offset + n) 2280 n = (int)(ip->trunc_off - rec_offset); 2281 } 2282 } 2283 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2284 if (hammer_cursor_ondisk(&cursor)) { 2285 if (ip->sync_trunc_off <= rec_offset) 2286 n = 0; 2287 else if (ip->sync_trunc_off < rec_offset + n) 2288 n = (int)(ip->sync_trunc_off - rec_offset); 2289 } 2290 } 2291 2292 /* 2293 * Try to issue a direct read into our bio if possible, 2294 * otherwise resolve the element data into a hammer_buffer 2295 * and copy. 2296 * 2297 * The buffer on-disk should be zerod past any real 2298 * truncation point, but may not be for any synthesized 2299 * truncation point from above. 2300 */ 2301 disk_offset = cursor.leaf->data_offset + roff; 2302 if (boff == 0 && n == bp->b_bufsize && 2303 hammer_cursor_ondisk(&cursor) && 2304 (disk_offset & HAMMER_BUFMASK) == 0) { 2305 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2306 HAMMER_ZONE_LARGE_DATA); 2307 nbio->bio_offset = disk_offset; 2308 error = hammer_io_direct_read(trans.hmp, nbio, 2309 cursor.leaf); 2310 goto done; 2311 } else if (n) { 2312 error = hammer_ip_resolve_data(&cursor); 2313 if (error == 0) { 2314 bcopy((char *)cursor.data + roff, 2315 (char *)bp->b_data + boff, n); 2316 } 2317 } 2318 if (error) 2319 break; 2320 2321 /* 2322 * Iterate until we have filled the request. 
2323 */ 2324 boff += n; 2325 if (boff == bp->b_bufsize) 2326 break; 2327 error = hammer_ip_next(&cursor); 2328 } 2329 2330 /* 2331 * There may have been a gap after the last record 2332 */ 2333 if (error == ENOENT) 2334 error = 0; 2335 if (error == 0 && boff != bp->b_bufsize) { 2336 KKASSERT(boff < bp->b_bufsize); 2337 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2338 /* boff = bp->b_bufsize; */ 2339 } 2340 bp->b_resid = 0; 2341 bp->b_error = error; 2342 if (error) 2343 bp->b_flags |= B_ERROR; 2344 biodone(ap->a_bio); 2345 2346 done: 2347 if (cursor.node) 2348 hammer_cache_node(&ip->cache[1], cursor.node); 2349 hammer_done_cursor(&cursor); 2350 hammer_done_transaction(&trans); 2351 return(error); 2352 } 2353 2354 /* 2355 * BMAP operation - used to support cluster_read() only. 2356 * 2357 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2358 * 2359 * This routine may return EOPNOTSUPP if the opration is not supported for 2360 * the specified offset. The contents of the pointer arguments do not 2361 * need to be initialized in that case. 2362 * 2363 * If a disk address is available and properly aligned return 0 with 2364 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2365 * to the run-length relative to that offset. Callers may assume that 2366 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2367 * large, so return EOPNOTSUPP if it is not sufficiently large. 
2368 */ 2369 static 2370 int 2371 hammer_vop_bmap(struct vop_bmap_args *ap) 2372 { 2373 struct hammer_transaction trans; 2374 struct hammer_inode *ip; 2375 struct hammer_cursor cursor; 2376 hammer_base_elm_t base; 2377 int64_t rec_offset; 2378 int64_t ran_end; 2379 int64_t tmp64; 2380 int64_t base_offset; 2381 int64_t base_disk_offset; 2382 int64_t last_offset; 2383 hammer_off_t last_disk_offset; 2384 hammer_off_t disk_offset; 2385 int rec_len; 2386 int error; 2387 int blksize; 2388 2389 ++hammer_stats_file_iopsr; 2390 ip = ap->a_vp->v_data; 2391 2392 /* 2393 * We can only BMAP regular files. We can't BMAP database files, 2394 * directories, etc. 2395 */ 2396 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2397 return(EOPNOTSUPP); 2398 2399 /* 2400 * bmap is typically called with runp/runb both NULL when used 2401 * for writing. We do not support BMAP for writing atm. 2402 */ 2403 if (ap->a_cmd != BUF_CMD_READ) 2404 return(EOPNOTSUPP); 2405 2406 /* 2407 * Scan the B-Tree to acquire blockmap addresses, then translate 2408 * to raw addresses. 2409 */ 2410 hammer_simple_transaction(&trans, ip->hmp); 2411 #if 0 2412 kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2413 #endif 2414 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2415 2416 /* 2417 * Key range (begin and end inclusive) to scan. Note that the key's 2418 * stored in the actual records represent BASE+LEN, not BASE. The 2419 * first record containing bio_offset will have a key > bio_offset. 
2420 */ 2421 cursor.key_beg.localization = ip->obj_localization + 2422 HAMMER_LOCALIZE_MISC; 2423 cursor.key_beg.obj_id = ip->obj_id; 2424 cursor.key_beg.create_tid = 0; 2425 cursor.key_beg.delete_tid = 0; 2426 cursor.key_beg.obj_type = 0; 2427 if (ap->a_runb) 2428 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 2429 else 2430 cursor.key_beg.key = ap->a_loffset + 1; 2431 if (cursor.key_beg.key < 0) 2432 cursor.key_beg.key = 0; 2433 cursor.asof = ip->obj_asof; 2434 cursor.flags |= HAMMER_CURSOR_ASOF; 2435 2436 cursor.key_end = cursor.key_beg; 2437 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2438 2439 ran_end = ap->a_loffset + MAXPHYS; 2440 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2441 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2442 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2443 if (tmp64 < ran_end) 2444 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2445 else 2446 cursor.key_end.key = ran_end + MAXPHYS + 1; 2447 2448 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2449 2450 error = hammer_ip_first(&cursor); 2451 base_offset = last_offset = 0; 2452 base_disk_offset = last_disk_offset = 0; 2453 2454 while (error == 0) { 2455 /* 2456 * Get the base file offset of the record. The key for 2457 * data records is (base + bytes) rather then (base). 2458 * 2459 * NOTE: rec_offset + rec_len may exceed the end-of-file. 2460 * The extra bytes should be zero on-disk and the BMAP op 2461 * should still be ok. 2462 */ 2463 base = &cursor.leaf->base; 2464 rec_offset = base->key - cursor.leaf->data_len; 2465 rec_len = cursor.leaf->data_len; 2466 2467 /* 2468 * Incorporate any cached truncation. 2469 * 2470 * NOTE: Modifications to rec_len based on synthesized 2471 * truncation points remove the guarantee that any extended 2472 * data on disk is zero (since the truncations may not have 2473 * taken place on-media yet). 
2474 */ 2475 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2476 if (hammer_cursor_ondisk(&cursor) || 2477 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2478 if (ip->trunc_off <= rec_offset) 2479 rec_len = 0; 2480 else if (ip->trunc_off < rec_offset + rec_len) 2481 rec_len = (int)(ip->trunc_off - rec_offset); 2482 } 2483 } 2484 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2485 if (hammer_cursor_ondisk(&cursor)) { 2486 if (ip->sync_trunc_off <= rec_offset) 2487 rec_len = 0; 2488 else if (ip->sync_trunc_off < rec_offset + rec_len) 2489 rec_len = (int)(ip->sync_trunc_off - rec_offset); 2490 } 2491 } 2492 2493 /* 2494 * Accumulate information. If we have hit a discontiguous 2495 * block reset base_offset unless we are already beyond the 2496 * requested offset. If we are, that's it, we stop. 2497 */ 2498 if (error) 2499 break; 2500 if (hammer_cursor_ondisk(&cursor)) { 2501 disk_offset = cursor.leaf->data_offset; 2502 if (rec_offset != last_offset || 2503 disk_offset != last_disk_offset) { 2504 if (rec_offset > ap->a_loffset) 2505 break; 2506 base_offset = rec_offset; 2507 base_disk_offset = disk_offset; 2508 } 2509 last_offset = rec_offset + rec_len; 2510 last_disk_offset = disk_offset + rec_len; 2511 } 2512 error = hammer_ip_next(&cursor); 2513 } 2514 2515 #if 0 2516 kprintf("BMAP %016llx: %016llx - %016llx\n", 2517 ap->a_loffset, base_offset, last_offset); 2518 kprintf("BMAP %16s: %016llx - %016llx\n", 2519 "", base_disk_offset, last_disk_offset); 2520 #endif 2521 2522 if (cursor.node) { 2523 hammer_cache_node(&ip->cache[1], cursor.node); 2524 #if 0 2525 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2526 #endif 2527 } 2528 hammer_done_cursor(&cursor); 2529 hammer_done_transaction(&trans); 2530 2531 /* 2532 * If we couldn't find any records or the records we did find were 2533 * all behind the requested offset, return failure. A forward 2534 * truncation can leave a hole w/ no on-disk records. 
2535 */ 2536 if (last_offset == 0 || last_offset < ap->a_loffset) 2537 return (EOPNOTSUPP); 2538 2539 /* 2540 * Figure out the block size at the requested offset and adjust 2541 * our limits so the cluster_read() does not create inappropriately 2542 * sized buffer cache buffers. 2543 */ 2544 blksize = hammer_blocksize(ap->a_loffset); 2545 if (hammer_blocksize(base_offset) != blksize) { 2546 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 2547 } 2548 if (last_offset != ap->a_loffset && 2549 hammer_blocksize(last_offset - 1) != blksize) { 2550 last_offset = hammer_blockdemarc(ap->a_loffset, 2551 last_offset - 1); 2552 } 2553 2554 /* 2555 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 2556 * from occuring. 2557 */ 2558 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 2559 2560 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 2561 /* 2562 * Only large-data zones can be direct-IOd 2563 */ 2564 error = EOPNOTSUPP; 2565 } else if ((disk_offset & HAMMER_BUFMASK) || 2566 (last_offset - ap->a_loffset) < blksize) { 2567 /* 2568 * doffsetp is not aligned or the forward run size does 2569 * not cover a whole buffer, disallow the direct I/O. 2570 */ 2571 error = EOPNOTSUPP; 2572 } else { 2573 /* 2574 * We're good. 2575 */ 2576 *ap->a_doffsetp = disk_offset; 2577 if (ap->a_runb) { 2578 *ap->a_runb = ap->a_loffset - base_offset; 2579 KKASSERT(*ap->a_runb >= 0); 2580 } 2581 if (ap->a_runp) { 2582 *ap->a_runp = last_offset - ap->a_loffset; 2583 KKASSERT(*ap->a_runp >= 0); 2584 } 2585 error = 0; 2586 } 2587 return(error); 2588 } 2589 2590 /* 2591 * Write to a regular file. Because this is a strategy call the OS is 2592 * trying to actually get data onto the media. 
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
        hammer_record_t record;
        hammer_mount_t hmp;
        hammer_inode_t ip;
        struct bio *bio;
        struct buf *bp;
        int blksize;
        int bytes;
        int error;

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = ap->a_vp->v_data;
        hmp = ip->hmp;

        /* the buffer must exactly match the blocksize for its offset */
        blksize = hammer_blocksize(bio->bio_offset);
        KKASSERT(bp->b_bufsize == blksize);

        /* read-only inodes cannot be written; complete the bio with EROFS */
        if (ip->flags & HAMMER_INODE_RO) {
                bp->b_error = EROFS;
                bp->b_flags |= B_ERROR;
                biodone(ap->a_bio);
                return(EROFS);
        }

        /*
         * Interlock with inode destruction (no in-kernel or directory
         * topology visibility).  If we queue new IO while trying to
         * destroy the inode we can deadlock the vtrunc call in
         * hammer_inode_unloadable_check().
         *
         * Besides, there's no point flushing a bp associated with an
         * inode that is being destroyed on-media and has no kernel
         * references.
         */
        if ((ip->flags | ip->sync_flags) &
            (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
                bp->b_resid = 0;
                biodone(ap->a_bio);
                return(0);
        }

        /*
         * Reserve space and issue a direct-write from the front-end.
         * NOTE: The direct_io code will hammer_bread/bcopy smaller
         * allocations.
         *
         * An in-memory record will be installed to reference the storage
         * until the flusher can get to it.
         *
         * Since we own the high level bio the front-end will not try to
         * do a direct-read until the write completes.
         *
         * NOTE: The only time we do not reserve a full-sized buffer's
         * worth of data is if the file is small.  We do not try to
         * allocate a fragment (from the small-data zone) at the end of
         * an otherwise large file as this can lead to wildly separated
         * data.
         */
        KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
        KKASSERT(bio->bio_offset < ip->ino_data.size);
        if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
                bytes = bp->b_bufsize;
        else
                bytes = ((int)ip->ino_data.size + 15) & ~15;  /* 16-byte align */

        record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
                                    bytes, &error);
        if (record) {
                hammer_io_direct_write(hmp, record, bio);
                /* kick the flusher if too many reserved records built up */
                if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
                        hammer_flush_inode(ip, 0);
        } else {
                /* reservation failed; fail the bio with the returned error */
                bp->b_bio2.bio_offset = NOOFFSET;
                bp->b_error = error;
                bp->b_flags |= B_ERROR;
                biodone(ap->a_bio);
        }
        return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * Common backend for nremove/nrmdir: locate the directory entry named by
 * the namecache handle, validate it against isdir, delete the entry and
 * drop a link on the target inode.  Retries from scratch on EDEADLK.
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                struct vnode *dvp, struct ucred *cred,
                int flags, int isdir)
{
        struct namecache *ncp;
        hammer_inode_t dip;     /* directory inode */
        hammer_inode_t ip;      /* inode of the entry being unlinked */
        struct hammer_cursor cursor;
        int64_t namekey;
        u_int32_t max_iterations;
        int nlen, error;

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        dip = VTOI(dvp);
        ncp = nch->ncp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);

        namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
                                           &max_iterations);
retry:
        hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg.localization = dip->obj_localization +
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key += max_iterations;
        cursor.asof = dip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.  info->last_error contains the
         * error code on search termination and could be 0, ENOENT, or
         * something else.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        error = hammer_ip_first(&cursor);

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                /* compare the stored name against the namecache entry */
                nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
                KKASSERT(nlen > 0);
                if (ncp->nc_nlen == nlen &&
                    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                        break;
                }
                error = hammer_ip_next(&cursor);
        }

        /*
         * If all is ok we have to get the inode so we can adjust nlinks.
         * To avoid a deadlock with the flusher we must release the inode
         * lock on the directory when acquiring the inode for the entry.
         *
         * If the target is a directory, it must be empty.
         */
        if (error == 0) {
                /* drop/reacquire cursor.ip lock around hammer_get_inode */
                hammer_unlock(&cursor.ip->lock);
                ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
                                      dip->hmp->asof,
                                      cursor.data->entry.localization,
                                      0, &error);
                hammer_lock_sh(&cursor.ip->lock);
                if (error == ENOENT) {
                        /* directory entry pointed at a missing inode */
                        kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
                        Debugger("ENOENT unlinking object that should exist");
                }

                /*
                 * If isdir >= 0 we validate that the entry is or is not a
                 * directory.  If isdir < 0 we don't care.
                 */
                if (error == 0 && isdir >= 0) {
                        if (isdir &&
                            ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
                                error = ENOTDIR;
                        } else if (isdir == 0 &&
                            ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
                                error = EISDIR;
                        }
                }

                /*
                 * If we are trying to remove a directory the directory must
                 * be empty.
                 *
                 * WARNING: hammer_ip_check_directory_empty() may have to
                 * terminate the cursor to avoid a deadlock.  It is ok to
                 * call hammer_done_cursor() twice.
                 */
                if (error == 0 && ip->ino_data.obj_type ==
                                  HAMMER_OBJTYPE_DIRECTORY) {
                        error = hammer_ip_check_directory_empty(trans, ip);
                }

                /*
                 * Delete the directory entry.
                 *
                 * WARNING: hammer_ip_del_directory() may have to terminate
                 * the cursor to avoid a deadlock.  It is ok to call
                 * hammer_done_cursor() twice.
                 */
                if (error == 0) {
                        error = hammer_ip_del_directory(trans, &cursor,
                                                        dip, ip);
                }
                hammer_done_cursor(&cursor);
                if (error == 0) {
                        /* invalidate namecache and notify listeners */
                        cache_setunresolved(nch);
                        cache_setvp(nch, NULL);
                        /* XXX locking */
                        if (ip->vp) {
                                hammer_knote(ip->vp, NOTE_DELETE);
                                cache_inval_vp(ip->vp, CINV_DESTROY);
                        }
                }
                if (ip)
                        hammer_rel_inode(ip, 0);
        } else {
                hammer_done_cursor(&cursor);
        }
        /* a deadlock unwound the cursor; restart the whole lookup */
        if (error == EDEADLK)
                goto retry;

        return (error);
}

/************************************************************************
 *                          FIFO AND SPECFS OPS                         *
 ************************************************************************
 *
 * These simply forward to the generic fifofs/specfs vnode ops.
 */

static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
        /* XXX update itimes */
        return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
        int error;

        error = VOCALL(&fifo_vnode_vops, &ap->a_head);
        /* XXX update access time */
        return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
        int error;

        error = VOCALL(&fifo_vnode_vops, &ap->a_head);
        /* XXX update access time */
        return (error);
}

static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
        int error;

        /* try the fifo filter first; fall back to hammer's on error */
        error = VOCALL(&fifo_vnode_vops, &ap->a_head);
        if (error)
                error = hammer_vop_kqfilter(ap);
        return(error);
}

static int
hammer_vop_specclose (struct vop_close_args *ap)
{
        /* XXX update itimes */
        return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

static int
hammer_vop_specread (struct vop_read_args *ap)
{
        /* XXX update access time */
        return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

static int
hammer_vop_specwrite (struct
vop_write_args *ap) 2890 { 2891 /* XXX update last change time */ 2892 return (VOCALL(&spec_vnode_vops, &ap->a_head)); 2893 } 2894 2895 /************************************************************************ 2896 * KQFILTER OPS * 2897 ************************************************************************ 2898 * 2899 */ 2900 static void filt_hammerdetach(struct knote *kn); 2901 static int filt_hammerread(struct knote *kn, long hint); 2902 static int filt_hammerwrite(struct knote *kn, long hint); 2903 static int filt_hammervnode(struct knote *kn, long hint); 2904 2905 static struct filterops hammerread_filtops = 2906 { 1, NULL, filt_hammerdetach, filt_hammerread }; 2907 static struct filterops hammerwrite_filtops = 2908 { 1, NULL, filt_hammerdetach, filt_hammerwrite }; 2909 static struct filterops hammervnode_filtops = 2910 { 1, NULL, filt_hammerdetach, filt_hammervnode }; 2911 2912 static 2913 int 2914 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 2915 { 2916 struct vnode *vp = ap->a_vp; 2917 struct knote *kn = ap->a_kn; 2918 lwkt_tokref ilock; 2919 2920 switch (kn->kn_filter) { 2921 case EVFILT_READ: 2922 kn->kn_fop = &hammerread_filtops; 2923 break; 2924 case EVFILT_WRITE: 2925 kn->kn_fop = &hammerwrite_filtops; 2926 break; 2927 case EVFILT_VNODE: 2928 kn->kn_fop = &hammervnode_filtops; 2929 break; 2930 default: 2931 return (1); 2932 } 2933 2934 kn->kn_hook = (caddr_t)vp; 2935 2936 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 2937 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext); 2938 lwkt_reltoken(&ilock); 2939 2940 return(0); 2941 } 2942 2943 static void 2944 filt_hammerdetach(struct knote *kn) 2945 { 2946 struct vnode *vp = (void *)kn->kn_hook; 2947 lwkt_tokref ilock; 2948 2949 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 2950 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note, 2951 kn, knote, kn_selnext); 2952 lwkt_reltoken(&ilock); 2953 } 2954 2955 static int 2956 filt_hammerread(struct knote *kn, long hint) 2957 { 2958 
struct vnode *vp = (void *)kn->kn_hook; 2959 hammer_inode_t ip = VTOI(vp); 2960 2961 if (hint == NOTE_REVOKE) { 2962 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 2963 return(1); 2964 } 2965 kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset; 2966 return (kn->kn_data != 0); 2967 } 2968 2969 static int 2970 filt_hammerwrite(struct knote *kn, long hint) 2971 { 2972 if (hint == NOTE_REVOKE) 2973 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 2974 kn->kn_data = 0; 2975 return (1); 2976 } 2977 2978 static int 2979 filt_hammervnode(struct knote *kn, long hint) 2980 { 2981 if (kn->kn_sfflags & hint) 2982 kn->kn_fflags |= hint; 2983 if (hint == NOTE_REVOKE) { 2984 kn->kn_flags |= EV_EOF; 2985 return (1); 2986 } 2987 return (kn->kn_fflags != 0); 2988 } 2989 2990