1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vfs/fifofs/fifo.h> 50 #include "hammer.h" 51 52 /* 53 * USERFS VNOPS 54 */ 55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 56 static int hammer_vop_fsync(struct vop_fsync_args *); 57 static int hammer_vop_read(struct vop_read_args *); 58 static int hammer_vop_write(struct vop_write_args *); 59 static int hammer_vop_access(struct vop_access_args *); 60 static int hammer_vop_advlock(struct vop_advlock_args *); 61 static int hammer_vop_close(struct vop_close_args *); 62 static int hammer_vop_ncreate(struct vop_ncreate_args *); 63 static int hammer_vop_getattr(struct vop_getattr_args *); 64 static int hammer_vop_nresolve(struct vop_nresolve_args *); 65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 66 static int hammer_vop_nlink(struct vop_nlink_args *); 67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 68 static int hammer_vop_nmknod(struct vop_nmknod_args *); 69 static int hammer_vop_open(struct vop_open_args *); 70 static int hammer_vop_print(struct vop_print_args *); 71 static int hammer_vop_readdir(struct vop_readdir_args *); 72 static int hammer_vop_readlink(struct vop_readlink_args *); 73 static int hammer_vop_nremove(struct vop_nremove_args *); 74 static int hammer_vop_nrename(struct vop_nrename_args *); 75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 76 static int hammer_vop_markatime(struct vop_markatime_args *); 77 static int hammer_vop_setattr(struct vop_setattr_args *); 78 static int hammer_vop_strategy(struct vop_strategy_args *); 79 static int 
hammer_vop_bmap(struct vop_bmap_args *ap); 80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 82 static int hammer_vop_ioctl(struct vop_ioctl_args *); 83 static int hammer_vop_mountctl(struct vop_mountctl_args *); 84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 85 86 static int hammer_vop_fifoclose (struct vop_close_args *); 87 static int hammer_vop_fiforead (struct vop_read_args *); 88 static int hammer_vop_fifowrite (struct vop_write_args *); 89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 90 91 static int hammer_vop_specclose (struct vop_close_args *); 92 static int hammer_vop_specread (struct vop_read_args *); 93 static int hammer_vop_specwrite (struct vop_write_args *); 94 95 struct vop_ops hammer_vnode_vops = { 96 .vop_default = vop_defaultop, 97 .vop_fsync = hammer_vop_fsync, 98 .vop_getpages = vop_stdgetpages, 99 .vop_putpages = vop_stdputpages, 100 .vop_read = hammer_vop_read, 101 .vop_write = hammer_vop_write, 102 .vop_access = hammer_vop_access, 103 .vop_advlock = hammer_vop_advlock, 104 .vop_close = hammer_vop_close, 105 .vop_ncreate = hammer_vop_ncreate, 106 .vop_getattr = hammer_vop_getattr, 107 .vop_inactive = hammer_vop_inactive, 108 .vop_reclaim = hammer_vop_reclaim, 109 .vop_nresolve = hammer_vop_nresolve, 110 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 111 .vop_nlink = hammer_vop_nlink, 112 .vop_nmkdir = hammer_vop_nmkdir, 113 .vop_nmknod = hammer_vop_nmknod, 114 .vop_open = hammer_vop_open, 115 .vop_pathconf = vop_stdpathconf, 116 .vop_print = hammer_vop_print, 117 .vop_readdir = hammer_vop_readdir, 118 .vop_readlink = hammer_vop_readlink, 119 .vop_nremove = hammer_vop_nremove, 120 .vop_nrename = hammer_vop_nrename, 121 .vop_nrmdir = hammer_vop_nrmdir, 122 .vop_markatime = hammer_vop_markatime, 123 .vop_setattr = hammer_vop_setattr, 124 .vop_bmap = hammer_vop_bmap, 125 .vop_strategy = hammer_vop_strategy, 126 .vop_nsymlink = 
hammer_vop_nsymlink, 127 .vop_nwhiteout = hammer_vop_nwhiteout, 128 .vop_ioctl = hammer_vop_ioctl, 129 .vop_mountctl = hammer_vop_mountctl, 130 .vop_kqfilter = hammer_vop_kqfilter 131 }; 132 133 struct vop_ops hammer_spec_vops = { 134 .vop_default = spec_vnoperate, 135 .vop_fsync = hammer_vop_fsync, 136 .vop_read = hammer_vop_specread, 137 .vop_write = hammer_vop_specwrite, 138 .vop_access = hammer_vop_access, 139 .vop_close = hammer_vop_specclose, 140 .vop_markatime = hammer_vop_markatime, 141 .vop_getattr = hammer_vop_getattr, 142 .vop_inactive = hammer_vop_inactive, 143 .vop_reclaim = hammer_vop_reclaim, 144 .vop_setattr = hammer_vop_setattr 145 }; 146 147 struct vop_ops hammer_fifo_vops = { 148 .vop_default = fifo_vnoperate, 149 .vop_fsync = hammer_vop_fsync, 150 .vop_read = hammer_vop_fiforead, 151 .vop_write = hammer_vop_fifowrite, 152 .vop_access = hammer_vop_access, 153 .vop_close = hammer_vop_fifoclose, 154 .vop_markatime = hammer_vop_markatime, 155 .vop_getattr = hammer_vop_getattr, 156 .vop_inactive = hammer_vop_inactive, 157 .vop_reclaim = hammer_vop_reclaim, 158 .vop_setattr = hammer_vop_setattr, 159 .vop_kqfilter = hammer_vop_fifokqfilter 160 }; 161 162 static __inline 163 void 164 hammer_knote(struct vnode *vp, int flags) 165 { 166 if (flags) 167 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags); 168 } 169 170 #ifdef DEBUG_TRUNCATE 171 struct hammer_inode *HammerTruncIp; 172 #endif 173 174 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 175 struct vnode *dvp, struct ucred *cred, 176 int flags, int isdir); 177 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 178 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 179 180 #if 0 181 static 182 int 183 hammer_vop_vnoperate(struct vop_generic_args *) 184 { 185 return (VOCALL(&hammer_vnode_vops, ap)); 186 } 187 #endif 188 189 /* 190 * hammer_vop_fsync { vp, waitfor } 191 * 192 * fsync() an inode to disk and wait for it to be completely 
committed 193 * such that the information would not be undone if a crash occured after 194 * return. 195 */ 196 static 197 int 198 hammer_vop_fsync(struct vop_fsync_args *ap) 199 { 200 hammer_inode_t ip = VTOI(ap->a_vp); 201 202 ++hammer_count_fsyncs; 203 vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL); 204 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 205 if (ap->a_waitfor == MNT_WAIT) { 206 vn_unlock(ap->a_vp); 207 hammer_wait_inode(ip); 208 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 209 } 210 return (ip->error); 211 } 212 213 /* 214 * hammer_vop_read { vp, uio, ioflag, cred } 215 */ 216 static 217 int 218 hammer_vop_read(struct vop_read_args *ap) 219 { 220 struct hammer_transaction trans; 221 hammer_inode_t ip; 222 off_t offset; 223 struct buf *bp; 224 struct uio *uio; 225 int error; 226 int n; 227 int seqcount; 228 int ioseqcount; 229 int blksize; 230 231 if (ap->a_vp->v_type != VREG) 232 return (EINVAL); 233 ip = VTOI(ap->a_vp); 234 error = 0; 235 uio = ap->a_uio; 236 237 /* 238 * Allow the UIO's size to override the sequential heuristic. 239 */ 240 blksize = hammer_blocksize(uio->uio_offset); 241 seqcount = (uio->uio_resid + (blksize - 1)) / blksize; 242 ioseqcount = ap->a_ioflag >> 16; 243 if (seqcount < ioseqcount) 244 seqcount = ioseqcount; 245 246 hammer_start_transaction(&trans, ip->hmp); 247 248 /* 249 * Access the data typically in HAMMER_BUFSIZE blocks via the 250 * buffer cache, but HAMMER may use a variable block size based 251 * on the offset. 252 */ 253 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 254 int64_t base_offset; 255 int64_t file_limit; 256 257 blksize = hammer_blocksize(uio->uio_offset); 258 offset = (int)uio->uio_offset & (blksize - 1); 259 base_offset = uio->uio_offset - offset; 260 261 if (hammer_cluster_enable) { 262 /* 263 * Use file_limit to prevent cluster_read() from 264 * creating buffers of the wrong block size past 265 * the demarc. 
266 */ 267 file_limit = ip->ino_data.size; 268 if (base_offset < HAMMER_XDEMARC && 269 file_limit > HAMMER_XDEMARC) { 270 file_limit = HAMMER_XDEMARC; 271 } 272 error = cluster_read(ap->a_vp, 273 file_limit, base_offset, 274 blksize, MAXPHYS, 275 seqcount, &bp); 276 } else { 277 error = bread(ap->a_vp, base_offset, blksize, &bp); 278 } 279 if (error) { 280 kprintf("error %d\n", error); 281 brelse(bp); 282 break; 283 } 284 285 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 286 n = blksize - offset; 287 if (n > uio->uio_resid) 288 n = uio->uio_resid; 289 if (n > ip->ino_data.size - uio->uio_offset) 290 n = (int)(ip->ino_data.size - uio->uio_offset); 291 error = uiomove((char *)bp->b_data + offset, n, uio); 292 293 /* data has a lower priority then meta-data */ 294 bp->b_flags |= B_AGE; 295 bqrelse(bp); 296 if (error) 297 break; 298 hammer_stats_file_read += n; 299 } 300 if ((ip->flags & HAMMER_INODE_RO) == 0 && 301 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 302 ip->ino_data.atime = trans.time; 303 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 304 } 305 hammer_done_transaction(&trans); 306 return (error); 307 } 308 309 /* 310 * hammer_vop_write { vp, uio, ioflag, cred } 311 */ 312 static 313 int 314 hammer_vop_write(struct vop_write_args *ap) 315 { 316 struct hammer_transaction trans; 317 struct hammer_inode *ip; 318 hammer_mount_t hmp; 319 struct uio *uio; 320 int offset; 321 off_t base_offset; 322 struct buf *bp; 323 int kflags; 324 int error; 325 int n; 326 int flags; 327 int delta; 328 int seqcount; 329 330 if (ap->a_vp->v_type != VREG) 331 return (EINVAL); 332 ip = VTOI(ap->a_vp); 333 hmp = ip->hmp; 334 error = 0; 335 kflags = 0; 336 seqcount = ap->a_ioflag >> 16; 337 338 if (ip->flags & HAMMER_INODE_RO) 339 return (EROFS); 340 341 /* 342 * Create a transaction to cover the operations we perform. 
343 */ 344 hammer_start_transaction(&trans, hmp); 345 uio = ap->a_uio; 346 347 /* 348 * Check append mode 349 */ 350 if (ap->a_ioflag & IO_APPEND) 351 uio->uio_offset = ip->ino_data.size; 352 353 /* 354 * Check for illegal write offsets. Valid range is 0...2^63-1. 355 * 356 * NOTE: the base_off assignment is required to work around what 357 * I consider to be a GCC-4 optimization bug. 358 */ 359 if (uio->uio_offset < 0) { 360 hammer_done_transaction(&trans); 361 return (EFBIG); 362 } 363 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 364 if (uio->uio_resid > 0 && base_offset <= 0) { 365 hammer_done_transaction(&trans); 366 return (EFBIG); 367 } 368 369 /* 370 * Access the data typically in HAMMER_BUFSIZE blocks via the 371 * buffer cache, but HAMMER may use a variable block size based 372 * on the offset. 373 */ 374 while (uio->uio_resid > 0) { 375 int fixsize = 0; 376 int blksize; 377 int blkmask; 378 379 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 380 break; 381 382 blksize = hammer_blocksize(uio->uio_offset); 383 384 /* 385 * Do not allow HAMMER to blow out the buffer cache. Very 386 * large UIOs can lockout other processes due to bwillwrite() 387 * mechanics. 388 * 389 * The hammer inode is not locked during these operations. 390 * The vnode is locked which can interfere with the pageout 391 * daemon for non-UIO_NOCOPY writes but should not interfere 392 * with the buffer cache. Even so, we cannot afford to 393 * allow the pageout daemon to build up too many dirty buffer 394 * cache buffers. 395 * 396 * Only call this if we aren't being recursively called from 397 * a virtual disk device (vn), else we may deadlock. 398 */ 399 if ((ap->a_ioflag & IO_RECURSE) == 0) 400 bwillwrite(blksize); 401 402 /* 403 * Do not allow HAMMER to blow out system memory by 404 * accumulating too many records. 
Records are so well 405 * decoupled from the buffer cache that it is possible 406 * for userland to push data out to the media via 407 * direct-write, but build up the records queued to the 408 * backend faster then the backend can flush them out. 409 * HAMMER has hit its write limit but the frontend has 410 * no pushback to slow it down. 411 */ 412 if (hmp->rsv_recs > hammer_limit_recs / 2) { 413 /* 414 * Get the inode on the flush list 415 */ 416 if (ip->rsv_recs >= 64) 417 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 418 else if (ip->rsv_recs >= 16) 419 hammer_flush_inode(ip, 0); 420 421 /* 422 * Keep the flusher going if the system keeps 423 * queueing records. 424 */ 425 delta = hmp->count_newrecords - 426 hmp->last_newrecords; 427 if (delta < 0 || delta > hammer_limit_recs / 2) { 428 hmp->last_newrecords = hmp->count_newrecords; 429 hammer_sync_hmp(hmp, MNT_NOWAIT); 430 } 431 432 /* 433 * If we have gotten behind start slowing 434 * down the writers. 435 */ 436 delta = (hmp->rsv_recs - hammer_limit_recs) * 437 hz / hammer_limit_recs; 438 if (delta > 0) 439 tsleep(&trans, 0, "hmrslo", delta); 440 } 441 442 /* 443 * Calculate the blocksize at the current offset and figure 444 * out how much we can actually write. 445 */ 446 blkmask = blksize - 1; 447 offset = (int)uio->uio_offset & blkmask; 448 base_offset = uio->uio_offset & ~(int64_t)blkmask; 449 n = blksize - offset; 450 if (n > uio->uio_resid) 451 n = uio->uio_resid; 452 if (uio->uio_offset + n > ip->ino_data.size) { 453 vnode_pager_setsize(ap->a_vp, uio->uio_offset + n); 454 fixsize = 1; 455 kflags |= NOTE_EXTEND; 456 } 457 458 if (uio->uio_segflg == UIO_NOCOPY) { 459 /* 460 * Issuing a write with the same data backing the 461 * buffer. Instantiate the buffer to collect the 462 * backing vm pages, then read-in any missing bits. 463 * 464 * This case is used by vop_stdputpages(). 
465 */ 466 bp = getblk(ap->a_vp, base_offset, 467 blksize, GETBLK_BHEAVY, 0); 468 if ((bp->b_flags & B_CACHE) == 0) { 469 bqrelse(bp); 470 error = bread(ap->a_vp, base_offset, 471 blksize, &bp); 472 } 473 } else if (offset == 0 && uio->uio_resid >= blksize) { 474 /* 475 * Even though we are entirely overwriting the buffer 476 * we may still have to zero it out to avoid a 477 * mmap/write visibility issue. 478 */ 479 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 480 if ((bp->b_flags & B_CACHE) == 0) 481 vfs_bio_clrbuf(bp); 482 } else if (base_offset >= ip->ino_data.size) { 483 /* 484 * If the base offset of the buffer is beyond the 485 * file EOF, we don't have to issue a read. 486 */ 487 bp = getblk(ap->a_vp, base_offset, 488 blksize, GETBLK_BHEAVY, 0); 489 vfs_bio_clrbuf(bp); 490 } else { 491 /* 492 * Partial overwrite, read in any missing bits then 493 * replace the portion being written. 494 */ 495 error = bread(ap->a_vp, base_offset, blksize, &bp); 496 if (error == 0) 497 bheavy(bp); 498 } 499 if (error == 0) { 500 error = uiomove((char *)bp->b_data + offset, 501 n, uio); 502 } 503 504 /* 505 * If we screwed up we have to undo any VM size changes we 506 * made. 507 */ 508 if (error) { 509 brelse(bp); 510 if (fixsize) { 511 vtruncbuf(ap->a_vp, ip->ino_data.size, 512 hammer_blocksize(ip->ino_data.size)); 513 } 514 break; 515 } 516 kflags |= NOTE_WRITE; 517 hammer_stats_file_write += n; 518 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 519 if (ip->ino_data.size < uio->uio_offset) { 520 ip->ino_data.size = uio->uio_offset; 521 flags = HAMMER_INODE_DDIRTY; 522 vnode_pager_setsize(ap->a_vp, ip->ino_data.size); 523 } else { 524 flags = 0; 525 } 526 ip->ino_data.mtime = trans.time; 527 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 528 hammer_modify_inode(ip, flags); 529 530 /* 531 * Once we dirty the buffer any cached zone-X offset 532 * becomes invalid. 
HAMMER NOTE: no-history mode cannot 533 * allow overwriting over the same data sector unless 534 * we provide UNDOs for the old data, which we don't. 535 */ 536 bp->b_bio2.bio_offset = NOOFFSET; 537 538 /* 539 * Final buffer disposition. 540 */ 541 bp->b_flags |= B_AGE; 542 if (ap->a_ioflag & IO_SYNC) { 543 bwrite(bp); 544 } else if (ap->a_ioflag & IO_DIRECT) { 545 bawrite(bp); 546 } else { 547 bdwrite(bp); 548 } 549 } 550 hammer_done_transaction(&trans); 551 hammer_knote(ap->a_vp, kflags); 552 return (error); 553 } 554 555 /* 556 * hammer_vop_access { vp, mode, cred } 557 */ 558 static 559 int 560 hammer_vop_access(struct vop_access_args *ap) 561 { 562 struct hammer_inode *ip = VTOI(ap->a_vp); 563 uid_t uid; 564 gid_t gid; 565 int error; 566 567 ++hammer_stats_file_iopsr; 568 uid = hammer_to_unix_xid(&ip->ino_data.uid); 569 gid = hammer_to_unix_xid(&ip->ino_data.gid); 570 571 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 572 ip->ino_data.uflags); 573 return (error); 574 } 575 576 /* 577 * hammer_vop_advlock { vp, id, op, fl, flags } 578 */ 579 static 580 int 581 hammer_vop_advlock(struct vop_advlock_args *ap) 582 { 583 hammer_inode_t ip = VTOI(ap->a_vp); 584 585 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 586 } 587 588 /* 589 * hammer_vop_close { vp, fflag } 590 */ 591 static 592 int 593 hammer_vop_close(struct vop_close_args *ap) 594 { 595 /*hammer_inode_t ip = VTOI(ap->a_vp);*/ 596 return (vop_stdclose(ap)); 597 } 598 599 /* 600 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 601 * 602 * The operating system has already ensured that the directory entry 603 * does not exist and done all appropriate namespace locking. 
604 */ 605 static 606 int 607 hammer_vop_ncreate(struct vop_ncreate_args *ap) 608 { 609 struct hammer_transaction trans; 610 struct hammer_inode *dip; 611 struct hammer_inode *nip; 612 struct nchandle *nch; 613 int error; 614 615 nch = ap->a_nch; 616 dip = VTOI(ap->a_dvp); 617 618 if (dip->flags & HAMMER_INODE_RO) 619 return (EROFS); 620 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 621 return (error); 622 623 /* 624 * Create a transaction to cover the operations we perform. 625 */ 626 hammer_start_transaction(&trans, dip->hmp); 627 ++hammer_stats_file_iopsw; 628 629 /* 630 * Create a new filesystem object of the requested type. The 631 * returned inode will be referenced and shared-locked to prevent 632 * it from being moved to the flusher. 633 */ 634 635 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 636 dip, NULL, &nip); 637 if (error) { 638 hkprintf("hammer_create_inode error %d\n", error); 639 hammer_done_transaction(&trans); 640 *ap->a_vpp = NULL; 641 return (error); 642 } 643 644 /* 645 * Add the new filesystem object to the directory. This will also 646 * bump the inode's link count. 647 */ 648 error = hammer_ip_add_directory(&trans, dip, 649 nch->ncp->nc_name, nch->ncp->nc_nlen, 650 nip); 651 if (error) 652 hkprintf("hammer_ip_add_directory error %d\n", error); 653 654 /* 655 * Finish up. 656 */ 657 if (error) { 658 hammer_rel_inode(nip, 0); 659 hammer_done_transaction(&trans); 660 *ap->a_vpp = NULL; 661 } else { 662 error = hammer_get_vnode(nip, ap->a_vpp); 663 hammer_done_transaction(&trans); 664 hammer_rel_inode(nip, 0); 665 if (error == 0) { 666 cache_setunresolved(ap->a_nch); 667 cache_setvp(ap->a_nch, *ap->a_vpp); 668 } 669 hammer_knote(ap->a_dvp, NOTE_WRITE); 670 } 671 return (error); 672 } 673 674 /* 675 * hammer_vop_getattr { vp, vap } 676 * 677 * Retrieve an inode's attribute information. When accessing inodes 678 * historically we fake the atime field to ensure consistent results. 
679 * The atime field is stored in the B-Tree element and allowed to be 680 * updated without cycling the element. 681 */ 682 static 683 int 684 hammer_vop_getattr(struct vop_getattr_args *ap) 685 { 686 struct hammer_inode *ip = VTOI(ap->a_vp); 687 struct vattr *vap = ap->a_vap; 688 689 /* 690 * We want the fsid to be different when accessing a filesystem 691 * with different as-of's so programs like diff don't think 692 * the files are the same. 693 * 694 * We also want the fsid to be the same when comparing snapshots, 695 * or when comparing mirrors (which might be backed by different 696 * physical devices). HAMMER fsids are based on the PFS's 697 * shared_uuid field. 698 * 699 * XXX there is a chance of collision here. The va_fsid reported 700 * by stat is different from the more involved fsid used in the 701 * mount structure. 702 */ 703 ++hammer_stats_file_iopsr; 704 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 705 (u_int32_t)(ip->obj_asof >> 32); 706 707 vap->va_fileid = ip->ino_leaf.base.obj_id; 708 vap->va_mode = ip->ino_data.mode; 709 vap->va_nlink = ip->ino_data.nlinks; 710 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 711 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 712 vap->va_rmajor = 0; 713 vap->va_rminor = 0; 714 vap->va_size = ip->ino_data.size; 715 716 /* 717 * Special case for @@PFS softlinks. The actual size of the 718 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 719 */ 720 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 721 ip->ino_data.size == 10 && 722 ip->obj_asof == HAMMER_MAX_TID && 723 ip->obj_localization == 0 && 724 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 725 vap->va_size = 26; 726 } 727 728 /* 729 * We must provide a consistent atime and mtime for snapshots 730 * so people can do a 'tar cf - ... | md5' on them and get 731 * consistent results. 
732 */ 733 if (ip->flags & HAMMER_INODE_RO) { 734 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 735 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 736 } else { 737 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 738 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 739 } 740 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 741 vap->va_flags = ip->ino_data.uflags; 742 vap->va_gen = 1; /* hammer inums are unique for all time */ 743 vap->va_blocksize = HAMMER_BUFSIZE; 744 if (ip->ino_data.size >= HAMMER_XDEMARC) { 745 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 746 ~HAMMER_XBUFMASK64; 747 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 748 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 749 ~HAMMER_BUFMASK64; 750 } else { 751 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 752 } 753 754 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 755 vap->va_filerev = 0; /* XXX */ 756 /* mtime uniquely identifies any adjustments made to the file XXX */ 757 vap->va_fsmid = ip->ino_data.mtime; 758 vap->va_uid_uuid = ip->ino_data.uid; 759 vap->va_gid_uuid = ip->ino_data.gid; 760 vap->va_fsid_uuid = ip->hmp->fsid; 761 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 762 VA_FSID_UUID_VALID; 763 764 switch (ip->ino_data.obj_type) { 765 case HAMMER_OBJTYPE_CDEV: 766 case HAMMER_OBJTYPE_BDEV: 767 vap->va_rmajor = ip->ino_data.rmajor; 768 vap->va_rminor = ip->ino_data.rminor; 769 break; 770 default: 771 break; 772 } 773 return(0); 774 } 775 776 /* 777 * hammer_vop_nresolve { nch, dvp, cred } 778 * 779 * Locate the requested directory entry. 
780 */ 781 static 782 int 783 hammer_vop_nresolve(struct vop_nresolve_args *ap) 784 { 785 struct hammer_transaction trans; 786 struct namecache *ncp; 787 hammer_inode_t dip; 788 hammer_inode_t ip; 789 hammer_tid_t asof; 790 struct hammer_cursor cursor; 791 struct vnode *vp; 792 int64_t namekey; 793 int error; 794 int i; 795 int nlen; 796 int flags; 797 int ispfs; 798 int64_t obj_id; 799 u_int32_t localization; 800 u_int32_t max_iterations; 801 802 /* 803 * Misc initialization, plus handle as-of name extensions. Look for 804 * the '@@' extension. Note that as-of files and directories cannot 805 * be modified. 806 */ 807 dip = VTOI(ap->a_dvp); 808 ncp = ap->a_nch->ncp; 809 asof = dip->obj_asof; 810 localization = dip->obj_localization; /* for code consistency */ 811 nlen = ncp->nc_nlen; 812 flags = dip->flags & HAMMER_INODE_RO; 813 ispfs = 0; 814 815 hammer_simple_transaction(&trans, dip->hmp); 816 ++hammer_stats_file_iopsr; 817 818 for (i = 0; i < nlen; ++i) { 819 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 820 error = hammer_str_to_tid(ncp->nc_name + i + 2, 821 &ispfs, &asof, &localization); 822 if (error != 0) { 823 i = nlen; 824 break; 825 } 826 if (asof != HAMMER_MAX_TID) 827 flags |= HAMMER_INODE_RO; 828 break; 829 } 830 } 831 nlen = i; 832 833 /* 834 * If this is a PFS softlink we dive into the PFS 835 */ 836 if (ispfs && nlen == 0) { 837 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 838 asof, localization, 839 flags, &error); 840 if (error == 0) { 841 error = hammer_get_vnode(ip, &vp); 842 hammer_rel_inode(ip, 0); 843 } else { 844 vp = NULL; 845 } 846 if (error == 0) { 847 vn_unlock(vp); 848 cache_setvp(ap->a_nch, vp); 849 vrele(vp); 850 } 851 goto done; 852 } 853 854 /* 855 * If there is no path component the time extension is relative to 856 * dip. 
857 */ 858 if (nlen == 0) { 859 ip = hammer_get_inode(&trans, dip, dip->obj_id, 860 asof, dip->obj_localization, 861 flags, &error); 862 if (error == 0) { 863 error = hammer_get_vnode(ip, &vp); 864 hammer_rel_inode(ip, 0); 865 } else { 866 vp = NULL; 867 } 868 if (error == 0) { 869 vn_unlock(vp); 870 cache_setvp(ap->a_nch, vp); 871 vrele(vp); 872 } 873 goto done; 874 } 875 876 /* 877 * Calculate the namekey and setup the key range for the scan. This 878 * works kinda like a chained hash table where the lower 32 bits 879 * of the namekey synthesize the chain. 880 * 881 * The key range is inclusive of both key_beg and key_end. 882 */ 883 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 884 &max_iterations); 885 886 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 887 cursor.key_beg.localization = dip->obj_localization + 888 HAMMER_LOCALIZE_MISC; 889 cursor.key_beg.obj_id = dip->obj_id; 890 cursor.key_beg.key = namekey; 891 cursor.key_beg.create_tid = 0; 892 cursor.key_beg.delete_tid = 0; 893 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 894 cursor.key_beg.obj_type = 0; 895 896 cursor.key_end = cursor.key_beg; 897 cursor.key_end.key += max_iterations; 898 cursor.asof = asof; 899 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 900 901 /* 902 * Scan all matching records (the chain), locate the one matching 903 * the requested path component. 904 * 905 * The hammer_ip_*() functions merge in-memory records with on-disk 906 * records for the purposes of the search. 
907 */ 908 obj_id = 0; 909 localization = HAMMER_DEF_LOCALIZATION; 910 911 if (error == 0) { 912 error = hammer_ip_first(&cursor); 913 while (error == 0) { 914 error = hammer_ip_resolve_data(&cursor); 915 if (error) 916 break; 917 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 918 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 919 obj_id = cursor.data->entry.obj_id; 920 localization = cursor.data->entry.localization; 921 break; 922 } 923 error = hammer_ip_next(&cursor); 924 } 925 } 926 hammer_done_cursor(&cursor); 927 if (error == 0) { 928 ip = hammer_get_inode(&trans, dip, obj_id, 929 asof, localization, 930 flags, &error); 931 if (error == 0) { 932 error = hammer_get_vnode(ip, &vp); 933 hammer_rel_inode(ip, 0); 934 } else { 935 vp = NULL; 936 } 937 if (error == 0) { 938 vn_unlock(vp); 939 cache_setvp(ap->a_nch, vp); 940 vrele(vp); 941 } 942 } else if (error == ENOENT) { 943 cache_setvp(ap->a_nch, NULL); 944 } 945 done: 946 hammer_done_transaction(&trans); 947 return (error); 948 } 949 950 /* 951 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 952 * 953 * Locate the parent directory of a directory vnode. 954 * 955 * dvp is referenced but not locked. *vpp must be returned referenced and 956 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 957 * at the root, instead it could indicate that the directory we were in was 958 * removed. 959 * 960 * NOTE: as-of sequences are not linked into the directory structure. If 961 * we are at the root with a different asof then the mount point, reload 962 * the same directory with the mount point's asof. I'm not sure what this 963 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 964 * get confused, but it hasn't been tested. 
965 */ 966 static 967 int 968 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 969 { 970 struct hammer_transaction trans; 971 struct hammer_inode *dip; 972 struct hammer_inode *ip; 973 int64_t parent_obj_id; 974 u_int32_t parent_obj_localization; 975 hammer_tid_t asof; 976 int error; 977 978 dip = VTOI(ap->a_dvp); 979 asof = dip->obj_asof; 980 981 /* 982 * Whos are parent? This could be the root of a pseudo-filesystem 983 * whos parent is in another localization domain. 984 */ 985 parent_obj_id = dip->ino_data.parent_obj_id; 986 if (dip->obj_id == HAMMER_OBJID_ROOT) 987 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 988 else 989 parent_obj_localization = dip->obj_localization; 990 991 if (parent_obj_id == 0) { 992 if (dip->obj_id == HAMMER_OBJID_ROOT && 993 asof != dip->hmp->asof) { 994 parent_obj_id = dip->obj_id; 995 asof = dip->hmp->asof; 996 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 997 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 998 dip->obj_asof); 999 } else { 1000 *ap->a_vpp = NULL; 1001 return ENOENT; 1002 } 1003 } 1004 1005 hammer_simple_transaction(&trans, dip->hmp); 1006 ++hammer_stats_file_iopsr; 1007 1008 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1009 asof, parent_obj_localization, 1010 dip->flags, &error); 1011 if (ip) { 1012 error = hammer_get_vnode(ip, ap->a_vpp); 1013 hammer_rel_inode(ip, 0); 1014 } else { 1015 *ap->a_vpp = NULL; 1016 } 1017 hammer_done_transaction(&trans); 1018 return (error); 1019 } 1020 1021 /* 1022 * hammer_vop_nlink { nch, dvp, vp, cred } 1023 */ 1024 static 1025 int 1026 hammer_vop_nlink(struct vop_nlink_args *ap) 1027 { 1028 struct hammer_transaction trans; 1029 struct hammer_inode *dip; 1030 struct hammer_inode *ip; 1031 struct nchandle *nch; 1032 int error; 1033 1034 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1035 return(EXDEV); 1036 1037 nch = ap->a_nch; 1038 dip = VTOI(ap->a_dvp); 1039 ip = VTOI(ap->a_vp); 1040 1041 if (dip->obj_localization != ip->obj_localization) 
1042 return(EXDEV); 1043 1044 if (dip->flags & HAMMER_INODE_RO) 1045 return (EROFS); 1046 if (ip->flags & HAMMER_INODE_RO) 1047 return (EROFS); 1048 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1049 return (error); 1050 1051 /* 1052 * Create a transaction to cover the operations we perform. 1053 */ 1054 hammer_start_transaction(&trans, dip->hmp); 1055 ++hammer_stats_file_iopsw; 1056 1057 /* 1058 * Add the filesystem object to the directory. Note that neither 1059 * dip nor ip are referenced or locked, but their vnodes are 1060 * referenced. This function will bump the inode's link count. 1061 */ 1062 error = hammer_ip_add_directory(&trans, dip, 1063 nch->ncp->nc_name, nch->ncp->nc_nlen, 1064 ip); 1065 1066 /* 1067 * Finish up. 1068 */ 1069 if (error == 0) { 1070 cache_setunresolved(nch); 1071 cache_setvp(nch, ap->a_vp); 1072 } 1073 hammer_done_transaction(&trans); 1074 hammer_knote(ap->a_vp, NOTE_LINK); 1075 hammer_knote(ap->a_dvp, NOTE_WRITE); 1076 return (error); 1077 } 1078 1079 /* 1080 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1081 * 1082 * The operating system has already ensured that the directory entry 1083 * does not exist and done all appropriate namespace locking. 1084 */ 1085 static 1086 int 1087 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1088 { 1089 struct hammer_transaction trans; 1090 struct hammer_inode *dip; 1091 struct hammer_inode *nip; 1092 struct nchandle *nch; 1093 int error; 1094 1095 nch = ap->a_nch; 1096 dip = VTOI(ap->a_dvp); 1097 1098 if (dip->flags & HAMMER_INODE_RO) 1099 return (EROFS); 1100 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1101 return (error); 1102 1103 /* 1104 * Create a transaction to cover the operations we perform. 1105 */ 1106 hammer_start_transaction(&trans, dip->hmp); 1107 ++hammer_stats_file_iopsw; 1108 1109 /* 1110 * Create a new filesystem object of the requested type. The 1111 * returned inode will be referenced but not locked. 
1112 */ 1113 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1114 dip, NULL, &nip); 1115 if (error) { 1116 hkprintf("hammer_mkdir error %d\n", error); 1117 hammer_done_transaction(&trans); 1118 *ap->a_vpp = NULL; 1119 return (error); 1120 } 1121 /* 1122 * Add the new filesystem object to the directory. This will also 1123 * bump the inode's link count. 1124 */ 1125 error = hammer_ip_add_directory(&trans, dip, 1126 nch->ncp->nc_name, nch->ncp->nc_nlen, 1127 nip); 1128 if (error) 1129 hkprintf("hammer_mkdir (add) error %d\n", error); 1130 1131 /* 1132 * Finish up. 1133 */ 1134 if (error) { 1135 hammer_rel_inode(nip, 0); 1136 *ap->a_vpp = NULL; 1137 } else { 1138 error = hammer_get_vnode(nip, ap->a_vpp); 1139 hammer_rel_inode(nip, 0); 1140 if (error == 0) { 1141 cache_setunresolved(ap->a_nch); 1142 cache_setvp(ap->a_nch, *ap->a_vpp); 1143 } 1144 } 1145 hammer_done_transaction(&trans); 1146 if (error == 0) 1147 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1148 return (error); 1149 } 1150 1151 /* 1152 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1153 * 1154 * The operating system has already ensured that the directory entry 1155 * does not exist and done all appropriate namespace locking. 1156 */ 1157 static 1158 int 1159 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1160 { 1161 struct hammer_transaction trans; 1162 struct hammer_inode *dip; 1163 struct hammer_inode *nip; 1164 struct nchandle *nch; 1165 int error; 1166 1167 nch = ap->a_nch; 1168 dip = VTOI(ap->a_dvp); 1169 1170 if (dip->flags & HAMMER_INODE_RO) 1171 return (EROFS); 1172 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1173 return (error); 1174 1175 /* 1176 * Create a transaction to cover the operations we perform. 1177 */ 1178 hammer_start_transaction(&trans, dip->hmp); 1179 ++hammer_stats_file_iopsw; 1180 1181 /* 1182 * Create a new filesystem object of the requested type. The 1183 * returned inode will be referenced but not locked. 
1184 * 1185 * If mknod specifies a directory a pseudo-fs is created. 1186 */ 1187 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1188 dip, NULL, &nip); 1189 if (error) { 1190 hammer_done_transaction(&trans); 1191 *ap->a_vpp = NULL; 1192 return (error); 1193 } 1194 1195 /* 1196 * Add the new filesystem object to the directory. This will also 1197 * bump the inode's link count. 1198 */ 1199 error = hammer_ip_add_directory(&trans, dip, 1200 nch->ncp->nc_name, nch->ncp->nc_nlen, 1201 nip); 1202 1203 /* 1204 * Finish up. 1205 */ 1206 if (error) { 1207 hammer_rel_inode(nip, 0); 1208 *ap->a_vpp = NULL; 1209 } else { 1210 error = hammer_get_vnode(nip, ap->a_vpp); 1211 hammer_rel_inode(nip, 0); 1212 if (error == 0) { 1213 cache_setunresolved(ap->a_nch); 1214 cache_setvp(ap->a_nch, *ap->a_vpp); 1215 } 1216 } 1217 hammer_done_transaction(&trans); 1218 if (error == 0) 1219 hammer_knote(ap->a_dvp, NOTE_WRITE); 1220 return (error); 1221 } 1222 1223 /* 1224 * hammer_vop_open { vp, mode, cred, fp } 1225 */ 1226 static 1227 int 1228 hammer_vop_open(struct vop_open_args *ap) 1229 { 1230 hammer_inode_t ip; 1231 1232 ++hammer_stats_file_iopsr; 1233 ip = VTOI(ap->a_vp); 1234 1235 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1236 return (EROFS); 1237 return(vop_stdopen(ap)); 1238 } 1239 1240 /* 1241 * hammer_vop_print { vp } 1242 */ 1243 static 1244 int 1245 hammer_vop_print(struct vop_print_args *ap) 1246 { 1247 return EOPNOTSUPP; 1248 } 1249 1250 /* 1251 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1252 */ 1253 static 1254 int 1255 hammer_vop_readdir(struct vop_readdir_args *ap) 1256 { 1257 struct hammer_transaction trans; 1258 struct hammer_cursor cursor; 1259 struct hammer_inode *ip; 1260 struct uio *uio; 1261 hammer_base_elm_t base; 1262 int error; 1263 int cookie_index; 1264 int ncookies; 1265 off_t *cookies; 1266 off_t saveoff; 1267 int r; 1268 int dtype; 1269 1270 ++hammer_stats_file_iopsr; 1271 ip = VTOI(ap->a_vp); 1272 
uio = ap->a_uio; 1273 saveoff = uio->uio_offset; 1274 1275 if (ap->a_ncookies) { 1276 ncookies = uio->uio_resid / 16 + 1; 1277 if (ncookies > 1024) 1278 ncookies = 1024; 1279 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1280 cookie_index = 0; 1281 } else { 1282 ncookies = -1; 1283 cookies = NULL; 1284 cookie_index = 0; 1285 } 1286 1287 hammer_simple_transaction(&trans, ip->hmp); 1288 1289 /* 1290 * Handle artificial entries 1291 */ 1292 error = 0; 1293 if (saveoff == 0) { 1294 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1295 if (r) 1296 goto done; 1297 if (cookies) 1298 cookies[cookie_index] = saveoff; 1299 ++saveoff; 1300 ++cookie_index; 1301 if (cookie_index == ncookies) 1302 goto done; 1303 } 1304 if (saveoff == 1) { 1305 if (ip->ino_data.parent_obj_id) { 1306 r = vop_write_dirent(&error, uio, 1307 ip->ino_data.parent_obj_id, 1308 DT_DIR, 2, ".."); 1309 } else { 1310 r = vop_write_dirent(&error, uio, 1311 ip->obj_id, DT_DIR, 2, ".."); 1312 } 1313 if (r) 1314 goto done; 1315 if (cookies) 1316 cookies[cookie_index] = saveoff; 1317 ++saveoff; 1318 ++cookie_index; 1319 if (cookie_index == ncookies) 1320 goto done; 1321 } 1322 1323 /* 1324 * Key range (begin and end inclusive) to scan. Directory keys 1325 * directly translate to a 64 bit 'seek' position. 
1326 */ 1327 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1328 cursor.key_beg.localization = ip->obj_localization + 1329 HAMMER_LOCALIZE_MISC; 1330 cursor.key_beg.obj_id = ip->obj_id; 1331 cursor.key_beg.create_tid = 0; 1332 cursor.key_beg.delete_tid = 0; 1333 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1334 cursor.key_beg.obj_type = 0; 1335 cursor.key_beg.key = saveoff; 1336 1337 cursor.key_end = cursor.key_beg; 1338 cursor.key_end.key = HAMMER_MAX_KEY; 1339 cursor.asof = ip->obj_asof; 1340 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1341 1342 error = hammer_ip_first(&cursor); 1343 1344 while (error == 0) { 1345 error = hammer_ip_resolve_data(&cursor); 1346 if (error) 1347 break; 1348 base = &cursor.leaf->base; 1349 saveoff = base->key; 1350 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1351 1352 if (base->obj_id != ip->obj_id) 1353 panic("readdir: bad record at %p", cursor.node); 1354 1355 /* 1356 * Convert pseudo-filesystems into softlinks 1357 */ 1358 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1359 r = vop_write_dirent( 1360 &error, uio, cursor.data->entry.obj_id, 1361 dtype, 1362 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1363 (void *)cursor.data->entry.name); 1364 if (r) 1365 break; 1366 ++saveoff; 1367 if (cookies) 1368 cookies[cookie_index] = base->key; 1369 ++cookie_index; 1370 if (cookie_index == ncookies) 1371 break; 1372 error = hammer_ip_next(&cursor); 1373 } 1374 hammer_done_cursor(&cursor); 1375 1376 done: 1377 hammer_done_transaction(&trans); 1378 1379 if (ap->a_eofflag) 1380 *ap->a_eofflag = (error == ENOENT); 1381 uio->uio_offset = saveoff; 1382 if (error && cookie_index == 0) { 1383 if (error == ENOENT) 1384 error = 0; 1385 if (cookies) { 1386 kfree(cookies, M_TEMP); 1387 *ap->a_ncookies = 0; 1388 *ap->a_cookies = NULL; 1389 } 1390 } else { 1391 if (error == ENOENT) 1392 error = 0; 1393 if (cookies) { 1394 *ap->a_ncookies = cookie_index; 1395 *ap->a_cookies = cookies; 1396 } 1397 
} 1398 return(error); 1399 } 1400 1401 /* 1402 * hammer_vop_readlink { vp, uio, cred } 1403 */ 1404 static 1405 int 1406 hammer_vop_readlink(struct vop_readlink_args *ap) 1407 { 1408 struct hammer_transaction trans; 1409 struct hammer_cursor cursor; 1410 struct hammer_inode *ip; 1411 char buf[32]; 1412 u_int32_t localization; 1413 hammer_pseudofs_inmem_t pfsm; 1414 int error; 1415 1416 ip = VTOI(ap->a_vp); 1417 1418 /* 1419 * Shortcut if the symlink data was stuffed into ino_data. 1420 * 1421 * Also expand special "@@PFS%05d" softlinks (expansion only 1422 * occurs for non-historical (current) accesses made from the 1423 * primary filesystem). 1424 */ 1425 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1426 char *ptr; 1427 int bytes; 1428 1429 ptr = ip->ino_data.ext.symlink; 1430 bytes = (int)ip->ino_data.size; 1431 if (bytes == 10 && 1432 ip->obj_asof == HAMMER_MAX_TID && 1433 ip->obj_localization == 0 && 1434 strncmp(ptr, "@@PFS", 5) == 0) { 1435 hammer_simple_transaction(&trans, ip->hmp); 1436 bcopy(ptr + 5, buf, 5); 1437 buf[5] = 0; 1438 localization = strtoul(buf, NULL, 10) << 16; 1439 pfsm = hammer_load_pseudofs(&trans, localization, 1440 &error); 1441 if (error == 0) { 1442 if (pfsm->pfsd.mirror_flags & 1443 HAMMER_PFSD_SLAVE) { 1444 ksnprintf(buf, sizeof(buf), 1445 "@@0x%016llx:%05d", 1446 pfsm->pfsd.sync_end_tid, 1447 localization >> 16); 1448 } else { 1449 ksnprintf(buf, sizeof(buf), 1450 "@@0x%016llx:%05d", 1451 HAMMER_MAX_TID, 1452 localization >> 16); 1453 } 1454 ptr = buf; 1455 bytes = strlen(buf); 1456 } 1457 if (pfsm) 1458 hammer_rel_pseudofs(trans.hmp, pfsm); 1459 hammer_done_transaction(&trans); 1460 } 1461 error = uiomove(ptr, bytes, ap->a_uio); 1462 return(error); 1463 } 1464 1465 /* 1466 * Long version 1467 */ 1468 hammer_simple_transaction(&trans, ip->hmp); 1469 ++hammer_stats_file_iopsr; 1470 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1471 1472 /* 1473 * Key range (begin and end inclusive) to scan. 
Directory keys 1474 * directly translate to a 64 bit 'seek' position. 1475 */ 1476 cursor.key_beg.localization = ip->obj_localization + 1477 HAMMER_LOCALIZE_MISC; 1478 cursor.key_beg.obj_id = ip->obj_id; 1479 cursor.key_beg.create_tid = 0; 1480 cursor.key_beg.delete_tid = 0; 1481 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1482 cursor.key_beg.obj_type = 0; 1483 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1484 cursor.asof = ip->obj_asof; 1485 cursor.flags |= HAMMER_CURSOR_ASOF; 1486 1487 error = hammer_ip_lookup(&cursor); 1488 if (error == 0) { 1489 error = hammer_ip_resolve_data(&cursor); 1490 if (error == 0) { 1491 KKASSERT(cursor.leaf->data_len >= 1492 HAMMER_SYMLINK_NAME_OFF); 1493 error = uiomove(cursor.data->symlink.name, 1494 cursor.leaf->data_len - 1495 HAMMER_SYMLINK_NAME_OFF, 1496 ap->a_uio); 1497 } 1498 } 1499 hammer_done_cursor(&cursor); 1500 hammer_done_transaction(&trans); 1501 return(error); 1502 } 1503 1504 /* 1505 * hammer_vop_nremove { nch, dvp, cred } 1506 */ 1507 static 1508 int 1509 hammer_vop_nremove(struct vop_nremove_args *ap) 1510 { 1511 struct hammer_transaction trans; 1512 struct hammer_inode *dip; 1513 int error; 1514 1515 dip = VTOI(ap->a_dvp); 1516 1517 if (hammer_nohistory(dip) == 0 && 1518 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1519 return (error); 1520 } 1521 1522 hammer_start_transaction(&trans, dip->hmp); 1523 ++hammer_stats_file_iopsw; 1524 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1525 hammer_done_transaction(&trans); 1526 if (error == 0) 1527 hammer_knote(ap->a_dvp, NOTE_WRITE); 1528 return (error); 1529 } 1530 1531 /* 1532 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1533 */ 1534 static 1535 int 1536 hammer_vop_nrename(struct vop_nrename_args *ap) 1537 { 1538 struct hammer_transaction trans; 1539 struct namecache *fncp; 1540 struct namecache *tncp; 1541 struct hammer_inode *fdip; 1542 struct hammer_inode *tdip; 1543 struct hammer_inode *ip; 1544 struct 
hammer_cursor cursor; 1545 int64_t namekey; 1546 u_int32_t max_iterations; 1547 int nlen, error; 1548 1549 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1550 return(EXDEV); 1551 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1552 return(EXDEV); 1553 1554 fdip = VTOI(ap->a_fdvp); 1555 tdip = VTOI(ap->a_tdvp); 1556 fncp = ap->a_fnch->ncp; 1557 tncp = ap->a_tnch->ncp; 1558 ip = VTOI(fncp->nc_vp); 1559 KKASSERT(ip != NULL); 1560 1561 if (fdip->obj_localization != tdip->obj_localization) 1562 return(EXDEV); 1563 if (fdip->obj_localization != ip->obj_localization) 1564 return(EXDEV); 1565 1566 if (fdip->flags & HAMMER_INODE_RO) 1567 return (EROFS); 1568 if (tdip->flags & HAMMER_INODE_RO) 1569 return (EROFS); 1570 if (ip->flags & HAMMER_INODE_RO) 1571 return (EROFS); 1572 if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1573 return (error); 1574 1575 hammer_start_transaction(&trans, fdip->hmp); 1576 ++hammer_stats_file_iopsw; 1577 1578 /* 1579 * Remove tncp from the target directory and then link ip as 1580 * tncp. XXX pass trans to dounlink 1581 * 1582 * Force the inode sync-time to match the transaction so it is 1583 * in-sync with the creation of the target directory entry. 1584 */ 1585 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1586 ap->a_cred, 0, -1); 1587 if (error == 0 || error == ENOENT) { 1588 error = hammer_ip_add_directory(&trans, tdip, 1589 tncp->nc_name, tncp->nc_nlen, 1590 ip); 1591 if (error == 0) { 1592 ip->ino_data.parent_obj_id = tdip->obj_id; 1593 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); 1594 } 1595 } 1596 if (error) 1597 goto failed; /* XXX */ 1598 1599 /* 1600 * Locate the record in the originating directory and remove it. 1601 * 1602 * Calculate the namekey and setup the key range for the scan. This 1603 * works kinda like a chained hash table where the lower 32 bits 1604 * of the namekey synthesize the chain. 1605 * 1606 * The key range is inclusive of both key_beg and key_end. 
1607 */ 1608 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1609 &max_iterations); 1610 retry: 1611 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1612 cursor.key_beg.localization = fdip->obj_localization + 1613 HAMMER_LOCALIZE_MISC; 1614 cursor.key_beg.obj_id = fdip->obj_id; 1615 cursor.key_beg.key = namekey; 1616 cursor.key_beg.create_tid = 0; 1617 cursor.key_beg.delete_tid = 0; 1618 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1619 cursor.key_beg.obj_type = 0; 1620 1621 cursor.key_end = cursor.key_beg; 1622 cursor.key_end.key += max_iterations; 1623 cursor.asof = fdip->obj_asof; 1624 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1625 1626 /* 1627 * Scan all matching records (the chain), locate the one matching 1628 * the requested path component. 1629 * 1630 * The hammer_ip_*() functions merge in-memory records with on-disk 1631 * records for the purposes of the search. 1632 */ 1633 error = hammer_ip_first(&cursor); 1634 while (error == 0) { 1635 if (hammer_ip_resolve_data(&cursor) != 0) 1636 break; 1637 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 1638 KKASSERT(nlen > 0); 1639 if (fncp->nc_nlen == nlen && 1640 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1641 break; 1642 } 1643 error = hammer_ip_next(&cursor); 1644 } 1645 1646 /* 1647 * If all is ok we have to get the inode so we can adjust nlinks. 1648 * 1649 * WARNING: hammer_ip_del_directory() may have to terminate the 1650 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 1651 * twice. 1652 */ 1653 if (error == 0) 1654 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 1655 1656 /* 1657 * XXX A deadlock here will break rename's atomicy for the purposes 1658 * of crash recovery. 1659 */ 1660 if (error == EDEADLK) { 1661 hammer_done_cursor(&cursor); 1662 goto retry; 1663 } 1664 1665 /* 1666 * Cleanup and tell the kernel that the rename succeeded. 
1667 */ 1668 hammer_done_cursor(&cursor); 1669 if (error == 0) { 1670 cache_rename(ap->a_fnch, ap->a_tnch); 1671 hammer_knote(ap->a_fdvp, NOTE_WRITE); 1672 hammer_knote(ap->a_tdvp, NOTE_WRITE); 1673 if (ip->vp) 1674 hammer_knote(ip->vp, NOTE_RENAME); 1675 } 1676 1677 failed: 1678 hammer_done_transaction(&trans); 1679 return (error); 1680 } 1681 1682 /* 1683 * hammer_vop_nrmdir { nch, dvp, cred } 1684 */ 1685 static 1686 int 1687 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 1688 { 1689 struct hammer_transaction trans; 1690 struct hammer_inode *dip; 1691 int error; 1692 1693 dip = VTOI(ap->a_dvp); 1694 1695 if (hammer_nohistory(dip) == 0 && 1696 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1697 return (error); 1698 } 1699 1700 hammer_start_transaction(&trans, dip->hmp); 1701 ++hammer_stats_file_iopsw; 1702 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 1703 hammer_done_transaction(&trans); 1704 if (error == 0) 1705 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1706 return (error); 1707 } 1708 1709 /* 1710 * hammer_vop_markatime { vp, cred } 1711 */ 1712 static 1713 int 1714 hammer_vop_markatime(struct vop_markatime_args *ap) 1715 { 1716 struct hammer_transaction trans; 1717 struct hammer_inode *ip; 1718 1719 ip = VTOI(ap->a_vp); 1720 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1721 return (EROFS); 1722 if (ip->flags & HAMMER_INODE_RO) 1723 return (EROFS); 1724 if (ip->hmp->mp->mnt_flag & MNT_NOATIME) 1725 return (0); 1726 hammer_start_transaction(&trans, ip->hmp); 1727 ++hammer_stats_file_iopsw; 1728 1729 ip->ino_data.atime = trans.time; 1730 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 1731 hammer_done_transaction(&trans); 1732 hammer_knote(ap->a_vp, NOTE_ATTRIB); 1733 return (0); 1734 } 1735 1736 /* 1737 * hammer_vop_setattr { vp, vap, cred } 1738 */ 1739 static 1740 int 1741 hammer_vop_setattr(struct vop_setattr_args *ap) 1742 { 1743 struct hammer_transaction trans; 1744 struct vattr *vap; 1745 struct 
hammer_inode *ip; 1746 int modflags; 1747 int error; 1748 int truncating; 1749 int blksize; 1750 int kflags; 1751 int64_t aligned_size; 1752 u_int32_t flags; 1753 1754 vap = ap->a_vap; 1755 ip = ap->a_vp->v_data; 1756 modflags = 0; 1757 kflags = 0; 1758 1759 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1760 return(EROFS); 1761 if (ip->flags & HAMMER_INODE_RO) 1762 return (EROFS); 1763 if (hammer_nohistory(ip) == 0 && 1764 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1765 return (error); 1766 } 1767 1768 hammer_start_transaction(&trans, ip->hmp); 1769 ++hammer_stats_file_iopsw; 1770 error = 0; 1771 1772 if (vap->va_flags != VNOVAL) { 1773 flags = ip->ino_data.uflags; 1774 error = vop_helper_setattr_flags(&flags, vap->va_flags, 1775 hammer_to_unix_xid(&ip->ino_data.uid), 1776 ap->a_cred); 1777 if (error == 0) { 1778 if (ip->ino_data.uflags != flags) { 1779 ip->ino_data.uflags = flags; 1780 modflags |= HAMMER_INODE_DDIRTY; 1781 kflags |= NOTE_ATTRIB; 1782 } 1783 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1784 error = 0; 1785 goto done; 1786 } 1787 } 1788 goto done; 1789 } 1790 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1791 error = EPERM; 1792 goto done; 1793 } 1794 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 1795 mode_t cur_mode = ip->ino_data.mode; 1796 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1797 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1798 uuid_t uuid_uid; 1799 uuid_t uuid_gid; 1800 1801 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 1802 ap->a_cred, 1803 &cur_uid, &cur_gid, &cur_mode); 1804 if (error == 0) { 1805 hammer_guid_to_uuid(&uuid_uid, cur_uid); 1806 hammer_guid_to_uuid(&uuid_gid, cur_gid); 1807 if (bcmp(&uuid_uid, &ip->ino_data.uid, 1808 sizeof(uuid_uid)) || 1809 bcmp(&uuid_gid, &ip->ino_data.gid, 1810 sizeof(uuid_gid)) || 1811 ip->ino_data.mode != cur_mode 1812 ) { 1813 ip->ino_data.uid = uuid_uid; 1814 ip->ino_data.gid = uuid_gid; 1815 
ip->ino_data.mode = cur_mode; 1816 } 1817 modflags |= HAMMER_INODE_DDIRTY; 1818 kflags |= NOTE_ATTRIB; 1819 } 1820 } 1821 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 1822 switch(ap->a_vp->v_type) { 1823 case VREG: 1824 if (vap->va_size == ip->ino_data.size) 1825 break; 1826 /* 1827 * XXX break atomicy, we can deadlock the backend 1828 * if we do not release the lock. Probably not a 1829 * big deal here. 1830 */ 1831 blksize = hammer_blocksize(vap->va_size); 1832 if (vap->va_size < ip->ino_data.size) { 1833 vtruncbuf(ap->a_vp, vap->va_size, blksize); 1834 truncating = 1; 1835 kflags |= NOTE_WRITE; 1836 } else { 1837 vnode_pager_setsize(ap->a_vp, vap->va_size); 1838 truncating = 0; 1839 kflags |= NOTE_WRITE | NOTE_EXTEND; 1840 } 1841 ip->ino_data.size = vap->va_size; 1842 modflags |= HAMMER_INODE_DDIRTY; 1843 1844 /* 1845 * on-media truncation is cached in the inode until 1846 * the inode is synchronized. 1847 */ 1848 if (truncating) { 1849 hammer_ip_frontend_trunc(ip, vap->va_size); 1850 #ifdef DEBUG_TRUNCATE 1851 if (HammerTruncIp == NULL) 1852 HammerTruncIp = ip; 1853 #endif 1854 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1855 ip->flags |= HAMMER_INODE_TRUNCATED; 1856 ip->trunc_off = vap->va_size; 1857 #ifdef DEBUG_TRUNCATE 1858 if (ip == HammerTruncIp) 1859 kprintf("truncate1 %016llx\n", ip->trunc_off); 1860 #endif 1861 } else if (ip->trunc_off > vap->va_size) { 1862 ip->trunc_off = vap->va_size; 1863 #ifdef DEBUG_TRUNCATE 1864 if (ip == HammerTruncIp) 1865 kprintf("truncate2 %016llx\n", ip->trunc_off); 1866 #endif 1867 } else { 1868 #ifdef DEBUG_TRUNCATE 1869 if (ip == HammerTruncIp) 1870 kprintf("truncate3 %016llx (ignored)\n", vap->va_size); 1871 #endif 1872 } 1873 } 1874 1875 /* 1876 * If truncating we have to clean out a portion of 1877 * the last block on-disk. We do this in the 1878 * front-end buffer cache. 
1879 */ 1880 aligned_size = (vap->va_size + (blksize - 1)) & 1881 ~(int64_t)(blksize - 1); 1882 if (truncating && vap->va_size < aligned_size) { 1883 struct buf *bp; 1884 int offset; 1885 1886 aligned_size -= blksize; 1887 1888 offset = (int)vap->va_size & (blksize - 1); 1889 error = bread(ap->a_vp, aligned_size, 1890 blksize, &bp); 1891 hammer_ip_frontend_trunc(ip, aligned_size); 1892 if (error == 0) { 1893 bzero(bp->b_data + offset, 1894 blksize - offset); 1895 /* must de-cache direct-io offset */ 1896 bp->b_bio2.bio_offset = NOOFFSET; 1897 bdwrite(bp); 1898 } else { 1899 kprintf("ERROR %d\n", error); 1900 brelse(bp); 1901 } 1902 } 1903 break; 1904 case VDATABASE: 1905 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1906 ip->flags |= HAMMER_INODE_TRUNCATED; 1907 ip->trunc_off = vap->va_size; 1908 } else if (ip->trunc_off > vap->va_size) { 1909 ip->trunc_off = vap->va_size; 1910 } 1911 hammer_ip_frontend_trunc(ip, vap->va_size); 1912 ip->ino_data.size = vap->va_size; 1913 modflags |= HAMMER_INODE_DDIRTY; 1914 kflags |= NOTE_ATTRIB; 1915 break; 1916 default: 1917 error = EINVAL; 1918 goto done; 1919 } 1920 break; 1921 } 1922 if (vap->va_atime.tv_sec != VNOVAL) { 1923 ip->ino_data.atime = 1924 hammer_timespec_to_time(&vap->va_atime); 1925 modflags |= HAMMER_INODE_ATIME; 1926 kflags |= NOTE_ATTRIB; 1927 } 1928 if (vap->va_mtime.tv_sec != VNOVAL) { 1929 ip->ino_data.mtime = 1930 hammer_timespec_to_time(&vap->va_mtime); 1931 modflags |= HAMMER_INODE_MTIME; 1932 kflags |= NOTE_ATTRIB; 1933 } 1934 if (vap->va_mode != (mode_t)VNOVAL) { 1935 mode_t cur_mode = ip->ino_data.mode; 1936 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1937 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1938 1939 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 1940 cur_uid, cur_gid, &cur_mode); 1941 if (error == 0 && ip->ino_data.mode != cur_mode) { 1942 ip->ino_data.mode = cur_mode; 1943 modflags |= HAMMER_INODE_DDIRTY; 1944 kflags |= NOTE_ATTRIB; 1945 } 1946 } 1947 
done: 1948 if (error == 0) 1949 hammer_modify_inode(ip, modflags); 1950 hammer_done_transaction(&trans); 1951 hammer_knote(ap->a_vp, kflags); 1952 return (error); 1953 } 1954 1955 /* 1956 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 1957 */ 1958 static 1959 int 1960 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 1961 { 1962 struct hammer_transaction trans; 1963 struct hammer_inode *dip; 1964 struct hammer_inode *nip; 1965 struct nchandle *nch; 1966 hammer_record_t record; 1967 int error; 1968 int bytes; 1969 1970 ap->a_vap->va_type = VLNK; 1971 1972 nch = ap->a_nch; 1973 dip = VTOI(ap->a_dvp); 1974 1975 if (dip->flags & HAMMER_INODE_RO) 1976 return (EROFS); 1977 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1978 return (error); 1979 1980 /* 1981 * Create a transaction to cover the operations we perform. 1982 */ 1983 hammer_start_transaction(&trans, dip->hmp); 1984 ++hammer_stats_file_iopsw; 1985 1986 /* 1987 * Create a new filesystem object of the requested type. The 1988 * returned inode will be referenced but not locked. 1989 */ 1990 1991 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1992 dip, NULL, &nip); 1993 if (error) { 1994 hammer_done_transaction(&trans); 1995 *ap->a_vpp = NULL; 1996 return (error); 1997 } 1998 1999 /* 2000 * Add a record representing the symlink. symlink stores the link 2001 * as pure data, not a string, and is no \0 terminated. 
2002 */ 2003 if (error == 0) { 2004 bytes = strlen(ap->a_target); 2005 2006 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2007 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2008 } else { 2009 record = hammer_alloc_mem_record(nip, bytes); 2010 record->type = HAMMER_MEM_RECORD_GENERAL; 2011 2012 record->leaf.base.localization = nip->obj_localization + 2013 HAMMER_LOCALIZE_MISC; 2014 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2015 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2016 record->leaf.data_len = bytes; 2017 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2018 bcopy(ap->a_target, record->data->symlink.name, bytes); 2019 error = hammer_ip_add_record(&trans, record); 2020 } 2021 2022 /* 2023 * Set the file size to the length of the link. 2024 */ 2025 if (error == 0) { 2026 nip->ino_data.size = bytes; 2027 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY); 2028 } 2029 } 2030 if (error == 0) 2031 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2032 nch->ncp->nc_nlen, nip); 2033 2034 /* 2035 * Finish up. 
2036 */ 2037 if (error) { 2038 hammer_rel_inode(nip, 0); 2039 *ap->a_vpp = NULL; 2040 } else { 2041 error = hammer_get_vnode(nip, ap->a_vpp); 2042 hammer_rel_inode(nip, 0); 2043 if (error == 0) { 2044 cache_setunresolved(ap->a_nch); 2045 cache_setvp(ap->a_nch, *ap->a_vpp); 2046 hammer_knote(ap->a_dvp, NOTE_WRITE); 2047 } 2048 } 2049 hammer_done_transaction(&trans); 2050 return (error); 2051 } 2052 2053 /* 2054 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2055 */ 2056 static 2057 int 2058 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2059 { 2060 struct hammer_transaction trans; 2061 struct hammer_inode *dip; 2062 int error; 2063 2064 dip = VTOI(ap->a_dvp); 2065 2066 if (hammer_nohistory(dip) == 0 && 2067 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2068 return (error); 2069 } 2070 2071 hammer_start_transaction(&trans, dip->hmp); 2072 ++hammer_stats_file_iopsw; 2073 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2074 ap->a_cred, ap->a_flags, -1); 2075 hammer_done_transaction(&trans); 2076 2077 return (error); 2078 } 2079 2080 /* 2081 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2082 */ 2083 static 2084 int 2085 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2086 { 2087 struct hammer_inode *ip = ap->a_vp->v_data; 2088 2089 ++hammer_stats_file_iopsr; 2090 return(hammer_ioctl(ip, ap->a_command, ap->a_data, 2091 ap->a_fflag, ap->a_cred)); 2092 } 2093 2094 static 2095 int 2096 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2097 { 2098 struct mount *mp; 2099 int error; 2100 2101 mp = ap->a_head.a_ops->head.vv_mount; 2102 2103 switch(ap->a_op) { 2104 case MOUNTCTL_SET_EXPORT: 2105 if (ap->a_ctllen != sizeof(struct export_args)) 2106 error = EINVAL; 2107 else 2108 error = hammer_vfs_export(mp, ap->a_op, 2109 (const struct export_args *)ap->a_ctl); 2110 break; 2111 default: 2112 error = journal_mountctl(ap); 2113 break; 2114 } 2115 return(error); 2116 } 2117 2118 /* 2119 * hammer_vop_strategy { vp, bio } 2120 * 2121 * Strategy 
call, used for regular file read & write only. Note that the 2122 * bp may represent a cluster. 2123 * 2124 * To simplify operation and allow better optimizations in the future, 2125 * this code does not make any assumptions with regards to buffer alignment 2126 * or size. 2127 */ 2128 static 2129 int 2130 hammer_vop_strategy(struct vop_strategy_args *ap) 2131 { 2132 struct buf *bp; 2133 int error; 2134 2135 bp = ap->a_bio->bio_buf; 2136 2137 switch(bp->b_cmd) { 2138 case BUF_CMD_READ: 2139 error = hammer_vop_strategy_read(ap); 2140 break; 2141 case BUF_CMD_WRITE: 2142 error = hammer_vop_strategy_write(ap); 2143 break; 2144 default: 2145 bp->b_error = error = EINVAL; 2146 bp->b_flags |= B_ERROR; 2147 biodone(ap->a_bio); 2148 break; 2149 } 2150 return (error); 2151 } 2152 2153 /* 2154 * Read from a regular file. Iterate the related records and fill in the 2155 * BIO/BUF. Gaps are zero-filled. 2156 * 2157 * The support code in hammer_object.c should be used to deal with mixed 2158 * in-memory and on-disk records. 2159 * 2160 * NOTE: Can be called from the cluster code with an oversized buf. 2161 * 2162 * XXX atime update 2163 */ 2164 static 2165 int 2166 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2167 { 2168 struct hammer_transaction trans; 2169 struct hammer_inode *ip; 2170 struct hammer_cursor cursor; 2171 hammer_base_elm_t base; 2172 hammer_off_t disk_offset; 2173 struct bio *bio; 2174 struct bio *nbio; 2175 struct buf *bp; 2176 int64_t rec_offset; 2177 int64_t ran_end; 2178 int64_t tmp64; 2179 int error; 2180 int boff; 2181 int roff; 2182 int n; 2183 2184 bio = ap->a_bio; 2185 bp = bio->bio_buf; 2186 ip = ap->a_vp->v_data; 2187 2188 /* 2189 * The zone-2 disk offset may have been set by the cluster code via 2190 * a BMAP operation, or else should be NOOFFSET. 2191 * 2192 * Checking the high bits for a match against zone-2 should suffice. 
2193 */ 2194 nbio = push_bio(bio); 2195 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2196 HAMMER_ZONE_LARGE_DATA) { 2197 error = hammer_io_direct_read(ip->hmp, nbio, NULL); 2198 return (error); 2199 } 2200 2201 /* 2202 * Well, that sucked. Do it the hard way. If all the stars are 2203 * aligned we may still be able to issue a direct-read. 2204 */ 2205 hammer_simple_transaction(&trans, ip->hmp); 2206 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2207 2208 /* 2209 * Key range (begin and end inclusive) to scan. Note that the key's 2210 * stored in the actual records represent BASE+LEN, not BASE. The 2211 * first record containing bio_offset will have a key > bio_offset. 2212 */ 2213 cursor.key_beg.localization = ip->obj_localization + 2214 HAMMER_LOCALIZE_MISC; 2215 cursor.key_beg.obj_id = ip->obj_id; 2216 cursor.key_beg.create_tid = 0; 2217 cursor.key_beg.delete_tid = 0; 2218 cursor.key_beg.obj_type = 0; 2219 cursor.key_beg.key = bio->bio_offset + 1; 2220 cursor.asof = ip->obj_asof; 2221 cursor.flags |= HAMMER_CURSOR_ASOF; 2222 2223 cursor.key_end = cursor.key_beg; 2224 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2225 #if 0 2226 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2227 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2228 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2229 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2230 } else 2231 #endif 2232 { 2233 ran_end = bio->bio_offset + bp->b_bufsize; 2234 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2235 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2236 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2237 if (tmp64 < ran_end) 2238 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2239 else 2240 cursor.key_end.key = ran_end + MAXPHYS + 1; 2241 } 2242 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2243 2244 error = hammer_ip_first(&cursor); 2245 boff = 0; 2246 2247 while (error == 0) { 2248 /* 2249 * Get the base file offset of the record. 
The key for 2250 * data records is (base + bytes) rather then (base). 2251 */ 2252 base = &cursor.leaf->base; 2253 rec_offset = base->key - cursor.leaf->data_len; 2254 2255 /* 2256 * Calculate the gap, if any, and zero-fill it. 2257 * 2258 * n is the offset of the start of the record verses our 2259 * current seek offset in the bio. 2260 */ 2261 n = (int)(rec_offset - (bio->bio_offset + boff)); 2262 if (n > 0) { 2263 if (n > bp->b_bufsize - boff) 2264 n = bp->b_bufsize - boff; 2265 bzero((char *)bp->b_data + boff, n); 2266 boff += n; 2267 n = 0; 2268 } 2269 2270 /* 2271 * Calculate the data offset in the record and the number 2272 * of bytes we can copy. 2273 * 2274 * There are two degenerate cases. First, boff may already 2275 * be at bp->b_bufsize. Secondly, the data offset within 2276 * the record may exceed the record's size. 2277 */ 2278 roff = -n; 2279 rec_offset += roff; 2280 n = cursor.leaf->data_len - roff; 2281 if (n <= 0) { 2282 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2283 n = 0; 2284 } else if (n > bp->b_bufsize - boff) { 2285 n = bp->b_bufsize - boff; 2286 } 2287 2288 /* 2289 * Deal with cached truncations. This cool bit of code 2290 * allows truncate()/ftruncate() to avoid having to sync 2291 * the file. 2292 * 2293 * If the frontend is truncated then all backend records are 2294 * subject to the frontend's truncation. 2295 * 2296 * If the backend is truncated then backend records on-disk 2297 * (but not in-memory) are subject to the backend's 2298 * truncation. In-memory records owned by the backend 2299 * represent data written after the truncation point on the 2300 * backend and must not be truncated. 2301 * 2302 * Truncate operations deal with frontend buffer cache 2303 * buffers and frontend-owned in-memory records synchronously. 
2304 */ 2305 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2306 if (hammer_cursor_ondisk(&cursor) || 2307 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2308 if (ip->trunc_off <= rec_offset) 2309 n = 0; 2310 else if (ip->trunc_off < rec_offset + n) 2311 n = (int)(ip->trunc_off - rec_offset); 2312 } 2313 } 2314 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2315 if (hammer_cursor_ondisk(&cursor)) { 2316 if (ip->sync_trunc_off <= rec_offset) 2317 n = 0; 2318 else if (ip->sync_trunc_off < rec_offset + n) 2319 n = (int)(ip->sync_trunc_off - rec_offset); 2320 } 2321 } 2322 2323 /* 2324 * Try to issue a direct read into our bio if possible, 2325 * otherwise resolve the element data into a hammer_buffer 2326 * and copy. 2327 * 2328 * The buffer on-disk should be zerod past any real 2329 * truncation point, but may not be for any synthesized 2330 * truncation point from above. 2331 */ 2332 disk_offset = cursor.leaf->data_offset + roff; 2333 if (boff == 0 && n == bp->b_bufsize && 2334 hammer_cursor_ondisk(&cursor) && 2335 (disk_offset & HAMMER_BUFMASK) == 0) { 2336 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2337 HAMMER_ZONE_LARGE_DATA); 2338 nbio->bio_offset = disk_offset; 2339 error = hammer_io_direct_read(trans.hmp, nbio, 2340 cursor.leaf); 2341 goto done; 2342 } else if (n) { 2343 error = hammer_ip_resolve_data(&cursor); 2344 if (error == 0) { 2345 bcopy((char *)cursor.data + roff, 2346 (char *)bp->b_data + boff, n); 2347 } 2348 } 2349 if (error) 2350 break; 2351 2352 /* 2353 * Iterate until we have filled the request. 
2354 */ 2355 boff += n; 2356 if (boff == bp->b_bufsize) 2357 break; 2358 error = hammer_ip_next(&cursor); 2359 } 2360 2361 /* 2362 * There may have been a gap after the last record 2363 */ 2364 if (error == ENOENT) 2365 error = 0; 2366 if (error == 0 && boff != bp->b_bufsize) { 2367 KKASSERT(boff < bp->b_bufsize); 2368 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2369 /* boff = bp->b_bufsize; */ 2370 } 2371 bp->b_resid = 0; 2372 bp->b_error = error; 2373 if (error) 2374 bp->b_flags |= B_ERROR; 2375 biodone(ap->a_bio); 2376 2377 done: 2378 if (cursor.node) 2379 hammer_cache_node(&ip->cache[1], cursor.node); 2380 hammer_done_cursor(&cursor); 2381 hammer_done_transaction(&trans); 2382 return(error); 2383 } 2384 2385 /* 2386 * BMAP operation - used to support cluster_read() only. 2387 * 2388 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2389 * 2390 * This routine may return EOPNOTSUPP if the opration is not supported for 2391 * the specified offset. The contents of the pointer arguments do not 2392 * need to be initialized in that case. 2393 * 2394 * If a disk address is available and properly aligned return 0 with 2395 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2396 * to the run-length relative to that offset. Callers may assume that 2397 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2398 * large, so return EOPNOTSUPP if it is not sufficiently large. 
2399 */ 2400 static 2401 int 2402 hammer_vop_bmap(struct vop_bmap_args *ap) 2403 { 2404 struct hammer_transaction trans; 2405 struct hammer_inode *ip; 2406 struct hammer_cursor cursor; 2407 hammer_base_elm_t base; 2408 int64_t rec_offset; 2409 int64_t ran_end; 2410 int64_t tmp64; 2411 int64_t base_offset; 2412 int64_t base_disk_offset; 2413 int64_t last_offset; 2414 hammer_off_t last_disk_offset; 2415 hammer_off_t disk_offset; 2416 int rec_len; 2417 int error; 2418 int blksize; 2419 2420 ++hammer_stats_file_iopsr; 2421 ip = ap->a_vp->v_data; 2422 2423 /* 2424 * We can only BMAP regular files. We can't BMAP database files, 2425 * directories, etc. 2426 */ 2427 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2428 return(EOPNOTSUPP); 2429 2430 /* 2431 * bmap is typically called with runp/runb both NULL when used 2432 * for writing. We do not support BMAP for writing atm. 2433 */ 2434 if (ap->a_cmd != BUF_CMD_READ) 2435 return(EOPNOTSUPP); 2436 2437 /* 2438 * Scan the B-Tree to acquire blockmap addresses, then translate 2439 * to raw addresses. 2440 */ 2441 hammer_simple_transaction(&trans, ip->hmp); 2442 #if 0 2443 kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2444 #endif 2445 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2446 2447 /* 2448 * Key range (begin and end inclusive) to scan. Note that the key's 2449 * stored in the actual records represent BASE+LEN, not BASE. The 2450 * first record containing bio_offset will have a key > bio_offset. 
2451 */ 2452 cursor.key_beg.localization = ip->obj_localization + 2453 HAMMER_LOCALIZE_MISC; 2454 cursor.key_beg.obj_id = ip->obj_id; 2455 cursor.key_beg.create_tid = 0; 2456 cursor.key_beg.delete_tid = 0; 2457 cursor.key_beg.obj_type = 0; 2458 if (ap->a_runb) 2459 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 2460 else 2461 cursor.key_beg.key = ap->a_loffset + 1; 2462 if (cursor.key_beg.key < 0) 2463 cursor.key_beg.key = 0; 2464 cursor.asof = ip->obj_asof; 2465 cursor.flags |= HAMMER_CURSOR_ASOF; 2466 2467 cursor.key_end = cursor.key_beg; 2468 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2469 2470 ran_end = ap->a_loffset + MAXPHYS; 2471 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2472 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2473 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2474 if (tmp64 < ran_end) 2475 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2476 else 2477 cursor.key_end.key = ran_end + MAXPHYS + 1; 2478 2479 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2480 2481 error = hammer_ip_first(&cursor); 2482 base_offset = last_offset = 0; 2483 base_disk_offset = last_disk_offset = 0; 2484 2485 while (error == 0) { 2486 /* 2487 * Get the base file offset of the record. The key for 2488 * data records is (base + bytes) rather then (base). 2489 * 2490 * NOTE: rec_offset + rec_len may exceed the end-of-file. 2491 * The extra bytes should be zero on-disk and the BMAP op 2492 * should still be ok. 2493 */ 2494 base = &cursor.leaf->base; 2495 rec_offset = base->key - cursor.leaf->data_len; 2496 rec_len = cursor.leaf->data_len; 2497 2498 /* 2499 * Incorporate any cached truncation. 2500 * 2501 * NOTE: Modifications to rec_len based on synthesized 2502 * truncation points remove the guarantee that any extended 2503 * data on disk is zero (since the truncations may not have 2504 * taken place on-media yet). 
2505 */ 2506 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2507 if (hammer_cursor_ondisk(&cursor) || 2508 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2509 if (ip->trunc_off <= rec_offset) 2510 rec_len = 0; 2511 else if (ip->trunc_off < rec_offset + rec_len) 2512 rec_len = (int)(ip->trunc_off - rec_offset); 2513 } 2514 } 2515 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2516 if (hammer_cursor_ondisk(&cursor)) { 2517 if (ip->sync_trunc_off <= rec_offset) 2518 rec_len = 0; 2519 else if (ip->sync_trunc_off < rec_offset + rec_len) 2520 rec_len = (int)(ip->sync_trunc_off - rec_offset); 2521 } 2522 } 2523 2524 /* 2525 * Accumulate information. If we have hit a discontiguous 2526 * block reset base_offset unless we are already beyond the 2527 * requested offset. If we are, that's it, we stop. 2528 */ 2529 if (error) 2530 break; 2531 if (hammer_cursor_ondisk(&cursor)) { 2532 disk_offset = cursor.leaf->data_offset; 2533 if (rec_offset != last_offset || 2534 disk_offset != last_disk_offset) { 2535 if (rec_offset > ap->a_loffset) 2536 break; 2537 base_offset = rec_offset; 2538 base_disk_offset = disk_offset; 2539 } 2540 last_offset = rec_offset + rec_len; 2541 last_disk_offset = disk_offset + rec_len; 2542 } 2543 error = hammer_ip_next(&cursor); 2544 } 2545 2546 #if 0 2547 kprintf("BMAP %016llx: %016llx - %016llx\n", 2548 ap->a_loffset, base_offset, last_offset); 2549 kprintf("BMAP %16s: %016llx - %016llx\n", 2550 "", base_disk_offset, last_disk_offset); 2551 #endif 2552 2553 if (cursor.node) { 2554 hammer_cache_node(&ip->cache[1], cursor.node); 2555 #if 0 2556 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]); 2557 #endif 2558 } 2559 hammer_done_cursor(&cursor); 2560 hammer_done_transaction(&trans); 2561 2562 /* 2563 * If we couldn't find any records or the records we did find were 2564 * all behind the requested offset, return failure. A forward 2565 * truncation can leave a hole w/ no on-disk records. 
2566 */ 2567 if (last_offset == 0 || last_offset < ap->a_loffset) 2568 return (EOPNOTSUPP); 2569 2570 /* 2571 * Figure out the block size at the requested offset and adjust 2572 * our limits so the cluster_read() does not create inappropriately 2573 * sized buffer cache buffers. 2574 */ 2575 blksize = hammer_blocksize(ap->a_loffset); 2576 if (hammer_blocksize(base_offset) != blksize) { 2577 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 2578 } 2579 if (last_offset != ap->a_loffset && 2580 hammer_blocksize(last_offset - 1) != blksize) { 2581 last_offset = hammer_blockdemarc(ap->a_loffset, 2582 last_offset - 1); 2583 } 2584 2585 /* 2586 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 2587 * from occuring. 2588 */ 2589 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 2590 2591 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 2592 /* 2593 * Only large-data zones can be direct-IOd 2594 */ 2595 error = EOPNOTSUPP; 2596 } else if ((disk_offset & HAMMER_BUFMASK) || 2597 (last_offset - ap->a_loffset) < blksize) { 2598 /* 2599 * doffsetp is not aligned or the forward run size does 2600 * not cover a whole buffer, disallow the direct I/O. 2601 */ 2602 error = EOPNOTSUPP; 2603 } else { 2604 /* 2605 * We're good. 2606 */ 2607 *ap->a_doffsetp = disk_offset; 2608 if (ap->a_runb) { 2609 *ap->a_runb = ap->a_loffset - base_offset; 2610 KKASSERT(*ap->a_runb >= 0); 2611 } 2612 if (ap->a_runp) { 2613 *ap->a_runp = last_offset - ap->a_loffset; 2614 KKASSERT(*ap->a_runp >= 0); 2615 } 2616 error = 0; 2617 } 2618 return(error); 2619 } 2620 2621 /* 2622 * Write to a regular file. Because this is a strategy call the OS is 2623 * trying to actually get data onto the media. 
2624 */ 2625 static 2626 int 2627 hammer_vop_strategy_write(struct vop_strategy_args *ap) 2628 { 2629 hammer_record_t record; 2630 hammer_mount_t hmp; 2631 hammer_inode_t ip; 2632 struct bio *bio; 2633 struct buf *bp; 2634 int blksize; 2635 int bytes; 2636 int error; 2637 2638 bio = ap->a_bio; 2639 bp = bio->bio_buf; 2640 ip = ap->a_vp->v_data; 2641 hmp = ip->hmp; 2642 2643 blksize = hammer_blocksize(bio->bio_offset); 2644 KKASSERT(bp->b_bufsize == blksize); 2645 2646 if (ip->flags & HAMMER_INODE_RO) { 2647 bp->b_error = EROFS; 2648 bp->b_flags |= B_ERROR; 2649 biodone(ap->a_bio); 2650 return(EROFS); 2651 } 2652 2653 /* 2654 * Interlock with inode destruction (no in-kernel or directory 2655 * topology visibility). If we queue new IO while trying to 2656 * destroy the inode we can deadlock the vtrunc call in 2657 * hammer_inode_unloadable_check(). 2658 * 2659 * Besides, there's no point flushing a bp associated with an 2660 * inode that is being destroyed on-media and has no kernel 2661 * references. 2662 */ 2663 if ((ip->flags | ip->sync_flags) & 2664 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 2665 bp->b_resid = 0; 2666 biodone(ap->a_bio); 2667 return(0); 2668 } 2669 2670 /* 2671 * Reserve space and issue a direct-write from the front-end. 2672 * NOTE: The direct_io code will hammer_bread/bcopy smaller 2673 * allocations. 2674 * 2675 * An in-memory record will be installed to reference the storage 2676 * until the flusher can get to it. 2677 * 2678 * Since we own the high level bio the front-end will not try to 2679 * do a direct-read until the write completes. 2680 * 2681 * NOTE: The only time we do not reserve a full-sized buffers 2682 * worth of data is if the file is small. We do not try to 2683 * allocate a fragment (from the small-data zone) at the end of 2684 * an otherwise large file as this can lead to wildly separated 2685 * data. 
2686 */ 2687 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 2688 KKASSERT(bio->bio_offset < ip->ino_data.size); 2689 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 2690 bytes = bp->b_bufsize; 2691 else 2692 bytes = ((int)ip->ino_data.size + 15) & ~15; 2693 2694 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 2695 bytes, &error); 2696 if (record) { 2697 hammer_io_direct_write(hmp, record, bio); 2698 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 2699 hammer_flush_inode(ip, 0); 2700 } else { 2701 bp->b_bio2.bio_offset = NOOFFSET; 2702 bp->b_error = error; 2703 bp->b_flags |= B_ERROR; 2704 biodone(ap->a_bio); 2705 } 2706 return(error); 2707 } 2708 2709 /* 2710 * dounlink - disconnect a directory entry 2711 * 2712 * XXX whiteout support not really in yet 2713 */ 2714 static int 2715 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 2716 struct vnode *dvp, struct ucred *cred, 2717 int flags, int isdir) 2718 { 2719 struct namecache *ncp; 2720 hammer_inode_t dip; 2721 hammer_inode_t ip; 2722 struct hammer_cursor cursor; 2723 int64_t namekey; 2724 u_int32_t max_iterations; 2725 int nlen, error; 2726 2727 /* 2728 * Calculate the namekey and setup the key range for the scan. This 2729 * works kinda like a chained hash table where the lower 32 bits 2730 * of the namekey synthesize the chain. 2731 * 2732 * The key range is inclusive of both key_beg and key_end. 
*/
	dip = VTOI(dvp);
	ncp = nch->ncp;

	/*
	 * Nothing can be unlinked from a read-only directory inode.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * The namekey is computed once; max_iterations (filled in by
	 * hammer_directory_namekey()) sizes the key range scanned below.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	/*
	 * Everything from here down is re-executed with a fresh cursor
	 * whenever the operation ends with EDEADLK.
	 */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  On loop exit 'error' is 0 when a
	 * matching entry was found (the cursor is positioned on it), or
	 * the error that terminated the scan otherwise (ENOENT or
	 * something else).
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		/*
		 * The name length is the record's data length minus the
		 * fixed part of the directory entry.
		 */
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		/*
		 * A directory entry whose inode cannot be found is treated
		 * as a consistency problem: report it and drop into the
		 * debugger.  The ENOENT is still returned to the caller.
		 */
		if (error == ENOENT) {
			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
			Debugger("ENOENT unlinking object that should exist");
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.  Note that this overrides whatever
		 * hammer_ip_check_directory_empty() returned.
		 */
		if (error == 0 && ip->ino_data.obj_type ==
				  HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		/*
		 * On success detach the namecache entry from its vnode and,
		 * if the removed inode has a vnode, post NOTE_DELETE to its
		 * knotes and invalidate its namecache associations.
		 */
		if (error == 0) {
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip->vp) {
				hammer_knote(ip->vp, NOTE_DELETE);
				cache_inval_vp(ip->vp, CINV_DESTROY);
			}
		}
		/*
		 * NOTE(review): the NULL test implies hammer_get_inode()
		 * returns NULL when it fails -- confirm against its
		 * definition.
		 */
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	/*
	 * The cursor has been released on every path above, so a retry
	 * starts over with hammer_init_cursor().
	 */
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 * Thin wrappers that forward the operation to the fifofs or specfs vnode
 * operations vector through VOCALL().
 */

/*
 * Close on a fifo: passed straight through to fifo_vnode_vops.
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

/*
 * Read on a fifo: passed straight through to fifo_vnode_vops.
 */
static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * Write on a fifo: passed straight through to fifo_vnode_vops.
 */
static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

/*
 * kqfilter on a fifo: the fifofs handler gets the first shot; only when it
 * returns non-zero is the request handed to hammer_vop_kqfilter(), whose
 * result is then returned instead.
 */
static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/*
 * Close on a device special file: passed straight through to
 * spec_vnode_vops.
 */
static int
hammer_vop_specclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

/*
 * Read on a device special file: passed straight through to
 * spec_vnode_vops.
 */
static int
hammer_vop_specread (struct vop_read_args *ap)
{
	/* XXX update access time */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

static int
hammer_vop_specwrite (struct
vop_write_args *ap) 2933 { 2934 /* XXX update last change time */ 2935 return (VOCALL(&spec_vnode_vops, &ap->a_head)); 2936 } 2937 2938 /************************************************************************ 2939 * KQFILTER OPS * 2940 ************************************************************************ 2941 * 2942 */ 2943 static void filt_hammerdetach(struct knote *kn); 2944 static int filt_hammerread(struct knote *kn, long hint); 2945 static int filt_hammerwrite(struct knote *kn, long hint); 2946 static int filt_hammervnode(struct knote *kn, long hint); 2947 2948 static struct filterops hammerread_filtops = 2949 { 1, NULL, filt_hammerdetach, filt_hammerread }; 2950 static struct filterops hammerwrite_filtops = 2951 { 1, NULL, filt_hammerdetach, filt_hammerwrite }; 2952 static struct filterops hammervnode_filtops = 2953 { 1, NULL, filt_hammerdetach, filt_hammervnode }; 2954 2955 static 2956 int 2957 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 2958 { 2959 struct vnode *vp = ap->a_vp; 2960 struct knote *kn = ap->a_kn; 2961 lwkt_tokref ilock; 2962 2963 switch (kn->kn_filter) { 2964 case EVFILT_READ: 2965 kn->kn_fop = &hammerread_filtops; 2966 break; 2967 case EVFILT_WRITE: 2968 kn->kn_fop = &hammerwrite_filtops; 2969 break; 2970 case EVFILT_VNODE: 2971 kn->kn_fop = &hammervnode_filtops; 2972 break; 2973 default: 2974 return (1); 2975 } 2976 2977 kn->kn_hook = (caddr_t)vp; 2978 2979 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 2980 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext); 2981 lwkt_reltoken(&ilock); 2982 2983 return(0); 2984 } 2985 2986 static void 2987 filt_hammerdetach(struct knote *kn) 2988 { 2989 struct vnode *vp = (void *)kn->kn_hook; 2990 lwkt_tokref ilock; 2991 2992 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 2993 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note, 2994 kn, knote, kn_selnext); 2995 lwkt_reltoken(&ilock); 2996 } 2997 2998 static int 2999 filt_hammerread(struct knote *kn, long hint) 3000 { 3001 
struct vnode *vp = (void *)kn->kn_hook; 3002 hammer_inode_t ip = VTOI(vp); 3003 3004 if (hint == NOTE_REVOKE) { 3005 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3006 return(1); 3007 } 3008 kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset; 3009 return (kn->kn_data != 0); 3010 } 3011 3012 static int 3013 filt_hammerwrite(struct knote *kn, long hint) 3014 { 3015 if (hint == NOTE_REVOKE) 3016 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3017 kn->kn_data = 0; 3018 return (1); 3019 } 3020 3021 static int 3022 filt_hammervnode(struct knote *kn, long hint) 3023 { 3024 if (kn->kn_sfflags & hint) 3025 kn->kn_fflags |= hint; 3026 if (hint == NOTE_REVOKE) { 3027 kn->kn_flags |= EV_EOF; 3028 return (1); 3029 } 3030 return (kn->kn_fflags != 0); 3031 } 3032 3033