1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/fcntl.h> 41 #include <sys/namecache.h> 42 #include <sys/vnode.h> 43 #include <sys/lockf.h> 44 #include <sys/event.h> 45 #include <sys/stat.h> 46 #include <sys/dirent.h> 47 #include <sys/file.h> 48 #include <vm/vm_extern.h> 49 #include <vfs/fifofs/fifo.h> 50 #include "hammer.h" 51 52 /* 53 * USERFS VNOPS 54 */ 55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/ 56 static int hammer_vop_fsync(struct vop_fsync_args *); 57 static int hammer_vop_read(struct vop_read_args *); 58 static int hammer_vop_write(struct vop_write_args *); 59 static int hammer_vop_access(struct vop_access_args *); 60 static int hammer_vop_advlock(struct vop_advlock_args *); 61 static int hammer_vop_close(struct vop_close_args *); 62 static int hammer_vop_ncreate(struct vop_ncreate_args *); 63 static int hammer_vop_getattr(struct vop_getattr_args *); 64 static int hammer_vop_nresolve(struct vop_nresolve_args *); 65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *); 66 static int hammer_vop_nlink(struct vop_nlink_args *); 67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *); 68 static int hammer_vop_nmknod(struct vop_nmknod_args *); 69 static int hammer_vop_open(struct vop_open_args *); 70 static int hammer_vop_print(struct vop_print_args *); 71 static int hammer_vop_readdir(struct vop_readdir_args *); 72 static int hammer_vop_readlink(struct vop_readlink_args *); 73 static int hammer_vop_nremove(struct vop_nremove_args *); 74 static int hammer_vop_nrename(struct vop_nrename_args *); 75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *); 76 static int hammer_vop_markatime(struct vop_markatime_args *); 77 static int hammer_vop_setattr(struct vop_setattr_args *); 78 static int hammer_vop_strategy(struct vop_strategy_args *); 79 static int hammer_vop_bmap(struct vop_bmap_args *ap); 80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *); 81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *); 82 static int hammer_vop_ioctl(struct vop_ioctl_args *); 83 static int hammer_vop_mountctl(struct vop_mountctl_args *); 84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *); 85 86 static int hammer_vop_fifoclose (struct vop_close_args *); 87 static int hammer_vop_fiforead (struct vop_read_args *); 88 static int hammer_vop_fifowrite (struct vop_write_args *); 89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *); 90 91 static int hammer_vop_specclose (struct vop_close_args *); 92 static int hammer_vop_specread (struct vop_read_args *); 93 static int hammer_vop_specwrite (struct vop_write_args *); 94 static int hammer_vop_specgetattr (struct vop_getattr_args *); 95 96 struct vop_ops hammer_vnode_vops = { 97 .vop_default = vop_defaultop, 98 .vop_fsync = hammer_vop_fsync, 99 .vop_getpages = vop_stdgetpages, 100 .vop_putpages = vop_stdputpages, 101 .vop_read = hammer_vop_read, 102 .vop_write = hammer_vop_write, 103 .vop_access = hammer_vop_access, 104 .vop_advlock = hammer_vop_advlock, 105 .vop_close = hammer_vop_close, 106 .vop_ncreate = hammer_vop_ncreate, 107 .vop_getattr = hammer_vop_getattr, 108 .vop_inactive = hammer_vop_inactive, 109 .vop_reclaim = hammer_vop_reclaim, 110 .vop_nresolve = hammer_vop_nresolve, 111 .vop_nlookupdotdot = hammer_vop_nlookupdotdot, 112 .vop_nlink = hammer_vop_nlink, 113 .vop_nmkdir = hammer_vop_nmkdir, 114 .vop_nmknod = hammer_vop_nmknod, 115 .vop_open = hammer_vop_open, 116 .vop_pathconf = vop_stdpathconf, 117 .vop_print = hammer_vop_print, 118 .vop_readdir = hammer_vop_readdir, 119 .vop_readlink = hammer_vop_readlink, 120 .vop_nremove = hammer_vop_nremove, 121 .vop_nrename = hammer_vop_nrename, 122 .vop_nrmdir = hammer_vop_nrmdir, 123 .vop_markatime = hammer_vop_markatime, 124 .vop_setattr = hammer_vop_setattr, 125 .vop_bmap = hammer_vop_bmap, 126 .vop_strategy = hammer_vop_strategy, 127 .vop_nsymlink = hammer_vop_nsymlink, 128 .vop_nwhiteout = hammer_vop_nwhiteout, 129 .vop_ioctl = hammer_vop_ioctl, 130 .vop_mountctl = hammer_vop_mountctl, 131 .vop_kqfilter = hammer_vop_kqfilter 132 }; 133 134 struct vop_ops hammer_spec_vops = { 135 .vop_default = spec_vnoperate, 136 .vop_fsync = hammer_vop_fsync, 137 .vop_read = hammer_vop_specread, 138 .vop_write = hammer_vop_specwrite, 139 .vop_access = hammer_vop_access, 140 .vop_close = hammer_vop_specclose, 141 .vop_markatime = hammer_vop_markatime, 142 .vop_getattr = hammer_vop_specgetattr, 143 .vop_inactive = hammer_vop_inactive, 144 .vop_reclaim = hammer_vop_reclaim, 145 .vop_setattr = hammer_vop_setattr 146 }; 147 148 struct vop_ops hammer_fifo_vops = { 149 .vop_default = fifo_vnoperate, 150 .vop_fsync = hammer_vop_fsync, 151 .vop_read = hammer_vop_fiforead, 152 .vop_write = hammer_vop_fifowrite, 153 .vop_access = hammer_vop_access, 154 .vop_close = hammer_vop_fifoclose, 155 .vop_markatime = hammer_vop_markatime, 156 .vop_getattr = hammer_vop_getattr, 157 .vop_inactive = hammer_vop_inactive, 158 .vop_reclaim = hammer_vop_reclaim, 159 .vop_setattr = hammer_vop_setattr, 160 .vop_kqfilter = hammer_vop_fifokqfilter 161 }; 162 163 static __inline 164 void 165 hammer_knote(struct vnode *vp, int flags) 166 { 167 if (flags) 168 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags); 169 } 170 171 #ifdef DEBUG_TRUNCATE 172 struct hammer_inode *HammerTruncIp; 173 #endif 174 175 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 176 struct vnode *dvp, struct ucred *cred, 177 int flags, int isdir); 178 static int hammer_vop_strategy_read(struct vop_strategy_args *ap); 179 static int hammer_vop_strategy_write(struct vop_strategy_args *ap); 180 181 #if 0 182 static 183 int 184 hammer_vop_vnoperate(struct vop_generic_args *) 185 { 186 return (VOCALL(&hammer_vnode_vops, ap)); 187 } 188 #endif 189 190 /* 191 * hammer_vop_fsync { vp, waitfor } 192 * 193 * fsync() an inode to disk and wait for it to be completely committed 194 * such that the information would not be undone if a crash occured after 195 * return. 196 */ 197 static 198 int 199 hammer_vop_fsync(struct vop_fsync_args *ap) 200 { 201 hammer_inode_t ip = VTOI(ap->a_vp); 202 203 ++hammer_count_fsyncs; 204 vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL); 205 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 206 if (ap->a_waitfor == MNT_WAIT) { 207 vn_unlock(ap->a_vp); 208 hammer_wait_inode(ip); 209 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); 210 } 211 return (ip->error); 212 } 213 214 /* 215 * hammer_vop_read { vp, uio, ioflag, cred } 216 */ 217 static 218 int 219 hammer_vop_read(struct vop_read_args *ap) 220 { 221 struct hammer_transaction trans; 222 hammer_inode_t ip; 223 off_t offset; 224 struct buf *bp; 225 struct uio *uio; 226 int error; 227 int n; 228 int seqcount; 229 int ioseqcount; 230 int blksize; 231 232 if (ap->a_vp->v_type != VREG) 233 return (EINVAL); 234 ip = VTOI(ap->a_vp); 235 error = 0; 236 uio = ap->a_uio; 237 238 /* 239 * Allow the UIO's size to override the sequential heuristic. 240 */ 241 blksize = hammer_blocksize(uio->uio_offset); 242 seqcount = (uio->uio_resid + (blksize - 1)) / blksize; 243 ioseqcount = ap->a_ioflag >> 16; 244 if (seqcount < ioseqcount) 245 seqcount = ioseqcount; 246 247 hammer_start_transaction(&trans, ip->hmp); 248 249 /* 250 * Access the data typically in HAMMER_BUFSIZE blocks via the 251 * buffer cache, but HAMMER may use a variable block size based 252 * on the offset. 253 */ 254 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) { 255 int64_t base_offset; 256 int64_t file_limit; 257 258 blksize = hammer_blocksize(uio->uio_offset); 259 offset = (int)uio->uio_offset & (blksize - 1); 260 base_offset = uio->uio_offset - offset; 261 262 if (hammer_cluster_enable) { 263 /* 264 * Use file_limit to prevent cluster_read() from 265 * creating buffers of the wrong block size past 266 * the demarc. 267 */ 268 file_limit = ip->ino_data.size; 269 if (base_offset < HAMMER_XDEMARC && 270 file_limit > HAMMER_XDEMARC) { 271 file_limit = HAMMER_XDEMARC; 272 } 273 error = cluster_read(ap->a_vp, 274 file_limit, base_offset, 275 blksize, MAXPHYS, 276 seqcount, &bp); 277 } else { 278 error = bread(ap->a_vp, base_offset, blksize, &bp); 279 } 280 if (error) { 281 kprintf("error %d\n", error); 282 brelse(bp); 283 break; 284 } 285 286 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 287 n = blksize - offset; 288 if (n > uio->uio_resid) 289 n = uio->uio_resid; 290 if (n > ip->ino_data.size - uio->uio_offset) 291 n = (int)(ip->ino_data.size - uio->uio_offset); 292 error = uiomove((char *)bp->b_data + offset, n, uio); 293 294 /* data has a lower priority then meta-data */ 295 bp->b_flags |= B_AGE; 296 bqrelse(bp); 297 if (error) 298 break; 299 hammer_stats_file_read += n; 300 } 301 if ((ip->flags & HAMMER_INODE_RO) == 0 && 302 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { 303 ip->ino_data.atime = trans.time; 304 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 305 } 306 hammer_done_transaction(&trans); 307 return (error); 308 } 309 310 /* 311 * hammer_vop_write { vp, uio, ioflag, cred } 312 */ 313 static 314 int 315 hammer_vop_write(struct vop_write_args *ap) 316 { 317 struct hammer_transaction trans; 318 struct hammer_inode *ip; 319 hammer_mount_t hmp; 320 struct uio *uio; 321 int offset; 322 off_t base_offset; 323 struct buf *bp; 324 int kflags; 325 int error; 326 int n; 327 int flags; 328 int delta; 329 int seqcount; 330 331 if (ap->a_vp->v_type != VREG) 332 return (EINVAL); 333 ip = VTOI(ap->a_vp); 334 hmp = ip->hmp; 335 error = 0; 336 kflags = 0; 337 seqcount = ap->a_ioflag >> 16; 338 339 if (ip->flags & HAMMER_INODE_RO) 340 return (EROFS); 341 342 /* 343 * Create a transaction to cover the operations we perform. 344 */ 345 hammer_start_transaction(&trans, hmp); 346 uio = ap->a_uio; 347 348 /* 349 * Check append mode 350 */ 351 if (ap->a_ioflag & IO_APPEND) 352 uio->uio_offset = ip->ino_data.size; 353 354 /* 355 * Check for illegal write offsets. Valid range is 0...2^63-1. 356 * 357 * NOTE: the base_off assignment is required to work around what 358 * I consider to be a GCC-4 optimization bug. 359 */ 360 if (uio->uio_offset < 0) { 361 hammer_done_transaction(&trans); 362 return (EFBIG); 363 } 364 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ 365 if (uio->uio_resid > 0 && base_offset <= 0) { 366 hammer_done_transaction(&trans); 367 return (EFBIG); 368 } 369 370 /* 371 * Access the data typically in HAMMER_BUFSIZE blocks via the 372 * buffer cache, but HAMMER may use a variable block size based 373 * on the offset. 374 */ 375 while (uio->uio_resid > 0) { 376 int fixsize = 0; 377 int blksize; 378 int blkmask; 379 380 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0) 381 break; 382 383 blksize = hammer_blocksize(uio->uio_offset); 384 385 /* 386 * Do not allow HAMMER to blow out the buffer cache. Very 387 * large UIOs can lockout other processes due to bwillwrite() 388 * mechanics. 389 * 390 * The hammer inode is not locked during these operations. 391 * The vnode is locked which can interfere with the pageout 392 * daemon for non-UIO_NOCOPY writes but should not interfere 393 * with the buffer cache. Even so, we cannot afford to 394 * allow the pageout daemon to build up too many dirty buffer 395 * cache buffers. 396 * 397 * Only call this if we aren't being recursively called from 398 * a virtual disk device (vn), else we may deadlock. 399 */ 400 if ((ap->a_ioflag & IO_RECURSE) == 0) 401 bwillwrite(blksize); 402 403 /* 404 * Do not allow HAMMER to blow out system memory by 405 * accumulating too many records. Records are so well 406 * decoupled from the buffer cache that it is possible 407 * for userland to push data out to the media via 408 * direct-write, but build up the records queued to the 409 * backend faster then the backend can flush them out. 410 * HAMMER has hit its write limit but the frontend has 411 * no pushback to slow it down. 412 */ 413 if (hmp->rsv_recs > hammer_limit_recs / 2) { 414 /* 415 * Get the inode on the flush list 416 */ 417 if (ip->rsv_recs >= 64) 418 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 419 else if (ip->rsv_recs >= 16) 420 hammer_flush_inode(ip, 0); 421 422 /* 423 * Keep the flusher going if the system keeps 424 * queueing records. 425 */ 426 delta = hmp->count_newrecords - 427 hmp->last_newrecords; 428 if (delta < 0 || delta > hammer_limit_recs / 2) { 429 hmp->last_newrecords = hmp->count_newrecords; 430 hammer_sync_hmp(hmp, MNT_NOWAIT); 431 } 432 433 /* 434 * If we have gotten behind start slowing 435 * down the writers. 436 */ 437 delta = (hmp->rsv_recs - hammer_limit_recs) * 438 hz / hammer_limit_recs; 439 if (delta > 0) 440 tsleep(&trans, 0, "hmrslo", delta); 441 } 442 443 /* 444 * Calculate the blocksize at the current offset and figure 445 * out how much we can actually write. 446 */ 447 blkmask = blksize - 1; 448 offset = (int)uio->uio_offset & blkmask; 449 base_offset = uio->uio_offset & ~(int64_t)blkmask; 450 n = blksize - offset; 451 if (n > uio->uio_resid) 452 n = uio->uio_resid; 453 if (uio->uio_offset + n > ip->ino_data.size) { 454 vnode_pager_setsize(ap->a_vp, uio->uio_offset + n); 455 fixsize = 1; 456 kflags |= NOTE_EXTEND; 457 } 458 459 if (uio->uio_segflg == UIO_NOCOPY) { 460 /* 461 * Issuing a write with the same data backing the 462 * buffer. Instantiate the buffer to collect the 463 * backing vm pages, then read-in any missing bits. 464 * 465 * This case is used by vop_stdputpages(). 466 */ 467 bp = getblk(ap->a_vp, base_offset, 468 blksize, GETBLK_BHEAVY, 0); 469 if ((bp->b_flags & B_CACHE) == 0) { 470 bqrelse(bp); 471 error = bread(ap->a_vp, base_offset, 472 blksize, &bp); 473 } 474 } else if (offset == 0 && uio->uio_resid >= blksize) { 475 /* 476 * Even though we are entirely overwriting the buffer 477 * we may still have to zero it out to avoid a 478 * mmap/write visibility issue. 479 */ 480 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0); 481 if ((bp->b_flags & B_CACHE) == 0) 482 vfs_bio_clrbuf(bp); 483 } else if (base_offset >= ip->ino_data.size) { 484 /* 485 * If the base offset of the buffer is beyond the 486 * file EOF, we don't have to issue a read. 487 */ 488 bp = getblk(ap->a_vp, base_offset, 489 blksize, GETBLK_BHEAVY, 0); 490 vfs_bio_clrbuf(bp); 491 } else { 492 /* 493 * Partial overwrite, read in any missing bits then 494 * replace the portion being written. 495 */ 496 error = bread(ap->a_vp, base_offset, blksize, &bp); 497 if (error == 0) 498 bheavy(bp); 499 } 500 if (error == 0) { 501 error = uiomove((char *)bp->b_data + offset, 502 n, uio); 503 } 504 505 /* 506 * If we screwed up we have to undo any VM size changes we 507 * made. 508 */ 509 if (error) { 510 brelse(bp); 511 if (fixsize) { 512 vtruncbuf(ap->a_vp, ip->ino_data.size, 513 hammer_blocksize(ip->ino_data.size)); 514 } 515 break; 516 } 517 kflags |= NOTE_WRITE; 518 hammer_stats_file_write += n; 519 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */ 520 if (ip->ino_data.size < uio->uio_offset) { 521 ip->ino_data.size = uio->uio_offset; 522 flags = HAMMER_INODE_DDIRTY; 523 vnode_pager_setsize(ap->a_vp, ip->ino_data.size); 524 } else { 525 flags = 0; 526 } 527 ip->ino_data.mtime = trans.time; 528 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; 529 hammer_modify_inode(ip, flags); 530 531 /* 532 * Once we dirty the buffer any cached zone-X offset 533 * becomes invalid. HAMMER NOTE: no-history mode cannot 534 * allow overwriting over the same data sector unless 535 * we provide UNDOs for the old data, which we don't. 536 */ 537 bp->b_bio2.bio_offset = NOOFFSET; 538 539 /* 540 * Final buffer disposition. 541 */ 542 bp->b_flags |= B_AGE; 543 if (ap->a_ioflag & IO_SYNC) { 544 bwrite(bp); 545 } else if (ap->a_ioflag & IO_DIRECT) { 546 bawrite(bp); 547 } else { 548 bdwrite(bp); 549 } 550 } 551 hammer_done_transaction(&trans); 552 hammer_knote(ap->a_vp, kflags); 553 return (error); 554 } 555 556 /* 557 * hammer_vop_access { vp, mode, cred } 558 */ 559 static 560 int 561 hammer_vop_access(struct vop_access_args *ap) 562 { 563 struct hammer_inode *ip = VTOI(ap->a_vp); 564 uid_t uid; 565 gid_t gid; 566 int error; 567 568 ++hammer_stats_file_iopsr; 569 uid = hammer_to_unix_xid(&ip->ino_data.uid); 570 gid = hammer_to_unix_xid(&ip->ino_data.gid); 571 572 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode, 573 ip->ino_data.uflags); 574 return (error); 575 } 576 577 /* 578 * hammer_vop_advlock { vp, id, op, fl, flags } 579 */ 580 static 581 int 582 hammer_vop_advlock(struct vop_advlock_args *ap) 583 { 584 hammer_inode_t ip = VTOI(ap->a_vp); 585 586 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size)); 587 } 588 589 /* 590 * hammer_vop_close { vp, fflag } 591 */ 592 static 593 int 594 hammer_vop_close(struct vop_close_args *ap) 595 { 596 /*hammer_inode_t ip = VTOI(ap->a_vp);*/ 597 return (vop_stdclose(ap)); 598 } 599 600 /* 601 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap } 602 * 603 * The operating system has already ensured that the directory entry 604 * does not exist and done all appropriate namespace locking. 605 */ 606 static 607 int 608 hammer_vop_ncreate(struct vop_ncreate_args *ap) 609 { 610 struct hammer_transaction trans; 611 struct hammer_inode *dip; 612 struct hammer_inode *nip; 613 struct nchandle *nch; 614 int error; 615 616 nch = ap->a_nch; 617 dip = VTOI(ap->a_dvp); 618 619 if (dip->flags & HAMMER_INODE_RO) 620 return (EROFS); 621 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 622 return (error); 623 624 /* 625 * Create a transaction to cover the operations we perform. 626 */ 627 hammer_start_transaction(&trans, dip->hmp); 628 ++hammer_stats_file_iopsw; 629 630 /* 631 * Create a new filesystem object of the requested type. The 632 * returned inode will be referenced and shared-locked to prevent 633 * it from being moved to the flusher. 634 */ 635 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 636 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 637 NULL, &nip); 638 if (error) { 639 hkprintf("hammer_create_inode error %d\n", error); 640 hammer_done_transaction(&trans); 641 *ap->a_vpp = NULL; 642 return (error); 643 } 644 645 /* 646 * Add the new filesystem object to the directory. This will also 647 * bump the inode's link count. 648 */ 649 error = hammer_ip_add_directory(&trans, dip, 650 nch->ncp->nc_name, nch->ncp->nc_nlen, 651 nip); 652 if (error) 653 hkprintf("hammer_ip_add_directory error %d\n", error); 654 655 /* 656 * Finish up. 657 */ 658 if (error) { 659 hammer_rel_inode(nip, 0); 660 hammer_done_transaction(&trans); 661 *ap->a_vpp = NULL; 662 } else { 663 error = hammer_get_vnode(nip, ap->a_vpp); 664 hammer_done_transaction(&trans); 665 hammer_rel_inode(nip, 0); 666 if (error == 0) { 667 cache_setunresolved(ap->a_nch); 668 cache_setvp(ap->a_nch, *ap->a_vpp); 669 } 670 hammer_knote(ap->a_dvp, NOTE_WRITE); 671 } 672 return (error); 673 } 674 675 /* 676 * hammer_vop_getattr { vp, vap } 677 * 678 * Retrieve an inode's attribute information. When accessing inodes 679 * historically we fake the atime field to ensure consistent results. 680 * The atime field is stored in the B-Tree element and allowed to be 681 * updated without cycling the element. 682 */ 683 static 684 int 685 hammer_vop_getattr(struct vop_getattr_args *ap) 686 { 687 struct hammer_inode *ip = VTOI(ap->a_vp); 688 struct vattr *vap = ap->a_vap; 689 690 /* 691 * We want the fsid to be different when accessing a filesystem 692 * with different as-of's so programs like diff don't think 693 * the files are the same. 694 * 695 * We also want the fsid to be the same when comparing snapshots, 696 * or when comparing mirrors (which might be backed by different 697 * physical devices). HAMMER fsids are based on the PFS's 698 * shared_uuid field. 699 * 700 * XXX there is a chance of collision here. The va_fsid reported 701 * by stat is different from the more involved fsid used in the 702 * mount structure. 703 */ 704 ++hammer_stats_file_iopsr; 705 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^ 706 (u_int32_t)(ip->obj_asof >> 32); 707 708 vap->va_fileid = ip->ino_leaf.base.obj_id; 709 vap->va_mode = ip->ino_data.mode; 710 vap->va_nlink = ip->ino_data.nlinks; 711 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid); 712 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid); 713 vap->va_rmajor = 0; 714 vap->va_rminor = 0; 715 vap->va_size = ip->ino_data.size; 716 717 /* 718 * Special case for @@PFS softlinks. The actual size of the 719 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes. 720 * or for MAX_TID is "@@-1:%05d" == 10 bytes. 721 */ 722 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK && 723 ip->ino_data.size == 10 && 724 ip->obj_asof == HAMMER_MAX_TID && 725 ip->obj_localization == 0 && 726 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) { 727 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) 728 vap->va_size = 26; 729 else 730 vap->va_size = 10; 731 } 732 733 /* 734 * We must provide a consistent atime and mtime for snapshots 735 * so people can do a 'tar cf - ... | md5' on them and get 736 * consistent results. 737 */ 738 if (ip->flags & HAMMER_INODE_RO) { 739 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime); 740 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime); 741 } else { 742 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime); 743 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime); 744 } 745 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime); 746 vap->va_flags = ip->ino_data.uflags; 747 vap->va_gen = 1; /* hammer inums are unique for all time */ 748 vap->va_blocksize = HAMMER_BUFSIZE; 749 if (ip->ino_data.size >= HAMMER_XDEMARC) { 750 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) & 751 ~HAMMER_XBUFMASK64; 752 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) { 753 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) & 754 ~HAMMER_BUFMASK64; 755 } else { 756 vap->va_bytes = (ip->ino_data.size + 15) & ~15; 757 } 758 759 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type); 760 vap->va_filerev = 0; /* XXX */ 761 /* mtime uniquely identifies any adjustments made to the file XXX */ 762 vap->va_fsmid = ip->ino_data.mtime; 763 vap->va_uid_uuid = ip->ino_data.uid; 764 vap->va_gid_uuid = ip->ino_data.gid; 765 vap->va_fsid_uuid = ip->hmp->fsid; 766 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID | 767 VA_FSID_UUID_VALID; 768 769 switch (ip->ino_data.obj_type) { 770 case HAMMER_OBJTYPE_CDEV: 771 case HAMMER_OBJTYPE_BDEV: 772 vap->va_rmajor = ip->ino_data.rmajor; 773 vap->va_rminor = ip->ino_data.rminor; 774 break; 775 default: 776 break; 777 } 778 return(0); 779 } 780 781 /* 782 * hammer_vop_nresolve { nch, dvp, cred } 783 * 784 * Locate the requested directory entry. 785 */ 786 static 787 int 788 hammer_vop_nresolve(struct vop_nresolve_args *ap) 789 { 790 struct hammer_transaction trans; 791 struct namecache *ncp; 792 hammer_inode_t dip; 793 hammer_inode_t ip; 794 hammer_tid_t asof; 795 struct hammer_cursor cursor; 796 struct vnode *vp; 797 int64_t namekey; 798 int error; 799 int i; 800 int nlen; 801 int flags; 802 int ispfs; 803 int64_t obj_id; 804 u_int32_t localization; 805 u_int32_t max_iterations; 806 807 /* 808 * Misc initialization, plus handle as-of name extensions. Look for 809 * the '@@' extension. Note that as-of files and directories cannot 810 * be modified. 811 */ 812 dip = VTOI(ap->a_dvp); 813 ncp = ap->a_nch->ncp; 814 asof = dip->obj_asof; 815 localization = dip->obj_localization; /* for code consistency */ 816 nlen = ncp->nc_nlen; 817 flags = dip->flags & HAMMER_INODE_RO; 818 ispfs = 0; 819 820 hammer_simple_transaction(&trans, dip->hmp); 821 ++hammer_stats_file_iopsr; 822 823 for (i = 0; i < nlen; ++i) { 824 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') { 825 error = hammer_str_to_tid(ncp->nc_name + i + 2, 826 &ispfs, &asof, &localization); 827 if (error != 0) { 828 i = nlen; 829 break; 830 } 831 if (asof != HAMMER_MAX_TID) 832 flags |= HAMMER_INODE_RO; 833 break; 834 } 835 } 836 nlen = i; 837 838 /* 839 * If this is a PFS softlink we dive into the PFS 840 */ 841 if (ispfs && nlen == 0) { 842 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT, 843 asof, localization, 844 flags, &error); 845 if (error == 0) { 846 error = hammer_get_vnode(ip, &vp); 847 hammer_rel_inode(ip, 0); 848 } else { 849 vp = NULL; 850 } 851 if (error == 0) { 852 vn_unlock(vp); 853 cache_setvp(ap->a_nch, vp); 854 vrele(vp); 855 } 856 goto done; 857 } 858 859 /* 860 * If there is no path component the time extension is relative to dip. 861 * e.g. "fubar/@@<snapshot>" 862 * 863 * "." is handled by the kernel, but ".@@<snapshot>" is not. 864 * e.g. "fubar/.@@<snapshot>" 865 * 866 * ".." is handled by the kernel. We do not currently handle 867 * "..@<snapshot>". 868 */ 869 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) { 870 ip = hammer_get_inode(&trans, dip, dip->obj_id, 871 asof, dip->obj_localization, 872 flags, &error); 873 if (error == 0) { 874 error = hammer_get_vnode(ip, &vp); 875 hammer_rel_inode(ip, 0); 876 } else { 877 vp = NULL; 878 } 879 if (error == 0) { 880 vn_unlock(vp); 881 cache_setvp(ap->a_nch, vp); 882 vrele(vp); 883 } 884 goto done; 885 } 886 887 /* 888 * Calculate the namekey and setup the key range for the scan. This 889 * works kinda like a chained hash table where the lower 32 bits 890 * of the namekey synthesize the chain. 891 * 892 * The key range is inclusive of both key_beg and key_end. 893 */ 894 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen, 895 &max_iterations); 896 897 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip); 898 cursor.key_beg.localization = dip->obj_localization + 899 HAMMER_LOCALIZE_MISC; 900 cursor.key_beg.obj_id = dip->obj_id; 901 cursor.key_beg.key = namekey; 902 cursor.key_beg.create_tid = 0; 903 cursor.key_beg.delete_tid = 0; 904 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 905 cursor.key_beg.obj_type = 0; 906 907 cursor.key_end = cursor.key_beg; 908 cursor.key_end.key += max_iterations; 909 cursor.asof = asof; 910 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 911 912 /* 913 * Scan all matching records (the chain), locate the one matching 914 * the requested path component. 915 * 916 * The hammer_ip_*() functions merge in-memory records with on-disk 917 * records for the purposes of the search. 918 */ 919 obj_id = 0; 920 localization = HAMMER_DEF_LOCALIZATION; 921 922 if (error == 0) { 923 error = hammer_ip_first(&cursor); 924 while (error == 0) { 925 error = hammer_ip_resolve_data(&cursor); 926 if (error) 927 break; 928 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF && 929 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 930 obj_id = cursor.data->entry.obj_id; 931 localization = cursor.data->entry.localization; 932 break; 933 } 934 error = hammer_ip_next(&cursor); 935 } 936 } 937 hammer_done_cursor(&cursor); 938 939 /* 940 * Lookup the obj_id. This should always succeed. If it does not 941 * the filesystem may be damaged and we return a dummy inode. 942 */ 943 if (error == 0) { 944 ip = hammer_get_inode(&trans, dip, obj_id, 945 asof, localization, 946 flags, &error); 947 if (error == ENOENT) { 948 kprintf("HAMMER: WARNING: Missing " 949 "inode for dirent \"%s\"\n" 950 "\tobj_id = %016llx\n", 951 ncp->nc_name, (long long)obj_id); 952 error = 0; 953 ip = hammer_get_dummy_inode(&trans, dip, obj_id, 954 asof, localization, 955 flags, &error); 956 } 957 if (error == 0) { 958 error = hammer_get_vnode(ip, &vp); 959 hammer_rel_inode(ip, 0); 960 } else { 961 vp = NULL; 962 } 963 if (error == 0) { 964 vn_unlock(vp); 965 cache_setvp(ap->a_nch, vp); 966 vrele(vp); 967 } 968 } else if (error == ENOENT) { 969 cache_setvp(ap->a_nch, NULL); 970 } 971 done: 972 hammer_done_transaction(&trans); 973 return (error); 974 } 975 976 /* 977 * hammer_vop_nlookupdotdot { dvp, vpp, cred } 978 * 979 * Locate the parent directory of a directory vnode. 980 * 981 * dvp is referenced but not locked. *vpp must be returned referenced and 982 * locked. A parent_obj_id of 0 does not necessarily indicate that we are 983 * at the root, instead it could indicate that the directory we were in was 984 * removed. 985 * 986 * NOTE: as-of sequences are not linked into the directory structure. If 987 * we are at the root with a different asof then the mount point, reload 988 * the same directory with the mount point's asof. I'm not sure what this 989 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not 990 * get confused, but it hasn't been tested. 991 */ 992 static 993 int 994 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) 995 { 996 struct hammer_transaction trans; 997 struct hammer_inode *dip; 998 struct hammer_inode *ip; 999 int64_t parent_obj_id; 1000 u_int32_t parent_obj_localization; 1001 hammer_tid_t asof; 1002 int error; 1003 1004 dip = VTOI(ap->a_dvp); 1005 asof = dip->obj_asof; 1006 1007 /* 1008 * Whos are parent? This could be the root of a pseudo-filesystem 1009 * whos parent is in another localization domain. 1010 */ 1011 parent_obj_id = dip->ino_data.parent_obj_id; 1012 if (dip->obj_id == HAMMER_OBJID_ROOT) 1013 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; 1014 else 1015 parent_obj_localization = dip->obj_localization; 1016 1017 if (parent_obj_id == 0) { 1018 if (dip->obj_id == HAMMER_OBJID_ROOT && 1019 asof != dip->hmp->asof) { 1020 parent_obj_id = dip->obj_id; 1021 asof = dip->hmp->asof; 1022 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); 1023 ksnprintf(*ap->a_fakename, 19, "0x%016llx", 1024 (long long)dip->obj_asof); 1025 } else { 1026 *ap->a_vpp = NULL; 1027 return ENOENT; 1028 } 1029 } 1030 1031 hammer_simple_transaction(&trans, dip->hmp); 1032 ++hammer_stats_file_iopsr; 1033 1034 ip = hammer_get_inode(&trans, dip, parent_obj_id, 1035 asof, parent_obj_localization, 1036 dip->flags, &error); 1037 if (ip) { 1038 error = hammer_get_vnode(ip, ap->a_vpp); 1039 hammer_rel_inode(ip, 0); 1040 } else { 1041 *ap->a_vpp = NULL; 1042 } 1043 hammer_done_transaction(&trans); 1044 return (error); 1045 } 1046 1047 /* 1048 * hammer_vop_nlink { nch, dvp, vp, cred } 1049 */ 1050 static 1051 int 1052 hammer_vop_nlink(struct vop_nlink_args *ap) 1053 { 1054 struct hammer_transaction trans; 1055 struct hammer_inode *dip; 1056 struct hammer_inode *ip; 1057 struct nchandle *nch; 1058 int error; 1059 1060 if (ap->a_dvp->v_mount != ap->a_vp->v_mount) 1061 return(EXDEV); 1062 1063 nch = ap->a_nch; 1064 dip = VTOI(ap->a_dvp); 1065 ip = VTOI(ap->a_vp); 1066 1067 if (dip->obj_localization != ip->obj_localization) 1068 return(EXDEV); 1069 1070 if (dip->flags & HAMMER_INODE_RO) 1071 return (EROFS); 1072 if (ip->flags & HAMMER_INODE_RO) 1073 return (EROFS); 1074 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1075 return (error); 1076 1077 /* 1078 * Create a transaction to cover the operations we perform. 1079 */ 1080 hammer_start_transaction(&trans, dip->hmp); 1081 ++hammer_stats_file_iopsw; 1082 1083 /* 1084 * Add the filesystem object to the directory. Note that neither 1085 * dip nor ip are referenced or locked, but their vnodes are 1086 * referenced. This function will bump the inode's link count. 1087 */ 1088 error = hammer_ip_add_directory(&trans, dip, 1089 nch->ncp->nc_name, nch->ncp->nc_nlen, 1090 ip); 1091 1092 /* 1093 * Finish up. 1094 */ 1095 if (error == 0) { 1096 cache_setunresolved(nch); 1097 cache_setvp(nch, ap->a_vp); 1098 } 1099 hammer_done_transaction(&trans); 1100 hammer_knote(ap->a_vp, NOTE_LINK); 1101 hammer_knote(ap->a_dvp, NOTE_WRITE); 1102 return (error); 1103 } 1104 1105 /* 1106 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap } 1107 * 1108 * The operating system has already ensured that the directory entry 1109 * does not exist and done all appropriate namespace locking. 1110 */ 1111 static 1112 int 1113 hammer_vop_nmkdir(struct vop_nmkdir_args *ap) 1114 { 1115 struct hammer_transaction trans; 1116 struct hammer_inode *dip; 1117 struct hammer_inode *nip; 1118 struct nchandle *nch; 1119 int error; 1120 1121 nch = ap->a_nch; 1122 dip = VTOI(ap->a_dvp); 1123 1124 if (dip->flags & HAMMER_INODE_RO) 1125 return (EROFS); 1126 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1127 return (error); 1128 1129 /* 1130 * Create a transaction to cover the operations we perform. 1131 */ 1132 hammer_start_transaction(&trans, dip->hmp); 1133 ++hammer_stats_file_iopsw; 1134 1135 /* 1136 * Create a new filesystem object of the requested type. The 1137 * returned inode will be referenced but not locked. 1138 */ 1139 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1140 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1141 NULL, &nip); 1142 if (error) { 1143 hkprintf("hammer_mkdir error %d\n", error); 1144 hammer_done_transaction(&trans); 1145 *ap->a_vpp = NULL; 1146 return (error); 1147 } 1148 /* 1149 * Add the new filesystem object to the directory. This will also 1150 * bump the inode's link count. 1151 */ 1152 error = hammer_ip_add_directory(&trans, dip, 1153 nch->ncp->nc_name, nch->ncp->nc_nlen, 1154 nip); 1155 if (error) 1156 hkprintf("hammer_mkdir (add) error %d\n", error); 1157 1158 /* 1159 * Finish up. 1160 */ 1161 if (error) { 1162 hammer_rel_inode(nip, 0); 1163 *ap->a_vpp = NULL; 1164 } else { 1165 error = hammer_get_vnode(nip, ap->a_vpp); 1166 hammer_rel_inode(nip, 0); 1167 if (error == 0) { 1168 cache_setunresolved(ap->a_nch); 1169 cache_setvp(ap->a_nch, *ap->a_vpp); 1170 } 1171 } 1172 hammer_done_transaction(&trans); 1173 if (error == 0) 1174 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1175 return (error); 1176 } 1177 1178 /* 1179 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap } 1180 * 1181 * The operating system has already ensured that the directory entry 1182 * does not exist and done all appropriate namespace locking. 1183 */ 1184 static 1185 int 1186 hammer_vop_nmknod(struct vop_nmknod_args *ap) 1187 { 1188 struct hammer_transaction trans; 1189 struct hammer_inode *dip; 1190 struct hammer_inode *nip; 1191 struct nchandle *nch; 1192 int error; 1193 1194 nch = ap->a_nch; 1195 dip = VTOI(ap->a_dvp); 1196 1197 if (dip->flags & HAMMER_INODE_RO) 1198 return (EROFS); 1199 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1200 return (error); 1201 1202 /* 1203 * Create a transaction to cover the operations we perform. 1204 */ 1205 hammer_start_transaction(&trans, dip->hmp); 1206 ++hammer_stats_file_iopsw; 1207 1208 /* 1209 * Create a new filesystem object of the requested type. The 1210 * returned inode will be referenced but not locked. 1211 * 1212 * If mknod specifies a directory a pseudo-fs is created. 1213 */ 1214 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 1215 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 1216 NULL, &nip); 1217 if (error) { 1218 hammer_done_transaction(&trans); 1219 *ap->a_vpp = NULL; 1220 return (error); 1221 } 1222 1223 /* 1224 * Add the new filesystem object to the directory. This will also 1225 * bump the inode's link count. 1226 */ 1227 error = hammer_ip_add_directory(&trans, dip, 1228 nch->ncp->nc_name, nch->ncp->nc_nlen, 1229 nip); 1230 1231 /* 1232 * Finish up. 1233 */ 1234 if (error) { 1235 hammer_rel_inode(nip, 0); 1236 *ap->a_vpp = NULL; 1237 } else { 1238 error = hammer_get_vnode(nip, ap->a_vpp); 1239 hammer_rel_inode(nip, 0); 1240 if (error == 0) { 1241 cache_setunresolved(ap->a_nch); 1242 cache_setvp(ap->a_nch, *ap->a_vpp); 1243 } 1244 } 1245 hammer_done_transaction(&trans); 1246 if (error == 0) 1247 hammer_knote(ap->a_dvp, NOTE_WRITE); 1248 return (error); 1249 } 1250 1251 /* 1252 * hammer_vop_open { vp, mode, cred, fp } 1253 */ 1254 static 1255 int 1256 hammer_vop_open(struct vop_open_args *ap) 1257 { 1258 hammer_inode_t ip; 1259 1260 ++hammer_stats_file_iopsr; 1261 ip = VTOI(ap->a_vp); 1262 1263 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO)) 1264 return (EROFS); 1265 return(vop_stdopen(ap)); 1266 } 1267 1268 /* 1269 * hammer_vop_print { vp } 1270 */ 1271 static 1272 int 1273 hammer_vop_print(struct vop_print_args *ap) 1274 { 1275 return EOPNOTSUPP; 1276 } 1277 1278 /* 1279 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies } 1280 */ 1281 static 1282 int 1283 hammer_vop_readdir(struct vop_readdir_args *ap) 1284 { 1285 struct hammer_transaction trans; 1286 struct hammer_cursor cursor; 1287 struct hammer_inode *ip; 1288 struct uio *uio; 1289 hammer_base_elm_t base; 1290 int error; 1291 int cookie_index; 1292 int ncookies; 1293 off_t *cookies; 1294 off_t saveoff; 1295 int r; 1296 int dtype; 1297 1298 ++hammer_stats_file_iopsr; 1299 ip = VTOI(ap->a_vp); 1300 uio = ap->a_uio; 1301 saveoff = uio->uio_offset; 1302 1303 if (ap->a_ncookies) { 1304 ncookies = uio->uio_resid / 16 + 1; 1305 if (ncookies > 1024) 1306 ncookies = 1024; 1307 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK); 1308 cookie_index = 0; 1309 } else { 1310 ncookies = -1; 1311 cookies = NULL; 1312 cookie_index = 0; 1313 } 1314 1315 hammer_simple_transaction(&trans, ip->hmp); 1316 1317 /* 1318 * Handle artificial entries 1319 * 1320 * It should be noted that the minimum value for a directory 1321 * hash key on-media is 0x0000000100000000, so we can use anything 1322 * less then that to represent our 'special' key space. 1323 */ 1324 error = 0; 1325 if (saveoff == 0) { 1326 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, "."); 1327 if (r) 1328 goto done; 1329 if (cookies) 1330 cookies[cookie_index] = saveoff; 1331 ++saveoff; 1332 ++cookie_index; 1333 if (cookie_index == ncookies) 1334 goto done; 1335 } 1336 if (saveoff == 1) { 1337 if (ip->ino_data.parent_obj_id) { 1338 r = vop_write_dirent(&error, uio, 1339 ip->ino_data.parent_obj_id, 1340 DT_DIR, 2, ".."); 1341 } else { 1342 r = vop_write_dirent(&error, uio, 1343 ip->obj_id, DT_DIR, 2, ".."); 1344 } 1345 if (r) 1346 goto done; 1347 if (cookies) 1348 cookies[cookie_index] = saveoff; 1349 ++saveoff; 1350 ++cookie_index; 1351 if (cookie_index == ncookies) 1352 goto done; 1353 } 1354 1355 /* 1356 * Key range (begin and end inclusive) to scan. Directory keys 1357 * directly translate to a 64 bit 'seek' position. 1358 */ 1359 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1360 cursor.key_beg.localization = ip->obj_localization + 1361 HAMMER_LOCALIZE_MISC; 1362 cursor.key_beg.obj_id = ip->obj_id; 1363 cursor.key_beg.create_tid = 0; 1364 cursor.key_beg.delete_tid = 0; 1365 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1366 cursor.key_beg.obj_type = 0; 1367 cursor.key_beg.key = saveoff; 1368 1369 cursor.key_end = cursor.key_beg; 1370 cursor.key_end.key = HAMMER_MAX_KEY; 1371 cursor.asof = ip->obj_asof; 1372 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1373 1374 error = hammer_ip_first(&cursor); 1375 1376 while (error == 0) { 1377 error = hammer_ip_resolve_data(&cursor); 1378 if (error) 1379 break; 1380 base = &cursor.leaf->base; 1381 saveoff = base->key; 1382 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF); 1383 1384 if (base->obj_id != ip->obj_id) 1385 panic("readdir: bad record at %p", cursor.node); 1386 1387 /* 1388 * Convert pseudo-filesystems into softlinks 1389 */ 1390 dtype = hammer_get_dtype(cursor.leaf->base.obj_type); 1391 r = vop_write_dirent( 1392 &error, uio, cursor.data->entry.obj_id, 1393 dtype, 1394 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF , 1395 (void *)cursor.data->entry.name); 1396 if (r) 1397 break; 1398 ++saveoff; 1399 if (cookies) 1400 cookies[cookie_index] = base->key; 1401 ++cookie_index; 1402 if (cookie_index == ncookies) 1403 break; 1404 error = hammer_ip_next(&cursor); 1405 } 1406 hammer_done_cursor(&cursor); 1407 1408 done: 1409 hammer_done_transaction(&trans); 1410 1411 if (ap->a_eofflag) 1412 *ap->a_eofflag = (error == ENOENT); 1413 uio->uio_offset = saveoff; 1414 if (error && cookie_index == 0) { 1415 if (error == ENOENT) 1416 error = 0; 1417 if (cookies) { 1418 kfree(cookies, M_TEMP); 1419 *ap->a_ncookies = 0; 1420 *ap->a_cookies = NULL; 1421 } 1422 } else { 1423 if (error == ENOENT) 1424 error = 0; 1425 if (cookies) { 1426 *ap->a_ncookies = cookie_index; 1427 *ap->a_cookies = cookies; 1428 } 1429 } 1430 return(error); 1431 } 1432 1433 /* 1434 * hammer_vop_readlink { vp, uio, cred } 1435 */ 1436 static 1437 int 1438 hammer_vop_readlink(struct vop_readlink_args *ap) 1439 { 1440 struct hammer_transaction trans; 1441 struct hammer_cursor cursor; 1442 struct hammer_inode *ip; 1443 char buf[32]; 1444 u_int32_t localization; 1445 hammer_pseudofs_inmem_t pfsm; 1446 int error; 1447 1448 ip = VTOI(ap->a_vp); 1449 1450 /* 1451 * Shortcut if the symlink data was stuffed into ino_data. 1452 * 1453 * Also expand special "@@PFS%05d" softlinks (expansion only 1454 * occurs for non-historical (current) accesses made from the 1455 * primary filesystem). 1456 */ 1457 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) { 1458 char *ptr; 1459 int bytes; 1460 1461 ptr = ip->ino_data.ext.symlink; 1462 bytes = (int)ip->ino_data.size; 1463 if (bytes == 10 && 1464 ip->obj_asof == HAMMER_MAX_TID && 1465 ip->obj_localization == 0 && 1466 strncmp(ptr, "@@PFS", 5) == 0) { 1467 hammer_simple_transaction(&trans, ip->hmp); 1468 bcopy(ptr + 5, buf, 5); 1469 buf[5] = 0; 1470 localization = strtoul(buf, NULL, 10) << 16; 1471 pfsm = hammer_load_pseudofs(&trans, localization, 1472 &error); 1473 if (error == 0) { 1474 if (pfsm->pfsd.mirror_flags & 1475 HAMMER_PFSD_SLAVE) { 1476 /* vap->va_size == 26 */ 1477 ksnprintf(buf, sizeof(buf), 1478 "@@0x%016llx:%05d", 1479 (long long)pfsm->pfsd.sync_end_tid, 1480 localization >> 16); 1481 } else { 1482 /* vap->va_size == 10 */ 1483 ksnprintf(buf, sizeof(buf), 1484 "@@-1:%05d", 1485 localization >> 16); 1486 #if 0 1487 ksnprintf(buf, sizeof(buf), 1488 "@@0x%016llx:%05d", 1489 (long long)HAMMER_MAX_TID, 1490 localization >> 16); 1491 #endif 1492 } 1493 ptr = buf; 1494 bytes = strlen(buf); 1495 } 1496 if (pfsm) 1497 hammer_rel_pseudofs(trans.hmp, pfsm); 1498 hammer_done_transaction(&trans); 1499 } 1500 error = uiomove(ptr, bytes, ap->a_uio); 1501 return(error); 1502 } 1503 1504 /* 1505 * Long version 1506 */ 1507 hammer_simple_transaction(&trans, ip->hmp); 1508 ++hammer_stats_file_iopsr; 1509 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 1510 1511 /* 1512 * Key range (begin and end inclusive) to scan. Directory keys 1513 * directly translate to a 64 bit 'seek' position. 1514 */ 1515 cursor.key_beg.localization = ip->obj_localization + 1516 HAMMER_LOCALIZE_MISC; 1517 cursor.key_beg.obj_id = ip->obj_id; 1518 cursor.key_beg.create_tid = 0; 1519 cursor.key_beg.delete_tid = 0; 1520 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX; 1521 cursor.key_beg.obj_type = 0; 1522 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK; 1523 cursor.asof = ip->obj_asof; 1524 cursor.flags |= HAMMER_CURSOR_ASOF; 1525 1526 error = hammer_ip_lookup(&cursor); 1527 if (error == 0) { 1528 error = hammer_ip_resolve_data(&cursor); 1529 if (error == 0) { 1530 KKASSERT(cursor.leaf->data_len >= 1531 HAMMER_SYMLINK_NAME_OFF); 1532 error = uiomove(cursor.data->symlink.name, 1533 cursor.leaf->data_len - 1534 HAMMER_SYMLINK_NAME_OFF, 1535 ap->a_uio); 1536 } 1537 } 1538 hammer_done_cursor(&cursor); 1539 hammer_done_transaction(&trans); 1540 return(error); 1541 } 1542 1543 /* 1544 * hammer_vop_nremove { nch, dvp, cred } 1545 */ 1546 static 1547 int 1548 hammer_vop_nremove(struct vop_nremove_args *ap) 1549 { 1550 struct hammer_transaction trans; 1551 struct hammer_inode *dip; 1552 int error; 1553 1554 dip = VTOI(ap->a_dvp); 1555 1556 if (hammer_nohistory(dip) == 0 && 1557 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1558 return (error); 1559 } 1560 1561 hammer_start_transaction(&trans, dip->hmp); 1562 ++hammer_stats_file_iopsw; 1563 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); 1564 hammer_done_transaction(&trans); 1565 if (error == 0) 1566 hammer_knote(ap->a_dvp, NOTE_WRITE); 1567 return (error); 1568 } 1569 1570 /* 1571 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred } 1572 */ 1573 static 1574 int 1575 hammer_vop_nrename(struct vop_nrename_args *ap) 1576 { 1577 struct hammer_transaction trans; 1578 struct namecache *fncp; 1579 struct namecache *tncp; 1580 struct hammer_inode *fdip; 1581 struct hammer_inode *tdip; 1582 struct hammer_inode *ip; 1583 struct hammer_cursor cursor; 1584 int64_t namekey; 1585 u_int32_t max_iterations; 1586 int nlen, error; 1587 1588 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 1589 return(EXDEV); 1590 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount) 1591 return(EXDEV); 1592 1593 fdip = VTOI(ap->a_fdvp); 1594 tdip = VTOI(ap->a_tdvp); 1595 fncp = ap->a_fnch->ncp; 1596 tncp = ap->a_tnch->ncp; 1597 ip = VTOI(fncp->nc_vp); 1598 KKASSERT(ip != NULL); 1599 1600 if (fdip->obj_localization != tdip->obj_localization) 1601 return(EXDEV); 1602 if (fdip->obj_localization != ip->obj_localization) 1603 return(EXDEV); 1604 1605 if (fdip->flags & HAMMER_INODE_RO) 1606 return (EROFS); 1607 if (tdip->flags & HAMMER_INODE_RO) 1608 return (EROFS); 1609 if (ip->flags & HAMMER_INODE_RO) 1610 return (EROFS); 1611 if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 1612 return (error); 1613 1614 hammer_start_transaction(&trans, fdip->hmp); 1615 ++hammer_stats_file_iopsw; 1616 1617 /* 1618 * Remove tncp from the target directory and then link ip as 1619 * tncp. XXX pass trans to dounlink 1620 * 1621 * Force the inode sync-time to match the transaction so it is 1622 * in-sync with the creation of the target directory entry. 1623 */ 1624 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, 1625 ap->a_cred, 0, -1); 1626 if (error == 0 || error == ENOENT) { 1627 error = hammer_ip_add_directory(&trans, tdip, 1628 tncp->nc_name, tncp->nc_nlen, 1629 ip); 1630 if (error == 0) { 1631 ip->ino_data.parent_obj_id = tdip->obj_id; 1632 ip->ino_data.ctime = trans.time; 1633 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); 1634 } 1635 } 1636 if (error) 1637 goto failed; /* XXX */ 1638 1639 /* 1640 * Locate the record in the originating directory and remove it. 1641 * 1642 * Calculate the namekey and setup the key range for the scan. This 1643 * works kinda like a chained hash table where the lower 32 bits 1644 * of the namekey synthesize the chain. 1645 * 1646 * The key range is inclusive of both key_beg and key_end. 1647 */ 1648 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen, 1649 &max_iterations); 1650 retry: 1651 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip); 1652 cursor.key_beg.localization = fdip->obj_localization + 1653 HAMMER_LOCALIZE_MISC; 1654 cursor.key_beg.obj_id = fdip->obj_id; 1655 cursor.key_beg.key = namekey; 1656 cursor.key_beg.create_tid = 0; 1657 cursor.key_beg.delete_tid = 0; 1658 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 1659 cursor.key_beg.obj_type = 0; 1660 1661 cursor.key_end = cursor.key_beg; 1662 cursor.key_end.key += max_iterations; 1663 cursor.asof = fdip->obj_asof; 1664 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1665 1666 /* 1667 * Scan all matching records (the chain), locate the one matching 1668 * the requested path component. 1669 * 1670 * The hammer_ip_*() functions merge in-memory records with on-disk 1671 * records for the purposes of the search. 1672 */ 1673 error = hammer_ip_first(&cursor); 1674 while (error == 0) { 1675 if (hammer_ip_resolve_data(&cursor) != 0) 1676 break; 1677 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 1678 KKASSERT(nlen > 0); 1679 if (fncp->nc_nlen == nlen && 1680 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) { 1681 break; 1682 } 1683 error = hammer_ip_next(&cursor); 1684 } 1685 1686 /* 1687 * If all is ok we have to get the inode so we can adjust nlinks. 1688 * 1689 * WARNING: hammer_ip_del_directory() may have to terminate the 1690 * cursor to avoid a recursion. It's ok to call hammer_done_cursor() 1691 * twice. 1692 */ 1693 if (error == 0) 1694 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); 1695 1696 /* 1697 * XXX A deadlock here will break rename's atomicy for the purposes 1698 * of crash recovery. 1699 */ 1700 if (error == EDEADLK) { 1701 hammer_done_cursor(&cursor); 1702 goto retry; 1703 } 1704 1705 /* 1706 * Cleanup and tell the kernel that the rename succeeded. 1707 */ 1708 hammer_done_cursor(&cursor); 1709 if (error == 0) { 1710 cache_rename(ap->a_fnch, ap->a_tnch); 1711 hammer_knote(ap->a_fdvp, NOTE_WRITE); 1712 hammer_knote(ap->a_tdvp, NOTE_WRITE); 1713 if (ip->vp) 1714 hammer_knote(ip->vp, NOTE_RENAME); 1715 } 1716 1717 failed: 1718 hammer_done_transaction(&trans); 1719 return (error); 1720 } 1721 1722 /* 1723 * hammer_vop_nrmdir { nch, dvp, cred } 1724 */ 1725 static 1726 int 1727 hammer_vop_nrmdir(struct vop_nrmdir_args *ap) 1728 { 1729 struct hammer_transaction trans; 1730 struct hammer_inode *dip; 1731 int error; 1732 1733 dip = VTOI(ap->a_dvp); 1734 1735 if (hammer_nohistory(dip) == 0 && 1736 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1737 return (error); 1738 } 1739 1740 hammer_start_transaction(&trans, dip->hmp); 1741 ++hammer_stats_file_iopsw; 1742 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); 1743 hammer_done_transaction(&trans); 1744 if (error == 0) 1745 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); 1746 return (error); 1747 } 1748 1749 /* 1750 * hammer_vop_markatime { vp, cred } 1751 */ 1752 static 1753 int 1754 hammer_vop_markatime(struct vop_markatime_args *ap) 1755 { 1756 struct hammer_transaction trans; 1757 struct hammer_inode *ip; 1758 1759 ip = VTOI(ap->a_vp); 1760 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1761 return (EROFS); 1762 if (ip->flags & HAMMER_INODE_RO) 1763 return (EROFS); 1764 if (ip->hmp->mp->mnt_flag & MNT_NOATIME) 1765 return (0); 1766 hammer_start_transaction(&trans, ip->hmp); 1767 ++hammer_stats_file_iopsw; 1768 1769 ip->ino_data.atime = trans.time; 1770 hammer_modify_inode(ip, HAMMER_INODE_ATIME); 1771 hammer_done_transaction(&trans); 1772 hammer_knote(ap->a_vp, NOTE_ATTRIB); 1773 return (0); 1774 } 1775 1776 /* 1777 * hammer_vop_setattr { vp, vap, cred } 1778 */ 1779 static 1780 int 1781 hammer_vop_setattr(struct vop_setattr_args *ap) 1782 { 1783 struct hammer_transaction trans; 1784 struct vattr *vap; 1785 struct hammer_inode *ip; 1786 int modflags; 1787 int error; 1788 int truncating; 1789 int blksize; 1790 int kflags; 1791 int64_t aligned_size; 1792 u_int32_t flags; 1793 1794 vap = ap->a_vap; 1795 ip = ap->a_vp->v_data; 1796 modflags = 0; 1797 kflags = 0; 1798 1799 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) 1800 return(EROFS); 1801 if (ip->flags & HAMMER_INODE_RO) 1802 return (EROFS); 1803 if (hammer_nohistory(ip) == 0 && 1804 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { 1805 return (error); 1806 } 1807 1808 hammer_start_transaction(&trans, ip->hmp); 1809 ++hammer_stats_file_iopsw; 1810 error = 0; 1811 1812 if (vap->va_flags != VNOVAL) { 1813 flags = ip->ino_data.uflags; 1814 error = vop_helper_setattr_flags(&flags, vap->va_flags, 1815 hammer_to_unix_xid(&ip->ino_data.uid), 1816 ap->a_cred); 1817 if (error == 0) { 1818 if (ip->ino_data.uflags != flags) { 1819 ip->ino_data.uflags = flags; 1820 ip->ino_data.ctime = trans.time; 1821 modflags |= HAMMER_INODE_DDIRTY; 1822 kflags |= NOTE_ATTRIB; 1823 } 1824 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1825 error = 0; 1826 goto done; 1827 } 1828 } 1829 goto done; 1830 } 1831 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { 1832 error = EPERM; 1833 goto done; 1834 } 1835 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { 1836 mode_t cur_mode = ip->ino_data.mode; 1837 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1838 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1839 uuid_t uuid_uid; 1840 uuid_t uuid_gid; 1841 1842 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid, 1843 ap->a_cred, 1844 &cur_uid, &cur_gid, &cur_mode); 1845 if (error == 0) { 1846 hammer_guid_to_uuid(&uuid_uid, cur_uid); 1847 hammer_guid_to_uuid(&uuid_gid, cur_gid); 1848 if (bcmp(&uuid_uid, &ip->ino_data.uid, 1849 sizeof(uuid_uid)) || 1850 bcmp(&uuid_gid, &ip->ino_data.gid, 1851 sizeof(uuid_gid)) || 1852 ip->ino_data.mode != cur_mode 1853 ) { 1854 ip->ino_data.uid = uuid_uid; 1855 ip->ino_data.gid = uuid_gid; 1856 ip->ino_data.mode = cur_mode; 1857 ip->ino_data.ctime = trans.time; 1858 modflags |= HAMMER_INODE_DDIRTY; 1859 } 1860 kflags |= NOTE_ATTRIB; 1861 } 1862 } 1863 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) { 1864 switch(ap->a_vp->v_type) { 1865 case VREG: 1866 if (vap->va_size == ip->ino_data.size) 1867 break; 1868 /* 1869 * XXX break atomicy, we can deadlock the backend 1870 * if we do not release the lock. Probably not a 1871 * big deal here. 1872 */ 1873 blksize = hammer_blocksize(vap->va_size); 1874 if (vap->va_size < ip->ino_data.size) { 1875 vtruncbuf(ap->a_vp, vap->va_size, blksize); 1876 truncating = 1; 1877 kflags |= NOTE_WRITE; 1878 } else { 1879 vnode_pager_setsize(ap->a_vp, vap->va_size); 1880 truncating = 0; 1881 kflags |= NOTE_WRITE | NOTE_EXTEND; 1882 } 1883 ip->ino_data.size = vap->va_size; 1884 ip->ino_data.mtime = trans.time; 1885 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 1886 1887 /* 1888 * on-media truncation is cached in the inode until 1889 * the inode is synchronized. 1890 */ 1891 if (truncating) { 1892 hammer_ip_frontend_trunc(ip, vap->va_size); 1893 #ifdef DEBUG_TRUNCATE 1894 if (HammerTruncIp == NULL) 1895 HammerTruncIp = ip; 1896 #endif 1897 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1898 ip->flags |= HAMMER_INODE_TRUNCATED; 1899 ip->trunc_off = vap->va_size; 1900 #ifdef DEBUG_TRUNCATE 1901 if (ip == HammerTruncIp) 1902 kprintf("truncate1 %016llx\n", 1903 (long long)ip->trunc_off); 1904 #endif 1905 } else if (ip->trunc_off > vap->va_size) { 1906 ip->trunc_off = vap->va_size; 1907 #ifdef DEBUG_TRUNCATE 1908 if (ip == HammerTruncIp) 1909 kprintf("truncate2 %016llx\n", 1910 (long long)ip->trunc_off); 1911 #endif 1912 } else { 1913 #ifdef DEBUG_TRUNCATE 1914 if (ip == HammerTruncIp) 1915 kprintf("truncate3 %016llx (ignored)\n", 1916 (long long)vap->va_size); 1917 #endif 1918 } 1919 } 1920 1921 /* 1922 * If truncating we have to clean out a portion of 1923 * the last block on-disk. We do this in the 1924 * front-end buffer cache. 1925 */ 1926 aligned_size = (vap->va_size + (blksize - 1)) & 1927 ~(int64_t)(blksize - 1); 1928 if (truncating && vap->va_size < aligned_size) { 1929 struct buf *bp; 1930 int offset; 1931 1932 aligned_size -= blksize; 1933 1934 offset = (int)vap->va_size & (blksize - 1); 1935 error = bread(ap->a_vp, aligned_size, 1936 blksize, &bp); 1937 hammer_ip_frontend_trunc(ip, aligned_size); 1938 if (error == 0) { 1939 bzero(bp->b_data + offset, 1940 blksize - offset); 1941 /* must de-cache direct-io offset */ 1942 bp->b_bio2.bio_offset = NOOFFSET; 1943 bdwrite(bp); 1944 } else { 1945 kprintf("ERROR %d\n", error); 1946 brelse(bp); 1947 } 1948 } 1949 break; 1950 case VDATABASE: 1951 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) { 1952 ip->flags |= HAMMER_INODE_TRUNCATED; 1953 ip->trunc_off = vap->va_size; 1954 } else if (ip->trunc_off > vap->va_size) { 1955 ip->trunc_off = vap->va_size; 1956 } 1957 hammer_ip_frontend_trunc(ip, vap->va_size); 1958 ip->ino_data.size = vap->va_size; 1959 ip->ino_data.mtime = trans.time; 1960 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY; 1961 kflags |= NOTE_ATTRIB; 1962 break; 1963 default: 1964 error = EINVAL; 1965 goto done; 1966 } 1967 break; 1968 } 1969 if (vap->va_atime.tv_sec != VNOVAL) { 1970 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime); 1971 modflags |= HAMMER_INODE_ATIME; 1972 kflags |= NOTE_ATTRIB; 1973 } 1974 if (vap->va_mtime.tv_sec != VNOVAL) { 1975 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime); 1976 modflags |= HAMMER_INODE_MTIME; 1977 kflags |= NOTE_ATTRIB; 1978 } 1979 if (vap->va_mode != (mode_t)VNOVAL) { 1980 mode_t cur_mode = ip->ino_data.mode; 1981 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid); 1982 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid); 1983 1984 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred, 1985 cur_uid, cur_gid, &cur_mode); 1986 if (error == 0 && ip->ino_data.mode != cur_mode) { 1987 ip->ino_data.mode = cur_mode; 1988 ip->ino_data.ctime = trans.time; 1989 modflags |= HAMMER_INODE_DDIRTY; 1990 kflags |= NOTE_ATTRIB; 1991 } 1992 } 1993 done: 1994 if (error == 0) 1995 hammer_modify_inode(ip, modflags); 1996 hammer_done_transaction(&trans); 1997 hammer_knote(ap->a_vp, kflags); 1998 return (error); 1999 } 2000 2001 /* 2002 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target } 2003 */ 2004 static 2005 int 2006 hammer_vop_nsymlink(struct vop_nsymlink_args *ap) 2007 { 2008 struct hammer_transaction trans; 2009 struct hammer_inode *dip; 2010 struct hammer_inode *nip; 2011 struct nchandle *nch; 2012 hammer_record_t record; 2013 int error; 2014 int bytes; 2015 2016 ap->a_vap->va_type = VLNK; 2017 2018 nch = ap->a_nch; 2019 dip = VTOI(ap->a_dvp); 2020 2021 if (dip->flags & HAMMER_INODE_RO) 2022 return (EROFS); 2023 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) 2024 return (error); 2025 2026 /* 2027 * Create a transaction to cover the operations we perform. 2028 */ 2029 hammer_start_transaction(&trans, dip->hmp); 2030 ++hammer_stats_file_iopsw; 2031 2032 /* 2033 * Create a new filesystem object of the requested type. The 2034 * returned inode will be referenced but not locked. 2035 */ 2036 2037 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, 2038 dip, nch->ncp->nc_name, nch->ncp->nc_nlen, 2039 NULL, &nip); 2040 if (error) { 2041 hammer_done_transaction(&trans); 2042 *ap->a_vpp = NULL; 2043 return (error); 2044 } 2045 2046 /* 2047 * Add a record representing the symlink. symlink stores the link 2048 * as pure data, not a string, and is no \0 terminated. 2049 */ 2050 if (error == 0) { 2051 bytes = strlen(ap->a_target); 2052 2053 if (bytes <= HAMMER_INODE_BASESYMLEN) { 2054 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes); 2055 } else { 2056 record = hammer_alloc_mem_record(nip, bytes); 2057 record->type = HAMMER_MEM_RECORD_GENERAL; 2058 2059 record->leaf.base.localization = nip->obj_localization + 2060 HAMMER_LOCALIZE_MISC; 2061 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK; 2062 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX; 2063 record->leaf.data_len = bytes; 2064 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0); 2065 bcopy(ap->a_target, record->data->symlink.name, bytes); 2066 error = hammer_ip_add_record(&trans, record); 2067 } 2068 2069 /* 2070 * Set the file size to the length of the link. 2071 */ 2072 if (error == 0) { 2073 nip->ino_data.size = bytes; 2074 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY); 2075 } 2076 } 2077 if (error == 0) 2078 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name, 2079 nch->ncp->nc_nlen, nip); 2080 2081 /* 2082 * Finish up. 2083 */ 2084 if (error) { 2085 hammer_rel_inode(nip, 0); 2086 *ap->a_vpp = NULL; 2087 } else { 2088 error = hammer_get_vnode(nip, ap->a_vpp); 2089 hammer_rel_inode(nip, 0); 2090 if (error == 0) { 2091 cache_setunresolved(ap->a_nch); 2092 cache_setvp(ap->a_nch, *ap->a_vpp); 2093 hammer_knote(ap->a_dvp, NOTE_WRITE); 2094 } 2095 } 2096 hammer_done_transaction(&trans); 2097 return (error); 2098 } 2099 2100 /* 2101 * hammer_vop_nwhiteout { nch, dvp, cred, flags } 2102 */ 2103 static 2104 int 2105 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) 2106 { 2107 struct hammer_transaction trans; 2108 struct hammer_inode *dip; 2109 int error; 2110 2111 dip = VTOI(ap->a_dvp); 2112 2113 if (hammer_nohistory(dip) == 0 && 2114 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) { 2115 return (error); 2116 } 2117 2118 hammer_start_transaction(&trans, dip->hmp); 2119 ++hammer_stats_file_iopsw; 2120 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, 2121 ap->a_cred, ap->a_flags, -1); 2122 hammer_done_transaction(&trans); 2123 2124 return (error); 2125 } 2126 2127 /* 2128 * hammer_vop_ioctl { vp, command, data, fflag, cred } 2129 */ 2130 static 2131 int 2132 hammer_vop_ioctl(struct vop_ioctl_args *ap) 2133 { 2134 struct hammer_inode *ip = ap->a_vp->v_data; 2135 2136 ++hammer_stats_file_iopsr; 2137 return(hammer_ioctl(ip, ap->a_command, ap->a_data, 2138 ap->a_fflag, ap->a_cred)); 2139 } 2140 2141 static 2142 int 2143 hammer_vop_mountctl(struct vop_mountctl_args *ap) 2144 { 2145 struct mount *mp; 2146 int error; 2147 2148 mp = ap->a_head.a_ops->head.vv_mount; 2149 2150 switch(ap->a_op) { 2151 case MOUNTCTL_SET_EXPORT: 2152 if (ap->a_ctllen != sizeof(struct export_args)) 2153 error = EINVAL; 2154 else 2155 error = hammer_vfs_export(mp, ap->a_op, 2156 (const struct export_args *)ap->a_ctl); 2157 break; 2158 default: 2159 error = journal_mountctl(ap); 2160 break; 2161 } 2162 return(error); 2163 } 2164 2165 /* 2166 * hammer_vop_strategy { vp, bio } 2167 * 2168 * Strategy call, used for regular file read & write only. Note that the 2169 * bp may represent a cluster. 2170 * 2171 * To simplify operation and allow better optimizations in the future, 2172 * this code does not make any assumptions with regards to buffer alignment 2173 * or size. 2174 */ 2175 static 2176 int 2177 hammer_vop_strategy(struct vop_strategy_args *ap) 2178 { 2179 struct buf *bp; 2180 int error; 2181 2182 bp = ap->a_bio->bio_buf; 2183 2184 switch(bp->b_cmd) { 2185 case BUF_CMD_READ: 2186 error = hammer_vop_strategy_read(ap); 2187 break; 2188 case BUF_CMD_WRITE: 2189 error = hammer_vop_strategy_write(ap); 2190 break; 2191 default: 2192 bp->b_error = error = EINVAL; 2193 bp->b_flags |= B_ERROR; 2194 biodone(ap->a_bio); 2195 break; 2196 } 2197 return (error); 2198 } 2199 2200 /* 2201 * Read from a regular file. Iterate the related records and fill in the 2202 * BIO/BUF. Gaps are zero-filled. 2203 * 2204 * The support code in hammer_object.c should be used to deal with mixed 2205 * in-memory and on-disk records. 2206 * 2207 * NOTE: Can be called from the cluster code with an oversized buf. 2208 * 2209 * XXX atime update 2210 */ 2211 static 2212 int 2213 hammer_vop_strategy_read(struct vop_strategy_args *ap) 2214 { 2215 struct hammer_transaction trans; 2216 struct hammer_inode *ip; 2217 struct hammer_inode *dip; 2218 struct hammer_cursor cursor; 2219 hammer_base_elm_t base; 2220 hammer_off_t disk_offset; 2221 struct bio *bio; 2222 struct bio *nbio; 2223 struct buf *bp; 2224 int64_t rec_offset; 2225 int64_t ran_end; 2226 int64_t tmp64; 2227 int error; 2228 int boff; 2229 int roff; 2230 int n; 2231 2232 bio = ap->a_bio; 2233 bp = bio->bio_buf; 2234 ip = ap->a_vp->v_data; 2235 2236 /* 2237 * The zone-2 disk offset may have been set by the cluster code via 2238 * a BMAP operation, or else should be NOOFFSET. 2239 * 2240 * Checking the high bits for a match against zone-2 should suffice. 2241 */ 2242 nbio = push_bio(bio); 2243 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == 2244 HAMMER_ZONE_LARGE_DATA) { 2245 error = hammer_io_direct_read(ip->hmp, nbio, NULL); 2246 return (error); 2247 } 2248 2249 /* 2250 * Well, that sucked. Do it the hard way. If all the stars are 2251 * aligned we may still be able to issue a direct-read. 2252 */ 2253 hammer_simple_transaction(&trans, ip->hmp); 2254 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2255 2256 /* 2257 * Key range (begin and end inclusive) to scan. Note that the key's 2258 * stored in the actual records represent BASE+LEN, not BASE. The 2259 * first record containing bio_offset will have a key > bio_offset. 2260 */ 2261 cursor.key_beg.localization = ip->obj_localization + 2262 HAMMER_LOCALIZE_MISC; 2263 cursor.key_beg.obj_id = ip->obj_id; 2264 cursor.key_beg.create_tid = 0; 2265 cursor.key_beg.delete_tid = 0; 2266 cursor.key_beg.obj_type = 0; 2267 cursor.key_beg.key = bio->bio_offset + 1; 2268 cursor.asof = ip->obj_asof; 2269 cursor.flags |= HAMMER_CURSOR_ASOF; 2270 2271 cursor.key_end = cursor.key_beg; 2272 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2273 #if 0 2274 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 2275 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 2276 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 2277 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2278 } else 2279 #endif 2280 { 2281 ran_end = bio->bio_offset + bp->b_bufsize; 2282 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2283 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2284 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2285 if (tmp64 < ran_end) 2286 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2287 else 2288 cursor.key_end.key = ran_end + MAXPHYS + 1; 2289 } 2290 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2291 2292 error = hammer_ip_first(&cursor); 2293 boff = 0; 2294 2295 while (error == 0) { 2296 /* 2297 * Get the base file offset of the record. The key for 2298 * data records is (base + bytes) rather then (base). 2299 */ 2300 base = &cursor.leaf->base; 2301 rec_offset = base->key - cursor.leaf->data_len; 2302 2303 /* 2304 * Calculate the gap, if any, and zero-fill it. 2305 * 2306 * n is the offset of the start of the record verses our 2307 * current seek offset in the bio. 2308 */ 2309 n = (int)(rec_offset - (bio->bio_offset + boff)); 2310 if (n > 0) { 2311 if (n > bp->b_bufsize - boff) 2312 n = bp->b_bufsize - boff; 2313 bzero((char *)bp->b_data + boff, n); 2314 boff += n; 2315 n = 0; 2316 } 2317 2318 /* 2319 * Calculate the data offset in the record and the number 2320 * of bytes we can copy. 2321 * 2322 * There are two degenerate cases. First, boff may already 2323 * be at bp->b_bufsize. Secondly, the data offset within 2324 * the record may exceed the record's size. 2325 */ 2326 roff = -n; 2327 rec_offset += roff; 2328 n = cursor.leaf->data_len - roff; 2329 if (n <= 0) { 2330 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff); 2331 n = 0; 2332 } else if (n > bp->b_bufsize - boff) { 2333 n = bp->b_bufsize - boff; 2334 } 2335 2336 /* 2337 * Deal with cached truncations. This cool bit of code 2338 * allows truncate()/ftruncate() to avoid having to sync 2339 * the file. 2340 * 2341 * If the frontend is truncated then all backend records are 2342 * subject to the frontend's truncation. 2343 * 2344 * If the backend is truncated then backend records on-disk 2345 * (but not in-memory) are subject to the backend's 2346 * truncation. In-memory records owned by the backend 2347 * represent data written after the truncation point on the 2348 * backend and must not be truncated. 2349 * 2350 * Truncate operations deal with frontend buffer cache 2351 * buffers and frontend-owned in-memory records synchronously. 2352 */ 2353 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2354 if (hammer_cursor_ondisk(&cursor) || 2355 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2356 if (ip->trunc_off <= rec_offset) 2357 n = 0; 2358 else if (ip->trunc_off < rec_offset + n) 2359 n = (int)(ip->trunc_off - rec_offset); 2360 } 2361 } 2362 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2363 if (hammer_cursor_ondisk(&cursor)) { 2364 if (ip->sync_trunc_off <= rec_offset) 2365 n = 0; 2366 else if (ip->sync_trunc_off < rec_offset + n) 2367 n = (int)(ip->sync_trunc_off - rec_offset); 2368 } 2369 } 2370 2371 /* 2372 * Try to issue a direct read into our bio if possible, 2373 * otherwise resolve the element data into a hammer_buffer 2374 * and copy. 2375 * 2376 * The buffer on-disk should be zerod past any real 2377 * truncation point, but may not be for any synthesized 2378 * truncation point from above. 2379 */ 2380 disk_offset = cursor.leaf->data_offset + roff; 2381 if (boff == 0 && n == bp->b_bufsize && 2382 hammer_cursor_ondisk(&cursor) && 2383 (disk_offset & HAMMER_BUFMASK) == 0) { 2384 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == 2385 HAMMER_ZONE_LARGE_DATA); 2386 nbio->bio_offset = disk_offset; 2387 error = hammer_io_direct_read(trans.hmp, nbio, 2388 cursor.leaf); 2389 goto done; 2390 } else if (n) { 2391 error = hammer_ip_resolve_data(&cursor); 2392 if (error == 0) { 2393 bcopy((char *)cursor.data + roff, 2394 (char *)bp->b_data + boff, n); 2395 } 2396 } 2397 if (error) 2398 break; 2399 2400 /* 2401 * Iterate until we have filled the request. 2402 */ 2403 boff += n; 2404 if (boff == bp->b_bufsize) 2405 break; 2406 error = hammer_ip_next(&cursor); 2407 } 2408 2409 /* 2410 * There may have been a gap after the last record 2411 */ 2412 if (error == ENOENT) 2413 error = 0; 2414 if (error == 0 && boff != bp->b_bufsize) { 2415 KKASSERT(boff < bp->b_bufsize); 2416 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); 2417 /* boff = bp->b_bufsize; */ 2418 } 2419 bp->b_resid = 0; 2420 bp->b_error = error; 2421 if (error) 2422 bp->b_flags |= B_ERROR; 2423 biodone(ap->a_bio); 2424 2425 done: 2426 /* 2427 * Cache the b-tree node for the last data read in cache[1]. 2428 * 2429 * If we hit the file EOF then also cache the node in the 2430 * governing director's cache[3], it will be used to initialize 2431 * the inode's cache[1] for any inodes looked up via the directory. 2432 * 2433 * This doesn't reduce disk accesses since the B-Tree chain is 2434 * likely cached, but it does reduce cpu overhead when looking 2435 * up file offsets for cpdup/tar/cpio style iterations. 2436 */ 2437 if (cursor.node) 2438 hammer_cache_node(&ip->cache[1], cursor.node); 2439 if (ran_end >= ip->ino_data.size) { 2440 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id, 2441 ip->obj_asof, ip->obj_localization); 2442 if (dip) { 2443 hammer_cache_node(&dip->cache[3], cursor.node); 2444 hammer_rel_inode(dip, 0); 2445 } 2446 } 2447 hammer_done_cursor(&cursor); 2448 hammer_done_transaction(&trans); 2449 return(error); 2450 } 2451 2452 /* 2453 * BMAP operation - used to support cluster_read() only. 2454 * 2455 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 2456 * 2457 * This routine may return EOPNOTSUPP if the opration is not supported for 2458 * the specified offset. The contents of the pointer arguments do not 2459 * need to be initialized in that case. 2460 * 2461 * If a disk address is available and properly aligned return 0 with 2462 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately 2463 * to the run-length relative to that offset. Callers may assume that 2464 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently 2465 * large, so return EOPNOTSUPP if it is not sufficiently large. 2466 */ 2467 static 2468 int 2469 hammer_vop_bmap(struct vop_bmap_args *ap) 2470 { 2471 struct hammer_transaction trans; 2472 struct hammer_inode *ip; 2473 struct hammer_cursor cursor; 2474 hammer_base_elm_t base; 2475 int64_t rec_offset; 2476 int64_t ran_end; 2477 int64_t tmp64; 2478 int64_t base_offset; 2479 int64_t base_disk_offset; 2480 int64_t last_offset; 2481 hammer_off_t last_disk_offset; 2482 hammer_off_t disk_offset; 2483 int rec_len; 2484 int error; 2485 int blksize; 2486 2487 ++hammer_stats_file_iopsr; 2488 ip = ap->a_vp->v_data; 2489 2490 /* 2491 * We can only BMAP regular files. We can't BMAP database files, 2492 * directories, etc. 2493 */ 2494 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE) 2495 return(EOPNOTSUPP); 2496 2497 /* 2498 * bmap is typically called with runp/runb both NULL when used 2499 * for writing. We do not support BMAP for writing atm. 2500 */ 2501 if (ap->a_cmd != BUF_CMD_READ) 2502 return(EOPNOTSUPP); 2503 2504 /* 2505 * Scan the B-Tree to acquire blockmap addresses, then translate 2506 * to raw addresses. 2507 */ 2508 hammer_simple_transaction(&trans, ip->hmp); 2509 #if 0 2510 kprintf("bmap_beg %016llx ip->cache %p\n", 2511 (long long)ap->a_loffset, ip->cache[1]); 2512 #endif 2513 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); 2514 2515 /* 2516 * Key range (begin and end inclusive) to scan. Note that the key's 2517 * stored in the actual records represent BASE+LEN, not BASE. The 2518 * first record containing bio_offset will have a key > bio_offset. 2519 */ 2520 cursor.key_beg.localization = ip->obj_localization + 2521 HAMMER_LOCALIZE_MISC; 2522 cursor.key_beg.obj_id = ip->obj_id; 2523 cursor.key_beg.create_tid = 0; 2524 cursor.key_beg.delete_tid = 0; 2525 cursor.key_beg.obj_type = 0; 2526 if (ap->a_runb) 2527 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1; 2528 else 2529 cursor.key_beg.key = ap->a_loffset + 1; 2530 if (cursor.key_beg.key < 0) 2531 cursor.key_beg.key = 0; 2532 cursor.asof = ip->obj_asof; 2533 cursor.flags |= HAMMER_CURSOR_ASOF; 2534 2535 cursor.key_end = cursor.key_beg; 2536 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE); 2537 2538 ran_end = ap->a_loffset + MAXPHYS; 2539 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 2540 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 2541 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */ 2542 if (tmp64 < ran_end) 2543 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 2544 else 2545 cursor.key_end.key = ran_end + MAXPHYS + 1; 2546 2547 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 2548 2549 error = hammer_ip_first(&cursor); 2550 base_offset = last_offset = 0; 2551 base_disk_offset = last_disk_offset = 0; 2552 2553 while (error == 0) { 2554 /* 2555 * Get the base file offset of the record. The key for 2556 * data records is (base + bytes) rather then (base). 2557 * 2558 * NOTE: rec_offset + rec_len may exceed the end-of-file. 2559 * The extra bytes should be zero on-disk and the BMAP op 2560 * should still be ok. 2561 */ 2562 base = &cursor.leaf->base; 2563 rec_offset = base->key - cursor.leaf->data_len; 2564 rec_len = cursor.leaf->data_len; 2565 2566 /* 2567 * Incorporate any cached truncation. 2568 * 2569 * NOTE: Modifications to rec_len based on synthesized 2570 * truncation points remove the guarantee that any extended 2571 * data on disk is zero (since the truncations may not have 2572 * taken place on-media yet). 2573 */ 2574 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2575 if (hammer_cursor_ondisk(&cursor) || 2576 cursor.iprec->flush_state == HAMMER_FST_FLUSH) { 2577 if (ip->trunc_off <= rec_offset) 2578 rec_len = 0; 2579 else if (ip->trunc_off < rec_offset + rec_len) 2580 rec_len = (int)(ip->trunc_off - rec_offset); 2581 } 2582 } 2583 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2584 if (hammer_cursor_ondisk(&cursor)) { 2585 if (ip->sync_trunc_off <= rec_offset) 2586 rec_len = 0; 2587 else if (ip->sync_trunc_off < rec_offset + rec_len) 2588 rec_len = (int)(ip->sync_trunc_off - rec_offset); 2589 } 2590 } 2591 2592 /* 2593 * Accumulate information. If we have hit a discontiguous 2594 * block reset base_offset unless we are already beyond the 2595 * requested offset. If we are, that's it, we stop. 2596 */ 2597 if (error) 2598 break; 2599 if (hammer_cursor_ondisk(&cursor)) { 2600 disk_offset = cursor.leaf->data_offset; 2601 if (rec_offset != last_offset || 2602 disk_offset != last_disk_offset) { 2603 if (rec_offset > ap->a_loffset) 2604 break; 2605 base_offset = rec_offset; 2606 base_disk_offset = disk_offset; 2607 } 2608 last_offset = rec_offset + rec_len; 2609 last_disk_offset = disk_offset + rec_len; 2610 } 2611 error = hammer_ip_next(&cursor); 2612 } 2613 2614 #if 0 2615 kprintf("BMAP %016llx: %016llx - %016llx\n", 2616 (long long)ap->a_loffset, 2617 (long long)base_offset, 2618 (long long)last_offset); 2619 kprintf("BMAP %16s: %016llx - %016llx\n", "", 2620 (long long)base_disk_offset, 2621 (long long)last_disk_offset); 2622 #endif 2623 2624 if (cursor.node) { 2625 hammer_cache_node(&ip->cache[1], cursor.node); 2626 #if 0 2627 kprintf("bmap_end2 %016llx ip->cache %p\n", 2628 (long long)ap->a_loffset, ip->cache[1]); 2629 #endif 2630 } 2631 hammer_done_cursor(&cursor); 2632 hammer_done_transaction(&trans); 2633 2634 /* 2635 * If we couldn't find any records or the records we did find were 2636 * all behind the requested offset, return failure. A forward 2637 * truncation can leave a hole w/ no on-disk records. 2638 */ 2639 if (last_offset == 0 || last_offset < ap->a_loffset) 2640 return (EOPNOTSUPP); 2641 2642 /* 2643 * Figure out the block size at the requested offset and adjust 2644 * our limits so the cluster_read() does not create inappropriately 2645 * sized buffer cache buffers. 2646 */ 2647 blksize = hammer_blocksize(ap->a_loffset); 2648 if (hammer_blocksize(base_offset) != blksize) { 2649 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset); 2650 } 2651 if (last_offset != ap->a_loffset && 2652 hammer_blocksize(last_offset - 1) != blksize) { 2653 last_offset = hammer_blockdemarc(ap->a_loffset, 2654 last_offset - 1); 2655 } 2656 2657 /* 2658 * Returning EOPNOTSUPP simply prevents the direct-IO optimization 2659 * from occuring. 2660 */ 2661 disk_offset = base_disk_offset + (ap->a_loffset - base_offset); 2662 2663 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) { 2664 /* 2665 * Only large-data zones can be direct-IOd 2666 */ 2667 error = EOPNOTSUPP; 2668 } else if ((disk_offset & HAMMER_BUFMASK) || 2669 (last_offset - ap->a_loffset) < blksize) { 2670 /* 2671 * doffsetp is not aligned or the forward run size does 2672 * not cover a whole buffer, disallow the direct I/O. 2673 */ 2674 error = EOPNOTSUPP; 2675 } else { 2676 /* 2677 * We're good. 2678 */ 2679 *ap->a_doffsetp = disk_offset; 2680 if (ap->a_runb) { 2681 *ap->a_runb = ap->a_loffset - base_offset; 2682 KKASSERT(*ap->a_runb >= 0); 2683 } 2684 if (ap->a_runp) { 2685 *ap->a_runp = last_offset - ap->a_loffset; 2686 KKASSERT(*ap->a_runp >= 0); 2687 } 2688 error = 0; 2689 } 2690 return(error); 2691 } 2692 2693 /* 2694 * Write to a regular file. Because this is a strategy call the OS is 2695 * trying to actually get data onto the media. 2696 */ 2697 static 2698 int 2699 hammer_vop_strategy_write(struct vop_strategy_args *ap) 2700 { 2701 hammer_record_t record; 2702 hammer_mount_t hmp; 2703 hammer_inode_t ip; 2704 struct bio *bio; 2705 struct buf *bp; 2706 int blksize; 2707 int bytes; 2708 int error; 2709 2710 bio = ap->a_bio; 2711 bp = bio->bio_buf; 2712 ip = ap->a_vp->v_data; 2713 hmp = ip->hmp; 2714 2715 blksize = hammer_blocksize(bio->bio_offset); 2716 KKASSERT(bp->b_bufsize == blksize); 2717 2718 if (ip->flags & HAMMER_INODE_RO) { 2719 bp->b_error = EROFS; 2720 bp->b_flags |= B_ERROR; 2721 biodone(ap->a_bio); 2722 return(EROFS); 2723 } 2724 2725 /* 2726 * Interlock with inode destruction (no in-kernel or directory 2727 * topology visibility). If we queue new IO while trying to 2728 * destroy the inode we can deadlock the vtrunc call in 2729 * hammer_inode_unloadable_check(). 2730 * 2731 * Besides, there's no point flushing a bp associated with an 2732 * inode that is being destroyed on-media and has no kernel 2733 * references. 2734 */ 2735 if ((ip->flags | ip->sync_flags) & 2736 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { 2737 bp->b_resid = 0; 2738 biodone(ap->a_bio); 2739 return(0); 2740 } 2741 2742 /* 2743 * Reserve space and issue a direct-write from the front-end. 2744 * NOTE: The direct_io code will hammer_bread/bcopy smaller 2745 * allocations. 2746 * 2747 * An in-memory record will be installed to reference the storage 2748 * until the flusher can get to it. 2749 * 2750 * Since we own the high level bio the front-end will not try to 2751 * do a direct-read until the write completes. 2752 * 2753 * NOTE: The only time we do not reserve a full-sized buffers 2754 * worth of data is if the file is small. We do not try to 2755 * allocate a fragment (from the small-data zone) at the end of 2756 * an otherwise large file as this can lead to wildly separated 2757 * data. 2758 */ 2759 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0); 2760 KKASSERT(bio->bio_offset < ip->ino_data.size); 2761 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2) 2762 bytes = bp->b_bufsize; 2763 else 2764 bytes = ((int)ip->ino_data.size + 15) & ~15; 2765 2766 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data, 2767 bytes, &error); 2768 if (record) { 2769 hammer_io_direct_write(hmp, record, bio); 2770 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs) 2771 hammer_flush_inode(ip, 0); 2772 } else { 2773 bp->b_bio2.bio_offset = NOOFFSET; 2774 bp->b_error = error; 2775 bp->b_flags |= B_ERROR; 2776 biodone(ap->a_bio); 2777 } 2778 return(error); 2779 } 2780 2781 /* 2782 * dounlink - disconnect a directory entry 2783 * 2784 * XXX whiteout support not really in yet 2785 */ 2786 static int 2787 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, 2788 struct vnode *dvp, struct ucred *cred, 2789 int flags, int isdir) 2790 { 2791 struct namecache *ncp; 2792 hammer_inode_t dip; 2793 hammer_inode_t ip; 2794 struct hammer_cursor cursor; 2795 int64_t namekey; 2796 u_int32_t max_iterations; 2797 int nlen, error; 2798 2799 /* 2800 * Calculate the namekey and setup the key range for the scan. This 2801 * works kinda like a chained hash table where the lower 32 bits 2802 * of the namekey synthesize the chain. 2803 * 2804 * The key range is inclusive of both key_beg and key_end. 2805 */ 2806 dip = VTOI(dvp); 2807 ncp = nch->ncp; 2808 2809 if (dip->flags & HAMMER_INODE_RO) 2810 return (EROFS); 2811 2812 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen, 2813 &max_iterations); 2814 retry: 2815 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 2816 cursor.key_beg.localization = dip->obj_localization + 2817 HAMMER_LOCALIZE_MISC; 2818 cursor.key_beg.obj_id = dip->obj_id; 2819 cursor.key_beg.key = namekey; 2820 cursor.key_beg.create_tid = 0; 2821 cursor.key_beg.delete_tid = 0; 2822 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; 2823 cursor.key_beg.obj_type = 0; 2824 2825 cursor.key_end = cursor.key_beg; 2826 cursor.key_end.key += max_iterations; 2827 cursor.asof = dip->obj_asof; 2828 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2829 2830 /* 2831 * Scan all matching records (the chain), locate the one matching 2832 * the requested path component. info->last_error contains the 2833 * error code on search termination and could be 0, ENOENT, or 2834 * something else. 2835 * 2836 * The hammer_ip_*() functions merge in-memory records with on-disk 2837 * records for the purposes of the search. 2838 */ 2839 error = hammer_ip_first(&cursor); 2840 2841 while (error == 0) { 2842 error = hammer_ip_resolve_data(&cursor); 2843 if (error) 2844 break; 2845 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF; 2846 KKASSERT(nlen > 0); 2847 if (ncp->nc_nlen == nlen && 2848 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) { 2849 break; 2850 } 2851 error = hammer_ip_next(&cursor); 2852 } 2853 2854 /* 2855 * If all is ok we have to get the inode so we can adjust nlinks. 2856 * To avoid a deadlock with the flusher we must release the inode 2857 * lock on the directory when acquiring the inode for the entry. 2858 * 2859 * If the target is a directory, it must be empty. 2860 */ 2861 if (error == 0) { 2862 hammer_unlock(&cursor.ip->lock); 2863 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, 2864 dip->hmp->asof, 2865 cursor.data->entry.localization, 2866 0, &error); 2867 hammer_lock_sh(&cursor.ip->lock); 2868 if (error == ENOENT) { 2869 kprintf("HAMMER: WARNING: Removing " 2870 "dirent w/missing inode \"%s\"\n" 2871 "\tobj_id = %016llx\n", 2872 ncp->nc_name, 2873 (long long)cursor.data->entry.obj_id); 2874 error = 0; 2875 } 2876 2877 /* 2878 * If isdir >= 0 we validate that the entry is or is not a 2879 * directory. If isdir < 0 we don't care. 2880 */ 2881 if (error == 0 && isdir >= 0 && ip) { 2882 if (isdir && 2883 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) { 2884 error = ENOTDIR; 2885 } else if (isdir == 0 && 2886 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 2887 error = EISDIR; 2888 } 2889 } 2890 2891 /* 2892 * If we are trying to remove a directory the directory must 2893 * be empty. 2894 * 2895 * The check directory code can loop and deadlock/retry. Our 2896 * own cursor's node locks must be released to avoid a 3-way 2897 * deadlock with the flusher if the check directory code 2898 * blocks. 2899 * 2900 * If any changes whatsoever have been made to the cursor 2901 * set EDEADLK and retry. 2902 */ 2903 if (error == 0 && ip && ip->ino_data.obj_type == 2904 HAMMER_OBJTYPE_DIRECTORY) { 2905 hammer_unlock_cursor(&cursor); 2906 error = hammer_ip_check_directory_empty(trans, ip); 2907 hammer_lock_cursor(&cursor); 2908 if (cursor.flags & HAMMER_CURSOR_RETEST) { 2909 kprintf("HAMMER: Warning: avoided deadlock " 2910 "on rmdir '%s'\n", 2911 ncp->nc_name); 2912 error = EDEADLK; 2913 } 2914 } 2915 2916 /* 2917 * Delete the directory entry. 2918 * 2919 * WARNING: hammer_ip_del_directory() may have to terminate 2920 * the cursor to avoid a deadlock. It is ok to call 2921 * hammer_done_cursor() twice. 2922 */ 2923 if (error == 0) { 2924 error = hammer_ip_del_directory(trans, &cursor, 2925 dip, ip); 2926 } 2927 hammer_done_cursor(&cursor); 2928 if (error == 0) { 2929 cache_setunresolved(nch); 2930 cache_setvp(nch, NULL); 2931 /* XXX locking */ 2932 if (ip && ip->vp) { 2933 hammer_knote(ip->vp, NOTE_DELETE); 2934 cache_inval_vp(ip->vp, CINV_DESTROY); 2935 } 2936 } 2937 if (ip) 2938 hammer_rel_inode(ip, 0); 2939 } else { 2940 hammer_done_cursor(&cursor); 2941 } 2942 if (error == EDEADLK) 2943 goto retry; 2944 2945 return (error); 2946 } 2947 2948 /************************************************************************ 2949 * FIFO AND SPECFS OPS * 2950 ************************************************************************ 2951 * 2952 */ 2953 2954 static int 2955 hammer_vop_fifoclose (struct vop_close_args *ap) 2956 { 2957 /* XXX update itimes */ 2958 return (VOCALL(&fifo_vnode_vops, &ap->a_head)); 2959 } 2960 2961 static int 2962 hammer_vop_fiforead (struct vop_read_args *ap) 2963 { 2964 int error; 2965 2966 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 2967 /* XXX update access time */ 2968 return (error); 2969 } 2970 2971 static int 2972 hammer_vop_fifowrite (struct vop_write_args *ap) 2973 { 2974 int error; 2975 2976 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 2977 /* XXX update access time */ 2978 return (error); 2979 } 2980 2981 static 2982 int 2983 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap) 2984 { 2985 int error; 2986 2987 error = VOCALL(&fifo_vnode_vops, &ap->a_head); 2988 if (error) 2989 error = hammer_vop_kqfilter(ap); 2990 return(error); 2991 } 2992 2993 static int 2994 hammer_vop_specclose (struct vop_close_args *ap) 2995 { 2996 /* XXX update itimes */ 2997 return (VOCALL(&spec_vnode_vops, &ap->a_head)); 2998 } 2999 3000 static int 3001 hammer_vop_specread (struct vop_read_args *ap) 3002 { 3003 /* XXX update access time */ 3004 return (VOCALL(&spec_vnode_vops, &ap->a_head)); 3005 } 3006 3007 static int 3008 hammer_vop_specwrite (struct vop_write_args *ap) 3009 { 3010 /* XXX update last change time */ 3011 return (VOCALL(&spec_vnode_vops, &ap->a_head)); 3012 } 3013 3014 /* 3015 * SPECFS's getattr will override fields as necessary, but does not fill 3016 * stuff in from scratch. 3017 */ 3018 static 3019 int 3020 hammer_vop_specgetattr (struct vop_getattr_args *ap) 3021 { 3022 int error; 3023 3024 error = hammer_vop_getattr(ap); 3025 if (error == 0) 3026 VOCALL(&spec_vnode_vops, &ap->a_head); 3027 return (error); 3028 } 3029 3030 3031 /************************************************************************ 3032 * KQFILTER OPS * 3033 ************************************************************************ 3034 * 3035 */ 3036 static void filt_hammerdetach(struct knote *kn); 3037 static int filt_hammerread(struct knote *kn, long hint); 3038 static int filt_hammerwrite(struct knote *kn, long hint); 3039 static int filt_hammervnode(struct knote *kn, long hint); 3040 3041 static struct filterops hammerread_filtops = 3042 { 1, NULL, filt_hammerdetach, filt_hammerread }; 3043 static struct filterops hammerwrite_filtops = 3044 { 1, NULL, filt_hammerdetach, filt_hammerwrite }; 3045 static struct filterops hammervnode_filtops = 3046 { 1, NULL, filt_hammerdetach, filt_hammervnode }; 3047 3048 static 3049 int 3050 hammer_vop_kqfilter(struct vop_kqfilter_args *ap) 3051 { 3052 struct vnode *vp = ap->a_vp; 3053 struct knote *kn = ap->a_kn; 3054 lwkt_tokref ilock; 3055 3056 switch (kn->kn_filter) { 3057 case EVFILT_READ: 3058 kn->kn_fop = &hammerread_filtops; 3059 break; 3060 case EVFILT_WRITE: 3061 kn->kn_fop = &hammerwrite_filtops; 3062 break; 3063 case EVFILT_VNODE: 3064 kn->kn_fop = &hammervnode_filtops; 3065 break; 3066 default: 3067 return (1); 3068 } 3069 3070 kn->kn_hook = (caddr_t)vp; 3071 3072 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 3073 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext); 3074 lwkt_reltoken(&ilock); 3075 3076 return(0); 3077 } 3078 3079 static void 3080 filt_hammerdetach(struct knote *kn) 3081 { 3082 struct vnode *vp = (void *)kn->kn_hook; 3083 lwkt_tokref ilock; 3084 3085 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 3086 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note, 3087 kn, knote, kn_selnext); 3088 lwkt_reltoken(&ilock); 3089 } 3090 3091 static int 3092 filt_hammerread(struct knote *kn, long hint) 3093 { 3094 struct vnode *vp = (void *)kn->kn_hook; 3095 hammer_inode_t ip = VTOI(vp); 3096 3097 if (hint == NOTE_REVOKE) { 3098 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3099 return(1); 3100 } 3101 kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset; 3102 return (kn->kn_data != 0); 3103 } 3104 3105 static int 3106 filt_hammerwrite(struct knote *kn, long hint) 3107 { 3108 if (hint == NOTE_REVOKE) 3109 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3110 kn->kn_data = 0; 3111 return (1); 3112 } 3113 3114 static int 3115 filt_hammervnode(struct knote *kn, long hint) 3116 { 3117 if (kn->kn_sfflags & hint) 3118 kn->kn_fflags |= hint; 3119 if (hint == NOTE_REVOKE) { 3120 kn->kn_flags |= EV_EOF; 3121 return (1); 3122 } 3123 return (kn->kn_fflags != 0); 3124 } 3125 3126