/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.41 2008/05/05 20:34:48 dillon Exp $
 */
/*
 * Manage HAMMER's on-disk structures.  These routines are primarily
 * responsible for interfacing with the kernel's I/O subsystem and for
 * managing in-memory structures.
 */
#include "hammer.h"
#include <sys/fcntl.h>
#include <sys/nlookup.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
static int hammer_load_node(hammer_node_t node, int isnew);

/*
 * Red-Black tree support for various structures
 */
static int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
	if (ip1->obj_id < ip2->obj_id)
		return(-1);
	if (ip1->obj_id > ip2->obj_id)
		return(1);
	if (ip1->obj_asof < ip2->obj_asof)
		return(-1);
	if (ip1->obj_asof > ip2->obj_asof)
		return(1);
	return(0);
}

static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
	if (info->obj_id < ip->obj_id)
		return(-1);
	if (info->obj_id > ip->obj_id)
		return(1);
	if (info->obj_asof < ip->obj_asof)
		return(-1);
	if (info->obj_asof > ip->obj_asof)
		return(1);
	return(0);
}

static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
{
	if (vol1->vol_no < vol2->vol_no)
		return(-1);
	if (vol1->vol_no > vol2->vol_no)
		return(1);
	return(0);
}

static int
hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
{
	if (buf1->zone2_offset < buf2->zone2_offset)
		return(-1);
	if (buf1->zone2_offset > buf2->zone2_offset)
		return(1);
	return(0);
}

static int
hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
{
	if (node1->node_offset < node2->node_offset)
		return(-1);
	if (node1->node_offset > node2->node_offset)
		return(1);
	return(0);
}

/*
 * Note: The lookup function for hammer_ino_rb_tree winds up being named
 * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
 * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, zone2_offset).
 */
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
		hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
	     hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
	     hammer_buf_rb_compare, hammer_off_t, zone2_offset);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
	     hammer_nod_rb_compare, hammer_off_t, node_offset);
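
/*
 * Illustrative sketch (not compiled): the RB_GENERATE2() invocations above
 * emit typed lookup functions keyed on the named field, so callers can
 * search by key value without constructing a dummy element, e.g.:
 *
 *	hammer_volume_t volume;
 *	hammer_buffer_t buffer;
 *
 *	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
 *	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
 *			   zone2_offset);
 *
 * The INFO variant generated by RB_GENERATE_XLOOKUP() is the exception,
 * taking a hammer_inode_info_t key structure instead of a scalar key.
 */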
/************************************************************************
 *				VOLUMES					*
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time, get_volume() will
 * not load a new volume.
 *
 * Calls made to hammer_load_volume() are single-threaded.
 */
int
hammer_install_volume(struct hammer_mount *hmp, const char *volname)
{
	struct mount *mp;
	hammer_volume_t volume;
	struct hammer_volume_ondisk *ondisk;
	struct nlookupdata nd;
	struct buf *bp = NULL;
	int error;
	int ronly;
	int setmp = 0;

	mp = hmp->mp;
	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Allocate a volume structure
	 */
	++hammer_count_volumes;
	volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
	volume->vol_name = kstrdup(volname, M_HAMMER);
	hammer_io_init(&volume->io, hmp, HAMMER_STRUCTURE_VOLUME);
	volume->io.offset = 0LL;

	/*
	 * Get the device vnode
	 */
	error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
	nlookup_done(&nd);
	if (error == 0) {
		if (vn_isdisk(volume->devvp, &error)) {
			error = vfs_mountedon(volume->devvp);
		}
	}
	if (error == 0 &&
	    count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
		error = EBUSY;
	}
	if (error == 0) {
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
		if (error == 0) {
			error = VOP_OPEN(volume->devvp,
					 (ronly ? FREAD : FREAD|FWRITE),
					 FSCRED, NULL);
		}
		vn_unlock(volume->devvp);
	}
	if (error) {
		hammer_free_volume(volume);
		return(error);
	}
	volume->devvp->v_rdev->si_mountpoint = mp;
	setmp = 1;

	/*
	 * Extract the volume number from the volume header and do various
	 * sanity checks.
	 */
	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
	if (error)
		goto late_failure;
	ondisk = (void *)bp->b_data;
	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
		kprintf("hammer_mount: volume %s has an invalid header\n",
			volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}
	volume->vol_no = ondisk->vol_no;
	volume->buffer_base = ondisk->vol_buf_beg;
	volume->vol_flags = ondisk->vol_flags;
	volume->nblocks = ondisk->vol_nblocks;
	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
	RB_INIT(&volume->rb_bufs_root);

	if (RB_EMPTY(&hmp->rb_vols_root)) {
		hmp->fsid = ondisk->vol_fsid;
	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
		kprintf("hammer_mount: volume %s's fsid does not match "
			"other volumes\n", volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}

	/*
	 * Insert the volume structure into the red-black tree.
	 */
	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
			volume->vol_name, volume->vol_no);
		error = EEXIST;
	}

	/*
	 * Set the root volume.  HAMMER special-cases the rootvol structure.
	 * We do not hold a ref because that would prevent related I/O
	 * from being flushed.
	 */
	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
		hmp->rootvol = volume;
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
		hmp->mp->mnt_stat.f_blocks += ondisk->vol0_stat_bigblocks *
			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
	}
late_failure:
	if (bp)
		brelse(bp);
	if (error) {
		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
		if (setmp)
			volume->devvp->v_rdev->si_mountpoint = NULL;
		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
		hammer_free_volume(volume);
	}
	return (error);
}
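
/*
 * Illustrative sketch (hypothetical caller, not compiled): mount-time
 * code is expected to install every volume before the filesystem goes
 * live, stopping on the first failure, roughly:
 *
 *	for (i = 0; error == 0 && i < nvolumes; ++i)
 *		error = hammer_install_volume(hmp, volnames[i]);
 *
 * Here nvolumes/volnames stand in for however the mount arguments
 * deliver the volume list.
 */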
/*
 * Unload and free a HAMMER volume.  Called from RB_SCAN, which requires
 * a return value >= 0 to continue the scan, so this routine always
 * returns 0.
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data __unused)
{
	struct hammer_mount *hmp = volume->io.hmp;
	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Clean up the root volume pointer, which is held unlocked in hmp.
	 */
	if (hmp->rootvol == volume)
		hmp->rootvol = NULL;

	/*
	 * Unload buffers.
	 */
	RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
			hammer_unload_buffer, NULL);

	/*
	 * Release our buffer and flush anything left in the buffer cache.
	 */
	volume->io.flush = 1;
	volume->io.waitdep = 1;
	hammer_io_release(&volume->io);

	/*
	 * There should be no references on the volume, no clusters, and
	 * no super-clusters.
	 */
	KKASSERT(volume->io.lock.refs == 0);
	KKASSERT(RB_EMPTY(&volume->rb_bufs_root));

	volume->ondisk = NULL;
	if (volume->devvp) {
		if (volume->devvp->v_rdev &&
		    volume->devvp->v_rdev->si_mountpoint == hmp->mp
		) {
			volume->devvp->v_rdev->si_mountpoint = NULL;
		}
		if (ronly) {
			vinvalbuf(volume->devvp, 0, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD);
		} else {
			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
		}
	}

	/*
	 * Destroy the structure
	 */
	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
	hammer_free_volume(volume);
	return(0);
}

static
void
hammer_free_volume(hammer_volume_t volume)
{
	if (volume->vol_name) {
		kfree(volume->vol_name, M_HAMMER);
		volume->vol_name = NULL;
	}
	if (volume->devvp) {
		vrele(volume->devvp);
		volume->devvp = NULL;
	}
	--hammer_count_volumes;
	kfree(volume, M_HAMMER);
}

/*
 * Get a HAMMER volume.  The volume must already exist.
 */
hammer_volume_t
hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
{
	struct hammer_volume *volume;

	/*
	 * Locate the volume structure
	 */
	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
	if (volume == NULL) {
		*errorp = ENOENT;
		return(NULL);
	}
	hammer_ref(&volume->io.lock);

	/*
	 * Deal with on-disk info
	 */
	if (volume->ondisk == NULL || volume->io.loading) {
		*errorp = hammer_load_volume(volume);
		if (*errorp) {
			hammer_rel_volume(volume, 1);
			volume = NULL;
		}
	} else {
		*errorp = 0;
	}
	return(volume);
}

int
hammer_ref_volume(hammer_volume_t volume)
{
	int error;

	hammer_ref(&volume->io.lock);

	/*
	 * Deal with on-disk info
	 */
	if (volume->ondisk == NULL || volume->io.loading) {
		error = hammer_load_volume(volume);
		if (error)
			hammer_rel_volume(volume, 1);
	} else {
		error = 0;
	}
	return (error);
}

hammer_volume_t
hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
{
	hammer_volume_t volume;

	volume = hmp->rootvol;
	KKASSERT(volume != NULL);
	hammer_ref(&volume->io.lock);

	/*
	 * Deal with on-disk info
	 */
	if (volume->ondisk == NULL || volume->io.loading) {
		*errorp = hammer_load_volume(volume);
		if (*errorp) {
			hammer_rel_volume(volume, 1);
			volume = NULL;
		}
	} else {
		*errorp = 0;
	}
	return (volume);
}
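
/*
 * Illustrative sketch (not compiled): the volume get/ref functions above
 * follow a common pattern in this file -- take a ref, then load the
 * on-disk header if it is missing or another thread is mid-load.  A
 * typical consumer looks like:
 *
 *	volume = hammer_get_volume(hmp, vol_no, &error);
 *	if (volume) {
 *		... use volume->ondisk ...
 *		hammer_rel_volume(volume, 0);
 *	}
 *
 * hammer_get_volume() returns NULL with *errorp set on failure, so the
 * pointer and error checks are interchangeable.
 */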
/*
 * Load a volume's on-disk information.  The volume must be referenced and
 * not locked.  We temporarily acquire an exclusive lock to interlock
 * against releases or multiple get's.
 */
static int
hammer_load_volume(hammer_volume_t volume)
{
	int error;

	++volume->io.loading;
	hammer_lock_ex(&volume->io.lock);

	if (volume->ondisk == NULL) {
		error = hammer_io_read(volume->devvp, &volume->io);
		if (error == 0)
			volume->ondisk = (void *)volume->io.bp->b_data;
	} else {
		error = 0;
	}
	--volume->io.loading;
	hammer_unlock(&volume->io.lock);
	return(error);
}

/*
 * Release a volume.  Call hammer_io_release on the last reference.  We have
 * to acquire an exclusive lock to interlock against volume->ondisk tests
 * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
 * lock to be held.
 *
 * Volumes are not unloaded from memory during normal operation.
 */
void
hammer_rel_volume(hammer_volume_t volume, int flush)
{
	if (flush)
		volume->io.flush = 1;
	crit_enter();
	if (volume->io.lock.refs == 1) {
		++volume->io.loading;
		hammer_lock_ex(&volume->io.lock);
		if (volume->io.lock.refs == 1) {
			volume->ondisk = NULL;
			hammer_io_release(&volume->io);
		}
		--volume->io.loading;
		hammer_unlock(&volume->io.lock);
	}
	hammer_unref(&volume->io.lock);
	crit_exit();
}
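
/*
 * Illustrative sketch (not compiled): hammer_rel_volume() above and
 * hammer_rel_buffer() below use the same double-checked last-reference
 * pattern.  The cheap refs == 1 test is made first, then re-made after
 * the exclusive lock is acquired, because another thread may have gained
 * a ref while we blocked on the lock:
 *
 *	if (obj->io.lock.refs == 1) {		unlocked hint
 *		hammer_lock_ex(&obj->io.lock);
 *		if (obj->io.lock.refs == 1) {	authoritative re-test
 *			... tear down ...
 *		}
 *		hammer_unlock(&obj->io.lock);
 *	}
 *	hammer_unref(&obj->io.lock);
 */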
/************************************************************************
 *				BUFFERS					*
 ************************************************************************
 *
 * Manage buffers.  Currently all blockmap-backed zones are translated
 * to zone-2 buffer offsets.
 */
hammer_buffer_t
hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
		  int isnew, int *errorp)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	hammer_off_t zoneX_offset;
	hammer_io_type_t iotype;
	int vol_no;
	int zone;

	zoneX_offset = buf_offset;
	zone = HAMMER_ZONE_DECODE(buf_offset);

	/*
	 * What is the buffer class?
	 */
	switch(zone) {
	case HAMMER_ZONE_LARGE_DATA_INDEX:
	case HAMMER_ZONE_SMALL_DATA_INDEX:
		iotype = HAMMER_STRUCTURE_DATA_BUFFER;
		break;
	case HAMMER_ZONE_UNDO_INDEX:
		iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
		break;
	default:
		iotype = HAMMER_STRUCTURE_META_BUFFER;
		break;
	}

	/*
	 * Handle blockmap offset translations
	 */
	if (zone >= HAMMER_ZONE_BTREE_INDEX) {
		buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
		KKASSERT(*errorp == 0);
	} else if (zone == HAMMER_ZONE_UNDO_INDEX) {
		buf_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
		KKASSERT(*errorp == 0);
	}

	/*
	 * Locate the buffer given its zone-2 offset.
	 */
	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
		 HAMMER_ZONE_RAW_BUFFER);
	vol_no = HAMMER_VOL_DECODE(buf_offset);
	volume = hammer_get_volume(hmp, vol_no, errorp);
	if (volume == NULL)
		return(NULL);

	/*
	 * NOTE: buf_offset and maxbuf_off are both full offset
	 *	 specifications.
	 */
	KKASSERT(buf_offset < volume->maxbuf_off);

	/*
	 * Locate and lock the buffer structure, creating one if necessary.
	 */
again:
	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
			   buf_offset);
	if (buffer == NULL) {
		++hammer_count_buffers;
		buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
		buffer->zone2_offset = buf_offset;
		buffer->volume = volume;

		hammer_io_init(&buffer->io, hmp, iotype);
		buffer->io.offset = volume->ondisk->vol_buf_beg +
				    (buf_offset & HAMMER_OFF_SHORT_MASK);
		TAILQ_INIT(&buffer->clist);
		hammer_ref(&buffer->io.lock);

		/*
		 * Insert the buffer into the RB tree and handle late
		 * collisions.
		 */
		if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
			hammer_unref(&buffer->io.lock);
			--hammer_count_buffers;
			kfree(buffer, M_HAMMER);
			goto again;
		}
		hammer_ref(&volume->io.lock);
	} else {
		hammer_ref(&buffer->io.lock);

		/*
		 * The buffer is no longer loose if it has a ref.
		 */
		if (buffer->io.mod_list == &hmp->lose_list) {
			TAILQ_REMOVE(buffer->io.mod_list, &buffer->io,
				     mod_entry);
			buffer->io.mod_list = NULL;
		}
		if (buffer->io.lock.refs == 1)
			hammer_io_reinit(&buffer->io, iotype);
		else
			KKASSERT(buffer->io.type == iotype);
	}

	/*
	 * Cache the blockmap translation
	 */
	if ((zoneX_offset & HAMMER_ZONE_RAW_BUFFER) != HAMMER_ZONE_RAW_BUFFER)
		buffer->zoneX_offset = zoneX_offset;

	/*
	 * Deal with on-disk info
	 */
	if (buffer->ondisk == NULL || buffer->io.loading) {
		*errorp = hammer_load_buffer(buffer, isnew);
		if (*errorp) {
			hammer_rel_buffer(buffer, 1);
			buffer = NULL;
		}
	} else {
		*errorp = 0;
	}
	hammer_rel_volume(volume, 0);
	return(buffer);
}

static int
hammer_load_buffer(hammer_buffer_t buffer, int isnew)
{
	hammer_volume_t volume;
	int error;

	/*
	 * Load the buffer's on-disk info
	 */
	volume = buffer->volume;
	++buffer->io.loading;
	hammer_lock_ex(&buffer->io.lock);

	if (buffer->ondisk == NULL) {
		if (isnew) {
			error = hammer_io_new(volume->devvp, &buffer->io);
		} else {
			error = hammer_io_read(volume->devvp, &buffer->io);
		}
		if (error == 0)
			buffer->ondisk = (void *)buffer->io.bp->b_data;
	} else if (isnew) {
		error = hammer_io_new(volume->devvp, &buffer->io);
	} else {
		error = 0;
	}
	--buffer->io.loading;
	hammer_unlock(&buffer->io.lock);
	return (error);
}
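
/*
 * Illustrative sketch (not compiled): a hammer_off_t encodes the zone,
 * volume number, and byte offset in one 64-bit value (4-bit zone, 8-bit
 * volume, 52-bit offset -- see hammer_disk.h for the authoritative
 * layout).  The translation step in hammer_get_buffer() conceptually
 * does:
 *
 *	zone    = HAMMER_ZONE_DECODE(off);	top 4 bits
 *	vol_no  = HAMMER_VOL_DECODE(off);	next 8 bits
 *	buf_off = off & ~HAMMER_BUFMASK64;	align to the 16K buffer
 *
 * Only zone-2 (raw buffer) offsets map directly to media; blockmap-backed
 * zones (B-Tree and up) and the UNDO zone are translated to zone-2 first.
 */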
/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
{
	hammer_ref(&buffer->io.lock);
	hammer_flush_buffer_nodes(buffer);
	KKASSERT(buffer->io.lock.refs == 1);
	hammer_rel_buffer(buffer, 2);
	return(0);
}

/*
 * Reference a buffer that is either already referenced or via a specially
 * handled pointer (aka cursor->buffer).
 */
int
hammer_ref_buffer(hammer_buffer_t buffer)
{
	int error;

	hammer_ref(&buffer->io.lock);

	/*
	 * No longer loose
	 */
	if (buffer->io.mod_list == &buffer->io.hmp->lose_list) {
		TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry);
		buffer->io.mod_list = NULL;
	}

	if (buffer->ondisk == NULL || buffer->io.loading) {
		error = hammer_load_buffer(buffer, 0);
		if (error) {
			hammer_rel_buffer(buffer, 1);
			/*
			 * NOTE: buffer pointer can become stale after
			 * the above release.
			 */
		}
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Release a buffer.  We have to deal with several places where
 * another thread can ref the buffer.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int flush)
{
	hammer_volume_t volume;
	int freeme = 0;

	if (flush)
		buffer->io.flush = 1;
	crit_enter();
	if (buffer->io.lock.refs == 1) {
		++buffer->io.loading;	/* force interlock check */
		hammer_lock_ex(&buffer->io.lock);
		if (buffer->io.lock.refs == 1) {
			hammer_io_release(&buffer->io);
			hammer_flush_buffer_nodes(buffer);
			KKASSERT(TAILQ_EMPTY(&buffer->clist));

			if (buffer->io.bp == NULL &&
			    buffer->io.lock.refs == 1) {
				/*
				 * Final cleanup
				 */
				volume = buffer->volume;
				RB_REMOVE(hammer_buf_rb_tree,
					  &volume->rb_bufs_root, buffer);
				buffer->volume = NULL; /* sanity */
				hammer_rel_volume(volume, 0);
				freeme = 1;
			}
		}
		--buffer->io.loading;
		hammer_unlock(&buffer->io.lock);
	}
	hammer_unref(&buffer->io.lock);
	crit_exit();
	if (freeme) {
		KKASSERT(buffer->io.mod_list == NULL);
		--hammer_count_buffers;
		kfree(buffer, M_HAMMER);
	}
}

/*
 * Remove the zoneX translation cache for a buffer given its zone-2 offset.
 */
void
hammer_uncache_buffer(hammer_mount_t hmp, hammer_off_t buf_offset)
{
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	int vol_no;
	int error;

	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
		 HAMMER_ZONE_RAW_BUFFER);
	vol_no = HAMMER_VOL_DECODE(buf_offset);
	volume = hammer_get_volume(hmp, vol_no, &error);
	KKASSERT(volume != NULL);
	KKASSERT(buf_offset < volume->maxbuf_off);

	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
			   buf_offset);
	if (buffer)
		buffer->zoneX_offset = 0;
	hammer_rel_volume(volume, 0);
}

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * buf_offset is a conglomeration of the volume number and vol_buf_beg
 * relative buffer offset.  It must also have bit 55 set to be valid.
 * (see hammer_off_t in hammer_disk.h).
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 */
void *
hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
	     struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}
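
/*
 * Illustrative sketch (hypothetical caller, not compiled): hammer_bread()
 * is designed to be called repeatedly with a caller-held buffer pointer,
 * so successive accesses landing in the same 16K buffer skip the lookup
 * entirely:
 *
 *	struct hammer_buffer *buffer = NULL;
 *	void *data;
 *	int error;
 *
 *	data = hammer_bread(hmp, data_offset, &error, &buffer);
 *	if (data) {
 *		... inspect the on-disk bytes at data ...
 *	}
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 */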
/*
 * Access the filesystem buffer containing the specified hammer offset.
 * No disk read operation occurs.  The result buffer may contain garbage.
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 *
 * This function marks the buffer dirty but does not increment its
 * modify_refs count.
 */
void *
hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
	    struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}

/************************************************************************
 *				NODES					*
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer, and will only reference the buffer while
 * the node itself is referenced.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_nodes long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * either the buffer or cluster management code.
 *
 * The caller must pass a referenced cluster on call and will retain
 * ownership of the reference on return.  The node will acquire its own
 * additional references, if necessary.
 */
hammer_node_t
hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset,
		int isnew, int *errorp)
{
	hammer_node_t node;

	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);

	/*
	 * Locate the structure, allocating one if necessary.
	 */
again:
	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
	if (node == NULL) {
		++hammer_count_nodes;
		node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
		node->node_offset = node_offset;
		node->hmp = hmp;
		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
			--hammer_count_nodes;
			kfree(node, M_HAMMER);
			goto again;
		}
	}
	hammer_ref(&node->lock);
	if (node->ondisk)
		*errorp = 0;
	else
		*errorp = hammer_load_node(node, isnew);
	if (*errorp) {
		hammer_rel_node(node);
		node = NULL;
	}
	return(node);
}

/*
 * Reference an already-referenced node.
 */
void
hammer_ref_node(hammer_node_t node)
{
	KKASSERT(node->lock.refs > 0 && node->ondisk != NULL);
	hammer_ref(&node->lock);
}
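
/*
 * Illustrative sketch (not compiled): hammer_get_node() and
 * hammer_get_buffer() share a lock-free lookup-or-create idiom.  Because
 * the kmalloc() can block, another thread may insert the same key first;
 * the RB_INSERT collision is detected and the loser frees its copy and
 * retries:
 *
 *	again:
 *		obj = RB_LOOKUP(tree, root, key);
 *		if (obj == NULL) {
 *			obj = kmalloc(...);		may block
 *			if (RB_INSERT(tree, root, obj)) {
 *				kfree(obj, M_HAMMER);	lost the race
 *				goto again;
 *			}
 *		}
 */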
/*
 * Load a node's on-disk data reference.
 */
static int
hammer_load_node(hammer_node_t node, int isnew)
{
	hammer_buffer_t buffer;
	int error;

	error = 0;
	++node->loading;
	hammer_lock_ex(&node->lock);
	if (node->ondisk == NULL) {
		/*
		 * This is a little confusing but the gist is that
		 * node->buffer determines whether the node is on
		 * the buffer's clist and node->ondisk determines
		 * whether the buffer is referenced.
		 *
		 * We could be racing a buffer release, in which case
		 * node->buffer may become NULL while we are blocked
		 * referencing the buffer.
		 */
		if ((buffer = node->buffer) != NULL) {
			error = hammer_ref_buffer(buffer);
			if (error == 0 && node->buffer == NULL) {
				TAILQ_INSERT_TAIL(&buffer->clist,
						  node, entry);
				node->buffer = buffer;
			}
		} else {
			buffer = hammer_get_buffer(node->hmp,
						   node->node_offset, 0,
						   &error);
			if (buffer) {
				KKASSERT(error == 0);
				TAILQ_INSERT_TAIL(&buffer->clist,
						  node, entry);
				node->buffer = buffer;
			}
		}
		if (error == 0) {
			node->ondisk = (void *)((char *)buffer->ondisk +
			       (node->node_offset & HAMMER_BUFMASK));
			if (isnew == 0 &&
			    hammer_crc_test_btree(node->ondisk) == 0) {
				Debugger("CRC FAILED: B-TREE NODE");
			}
		}
	}
	--node->loading;
	hammer_unlock(&node->lock);
	return (error);
}

/*
 * Safely reference a node, interlock against flushes via the IO subsystem.
 */
hammer_node_t
hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
		     int *errorp)
{
	hammer_node_t node;

	node = *cache;
	if (node != NULL) {
		hammer_ref(&node->lock);
		if (node->ondisk)
			*errorp = 0;
		else
			*errorp = hammer_load_node(node, 0);
		if (*errorp) {
			hammer_rel_node(node);
			node = NULL;
		}
	} else {
		*errorp = ENOENT;
	}
	return(node);
}

/*
 * Release a hammer_node.  On the last release the node dereferences
 * its underlying buffer and may or may not be destroyed.
 */
void
hammer_rel_node(hammer_node_t node)
{
	hammer_buffer_t buffer;

	/*
	 * If this isn't the last ref just decrement the ref count and
	 * return.
	 */
	if (node->lock.refs > 1) {
		hammer_unref(&node->lock);
		return;
	}

	/*
	 * If there is no ondisk info or no buffer the node failed to load,
	 * remove the last reference and destroy the node.
	 */
	if (node->ondisk == NULL) {
		hammer_unref(&node->lock);
		hammer_flush_node(node);
		/* node is stale now */
		return;
	}

	/*
	 * Do final cleanups and then either destroy the node or leave it
	 * passively cached.  The buffer reference is removed regardless.
	 */
	buffer = node->buffer;
	node->ondisk = NULL;

	if ((node->flags & HAMMER_NODE_FLUSH) == 0) {
		hammer_unref(&node->lock);
		hammer_rel_buffer(buffer, 0);
		return;
	}

	/*
	 * Destroy the node.
	 */
	hammer_unref(&node->lock);
	hammer_flush_node(node);
	/* node is stale */
	hammer_rel_buffer(buffer, 0);
}

/*
 * Mark a node as deleted and release its media backing store.
 */
void
hammer_delete_node(hammer_transaction_t trans, hammer_node_t node)
{
	node->flags |= HAMMER_NODE_DELETED;
	hammer_blockmap_free(trans, node->node_offset, sizeof(*node->ondisk));
}
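
/*
 * Illustrative sketch (not compiled): the passive-cache functions below
 * let a structure such as an inode remember a node without holding a
 * reference.  The cache slot is a pointer-to-pointer so the node can
 * clear it from its own side when it is destroyed:
 *
 *	struct hammer_node *cache = NULL;	e.g. an inode's cache slot
 *
 *	hammer_cache_node(node, &cache);	remember, no ref held
 *	...
 *	node = hammer_ref_node_safe(hmp, &cache, &error);
 *	if (node) {
 *		... use node ...
 *		hammer_rel_node(node);
 *	}
 */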
/*
 * Passively cache a referenced hammer_node in *cache.  The caller may
 * release the node on return.
 */
void
hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
{
	hammer_node_t old;

	/*
	 * If the node is being deleted, don't cache it!
	 */
	if (node->flags & HAMMER_NODE_DELETED)
		return;

	/*
	 * Cache the node.  If we previously cached a different node we
	 * have to give HAMMER a chance to destroy it.
	 */
again:
	if (node->cache1 != cache) {
		if (node->cache2 != cache) {
			if ((old = *cache) != NULL) {
				KKASSERT(node->lock.refs != 0);
				hammer_uncache_node(cache);
				goto again;
			}
			if (node->cache2)
				*node->cache2 = NULL;
			node->cache2 = node->cache1;
			node->cache1 = cache;
			*cache = node;
		} else {
			struct hammer_node **tmp;
			tmp = node->cache1;
			node->cache1 = node->cache2;
			node->cache2 = tmp;
		}
	}
}

void
hammer_uncache_node(struct hammer_node **cache)
{
	hammer_node_t node;

	if ((node = *cache) != NULL) {
		*cache = NULL;
		if (node->cache1 == cache) {
			node->cache1 = node->cache2;
			node->cache2 = NULL;
		} else if (node->cache2 == cache) {
			node->cache2 = NULL;
		} else {
			panic("hammer_uncache_node: missing cache linkage");
		}
		if (node->cache1 == NULL && node->cache2 == NULL)
			hammer_flush_node(node);
	}
}

/*
 * Remove a node's cache references and destroy the node if it has no
 * other references or backing store.
 */
void
hammer_flush_node(hammer_node_t node)
{
	hammer_buffer_t buffer;

	if (node->cache1)
		*node->cache1 = NULL;
	if (node->cache2)
		*node->cache2 = NULL;
	if (node->lock.refs == 0 && node->ondisk == NULL) {
		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
		if ((buffer = node->buffer) != NULL) {
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
		--hammer_count_nodes;
		kfree(node, M_HAMMER);
	}
}

/*
 * Flush passively cached B-Tree nodes associated with this buffer.
 * This is only called when the buffer is about to be destroyed, so
 * none of the nodes should have any references.  The buffer is locked.
 *
 * We may be interlocked with the buffer.
 */
void
hammer_flush_buffer_nodes(hammer_buffer_t buffer)
{
	hammer_node_t node;

	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
		KKASSERT(node->ondisk == NULL);

		if (node->lock.refs == 0) {
			hammer_ref(&node->lock);
			node->flags |= HAMMER_NODE_FLUSH;
			hammer_rel_node(node);
		} else {
			KKASSERT(node->loading != 0);
			KKASSERT(node->buffer != NULL);
			buffer = node->buffer;
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
	}
}
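
/*
 * Illustrative sketch (not compiled): each node carries two cache
 * back-pointers, giving a tiny two-entry LRU.  In hammer_cache_node()
 * above:
 *
 *	cache1    cache2    action
 *	------    ------    --------------------------------------------
 *	other     other     old cache2 association dropped, cache1 moves
 *	                    to cache2, new slot becomes cache1 (MRU)
 *	other     cache     re-hit: cache1 and cache2 are swapped
 *	cache     any       already MRU: nothing to do
 */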
/************************************************************************
 *				ALLOCATORS				*
 ************************************************************************/

/*
 * Allocate a B-Tree node.
 */
hammer_node_t
hammer_alloc_btree(hammer_transaction_t trans, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	hammer_node_t node = NULL;
	hammer_off_t node_offset;

	node_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_BTREE_INDEX,
					    sizeof(struct hammer_node_ondisk),
					    errorp);
	if (*errorp == 0) {
		node = hammer_get_node(trans->hmp, node_offset, 1, errorp);
		hammer_modify_node_noundo(trans, node);
		bzero(node->ondisk, sizeof(*node->ondisk));
		hammer_modify_node_done(node);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
	return(node);
}
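
/*
 * Illustrative sketch (not compiled): media modifications in HAMMER are
 * bracketed by hammer_modify_*() / hammer_modify_*_done() pairs, as in
 * hammer_alloc_btree() above.  The _noundo variant is used there because
 * a freshly allocated node has no prior contents worth logging:
 *
 *	hammer_modify_node_noundo(trans, node);	open modify window
 *	bzero(node->ondisk, sizeof(*node->ondisk));
 *	hammer_modify_node_done(node);		close it
 *
 * Overwriting existing media would use the undo-generating variants
 * instead.
 */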
/*
 * The returned buffers are already appropriately marked as being modified.
 * If the caller marks them again unnecessary undo records may be generated.
 *
 * In-band data is indicated by data_bufferp == NULL.  Pass a data_len of 0
 * for zero-fill (caller modifies data_len afterwards).
 *
 * The caller is responsible for calling hammer_modify_*() prior to making
 * any additional modifications to either the returned record buffer or the
 * returned data buffer.
 */
void *
hammer_alloc_record(hammer_transaction_t trans,
		    hammer_off_t *rec_offp, u_int16_t rec_type,
		    struct hammer_buffer **rec_bufferp,
		    int32_t data_len, void **datap,
		    hammer_off_t *data_offp,
		    struct hammer_buffer **data_bufferp, int *errorp)
{
	hammer_record_ondisk_t rec;
	hammer_off_t rec_offset;
	hammer_off_t data_offset;
	int32_t reclen;

	if (datap)
		*datap = NULL;

	/*
	 * Allocate the record
	 */
	rec_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_RECORD_INDEX,
					   HAMMER_RECORD_SIZE, errorp);
	if (*errorp)
		return(NULL);
	if (data_offp)
		*data_offp = 0;

	/*
	 * Allocate data
	 */
	if (data_len) {
		if (data_bufferp == NULL) {
			switch(rec_type) {
			case HAMMER_RECTYPE_DATA:
				reclen = offsetof(struct hammer_data_record,
						  data[0]);
				break;
			case HAMMER_RECTYPE_DIRENTRY:
				reclen = offsetof(struct hammer_entry_record,
						  name[0]);
				break;
			default:
				panic("hammer_alloc_record: illegal "
				      "in-band data");
				/* NOT REACHED */
				reclen = 0;
				break;
			}
			KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE);
			data_offset = rec_offset + reclen;
		} else if (data_len < HAMMER_BUFSIZE) {
			data_offset = hammer_blockmap_alloc(trans,
						HAMMER_ZONE_SMALL_DATA_INDEX,
						data_len, errorp);
			*data_offp = data_offset;
		} else {
			data_offset = hammer_blockmap_alloc(trans,
						HAMMER_ZONE_LARGE_DATA_INDEX,
						data_len, errorp);
			*data_offp = data_offset;
		}
	} else {
		data_offset = 0;
	}
	if (*errorp) {
		hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
		return(NULL);
	}

	/*
	 * Basic return values.
	 *
	 * Note that because this is a 'new' buffer, there is no need to
	 * generate UNDO records for it.
	 */
	*rec_offp = rec_offset;
	rec = hammer_bread(trans->hmp, rec_offset, errorp, rec_bufferp);
	hammer_modify_buffer(trans, *rec_bufferp, NULL, 0);
	bzero(rec, sizeof(*rec));
	KKASSERT(*errorp == 0);
	rec->base.data_off = data_offset;
	rec->base.data_len = data_len;
	hammer_modify_buffer_done(*rec_bufferp);

	if (data_bufferp) {
		if (data_len) {
			*datap = hammer_bread(trans->hmp, data_offset, errorp,
					      data_bufferp);
			KKASSERT(*errorp == 0);
		} else {
			*datap = NULL;
		}
	} else if (data_len) {
		KKASSERT(data_offset + data_len - rec_offset <=
			 HAMMER_RECORD_SIZE);
		if (datap) {
			*datap = (void *)((char *)rec +
					  (int32_t)(data_offset - rec_offset));
		}
	} else {
		KKASSERT(datap == NULL);
	}
	KKASSERT(*errorp == 0);
	return(rec);
}

/*
 * Allocate data.  If the address of a data buffer is supplied then
 * any prior non-NULL *data_bufferp will be released and *data_bufferp
 * will be set to the related buffer.  The caller must release it when
 * finally done.  The initial *data_bufferp should be set to NULL by
 * the caller.
 *
 * The caller is responsible for making hammer_modify*() calls on the
 * *data_bufferp.
 */
void *
hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
		  hammer_off_t *data_offsetp,
		  struct hammer_buffer **data_bufferp, int *errorp)
{
	void *data;

	/*
	 * Allocate data
	 */
	if (data_len) {
		if (data_len < HAMMER_BUFSIZE) {
			*data_offsetp = hammer_blockmap_alloc(trans,
						HAMMER_ZONE_SMALL_DATA_INDEX,
						data_len, errorp);
		} else {
			*data_offsetp = hammer_blockmap_alloc(trans,
						HAMMER_ZONE_LARGE_DATA_INDEX,
						data_len, errorp);
		}
	} else {
		*data_offsetp = 0;
	}
	if (*errorp == 0 && data_bufferp) {
		if (data_len) {
			data = hammer_bread(trans->hmp, *data_offsetp, errorp,
					    data_bufferp);
			KKASSERT(*errorp == 0);
		} else {
			data = NULL;
		}
	} else {
		data = NULL;
	}
	KKASSERT(*errorp == 0);
	return(data);
}
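
/*
 * Illustrative sketch (hypothetical caller, not compiled): a typical
 * hammer_alloc_data() consumer supplies a NULL-initialized buffer
 * pointer, copies its payload in under a modify window, and releases
 * the buffer when done (src/len stand in for the caller's payload):
 *
 *	struct hammer_buffer *data_buffer = NULL;
 *	hammer_off_t data_offset;
 *	void *data;
 *	int error;
 *
 *	data = hammer_alloc_data(trans, len, &data_offset,
 *				 &data_buffer, &error);
 *	if (data) {
 *		hammer_modify_buffer(trans, data_buffer, NULL, 0);
 *		bcopy(src, data, len);
 *		hammer_modify_buffer_done(data_buffer);
 *	}
 *	if (data_buffer)
 *		hammer_rel_buffer(data_buffer, 0);
 */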
/*
 * Sync dirty buffers to the media and clean-up any loose ends.
 */
static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

int
hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
{
	struct hammer_sync_info info;

	info.error = 0;
	info.waitfor = waitfor;

	vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
		      hammer_sync_scan1, hammer_sync_scan2, &info);
	if (waitfor == MNT_WAIT)
		hammer_flusher_sync(hmp);
	else
		hammer_flusher_async(hmp);

	return(info.error);
}

static int
hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	struct hammer_inode *ip;

	ip = VTOI(vp);
	if (vp->v_type == VNON || ip == NULL ||
	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	     RB_EMPTY(&vp->v_rbdirty_tree))) {
		return(-1);
	}
	return(0);
}

static int
hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	struct hammer_sync_info *info = data;
	struct hammer_inode *ip;
	int error;

	ip = VTOI(vp);
	if (vp->v_type == VNON || vp->v_type == VBAD ||
	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	     RB_EMPTY(&vp->v_rbdirty_tree))) {
		return(0);
	}
	error = VOP_FSYNC(vp, info->waitfor);
	if (error)
		info->error = error;
	return(0);
}
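
/*
 * Illustrative note (sketch of the scan contract, not compiled):
 * vmntvnodescan() runs scan1 as a fast pre-filter without obtaining the
 * vnode -- returning -1 there skips the vnode entirely -- and only
 * vnodes that pass are handed to scan2 with the vnode held per
 * VMSC_GETVP.  That is why the dirty-state test is repeated in
 * hammer_sync_scan2() before the VOP_FSYNC() is issued: the state may
 * have changed while the vnode was being acquired.
 */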