1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 40 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 41 * of the tree), but adds two additional boundary elements which describe 42 * the left-most and right-most element a node is able to represent. In 43 * otherwords, we have boundary elements at the two ends of a B-Tree node 44 * with no valid sub-tree pointer for the right-most element. 45 * 46 * A B-Tree internal node looks like this: 47 * 48 * B N N N N N N B <-- boundary and internal elements 49 * S S S S S S S <-- subtree pointers 50 * 51 * A B-Tree leaf node basically looks like this: 52 * 53 * L L L L L L L L <-- leaf elemenets 54 * 55 * The radix for an internal node is 1 less then a leaf but we get a 56 * number of significant benefits for our troubles. 57 * The left-hand boundary (B in the left) is integrated into the first 58 * element so it doesn't require 2 elements to accomodate boundaries. 59 * 60 * The big benefit to using a B-Tree containing boundary information 61 * is that it is possible to cache pointers into the middle of the tree 62 * and not have to start searches, insertions, OR deletions at the root 63 * node. In particular, searches are able to progress in a definitive 64 * direction from any point in the tree without revisting nodes. This 65 * greatly improves the efficiency of many operations, most especially 66 * record appends. 67 * 68 * B-Trees also make the stacking of trees fairly straightforward. 69 * 70 * INSERTIONS: A search performed with the intention of doing 71 * an insert will guarantee that the terminal leaf node is not full by 72 * splitting full nodes. Splits occur top-down during the dive down the 73 * B-Tree. 74 * 75 * DELETIONS: A deletion makes no attempt to proactively balance the 76 * tree and will recursively remove nodes that become empty. If a 77 * deadlock occurs a deletion may not be able to remove an empty leaf. 78 * Deletions never allow internal nodes to become empty (that would blow 79 * up the boundaries). 80 */ 81 #include "hammer.h" 82 83 static int btree_search(hammer_cursor_t cursor, int flags); 84 static int btree_split_internal(hammer_cursor_t cursor); 85 static int btree_split_leaf(hammer_cursor_t cursor); 86 static int btree_remove(hammer_cursor_t cursor, int *ndelete); 87 static __inline int btree_node_is_full(hammer_node_ondisk_t node); 88 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 89 hammer_tid_t mirror_tid); 90 static void hammer_make_separator(hammer_base_elm_t key1, 91 hammer_base_elm_t key2, hammer_base_elm_t dest); 92 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 93 static __inline void hammer_debug_btree_elm(hammer_cursor_t cursor, 94 hammer_btree_elm_t elm, const char *s, int res); 95 static __inline void hammer_debug_btree_parent(hammer_cursor_t cursor, 96 const char *s); 97 98 /* 99 * Iterate records after a search. The cursor is iterated forwards past 100 * the current record until a record matching the key-range requirements 101 * is found. ENOENT is returned if the iteration goes past the ending 102 * key. 103 * 104 * The iteration is inclusive of key_beg and can be inclusive or exclusive 105 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 106 * 107 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 108 * may be modified by B-Tree functions. 109 * 110 * cursor->key_beg may or may not be modified by this function during 111 * the iteration. XXX future - in case of an inverted lock we may have 112 * to reinitiate the lookup and set key_beg to properly pick up where we 113 * left off. 114 * 115 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor 116 * was reverse indexed due to being moved to a parent while unlocked, 117 * and something else might have inserted an element outside the iteration 118 * range. When this case occurs the iterator just keeps iterating until 119 * it gets back into the iteration range (instead of asserting). 120 * 121 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 122 */ 123 int 124 hammer_btree_iterate(hammer_cursor_t cursor) 125 { 126 hammer_node_ondisk_t node; 127 hammer_btree_elm_t elm; 128 hammer_mount_t hmp; 129 int error = 0; 130 int r; 131 int s; 132 133 /* 134 * Skip past the current record 135 */ 136 hmp = cursor->trans->hmp; 137 node = cursor->node->ondisk; 138 if (node == NULL) 139 return(ENOENT); 140 if (cursor->index < node->count && 141 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 142 ++cursor->index; 143 } 144 145 /* 146 * HAMMER can wind up being cpu-bound. 147 */ 148 if (++hmp->check_yield > hammer_yield_check) { 149 hmp->check_yield = 0; 150 lwkt_user_yield(); 151 } 152 153 154 /* 155 * Loop until an element is found or we are done. 156 */ 157 for (;;) { 158 /* 159 * We iterate up the tree and then index over one element 160 * while we are at the last element in the current node. 161 * 162 * If we are at the root of the filesystem, cursor_up 163 * returns ENOENT. 164 * 165 * XXX this could be optimized by storing the information in 166 * the parent reference. 167 * 168 * XXX we can lose the node lock temporarily, this could mess 169 * up our scan. 170 */ 171 ++hammer_stats_btree_iterations; 172 hammer_flusher_clean_loose_ios(hmp); 173 174 if (cursor->index == node->count) { 175 if (hammer_debug_btree) { 176 hkprintf("BRACKETU %016jx[%d] -> %016jx[%d] td=%p\n", 177 (intmax_t)cursor->node->node_offset, 178 cursor->index, 179 (intmax_t)(cursor->parent ? cursor->parent->node_offset : -1), 180 cursor->parent_index, 181 curthread); 182 } 183 KKASSERT(cursor->parent == NULL || 184 cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 185 error = hammer_cursor_up(cursor); 186 if (error) 187 break; 188 /* reload stale pointer */ 189 node = cursor->node->ondisk; 190 KKASSERT(cursor->index != node->count); 191 192 /* 193 * If we are reblocking we want to return internal 194 * nodes. Note that the internal node will be 195 * returned multiple times, on each upward recursion 196 * from its children. The caller selects which 197 * revisit it cares about (usually first or last only). 198 */ 199 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 200 cursor->flags |= HAMMER_CURSOR_ATEDISK; 201 return(0); 202 } 203 ++cursor->index; 204 continue; 205 } 206 207 /* 208 * Check internal or leaf element. Determine if the record 209 * at the cursor has gone beyond the end of our range. 210 * 211 * We recurse down through internal nodes. 212 */ 213 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 214 elm = &node->elms[cursor->index]; 215 216 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 217 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 218 if (hammer_debug_btree) { 219 hammer_debug_btree_elm(cursor, elm, "BRACKETL", r); 220 hammer_debug_btree_elm(cursor, elm + 1, "BRACKETR", s); 221 } 222 223 if (r < 0) { 224 error = ENOENT; 225 break; 226 } 227 if (r == 0 && (cursor->flags & 228 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 229 error = ENOENT; 230 break; 231 } 232 233 /* 234 * Better not be zero 235 */ 236 KKASSERT(elm->internal.subtree_offset != 0); 237 238 if (s <= 0) { 239 /* 240 * If running the mirror filter see if we 241 * can skip one or more entire sub-trees. 242 * If we can we return the internal node 243 * and the caller processes the skipped 244 * range (see mirror_read). 245 */ 246 if (cursor->flags & 247 HAMMER_CURSOR_MIRROR_FILTERED) { 248 if (elm->internal.mirror_tid < 249 cursor->cmirror->mirror_tid) { 250 hammer_cursor_mirror_filter(cursor); 251 return(0); 252 } 253 } 254 } else { 255 /* 256 * Normally it would be impossible for the 257 * cursor to have gotten back-indexed, 258 * but it can happen if a node is deleted 259 * and the cursor is moved to its parent 260 * internal node. ITERATE_CHECK will be set. 261 */ 262 KKASSERT(cursor->flags & 263 HAMMER_CURSOR_ITERATE_CHECK); 264 hdkprintf("DEBUG: Caught parent seek " 265 "in internal iteration\n"); 266 } 267 268 error = hammer_cursor_down(cursor); 269 if (error) 270 break; 271 KKASSERT(cursor->index == 0); 272 /* reload stale pointer */ 273 node = cursor->node->ondisk; 274 continue; 275 } else { 276 elm = &node->elms[cursor->index]; 277 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 278 if (hammer_debug_btree) { 279 hammer_debug_btree_elm(cursor, elm, "ELEMENT", r); 280 } 281 if (r < 0) { 282 error = ENOENT; 283 break; 284 } 285 286 /* 287 * We support both end-inclusive and 288 * end-exclusive searches. 289 */ 290 if (r == 0 && 291 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 292 error = ENOENT; 293 break; 294 } 295 296 /* 297 * If ITERATE_CHECK is set an unlocked cursor may 298 * have been moved to a parent and the iterate can 299 * happen upon elements that are not in the requested 300 * range. 301 */ 302 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) { 303 s = hammer_btree_cmp(&cursor->key_beg, 304 &elm->base); 305 if (s > 0) { 306 hdkprintf("DEBUG: Caught parent seek " 307 "in leaf iteration\n"); 308 ++cursor->index; 309 continue; 310 } 311 } 312 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 313 314 /* 315 * Return the element 316 */ 317 switch(elm->leaf.base.btype) { 318 case HAMMER_BTREE_TYPE_RECORD: 319 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 320 hammer_btree_chkts(cursor->asof, &elm->base)) { 321 ++cursor->index; 322 continue; 323 } 324 error = 0; 325 break; 326 default: 327 error = EINVAL; 328 break; 329 } 330 if (error) 331 break; 332 } 333 334 /* 335 * Return entry 336 */ 337 if (hammer_debug_btree) { 338 elm = &cursor->node->ondisk->elms[cursor->index]; 339 hammer_debug_btree_elm(cursor, elm, "ITERATE", 0xffff); 340 } 341 return(0); 342 } 343 return(error); 344 } 345 346 /* 347 * We hit an internal element that we could skip as part of a mirroring 348 * scan. Calculate the entire range being skipped. 349 * 350 * It is important to include any gaps between the parent's left_bound 351 * and the node's left_bound, and same goes for the right side. 352 */ 353 static void 354 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 355 { 356 struct hammer_cmirror *cmirror; 357 hammer_node_ondisk_t ondisk; 358 hammer_btree_elm_t elm; 359 360 ondisk = cursor->node->ondisk; 361 cmirror = cursor->cmirror; 362 363 /* 364 * Calculate the skipped range 365 */ 366 elm = &ondisk->elms[cursor->index]; 367 if (cursor->index == 0) 368 cmirror->skip_beg = *cursor->left_bound; 369 else 370 cmirror->skip_beg = elm->internal.base; 371 while (cursor->index < ondisk->count) { 372 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 373 break; 374 ++cursor->index; 375 ++elm; 376 } 377 if (cursor->index == ondisk->count) 378 cmirror->skip_end = *cursor->right_bound; 379 else 380 cmirror->skip_end = elm->internal.base; 381 382 /* 383 * clip the returned result. 384 */ 385 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 386 cmirror->skip_beg = cursor->key_beg; 387 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 388 cmirror->skip_end = cursor->key_end; 389 } 390 391 /* 392 * Iterate in the reverse direction. This is used by the pruning code to 393 * avoid overlapping records. 394 */ 395 int 396 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 397 { 398 hammer_node_ondisk_t node; 399 hammer_btree_elm_t elm; 400 hammer_mount_t hmp; 401 int error = 0; 402 int r; 403 int s; 404 405 /* mirror filtering not supported for reverse iteration */ 406 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 407 408 /* 409 * Skip past the current record. For various reasons the cursor 410 * may end up set to -1 or set to point at the end of the current 411 * node. These cases must be addressed. 412 */ 413 node = cursor->node->ondisk; 414 if (node == NULL) 415 return(ENOENT); 416 if (cursor->index != -1 && 417 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 418 --cursor->index; 419 } 420 if (cursor->index == cursor->node->ondisk->count) 421 --cursor->index; 422 423 /* 424 * HAMMER can wind up being cpu-bound. 425 */ 426 hmp = cursor->trans->hmp; 427 if (++hmp->check_yield > hammer_yield_check) { 428 hmp->check_yield = 0; 429 lwkt_user_yield(); 430 } 431 432 /* 433 * Loop until an element is found or we are done. 434 */ 435 for (;;) { 436 ++hammer_stats_btree_iterations; 437 hammer_flusher_clean_loose_ios(hmp); 438 439 /* 440 * We iterate up the tree and then index over one element 441 * while we are at the last element in the current node. 442 */ 443 if (cursor->index == -1) { 444 error = hammer_cursor_up(cursor); 445 if (error) { 446 cursor->index = 0; /* sanity */ 447 break; 448 } 449 /* reload stale pointer */ 450 node = cursor->node->ondisk; 451 KKASSERT(cursor->index != node->count); 452 --cursor->index; 453 continue; 454 } 455 456 /* 457 * Check internal or leaf element. Determine if the record 458 * at the cursor has gone beyond the end of our range. 459 * 460 * We recurse down through internal nodes. 461 */ 462 KKASSERT(cursor->index != node->count); 463 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 464 elm = &node->elms[cursor->index]; 465 466 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 467 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 468 if (hammer_debug_btree) { 469 hammer_debug_btree_elm(cursor, elm, "BRACKETL", r); 470 hammer_debug_btree_elm(cursor, elm + 1, "BRACKETR", s); 471 } 472 473 if (s >= 0) { 474 error = ENOENT; 475 break; 476 } 477 478 /* 479 * It shouldn't be possible to be seeked past key_end, 480 * even if the cursor got moved to a parent. 481 */ 482 KKASSERT(r >= 0); 483 484 /* 485 * Better not be zero 486 */ 487 KKASSERT(elm->internal.subtree_offset != 0); 488 489 error = hammer_cursor_down(cursor); 490 if (error) 491 break; 492 KKASSERT(cursor->index == 0); 493 /* reload stale pointer */ 494 node = cursor->node->ondisk; 495 496 /* this can assign -1 if the leaf was empty */ 497 cursor->index = node->count - 1; 498 continue; 499 } else { 500 elm = &node->elms[cursor->index]; 501 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 502 if (hammer_debug_btree) { 503 hammer_debug_btree_elm(cursor, elm, "ELEMENTR", s); 504 } 505 if (s > 0) { 506 error = ENOENT; 507 break; 508 } 509 510 /* 511 * It shouldn't be possible to be seeked past key_end, 512 * even if the cursor got moved to a parent. 513 */ 514 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 515 516 /* 517 * Return the element 518 */ 519 switch(elm->leaf.base.btype) { 520 case HAMMER_BTREE_TYPE_RECORD: 521 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 522 hammer_btree_chkts(cursor->asof, &elm->base)) { 523 --cursor->index; 524 continue; 525 } 526 error = 0; 527 break; 528 default: 529 error = EINVAL; 530 break; 531 } 532 if (error) 533 break; 534 } 535 536 /* 537 * Return entry 538 */ 539 if (hammer_debug_btree) { 540 elm = &cursor->node->ondisk->elms[cursor->index]; 541 hammer_debug_btree_elm(cursor, elm, "ITERATER", 0xffff); 542 } 543 return(0); 544 } 545 return(error); 546 } 547 548 /* 549 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 550 * could not be found, EDEADLK if inserting and a retry is needed, and a 551 * fatal error otherwise. When retrying, the caller must terminate the 552 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 553 * 554 * The cursor is suitably positioned for a deletion on success, and suitably 555 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 556 * specified. 557 * 558 * The cursor may begin anywhere, the search will traverse the tree in 559 * either direction to locate the requested element. 560 * 561 * Most of the logic implementing historical searches is handled here. We 562 * do an initial lookup with create_tid set to the asof TID. Due to the 563 * way records are laid out, a backwards iteration may be required if 564 * ENOENT is returned to locate the historical record. Here's the 565 * problem: 566 * 567 * create_tid: 10 15 20 568 * LEAF1 LEAF2 569 * records: (11) (18) 570 * 571 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 572 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 573 * not visible and thus causes ENOENT to be returned. We really need 574 * to check record 11 in LEAF1. If it also fails then the search fails 575 * (e.g. it might represent the range 11-16 and thus still not match our 576 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 577 * further iterations. 578 * 579 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 580 * and the cursor->create_check TID if an iteration might be needed. 581 * In the above example create_check would be set to 14. 582 */ 583 int 584 hammer_btree_lookup(hammer_cursor_t cursor) 585 { 586 int error; 587 588 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 589 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 590 cursor->trans->sync_lock_refs > 0); 591 ++hammer_stats_btree_lookups; 592 if (cursor->flags & HAMMER_CURSOR_ASOF) { 593 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 594 cursor->key_beg.create_tid = cursor->asof; 595 for (;;) { 596 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 597 error = btree_search(cursor, 0); 598 if (error != ENOENT || 599 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 600 /* 601 * Stop if no error. 602 * Stop if error other then ENOENT. 603 * Stop if ENOENT and not special case. 604 */ 605 break; 606 } 607 if (hammer_debug_btree) { 608 hkprintf("CREATE_CHECK %016jx\n", 609 (intmax_t)cursor->create_check); 610 } 611 cursor->key_beg.create_tid = cursor->create_check; 612 /* loop */ 613 } 614 } else { 615 error = btree_search(cursor, 0); 616 } 617 if (error == 0) 618 error = hammer_btree_extract(cursor, cursor->flags); 619 return(error); 620 } 621 622 /* 623 * Execute the logic required to start an iteration. The first record 624 * located within the specified range is returned and iteration control 625 * flags are adjusted for successive hammer_btree_iterate() calls. 626 * 627 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 628 * in a loop without worrying about it. Higher-level merged searches will 629 * adjust the flag appropriately. 630 */ 631 int 632 hammer_btree_first(hammer_cursor_t cursor) 633 { 634 int error; 635 636 error = hammer_btree_lookup(cursor); 637 if (error == ENOENT) { 638 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 639 error = hammer_btree_iterate(cursor); 640 } 641 cursor->flags |= HAMMER_CURSOR_ATEDISK; 642 return(error); 643 } 644 645 /* 646 * Similarly but for an iteration in the reverse direction. 647 * 648 * Set ATEDISK when iterating backwards to skip the current entry, 649 * which after an ENOENT lookup will be pointing beyond our end point. 650 * 651 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 652 * in a loop without worrying about it. Higher-level merged searches will 653 * adjust the flag appropriately. 654 */ 655 int 656 hammer_btree_last(hammer_cursor_t cursor) 657 { 658 struct hammer_base_elm save; 659 int error; 660 661 save = cursor->key_beg; 662 cursor->key_beg = cursor->key_end; 663 error = hammer_btree_lookup(cursor); 664 cursor->key_beg = save; 665 if (error == ENOENT || 666 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 667 cursor->flags |= HAMMER_CURSOR_ATEDISK; 668 error = hammer_btree_iterate_reverse(cursor); 669 } 670 cursor->flags |= HAMMER_CURSOR_ATEDISK; 671 return(error); 672 } 673 674 /* 675 * Extract the record and/or data associated with the cursor's current 676 * position. Any prior record or data stored in the cursor is replaced. 677 * 678 * NOTE: All extractions occur at the leaf of the B-Tree. 679 */ 680 int 681 hammer_btree_extract(hammer_cursor_t cursor, int flags) 682 { 683 hammer_node_ondisk_t node; 684 hammer_btree_elm_t elm; 685 hammer_off_t data_off; 686 hammer_mount_t hmp; 687 int32_t data_len; 688 int error; 689 690 /* 691 * Certain types of corruption can result in a NULL node pointer. 692 */ 693 if (cursor->node == NULL) { 694 hkprintf("NULL cursor->node, filesystem might " 695 "have gotten corrupted\n"); 696 return (EINVAL); 697 } 698 699 /* 700 * The case where the data reference resolves to the same buffer 701 * as the record reference must be handled. 702 */ 703 node = cursor->node->ondisk; 704 elm = &node->elms[cursor->index]; 705 cursor->data = NULL; 706 hmp = cursor->node->hmp; 707 708 /* 709 * There is nothing to extract for an internal element. 710 */ 711 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 712 return(EINVAL); 713 714 /* 715 * Only record types have data. 716 */ 717 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 718 cursor->leaf = &elm->leaf; 719 720 /* 721 * Returns here unless HAMMER_CURSOR_GET_DATA is set. 722 */ 723 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 724 return(0); 725 726 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 727 return(EINVAL); 728 data_off = elm->leaf.data_offset; 729 data_len = elm->leaf.data_len; 730 if (data_off == 0) 731 return(0); 732 733 /* 734 * Load the data 735 */ 736 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 737 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 738 &error, &cursor->data_buffer); 739 740 /* 741 * Mark the data buffer as not being meta-data if it isn't 742 * meta-data (sometimes bulk data is accessed via a volume 743 * block device). 744 */ 745 if (error == 0) { 746 switch(elm->leaf.base.rec_type) { 747 case HAMMER_RECTYPE_DATA: 748 case HAMMER_RECTYPE_DB: 749 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0) 750 break; 751 if (hammer_double_buffer == 0 || 752 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) { 753 hammer_io_notmeta(cursor->data_buffer); 754 } 755 break; 756 default: 757 break; 758 } 759 } 760 761 /* 762 * Deal with CRC errors on the extracted data. 763 */ 764 if (error == 0 && 765 hammer_crc_test_leaf(hmp->version, cursor->data, &elm->leaf) == 0) { 766 hdkprintf("CRC DATA @ %016jx/%d FAILED\n", 767 (intmax_t)elm->leaf.data_offset, elm->leaf.data_len); 768 if (hammer_debug_critical) 769 Debugger("CRC FAILED: DATA"); 770 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 771 error = EDOM; /* less critical (mirroring) */ 772 else 773 error = EIO; /* critical */ 774 } 775 return(error); 776 } 777 778 779 /* 780 * Insert a leaf element into the B-Tree at the current cursor position. 781 * The cursor is positioned such that the element at and beyond the cursor 782 * are shifted to make room for the new record. 783 * 784 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 785 * flag set and that call must return ENOENT before this function can be 786 * called. ENOSPC is returned if there is no room to insert a new record. 787 * 788 * The caller may depend on the cursor's exclusive lock after return to 789 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 790 */ 791 int 792 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 793 int *doprop) 794 { 795 hammer_node_ondisk_t node; 796 int i; 797 int error; 798 799 *doprop = 0; 800 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 801 return(error); 802 ++hammer_stats_btree_inserts; 803 804 /* 805 * Insert the element at the leaf node and update the count in the 806 * parent. It is possible for parent to be NULL, indicating that 807 * the filesystem's ROOT B-Tree node is a leaf itself, which is 808 * possible. The root inode can never be deleted so the leaf should 809 * never be empty. 810 * 811 * Remember that leaf nodes do not have boundaries. 812 */ 813 hammer_modify_node_all(cursor->trans, cursor->node); 814 node = cursor->node->ondisk; 815 i = cursor->index; 816 KKASSERT(elm->base.btype != 0); 817 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 818 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 819 if (i != node->count) { 820 bcopy(&node->elms[i], &node->elms[i+1], 821 (node->count - i) * sizeof(*elm)); 822 } 823 node->elms[i].leaf = *elm; 824 ++node->count; 825 hammer_cursor_inserted_element(cursor->node, i); 826 827 /* 828 * Update the leaf node's aggregate mirror_tid for mirroring 829 * support. 830 */ 831 if (node->mirror_tid < elm->base.delete_tid) { 832 node->mirror_tid = elm->base.delete_tid; 833 *doprop = 1; 834 } 835 if (node->mirror_tid < elm->base.create_tid) { 836 node->mirror_tid = elm->base.create_tid; 837 *doprop = 1; 838 } 839 hammer_modify_node_done(cursor->node); 840 841 /* 842 * Debugging sanity checks. 843 */ 844 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 845 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 846 if (i) { 847 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 848 } 849 if (i != node->count - 1) 850 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 851 852 return(0); 853 } 854 855 /* 856 * Delete a record from the B-Tree at the current cursor position. 857 * The cursor is positioned such that the current element is the one 858 * to be deleted. 859 * 860 * On return the cursor will be positioned after the deleted element and 861 * MAY point to an internal node. It will be suitable for the continuation 862 * of an iteration but not for an insertion or deletion. 863 * 864 * Deletions will attempt to partially rebalance the B-Tree in an upward 865 * direction, but will terminate rather then deadlock. Empty internal nodes 866 * are never allowed by a deletion which deadlocks may end up giving us an 867 * empty leaf. The pruner will clean up and rebalance the tree. 868 * 869 * This function can return EDEADLK, requiring the caller to retry the 870 * operation after clearing the deadlock. 871 * 872 * This function will store the number of deleted btree nodes in *ndelete 873 * if ndelete is not NULL. 874 */ 875 int 876 hammer_btree_delete(hammer_cursor_t cursor, int *ndelete) 877 { 878 hammer_node_ondisk_t ondisk; 879 hammer_node_t node; 880 hammer_node_t parent __debugvar; 881 int error; 882 int i; 883 884 KKASSERT (cursor->trans->sync_lock_refs > 0); 885 if (ndelete) 886 *ndelete = 0; 887 if ((error = hammer_cursor_upgrade(cursor)) != 0) 888 return(error); 889 ++hammer_stats_btree_deletes; 890 891 /* 892 * Delete the element from the leaf node. 893 * 894 * Remember that leaf nodes do not have boundaries. 895 */ 896 node = cursor->node; 897 ondisk = node->ondisk; 898 i = cursor->index; 899 900 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 901 KKASSERT(i >= 0 && i < ondisk->count); 902 hammer_modify_node_all(cursor->trans, node); 903 if (i + 1 != ondisk->count) { 904 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 905 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 906 } 907 --ondisk->count; 908 hammer_modify_node_done(node); 909 hammer_cursor_deleted_element(node, i); 910 911 /* 912 * Validate local parent 913 */ 914 if (ondisk->parent) { 915 parent = cursor->parent; 916 917 KKASSERT(parent != NULL); 918 KKASSERT(parent->node_offset == ondisk->parent); 919 } 920 921 /* 922 * If the leaf becomes empty it must be detached from the parent, 923 * potentially recursing through to the filesystem root. 924 * 925 * This may reposition the cursor at one of the parent's of the 926 * current node. 927 * 928 * Ignore deadlock errors, that simply means that btree_remove 929 * was unable to recurse and had to leave us with an empty leaf. 930 */ 931 KKASSERT(cursor->index <= ondisk->count); 932 if (ondisk->count == 0) { 933 error = btree_remove(cursor, ndelete); 934 if (error == EDEADLK) 935 error = 0; 936 } else { 937 error = 0; 938 } 939 KKASSERT(cursor->parent == NULL || 940 cursor->parent_index < cursor->parent->ondisk->count); 941 return(error); 942 } 943 944 /* 945 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE 946 * 947 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 948 * 949 * The search can begin ANYWHERE in the B-Tree. As a first step the search 950 * iterates up the tree as necessary to properly position itself prior to 951 * actually doing the sarch. 952 * 953 * INSERTIONS: The search will split full nodes and leaves on its way down 954 * and guarentee that the leaf it ends up on is not full. If we run out 955 * of space the search continues to the leaf, but ENOSPC is returned. 956 * 957 * The search is only guarenteed to end up on a leaf if an error code of 0 958 * is returned, or if inserting and an error code of ENOENT is returned. 959 * Otherwise it can stop at an internal node. On success a search returns 960 * a leaf node. 961 * 962 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 963 * filesystem, and it is not simple code. Please note the following facts: 964 * 965 * - Internal node recursions have a boundary on the left AND right. The 966 * right boundary is non-inclusive. The create_tid is a generic part 967 * of the key for internal nodes. 968 * 969 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 970 * historical search. ASOF and INSERT are mutually exclusive. When 971 * doing an as-of lookup btree_search() checks for a right-edge boundary 972 * case. If while recursing down the left-edge differs from the key 973 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 974 * with cursor->create_check. This is used by btree_lookup() to iterate. 975 * The iteration backwards because as-of searches can wind up going 976 * down the wrong branch of the B-Tree. 977 */ 978 static 979 int 980 btree_search(hammer_cursor_t cursor, int flags) 981 { 982 hammer_node_ondisk_t node; 983 hammer_btree_elm_t elm; 984 int error; 985 int enospc = 0; 986 int i; 987 int r; 988 int s; 989 990 flags |= cursor->flags; 991 ++hammer_stats_btree_searches; 992 993 if (hammer_debug_btree) { 994 hammer_debug_btree_elm(cursor, 995 (hammer_btree_elm_t)&cursor->key_beg, 996 "SEARCH", 0xffff); 997 if (cursor->parent) 998 hammer_debug_btree_parent(cursor, "SEARCHP"); 999 } 1000 1001 /* 1002 * Move our cursor up the tree until we find a node whos range covers 1003 * the key we are trying to locate. 1004 * 1005 * The left bound is inclusive, the right bound is non-inclusive. 1006 * It is ok to cursor up too far. 1007 */ 1008 for (;;) { 1009 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 1010 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 1011 if (r >= 0 && s < 0) 1012 break; 1013 KKASSERT(cursor->parent); 1014 ++hammer_stats_btree_iterations; 1015 error = hammer_cursor_up(cursor); 1016 if (error) 1017 goto done; 1018 } 1019 1020 /* 1021 * The delete-checks below are based on node, not parent. Set the 1022 * initial delete-check based on the parent. 1023 */ 1024 if (r == 1) { 1025 KKASSERT(cursor->left_bound->create_tid != 1); 1026 cursor->create_check = cursor->left_bound->create_tid - 1; 1027 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1028 } 1029 1030 /* 1031 * We better have ended up with a node somewhere. 1032 */ 1033 KKASSERT(cursor->node != NULL); 1034 1035 /* 1036 * If we are inserting we can't start at a full node if the parent 1037 * is also full (because there is no way to split the node), 1038 * continue running up the tree until the requirement is satisfied 1039 * or we hit the root of the filesystem. 1040 * 1041 * (If inserting we aren't doing an as-of search so we don't have 1042 * to worry about create_check). 1043 */ 1044 while (flags & HAMMER_CURSOR_INSERT) { 1045 if (btree_node_is_full(cursor->node->ondisk) == 0) 1046 break; 1047 if (cursor->node->ondisk->parent == 0 || 1048 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1049 break; 1050 } 1051 ++hammer_stats_btree_iterations; 1052 error = hammer_cursor_up(cursor); 1053 /* node may have become stale */ 1054 if (error) 1055 goto done; 1056 } 1057 1058 /* 1059 * Push down through internal nodes to locate the requested key. 1060 */ 1061 node = cursor->node->ondisk; 1062 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1063 /* 1064 * Scan the node to find the subtree index to push down into. 1065 * We go one-past, then back-up. 1066 * 1067 * We must proactively remove deleted elements which may 1068 * have been left over from a deadlocked btree_remove(). 1069 * 1070 * The left and right boundaries are included in the loop 1071 * in order to detect edge cases. 1072 * 1073 * If the separator only differs by create_tid (r == 1) 1074 * and we are doing an as-of search, we may end up going 1075 * down a branch to the left of the one containing the 1076 * desired key. This requires numerous special cases. 1077 */ 1078 ++hammer_stats_btree_iterations; 1079 if (hammer_debug_btree) { 1080 hkprintf("SEARCH-I %016jx count=%d\n", 1081 (intmax_t)cursor->node->node_offset, 1082 node->count); 1083 } 1084 1085 /* 1086 * Try to shortcut the search before dropping into the 1087 * linear loop. Locate the first node where r <= 1. 1088 */ 1089 i = hammer_btree_search_node(&cursor->key_beg, node); 1090 while (i <= node->count) { 1091 ++hammer_stats_btree_elements; 1092 elm = &node->elms[i]; 1093 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1094 if (hammer_debug_btree > 2) { 1095 hkprintf(" IELM %p [%d] r=%d\n", 1096 &node->elms[i], i, r); 1097 } 1098 if (r < 0) 1099 break; 1100 if (r == 1) { 1101 KKASSERT(elm->base.create_tid != 1); 1102 cursor->create_check = elm->base.create_tid - 1; 1103 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1104 } 1105 ++i; 1106 } 1107 if (hammer_debug_btree) { 1108 hkprintf("SEARCH-I preI=%d/%d r=%d\n", 1109 i, node->count, r); 1110 } 1111 1112 /* 1113 * The first two cases (i == 0 or i == node->count + 1) 1114 * occur when the parent's idea of the boundary 1115 * is wider then the child's idea of the boundary, and 1116 * require special handling. If not inserting we can 1117 * terminate the search early for these cases but the 1118 * child's boundaries cannot be unconditionally modified. 1119 * 1120 * The last case (neither of the above) fits in child's 1121 * idea of the boundary, so we can simply push down the 1122 * cursor. 1123 */ 1124 if (i == 0) { 1125 /* 1126 * If i == 0 the search terminated to the LEFT of the 1127 * left_boundary but to the RIGHT of the parent's left 1128 * boundary. 1129 */ 1130 uint8_t save; 1131 1132 elm = &node->elms[0]; 1133 1134 /* 1135 * If we aren't inserting we can stop here. 1136 */ 1137 if ((flags & (HAMMER_CURSOR_INSERT | 1138 HAMMER_CURSOR_PRUNING)) == 0) { 1139 cursor->index = 0; 1140 return(ENOENT); 1141 } 1142 1143 /* 1144 * Correct a left-hand boundary mismatch. 1145 * 1146 * We can only do this if we can upgrade the lock, 1147 * and synchronized as a background cursor (i.e. 1148 * inserting or pruning). 1149 * 1150 * WARNING: We can only do this if inserting, i.e. 1151 * we are running on the backend. 1152 */ 1153 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1154 return(error); 1155 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1156 hammer_modify_node_field(cursor->trans, cursor->node, 1157 elms[0]); 1158 save = node->elms[0].base.btype; 1159 node->elms[0].base = *cursor->left_bound; 1160 node->elms[0].base.btype = save; 1161 hammer_modify_node_done(cursor->node); 1162 } else if (i == node->count + 1) { 1163 /* 1164 * If i == node->count + 1 the search terminated to 1165 * the RIGHT of the right boundary but to the LEFT 1166 * of the parent's right boundary. If we aren't 1167 * inserting we can stop here. 1168 * 1169 * Note that the last element in this case is 1170 * elms[i-2] prior to adjustments to 'i'. 1171 */ 1172 --i; 1173 if ((flags & (HAMMER_CURSOR_INSERT | 1174 HAMMER_CURSOR_PRUNING)) == 0) { 1175 cursor->index = i; 1176 return (ENOENT); 1177 } 1178 1179 /* 1180 * Correct a right-hand boundary mismatch. 1181 * (actual push-down record is i-2 prior to 1182 * adjustments to i). 1183 * 1184 * We can only do this if we can upgrade the lock, 1185 * and synchronized as a background cursor (i.e. 1186 * inserting or pruning). 1187 * 1188 * WARNING: We can only do this if inserting, i.e. 1189 * we are running on the backend. 1190 */ 1191 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1192 return(error); 1193 elm = &node->elms[i]; 1194 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1195 hammer_modify_node(cursor->trans, cursor->node, 1196 &elm->base, sizeof(elm->base)); 1197 elm->base = *cursor->right_bound; 1198 hammer_modify_node_done(cursor->node); 1199 --i; 1200 } else { 1201 /* 1202 * The push-down index is now i - 1. If we had 1203 * terminated on the right boundary this will point 1204 * us at the last element. 1205 */ 1206 --i; 1207 } 1208 cursor->index = i; 1209 elm = &node->elms[i]; 1210 1211 if (hammer_debug_btree) { 1212 hammer_debug_btree_elm(cursor, elm, "RESULT-I", 0xffff); 1213 } 1214 1215 /* 1216 * We better have a valid subtree offset. 1217 */ 1218 KKASSERT(elm->internal.subtree_offset != 0); 1219 1220 /* 1221 * Handle insertion and deletion requirements. 1222 * 1223 * If inserting split full nodes. The split code will 1224 * adjust cursor->node and cursor->index if the current 1225 * index winds up in the new node. 1226 * 1227 * If inserting and a left or right edge case was detected, 1228 * we cannot correct the left or right boundary and must 1229 * prepend and append an empty leaf node in order to make 1230 * the boundary correction. 1231 * 1232 * If we run out of space we set enospc but continue on 1233 * to a leaf. 1234 */ 1235 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1236 if (btree_node_is_full(node)) { 1237 error = btree_split_internal(cursor); 1238 if (error) { 1239 if (error != ENOSPC) 1240 goto done; 1241 enospc = 1; 1242 } 1243 /* 1244 * reload stale pointers 1245 */ 1246 i = cursor->index; 1247 node = cursor->node->ondisk; 1248 } 1249 } 1250 1251 /* 1252 * Push down (push into new node, existing node becomes 1253 * the parent) and continue the search. 1254 */ 1255 error = hammer_cursor_down(cursor); 1256 /* node may have become stale */ 1257 if (error) 1258 goto done; 1259 node = cursor->node->ondisk; 1260 } 1261 1262 /* 1263 * We are at a leaf, do a linear search of the key array. 1264 * 1265 * On success the index is set to the matching element and 0 1266 * is returned. 1267 * 1268 * On failure the index is set to the insertion point and ENOENT 1269 * is returned. 1270 * 1271 * Boundaries are not stored in leaf nodes, so the index can wind 1272 * up to the left of element 0 (index == 0) or past the end of 1273 * the array (index == node->count). It is also possible that the 1274 * leaf might be empty. 1275 */ 1276 ++hammer_stats_btree_iterations; 1277 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1278 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1279 if (hammer_debug_btree) { 1280 hkprintf("SEARCH-L %016jx count=%d\n", 1281 (intmax_t)cursor->node->node_offset, 1282 node->count); 1283 } 1284 1285 /* 1286 * Try to shortcut the search before dropping into the 1287 * linear loop. Locate the first node where r <= 1. 1288 */ 1289 i = hammer_btree_search_node(&cursor->key_beg, node); 1290 while (i < node->count) { 1291 ++hammer_stats_btree_elements; 1292 elm = &node->elms[i]; 1293 1294 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1295 1296 if (hammer_debug_btree > 1) 1297 hkprintf(" LELM %p [%d] r=%d\n", &node->elms[i], i, r); 1298 1299 /* 1300 * We are at a record element. Stop if we've flipped past 1301 * key_beg, not counting the create_tid test. Allow the 1302 * r == 1 case (key_beg > element but differs only by its 1303 * create_tid) to fall through to the AS-OF check. 1304 */ 1305 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1306 1307 if (r < 0) 1308 goto failed; 1309 if (r > 1) { 1310 ++i; 1311 continue; 1312 } 1313 1314 /* 1315 * Check our as-of timestamp against the element. 1316 */ 1317 if (flags & HAMMER_CURSOR_ASOF) { 1318 if (hammer_btree_chkts(cursor->asof, 1319 &node->elms[i].base) != 0) { 1320 ++i; 1321 continue; 1322 } 1323 /* success */ 1324 } else { 1325 if (r > 0) { /* can only be +1 */ 1326 ++i; 1327 continue; 1328 } 1329 /* success */ 1330 } 1331 cursor->index = i; 1332 error = 0; 1333 if (hammer_debug_btree) { 1334 hkprintf("RESULT-L %016jx[%d] (SUCCESS)\n", 1335 (intmax_t)cursor->node->node_offset, i); 1336 } 1337 goto done; 1338 } 1339 1340 /* 1341 * The search of the leaf node failed. i is the insertion point. 1342 */ 1343 failed: 1344 if (hammer_debug_btree) { 1345 hkprintf("RESULT-L %016jx[%d] (FAILED)\n", 1346 (intmax_t)cursor->node->node_offset, i); 1347 } 1348 1349 /* 1350 * No exact match was found, i is now at the insertion point. 1351 * 1352 * If inserting split a full leaf before returning. This 1353 * may have the side effect of adjusting cursor->node and 1354 * cursor->index. 1355 */ 1356 cursor->index = i; 1357 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1358 btree_node_is_full(node)) { 1359 error = btree_split_leaf(cursor); 1360 if (error) { 1361 if (error != ENOSPC) 1362 goto done; 1363 enospc = 1; 1364 } 1365 /* 1366 * reload stale pointers 1367 */ 1368 /* NOT USED 1369 i = cursor->index; 1370 node = &cursor->node->internal; 1371 */ 1372 } 1373 1374 /* 1375 * We reached a leaf but did not find the key we were looking for. 1376 * If this is an insert we will be properly positioned for an insert 1377 * (ENOENT) or unable to insert (ENOSPC). 1378 */ 1379 error = enospc ? ENOSPC : ENOENT; 1380 done: 1381 return(error); 1382 } 1383 1384 /* 1385 * Heuristical search for the first element whos comparison is <= 1. May 1386 * return an index whos compare result is > 1 but may only return an index 1387 * whos compare result is <= 1 if it is the first element with that result. 1388 */ 1389 int 1390 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1391 { 1392 int b; 1393 int s; 1394 int i; 1395 int r; 1396 1397 /* 1398 * Don't bother if the node does not have very many elements 1399 */ 1400 b = 0; 1401 s = node->count; 1402 while (s - b > 4) { 1403 i = b + (s - b) / 2; 1404 ++hammer_stats_btree_elements; 1405 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1406 if (r <= 1) { 1407 s = i; 1408 } else { 1409 b = i; 1410 } 1411 } 1412 return(b); 1413 } 1414 1415 1416 /************************************************************************ 1417 * SPLITTING AND MERGING * 1418 ************************************************************************ 1419 * 1420 * These routines do all the dirty work required to split and merge nodes. 1421 */ 1422 1423 /* 1424 * Split an internal node into two nodes and move the separator at the split 1425 * point to the parent. 1426 * 1427 * (cursor->node, cursor->index) indicates the element the caller intends 1428 * to push into. We will adjust node and index if that element winds 1429 * up in the split node. 1430 * 1431 * If we are at the root of the filesystem a new root must be created with 1432 * two elements, one pointing to the original root and one pointing to the 1433 * newly allocated split node. 1434 */ 1435 static 1436 int 1437 btree_split_internal(hammer_cursor_t cursor) 1438 { 1439 hammer_node_ondisk_t ondisk; 1440 hammer_node_t node; 1441 hammer_node_t parent; 1442 hammer_node_t new_node; 1443 hammer_btree_elm_t elm; 1444 hammer_btree_elm_t parent_elm; 1445 struct hammer_node_lock lockroot; 1446 hammer_mount_t hmp = cursor->trans->hmp; 1447 int parent_index; 1448 int made_root; 1449 int split; 1450 int error; 1451 int i; 1452 const int esize = sizeof(*elm); 1453 1454 hammer_node_lock_init(&lockroot, cursor->node); 1455 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL); 1456 if (error) 1457 goto done; 1458 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1459 goto done; 1460 ++hammer_stats_btree_splits; 1461 1462 /* 1463 * Calculate the split point. If the insertion point is at the 1464 * end of the leaf we adjust the split point significantly to the 1465 * right to try to optimize node fill and flag it. If we hit 1466 * that same leaf again our heuristic failed and we don't try 1467 * to optimize node fill (it could lead to a degenerate case). 1468 */ 1469 node = cursor->node; 1470 ondisk = node->ondisk; 1471 KKASSERT(ondisk->count > 4); 1472 if (cursor->index == ondisk->count && 1473 (node->flags & HAMMER_NODE_NONLINEAR) == 0) { 1474 split = (ondisk->count + 1) * 3 / 4; 1475 node->flags |= HAMMER_NODE_NONLINEAR; 1476 } else { 1477 /* 1478 * We are splitting but elms[split] will be promoted to 1479 * the parent, leaving the right hand node with one less 1480 * element. If the insertion point will be on the 1481 * left-hand side adjust the split point to give the 1482 * right hand side one additional node. 1483 */ 1484 split = (ondisk->count + 1) / 2; 1485 if (cursor->index <= split) 1486 --split; 1487 } 1488 1489 /* 1490 * If we are at the root of the filesystem, create a new root node 1491 * with 1 element and split normally. Avoid making major 1492 * modifications until we know the whole operation will work. 1493 */ 1494 if (ondisk->parent == 0) { 1495 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1496 if (parent == NULL) 1497 goto done; 1498 hammer_lock_ex(&parent->lock); 1499 hammer_modify_node_noundo(cursor->trans, parent); 1500 ondisk = parent->ondisk; 1501 ondisk->count = 1; 1502 ondisk->parent = 0; 1503 ondisk->mirror_tid = node->ondisk->mirror_tid; 1504 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1505 ondisk->elms[0].base = hmp->root_btree_beg; 1506 ondisk->elms[0].base.btype = node->ondisk->type; 1507 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1508 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1509 ondisk->elms[1].base = hmp->root_btree_end; 1510 hammer_modify_node_done(parent); 1511 made_root = 1; 1512 parent_index = 0; /* index of current node in parent */ 1513 } else { 1514 made_root = 0; 1515 parent = cursor->parent; 1516 parent_index = cursor->parent_index; 1517 } 1518 1519 /* 1520 * Split node into new_node at the split point. 1521 * 1522 * B O O O P N N B <-- P = node->elms[split] (index 4) 1523 * 0 1 2 3 4 5 6 <-- subtree indices 1524 * 1525 * x x P x x 1526 * s S S s 1527 * / \ 1528 * B O O O B B N N B <--- inner boundary points are 'P' 1529 * 0 1 2 3 4 5 6 1530 */ 1531 new_node = hammer_alloc_btree(cursor->trans, 0, &error); 1532 if (new_node == NULL) { 1533 if (made_root) { 1534 hammer_unlock(&parent->lock); 1535 hammer_delete_node(cursor->trans, parent); 1536 hammer_rel_node(parent); 1537 } 1538 goto done; 1539 } 1540 hammer_lock_ex(&new_node->lock); 1541 1542 /* 1543 * Create the new node. P becomes the left-hand boundary in the 1544 * new node. Copy the right-hand boundary as well. 1545 * 1546 * elm is the new separator. 1547 */ 1548 hammer_modify_node_noundo(cursor->trans, new_node); 1549 hammer_modify_node_all(cursor->trans, node); 1550 ondisk = node->ondisk; 1551 elm = &ondisk->elms[split]; 1552 bcopy(elm, &new_node->ondisk->elms[0], 1553 (ondisk->count - split + 1) * esize); /* +1 for boundary */ 1554 new_node->ondisk->count = ondisk->count - split; 1555 new_node->ondisk->parent = parent->node_offset; 1556 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1557 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1558 KKASSERT(ondisk->type == new_node->ondisk->type); 1559 hammer_cursor_split_node(node, new_node, split); 1560 1561 /* 1562 * Cleanup the original node. Elm (P) becomes the new boundary, 1563 * its subtree_offset was moved to the new node. If we had created 1564 * a new root its parent pointer may have changed. 1565 */ 1566 elm->base.btype = HAMMER_BTREE_TYPE_NONE; 1567 elm->internal.subtree_offset = 0; 1568 ondisk->count = split; 1569 1570 /* 1571 * Insert the separator into the parent, fixup the parent's 1572 * reference to the original node, and reference the new node. 1573 * The separator is P. 1574 * 1575 * Remember that ondisk->count does not include the right-hand boundary. 1576 */ 1577 hammer_modify_node_all(cursor->trans, parent); 1578 ondisk = parent->ondisk; 1579 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1580 parent_elm = &ondisk->elms[parent_index+1]; 1581 bcopy(parent_elm, parent_elm + 1, 1582 (ondisk->count - parent_index) * esize); 1583 1584 /* 1585 * Why not use hammer_make_separator() here ? 1586 */ 1587 parent_elm->internal.base = elm->base; /* separator P */ 1588 parent_elm->internal.base.btype = new_node->ondisk->type; 1589 parent_elm->internal.subtree_offset = new_node->node_offset; 1590 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1591 ++ondisk->count; 1592 hammer_modify_node_done(parent); 1593 hammer_cursor_inserted_element(parent, parent_index + 1); 1594 1595 /* 1596 * The children of new_node need their parent pointer set to new_node. 1597 * The children have already been locked by 1598 * hammer_btree_lock_children(). 1599 */ 1600 for (i = 0; i < new_node->ondisk->count; ++i) { 1601 elm = &new_node->ondisk->elms[i]; 1602 error = btree_set_parent_of_child(cursor->trans, new_node, elm); 1603 if (error) { 1604 hpanic("btree-fixup problem"); 1605 } 1606 } 1607 hammer_modify_node_done(new_node); 1608 1609 /* 1610 * The filesystem's root B-Tree pointer may have to be updated. 1611 */ 1612 if (made_root) { 1613 hammer_volume_t volume; 1614 1615 volume = hammer_get_root_volume(hmp, &error); 1616 KKASSERT(error == 0); 1617 1618 hammer_modify_volume_field(cursor->trans, volume, 1619 vol0_btree_root); 1620 volume->ondisk->vol0_btree_root = parent->node_offset; 1621 hammer_modify_volume_done(volume); 1622 node->ondisk->parent = parent->node_offset; 1623 if (cursor->parent) { 1624 hammer_unlock(&cursor->parent->lock); 1625 hammer_rel_node(cursor->parent); 1626 } 1627 cursor->parent = parent; /* lock'd and ref'd */ 1628 hammer_rel_volume(volume, 0); 1629 } 1630 hammer_modify_node_done(node); 1631 1632 /* 1633 * Ok, now adjust the cursor depending on which element the original 1634 * index was pointing at. If we are >= the split point the push node 1635 * is now in the new node. 1636 * 1637 * NOTE: If we are at the split point itself we cannot stay with the 1638 * original node because the push index will point at the right-hand 1639 * boundary, which is illegal. 1640 * 1641 * NOTE: The cursor's parent or parent_index must be adjusted for 1642 * the case where a new parent (new root) was created, and the case 1643 * where the cursor is now pointing at the split node. 1644 */ 1645 if (cursor->index >= split) { 1646 cursor->parent_index = parent_index + 1; 1647 cursor->index -= split; 1648 hammer_unlock(&cursor->node->lock); 1649 hammer_rel_node(cursor->node); 1650 cursor->node = new_node; /* locked and ref'd */ 1651 } else { 1652 cursor->parent_index = parent_index; 1653 hammer_unlock(&new_node->lock); 1654 hammer_rel_node(new_node); 1655 } 1656 1657 /* 1658 * Fixup left and right bounds 1659 */ 1660 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1661 cursor->left_bound = &parent_elm[0].internal.base; 1662 cursor->right_bound = &parent_elm[1].internal.base; 1663 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1664 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1665 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1666 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1667 1668 done: 1669 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL); 1670 hammer_cursor_downgrade(cursor); 1671 return (error); 1672 } 1673 1674 /* 1675 * Same as the above, but splits a full leaf node. 1676 */ 1677 static 1678 int 1679 btree_split_leaf(hammer_cursor_t cursor) 1680 { 1681 hammer_node_ondisk_t ondisk; 1682 hammer_node_t parent; 1683 hammer_node_t leaf; 1684 hammer_mount_t hmp; 1685 hammer_node_t new_leaf; 1686 hammer_btree_elm_t elm; 1687 hammer_btree_elm_t parent_elm; 1688 hammer_base_elm_t mid_boundary; 1689 int parent_index; 1690 int made_root; 1691 int split; 1692 int error; 1693 const size_t esize = sizeof(*elm); 1694 1695 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1696 return(error); 1697 ++hammer_stats_btree_splits; 1698 1699 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1700 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1701 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1702 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1703 1704 /* 1705 * Calculate the split point. If the insertion point is at the 1706 * end of the leaf we adjust the split point significantly to the 1707 * right to try to optimize node fill and flag it. If we hit 1708 * that same leaf again our heuristic failed and we don't try 1709 * to optimize node fill (it could lead to a degenerate case). 1710 */ 1711 leaf = cursor->node; 1712 ondisk = leaf->ondisk; 1713 KKASSERT(ondisk->count > 4); 1714 if (cursor->index == ondisk->count && 1715 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) { 1716 split = (ondisk->count + 1) * 3 / 4; 1717 leaf->flags |= HAMMER_NODE_NONLINEAR; 1718 } else { 1719 split = (ondisk->count + 1) / 2; 1720 } 1721 1722 #if 0 1723 /* 1724 * If the insertion point is at the split point shift the 1725 * split point left so we don't have to worry about 1726 */ 1727 if (cursor->index == split) 1728 --split; 1729 #endif 1730 KKASSERT(split > 0 && split < ondisk->count); 1731 1732 error = 0; 1733 hmp = leaf->hmp; 1734 1735 elm = &ondisk->elms[split]; 1736 1737 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1738 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1739 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1740 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1741 1742 /* 1743 * If we are at the root of the tree, create a new root node with 1744 * 1 element and split normally. Avoid making major modifications 1745 * until we know the whole operation will work. 1746 */ 1747 if (ondisk->parent == 0) { 1748 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1749 if (parent == NULL) 1750 goto done; 1751 hammer_lock_ex(&parent->lock); 1752 hammer_modify_node_noundo(cursor->trans, parent); 1753 ondisk = parent->ondisk; 1754 ondisk->count = 1; 1755 ondisk->parent = 0; 1756 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1757 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1758 ondisk->elms[0].base = hmp->root_btree_beg; 1759 ondisk->elms[0].base.btype = leaf->ondisk->type; 1760 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1761 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1762 ondisk->elms[1].base = hmp->root_btree_end; 1763 hammer_modify_node_done(parent); 1764 made_root = 1; 1765 parent_index = 0; /* insertion point in parent */ 1766 } else { 1767 made_root = 0; 1768 parent = cursor->parent; 1769 parent_index = cursor->parent_index; 1770 } 1771 1772 /* 1773 * Split leaf into new_leaf at the split point. Select a separator 1774 * value in-between the two leafs but with a bent towards the right 1775 * leaf since comparisons use an 'elm >= separator' inequality. 1776 * 1777 * L L L L L L L L 1778 * 1779 * x x P x x 1780 * s S S s 1781 * / \ 1782 * L L L L L L L L 1783 */ 1784 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error); 1785 if (new_leaf == NULL) { 1786 if (made_root) { 1787 hammer_unlock(&parent->lock); 1788 hammer_delete_node(cursor->trans, parent); 1789 hammer_rel_node(parent); 1790 } 1791 goto done; 1792 } 1793 hammer_lock_ex(&new_leaf->lock); 1794 1795 /* 1796 * Create the new node and copy the leaf elements from the split 1797 * point on to the new node. 1798 */ 1799 hammer_modify_node_all(cursor->trans, leaf); 1800 hammer_modify_node_noundo(cursor->trans, new_leaf); 1801 ondisk = leaf->ondisk; 1802 elm = &ondisk->elms[split]; 1803 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1804 new_leaf->ondisk->count = ondisk->count - split; 1805 new_leaf->ondisk->parent = parent->node_offset; 1806 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1807 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1808 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1809 hammer_modify_node_done(new_leaf); 1810 hammer_cursor_split_node(leaf, new_leaf, split); 1811 1812 /* 1813 * Cleanup the original node. Because this is a leaf node and 1814 * leaf nodes do not have a right-hand boundary, there 1815 * aren't any special edge cases to clean up. We just fixup the 1816 * count. 1817 */ 1818 ondisk->count = split; 1819 1820 /* 1821 * Insert the separator into the parent, fixup the parent's 1822 * reference to the original node, and reference the new node. 1823 * The separator is P. 1824 * 1825 * Remember that ondisk->count does not include the right-hand boundary. 1826 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1827 */ 1828 hammer_modify_node_all(cursor->trans, parent); 1829 ondisk = parent->ondisk; 1830 KKASSERT(split != 0); 1831 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1832 parent_elm = &ondisk->elms[parent_index+1]; 1833 bcopy(parent_elm, parent_elm + 1, 1834 (ondisk->count - parent_index) * esize); 1835 1836 /* 1837 * elm[-1] is the right-most elm in the original node. 1838 * elm[0] equals the left-most elm at index=0 in the new node. 1839 * parent_elm[-1] and parent_elm point to original and new node. 1840 * Update the parent_elm base to meet >elm[-1] and <=elm[0]. 1841 */ 1842 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1843 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1844 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1845 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1846 mid_boundary = &parent_elm->base; 1847 ++ondisk->count; 1848 hammer_modify_node_done(parent); 1849 hammer_cursor_inserted_element(parent, parent_index + 1); 1850 1851 /* 1852 * The filesystem's root B-Tree pointer may have to be updated. 1853 */ 1854 if (made_root) { 1855 hammer_volume_t volume; 1856 1857 volume = hammer_get_root_volume(hmp, &error); 1858 KKASSERT(error == 0); 1859 1860 hammer_modify_volume_field(cursor->trans, volume, 1861 vol0_btree_root); 1862 volume->ondisk->vol0_btree_root = parent->node_offset; 1863 hammer_modify_volume_done(volume); 1864 leaf->ondisk->parent = parent->node_offset; 1865 if (cursor->parent) { 1866 hammer_unlock(&cursor->parent->lock); 1867 hammer_rel_node(cursor->parent); 1868 } 1869 cursor->parent = parent; /* lock'd and ref'd */ 1870 hammer_rel_volume(volume, 0); 1871 } 1872 hammer_modify_node_done(leaf); 1873 1874 /* 1875 * Ok, now adjust the cursor depending on which element the original 1876 * index was pointing at. If we are >= the split point the push node 1877 * is now in the new node. 1878 * 1879 * NOTE: If we are at the split point itself we need to select the 1880 * old or new node based on where key_beg's insertion point will be. 1881 * If we pick the wrong side the inserted element will wind up in 1882 * the wrong leaf node and outside that node's bounds. 1883 */ 1884 if (cursor->index > split || 1885 (cursor->index == split && 1886 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1887 cursor->parent_index = parent_index + 1; 1888 cursor->index -= split; 1889 hammer_unlock(&cursor->node->lock); 1890 hammer_rel_node(cursor->node); 1891 cursor->node = new_leaf; 1892 } else { 1893 cursor->parent_index = parent_index; 1894 hammer_unlock(&new_leaf->lock); 1895 hammer_rel_node(new_leaf); 1896 } 1897 1898 /* 1899 * Fixup left and right bounds 1900 */ 1901 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1902 cursor->left_bound = &parent_elm[0].internal.base; 1903 cursor->right_bound = &parent_elm[1].internal.base; 1904 1905 /* 1906 * Assert that the bounds are correct. 1907 */ 1908 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1909 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1910 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1911 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1912 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1913 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 1914 1915 done: 1916 hammer_cursor_downgrade(cursor); 1917 return (error); 1918 } 1919 1920 #if 0 1921 1922 /* 1923 * Recursively correct the right-hand boundary's create_tid to (tid) as 1924 * long as the rest of the key matches. We have to recurse upward in 1925 * the tree as well as down the left side of each parent's right node. 1926 * 1927 * Return EDEADLK if we were only partially successful, forcing the caller 1928 * to try again. The original cursor is not modified. This routine can 1929 * also fail with EDEADLK if it is forced to throw away a portion of its 1930 * record history. 1931 * 1932 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 1933 */ 1934 struct hammer_rhb { 1935 TAILQ_ENTRY(hammer_rhb) entry; 1936 hammer_node_t node; 1937 int index; 1938 }; 1939 1940 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 1941 1942 int 1943 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 1944 { 1945 hammer_mount_t hmp; 1946 struct hammer_rhb_list rhb_list; 1947 hammer_base_elm_t elm; 1948 hammer_node_t orig_node; 1949 struct hammer_rhb *rhb; 1950 int orig_index; 1951 int error; 1952 1953 TAILQ_INIT(&rhb_list); 1954 hmp = cursor->trans->hmp; 1955 1956 /* 1957 * Save our position so we can restore it on return. This also 1958 * gives us a stable 'elm'. 1959 */ 1960 orig_node = cursor->node; 1961 hammer_ref_node(orig_node); 1962 hammer_lock_sh(&orig_node->lock); 1963 orig_index = cursor->index; 1964 elm = &orig_node->ondisk->elms[orig_index].base; 1965 1966 /* 1967 * Now build a list of parents going up, allocating a rhb 1968 * structure for each one. 1969 */ 1970 while (cursor->parent) { 1971 /* 1972 * Stop if we no longer have any right-bounds to fix up 1973 */ 1974 if (elm->obj_id != cursor->right_bound->obj_id || 1975 elm->rec_type != cursor->right_bound->rec_type || 1976 elm->key != cursor->right_bound->key) { 1977 break; 1978 } 1979 1980 /* 1981 * Stop if the right-hand bound's create_tid does not 1982 * need to be corrected. 1983 */ 1984 if (cursor->right_bound->create_tid >= tid) 1985 break; 1986 1987 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 1988 rhb->node = cursor->parent; 1989 rhb->index = cursor->parent_index; 1990 hammer_ref_node(rhb->node); 1991 hammer_lock_sh(&rhb->node->lock); 1992 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 1993 1994 hammer_cursor_up(cursor); 1995 } 1996 1997 /* 1998 * now safely adjust the right hand bound for each rhb. This may 1999 * also require taking the right side of the tree and iterating down 2000 * ITS left side. 2001 */ 2002 error = 0; 2003 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2004 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2005 if (error) 2006 break; 2007 TAILQ_REMOVE(&rhb_list, rhb, entry); 2008 hammer_unlock(&rhb->node->lock); 2009 hammer_rel_node(rhb->node); 2010 kfree(rhb, hmp->m_misc); 2011 2012 switch (cursor->node->ondisk->type) { 2013 case HAMMER_BTREE_TYPE_INTERNAL: 2014 /* 2015 * Right-boundary for parent at internal node 2016 * is one element to the right of the element whos 2017 * right boundary needs adjusting. We must then 2018 * traverse down the left side correcting any left 2019 * bounds (which may now be too far to the left). 2020 */ 2021 ++cursor->index; 2022 error = hammer_btree_correct_lhb(cursor, tid); 2023 break; 2024 default: 2025 hpanic("Bad node type"); 2026 error = EINVAL; 2027 break; 2028 } 2029 } 2030 2031 /* 2032 * Cleanup 2033 */ 2034 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2035 TAILQ_REMOVE(&rhb_list, rhb, entry); 2036 hammer_unlock(&rhb->node->lock); 2037 hammer_rel_node(rhb->node); 2038 kfree(rhb, hmp->m_misc); 2039 } 2040 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2041 hammer_unlock(&orig_node->lock); 2042 hammer_rel_node(orig_node); 2043 return (error); 2044 } 2045 2046 /* 2047 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2048 * bound going downward starting at the current cursor position. 2049 * 2050 * This function does not restore the cursor after use. 2051 */ 2052 int 2053 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2054 { 2055 struct hammer_rhb_list rhb_list; 2056 hammer_base_elm_t elm; 2057 hammer_base_elm_t cmp; 2058 struct hammer_rhb *rhb; 2059 hammer_mount_t hmp; 2060 int error; 2061 2062 TAILQ_INIT(&rhb_list); 2063 hmp = cursor->trans->hmp; 2064 2065 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2066 2067 /* 2068 * Record the node and traverse down the left-hand side for all 2069 * matching records needing a boundary correction. 2070 */ 2071 error = 0; 2072 for (;;) { 2073 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2074 rhb->node = cursor->node; 2075 rhb->index = cursor->index; 2076 hammer_ref_node(rhb->node); 2077 hammer_lock_sh(&rhb->node->lock); 2078 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2079 2080 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2081 /* 2082 * Nothing to traverse down if we are at the right 2083 * boundary of an internal node. 2084 */ 2085 if (cursor->index == cursor->node->ondisk->count) 2086 break; 2087 } else { 2088 elm = &cursor->node->ondisk->elms[cursor->index].base; 2089 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2090 break; 2091 hpanic("Illegal leaf record type %02x", elm->btype); 2092 } 2093 error = hammer_cursor_down(cursor); 2094 if (error) 2095 break; 2096 2097 elm = &cursor->node->ondisk->elms[cursor->index].base; 2098 if (elm->obj_id != cmp->obj_id || 2099 elm->rec_type != cmp->rec_type || 2100 elm->key != cmp->key) { 2101 break; 2102 } 2103 if (elm->create_tid >= tid) 2104 break; 2105 2106 } 2107 2108 /* 2109 * Now we can safely adjust the left-hand boundary from the bottom-up. 2110 * The last element we remove from the list is the caller's right hand 2111 * boundary, which must also be adjusted. 2112 */ 2113 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2114 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2115 if (error) 2116 break; 2117 TAILQ_REMOVE(&rhb_list, rhb, entry); 2118 hammer_unlock(&rhb->node->lock); 2119 hammer_rel_node(rhb->node); 2120 kfree(rhb, hmp->m_misc); 2121 2122 elm = &cursor->node->ondisk->elms[cursor->index].base; 2123 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2124 hammer_modify_node(cursor->trans, cursor->node, 2125 &elm->create_tid, 2126 sizeof(elm->create_tid)); 2127 elm->create_tid = tid; 2128 hammer_modify_node_done(cursor->node); 2129 } else { 2130 hpanic("Bad element type"); 2131 } 2132 } 2133 2134 /* 2135 * Cleanup 2136 */ 2137 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2138 TAILQ_REMOVE(&rhb_list, rhb, entry); 2139 hammer_unlock(&rhb->node->lock); 2140 hammer_rel_node(rhb->node); 2141 kfree(rhb, hmp->m_misc); 2142 } 2143 return (error); 2144 } 2145 2146 #endif 2147 2148 /* 2149 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2150 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2151 * the operation due to a deadlock, or some other error. 2152 * 2153 * This routine is initially called with an empty leaf and may be 2154 * recursively called with single-element internal nodes. 2155 * 2156 * It should also be noted that when removing empty leaves we must be sure 2157 * to test and update mirror_tid because another thread may have deadlocked 2158 * against us (or someone) trying to propagate it up and cannot retry once 2159 * the node has been deleted. 2160 * 2161 * On return the cursor may end up pointing to an internal node, suitable 2162 * for further iteration but not for an immediate insertion or deletion. 2163 */ 2164 static int 2165 btree_remove(hammer_cursor_t cursor, int *ndelete) 2166 { 2167 hammer_node_ondisk_t ondisk; 2168 hammer_btree_elm_t elm; 2169 hammer_node_t node; 2170 hammer_node_t parent; 2171 const int esize = sizeof(*elm); 2172 int error; 2173 2174 node = cursor->node; 2175 2176 /* 2177 * When deleting the root of the filesystem convert it to 2178 * an empty leaf node. Internal nodes cannot be empty. 2179 */ 2180 ondisk = node->ondisk; 2181 if (ondisk->parent == 0) { 2182 KKASSERT(cursor->parent == NULL); 2183 hammer_modify_node_all(cursor->trans, node); 2184 KKASSERT(ondisk == node->ondisk); 2185 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2186 ondisk->count = 0; 2187 hammer_modify_node_done(node); 2188 cursor->index = 0; 2189 return(0); 2190 } 2191 2192 parent = cursor->parent; 2193 2194 /* 2195 * Attempt to remove the parent's reference to the child. If the 2196 * parent would become empty we have to recurse. If we fail we 2197 * leave the parent pointing to an empty leaf node. 2198 * 2199 * We have to recurse successfully before we can delete the internal 2200 * node as it is illegal to have empty internal nodes. Even though 2201 * the operation may be aborted we must still fixup any unlocked 2202 * cursors as if we had deleted the element prior to recursing 2203 * (by calling hammer_cursor_deleted_element()) so those cursors 2204 * are properly forced up the chain by the recursion. 2205 */ 2206 if (parent->ondisk->count == 1) { 2207 /* 2208 * This special cursor_up_locked() call leaves the original 2209 * node exclusively locked and referenced, leaves the 2210 * original parent locked (as the new node), and locks the 2211 * new parent. It can return EDEADLK. 2212 * 2213 * We cannot call hammer_cursor_removed_node() until we are 2214 * actually able to remove the node. If we did then tracked 2215 * cursors in the middle of iterations could be repointed 2216 * to a parent node. If this occurs they could end up 2217 * scanning newly inserted records into the node (that could 2218 * not be deleted) when they push down again. 2219 * 2220 * Due to the way the recursion works the final parent is left 2221 * in cursor->parent after the recursion returns. Each 2222 * layer on the way back up is thus able to call 2223 * hammer_cursor_removed_node() and 'jump' the node up to 2224 * the (same) final parent. 2225 * 2226 * NOTE! The local variable 'parent' is invalid after we 2227 * call hammer_cursor_up_locked(). 2228 */ 2229 error = hammer_cursor_up_locked(cursor); 2230 parent = NULL; 2231 2232 if (error == 0) { 2233 hammer_cursor_deleted_element(cursor->node, 0); 2234 error = btree_remove(cursor, ndelete); 2235 if (error == 0) { 2236 KKASSERT(node != cursor->node); 2237 hammer_cursor_removed_node( 2238 node, cursor->node, cursor->index); 2239 hammer_modify_node_all(cursor->trans, node); 2240 ondisk = node->ondisk; 2241 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2242 ondisk->count = 0; 2243 hammer_modify_node_done(node); 2244 hammer_flush_node(node, 0); 2245 hammer_delete_node(cursor->trans, node); 2246 if (ndelete) 2247 (*ndelete)++; 2248 } else { 2249 /* 2250 * Defer parent removal because we could not 2251 * get the lock, just let the leaf remain 2252 * empty. 2253 */ 2254 /* 2255 * hammer show doesn't consider this as an error. 2256 */ 2257 } 2258 hammer_unlock(&node->lock); 2259 hammer_rel_node(node); 2260 } else { 2261 /* 2262 * Defer parent removal because we could not 2263 * get the lock, just let the leaf remain 2264 * empty. 2265 */ 2266 /* 2267 * hammer show doesn't consider this as an error. 2268 */ 2269 } 2270 } else { 2271 KKASSERT(parent->ondisk->count > 1); 2272 2273 hammer_modify_node_all(cursor->trans, parent); 2274 ondisk = parent->ondisk; 2275 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2276 2277 elm = &ondisk->elms[cursor->parent_index]; 2278 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2279 KKASSERT(ondisk->count > 0); 2280 2281 /* 2282 * We must retain the highest mirror_tid. The deleted 2283 * range is now encompassed by the element to the left. 2284 * If we are already at the left edge the new left edge 2285 * inherits mirror_tid. 2286 * 2287 * Note that bounds of the parent to our parent may create 2288 * a gap to the left of our left-most node or to the right 2289 * of our right-most node. The gap is silently included 2290 * in the mirror_tid's area of effect from the point of view 2291 * of the scan. 2292 */ 2293 if (cursor->parent_index) { 2294 if (elm[-1].internal.mirror_tid < 2295 elm[0].internal.mirror_tid) { 2296 elm[-1].internal.mirror_tid = 2297 elm[0].internal.mirror_tid; 2298 } 2299 } else { 2300 if (elm[1].internal.mirror_tid < 2301 elm[0].internal.mirror_tid) { 2302 elm[1].internal.mirror_tid = 2303 elm[0].internal.mirror_tid; 2304 } 2305 } 2306 2307 /* 2308 * Delete the subtree reference in the parent. Include 2309 * boundary element at end. 2310 */ 2311 bcopy(&elm[1], &elm[0], 2312 (ondisk->count - cursor->parent_index) * esize); 2313 --ondisk->count; 2314 hammer_modify_node_done(parent); 2315 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2316 hammer_cursor_deleted_element(parent, cursor->parent_index); 2317 hammer_flush_node(node, 0); 2318 hammer_delete_node(cursor->trans, node); 2319 2320 /* 2321 * cursor->node is invalid, cursor up to make the cursor 2322 * valid again. We have to flag the condition in case 2323 * another thread wiggles an insertion in during an 2324 * iteration. 2325 */ 2326 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 2327 error = hammer_cursor_up(cursor); 2328 if (ndelete) 2329 (*ndelete)++; 2330 } 2331 return (error); 2332 } 2333 2334 /* 2335 * Propagate mirror_tid up the B-Tree starting at the current cursor. 2336 * 2337 * WARNING! Because we push and pop the passed cursor, it may be 2338 * modified by other B-Tree operations while it is unlocked 2339 * and things like the node & leaf pointers, and indexes might 2340 * change. 2341 */ 2342 void 2343 hammer_btree_do_propagation(hammer_cursor_t cursor, 2344 hammer_btree_leaf_elm_t leaf) 2345 { 2346 hammer_cursor_t ncursor; 2347 hammer_tid_t mirror_tid; 2348 int error __debugvar; 2349 2350 /* 2351 * We do not propagate a mirror_tid if the filesystem was mounted 2352 * in no-mirror mode. 2353 */ 2354 if (cursor->trans->hmp->master_id < 0) 2355 return; 2356 2357 /* 2358 * This is a bit of a hack because we cannot deadlock or return 2359 * EDEADLK here. The related operation has already completed and 2360 * we must propagate the mirror_tid now regardless. 2361 * 2362 * Generate a new cursor which inherits the original's locks and 2363 * unlock the original. Use the new cursor to propagate the 2364 * mirror_tid. Then clean up the new cursor and reacquire locks 2365 * on the original. 2366 * 2367 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2368 * original's locks and the original is tracked and must be 2369 * re-locked. 2370 */ 2371 mirror_tid = cursor->node->ondisk->mirror_tid; 2372 KKASSERT(mirror_tid != 0); 2373 ncursor = hammer_push_cursor(cursor); 2374 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2375 KKASSERT(error == 0); 2376 hammer_pop_cursor(cursor, ncursor); 2377 /* WARNING: cursor's leaf pointer may change after pop */ 2378 } 2379 2380 2381 /* 2382 * Propagate a mirror TID update upwards through the B-Tree to the root. 2383 * 2384 * A locked internal node must be passed in. The node will remain locked 2385 * on return. 2386 * 2387 * This function syncs mirror_tid at the specified internal node's element, 2388 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2389 */ 2390 static int 2391 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2392 { 2393 hammer_btree_internal_elm_t elm; 2394 hammer_node_t node; 2395 int error; 2396 2397 for (;;) { 2398 error = hammer_cursor_up(cursor); 2399 if (error == 0) 2400 error = hammer_cursor_upgrade(cursor); 2401 2402 /* 2403 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2404 * cursor will still be properly positioned for 2405 * mirror propagation, just not for iterations. 2406 */ 2407 while (error == EDEADLK) { 2408 hammer_recover_cursor(cursor); 2409 error = hammer_cursor_upgrade(cursor); 2410 } 2411 if (error) 2412 break; 2413 2414 /* 2415 * If the cursor deadlocked it could end up at a leaf 2416 * after we lost the lock. 2417 */ 2418 node = cursor->node; 2419 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2420 continue; 2421 2422 /* 2423 * Adjust the node's element 2424 */ 2425 elm = &node->ondisk->elms[cursor->index].internal; 2426 if (elm->mirror_tid >= mirror_tid) 2427 break; 2428 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2429 sizeof(elm->mirror_tid)); 2430 elm->mirror_tid = mirror_tid; 2431 hammer_modify_node_done(node); 2432 if (hammer_debug_general & 0x0002) { 2433 hdkprintf("propagate %016jx @%016jx:%d\n", 2434 (intmax_t)mirror_tid, 2435 (intmax_t)node->node_offset, 2436 cursor->index); 2437 } 2438 2439 2440 /* 2441 * Adjust the node's mirror_tid aggregator 2442 */ 2443 if (node->ondisk->mirror_tid >= mirror_tid) 2444 return(0); 2445 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2446 node->ondisk->mirror_tid = mirror_tid; 2447 hammer_modify_node_done(node); 2448 if (hammer_debug_general & 0x0002) { 2449 hdkprintf("propagate %016jx @%016jx\n", 2450 (intmax_t)mirror_tid, 2451 (intmax_t)node->node_offset); 2452 } 2453 } 2454 if (error == ENOENT) 2455 error = 0; 2456 return(error); 2457 } 2458 2459 /* 2460 * Return a pointer to node's parent. If there is no error, 2461 * *parent_index is set to an index of parent's elm that points 2462 * to this node. 2463 */ 2464 hammer_node_t 2465 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2466 int *parent_indexp, int *errorp, int try_exclusive) 2467 { 2468 hammer_node_t parent; 2469 hammer_btree_elm_t elm; 2470 int i; 2471 2472 /* 2473 * Get the node 2474 */ 2475 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2476 if (*errorp) { 2477 KKASSERT(parent == NULL); 2478 return(NULL); 2479 } 2480 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2481 2482 /* 2483 * Lock the node 2484 */ 2485 if (try_exclusive) { 2486 if (hammer_lock_ex_try(&parent->lock)) { 2487 hammer_rel_node(parent); 2488 *errorp = EDEADLK; 2489 return(NULL); 2490 } 2491 } else { 2492 hammer_lock_sh(&parent->lock); 2493 } 2494 2495 /* 2496 * Figure out which element in the parent is pointing to the 2497 * child. 2498 */ 2499 if (node->ondisk->count) { 2500 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2501 parent->ondisk); 2502 } else { 2503 i = 0; 2504 } 2505 while (i < parent->ondisk->count) { 2506 elm = &parent->ondisk->elms[i]; 2507 if (elm->internal.subtree_offset == node->node_offset) 2508 break; 2509 ++i; 2510 } 2511 if (i == parent->ondisk->count) { 2512 hammer_unlock(&parent->lock); 2513 hpanic("Bad B-Tree link: parent %p node %p", parent, node); 2514 } 2515 *parent_indexp = i; 2516 KKASSERT(*errorp == 0); 2517 return(parent); 2518 } 2519 2520 /* 2521 * The element (elm) has been moved to a new internal node (node). 2522 * 2523 * If the element represents a pointer to an internal node that node's 2524 * parent must be adjusted to the element's new location. 2525 * 2526 * XXX deadlock potential here with our exclusive locks 2527 */ 2528 int 2529 btree_set_parent_of_child(hammer_transaction_t trans, hammer_node_t node, 2530 hammer_btree_elm_t elm) 2531 { 2532 hammer_node_t child; 2533 int error; 2534 2535 error = 0; 2536 2537 if (hammer_is_internal_node_elm(elm)) { 2538 child = hammer_get_node(trans, elm->internal.subtree_offset, 2539 0, &error); 2540 if (error == 0) { 2541 hammer_modify_node_field(trans, child, parent); 2542 child->ondisk->parent = node->node_offset; 2543 hammer_modify_node_done(child); 2544 hammer_rel_node(child); 2545 } 2546 } 2547 return(error); 2548 } 2549 2550 /* 2551 * Initialize the root of a recursive B-Tree node lock list structure. 2552 */ 2553 void 2554 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2555 { 2556 TAILQ_INIT(&parent->list); 2557 parent->parent = NULL; 2558 parent->node = node; 2559 parent->index = -1; 2560 parent->count = node->ondisk->count; 2561 parent->copy = NULL; 2562 parent->flags = 0; 2563 } 2564 2565 /* 2566 * Initialize a cache of hammer_node_lock's including space allocated 2567 * for node copies. 2568 * 2569 * This is used by the rebalancing code to preallocate the copy space 2570 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2571 * locks, otherwise we can blow out the pageout daemon's emergency 2572 * reserve and deadlock it. 2573 * 2574 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2575 * The flag is set when the item is pulled off the cache for use. 2576 */ 2577 void 2578 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2579 int depth) 2580 { 2581 hammer_node_lock_t item; 2582 int count; 2583 2584 for (count = 1; depth; --depth) 2585 count *= HAMMER_BTREE_LEAF_ELMS; 2586 bzero(lcache, sizeof(*lcache)); 2587 TAILQ_INIT(&lcache->list); 2588 while (count) { 2589 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2590 item->copy = kmalloc(sizeof(*item->copy), 2591 hmp->m_misc, M_WAITOK); 2592 TAILQ_INIT(&item->list); 2593 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2594 --count; 2595 } 2596 } 2597 2598 void 2599 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2600 { 2601 hammer_node_lock_t item; 2602 2603 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2604 TAILQ_REMOVE(&lcache->list, item, entry); 2605 KKASSERT(item->copy); 2606 KKASSERT(TAILQ_EMPTY(&item->list)); 2607 kfree(item->copy, hmp->m_misc); 2608 kfree(item, hmp->m_misc); 2609 } 2610 KKASSERT(lcache->copy == NULL); 2611 } 2612 2613 /* 2614 * Exclusively lock all the children of node. This is used by the split 2615 * code to prevent anyone from accessing the children of a cursor node 2616 * while we fix-up its parent offset. 2617 * 2618 * If we don't lock the children we can really mess up cursors which block 2619 * trying to cursor-up into our node. 2620 * 2621 * On failure EDEADLK (or some other error) is returned. If a deadlock 2622 * error is returned the cursor is adjusted to block on termination. 2623 * 2624 * The caller is responsible for managing parent->node, the root's node 2625 * is usually aliased from a cursor. 2626 */ 2627 int 2628 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2629 hammer_node_lock_t parent, 2630 hammer_node_lock_t lcache) 2631 { 2632 hammer_node_t node; 2633 hammer_node_lock_t item; 2634 hammer_node_ondisk_t ondisk; 2635 hammer_btree_elm_t elm; 2636 hammer_node_t child; 2637 hammer_mount_t hmp; 2638 int error; 2639 int i; 2640 2641 node = parent->node; 2642 ondisk = node->ondisk; 2643 error = 0; 2644 hmp = cursor->trans->hmp; 2645 2646 if (ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2647 return(0); /* This could return non-zero */ 2648 2649 /* 2650 * We really do not want to block on I/O with exclusive locks held, 2651 * pre-get the children before trying to lock the mess. This is 2652 * only done one-level deep for now. 2653 */ 2654 for (i = 0; i < ondisk->count; ++i) { 2655 ++hammer_stats_btree_elements; 2656 elm = &ondisk->elms[i]; 2657 child = hammer_get_node(cursor->trans, 2658 elm->internal.subtree_offset, 2659 0, &error); 2660 if (child) 2661 hammer_rel_node(child); 2662 } 2663 2664 /* 2665 * Do it for real 2666 */ 2667 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2668 ++hammer_stats_btree_elements; 2669 elm = &ondisk->elms[i]; 2670 2671 KKASSERT(elm->internal.subtree_offset != 0); 2672 child = hammer_get_node(cursor->trans, 2673 elm->internal.subtree_offset, 2674 0, &error); 2675 if (child) { 2676 if (hammer_lock_ex_try(&child->lock) != 0) { 2677 if (cursor->deadlk_node == NULL) { 2678 cursor->deadlk_node = child; 2679 hammer_ref_node(cursor->deadlk_node); 2680 } 2681 error = EDEADLK; 2682 hammer_rel_node(child); 2683 } else { 2684 if (lcache) { 2685 item = TAILQ_FIRST(&lcache->list); 2686 KKASSERT(item != NULL); 2687 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2688 TAILQ_REMOVE(&lcache->list, item, entry); 2689 } else { 2690 item = kmalloc(sizeof(*item), 2691 hmp->m_misc, 2692 M_WAITOK|M_ZERO); 2693 TAILQ_INIT(&item->list); 2694 } 2695 2696 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2697 item->parent = parent; 2698 item->node = child; 2699 item->index = i; 2700 item->count = child->ondisk->count; 2701 2702 /* 2703 * Recurse (used by the rebalancing code) 2704 */ 2705 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2706 error = hammer_btree_lock_children( 2707 cursor, 2708 depth - 1, 2709 item, 2710 lcache); 2711 } 2712 } 2713 } 2714 } 2715 if (error) 2716 hammer_btree_unlock_children(hmp, parent, lcache); 2717 return(error); 2718 } 2719 2720 /* 2721 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2722 * including the parent. 2723 */ 2724 void 2725 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2726 { 2727 hammer_mount_t hmp = cursor->trans->hmp; 2728 hammer_node_lock_t item; 2729 2730 if (parent->copy == NULL) { 2731 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2732 parent->copy = kmalloc(sizeof(*parent->copy), 2733 hmp->m_misc, M_WAITOK); 2734 } 2735 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2736 *parent->copy = *parent->node->ondisk; 2737 TAILQ_FOREACH(item, &parent->list, entry) { 2738 hammer_btree_lock_copy(cursor, item); 2739 } 2740 } 2741 2742 /* 2743 * Recursively sync modified copies to the media. 2744 */ 2745 int 2746 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2747 { 2748 hammer_node_lock_t item; 2749 int count = 0; 2750 2751 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2752 ++count; 2753 hammer_modify_node_all(cursor->trans, parent->node); 2754 *parent->node->ondisk = *parent->copy; 2755 hammer_modify_node_done(parent->node); 2756 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2757 hammer_flush_node(parent->node, 0); 2758 hammer_delete_node(cursor->trans, parent->node); 2759 } 2760 } 2761 TAILQ_FOREACH(item, &parent->list, entry) { 2762 count += hammer_btree_sync_copy(cursor, item); 2763 } 2764 return(count); 2765 } 2766 2767 /* 2768 * Release previously obtained node locks. The caller is responsible for 2769 * cleaning up parent->node itself (its usually just aliased from a cursor), 2770 * but this function will take care of the copies. 2771 * 2772 * NOTE: The root node is not placed in the lcache and node->copy is not 2773 * deallocated when lcache != NULL. 2774 */ 2775 void 2776 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent, 2777 hammer_node_lock_t lcache) 2778 { 2779 hammer_node_lock_t item; 2780 hammer_node_ondisk_t copy; 2781 2782 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2783 TAILQ_REMOVE(&parent->list, item, entry); 2784 hammer_btree_unlock_children(hmp, item, lcache); 2785 hammer_unlock(&item->node->lock); 2786 hammer_rel_node(item->node); 2787 if (lcache) { 2788 /* 2789 * NOTE: When placing the item back in the lcache 2790 * the flag is cleared by the bzero(). 2791 * Remaining fields are cleared as a safety 2792 * measure. 2793 */ 2794 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE); 2795 KKASSERT(TAILQ_EMPTY(&item->list)); 2796 copy = item->copy; 2797 bzero(item, sizeof(*item)); 2798 TAILQ_INIT(&item->list); 2799 item->copy = copy; 2800 if (copy) 2801 bzero(copy, sizeof(*copy)); 2802 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2803 } else { 2804 kfree(item, hmp->m_misc); 2805 } 2806 } 2807 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) { 2808 kfree(parent->copy, hmp->m_misc); 2809 parent->copy = NULL; /* safety */ 2810 } 2811 } 2812 2813 /************************************************************************ 2814 * MISCELLANIOUS SUPPORT * 2815 ************************************************************************/ 2816 2817 /* 2818 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2819 * 2820 * Note that for this particular function a return value of -1, 0, or +1 2821 * can denote a match if create_tid is otherwise discounted. A create_tid 2822 * of zero is considered to be 'infinity' in comparisons. 2823 * 2824 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2825 */ 2826 int 2827 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2828 { 2829 if (key1->localization < key2->localization) 2830 return(-5); 2831 if (key1->localization > key2->localization) 2832 return(5); 2833 2834 if (key1->obj_id < key2->obj_id) 2835 return(-4); 2836 if (key1->obj_id > key2->obj_id) 2837 return(4); 2838 2839 if (key1->rec_type < key2->rec_type) 2840 return(-3); 2841 if (key1->rec_type > key2->rec_type) 2842 return(3); 2843 2844 if (key1->key < key2->key) 2845 return(-2); 2846 if (key1->key > key2->key) 2847 return(2); 2848 2849 /* 2850 * A create_tid of zero indicates a record which is undeletable 2851 * and must be considered to have a value of positive infinity. 2852 */ 2853 if (key1->create_tid == 0) { 2854 if (key2->create_tid == 0) 2855 return(0); 2856 return(1); 2857 } 2858 if (key2->create_tid == 0) 2859 return(-1); 2860 if (key1->create_tid < key2->create_tid) 2861 return(-1); 2862 if (key1->create_tid > key2->create_tid) 2863 return(1); 2864 return(0); 2865 } 2866 2867 /* 2868 * Test a timestamp against an element to determine whether the 2869 * element is visible. A timestamp of 0 means 'infinity'. 2870 */ 2871 int 2872 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2873 { 2874 if (asof == 0) { 2875 if (base->delete_tid) 2876 return(1); 2877 return(0); 2878 } 2879 if (asof < base->create_tid) 2880 return(-1); 2881 if (base->delete_tid && asof >= base->delete_tid) 2882 return(1); 2883 return(0); 2884 } 2885 2886 /* 2887 * Create a separator half way inbetween key1 and key2. For fields just 2888 * one unit apart, the separator will match key2. key1 is on the left-hand 2889 * side and key2 is on the right-hand side. 2890 * 2891 * key2 must be >= the separator. It is ok for the separator to match key2. 2892 * 2893 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2894 * key2. 2895 * 2896 * NOTE: It might be beneficial to just scrap this whole mess and just 2897 * set the separator to key2. 2898 */ 2899 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 2900 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 2901 2902 static void 2903 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 2904 hammer_base_elm_t dest) 2905 { 2906 bzero(dest, sizeof(*dest)); 2907 2908 dest->rec_type = key2->rec_type; 2909 dest->key = key2->key; 2910 dest->obj_id = key2->obj_id; 2911 dest->create_tid = key2->create_tid; 2912 2913 MAKE_SEPARATOR(key1, key2, dest, localization); 2914 if (key1->localization == key2->localization) { 2915 MAKE_SEPARATOR(key1, key2, dest, obj_id); 2916 if (key1->obj_id == key2->obj_id) { 2917 MAKE_SEPARATOR(key1, key2, dest, rec_type); 2918 if (key1->rec_type == key2->rec_type) { 2919 MAKE_SEPARATOR(key1, key2, dest, key); 2920 /* 2921 * Don't bother creating a separator for 2922 * create_tid, which also conveniently avoids 2923 * having to handle the create_tid == 0 2924 * (infinity) case. Just leave create_tid 2925 * set to key2. 2926 * 2927 * Worst case, dest matches key2 exactly, 2928 * which is acceptable. 2929 */ 2930 } 2931 } 2932 } 2933 } 2934 2935 #undef MAKE_SEPARATOR 2936 2937 /* 2938 * Return whether a generic internal or leaf node is full 2939 */ 2940 static __inline 2941 int 2942 btree_node_is_full(hammer_node_ondisk_t node) 2943 { 2944 int n; 2945 2946 n = hammer_node_max_elements(node->type); 2947 if (n == -1) 2948 hpanic("bad type %d", node->type); 2949 2950 return(n == node->count); 2951 } 2952 2953 void 2954 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 2955 { 2956 int i, n; 2957 2958 kprintf("node %p count=%d parent=%016jx type=%c\n", 2959 ondisk, ondisk->count, 2960 (intmax_t)ondisk->parent, ondisk->type); 2961 2962 switch (ondisk->type) { 2963 case HAMMER_BTREE_TYPE_INTERNAL: 2964 n = ondisk->count + 1; /* count is NOT boundary inclusive */ 2965 break; 2966 case HAMMER_BTREE_TYPE_LEAF: 2967 n = ondisk->count; /* there is no boundary */ 2968 break; 2969 default: 2970 return; /* nothing to do */ 2971 } 2972 2973 /* 2974 * Dump elements including boundary. 2975 */ 2976 for (i = 0; i < n; ++i) { 2977 kprintf(" %2d", i); 2978 hammer_print_btree_elm(&ondisk->elms[i]); 2979 } 2980 } 2981 2982 void 2983 hammer_print_btree_elm(hammer_btree_elm_t elm) 2984 { 2985 kprintf("\tobj_id = %016jx\n", (intmax_t)elm->base.obj_id); 2986 kprintf("\tkey = %016jx\n", (intmax_t)elm->base.key); 2987 kprintf("\tcreate_tid = %016jx\n", (intmax_t)elm->base.create_tid); 2988 kprintf("\tdelete_tid = %016jx\n", (intmax_t)elm->base.delete_tid); 2989 kprintf("\trec_type = %04x\n", elm->base.rec_type); 2990 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 2991 kprintf("\tbtype = %02x (%c)\n", elm->base.btype, 2992 hammer_elm_btype(elm)); 2993 kprintf("\tlocalization = %08x\n", elm->base.localization); 2994 2995 if (hammer_is_internal_node_elm(elm)) { 2996 kprintf("\tsubtree_off = %016jx\n", 2997 (intmax_t)elm->internal.subtree_offset); 2998 } else if (hammer_is_leaf_node_elm(elm)) { 2999 kprintf("\tdata_offset = %016jx\n", 3000 (intmax_t)elm->leaf.data_offset); 3001 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3002 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3003 } 3004 } 3005 3006 static __inline 3007 void 3008 hammer_debug_btree_elm(hammer_cursor_t cursor, hammer_btree_elm_t elm, 3009 const char *s, int res) 3010 { 3011 hkprintf("%-8s %016jx[%02d] %c " 3012 "lo=%08x obj=%016jx rec=%02x key=%016jx tid=%016jx td=%p " 3013 "r=%d\n", 3014 s, 3015 (intmax_t)cursor->node->node_offset, 3016 cursor->index, 3017 hammer_elm_btype(elm), 3018 elm->base.localization, 3019 (intmax_t)elm->base.obj_id, 3020 elm->base.rec_type, 3021 (intmax_t)elm->base.key, 3022 (intmax_t)elm->base.create_tid, 3023 curthread, 3024 res); 3025 } 3026 3027 static __inline 3028 void 3029 hammer_debug_btree_parent(hammer_cursor_t cursor, const char *s) 3030 { 3031 hammer_btree_elm_t elm = 3032 &cursor->parent->ondisk->elms[cursor->parent_index]; 3033 3034 hkprintf("%-8s %016jx[%d] %c " 3035 "(%016jx/%016jx %016jx/%016jx) (%p/%p %p/%p)\n", 3036 s, 3037 (intmax_t)cursor->parent->node_offset, 3038 cursor->parent_index, 3039 hammer_elm_btype(elm), 3040 (intmax_t)cursor->left_bound->obj_id, 3041 (intmax_t)elm->internal.base.obj_id, 3042 (intmax_t)cursor->right_bound->obj_id, 3043 (intmax_t)(elm + 1)->internal.base.obj_id, 3044 cursor->left_bound, 3045 elm, 3046 cursor->right_bound, 3047 elm + 1); 3048 } 3049