1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. 
Basically a HAMMER B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leaves
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * with no valid sub-tree pointer for the right-most element.
 *
 * A B-Tree internal node looks like this:
 *
 *	B N N N N N N B   <-- boundary and internal elements
 *	S S S S S S S     <-- subtree pointers
 *
 * A B-Tree leaf node basically looks like this:
 *
 *	L L L L L L L L   <-- leaf elements
 *
 * The radix for an internal node is 1 less than a leaf but we get a
 * number of significant benefits for our troubles.
 * The left-hand boundary (B in the left) is integrated into the first
 * element so it doesn't require 2 elements to accommodate boundaries.
 *
 * The big benefit to using a B-Tree containing boundary information
 * is that it is possible to cache pointers into the middle of the tree
 * and not have to start searches, insertions, OR deletions at the root
 * node.  In particular, searches are able to progress in a definitive
 * direction from any point in the tree without revisiting nodes.  This
 * greatly improves the efficiency of many operations, most especially
 * record appends.
 *
 * B-Trees also make the stacking of trees fairly straightforward.
 *
 * INSERTIONS:  A search performed with the intention of doing
 * an insert will guarantee that the terminal leaf node is not full by
 * splitting full nodes.  Splits occur top-down during the dive down the
 * B-Tree.
 *
 * DELETIONS: A deletion makes no attempt to proactively balance the
 * tree and will recursively remove nodes that become empty.  If a
 * deadlock occurs a deletion may not be able to remove an empty leaf.
 * Deletions never allow internal nodes to become empty (that would blow
 * up the boundaries).
 */
#include "hammer.h"
#include <sys/buf.h>
#include <sys/buf2.h>

static int btree_search(hammer_cursor_t cursor, int flags);
static int btree_split_internal(hammer_cursor_t cursor);
static int btree_split_leaf(hammer_cursor_t cursor);
static int btree_remove(hammer_cursor_t cursor, int *ndelete);
static __inline int btree_node_is_full(hammer_node_ondisk_t node);
static __inline int btree_max_elements(u_int8_t type);
static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,
			hammer_tid_t mirror_tid);
static void hammer_make_separator(hammer_base_elm_t key1,
			hammer_base_elm_t key2, hammer_base_elm_t dest);
static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);

/*
 * Iterate records after a search.  The cursor is iterated forwards past
 * the current record until a record matching the key-range requirements
 * is found.  ENOENT is returned if the iteration goes past the ending
 * key.
 *
 * The iteration is inclusive of key_beg and can be inclusive or exclusive
 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
 *
 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
 * may be modified by B-Tree functions.
 *
 * cursor->key_beg may or may not be modified by this function during
 * the iteration.  XXX future - in case of an inverted lock we may have
 * to reinitiate the lookup and set key_beg to properly pick up where we
 * left off.
 *
 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor
 * was reverse indexed due to being moved to a parent while unlocked,
 * and something else might have inserted an element outside the iteration
 * range.  When this case occurs the iterator just keeps iterating until
 * it gets back into the iteration range (instead of asserting).
 *
 * Returns 0 with the cursor positioned on a matching element, or ENOENT
 * when the iteration runs past the end of the requested key range.
 *
 * NOTE! EDEADLK *CANNOT* be returned by this procedure.
 */
int
hammer_btree_iterate(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error = 0;
	int r;
	int s;

	/*
	 * Skip past the current record.  A NULL ondisk pointer indicates
	 * a degenerate (e.g. corrupted) cursor.
	 */
	hmp = cursor->trans->hmp;
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index < node->count &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		++cursor->index;
	}

	/*
	 * HAMMER can wind up being cpu-bound.
	 */
	if (++hmp->check_yield > hammer_yield_check) {
		hmp->check_yield = 0;
		lwkt_user_yield();
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 *
		 * If we are at the root of the filesystem, cursor_up
		 * returns ENOENT.
		 *
		 * XXX this could be optimized by storing the information in
		 * the parent reference.
		 *
		 * XXX we can lose the node lock temporarily, this could mess
		 * up our scan.
		 */
		++hammer_stats_btree_iterations;
		hammer_flusher_clean_loose_ios(hmp);

		if (cursor->index == node->count) {
			if (hammer_debug_btree) {
				kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)(cursor->parent ? cursor->parent->node_offset : -1),
					cursor->parent_index,
					curthread);
			}
			/* the parent element must still point back at us */
			KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
			error = hammer_cursor_up(cursor);
			if (error)
				break;
			/* reload stale pointer */
			node = cursor->node->ondisk;
			KKASSERT(cursor->index != node->count);

			/*
			 * If we are reblocking we want to return internal
			 * nodes.  Note that the internal node will be
			 * returned multiple times, on each upward recursion
			 * from its children.  The caller selects which
			 * revisit it cares about (usually first or last only).
			 */
			if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
				cursor->flags |= HAMMER_CURSOR_ATEDISK;
				return(0);
			}
			++cursor->index;
			continue;
		}

		/*
		 * Check internal or leaf element.  Determine if the record
		 * at the cursor has gone beyond the end of our range.
		 *
		 * We recurse down through internal nodes.
		 */
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			elm = &node->elms[cursor->index];

			/* r: key_end vs left bracket, s: key_beg vs right bracket */
			r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
			s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
			if (hammer_debug_btree) {
				kprintf("BRACKETL %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)elm[0].internal.base.obj_id,
					elm[0].internal.base.rec_type,
					(long long)elm[0].internal.base.key,
					elm[0].internal.base.localization,
					r,
					curthread
				);
				kprintf("BRACKETR %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index + 1,
					(long long)elm[1].internal.base.obj_id,
					elm[1].internal.base.rec_type,
					(long long)elm[1].internal.base.key,
					elm[1].internal.base.localization,
					s
				);
			}

			if (r < 0) {
				error = ENOENT;
				break;
			}
			if (r == 0 && (cursor->flags &
				       HAMMER_CURSOR_END_INCLUSIVE) == 0) {
				error = ENOENT;
				break;
			}

			/*
			 * Better not be zero
			 */
			KKASSERT(elm->internal.subtree_offset != 0);

			if (s <= 0) {
				/*
				 * If running the mirror filter see if we
				 * can skip one or more entire sub-trees.
				 * If we can we return the internal node
				 * and the caller processes the skipped
				 * range (see mirror_read).
				 */
				if (cursor->flags &
				    HAMMER_CURSOR_MIRROR_FILTERED) {
					if (elm->internal.mirror_tid <
					    cursor->cmirror->mirror_tid) {
						hammer_cursor_mirror_filter(cursor);
						return(0);
					}
				}
			} else {
				/*
				 * Normally it would be impossible for the
				 * cursor to have gotten back-indexed,
				 * but it can happen if a node is deleted
				 * and the cursor is moved to its parent
				 * internal node.  ITERATE_CHECK will be set.
				 */
				KKASSERT(cursor->flags &
					 HAMMER_CURSOR_ITERATE_CHECK);
				kprintf("hammer_btree_iterate: "
					"DEBUG: Caught parent seek "
					"in internal iteration\n");
			}

			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			/* reload stale pointer */
			node = cursor->node->ondisk;
			continue;
		} else {
			elm = &node->elms[cursor->index];
			r = hammer_btree_cmp(&cursor->key_end, &elm->base);
			if (hammer_debug_btree) {
				kprintf("ELEMENT  %016llx:%d %c %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(elm[0].leaf.base.btype ?
					 elm[0].leaf.base.btype : '?'),
					(long long)elm[0].leaf.base.obj_id,
					elm[0].leaf.base.rec_type,
					(long long)elm[0].leaf.base.key,
					elm[0].leaf.base.localization,
					r
				);
			}
			if (r < 0) {
				error = ENOENT;
				break;
			}

			/*
			 * We support both end-inclusive and
			 * end-exclusive searches.
			 */
			if (r == 0 &&
			    (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
				error = ENOENT;
				break;
			}

			/*
			 * If ITERATE_CHECK is set an unlocked cursor may
			 * have been moved to a parent and the iterate can
			 * happen upon elements that are not in the requested
			 * range.
			 */
			if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) {
				s = hammer_btree_cmp(&cursor->key_beg,
						     &elm->base);
				if (s > 0) {
					kprintf("hammer_btree_iterate: "
						"DEBUG: Caught parent seek "
						"in leaf iteration\n");
					++cursor->index;
					continue;
				}
			}
			cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

			/*
			 * Return the element.  As-of records which are not
			 * visible at cursor->asof are skipped.
			 */
			switch(elm->leaf.base.btype) {
			case HAMMER_BTREE_TYPE_RECORD:
				if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
				    hammer_btree_chkts(cursor->asof, &elm->base)) {
					++cursor->index;
					continue;
				}
				error = 0;
				break;
			default:
				error = EINVAL;
				break;
			}
			if (error)
				break;
		}

		/*
		 * Return entry
		 */
		if (hammer_debug_btree) {
			int i = cursor->index;
			hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
			kprintf("ITERATE  %p:%d %016llx %02x "
				"key=%016llx lo=%02x\n",
				cursor->node, i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				elm->internal.base.localization
			);
		}
		return(0);
	}
	return(error);
}

/*
 * We hit an internal element that we could skip as part of a mirroring
 * scan.  Calculate the entire range being skipped.
 *
 * It is important to include any gaps between the parent's left_bound
 * and the node's left_bound, and same goes for the right side.
 */
static void
hammer_cursor_mirror_filter(hammer_cursor_t cursor)
{
	struct hammer_cmirror *cmirror;
	hammer_node_ondisk_t ondisk;
	hammer_btree_elm_t elm;

	ondisk = cursor->node->ondisk;
	cmirror = cursor->cmirror;

	/*
	 * Calculate the skipped range: advance over consecutive elements
	 * whose aggregate mirror_tid is below the requested mirror_tid.
	 */
	elm = &ondisk->elms[cursor->index];
	if (cursor->index == 0)
		cmirror->skip_beg = *cursor->left_bound;
	else
		cmirror->skip_beg = elm->internal.base;
	while (cursor->index < ondisk->count) {
		if (elm->internal.mirror_tid >= cmirror->mirror_tid)
			break;
		++cursor->index;
		++elm;
	}
	if (cursor->index == ondisk->count)
		cmirror->skip_end = *cursor->right_bound;
	else
		cmirror->skip_end = elm->internal.base;

	/*
	 * Clip the returned result to the requested key range.
	 */
	if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
		cmirror->skip_beg = cursor->key_beg;
	if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
		cmirror->skip_end = cursor->key_end;
}

/*
 * Iterate in the reverse direction.  This is used by the pruning code to
 * avoid overlapping records.
 *
 * Returns 0 with the cursor positioned on a matching element, or ENOENT
 * when the iteration runs past the beginning of the requested key range.
 */
int
hammer_btree_iterate_reverse(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error = 0;
	int r;
	int s;

	/* mirror filtering not supported for reverse iteration */
	KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);

	/*
	 * Skip past the current record.  For various reasons the cursor
	 * may end up set to -1 or set to point at the end of the current
	 * node.  These cases must be addressed.
	 */
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index != -1 &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		--cursor->index;
	}
	if (cursor->index == cursor->node->ondisk->count)
		--cursor->index;

	/*
	 * HAMMER can wind up being cpu-bound.
	 */
	hmp = cursor->trans->hmp;
	if (++hmp->check_yield > hammer_yield_check) {
		hmp->check_yield = 0;
		lwkt_user_yield();
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		++hammer_stats_btree_iterations;
		hammer_flusher_clean_loose_ios(hmp);

		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 */
		if (cursor->index == -1) {
			error = hammer_cursor_up(cursor);
			if (error) {
				cursor->index = 0; /* sanity */
				break;
			}
			/* reload stale pointer */
			node = cursor->node->ondisk;
			KKASSERT(cursor->index != node->count);
			--cursor->index;
			continue;
		}

		/*
		 * Check internal or leaf element.  Determine if the record
		 * at the cursor has gone beyond the end of our range.
		 *
		 * We recurse down through internal nodes.
		 */
		KKASSERT(cursor->index != node->count);
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			elm = &node->elms[cursor->index];

			/* r: key_end vs left bracket, s: key_beg vs right bracket */
			r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
			s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
			if (hammer_debug_btree) {
				kprintf("BRACKETL %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)elm[0].internal.base.obj_id,
					elm[0].internal.base.rec_type,
					(long long)elm[0].internal.base.key,
					elm[0].internal.base.localization,
					r,
					curthread
				);
				kprintf("BRACKETR %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index + 1,
					(long long)elm[1].internal.base.obj_id,
					elm[1].internal.base.rec_type,
					(long long)elm[1].internal.base.key,
					elm[1].internal.base.localization,
					s
				);
			}

			if (s >= 0) {
				error = ENOENT;
				break;
			}

			/*
			 * It shouldn't be possible to be seeked past key_end,
			 * even if the cursor got moved to a parent.
			 */
			KKASSERT(r >= 0);

			/*
			 * Better not be zero
			 */
			KKASSERT(elm->internal.subtree_offset != 0);

			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			/* reload stale pointer */
			node = cursor->node->ondisk;

			/* this can assign -1 if the leaf was empty */
			cursor->index = node->count - 1;
			continue;
		} else {
			elm = &node->elms[cursor->index];
			s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
			if (hammer_debug_btree) {
				kprintf("ELEMENTR %016llx:%d %c %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(elm[0].leaf.base.btype ?
					 elm[0].leaf.base.btype : '?'),
					(long long)elm[0].leaf.base.obj_id,
					elm[0].leaf.base.rec_type,
					(long long)elm[0].leaf.base.key,
					elm[0].leaf.base.localization,
					s
				);
			}
			if (s > 0) {
				error = ENOENT;
				break;
			}

			/*
			 * It shouldn't be possible to be seeked past key_end,
			 * even if the cursor got moved to a parent.
			 */
			cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

			/*
			 * Return the element.  As-of records which are not
			 * visible at cursor->asof are skipped.
			 */
			switch(elm->leaf.base.btype) {
			case HAMMER_BTREE_TYPE_RECORD:
				if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
				    hammer_btree_chkts(cursor->asof, &elm->base)) {
					--cursor->index;
					continue;
				}
				error = 0;
				break;
			default:
				error = EINVAL;
				break;
			}
			if (error)
				break;
		}

		/*
		 * Return entry
		 */
		if (hammer_debug_btree) {
			int i = cursor->index;
			hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
			kprintf("ITERATER %p:%d %016llx %02x "
				"key=%016llx lo=%02x\n",
				cursor->node, i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				elm->internal.base.localization
			);
		}
		return(0);
	}
	return(error);
}

/*
 * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
 * could not be found, EDEADLK if inserting and a retry is needed, and a
 * fatal error otherwise.  When retrying, the caller must terminate the
 * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
 *
 * The cursor is suitably positioned for a deletion on success, and suitably
 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
 * specified.
 *
 * The cursor may begin anywhere, the search will traverse the tree in
 * either direction to locate the requested element.
 *
 * Most of the logic implementing historical searches is handled here.  We
 * do an initial lookup with create_tid set to the asof TID.  Due to the
 * way records are laid out, a backwards iteration may be required if
 * ENOENT is returned to locate the historical record.  Here's the
 * problem:
 *
 * create_tid:    10      15       20
 *		     LEAF1   LEAF2
 * records:         (11)        (18)
 *
 * Let's say we want to do a lookup AS-OF timestamp 17.
We will traverse 648 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 649 * not visible and thus causes ENOENT to be returned. We really need 650 * to check record 11 in LEAF1. If it also fails then the search fails 651 * (e.g. it might represent the range 11-16 and thus still not match our 652 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 653 * further iterations. 654 * 655 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 656 * and the cursor->create_check TID if an iteration might be needed. 657 * In the above example create_check would be set to 14. 658 */ 659 int 660 hammer_btree_lookup(hammer_cursor_t cursor) 661 { 662 int error; 663 664 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 665 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 666 cursor->trans->sync_lock_refs > 0); 667 ++hammer_stats_btree_lookups; 668 if (cursor->flags & HAMMER_CURSOR_ASOF) { 669 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 670 cursor->key_beg.create_tid = cursor->asof; 671 for (;;) { 672 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 673 error = btree_search(cursor, 0); 674 if (error != ENOENT || 675 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 676 /* 677 * Stop if no error. 678 * Stop if error other then ENOENT. 679 * Stop if ENOENT and not special case. 680 */ 681 break; 682 } 683 if (hammer_debug_btree) { 684 kprintf("CREATE_CHECK %016llx\n", 685 (long long)cursor->create_check); 686 } 687 cursor->key_beg.create_tid = cursor->create_check; 688 /* loop */ 689 } 690 } else { 691 error = btree_search(cursor, 0); 692 } 693 if (error == 0) 694 error = hammer_btree_extract(cursor, cursor->flags); 695 return(error); 696 } 697 698 /* 699 * Execute the logic required to start an iteration. The first record 700 * located within the specified range is returned and iteration control 701 * flags are adjusted for successive hammer_btree_iterate() calls. 
 *
 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_first(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_lookup(cursor);
	if (error == ENOENT) {
		/* lookup missed; iterate forwards to the first in-range elm */
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		error = hammer_btree_iterate(cursor);
	}
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}

/*
 * Similarly but for an iteration in the reverse direction.
 *
 * Set ATEDISK when iterating backwards to skip the current entry,
 * which after an ENOENT lookup will be pointing beyond our end point.
 *
 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_last(hammer_cursor_t cursor)
{
	struct hammer_base_elm save;
	int error;

	/* temporarily look up key_end instead of key_beg */
	save = cursor->key_beg;
	cursor->key_beg = cursor->key_end;
	error = hammer_btree_lookup(cursor);
	cursor->key_beg = save;
	if (error == ENOENT ||
	    (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		error = hammer_btree_iterate_reverse(cursor);
	}
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}

/*
 * Extract the record and/or data associated with the cursor's current
 * position.  Any prior record or data stored in the cursor is replaced.
 * On success cursor->leaf points at the leaf element and, if
 * HAMMER_CURSOR_GET_DATA was passed in flags, cursor->data points at the
 * record's data (NULL if the record has none).
 *
 * NOTE: All extractions occur at the leaf of the B-Tree.
 */
int
hammer_btree_extract(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_off_t data_off;
	hammer_mount_t hmp;
	int32_t data_len;
	int error;

	/*
	 * Certain types of corruption can result in a NULL node pointer.
	 */
	if (cursor->node == NULL) {
		kprintf("hammer: NULL cursor->node, filesystem might "
			"have gotten corrupted\n");
		return (EINVAL);
	}

	/*
	 * The case where the data reference resolves to the same buffer
	 * as the record reference must be handled.
	 */
	node = cursor->node->ondisk;
	elm = &node->elms[cursor->index];
	cursor->data = NULL;
	hmp = cursor->node->hmp;

	/*
	 * There is nothing to extract for an internal element.
	 */
	if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
		return(EINVAL);

	/*
	 * Only record types have data.
	 */
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	cursor->leaf = &elm->leaf;

	if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
		return(0);
	if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
		return(0);
	data_off = elm->leaf.data_offset;
	data_len = elm->leaf.data_len;
	if (data_off == 0)
		return(0);

	/*
	 * Load the data
	 */
	KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
	cursor->data = hammer_bread_ext(hmp, data_off, data_len,
					&error, &cursor->data_buffer);

	/*
	 * Mark the data buffer as not being meta-data if it isn't
	 * meta-data (sometimes bulk data is accessed via a volume
	 * block device).
	 */
	if (error == 0) {
		switch(elm->leaf.base.rec_type) {
		case HAMMER_RECTYPE_DATA:
		case HAMMER_RECTYPE_DB:
			if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0)
				break;
			if (hammer_double_buffer == 0 ||
			    (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) {
				hammer_io_notmeta(cursor->data_buffer);
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Deal with CRC errors on the extracted data.
	 */
	if (error == 0 &&
	    hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) {
		kprintf("CRC DATA @ %016llx/%d FAILED\n",
			(long long)elm->leaf.data_offset, elm->leaf.data_len);
		if (hammer_debug_critical)
			Debugger("CRC FAILED: DATA");
		if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
			error = EDOM;	/* less critical (mirroring) */
		else
			error = EIO;	/* critical */
	}
	return(error);
}


/*
 * Insert a leaf element into the B-Tree at the current cursor position.
 * The cursor is positioned such that the element at and beyond the cursor
 * are shifted to make room for the new record.
 *
 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
 * flag set and that call must return ENOENT before this function can be
 * called.  ENOSPC is returned if there is no room to insert a new record.
 *
 * The caller may depend on the cursor's exclusive lock after return to
 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
 *
 * *doprop is set to 1 if the insertion bumped the leaf's aggregate
 * mirror_tid, telling the caller the change must be propagated upwards.
 */
int
hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm,
		    int *doprop)
{
	hammer_node_ondisk_t node;
	int i;
	int error;

	*doprop = 0;
	if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
		return(error);
	++hammer_stats_btree_inserts;

	/*
	 * Insert the element at the leaf node and update the count in the
	 * parent.  It is possible for parent to be NULL, indicating that
	 * the filesystem's ROOT B-Tree node is a leaf itself, which is
	 * possible.  The root inode can never be deleted so the leaf should
	 * never be empty.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	hammer_modify_node_all(cursor->trans, cursor->node);
	node = cursor->node->ondisk;
	i = cursor->index;
	KKASSERT(elm->base.btype != 0);
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
	if (i != node->count) {
		/* shift elements at and beyond the cursor up by one */
		bcopy(&node->elms[i], &node->elms[i+1],
		      (node->count - i) * sizeof(*elm));
	}
	node->elms[i].leaf = *elm;
	++node->count;
	hammer_cursor_inserted_element(cursor->node, i);

	/*
	 * Update the leaf node's aggregate mirror_tid for mirroring
	 * support.
	 */
	if (node->mirror_tid < elm->base.delete_tid) {
		node->mirror_tid = elm->base.delete_tid;
		*doprop = 1;
	}
	if (node->mirror_tid < elm->base.create_tid) {
		node->mirror_tid = elm->base.create_tid;
		*doprop = 1;
	}
	hammer_modify_node_done(cursor->node);

	/*
	 * Debugging sanity checks: the new element must sort within the
	 * node's bounds and strictly between its neighbors.
	 */
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
	if (i) {
		KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
	}
	if (i != node->count - 1)
		KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);

	return(0);
}

/*
 * Delete a record from the B-Tree at the current cursor position.
 * The cursor is positioned such that the current element is the one
 * to be deleted.
 *
 * On return the cursor will be positioned after the deleted element and
 * MAY point to an internal node.  It will be suitable for the continuation
 * of an iteration but not for an insertion or deletion.
 *
 * Deletions will attempt to partially rebalance the B-Tree in an upward
 * direction, but will terminate rather than deadlock.  Empty internal nodes
 * are never allowed by a deletion which deadlocks may end up giving us an
 * empty leaf.  The pruner will clean up and rebalance the tree.
 *
 * This function can return EDEADLK, requiring the caller to retry the
 * operation after clearing the deadlock.
 *
 * This function will store the number of deleted btree nodes in *ndelete
 * if ndelete is not NULL.
 */
int
hammer_btree_delete(hammer_cursor_t cursor, int *ndelete)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent __debugvar;
	int error;
	int i;

	KKASSERT (cursor->trans->sync_lock_refs > 0);
	if (ndelete)
		*ndelete = 0;
	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		return(error);
	++hammer_stats_btree_deletes;

	/*
	 * Delete the element from the leaf node.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	i = cursor->index;

	KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(i >= 0 && i < ondisk->count);
	hammer_modify_node_all(cursor->trans, node);
	if (i + 1 != ondisk->count) {
		/* close the gap left by the deleted element */
		bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
		      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
	}
	--ondisk->count;
	hammer_modify_node_done(node);
	hammer_cursor_deleted_element(node, i);

	/*
	 * Validate local parent
	 */
	if (ondisk->parent) {
		parent = cursor->parent;

		KKASSERT(parent != NULL);
		KKASSERT(parent->node_offset == ondisk->parent);
	}

	/*
	 * If the leaf becomes empty it must be detached from the parent,
	 * potentially recursing through to the filesystem root.
	 *
	 * This may reposition the cursor at one of the parents of the
	 * current node.
	 *
	 * Ignore deadlock errors, that simply means that btree_remove
	 * was unable to recurse and had to leave us with an empty leaf.
	 */
	KKASSERT(cursor->index <= ondisk->count);
	if (ondisk->count == 0) {
		error = btree_remove(cursor, ndelete);
		if (error == EDEADLK)
			error = 0;
	} else {
		error = 0;
	}
	KKASSERT(cursor->parent == NULL ||
		 cursor->parent_index < cursor->parent->ondisk->count);
	return(error);
}

/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
 *
 * The search can begin ANYWHERE in the B-Tree.  As a first step the search
 * iterates up the tree as necessary to properly position itself prior to
 * actually doing the search.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.  If we run out
 * of space the search continues to the leaf, but ENOSPC is returned.
 *
 * The search is only guaranteed to end up on a leaf if an error code of 0
 * is returned, or if inserting and an error code of ENOENT is returned.
 * Otherwise it can stop at an internal node.  On success a search returns
 * a leaf node.
 *
 * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
 * filesystem, and it is not simple code.  Please note the following facts:
 *
 * - Internal node recursions have a boundary on the left AND right.  The
 *   right boundary is non-inclusive.  The create_tid is a generic part
 *   of the key for internal nodes.
 *
 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
 *   historical search.  ASOF and INSERT are mutually exclusive.  When
 *   doing an as-of lookup btree_search() checks for a right-edge boundary
 *   case.  If while recursing down the left-edge differs from the key
 *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
 *   with cursor->create_check.  This is used by btree_lookup() to iterate.
 *   The iteration runs backwards because as-of searches can wind up going
 *   down the wrong branch of the B-Tree.
 */
static
int
btree_search(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	int error;
	int enospc = 0;		/* set when a split fails with ENOSPC */
	int i;
	int r;			/* compare vs left bound / element */
	int s;			/* compare vs right bound */

	flags |= cursor->flags;
	++hammer_stats_btree_searches;

	if (hammer_debug_btree) {
		kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td=%p)\n",
			(long long)cursor->node->node_offset,
			cursor->index,
			(long long)cursor->key_beg.obj_id,
			cursor->key_beg.rec_type,
			(long long)cursor->key_beg.key,
			(long long)cursor->key_beg.create_tid,
			cursor->key_beg.localization,
			curthread
		);
		if (cursor->parent)
			kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n",
				(long long)cursor->parent->node_offset,
				cursor->parent_index,
				(long long)cursor->left_bound->obj_id,
				(long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id,
				(long long)cursor->right_bound->obj_id,
				(long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id,
				cursor->left_bound,
				&cursor->parent->ondisk->elms[cursor->parent_index],
				cursor->right_bound,
				&cursor->parent->ondisk->elms[cursor->parent_index+1]
			);
	}

	/*
	 * Move our cursor up the tree until we find a node whose range covers
	 * the key we are trying to locate.
	 *
	 * The left bound is inclusive, the right bound is non-inclusive.
	 * It is ok to cursor up too far.
	 */
	for (;;) {
		r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
		s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
		if (r >= 0 && s < 0)
			break;
		KKASSERT(cursor->parent);
		++hammer_stats_btree_iterations;
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
	}

	/*
	 * The delete-checks below are based on node, not parent.  Set the
	 * initial delete-check based on the parent.
	 *
	 * r == 1 means key_beg differs from the left bound by create_tid
	 * only; record the create_check so as-of lookups can iterate.
	 */
	if (r == 1) {
		KKASSERT(cursor->left_bound->create_tid != 1);
		cursor->create_check = cursor->left_bound->create_tid - 1;
		cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
	}

	/*
	 * We better have ended up with a node somewhere.
	 */
	KKASSERT(cursor->node != NULL);

	/*
	 * If we are inserting we can't start at a full node if the parent
	 * is also full (because there is no way to split the node),
	 * continue running up the tree until the requirement is satisfied
	 * or we hit the root of the filesystem.
	 *
	 * (If inserting we aren't doing an as-of search so we don't have
	 *  to worry about create_check).
	 */
	while (flags & HAMMER_CURSOR_INSERT) {
		if (btree_node_is_full(cursor->node->ondisk) == 0)
			break;
		if (cursor->node->ondisk->parent == 0 ||
		    cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
			break;
		}
		++hammer_stats_btree_iterations;
		error = hammer_cursor_up(cursor);
		/* node may have become stale */
		if (error)
			goto done;
	}

	/*
	 * Push down through internal nodes to locate the requested key.
	 */
	node = cursor->node->ondisk;
	while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
		/*
		 * Scan the node to find the subtree index to push down into.
		 * We go one-past, then back-up.
		 *
		 * We must proactively remove deleted elements which may
		 * have been left over from a deadlocked btree_remove().
		 *
		 * The left and right boundaries are included in the loop
		 * in order to detect edge cases.
		 *
		 * If the separator only differs by create_tid (r == 1)
		 * and we are doing an as-of search, we may end up going
		 * down a branch to the left of the one containing the
		 * desired key.  This requires numerous special cases.
		 */
		++hammer_stats_btree_iterations;
		if (hammer_debug_btree) {
			kprintf("SEARCH-I %016llx count=%d\n",
				(long long)cursor->node->node_offset,
				node->count);
		}

		/*
		 * Try to shortcut the search before dropping into the
		 * linear loop.  Locate the first node where r <= 1.
		 */
		i = hammer_btree_search_node(&cursor->key_beg, node);
		while (i <= node->count) {
			++hammer_stats_btree_elements;
			elm = &node->elms[i];
			r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
			if (hammer_debug_btree > 2) {
				kprintf(" IELM %p %d r=%d\n",
					&node->elms[i], i, r);
			}
			if (r < 0)
				break;
			if (r == 1) {
				/* differs only by create_tid, remember it */
				KKASSERT(elm->base.create_tid != 1);
				cursor->create_check = elm->base.create_tid - 1;
				cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
			}
			++i;
		}
		if (hammer_debug_btree) {
			kprintf("SEARCH-I preI=%d/%d r=%d\n",
				i, node->count, r);
		}

		/*
		 * These cases occur when the parent's idea of the boundary
		 * is wider than the child's idea of the boundary, and
		 * require special handling.  If not inserting we can
		 * terminate the search early for these cases but the
		 * child's boundaries cannot be unconditionally modified.
		 */
		if (i == 0) {
			/*
			 * If i == 0 the search terminated to the LEFT of the
			 * left_boundary but to the RIGHT of the parent's left
			 * boundary.
			 */
			u_int8_t save;

			elm = &node->elms[0];

			/*
			 * If we aren't inserting we can stop here.
			 */
			if ((flags & (HAMMER_CURSOR_INSERT |
				      HAMMER_CURSOR_PRUNING)) == 0) {
				cursor->index = 0;
				return(ENOENT);
			}

			/*
			 * Correct a left-hand boundary mismatch.
			 *
			 * We can only do this if we can upgrade the lock,
			 * and synchronized as a background cursor (i.e.
			 * inserting or pruning).
			 *
			 * WARNING: We can only do this if inserting, i.e.
			 * we are running on the backend.
			 */
			if ((error = hammer_cursor_upgrade(cursor)) != 0)
				return(error);
			KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
			hammer_modify_node_field(cursor->trans, cursor->node,
						 elms[0]);
			/* preserve the btype while widening the boundary */
			save = node->elms[0].base.btype;
			node->elms[0].base = *cursor->left_bound;
			node->elms[0].base.btype = save;
			hammer_modify_node_done(cursor->node);
		} else if (i == node->count + 1) {
			/*
			 * If i == node->count + 1 the search terminated to
			 * the RIGHT of the right boundary but to the LEFT
			 * of the parent's right boundary.  If we aren't
			 * inserting we can stop here.
			 *
			 * Note that the last element in this case is
			 * elms[i-2] prior to adjustments to 'i'.
			 */
			--i;
			if ((flags & (HAMMER_CURSOR_INSERT |
				      HAMMER_CURSOR_PRUNING)) == 0) {
				cursor->index = i;
				return (ENOENT);
			}

			/*
			 * Correct a right-hand boundary mismatch.
			 * (actual push-down record is i-2 prior to
			 * adjustments to i).
			 *
			 * We can only do this if we can upgrade the lock,
			 * and synchronized as a background cursor (i.e.
			 * inserting or pruning).
			 *
			 * WARNING: We can only do this if inserting, i.e.
			 * we are running on the backend.
			 */
			if ((error = hammer_cursor_upgrade(cursor)) != 0)
				return(error);
			elm = &node->elms[i];
			KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->base, sizeof(elm->base));
			elm->base = *cursor->right_bound;
			hammer_modify_node_done(cursor->node);
			--i;
		} else {
			/*
			 * The push-down index is now i - 1.  If we had
			 * terminated on the right boundary this will point
			 * us at the last element.
			 */
			--i;
		}
		cursor->index = i;
		elm = &node->elms[i];

		if (hammer_debug_btree) {
			kprintf("RESULT-I %016llx[%d] %016llx %02x "
				"key=%016llx cre=%016llx lo=%02x\n",
				(long long)cursor->node->node_offset,
				i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				(long long)elm->internal.base.create_tid,
				elm->internal.base.localization
			);
		}

		/*
		 * We better have a valid subtree offset.
		 */
		KKASSERT(elm->internal.subtree_offset != 0);

		/*
		 * Handle insertion and deletion requirements.
		 *
		 * If inserting split full nodes.  The split code will
		 * adjust cursor->node and cursor->index if the current
		 * index winds up in the new node.
		 *
		 * If inserting and a left or right edge case was detected,
		 * we cannot correct the left or right boundary and must
		 * prepend and append an empty leaf node in order to make
		 * the boundary correction.
		 *
		 * If we run out of space we set enospc but continue on
		 * to a leaf.
		 */
		if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
			if (btree_node_is_full(node)) {
				error = btree_split_internal(cursor);
				if (error) {
					if (error != ENOSPC)
						goto done;
					enospc = 1;
				}
				/*
				 * reload stale pointers
				 */
				i = cursor->index;
				node = cursor->node->ondisk;
			}
		}

		/*
		 * Push down (push into new node, existing node becomes
		 * the parent) and continue the search.
		 */
		error = hammer_cursor_down(cursor);
		/* node may have become stale */
		if (error)
			goto done;
		node = cursor->node->ondisk;
	}

	/*
	 * We are at a leaf, do a linear search of the key array.
	 *
	 * On success the index is set to the matching element and 0
	 * is returned.
	 *
	 * On failure the index is set to the insertion point and ENOENT
	 * is returned.
	 *
	 * Boundaries are not stored in leaf nodes, so the index can wind
	 * up to the left of element 0 (index == 0) or past the end of
	 * the array (index == node->count).  It is also possible that the
	 * leaf might be empty.
	 */
	++hammer_stats_btree_iterations;
	KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
	if (hammer_debug_btree) {
		kprintf("SEARCH-L %016llx count=%d\n",
			(long long)cursor->node->node_offset,
			node->count);
	}

	/*
	 * Try to shortcut the search before dropping into the
	 * linear loop.  Locate the first node where r <= 1.
	 */
	i = hammer_btree_search_node(&cursor->key_beg, node);
	while (i < node->count) {
		++hammer_stats_btree_elements;
		elm = &node->elms[i];

		r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);

		if (hammer_debug_btree > 1)
			kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r);

		/*
		 * We are at a record element.  Stop if we've flipped past
		 * key_beg, not counting the create_tid test.  Allow the
		 * r == 1 case (key_beg > element but differs only by its
		 * create_tid) to fall through to the AS-OF check.
		 */
		KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);

		if (r < 0)
			goto failed;
		if (r > 1) {
			++i;
			continue;
		}

		/*
		 * Check our as-of timestamp against the element.
		 */
		if (flags & HAMMER_CURSOR_ASOF) {
			if (hammer_btree_chkts(cursor->asof,
					       &node->elms[i].base) != 0) {
				++i;
				continue;
			}
			/* success */
		} else {
			if (r > 0) {	/* can only be +1 */
				++i;
				continue;
			}
			/* success */
		}
		cursor->index = i;
		error = 0;
		if (hammer_debug_btree) {
			kprintf("RESULT-L %016llx[%d] (SUCCESS)\n",
				(long long)cursor->node->node_offset, i);
		}
		goto done;
	}

	/*
	 * The search of the leaf node failed.  i is the insertion point.
	 */
failed:
	if (hammer_debug_btree) {
		kprintf("RESULT-L %016llx[%d] (FAILED)\n",
			(long long)cursor->node->node_offset, i);
	}

	/*
	 * No exact match was found, i is now at the insertion point.
	 *
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
	 */
	cursor->index = i;
	if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
	     btree_node_is_full(node)) {
		error = btree_split_leaf(cursor);
		if (error) {
			if (error != ENOSPC)
				goto done;
			enospc = 1;
		}
		/*
		 * reload stale pointers
		 */
		/* NOT USED
		i = cursor->index;
		node = &cursor->node->internal;
		*/
	}

	/*
	 * We reached a leaf but did not find the key we were looking for.
	 * If this is an insert we will be properly positioned for an insert
	 * (ENOENT) or unable to insert (ENOSPC).
	 */
	error = enospc ? ENOSPC : ENOENT;
done:
	return(error);
}

/*
 * Heuristical search for the first element whose comparison is <= 1.  May
 * return an index whose compare result is > 1 but may only return an index
 * whose compare result is <= 1 if it is the first element with that result.
 *
 * Binary search used as a pre-pass by btree_search(); narrows the range
 * down to at most 4 elements which the caller then scans linearly.
 */
int
hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
{
	int b;		/* lower bound of the remaining range */
	int s;		/* upper bound of the remaining range */
	int i;
	int r;

	/*
	 * Don't bother if the node does not have very many elements
	 */
	b = 0;
	s = node->count;
	while (s - b > 4) {
		i = b + (s - b) / 2;
		++hammer_stats_btree_elements;
		r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
		if (r <= 1) {
			s = i;
		} else {
			b = i;
		}
	}
	return(b);
}


/************************************************************************
 *			   SPLITTING AND MERGING 			*
 ************************************************************************
 *
 * These routines do all the dirty work required to split and merge nodes.
 */

/*
 * Split an internal node into two nodes and move the separator at the split
 * point to the parent.
 *
 * (cursor->node, cursor->index) indicates the element the caller intends
 * to push into.  We will adjust node and index if that element winds
 * up in the split node.
 *
 * If we are at the root of the filesystem a new root must be created with
 * two elements, one pointing to the original root and one pointing to the
 * newly allocated split node.
 */
static
int
btree_split_internal(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent;
	hammer_node_t new_node;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t parent_elm;
	struct hammer_node_lock lockroot;
	hammer_mount_t hmp = cursor->trans->hmp;
	int parent_index;
	int made_root;		/* set when a new root node was created */
	int split;		/* element index at which we split */
	int error;
	int i;
	const int esize = sizeof(*elm);

	/* children must be locked so their parent pointers can be fixed up */
	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
	if (error)
		goto done;
	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		goto done;
	++hammer_stats_btree_splits;

	/*
	 * Calculate the split point.  If the insertion point is at the
	 * end of the leaf we adjust the split point significantly to the
	 * right to try to optimize node fill and flag it.  If we hit
	 * that same leaf again our heuristic failed and we don't try
	 * to optimize node fill (it could lead to a degenerate case).
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	KKASSERT(ondisk->count > 4);
	if (cursor->index == ondisk->count &&
	    (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
		split = (ondisk->count + 1) * 3 / 4;
		node->flags |= HAMMER_NODE_NONLINEAR;
	} else {
		/*
		 * We are splitting but elms[split] will be promoted to
		 * the parent, leaving the right hand node with one less
		 * element.  If the insertion point will be on the
		 * left-hand side adjust the split point to give the
		 * right hand side one additional node.
		 */
		split = (ondisk->count + 1) / 2;
		if (cursor->index <= split)
			--split;
	}

	/*
	 * If we are at the root of the filesystem, create a new root node
	 * with 1 element and split normally.  Avoid making major
	 * modifications until we know the whole operation will work.
	 */
	if (ondisk->parent == 0) {
		parent = hammer_alloc_btree(cursor->trans, 0, &error);
		if (parent == NULL)
			goto done;
		hammer_lock_ex(&parent->lock);
		hammer_modify_node_noundo(cursor->trans, parent);
		ondisk = parent->ondisk;
		ondisk->count = 1;
		ondisk->parent = 0;
		ondisk->mirror_tid = node->ondisk->mirror_tid;
		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
		ondisk->elms[0].base = hmp->root_btree_beg;
		ondisk->elms[0].base.btype = node->ondisk->type;
		ondisk->elms[0].internal.subtree_offset = node->node_offset;
		ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
		ondisk->elms[1].base = hmp->root_btree_end;
		hammer_modify_node_done(parent);
		/* ondisk->elms[1].base.btype - not used */
		made_root = 1;
		parent_index = 0;	/* index of current node in parent */
	} else {
		made_root = 0;
		parent = cursor->parent;
		parent_index = cursor->parent_index;
	}

	/*
	 * Split node into new_node at the split point.
	 *
	 *  B O O O P N N B	<-- P = node->elms[split] (index 4)
	 *   0 1 2 3 4 5 6	<-- subtree indices
	 *
	 *		       x x P x x
	 *			s S S s
	 *			 /   \
	 *  B O O O B    B N N B	<--- inner boundary points are 'P'
	 *   0 1 2 3      4 5 6
	 */
	new_node = hammer_alloc_btree(cursor->trans, 0, &error);
	if (new_node == NULL) {
		/* back out the new root before bailing */
		if (made_root) {
			hammer_unlock(&parent->lock);
			hammer_delete_node(cursor->trans, parent);
			hammer_rel_node(parent);
		}
		goto done;
	}
	hammer_lock_ex(&new_node->lock);

	/*
	 * Create the new node.  P becomes the left-hand boundary in the
	 * new node.  Copy the right-hand boundary as well.
	 *
	 * elm is the new separator.
	 */
	hammer_modify_node_noundo(cursor->trans, new_node);
	hammer_modify_node_all(cursor->trans, node);
	ondisk = node->ondisk;
	elm = &ondisk->elms[split];
	bcopy(elm, &new_node->ondisk->elms[0],
	      (ondisk->count - split + 1) * esize);
	new_node->ondisk->count = ondisk->count - split;
	new_node->ondisk->parent = parent->node_offset;
	new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
	new_node->ondisk->mirror_tid = ondisk->mirror_tid;
	KKASSERT(ondisk->type == new_node->ondisk->type);
	hammer_cursor_split_node(node, new_node, split);

	/*
	 * Cleanup the original node.  Elm (P) becomes the new boundary,
	 * its subtree_offset was moved to the new node.  If we had created
	 * a new root its parent pointer may have changed.
	 */
	elm->internal.subtree_offset = 0;
	ondisk->count = split;

	/*
	 * Insert the separator into the parent, fixup the parent's
	 * reference to the original node, and reference the new node.
	 * The separator is P.
	 *
	 * Remember that base.count does not include the right-hand boundary.
	 */
	hammer_modify_node_all(cursor->trans, parent);
	ondisk = parent->ondisk;
	KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
	parent_elm = &ondisk->elms[parent_index+1];
	bcopy(parent_elm, parent_elm + 1,
	      (ondisk->count - parent_index) * esize);
	parent_elm->internal.base = elm->base;	/* separator P */
	parent_elm->internal.base.btype = new_node->ondisk->type;
	parent_elm->internal.subtree_offset = new_node->node_offset;
	parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid;
	++ondisk->count;
	hammer_modify_node_done(parent);
	hammer_cursor_inserted_element(parent, parent_index + 1);

	/*
	 * The children of new_node need their parent pointer set to new_node.
	 * The children have already been locked by
	 * hammer_btree_lock_children().
	 */
	for (i = 0; i < new_node->ondisk->count; ++i) {
		elm = &new_node->ondisk->elms[i];
		error = btree_set_parent(cursor->trans, new_node, elm);
		if (error) {
			panic("btree_split_internal: btree-fixup problem");
		}
	}
	hammer_modify_node_done(new_node);

	/*
	 * The filesystem's root B-Tree pointer may have to be updated.
	 */
	if (made_root) {
		hammer_volume_t volume;

		volume = hammer_get_root_volume(hmp, &error);
		KKASSERT(error == 0);

		hammer_modify_volume_field(cursor->trans, volume,
					   vol0_btree_root);
		volume->ondisk->vol0_btree_root = parent->node_offset;
		hammer_modify_volume_done(volume);
		node->ondisk->parent = parent->node_offset;
		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
		}
		cursor->parent = parent;	/* lock'd and ref'd */
		hammer_rel_volume(volume, 0);
	}
	hammer_modify_node_done(node);

	/*
	 * Ok, now adjust the cursor depending on which element the original
	 * index was pointing at.  If we are >= the split point the push node
	 * is now in the new node.
	 *
	 * NOTE: If we are at the split point itself we cannot stay with the
	 * original node because the push index will point at the right-hand
	 * boundary, which is illegal.
	 *
	 * NOTE: The cursor's parent or parent_index must be adjusted for
	 * the case where a new parent (new root) was created, and the case
	 * where the cursor is now pointing at the split node.
	 */
	if (cursor->index >= split) {
		cursor->parent_index = parent_index + 1;
		cursor->index -= split;
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = new_node;	/* locked and ref'd */
	} else {
		cursor->parent_index = parent_index;
		hammer_unlock(&new_node->lock);
		hammer_rel_node(new_node);
	}

	/*
	 * Fixup left and right bounds
	 */
	parent_elm = &parent->ondisk->elms[cursor->parent_index];
	cursor->left_bound = &parent_elm[0].internal.base;
	cursor->right_bound = &parent_elm[1].internal.base;
	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].internal.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
		 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);

done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
	hammer_cursor_downgrade(cursor);
	return (error);
}

/*
 * Same as the above, but splits a full leaf node.
 */
static
int
btree_split_leaf(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t parent;
	hammer_node_t leaf;
	hammer_mount_t hmp;
	hammer_node_t new_leaf;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t parent_elm;
	hammer_base_elm_t mid_boundary;
	int parent_index;
	int made_root;		/* set when a new root node was created */
	int split;		/* element index at which we split */
	int error;
	const size_t esize = sizeof(*elm);

	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		return(error);
	++hammer_stats_btree_splits;

	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
		 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);

	/*
	 * Calculate the split point.  If the insertion point is at the
	 * end of the leaf we adjust the split point significantly to the
	 * right to try to optimize node fill and flag it.  If we hit
	 * that same leaf again our heuristic failed and we don't try
	 * to optimize node fill (it could lead to a degenerate case).
	 */
	leaf = cursor->node;
	ondisk = leaf->ondisk;
	KKASSERT(ondisk->count > 4);
	if (cursor->index == ondisk->count &&
	    (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) {
		split = (ondisk->count + 1) * 3 / 4;
		leaf->flags |= HAMMER_NODE_NONLINEAR;
	} else {
		split = (ondisk->count + 1) / 2;
	}

#if 0
	/*
	 * If the insertion point is at the split point shift the
	 * split point left so we don't have to worry about
	 */
	if (cursor->index == split)
		--split;
#endif
	KKASSERT(split > 0 && split < ondisk->count);

	error = 0;
	hmp = leaf->hmp;

	elm = &ondisk->elms[split];

	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);

	/*
	 * If we are at the root of the tree, create a new root node with
	 * 1 element and split normally.  Avoid making major modifications
	 * until we know the whole operation will work.
	 */
	if (ondisk->parent == 0) {
		parent = hammer_alloc_btree(cursor->trans, 0, &error);
		if (parent == NULL)
			goto done;
		hammer_lock_ex(&parent->lock);
		hammer_modify_node_noundo(cursor->trans, parent);
		ondisk = parent->ondisk;
		ondisk->count = 1;
		ondisk->parent = 0;
		ondisk->mirror_tid = leaf->ondisk->mirror_tid;
		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
		ondisk->elms[0].base = hmp->root_btree_beg;
		ondisk->elms[0].base.btype = leaf->ondisk->type;
		ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
		ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
		ondisk->elms[1].base = hmp->root_btree_end;
		/* ondisk->elms[1].base.btype = not used */
		hammer_modify_node_done(parent);
		made_root = 1;
		parent_index = 0;	/* insertion point in parent */
	} else {
		made_root = 0;
		parent = cursor->parent;
		parent_index = cursor->parent_index;
	}

	/*
	 * Split leaf into new_leaf at the split point.  Select a separator
	 * value in-between the two leafs but with a bent towards the right
	 * leaf since comparisons use an 'elm >= separator' inequality.
	 *
	 *  L L L L L L L L
	 *
	 *		       x x P x x
	 *			s S S s
	 *			 /   \
	 *  L L L L     L L L L
	 */
	new_leaf = hammer_alloc_btree(cursor->trans, 0, &error);
	if (new_leaf == NULL) {
		/* back out the new root before bailing */
		if (made_root) {
			hammer_unlock(&parent->lock);
			hammer_delete_node(cursor->trans, parent);
			hammer_rel_node(parent);
		}
		goto done;
	}
	hammer_lock_ex(&new_leaf->lock);

	/*
	 * Create the new node and copy the leaf elements from the split
	 * point on to the new node.
	 */
	hammer_modify_node_all(cursor->trans, leaf);
	hammer_modify_node_noundo(cursor->trans, new_leaf);
	ondisk = leaf->ondisk;
	elm = &ondisk->elms[split];
	bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
	new_leaf->ondisk->count = ondisk->count - split;
	new_leaf->ondisk->parent = parent->node_offset;
	new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
	new_leaf->ondisk->mirror_tid = ondisk->mirror_tid;
	KKASSERT(ondisk->type == new_leaf->ondisk->type);
	hammer_modify_node_done(new_leaf);
	hammer_cursor_split_node(leaf, new_leaf, split);

	/*
	 * Cleanup the original node.  Because this is a leaf node and
	 * leaf nodes do not have a right-hand boundary, there
	 * aren't any special edge cases to clean up.  We just fixup the
	 * count.
	 */
	ondisk->count = split;

	/*
	 * Insert the separator into the parent, fixup the parent's
	 * reference to the original node, and reference the new node.
	 * The separator is P.
	 *
	 * Remember that base.count does not include the right-hand boundary.
	 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
	 */
	hammer_modify_node_all(cursor->trans, parent);
	ondisk = parent->ondisk;
	KKASSERT(split != 0);
	KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
	parent_elm = &ondisk->elms[parent_index+1];
	bcopy(parent_elm, parent_elm + 1,
	      (ondisk->count - parent_index) * esize);

	/* synthesize separator P between the last-left and first-right elms */
	hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
	parent_elm->internal.base.btype = new_leaf->ondisk->type;
	parent_elm->internal.subtree_offset = new_leaf->node_offset;
	parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid;
	mid_boundary = &parent_elm->base;
	++ondisk->count;
	hammer_modify_node_done(parent);
	hammer_cursor_inserted_element(parent, parent_index + 1);

	/*
	 * The filesystem's root B-Tree pointer may have to be updated.
	 */
	if (made_root) {
		hammer_volume_t volume;

		volume = hammer_get_root_volume(hmp, &error);
		KKASSERT(error == 0);

		hammer_modify_volume_field(cursor->trans, volume,
					   vol0_btree_root);
		volume->ondisk->vol0_btree_root = parent->node_offset;
		hammer_modify_volume_done(volume);
		leaf->ondisk->parent = parent->node_offset;
		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
		}
		cursor->parent = parent;	/* lock'd and ref'd */
		hammer_rel_volume(volume, 0);
	}
	hammer_modify_node_done(leaf);

	/*
	 * Ok, now adjust the cursor depending on which element the original
	 * index was pointing at.  If we are >= the split point the push node
	 * is now in the new node.
	 *
	 * NOTE: If we are at the split point itself we need to select the
	 * old or new node based on where key_beg's insertion point will be.
	 * If we pick the wrong side the inserted element will wind up in
	 * the wrong leaf node and outside that node's bounds.
	 */
	if (cursor->index > split ||
	    (cursor->index == split &&
	     hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) {
		cursor->parent_index = parent_index + 1;
		cursor->index -= split;
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = new_leaf;
	} else {
		cursor->parent_index = parent_index;
		hammer_unlock(&new_leaf->lock);
		hammer_rel_node(new_leaf);
	}

	/*
	 * Fixup left and right bounds
	 */
	parent_elm = &parent->ondisk->elms[cursor->parent_index];
	cursor->left_bound = &parent_elm[0].internal.base;
	cursor->right_bound = &parent_elm[1].internal.base;

	/*
	 * Assert that the bounds are correct.
	 */
	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
		 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0);

done:
	hammer_cursor_downgrade(cursor);
	return (error);
}

#if 0

/*
 * Recursively correct the right-hand boundary's create_tid to (tid) as
 * long as the rest of the key matches.  We have to recurse upward in
 * the tree as well as down the left side of each parent's right node.
 *
 * Return EDEADLK if we were only partially successful, forcing the caller
 * to try again.  The original cursor is not modified.  This routine can
 * also fail with EDEADLK if it is forced to throw away a portion of its
 * record history.
 *
 * The caller must pass a downgraded cursor to us (otherwise we can't dup it).
 */
struct hammer_rhb {
	TAILQ_ENTRY(hammer_rhb) entry;
	hammer_node_t node;	/* referenced and share-locked while queued */
	int index;		/* element index within node */
};

TAILQ_HEAD(hammer_rhb_list, hammer_rhb);

int
hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid)
{
	struct hammer_mount *hmp;
	struct hammer_rhb_list rhb_list;
	hammer_base_elm_t elm;
	hammer_node_t orig_node;
	struct hammer_rhb *rhb;
	int orig_index;
	int error;

	TAILQ_INIT(&rhb_list);
	hmp = cursor->trans->hmp;

	/*
	 * Save our position so we can restore it on return.  This also
	 * gives us a stable 'elm'.
	 */
	orig_node = cursor->node;
	hammer_ref_node(orig_node);
	hammer_lock_sh(&orig_node->lock);
	orig_index = cursor->index;
	elm = &orig_node->ondisk->elms[orig_index].base;

	/*
	 * Now build a list of parents going up, allocating a rhb
	 * structure for each one.
	 */
	while (cursor->parent) {
		/*
		 * Stop if we no longer have any right-bounds to fix up
		 */
		if (elm->obj_id != cursor->right_bound->obj_id ||
		    elm->rec_type != cursor->right_bound->rec_type ||
		    elm->key != cursor->right_bound->key) {
			break;
		}

		/*
		 * Stop if the right-hand bound's create_tid does not
		 * need to be corrected.
		 */
		if (cursor->right_bound->create_tid >= tid)
			break;

		rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
		rhb->node = cursor->parent;
		rhb->index = cursor->parent_index;
		hammer_ref_node(rhb->node);
		hammer_lock_sh(&rhb->node->lock);
		TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);

		hammer_cursor_up(cursor);
	}

	/*
	 * Now safely adjust the right hand bound for each rhb.  This may
	 * also require taking the right side of the tree and iterating down
	 * ITS left side.
	 */
	error = 0;
	while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
		if (error)
			break;
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);

		switch (cursor->node->ondisk->type) {
		case HAMMER_BTREE_TYPE_INTERNAL:
			/*
			 * Right-boundary for parent at internal node
			 * is one element to the right of the element whose
			 * right boundary needs adjusting.  We must then
			 * traverse down the left side correcting any left
			 * bounds (which may now be too far to the left).
			 */
			++cursor->index;
			error = hammer_btree_correct_lhb(cursor, tid);
			break;
		default:
			panic("hammer_btree_correct_rhb(): Bad node type");
			error = EINVAL;
			break;
		}
	}

	/*
	 * Cleanup: drop the lock and ref on any rhb entries not consumed
	 * by the loop above, then restore the original cursor position.
	 */
	while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);
	}
	error = hammer_cursor_seek(cursor, orig_node, orig_index);
	hammer_unlock(&orig_node->lock);
	hammer_rel_node(orig_node);
	return (error);
}

/*
 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand
 * bound going downward starting at the current cursor position.
 *
 * This function does not restore the cursor after use.
 */
int
hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
{
        struct hammer_rhb_list rhb_list;
        hammer_base_elm_t elm;
        hammer_base_elm_t cmp;
        struct hammer_rhb *rhb;
        struct hammer_mount *hmp;
        int error;

        TAILQ_INIT(&rhb_list);
        hmp = cursor->trans->hmp;

        /* key we are matching left-hand boundaries against */
        cmp = &cursor->node->ondisk->elms[cursor->index].base;

        /*
         * Record the node and traverse down the left-hand side for all
         * matching records needing a boundary correction.
         */
        error = 0;
        for (;;) {
                rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
                rhb->node = cursor->node;
                rhb->index = cursor->index;
                hammer_ref_node(rhb->node);
                hammer_lock_sh(&rhb->node->lock);
                TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);

                if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        /*
                         * Nothing to traverse down if we are at the right
                         * boundary of an internal node.
                         */
                        if (cursor->index == cursor->node->ondisk->count)
                                break;
                } else {
                        /* at a leaf we must land on a record element */
                        elm = &cursor->node->ondisk->elms[cursor->index].base;
                        if (elm->btype == HAMMER_BTREE_TYPE_RECORD)
                                break;
                        panic("Illegal leaf record type %02x", elm->btype);
                }
                error = hammer_cursor_down(cursor);
                if (error)
                        break;

                /* stop descending once the child key no longer matches */
                elm = &cursor->node->ondisk->elms[cursor->index].base;
                if (elm->obj_id != cmp->obj_id ||
                    elm->rec_type != cmp->rec_type ||
                    elm->key != cmp->key) {
                        break;
                }
                if (elm->create_tid >= tid)
                        break;

        }

        /*
         * Now we can safely adjust the left-hand boundary from the bottom-up.
         * The last element we remove from the list is the caller's right hand
         * boundary, which must also be adjusted.
         */
        while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
                error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
                if (error)
                        break;
                TAILQ_REMOVE(&rhb_list, rhb, entry);
                hammer_unlock(&rhb->node->lock);
                hammer_rel_node(rhb->node);
                kfree(rhb, hmp->m_misc);

                elm = &cursor->node->ondisk->elms[cursor->index].base;
                if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        hammer_modify_node(cursor->trans, cursor->node,
                                           &elm->create_tid,
                                           sizeof(elm->create_tid));
                        elm->create_tid = tid;
                        hammer_modify_node_done(cursor->node);
                } else {
                        panic("hammer_btree_correct_lhb(): Bad element type");
                }
        }

        /*
         * Cleanup any rhb's remaining after an error.
         */
        while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
                TAILQ_REMOVE(&rhb_list, rhb, entry);
                hammer_unlock(&rhb->node->lock);
                hammer_rel_node(rhb->node);
                kfree(rhb, hmp->m_misc);
        }
        return (error);
}

#endif

/*
 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at
 * (cursor->node).  Returns 0 on success, EDEADLK if we could not complete
 * the operation due to a deadlock, or some other error.
 *
 * This routine is initially called with an empty leaf and may be
 * recursively called with single-element internal nodes.
 *
 * It should also be noted that when removing empty leaves we must be sure
 * to test and update mirror_tid because another thread may have deadlocked
 * against us (or someone) trying to propagate it up and cannot retry once
 * the node has been deleted.
 *
 * On return the cursor may end up pointing to an internal node, suitable
 * for further iteration but not for an immediate insertion or deletion.
 */
static int
btree_remove(hammer_cursor_t cursor, int *ndelete)
{
        hammer_node_ondisk_t ondisk;
        hammer_btree_elm_t elm;
        hammer_node_t node;
        hammer_node_t parent;
        const int esize = sizeof(*elm);
        int error;

        node = cursor->node;

        /*
         * When deleting the root of the filesystem convert it to
         * an empty leaf node.  Internal nodes cannot be empty.
         */
        ondisk = node->ondisk;
        if (ondisk->parent == 0) {
                KKASSERT(cursor->parent == NULL);
                hammer_modify_node_all(cursor->trans, node);
                KKASSERT(ondisk == node->ondisk);
                ondisk->type = HAMMER_BTREE_TYPE_LEAF;
                ondisk->count = 0;
                hammer_modify_node_done(node);
                cursor->index = 0;
                return(0);
        }

        parent = cursor->parent;

        /*
         * Attempt to remove the parent's reference to the child.  If the
         * parent would become empty we have to recurse.  If we fail we
         * leave the parent pointing to an empty leaf node.
         *
         * We have to recurse successfully before we can delete the internal
         * node as it is illegal to have empty internal nodes.  Even though
         * the operation may be aborted we must still fixup any unlocked
         * cursors as if we had deleted the element prior to recursing
         * (by calling hammer_cursor_deleted_element()) so those cursors
         * are properly forced up the chain by the recursion.
         */
        if (parent->ondisk->count == 1) {
                /*
                 * This special cursor_up_locked() call leaves the original
                 * node exclusively locked and referenced, leaves the
                 * original parent locked (as the new node), and locks the
                 * new parent.  It can return EDEADLK.
                 *
                 * We cannot call hammer_cursor_removed_node() until we are
                 * actually able to remove the node.  If we did then tracked
                 * cursors in the middle of iterations could be repointed
                 * to a parent node.  If this occurs they could end up
                 * scanning newly inserted records into the node (that could
                 * not be deleted) when they push down again.
                 *
                 * Due to the way the recursion works the final parent is left
                 * in cursor->parent after the recursion returns.  Each
                 * layer on the way back up is thus able to call
                 * hammer_cursor_removed_node() and 'jump' the node up to
                 * the (same) final parent.
                 *
                 * NOTE!  The local variable 'parent' is invalid after we
                 *        call hammer_cursor_up_locked().
                 */
                error = hammer_cursor_up_locked(cursor);
                parent = NULL;

                if (error == 0) {
                        hammer_cursor_deleted_element(cursor->node, 0);
                        error = btree_remove(cursor, ndelete);
                        if (error == 0) {
                                /* recursion succeeded, node can now die */
                                KKASSERT(node != cursor->node);
                                hammer_cursor_removed_node(
                                        node, cursor->node,
                                        cursor->index);
                                hammer_modify_node_all(cursor->trans, node);
                                ondisk = node->ondisk;
                                ondisk->type = HAMMER_BTREE_TYPE_DELETED;
                                ondisk->count = 0;
                                hammer_modify_node_done(node);
                                hammer_flush_node(node, 0);
                                hammer_delete_node(cursor->trans, node);
                                if (ndelete)
                                        (*ndelete)++;
                        } else {
                                /*
                                 * Defer parent removal because we could not
                                 * get the lock, just let the leaf remain
                                 * empty.
                                 */
                                /**/
                        }
                        hammer_unlock(&node->lock);
                        hammer_rel_node(node);
                } else {
                        /*
                         * Defer parent removal because we could not
                         * get the lock, just let the leaf remain
                         * empty.
                         */
                        /**/
                }
        } else {
                KKASSERT(parent->ondisk->count > 1);

                hammer_modify_node_all(cursor->trans, parent);
                ondisk = parent->ondisk;
                KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);

                elm = &ondisk->elms[cursor->parent_index];
                KKASSERT(elm->internal.subtree_offset == node->node_offset);
                KKASSERT(ondisk->count > 0);

                /*
                 * We must retain the highest mirror_tid.  The deleted
                 * range is now encompassed by the element to the left.
                 * If we are already at the left edge the new left edge
                 * inherits mirror_tid.
                 *
                 * Note that bounds of the parent to our parent may create
                 * a gap to the left of our left-most node or to the right
                 * of our right-most node.  The gap is silently included
                 * in the mirror_tid's area of effect from the point of view
                 * of the scan.
                 */
                if (cursor->parent_index) {
                        if (elm[-1].internal.mirror_tid <
                            elm[0].internal.mirror_tid) {
                                elm[-1].internal.mirror_tid =
                                    elm[0].internal.mirror_tid;
                        }
                } else {
                        /* left edge; count > 1 guarantees elm[1] exists */
                        if (elm[1].internal.mirror_tid <
                            elm[0].internal.mirror_tid) {
                                elm[1].internal.mirror_tid =
                                    elm[0].internal.mirror_tid;
                        }
                }

                /*
                 * Delete the subtree reference in the parent.  Include
                 * boundary element at end.
                 */
                bcopy(&elm[1], &elm[0],
                      (ondisk->count - cursor->parent_index) * esize);
                --ondisk->count;
                hammer_modify_node_done(parent);
                hammer_cursor_removed_node(node, parent, cursor->parent_index);
                hammer_cursor_deleted_element(parent, cursor->parent_index);
                hammer_flush_node(node, 0);
                hammer_delete_node(cursor->trans, node);

                /*
                 * cursor->node is invalid, cursor up to make the cursor
                 * valid again.  We have to flag the condition in case
                 * another thread wiggles an insertion in during an
                 * iteration.
                 */
                cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
                error = hammer_cursor_up(cursor);
                if (ndelete)
                        (*ndelete)++;
        }
        return (error);
}

/*
 * Propagate cursor->trans->tid up the B-Tree starting at the current
 * cursor position using pseudofs info gleaned from the passed inode.
 *
 * The passed inode has no relationship to the cursor position other
 * then being in the same pseudofs as the insertion or deletion we
 * are propagating the mirror_tid for.
 *
 * WARNING!  Because we push and pop the passed cursor, it may be
 *           modified by other B-Tree operations while it is unlocked
 *           and things like the node & leaf pointers, and indexes might
 *           change.
 *
 * NOTE(review): pfsm and leaf are not referenced by this function body;
 *               they appear to be retained for interface compatibility.
 */
void
hammer_btree_do_propagation(hammer_cursor_t cursor,
                            hammer_pseudofs_inmem_t pfsm,
                            hammer_btree_leaf_elm_t leaf)
{
        hammer_cursor_t ncursor;
        hammer_tid_t mirror_tid;
        int error __debugvar;

        /*
         * We do not propagate a mirror_tid if the filesystem was mounted
         * in no-mirror mode.
         */
        if (cursor->trans->hmp->master_id < 0)
                return;

        /*
         * This is a bit of a hack because we cannot deadlock or return
         * EDEADLK here.  The related operation has already completed and
         * we must propagate the mirror_tid now regardless.
         *
         * Generate a new cursor which inherits the original's locks and
         * unlock the original.  Use the new cursor to propagate the
         * mirror_tid.  Then clean up the new cursor and reacquire locks
         * on the original.
         *
         * hammer_dup_cursor() cannot dup locks.  The dup inherits the
         * original's locks and the original is tracked and must be
         * re-locked.
         */
        mirror_tid = cursor->node->ondisk->mirror_tid;
        KKASSERT(mirror_tid != 0);
        ncursor = hammer_push_cursor(cursor);
        error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
        KKASSERT(error == 0);
        hammer_pop_cursor(cursor, ncursor);
        /* WARNING: cursor's leaf pointer may change after pop */
}


/*
 * Propagate a mirror TID update upwards through the B-Tree to the root.
 *
 * A locked internal node must be passed in.  The node will remain locked
 * on return.
 *
 * This function syncs mirror_tid at the specified internal node's element,
 * adjusts the node's aggregation mirror_tid, and then recurses upwards.
 */
static int
hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid)
{
        hammer_btree_internal_elm_t elm;
        hammer_node_t node;
        int error;

        for (;;) {
                error = hammer_cursor_up(cursor);
                if (error == 0)
                        error = hammer_cursor_upgrade(cursor);

                /*
                 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the
                 * cursor will still be properly positioned for
                 * mirror propagation, just not for iterations.
                 */
                while (error == EDEADLK) {
                        hammer_recover_cursor(cursor);
                        error = hammer_cursor_upgrade(cursor);
                }
                if (error)
                        break;

                /*
                 * If the cursor deadlocked it could end up at a leaf
                 * after we lost the lock.
                 */
                node = cursor->node;
                if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL)
                        continue;

                /*
                 * Adjust the node's element.  If the element is already
                 * up to date every node above it must be too - stop.
                 */
                elm = &node->ondisk->elms[cursor->index].internal;
                if (elm->mirror_tid >= mirror_tid)
                        break;
                hammer_modify_node(cursor->trans, node, &elm->mirror_tid,
                                   sizeof(elm->mirror_tid));
                elm->mirror_tid = mirror_tid;
                hammer_modify_node_done(node);
                if (hammer_debug_general & 0x0002) {
                        kprintf("mirror_propagate: propagate "
                                "%016llx @%016llx:%d\n",
                                (long long)mirror_tid,
                                (long long)node->node_offset,
                                cursor->index);
                }


                /*
                 * Adjust the node's mirror_tid aggregator
                 */
                if (node->ondisk->mirror_tid >= mirror_tid)
                        return(0);
                hammer_modify_node_field(cursor->trans, node, mirror_tid);
                node->ondisk->mirror_tid = mirror_tid;
                hammer_modify_node_done(node);
                if (hammer_debug_general & 0x0002) {
                        kprintf("mirror_propagate: propagate "
                                "%016llx @%016llx\n",
                                (long long)mirror_tid,
                                (long long)node->node_offset);
                }
        }
        /* running off the top of the tree (ENOENT) is a clean termination */
        if (error == ENOENT)
                error = 0;
        return(error);
}

/*
 * Return a referenced, locked parent of the given node, and set
 * *parent_indexp to the index of the parent element pointing at the child.
 * On failure *errorp is set (EDEADLK if try_exclusive could not acquire
 * the lock) and NULL is returned.
 */
hammer_node_t
hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node,
                        int *parent_indexp, int *errorp, int try_exclusive)
{
        hammer_node_t parent;
        hammer_btree_elm_t elm;
        int i;

        /*
         * Get the node
         */
        parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp);
        if (*errorp) {
                KKASSERT(parent == NULL);
                return(NULL);
        }
        KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0);

        /*
         * Lock the node
         */
        if (try_exclusive) {
                if (hammer_lock_ex_try(&parent->lock)) {
                        hammer_rel_node(parent);
                        *errorp = EDEADLK;
                        return(NULL);
                }
        } else {
                hammer_lock_sh(&parent->lock);
        }

        /*
         * Figure out which element in the parent is pointing to the
         * child.
         */
        if (node->ondisk->count) {
                i = hammer_btree_search_node(&node->ondisk->elms[0].base,
                                             parent->ondisk);
        } else {
                i = 0;
        }
        while (i < parent->ondisk->count) {
                elm = &parent->ondisk->elms[i];
                if (elm->internal.subtree_offset == node->node_offset)
                        break;
                ++i;
        }
        if (i == parent->ondisk->count) {
                hammer_unlock(&parent->lock);
                panic("Bad B-Tree link: parent %p node %p", parent, node);
        }
        *parent_indexp = i;
        KKASSERT(*errorp == 0);
        return(parent);
}

/*
 * The element (elm) has been moved to a new internal node (node).
 *
 * If the element represents a pointer to an internal node that node's
 * parent must be adjusted to the element's new location.
 *
 * XXX deadlock potential here with our exclusive locks
 */
int
btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
                 hammer_btree_elm_t elm)
{
        hammer_node_t child;
        int error;

        error = 0;

        switch(elm->base.btype) {
        case HAMMER_BTREE_TYPE_INTERNAL:
        case HAMMER_BTREE_TYPE_LEAF:
                /* re-point the child's parent link at our node */
                child = hammer_get_node(trans, elm->internal.subtree_offset,
                                        0, &error);
                if (error == 0) {
                        hammer_modify_node_field(trans, child, parent);
                        child->ondisk->parent = node->node_offset;
                        hammer_modify_node_done(child);
                        hammer_rel_node(child);
                }
                break;
        default:
                /* record elements carry no subtree pointer - nothing to do */
                break;
        }
        return(error);
}

/*
 * Initialize the root of a recursive B-Tree node lock list structure.
 */
void
hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
{
        TAILQ_INIT(&parent->list);
        parent->parent = NULL;
        parent->node = node;
        parent->index = -1;
        parent->count = node->ondisk->count;
        parent->copy = NULL;
        parent->flags = 0;
}

/*
 * Initialize a cache of hammer_node_lock's including space allocated
 * for node copies.
 *
 * This is used by the rebalancing code to preallocate the copy space
 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER
 * locks, otherwise we can blow out the pageout daemon's emergency
 * reserve and deadlock it.
 *
 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache.
 *       The flag is set when the item is pulled off the cache for use.
 */
void
hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
                         int depth)
{
        hammer_node_lock_t item;
        int count;

        /* count = HAMMER_BTREE_LEAF_ELMS ^ depth items to preallocate */
        for (count = 1; depth; --depth)
                count *= HAMMER_BTREE_LEAF_ELMS;
        bzero(lcache, sizeof(*lcache));
        TAILQ_INIT(&lcache->list);
        while (count) {
                item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO);
                item->copy = kmalloc(sizeof(*item->copy),
                                     hmp->m_misc, M_WAITOK);
                TAILQ_INIT(&item->list);
                TAILQ_INSERT_TAIL(&lcache->list, item, entry);
                --count;
        }
}

/*
 * Release an lcache previously set up by hammer_btree_lcache_init(),
 * freeing each cached item and its node-copy buffer.
 */
void
hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache)
{
        hammer_node_lock_t item;

        while ((item = TAILQ_FIRST(&lcache->list)) != NULL) {
                TAILQ_REMOVE(&lcache->list, item, entry);
                KKASSERT(item->copy);
                KKASSERT(TAILQ_EMPTY(&item->list));
                kfree(item->copy, hmp->m_misc);
                kfree(item, hmp->m_misc);
        }
        KKASSERT(lcache->copy == NULL);
}

/*
 * Exclusively lock all the children of node.  This is used by the split
 * code to prevent anyone from accessing the children of a cursor node
 * while we fix-up its parent offset.
 *
 * If we don't lock the children we can really mess up cursors which block
 * trying to cursor-up into our node.
 *
 * On failure EDEADLK (or some other error) is returned.  If a deadlock
 * error is returned the cursor is adjusted to block on termination.
 *
 * The caller is responsible for managing parent->node, the root's node
 * is usually aliased from a cursor.
 */
int
hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
                           hammer_node_lock_t parent,
                           hammer_node_lock_t lcache)
{
        hammer_node_t node;
        hammer_node_lock_t item;
        hammer_node_ondisk_t ondisk;
        hammer_btree_elm_t elm;
        hammer_node_t child;
        struct hammer_mount *hmp;
        int error;
        int i;

        node = parent->node;
        ondisk = node->ondisk;
        error = 0;
        hmp = cursor->trans->hmp;

        /*
         * We really do not want to block on I/O with exclusive locks held,
         * pre-get the children before trying to lock the mess.  This is
         * only done one-level deep for now.
         */
        for (i = 0; i < ondisk->count; ++i) {
                ++hammer_stats_btree_elements;
                elm = &ondisk->elms[i];
                if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF &&
                    elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
                        continue;
                }
                child = hammer_get_node(cursor->trans,
                                        elm->internal.subtree_offset,
                                        0, &error);
                if (child)
                        hammer_rel_node(child);
        }

        /*
         * Do it for real
         */
        for (i = 0; error == 0 && i < ondisk->count; ++i) {
                ++hammer_stats_btree_elements;
                elm = &ondisk->elms[i];

                switch(elm->base.btype) {
                case HAMMER_BTREE_TYPE_INTERNAL:
                case HAMMER_BTREE_TYPE_LEAF:
                        KKASSERT(elm->internal.subtree_offset != 0);
                        child = hammer_get_node(cursor->trans,
                                                elm->internal.subtree_offset,
                                                0, &error);
                        break;
                default:
                        child = NULL;
                        break;
                }
                if (child) {
                        if (hammer_lock_ex_try(&child->lock) != 0) {
                                /*
                                 * Could not get the lock; record the child
                                 * as the deadlock node so the cursor can
                                 * block on it on termination.
                                 */
                                if (cursor->deadlk_node == NULL) {
                                        cursor->deadlk_node = child;
                                        hammer_ref_node(cursor->deadlk_node);
                                }
                                error = EDEADLK;
                                hammer_rel_node(child);
                        } else {
                                /*
                                 * Track the lock: pull an item from the
                                 * lcache if one was supplied, else allocate.
                                 */
                                if (lcache) {
                                        item = TAILQ_FIRST(&lcache->list);
                                        KKASSERT(item != NULL);
                                        item->flags |= HAMMER_NODE_LOCK_LCACHE;
                                        TAILQ_REMOVE(&lcache->list,
                                                     item, entry);
                                } else {
                                        item = kmalloc(sizeof(*item),
                                                       hmp->m_misc,
                                                       M_WAITOK|M_ZERO);
                                        TAILQ_INIT(&item->list);
                                }

                                TAILQ_INSERT_TAIL(&parent->list, item, entry);
                                item->parent = parent;
                                item->node = child;
                                item->index = i;
                                item->count = child->ondisk->count;

                                /*
                                 * Recurse (used by the rebalancing code)
                                 */
                                if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) {
                                        error = hammer_btree_lock_children(
                                                        cursor,
                                                        depth - 1,
                                                        item,
                                                        lcache);
                                }
                        }
                }
        }
        /* on any failure back out every lock we managed to acquire */
        if (error)
                hammer_btree_unlock_children(hmp, parent, lcache);
        return(error);
}

/*
 * Create an in-memory copy of all B-Tree nodes listed, recursively,
 * including the parent.
 */
void
hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
{
        hammer_mount_t hmp = cursor->trans->hmp;
        hammer_node_lock_t item;

        if (parent->copy == NULL) {
                /* lcache items always arrive with a preallocated copy */
                KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0);
                parent->copy = kmalloc(sizeof(*parent->copy),
                                       hmp->m_misc, M_WAITOK);
        }
        KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0);
        *parent->copy = *parent->node->ondisk;
        TAILQ_FOREACH(item, &parent->list, entry) {
                hammer_btree_lock_copy(cursor, item);
        }
}

/*
 * Recursively sync modified copies to the media.
 */
int
hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
{
        hammer_node_lock_t item;
        int count = 0;  /* number of nodes written back */

        if (parent->flags & HAMMER_NODE_LOCK_UPDATED) {
                ++count;
                hammer_modify_node_all(cursor->trans, parent->node);
                *parent->node->ondisk = *parent->copy;
                hammer_modify_node_done(parent->node);
                if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) {
                        hammer_flush_node(parent->node, 0);
                        hammer_delete_node(cursor->trans, parent->node);
                }
        }
        TAILQ_FOREACH(item, &parent->list, entry) {
                count += hammer_btree_sync_copy(cursor, item);
        }
        return(count);
}

/*
 * Release previously obtained node locks.  The caller is responsible for
 * cleaning up parent->node itself (its usually just aliased from a cursor),
 * but this function will take care of the copies.
 *
 * NOTE: The root node is not placed in the lcache and node->copy is not
 *       deallocated when lcache != NULL.
 */
void
hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
                             hammer_node_lock_t lcache)
{
        hammer_node_lock_t item;
        hammer_node_ondisk_t copy;

        while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
                TAILQ_REMOVE(&parent->list, item, entry);
                hammer_btree_unlock_children(hmp, item, lcache);
                hammer_unlock(&item->node->lock);
                hammer_rel_node(item->node);
                if (lcache) {
                        /*
                         * NOTE: When placing the item back in the lcache
                         *       the flag is cleared by the bzero().
                         *       Remaining fields are cleared as a safety
                         *       measure.  The copy buffer is kept attached
                         *       (and zeroed) for reuse.
                         */
                        KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
                        KKASSERT(TAILQ_EMPTY(&item->list));
                        copy = item->copy;
                        bzero(item, sizeof(*item));
                        TAILQ_INIT(&item->list);
                        item->copy = copy;
                        if (copy)
                                bzero(copy, sizeof(*copy));
                        TAILQ_INSERT_TAIL(&lcache->list, item, entry);
                } else {
                        kfree(item, hmp->m_misc);
                }
        }
        if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
                kfree(parent->copy, hmp->m_misc);
                parent->copy = NULL;    /* safety */
        }
}

/************************************************************************
 *                         MISCELLANIOUS SUPPORT                        *
 ************************************************************************/

/*
 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
 *
 * The magnitude of the return value indicates which field differed
 * (5=localization, 4=obj_id, 3=rec_type, 2=key, 1=create_tid).
 *
 * Note that for this particular function a return value of -1, 0, or +1
 * can denote a match if create_tid is otherwise discounted.  A create_tid
 * of zero is considered to be 'infinity' in comparisons.
 *
 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
 */
int
hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
{
        if (key1->localization < key2->localization)
                return(-5);
        if (key1->localization > key2->localization)
                return(5);

        if (key1->obj_id < key2->obj_id)
                return(-4);
        if (key1->obj_id > key2->obj_id)
                return(4);

        if (key1->rec_type < key2->rec_type)
                return(-3);
        if (key1->rec_type > key2->rec_type)
                return(3);

        if (key1->key < key2->key)
                return(-2);
        if (key1->key > key2->key)
                return(2);

        /*
         * A create_tid of zero indicates a record which is undeletable
         * and must be considered to have a value of positive infinity.
         */
        if (key1->create_tid == 0) {
                if (key2->create_tid == 0)
                        return(0);
                return(1);
        }
        if (key2->create_tid == 0)
                return(-1);
        if (key1->create_tid < key2->create_tid)
                return(-1);
        if (key1->create_tid > key2->create_tid)
                return(1);
        return(0);
}

/*
 * Test a timestamp against an element to determine whether the
 * element is visible.  A timestamp of 0 means 'infinity'.
 *
 * Returns 0 if visible, -1 if the element was created after asof,
 * and +1 if the element was deleted at or before asof.
 */
int
hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
{
        if (asof == 0) {
                if (base->delete_tid)
                        return(1);
                return(0);
        }
        if (asof < base->create_tid)
                return(-1);
        if (base->delete_tid && asof >= base->delete_tid)
                return(1);
        return(0);
}

/*
 * Create a separator half way inbetween key1 and key2.  For fields just
 * one unit apart, the separator will match key2.  key1 is on the left-hand
 * side and key2 is on the right-hand side.
 *
 * key2 must be >= the separator.  It is ok for the separator to match key2.
 *
 * NOTE: Even if key1 does not match key2, the separator may wind up matching
 *       key2.
 *
 * NOTE: It might be beneficial to just scrap this whole mess and just
 *       set the separator to key2.
2998 */ 2999 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 3000 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 3001 3002 static void 3003 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 3004 hammer_base_elm_t dest) 3005 { 3006 bzero(dest, sizeof(*dest)); 3007 3008 dest->rec_type = key2->rec_type; 3009 dest->key = key2->key; 3010 dest->obj_id = key2->obj_id; 3011 dest->create_tid = key2->create_tid; 3012 3013 MAKE_SEPARATOR(key1, key2, dest, localization); 3014 if (key1->localization == key2->localization) { 3015 MAKE_SEPARATOR(key1, key2, dest, obj_id); 3016 if (key1->obj_id == key2->obj_id) { 3017 MAKE_SEPARATOR(key1, key2, dest, rec_type); 3018 if (key1->rec_type == key2->rec_type) { 3019 MAKE_SEPARATOR(key1, key2, dest, key); 3020 /* 3021 * Don't bother creating a separator for 3022 * create_tid, which also conveniently avoids 3023 * having to handle the create_tid == 0 3024 * (infinity) case. Just leave create_tid 3025 * set to key2. 3026 * 3027 * Worst case, dest matches key2 exactly, 3028 * which is acceptable. 
3029 */ 3030 } 3031 } 3032 } 3033 } 3034 3035 #undef MAKE_SEPARATOR 3036 3037 /* 3038 * Return whether a generic internal or leaf node is full 3039 */ 3040 static __inline 3041 int 3042 btree_node_is_full(hammer_node_ondisk_t node) 3043 { 3044 return(btree_max_elements(node->type) == node->count); 3045 } 3046 3047 static __inline 3048 int 3049 btree_max_elements(u_int8_t type) 3050 { 3051 if (type == HAMMER_BTREE_TYPE_LEAF) 3052 return(HAMMER_BTREE_LEAF_ELMS); 3053 if (type == HAMMER_BTREE_TYPE_INTERNAL) 3054 return(HAMMER_BTREE_INT_ELMS); 3055 panic("btree_max_elements: bad type %d", type); 3056 } 3057 3058 void 3059 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3060 { 3061 hammer_btree_elm_t elm; 3062 int i; 3063 3064 kprintf("node %p count=%d parent=%016llx type=%c\n", 3065 ondisk, ondisk->count, 3066 (long long)ondisk->parent, ondisk->type); 3067 3068 /* 3069 * Dump both boundary elements if an internal node 3070 */ 3071 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3072 for (i = 0; i <= ondisk->count; ++i) { 3073 elm = &ondisk->elms[i]; 3074 hammer_print_btree_elm(elm, ondisk->type, i); 3075 } 3076 } else { 3077 for (i = 0; i < ondisk->count; ++i) { 3078 elm = &ondisk->elms[i]; 3079 hammer_print_btree_elm(elm, ondisk->type, i); 3080 } 3081 } 3082 } 3083 3084 void 3085 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3086 { 3087 kprintf(" %2d", i); 3088 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3089 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3090 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3091 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3092 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3093 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3094 kprintf("\tbtype = %02x (%c)\n", 3095 elm->base.btype, 3096 (elm->base.btype ? 
elm->base.btype : '?')); 3097 kprintf("\tlocalization = %02x\n", elm->base.localization); 3098 3099 switch(type) { 3100 case HAMMER_BTREE_TYPE_INTERNAL: 3101 kprintf("\tsubtree_off = %016llx\n", 3102 (long long)elm->internal.subtree_offset); 3103 break; 3104 case HAMMER_BTREE_TYPE_RECORD: 3105 kprintf("\tdata_offset = %016llx\n", 3106 (long long)elm->leaf.data_offset); 3107 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3108 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3109 break; 3110 } 3111 } 3112