1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 40 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 41 * of the tree), but adds two additional boundary elements which describe 42 * the left-most and right-most element a node is able to represent. In 43 * otherwords, we have boundary elements at the two ends of a B-Tree node 44 * with no valid sub-tree pointer for the right-most element. 45 * 46 * A B-Tree internal node looks like this: 47 * 48 * B N N N N N N B <-- boundary and internal elements 49 * S S S S S S S <-- subtree pointers 50 * 51 * A B-Tree leaf node basically looks like this: 52 * 53 * L L L L L L L L <-- leaf elemenets 54 * 55 * The radix for an internal node is 1 less then a leaf but we get a 56 * number of significant benefits for our troubles. 57 * The left-hand boundary (B in the left) is integrated into the first 58 * element so it doesn't require 2 elements to accomodate boundaries. 59 * 60 * The big benefit to using a B-Tree containing boundary information 61 * is that it is possible to cache pointers into the middle of the tree 62 * and not have to start searches, insertions, OR deletions at the root 63 * node. In particular, searches are able to progress in a definitive 64 * direction from any point in the tree without revisting nodes. This 65 * greatly improves the efficiency of many operations, most especially 66 * record appends. 67 * 68 * B-Trees also make the stacking of trees fairly straightforward. 69 * 70 * INSERTIONS: A search performed with the intention of doing 71 * an insert will guarantee that the terminal leaf node is not full by 72 * splitting full nodes. Splits occur top-down during the dive down the 73 * B-Tree. 74 * 75 * DELETIONS: A deletion makes no attempt to proactively balance the 76 * tree and will recursively remove nodes that become empty. If a 77 * deadlock occurs a deletion may not be able to remove an empty leaf. 78 * Deletions never allow internal nodes to become empty (that would blow 79 * up the boundaries). 80 */ 81 #include "hammer.h" 82 83 static int btree_search(hammer_cursor_t cursor, int flags); 84 static int btree_split_internal(hammer_cursor_t cursor); 85 static int btree_split_leaf(hammer_cursor_t cursor); 86 static int btree_remove(hammer_cursor_t cursor, int *ndelete); 87 static __inline int btree_node_is_full(hammer_node_ondisk_t node); 88 static __inline int btree_max_elements(u_int8_t type); 89 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 90 hammer_tid_t mirror_tid); 91 static void hammer_make_separator(hammer_base_elm_t key1, 92 hammer_base_elm_t key2, hammer_base_elm_t dest); 93 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 94 95 /* 96 * Iterate records after a search. The cursor is iterated forwards past 97 * the current record until a record matching the key-range requirements 98 * is found. ENOENT is returned if the iteration goes past the ending 99 * key. 100 * 101 * The iteration is inclusive of key_beg and can be inclusive or exclusive 102 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 103 * 104 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 105 * may be modified by B-Tree functions. 106 * 107 * cursor->key_beg may or may not be modified by this function during 108 * the iteration. XXX future - in case of an inverted lock we may have 109 * to reinitiate the lookup and set key_beg to properly pick up where we 110 * left off. 111 * 112 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor 113 * was reverse indexed due to being moved to a parent while unlocked, 114 * and something else might have inserted an element outside the iteration 115 * range. When this case occurs the iterator just keeps iterating until 116 * it gets back into the iteration range (instead of asserting). 117 * 118 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 119 */ 120 int 121 hammer_btree_iterate(hammer_cursor_t cursor) 122 { 123 hammer_node_ondisk_t node; 124 hammer_btree_elm_t elm; 125 hammer_mount_t hmp; 126 int error = 0; 127 int r; 128 int s; 129 130 /* 131 * Skip past the current record 132 */ 133 hmp = cursor->trans->hmp; 134 node = cursor->node->ondisk; 135 if (node == NULL) 136 return(ENOENT); 137 if (cursor->index < node->count && 138 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 139 ++cursor->index; 140 } 141 142 /* 143 * HAMMER can wind up being cpu-bound. 144 */ 145 if (++hmp->check_yield > hammer_yield_check) { 146 hmp->check_yield = 0; 147 lwkt_user_yield(); 148 } 149 150 151 /* 152 * Loop until an element is found or we are done. 153 */ 154 for (;;) { 155 /* 156 * We iterate up the tree and then index over one element 157 * while we are at the last element in the current node. 158 * 159 * If we are at the root of the filesystem, cursor_up 160 * returns ENOENT. 161 * 162 * XXX this could be optimized by storing the information in 163 * the parent reference. 164 * 165 * XXX we can lose the node lock temporarily, this could mess 166 * up our scan. 167 */ 168 ++hammer_stats_btree_iterations; 169 hammer_flusher_clean_loose_ios(hmp); 170 171 if (cursor->index == node->count) { 172 if (hammer_debug_btree) { 173 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n", 174 (long long)cursor->node->node_offset, 175 cursor->index, 176 (long long)(cursor->parent ? cursor->parent->node_offset : -1), 177 cursor->parent_index, 178 curthread); 179 } 180 KKASSERT(cursor->parent == NULL || 181 cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 182 error = hammer_cursor_up(cursor); 183 if (error) 184 break; 185 /* reload stale pointer */ 186 node = cursor->node->ondisk; 187 KKASSERT(cursor->index != node->count); 188 189 /* 190 * If we are reblocking we want to return internal 191 * nodes. Note that the internal node will be 192 * returned multiple times, on each upward recursion 193 * from its children. The caller selects which 194 * revisit it cares about (usually first or last only). 195 */ 196 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 197 cursor->flags |= HAMMER_CURSOR_ATEDISK; 198 return(0); 199 } 200 ++cursor->index; 201 continue; 202 } 203 204 /* 205 * Check internal or leaf element. Determine if the record 206 * at the cursor has gone beyond the end of our range. 207 * 208 * We recurse down through internal nodes. 209 */ 210 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 211 elm = &node->elms[cursor->index]; 212 213 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 214 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 215 if (hammer_debug_btree) { 216 kprintf("BRACKETL %016llx[%d] %016llx %02x " 217 "key=%016llx lo=%02x %d (td=%p)\n", 218 (long long)cursor->node->node_offset, 219 cursor->index, 220 (long long)elm[0].internal.base.obj_id, 221 elm[0].internal.base.rec_type, 222 (long long)elm[0].internal.base.key, 223 elm[0].internal.base.localization, 224 r, 225 curthread 226 ); 227 kprintf("BRACKETR %016llx[%d] %016llx %02x " 228 "key=%016llx lo=%02x %d\n", 229 (long long)cursor->node->node_offset, 230 cursor->index + 1, 231 (long long)elm[1].internal.base.obj_id, 232 elm[1].internal.base.rec_type, 233 (long long)elm[1].internal.base.key, 234 elm[1].internal.base.localization, 235 s 236 ); 237 } 238 239 if (r < 0) { 240 error = ENOENT; 241 break; 242 } 243 if (r == 0 && (cursor->flags & 244 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 245 error = ENOENT; 246 break; 247 } 248 249 /* 250 * Better not be zero 251 */ 252 KKASSERT(elm->internal.subtree_offset != 0); 253 254 if (s <= 0) { 255 /* 256 * If running the mirror filter see if we 257 * can skip one or more entire sub-trees. 258 * If we can we return the internal node 259 * and the caller processes the skipped 260 * range (see mirror_read). 261 */ 262 if (cursor->flags & 263 HAMMER_CURSOR_MIRROR_FILTERED) { 264 if (elm->internal.mirror_tid < 265 cursor->cmirror->mirror_tid) { 266 hammer_cursor_mirror_filter(cursor); 267 return(0); 268 } 269 } 270 } else { 271 /* 272 * Normally it would be impossible for the 273 * cursor to have gotten back-indexed, 274 * but it can happen if a node is deleted 275 * and the cursor is moved to its parent 276 * internal node. ITERATE_CHECK will be set. 277 */ 278 KKASSERT(cursor->flags & 279 HAMMER_CURSOR_ITERATE_CHECK); 280 kprintf("hammer_btree_iterate: " 281 "DEBUG: Caught parent seek " 282 "in internal iteration\n"); 283 } 284 285 error = hammer_cursor_down(cursor); 286 if (error) 287 break; 288 KKASSERT(cursor->index == 0); 289 /* reload stale pointer */ 290 node = cursor->node->ondisk; 291 continue; 292 } else { 293 elm = &node->elms[cursor->index]; 294 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 295 if (hammer_debug_btree) { 296 kprintf("ELEMENT %016llx:%d %c %016llx %02x " 297 "key=%016llx lo=%02x %d\n", 298 (long long)cursor->node->node_offset, 299 cursor->index, 300 (elm[0].leaf.base.btype ? 301 elm[0].leaf.base.btype : '?'), 302 (long long)elm[0].leaf.base.obj_id, 303 elm[0].leaf.base.rec_type, 304 (long long)elm[0].leaf.base.key, 305 elm[0].leaf.base.localization, 306 r 307 ); 308 } 309 if (r < 0) { 310 error = ENOENT; 311 break; 312 } 313 314 /* 315 * We support both end-inclusive and 316 * end-exclusive searches. 317 */ 318 if (r == 0 && 319 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 320 error = ENOENT; 321 break; 322 } 323 324 /* 325 * If ITERATE_CHECK is set an unlocked cursor may 326 * have been moved to a parent and the iterate can 327 * happen upon elements that are not in the requested 328 * range. 329 */ 330 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) { 331 s = hammer_btree_cmp(&cursor->key_beg, 332 &elm->base); 333 if (s > 0) { 334 kprintf("hammer_btree_iterate: " 335 "DEBUG: Caught parent seek " 336 "in leaf iteration\n"); 337 ++cursor->index; 338 continue; 339 } 340 } 341 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 342 343 /* 344 * Return the element 345 */ 346 switch(elm->leaf.base.btype) { 347 case HAMMER_BTREE_TYPE_RECORD: 348 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 349 hammer_btree_chkts(cursor->asof, &elm->base)) { 350 ++cursor->index; 351 continue; 352 } 353 error = 0; 354 break; 355 default: 356 error = EINVAL; 357 break; 358 } 359 if (error) 360 break; 361 } 362 363 /* 364 * Return entry 365 */ 366 if (hammer_debug_btree) { 367 int i = cursor->index; 368 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 369 kprintf("ITERATE %p:%d %016llx %02x " 370 "key=%016llx lo=%02x\n", 371 cursor->node, i, 372 (long long)elm->internal.base.obj_id, 373 elm->internal.base.rec_type, 374 (long long)elm->internal.base.key, 375 elm->internal.base.localization 376 ); 377 } 378 return(0); 379 } 380 return(error); 381 } 382 383 /* 384 * We hit an internal element that we could skip as part of a mirroring 385 * scan. Calculate the entire range being skipped. 386 * 387 * It is important to include any gaps between the parent's left_bound 388 * and the node's left_bound, and same goes for the right side. 389 */ 390 static void 391 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 392 { 393 struct hammer_cmirror *cmirror; 394 hammer_node_ondisk_t ondisk; 395 hammer_btree_elm_t elm; 396 397 ondisk = cursor->node->ondisk; 398 cmirror = cursor->cmirror; 399 400 /* 401 * Calculate the skipped range 402 */ 403 elm = &ondisk->elms[cursor->index]; 404 if (cursor->index == 0) 405 cmirror->skip_beg = *cursor->left_bound; 406 else 407 cmirror->skip_beg = elm->internal.base; 408 while (cursor->index < ondisk->count) { 409 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 410 break; 411 ++cursor->index; 412 ++elm; 413 } 414 if (cursor->index == ondisk->count) 415 cmirror->skip_end = *cursor->right_bound; 416 else 417 cmirror->skip_end = elm->internal.base; 418 419 /* 420 * clip the returned result. 421 */ 422 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 423 cmirror->skip_beg = cursor->key_beg; 424 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 425 cmirror->skip_end = cursor->key_end; 426 } 427 428 /* 429 * Iterate in the reverse direction. This is used by the pruning code to 430 * avoid overlapping records. 431 */ 432 int 433 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 434 { 435 hammer_node_ondisk_t node; 436 hammer_btree_elm_t elm; 437 hammer_mount_t hmp; 438 int error = 0; 439 int r; 440 int s; 441 442 /* mirror filtering not supported for reverse iteration */ 443 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 444 445 /* 446 * Skip past the current record. For various reasons the cursor 447 * may end up set to -1 or set to point at the end of the current 448 * node. These cases must be addressed. 449 */ 450 node = cursor->node->ondisk; 451 if (node == NULL) 452 return(ENOENT); 453 if (cursor->index != -1 && 454 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 455 --cursor->index; 456 } 457 if (cursor->index == cursor->node->ondisk->count) 458 --cursor->index; 459 460 /* 461 * HAMMER can wind up being cpu-bound. 462 */ 463 hmp = cursor->trans->hmp; 464 if (++hmp->check_yield > hammer_yield_check) { 465 hmp->check_yield = 0; 466 lwkt_user_yield(); 467 } 468 469 /* 470 * Loop until an element is found or we are done. 471 */ 472 for (;;) { 473 ++hammer_stats_btree_iterations; 474 hammer_flusher_clean_loose_ios(hmp); 475 476 /* 477 * We iterate up the tree and then index over one element 478 * while we are at the last element in the current node. 479 */ 480 if (cursor->index == -1) { 481 error = hammer_cursor_up(cursor); 482 if (error) { 483 cursor->index = 0; /* sanity */ 484 break; 485 } 486 /* reload stale pointer */ 487 node = cursor->node->ondisk; 488 KKASSERT(cursor->index != node->count); 489 --cursor->index; 490 continue; 491 } 492 493 /* 494 * Check internal or leaf element. Determine if the record 495 * at the cursor has gone beyond the end of our range. 496 * 497 * We recurse down through internal nodes. 498 */ 499 KKASSERT(cursor->index != node->count); 500 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 501 elm = &node->elms[cursor->index]; 502 503 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 504 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 505 if (hammer_debug_btree) { 506 kprintf("BRACKETL %016llx[%d] %016llx %02x " 507 "key=%016llx lo=%02x %d (td=%p)\n", 508 (long long)cursor->node->node_offset, 509 cursor->index, 510 (long long)elm[0].internal.base.obj_id, 511 elm[0].internal.base.rec_type, 512 (long long)elm[0].internal.base.key, 513 elm[0].internal.base.localization, 514 r, 515 curthread 516 ); 517 kprintf("BRACKETR %016llx[%d] %016llx %02x " 518 "key=%016llx lo=%02x %d\n", 519 (long long)cursor->node->node_offset, 520 cursor->index + 1, 521 (long long)elm[1].internal.base.obj_id, 522 elm[1].internal.base.rec_type, 523 (long long)elm[1].internal.base.key, 524 elm[1].internal.base.localization, 525 s 526 ); 527 } 528 529 if (s >= 0) { 530 error = ENOENT; 531 break; 532 } 533 534 /* 535 * It shouldn't be possible to be seeked past key_end, 536 * even if the cursor got moved to a parent. 537 */ 538 KKASSERT(r >= 0); 539 540 /* 541 * Better not be zero 542 */ 543 KKASSERT(elm->internal.subtree_offset != 0); 544 545 error = hammer_cursor_down(cursor); 546 if (error) 547 break; 548 KKASSERT(cursor->index == 0); 549 /* reload stale pointer */ 550 node = cursor->node->ondisk; 551 552 /* this can assign -1 if the leaf was empty */ 553 cursor->index = node->count - 1; 554 continue; 555 } else { 556 elm = &node->elms[cursor->index]; 557 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 558 if (hammer_debug_btree) { 559 kprintf("ELEMENTR %016llx:%d %c %016llx %02x " 560 "key=%016llx lo=%02x %d\n", 561 (long long)cursor->node->node_offset, 562 cursor->index, 563 (elm[0].leaf.base.btype ? 564 elm[0].leaf.base.btype : '?'), 565 (long long)elm[0].leaf.base.obj_id, 566 elm[0].leaf.base.rec_type, 567 (long long)elm[0].leaf.base.key, 568 elm[0].leaf.base.localization, 569 s 570 ); 571 } 572 if (s > 0) { 573 error = ENOENT; 574 break; 575 } 576 577 /* 578 * It shouldn't be possible to be seeked past key_end, 579 * even if the cursor got moved to a parent. 580 */ 581 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 582 583 /* 584 * Return the element 585 */ 586 switch(elm->leaf.base.btype) { 587 case HAMMER_BTREE_TYPE_RECORD: 588 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 589 hammer_btree_chkts(cursor->asof, &elm->base)) { 590 --cursor->index; 591 continue; 592 } 593 error = 0; 594 break; 595 default: 596 error = EINVAL; 597 break; 598 } 599 if (error) 600 break; 601 } 602 603 /* 604 * Return entry 605 */ 606 if (hammer_debug_btree) { 607 int i = cursor->index; 608 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 609 kprintf("ITERATER %p:%d %016llx %02x " 610 "key=%016llx lo=%02x\n", 611 cursor->node, i, 612 (long long)elm->internal.base.obj_id, 613 elm->internal.base.rec_type, 614 (long long)elm->internal.base.key, 615 elm->internal.base.localization 616 ); 617 } 618 return(0); 619 } 620 return(error); 621 } 622 623 /* 624 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 625 * could not be found, EDEADLK if inserting and a retry is needed, and a 626 * fatal error otherwise. When retrying, the caller must terminate the 627 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 628 * 629 * The cursor is suitably positioned for a deletion on success, and suitably 630 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 631 * specified. 632 * 633 * The cursor may begin anywhere, the search will traverse the tree in 634 * either direction to locate the requested element. 635 * 636 * Most of the logic implementing historical searches is handled here. We 637 * do an initial lookup with create_tid set to the asof TID. Due to the 638 * way records are laid out, a backwards iteration may be required if 639 * ENOENT is returned to locate the historical record. Here's the 640 * problem: 641 * 642 * create_tid: 10 15 20 643 * LEAF1 LEAF2 644 * records: (11) (18) 645 * 646 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 647 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 648 * not visible and thus causes ENOENT to be returned. We really need 649 * to check record 11 in LEAF1. If it also fails then the search fails 650 * (e.g. it might represent the range 11-16 and thus still not match our 651 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 652 * further iterations. 653 * 654 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 655 * and the cursor->create_check TID if an iteration might be needed. 656 * In the above example create_check would be set to 14. 657 */ 658 int 659 hammer_btree_lookup(hammer_cursor_t cursor) 660 { 661 int error; 662 663 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 664 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 665 cursor->trans->sync_lock_refs > 0); 666 ++hammer_stats_btree_lookups; 667 if (cursor->flags & HAMMER_CURSOR_ASOF) { 668 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 669 cursor->key_beg.create_tid = cursor->asof; 670 for (;;) { 671 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 672 error = btree_search(cursor, 0); 673 if (error != ENOENT || 674 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 675 /* 676 * Stop if no error. 677 * Stop if error other then ENOENT. 678 * Stop if ENOENT and not special case. 679 */ 680 break; 681 } 682 if (hammer_debug_btree) { 683 kprintf("CREATE_CHECK %016llx\n", 684 (long long)cursor->create_check); 685 } 686 cursor->key_beg.create_tid = cursor->create_check; 687 /* loop */ 688 } 689 } else { 690 error = btree_search(cursor, 0); 691 } 692 if (error == 0) 693 error = hammer_btree_extract(cursor, cursor->flags); 694 return(error); 695 } 696 697 /* 698 * Execute the logic required to start an iteration. The first record 699 * located within the specified range is returned and iteration control 700 * flags are adjusted for successive hammer_btree_iterate() calls. 701 * 702 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 703 * in a loop without worrying about it. Higher-level merged searches will 704 * adjust the flag appropriately. 705 */ 706 int 707 hammer_btree_first(hammer_cursor_t cursor) 708 { 709 int error; 710 711 error = hammer_btree_lookup(cursor); 712 if (error == ENOENT) { 713 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 714 error = hammer_btree_iterate(cursor); 715 } 716 cursor->flags |= HAMMER_CURSOR_ATEDISK; 717 return(error); 718 } 719 720 /* 721 * Similarly but for an iteration in the reverse direction. 722 * 723 * Set ATEDISK when iterating backwards to skip the current entry, 724 * which after an ENOENT lookup will be pointing beyond our end point. 725 * 726 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 727 * in a loop without worrying about it. Higher-level merged searches will 728 * adjust the flag appropriately. 729 */ 730 int 731 hammer_btree_last(hammer_cursor_t cursor) 732 { 733 struct hammer_base_elm save; 734 int error; 735 736 save = cursor->key_beg; 737 cursor->key_beg = cursor->key_end; 738 error = hammer_btree_lookup(cursor); 739 cursor->key_beg = save; 740 if (error == ENOENT || 741 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 742 cursor->flags |= HAMMER_CURSOR_ATEDISK; 743 error = hammer_btree_iterate_reverse(cursor); 744 } 745 cursor->flags |= HAMMER_CURSOR_ATEDISK; 746 return(error); 747 } 748 749 /* 750 * Extract the record and/or data associated with the cursor's current 751 * position. Any prior record or data stored in the cursor is replaced. 752 * 753 * NOTE: All extractions occur at the leaf of the B-Tree. 754 */ 755 int 756 hammer_btree_extract(hammer_cursor_t cursor, int flags) 757 { 758 hammer_node_ondisk_t node; 759 hammer_btree_elm_t elm; 760 hammer_off_t data_off; 761 hammer_mount_t hmp; 762 int32_t data_len; 763 int error; 764 765 /* 766 * Certain types of corruption can result in a NULL node pointer. 767 */ 768 if (cursor->node == NULL) { 769 kprintf("hammer: NULL cursor->node, filesystem might " 770 "have gotten corrupted\n"); 771 return (EINVAL); 772 } 773 774 /* 775 * The case where the data reference resolves to the same buffer 776 * as the record reference must be handled. 777 */ 778 node = cursor->node->ondisk; 779 elm = &node->elms[cursor->index]; 780 cursor->data = NULL; 781 hmp = cursor->node->hmp; 782 783 /* 784 * There is nothing to extract for an internal element. 785 */ 786 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 787 return(EINVAL); 788 789 /* 790 * Only record types have data. 791 */ 792 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 793 cursor->leaf = &elm->leaf; 794 795 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 796 return(0); 797 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 798 return(0); 799 data_off = elm->leaf.data_offset; 800 data_len = elm->leaf.data_len; 801 if (data_off == 0) 802 return(0); 803 804 /* 805 * Load the data 806 */ 807 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 808 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 809 &error, &cursor->data_buffer); 810 811 /* 812 * Mark the data buffer as not being meta-data if it isn't 813 * meta-data (sometimes bulk data is accessed via a volume 814 * block device). 815 */ 816 if (error == 0) { 817 switch(elm->leaf.base.rec_type) { 818 case HAMMER_RECTYPE_DATA: 819 case HAMMER_RECTYPE_DB: 820 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0) 821 break; 822 if (hammer_double_buffer == 0 || 823 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) { 824 hammer_io_notmeta(cursor->data_buffer); 825 } 826 break; 827 default: 828 break; 829 } 830 } 831 832 /* 833 * Deal with CRC errors on the extracted data. 834 */ 835 if (error == 0 && 836 hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) { 837 kprintf("CRC DATA @ %016llx/%d FAILED\n", 838 (long long)elm->leaf.data_offset, elm->leaf.data_len); 839 if (hammer_debug_critical) 840 Debugger("CRC FAILED: DATA"); 841 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 842 error = EDOM; /* less critical (mirroring) */ 843 else 844 error = EIO; /* critical */ 845 } 846 return(error); 847 } 848 849 850 /* 851 * Insert a leaf element into the B-Tree at the current cursor position. 852 * The cursor is positioned such that the element at and beyond the cursor 853 * are shifted to make room for the new record. 854 * 855 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 856 * flag set and that call must return ENOENT before this function can be 857 * called. ENOSPC is returned if there is no room to insert a new record. 858 * 859 * The caller may depend on the cursor's exclusive lock after return to 860 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 861 */ 862 int 863 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 864 int *doprop) 865 { 866 hammer_node_ondisk_t node; 867 int i; 868 int error; 869 870 *doprop = 0; 871 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 872 return(error); 873 ++hammer_stats_btree_inserts; 874 875 /* 876 * Insert the element at the leaf node and update the count in the 877 * parent. It is possible for parent to be NULL, indicating that 878 * the filesystem's ROOT B-Tree node is a leaf itself, which is 879 * possible. The root inode can never be deleted so the leaf should 880 * never be empty. 881 * 882 * Remember that leaf nodes do not have boundaries. 883 */ 884 hammer_modify_node_all(cursor->trans, cursor->node); 885 node = cursor->node->ondisk; 886 i = cursor->index; 887 KKASSERT(elm->base.btype != 0); 888 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 889 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 890 if (i != node->count) { 891 bcopy(&node->elms[i], &node->elms[i+1], 892 (node->count - i) * sizeof(*elm)); 893 } 894 node->elms[i].leaf = *elm; 895 ++node->count; 896 hammer_cursor_inserted_element(cursor->node, i); 897 898 /* 899 * Update the leaf node's aggregate mirror_tid for mirroring 900 * support. 901 */ 902 if (node->mirror_tid < elm->base.delete_tid) { 903 node->mirror_tid = elm->base.delete_tid; 904 *doprop = 1; 905 } 906 if (node->mirror_tid < elm->base.create_tid) { 907 node->mirror_tid = elm->base.create_tid; 908 *doprop = 1; 909 } 910 hammer_modify_node_done(cursor->node); 911 912 /* 913 * Debugging sanity checks. 914 */ 915 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 916 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 917 if (i) { 918 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 919 } 920 if (i != node->count - 1) 921 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 922 923 return(0); 924 } 925 926 /* 927 * Delete a record from the B-Tree at the current cursor position. 928 * The cursor is positioned such that the current element is the one 929 * to be deleted. 930 * 931 * On return the cursor will be positioned after the deleted element and 932 * MAY point to an internal node. It will be suitable for the continuation 933 * of an iteration but not for an insertion or deletion. 934 * 935 * Deletions will attempt to partially rebalance the B-Tree in an upward 936 * direction, but will terminate rather then deadlock. Empty internal nodes 937 * are never allowed by a deletion which deadlocks may end up giving us an 938 * empty leaf. The pruner will clean up and rebalance the tree. 939 * 940 * This function can return EDEADLK, requiring the caller to retry the 941 * operation after clearing the deadlock. 942 * 943 * This function will store the number of deleted btree nodes in *ndelete 944 * if ndelete is not NULL. 945 */ 946 int 947 hammer_btree_delete(hammer_cursor_t cursor, int *ndelete) 948 { 949 hammer_node_ondisk_t ondisk; 950 hammer_node_t node; 951 hammer_node_t parent __debugvar; 952 int error; 953 int i; 954 955 KKASSERT (cursor->trans->sync_lock_refs > 0); 956 if (ndelete) 957 *ndelete = 0; 958 if ((error = hammer_cursor_upgrade(cursor)) != 0) 959 return(error); 960 ++hammer_stats_btree_deletes; 961 962 /* 963 * Delete the element from the leaf node. 964 * 965 * Remember that leaf nodes do not have boundaries. 966 */ 967 node = cursor->node; 968 ondisk = node->ondisk; 969 i = cursor->index; 970 971 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 972 KKASSERT(i >= 0 && i < ondisk->count); 973 hammer_modify_node_all(cursor->trans, node); 974 if (i + 1 != ondisk->count) { 975 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 976 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 977 } 978 --ondisk->count; 979 hammer_modify_node_done(node); 980 hammer_cursor_deleted_element(node, i); 981 982 /* 983 * Validate local parent 984 */ 985 if (ondisk->parent) { 986 parent = cursor->parent; 987 988 KKASSERT(parent != NULL); 989 KKASSERT(parent->node_offset == ondisk->parent); 990 } 991 992 /* 993 * If the leaf becomes empty it must be detached from the parent, 994 * potentially recursing through to the filesystem root. 995 * 996 * This may reposition the cursor at one of the parent's of the 997 * current node. 998 * 999 * Ignore deadlock errors, that simply means that btree_remove 1000 * was unable to recurse and had to leave us with an empty leaf. 1001 */ 1002 KKASSERT(cursor->index <= ondisk->count); 1003 if (ondisk->count == 0) { 1004 error = btree_remove(cursor, ndelete); 1005 if (error == EDEADLK) 1006 error = 0; 1007 } else { 1008 error = 0; 1009 } 1010 KKASSERT(cursor->parent == NULL || 1011 cursor->parent_index < cursor->parent->ondisk->count); 1012 return(error); 1013 } 1014 1015 /* 1016 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE 1017 * 1018 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 1019 * 1020 * The search can begin ANYWHERE in the B-Tree. As a first step the search 1021 * iterates up the tree as necessary to properly position itself prior to 1022 * actually doing the sarch. 1023 * 1024 * INSERTIONS: The search will split full nodes and leaves on its way down 1025 * and guarentee that the leaf it ends up on is not full. If we run out 1026 * of space the search continues to the leaf, but ENOSPC is returned. 1027 * 1028 * The search is only guarenteed to end up on a leaf if an error code of 0 1029 * is returned, or if inserting and an error code of ENOENT is returned. 1030 * Otherwise it can stop at an internal node. On success a search returns 1031 * a leaf node. 1032 * 1033 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 1034 * filesystem, and it is not simple code. Please note the following facts: 1035 * 1036 * - Internal node recursions have a boundary on the left AND right. The 1037 * right boundary is non-inclusive. The create_tid is a generic part 1038 * of the key for internal nodes. 1039 * 1040 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 1041 * historical search. ASOF and INSERT are mutually exclusive. When 1042 * doing an as-of lookup btree_search() checks for a right-edge boundary 1043 * case. If while recursing down the left-edge differs from the key 1044 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 1045 * with cursor->create_check. This is used by btree_lookup() to iterate. 1046 * The iteration backwards because as-of searches can wind up going 1047 * down the wrong branch of the B-Tree. 1048 */ 1049 static 1050 int 1051 btree_search(hammer_cursor_t cursor, int flags) 1052 { 1053 hammer_node_ondisk_t node; 1054 hammer_btree_elm_t elm; 1055 int error; 1056 int enospc = 0; 1057 int i; 1058 int r; 1059 int s; 1060 1061 flags |= cursor->flags; 1062 ++hammer_stats_btree_searches; 1063 1064 if (hammer_debug_btree) { 1065 kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td=%p)\n", 1066 (long long)cursor->node->node_offset, 1067 cursor->index, 1068 (long long)cursor->key_beg.obj_id, 1069 cursor->key_beg.rec_type, 1070 (long long)cursor->key_beg.key, 1071 (long long)cursor->key_beg.create_tid, 1072 cursor->key_beg.localization, 1073 curthread 1074 ); 1075 if (cursor->parent) 1076 kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n", 1077 (long long)cursor->parent->node_offset, 1078 cursor->parent_index, 1079 (long long)cursor->left_bound->obj_id, 1080 (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id, 1081 (long long)cursor->right_bound->obj_id, 1082 (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id, 1083 cursor->left_bound, 1084 &cursor->parent->ondisk->elms[cursor->parent_index], 1085 cursor->right_bound, 1086 &cursor->parent->ondisk->elms[cursor->parent_index+1] 1087 ); 1088 } 1089 1090 /* 1091 * Move our cursor up the tree until we find a node whos range covers 1092 * the key we are trying to locate. 1093 * 1094 * The left bound is inclusive, the right bound is non-inclusive. 1095 * It is ok to cursor up too far. 1096 */ 1097 for (;;) { 1098 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 1099 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 1100 if (r >= 0 && s < 0) 1101 break; 1102 KKASSERT(cursor->parent); 1103 ++hammer_stats_btree_iterations; 1104 error = hammer_cursor_up(cursor); 1105 if (error) 1106 goto done; 1107 } 1108 1109 /* 1110 * The delete-checks below are based on node, not parent. Set the 1111 * initial delete-check based on the parent. 1112 */ 1113 if (r == 1) { 1114 KKASSERT(cursor->left_bound->create_tid != 1); 1115 cursor->create_check = cursor->left_bound->create_tid - 1; 1116 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1117 } 1118 1119 /* 1120 * We better have ended up with a node somewhere. 1121 */ 1122 KKASSERT(cursor->node != NULL); 1123 1124 /* 1125 * If we are inserting we can't start at a full node if the parent 1126 * is also full (because there is no way to split the node), 1127 * continue running up the tree until the requirement is satisfied 1128 * or we hit the root of the filesystem. 1129 * 1130 * (If inserting we aren't doing an as-of search so we don't have 1131 * to worry about create_check). 1132 */ 1133 while (flags & HAMMER_CURSOR_INSERT) { 1134 if (btree_node_is_full(cursor->node->ondisk) == 0) 1135 break; 1136 if (cursor->node->ondisk->parent == 0 || 1137 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1138 break; 1139 } 1140 ++hammer_stats_btree_iterations; 1141 error = hammer_cursor_up(cursor); 1142 /* node may have become stale */ 1143 if (error) 1144 goto done; 1145 } 1146 1147 /* 1148 * Push down through internal nodes to locate the requested key. 1149 */ 1150 node = cursor->node->ondisk; 1151 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1152 /* 1153 * Scan the node to find the subtree index to push down into. 1154 * We go one-past, then back-up. 1155 * 1156 * We must proactively remove deleted elements which may 1157 * have been left over from a deadlocked btree_remove(). 1158 * 1159 * The left and right boundaries are included in the loop 1160 * in order to detect edge cases. 1161 * 1162 * If the separator only differs by create_tid (r == 1) 1163 * and we are doing an as-of search, we may end up going 1164 * down a branch to the left of the one containing the 1165 * desired key. This requires numerous special cases. 1166 */ 1167 ++hammer_stats_btree_iterations; 1168 if (hammer_debug_btree) { 1169 kprintf("SEARCH-I %016llx count=%d\n", 1170 (long long)cursor->node->node_offset, 1171 node->count); 1172 } 1173 1174 /* 1175 * Try to shortcut the search before dropping into the 1176 * linear loop. Locate the first node where r <= 1. 1177 */ 1178 i = hammer_btree_search_node(&cursor->key_beg, node); 1179 while (i <= node->count) { 1180 ++hammer_stats_btree_elements; 1181 elm = &node->elms[i]; 1182 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1183 if (hammer_debug_btree > 2) { 1184 kprintf(" IELM %p %d r=%d\n", 1185 &node->elms[i], i, r); 1186 } 1187 if (r < 0) 1188 break; 1189 if (r == 1) { 1190 KKASSERT(elm->base.create_tid != 1); 1191 cursor->create_check = elm->base.create_tid - 1; 1192 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1193 } 1194 ++i; 1195 } 1196 if (hammer_debug_btree) { 1197 kprintf("SEARCH-I preI=%d/%d r=%d\n", 1198 i, node->count, r); 1199 } 1200 1201 /* 1202 * These cases occur when the parent's idea of the boundary 1203 * is wider then the child's idea of the boundary, and 1204 * require special handling. If not inserting we can 1205 * terminate the search early for these cases but the 1206 * child's boundaries cannot be unconditionally modified. 1207 */ 1208 if (i == 0) { 1209 /* 1210 * If i == 0 the search terminated to the LEFT of the 1211 * left_boundary but to the RIGHT of the parent's left 1212 * boundary. 1213 */ 1214 u_int8_t save; 1215 1216 elm = &node->elms[0]; 1217 1218 /* 1219 * If we aren't inserting we can stop here. 1220 */ 1221 if ((flags & (HAMMER_CURSOR_INSERT | 1222 HAMMER_CURSOR_PRUNING)) == 0) { 1223 cursor->index = 0; 1224 return(ENOENT); 1225 } 1226 1227 /* 1228 * Correct a left-hand boundary mismatch. 1229 * 1230 * We can only do this if we can upgrade the lock, 1231 * and synchronized as a background cursor (i.e. 1232 * inserting or pruning). 1233 * 1234 * WARNING: We can only do this if inserting, i.e. 1235 * we are running on the backend. 1236 */ 1237 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1238 return(error); 1239 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1240 hammer_modify_node_field(cursor->trans, cursor->node, 1241 elms[0]); 1242 save = node->elms[0].base.btype; 1243 node->elms[0].base = *cursor->left_bound; 1244 node->elms[0].base.btype = save; 1245 hammer_modify_node_done(cursor->node); 1246 } else if (i == node->count + 1) { 1247 /* 1248 * If i == node->count + 1 the search terminated to 1249 * the RIGHT of the right boundary but to the LEFT 1250 * of the parent's right boundary. If we aren't 1251 * inserting we can stop here. 1252 * 1253 * Note that the last element in this case is 1254 * elms[i-2] prior to adjustments to 'i'. 1255 */ 1256 --i; 1257 if ((flags & (HAMMER_CURSOR_INSERT | 1258 HAMMER_CURSOR_PRUNING)) == 0) { 1259 cursor->index = i; 1260 return (ENOENT); 1261 } 1262 1263 /* 1264 * Correct a right-hand boundary mismatch. 1265 * (actual push-down record is i-2 prior to 1266 * adjustments to i). 1267 * 1268 * We can only do this if we can upgrade the lock, 1269 * and synchronized as a background cursor (i.e. 1270 * inserting or pruning). 1271 * 1272 * WARNING: We can only do this if inserting, i.e. 1273 * we are running on the backend. 1274 */ 1275 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1276 return(error); 1277 elm = &node->elms[i]; 1278 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1279 hammer_modify_node(cursor->trans, cursor->node, 1280 &elm->base, sizeof(elm->base)); 1281 elm->base = *cursor->right_bound; 1282 hammer_modify_node_done(cursor->node); 1283 --i; 1284 } else { 1285 /* 1286 * The push-down index is now i - 1. If we had 1287 * terminated on the right boundary this will point 1288 * us at the last element. 1289 */ 1290 --i; 1291 } 1292 cursor->index = i; 1293 elm = &node->elms[i]; 1294 1295 if (hammer_debug_btree) { 1296 kprintf("RESULT-I %016llx[%d] %016llx %02x " 1297 "key=%016llx cre=%016llx lo=%02x\n", 1298 (long long)cursor->node->node_offset, 1299 i, 1300 (long long)elm->internal.base.obj_id, 1301 elm->internal.base.rec_type, 1302 (long long)elm->internal.base.key, 1303 (long long)elm->internal.base.create_tid, 1304 elm->internal.base.localization 1305 ); 1306 } 1307 1308 /* 1309 * We better have a valid subtree offset. 1310 */ 1311 KKASSERT(elm->internal.subtree_offset != 0); 1312 1313 /* 1314 * Handle insertion and deletion requirements. 1315 * 1316 * If inserting split full nodes. The split code will 1317 * adjust cursor->node and cursor->index if the current 1318 * index winds up in the new node. 1319 * 1320 * If inserting and a left or right edge case was detected, 1321 * we cannot correct the left or right boundary and must 1322 * prepend and append an empty leaf node in order to make 1323 * the boundary correction. 1324 * 1325 * If we run out of space we set enospc but continue on 1326 * to a leaf. 1327 */ 1328 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1329 if (btree_node_is_full(node)) { 1330 error = btree_split_internal(cursor); 1331 if (error) { 1332 if (error != ENOSPC) 1333 goto done; 1334 enospc = 1; 1335 } 1336 /* 1337 * reload stale pointers 1338 */ 1339 i = cursor->index; 1340 node = cursor->node->ondisk; 1341 } 1342 } 1343 1344 /* 1345 * Push down (push into new node, existing node becomes 1346 * the parent) and continue the search. 1347 */ 1348 error = hammer_cursor_down(cursor); 1349 /* node may have become stale */ 1350 if (error) 1351 goto done; 1352 node = cursor->node->ondisk; 1353 } 1354 1355 /* 1356 * We are at a leaf, do a linear search of the key array. 1357 * 1358 * On success the index is set to the matching element and 0 1359 * is returned. 1360 * 1361 * On failure the index is set to the insertion point and ENOENT 1362 * is returned. 1363 * 1364 * Boundaries are not stored in leaf nodes, so the index can wind 1365 * up to the left of element 0 (index == 0) or past the end of 1366 * the array (index == node->count). It is also possible that the 1367 * leaf might be empty. 1368 */ 1369 ++hammer_stats_btree_iterations; 1370 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1371 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1372 if (hammer_debug_btree) { 1373 kprintf("SEARCH-L %016llx count=%d\n", 1374 (long long)cursor->node->node_offset, 1375 node->count); 1376 } 1377 1378 /* 1379 * Try to shortcut the search before dropping into the 1380 * linear loop. Locate the first node where r <= 1. 1381 */ 1382 i = hammer_btree_search_node(&cursor->key_beg, node); 1383 while (i < node->count) { 1384 ++hammer_stats_btree_elements; 1385 elm = &node->elms[i]; 1386 1387 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1388 1389 if (hammer_debug_btree > 1) 1390 kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r); 1391 1392 /* 1393 * We are at a record element. Stop if we've flipped past 1394 * key_beg, not counting the create_tid test. Allow the 1395 * r == 1 case (key_beg > element but differs only by its 1396 * create_tid) to fall through to the AS-OF check. 1397 */ 1398 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1399 1400 if (r < 0) 1401 goto failed; 1402 if (r > 1) { 1403 ++i; 1404 continue; 1405 } 1406 1407 /* 1408 * Check our as-of timestamp against the element. 1409 */ 1410 if (flags & HAMMER_CURSOR_ASOF) { 1411 if (hammer_btree_chkts(cursor->asof, 1412 &node->elms[i].base) != 0) { 1413 ++i; 1414 continue; 1415 } 1416 /* success */ 1417 } else { 1418 if (r > 0) { /* can only be +1 */ 1419 ++i; 1420 continue; 1421 } 1422 /* success */ 1423 } 1424 cursor->index = i; 1425 error = 0; 1426 if (hammer_debug_btree) { 1427 kprintf("RESULT-L %016llx[%d] (SUCCESS)\n", 1428 (long long)cursor->node->node_offset, i); 1429 } 1430 goto done; 1431 } 1432 1433 /* 1434 * The search of the leaf node failed. i is the insertion point. 1435 */ 1436 failed: 1437 if (hammer_debug_btree) { 1438 kprintf("RESULT-L %016llx[%d] (FAILED)\n", 1439 (long long)cursor->node->node_offset, i); 1440 } 1441 1442 /* 1443 * No exact match was found, i is now at the insertion point. 1444 * 1445 * If inserting split a full leaf before returning. This 1446 * may have the side effect of adjusting cursor->node and 1447 * cursor->index. 1448 */ 1449 cursor->index = i; 1450 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1451 btree_node_is_full(node)) { 1452 error = btree_split_leaf(cursor); 1453 if (error) { 1454 if (error != ENOSPC) 1455 goto done; 1456 enospc = 1; 1457 } 1458 /* 1459 * reload stale pointers 1460 */ 1461 /* NOT USED 1462 i = cursor->index; 1463 node = &cursor->node->internal; 1464 */ 1465 } 1466 1467 /* 1468 * We reached a leaf but did not find the key we were looking for. 1469 * If this is an insert we will be properly positioned for an insert 1470 * (ENOENT) or unable to insert (ENOSPC). 1471 */ 1472 error = enospc ? ENOSPC : ENOENT; 1473 done: 1474 return(error); 1475 } 1476 1477 /* 1478 * Heuristical search for the first element whos comparison is <= 1. May 1479 * return an index whos compare result is > 1 but may only return an index 1480 * whos compare result is <= 1 if it is the first element with that result. 1481 */ 1482 int 1483 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1484 { 1485 int b; 1486 int s; 1487 int i; 1488 int r; 1489 1490 /* 1491 * Don't bother if the node does not have very many elements 1492 */ 1493 b = 0; 1494 s = node->count; 1495 while (s - b > 4) { 1496 i = b + (s - b) / 2; 1497 ++hammer_stats_btree_elements; 1498 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1499 if (r <= 1) { 1500 s = i; 1501 } else { 1502 b = i; 1503 } 1504 } 1505 return(b); 1506 } 1507 1508 1509 /************************************************************************ 1510 * SPLITTING AND MERGING * 1511 ************************************************************************ 1512 * 1513 * These routines do all the dirty work required to split and merge nodes. 1514 */ 1515 1516 /* 1517 * Split an internal node into two nodes and move the separator at the split 1518 * point to the parent. 1519 * 1520 * (cursor->node, cursor->index) indicates the element the caller intends 1521 * to push into. We will adjust node and index if that element winds 1522 * up in the split node. 1523 * 1524 * If we are at the root of the filesystem a new root must be created with 1525 * two elements, one pointing to the original root and one pointing to the 1526 * newly allocated split node. 1527 */ 1528 static 1529 int 1530 btree_split_internal(hammer_cursor_t cursor) 1531 { 1532 hammer_node_ondisk_t ondisk; 1533 hammer_node_t node; 1534 hammer_node_t parent; 1535 hammer_node_t new_node; 1536 hammer_btree_elm_t elm; 1537 hammer_btree_elm_t parent_elm; 1538 struct hammer_node_lock lockroot; 1539 hammer_mount_t hmp = cursor->trans->hmp; 1540 int parent_index; 1541 int made_root; 1542 int split; 1543 int error; 1544 int i; 1545 const int esize = sizeof(*elm); 1546 1547 hammer_node_lock_init(&lockroot, cursor->node); 1548 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL); 1549 if (error) 1550 goto done; 1551 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1552 goto done; 1553 ++hammer_stats_btree_splits; 1554 1555 /* 1556 * Calculate the split point. If the insertion point is at the 1557 * end of the leaf we adjust the split point significantly to the 1558 * right to try to optimize node fill and flag it. If we hit 1559 * that same leaf again our heuristic failed and we don't try 1560 * to optimize node fill (it could lead to a degenerate case). 1561 */ 1562 node = cursor->node; 1563 ondisk = node->ondisk; 1564 KKASSERT(ondisk->count > 4); 1565 if (cursor->index == ondisk->count && 1566 (node->flags & HAMMER_NODE_NONLINEAR) == 0) { 1567 split = (ondisk->count + 1) * 3 / 4; 1568 node->flags |= HAMMER_NODE_NONLINEAR; 1569 } else { 1570 /* 1571 * We are splitting but elms[split] will be promoted to 1572 * the parent, leaving the right hand node with one less 1573 * element. If the insertion point will be on the 1574 * left-hand side adjust the split point to give the 1575 * right hand side one additional node. 1576 */ 1577 split = (ondisk->count + 1) / 2; 1578 if (cursor->index <= split) 1579 --split; 1580 } 1581 1582 /* 1583 * If we are at the root of the filesystem, create a new root node 1584 * with 1 element and split normally. Avoid making major 1585 * modifications until we know the whole operation will work. 1586 */ 1587 if (ondisk->parent == 0) { 1588 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1589 if (parent == NULL) 1590 goto done; 1591 hammer_lock_ex(&parent->lock); 1592 hammer_modify_node_noundo(cursor->trans, parent); 1593 ondisk = parent->ondisk; 1594 ondisk->count = 1; 1595 ondisk->parent = 0; 1596 ondisk->mirror_tid = node->ondisk->mirror_tid; 1597 ondisk->signature = node->ondisk->signature; 1598 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1599 ondisk->elms[0].base = hmp->root_btree_beg; 1600 ondisk->elms[0].base.btype = node->ondisk->type; 1601 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1602 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1603 ondisk->elms[1].base = hmp->root_btree_end; 1604 hammer_modify_node_done(parent); 1605 made_root = 1; 1606 parent_index = 0; /* index of current node in parent */ 1607 } else { 1608 made_root = 0; 1609 parent = cursor->parent; 1610 parent_index = cursor->parent_index; 1611 } 1612 1613 /* 1614 * Split node into new_node at the split point. 1615 * 1616 * B O O O P N N B <-- P = node->elms[split] (index 4) 1617 * 0 1 2 3 4 5 6 <-- subtree indices 1618 * 1619 * x x P x x 1620 * s S S s 1621 * / \ 1622 * B O O O B B N N B <--- inner boundary points are 'P' 1623 * 0 1 2 3 4 5 6 1624 */ 1625 new_node = hammer_alloc_btree(cursor->trans, 0, &error); 1626 if (new_node == NULL) { 1627 if (made_root) { 1628 hammer_unlock(&parent->lock); 1629 hammer_delete_node(cursor->trans, parent); 1630 hammer_rel_node(parent); 1631 } 1632 goto done; 1633 } 1634 hammer_lock_ex(&new_node->lock); 1635 1636 /* 1637 * Create the new node. P becomes the left-hand boundary in the 1638 * new node. Copy the right-hand boundary as well. 1639 * 1640 * elm is the new separator. 1641 */ 1642 hammer_modify_node_noundo(cursor->trans, new_node); 1643 hammer_modify_node_all(cursor->trans, node); 1644 ondisk = node->ondisk; 1645 elm = &ondisk->elms[split]; 1646 bcopy(elm, &new_node->ondisk->elms[0], 1647 (ondisk->count - split + 1) * esize); 1648 new_node->ondisk->count = ondisk->count - split; 1649 new_node->ondisk->parent = parent->node_offset; 1650 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1651 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1652 KKASSERT(ondisk->type == new_node->ondisk->type); 1653 hammer_cursor_split_node(node, new_node, split); 1654 1655 /* 1656 * Cleanup the original node. Elm (P) becomes the new boundary, 1657 * its subtree_offset was moved to the new node. If we had created 1658 * a new root its parent pointer may have changed. 1659 */ 1660 elm->internal.subtree_offset = 0; 1661 ondisk->count = split; 1662 1663 /* 1664 * Insert the separator into the parent, fixup the parent's 1665 * reference to the original node, and reference the new node. 1666 * The separator is P. 1667 * 1668 * Remember that base.count does not include the right-hand boundary. 1669 */ 1670 hammer_modify_node_all(cursor->trans, parent); 1671 ondisk = parent->ondisk; 1672 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1673 parent_elm = &ondisk->elms[parent_index+1]; 1674 bcopy(parent_elm, parent_elm + 1, 1675 (ondisk->count - parent_index) * esize); 1676 parent_elm->internal.base = elm->base; /* separator P */ 1677 parent_elm->internal.base.btype = new_node->ondisk->type; 1678 parent_elm->internal.subtree_offset = new_node->node_offset; 1679 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1680 ++ondisk->count; 1681 hammer_modify_node_done(parent); 1682 hammer_cursor_inserted_element(parent, parent_index + 1); 1683 1684 /* 1685 * The children of new_node need their parent pointer set to new_node. 1686 * The children have already been locked by 1687 * hammer_btree_lock_children(). 1688 */ 1689 for (i = 0; i < new_node->ondisk->count; ++i) { 1690 elm = &new_node->ondisk->elms[i]; 1691 error = btree_set_parent(cursor->trans, new_node, elm); 1692 if (error) { 1693 panic("btree_split_internal: btree-fixup problem"); 1694 } 1695 } 1696 hammer_modify_node_done(new_node); 1697 1698 /* 1699 * The filesystem's root B-Tree pointer may have to be updated. 1700 */ 1701 if (made_root) { 1702 hammer_volume_t volume; 1703 1704 volume = hammer_get_root_volume(hmp, &error); 1705 KKASSERT(error == 0); 1706 1707 hammer_modify_volume_field(cursor->trans, volume, 1708 vol0_btree_root); 1709 volume->ondisk->vol0_btree_root = parent->node_offset; 1710 hammer_modify_volume_done(volume); 1711 node->ondisk->parent = parent->node_offset; 1712 /* node->ondisk->signature = 0; */ 1713 if (cursor->parent) { 1714 hammer_unlock(&cursor->parent->lock); 1715 hammer_rel_node(cursor->parent); 1716 } 1717 cursor->parent = parent; /* lock'd and ref'd */ 1718 hammer_rel_volume(volume, 0); 1719 } 1720 hammer_modify_node_done(node); 1721 1722 /* 1723 * Ok, now adjust the cursor depending on which element the original 1724 * index was pointing at. If we are >= the split point the push node 1725 * is now in the new node. 1726 * 1727 * NOTE: If we are at the split point itself we cannot stay with the 1728 * original node because the push index will point at the right-hand 1729 * boundary, which is illegal. 1730 * 1731 * NOTE: The cursor's parent or parent_index must be adjusted for 1732 * the case where a new parent (new root) was created, and the case 1733 * where the cursor is now pointing at the split node. 1734 */ 1735 if (cursor->index >= split) { 1736 cursor->parent_index = parent_index + 1; 1737 cursor->index -= split; 1738 hammer_unlock(&cursor->node->lock); 1739 hammer_rel_node(cursor->node); 1740 cursor->node = new_node; /* locked and ref'd */ 1741 } else { 1742 cursor->parent_index = parent_index; 1743 hammer_unlock(&new_node->lock); 1744 hammer_rel_node(new_node); 1745 } 1746 1747 /* 1748 * Fixup left and right bounds 1749 */ 1750 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1751 cursor->left_bound = &parent_elm[0].internal.base; 1752 cursor->right_bound = &parent_elm[1].internal.base; 1753 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1754 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1755 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1756 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1757 1758 done: 1759 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL); 1760 hammer_cursor_downgrade(cursor); 1761 return (error); 1762 } 1763 1764 /* 1765 * Same as the above, but splits a full leaf node. 1766 */ 1767 static 1768 int 1769 btree_split_leaf(hammer_cursor_t cursor) 1770 { 1771 hammer_node_ondisk_t ondisk; 1772 hammer_node_t parent; 1773 hammer_node_t leaf; 1774 hammer_mount_t hmp; 1775 hammer_node_t new_leaf; 1776 hammer_btree_elm_t elm; 1777 hammer_btree_elm_t parent_elm; 1778 hammer_base_elm_t mid_boundary; 1779 int parent_index; 1780 int made_root; 1781 int split; 1782 int error; 1783 const size_t esize = sizeof(*elm); 1784 1785 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1786 return(error); 1787 ++hammer_stats_btree_splits; 1788 1789 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1790 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1791 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1792 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1793 1794 /* 1795 * Calculate the split point. If the insertion point is at the 1796 * end of the leaf we adjust the split point significantly to the 1797 * right to try to optimize node fill and flag it. If we hit 1798 * that same leaf again our heuristic failed and we don't try 1799 * to optimize node fill (it could lead to a degenerate case). 1800 */ 1801 leaf = cursor->node; 1802 ondisk = leaf->ondisk; 1803 KKASSERT(ondisk->count > 4); 1804 if (cursor->index == ondisk->count && 1805 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) { 1806 split = (ondisk->count + 1) * 3 / 4; 1807 leaf->flags |= HAMMER_NODE_NONLINEAR; 1808 } else { 1809 split = (ondisk->count + 1) / 2; 1810 } 1811 1812 #if 0 1813 /* 1814 * If the insertion point is at the split point shift the 1815 * split point left so we don't have to worry about 1816 */ 1817 if (cursor->index == split) 1818 --split; 1819 #endif 1820 KKASSERT(split > 0 && split < ondisk->count); 1821 1822 error = 0; 1823 hmp = leaf->hmp; 1824 1825 elm = &ondisk->elms[split]; 1826 1827 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1828 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1829 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1830 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1831 1832 /* 1833 * If we are at the root of the tree, create a new root node with 1834 * 1 element and split normally. Avoid making major modifications 1835 * until we know the whole operation will work. 1836 */ 1837 if (ondisk->parent == 0) { 1838 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1839 if (parent == NULL) 1840 goto done; 1841 hammer_lock_ex(&parent->lock); 1842 hammer_modify_node_noundo(cursor->trans, parent); 1843 ondisk = parent->ondisk; 1844 ondisk->count = 1; 1845 ondisk->parent = 0; 1846 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1847 ondisk->signature = leaf->ondisk->signature; 1848 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1849 ondisk->elms[0].base = hmp->root_btree_beg; 1850 ondisk->elms[0].base.btype = leaf->ondisk->type; 1851 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1852 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1853 ondisk->elms[1].base = hmp->root_btree_end; 1854 hammer_modify_node_done(parent); 1855 made_root = 1; 1856 parent_index = 0; /* insertion point in parent */ 1857 } else { 1858 made_root = 0; 1859 parent = cursor->parent; 1860 parent_index = cursor->parent_index; 1861 } 1862 1863 /* 1864 * Split leaf into new_leaf at the split point. Select a separator 1865 * value in-between the two leafs but with a bent towards the right 1866 * leaf since comparisons use an 'elm >= separator' inequality. 1867 * 1868 * L L L L L L L L 1869 * 1870 * x x P x x 1871 * s S S s 1872 * / \ 1873 * L L L L L L L L 1874 */ 1875 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error); 1876 if (new_leaf == NULL) { 1877 if (made_root) { 1878 hammer_unlock(&parent->lock); 1879 hammer_delete_node(cursor->trans, parent); 1880 hammer_rel_node(parent); 1881 } 1882 goto done; 1883 } 1884 hammer_lock_ex(&new_leaf->lock); 1885 1886 /* 1887 * Create the new node and copy the leaf elements from the split 1888 * point on to the new node. 1889 */ 1890 hammer_modify_node_all(cursor->trans, leaf); 1891 hammer_modify_node_noundo(cursor->trans, new_leaf); 1892 ondisk = leaf->ondisk; 1893 elm = &ondisk->elms[split]; 1894 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1895 new_leaf->ondisk->count = ondisk->count - split; 1896 new_leaf->ondisk->parent = parent->node_offset; 1897 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1898 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1899 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1900 hammer_modify_node_done(new_leaf); 1901 hammer_cursor_split_node(leaf, new_leaf, split); 1902 1903 /* 1904 * Cleanup the original node. Because this is a leaf node and 1905 * leaf nodes do not have a right-hand boundary, there 1906 * aren't any special edge cases to clean up. We just fixup the 1907 * count. 1908 */ 1909 ondisk->count = split; 1910 1911 /* 1912 * Insert the separator into the parent, fixup the parent's 1913 * reference to the original node, and reference the new node. 1914 * The separator is P. 1915 * 1916 * Remember that base.count does not include the right-hand boundary. 1917 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1918 */ 1919 hammer_modify_node_all(cursor->trans, parent); 1920 ondisk = parent->ondisk; 1921 KKASSERT(split != 0); 1922 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1923 parent_elm = &ondisk->elms[parent_index+1]; 1924 bcopy(parent_elm, parent_elm + 1, 1925 (ondisk->count - parent_index) * esize); 1926 1927 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1928 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1929 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1930 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1931 mid_boundary = &parent_elm->base; 1932 ++ondisk->count; 1933 hammer_modify_node_done(parent); 1934 hammer_cursor_inserted_element(parent, parent_index + 1); 1935 1936 /* 1937 * The filesystem's root B-Tree pointer may have to be updated. 1938 */ 1939 if (made_root) { 1940 hammer_volume_t volume; 1941 1942 volume = hammer_get_root_volume(hmp, &error); 1943 KKASSERT(error == 0); 1944 1945 hammer_modify_volume_field(cursor->trans, volume, 1946 vol0_btree_root); 1947 volume->ondisk->vol0_btree_root = parent->node_offset; 1948 hammer_modify_volume_done(volume); 1949 leaf->ondisk->parent = parent->node_offset; 1950 /* leaf->ondisk->signature = 0; */ 1951 if (cursor->parent) { 1952 hammer_unlock(&cursor->parent->lock); 1953 hammer_rel_node(cursor->parent); 1954 } 1955 cursor->parent = parent; /* lock'd and ref'd */ 1956 hammer_rel_volume(volume, 0); 1957 } 1958 hammer_modify_node_done(leaf); 1959 1960 /* 1961 * Ok, now adjust the cursor depending on which element the original 1962 * index was pointing at. If we are >= the split point the push node 1963 * is now in the new node. 1964 * 1965 * NOTE: If we are at the split point itself we need to select the 1966 * old or new node based on where key_beg's insertion point will be. 1967 * If we pick the wrong side the inserted element will wind up in 1968 * the wrong leaf node and outside that node's bounds. 1969 */ 1970 if (cursor->index > split || 1971 (cursor->index == split && 1972 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1973 cursor->parent_index = parent_index + 1; 1974 cursor->index -= split; 1975 hammer_unlock(&cursor->node->lock); 1976 hammer_rel_node(cursor->node); 1977 cursor->node = new_leaf; 1978 } else { 1979 cursor->parent_index = parent_index; 1980 hammer_unlock(&new_leaf->lock); 1981 hammer_rel_node(new_leaf); 1982 } 1983 1984 /* 1985 * Fixup left and right bounds 1986 */ 1987 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1988 cursor->left_bound = &parent_elm[0].internal.base; 1989 cursor->right_bound = &parent_elm[1].internal.base; 1990 1991 /* 1992 * Assert that the bounds are correct. 1993 */ 1994 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1995 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1996 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1997 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1998 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1999 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 2000 2001 done: 2002 hammer_cursor_downgrade(cursor); 2003 return (error); 2004 } 2005 2006 #if 0 2007 2008 /* 2009 * Recursively correct the right-hand boundary's create_tid to (tid) as 2010 * long as the rest of the key matches. We have to recurse upward in 2011 * the tree as well as down the left side of each parent's right node. 2012 * 2013 * Return EDEADLK if we were only partially successful, forcing the caller 2014 * to try again. The original cursor is not modified. This routine can 2015 * also fail with EDEADLK if it is forced to throw away a portion of its 2016 * record history. 2017 * 2018 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2019 */ 2020 struct hammer_rhb { 2021 TAILQ_ENTRY(hammer_rhb) entry; 2022 hammer_node_t node; 2023 int index; 2024 }; 2025 2026 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2027 2028 int 2029 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2030 { 2031 struct hammer_mount *hmp; 2032 struct hammer_rhb_list rhb_list; 2033 hammer_base_elm_t elm; 2034 hammer_node_t orig_node; 2035 struct hammer_rhb *rhb; 2036 int orig_index; 2037 int error; 2038 2039 TAILQ_INIT(&rhb_list); 2040 hmp = cursor->trans->hmp; 2041 2042 /* 2043 * Save our position so we can restore it on return. This also 2044 * gives us a stable 'elm'. 2045 */ 2046 orig_node = cursor->node; 2047 hammer_ref_node(orig_node); 2048 hammer_lock_sh(&orig_node->lock); 2049 orig_index = cursor->index; 2050 elm = &orig_node->ondisk->elms[orig_index].base; 2051 2052 /* 2053 * Now build a list of parents going up, allocating a rhb 2054 * structure for each one. 2055 */ 2056 while (cursor->parent) { 2057 /* 2058 * Stop if we no longer have any right-bounds to fix up 2059 */ 2060 if (elm->obj_id != cursor->right_bound->obj_id || 2061 elm->rec_type != cursor->right_bound->rec_type || 2062 elm->key != cursor->right_bound->key) { 2063 break; 2064 } 2065 2066 /* 2067 * Stop if the right-hand bound's create_tid does not 2068 * need to be corrected. 2069 */ 2070 if (cursor->right_bound->create_tid >= tid) 2071 break; 2072 2073 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2074 rhb->node = cursor->parent; 2075 rhb->index = cursor->parent_index; 2076 hammer_ref_node(rhb->node); 2077 hammer_lock_sh(&rhb->node->lock); 2078 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2079 2080 hammer_cursor_up(cursor); 2081 } 2082 2083 /* 2084 * now safely adjust the right hand bound for each rhb. This may 2085 * also require taking the right side of the tree and iterating down 2086 * ITS left side. 2087 */ 2088 error = 0; 2089 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2090 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2091 if (error) 2092 break; 2093 TAILQ_REMOVE(&rhb_list, rhb, entry); 2094 hammer_unlock(&rhb->node->lock); 2095 hammer_rel_node(rhb->node); 2096 kfree(rhb, hmp->m_misc); 2097 2098 switch (cursor->node->ondisk->type) { 2099 case HAMMER_BTREE_TYPE_INTERNAL: 2100 /* 2101 * Right-boundary for parent at internal node 2102 * is one element to the right of the element whos 2103 * right boundary needs adjusting. We must then 2104 * traverse down the left side correcting any left 2105 * bounds (which may now be too far to the left). 2106 */ 2107 ++cursor->index; 2108 error = hammer_btree_correct_lhb(cursor, tid); 2109 break; 2110 default: 2111 panic("hammer_btree_correct_rhb(): Bad node type"); 2112 error = EINVAL; 2113 break; 2114 } 2115 } 2116 2117 /* 2118 * Cleanup 2119 */ 2120 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2121 TAILQ_REMOVE(&rhb_list, rhb, entry); 2122 hammer_unlock(&rhb->node->lock); 2123 hammer_rel_node(rhb->node); 2124 kfree(rhb, hmp->m_misc); 2125 } 2126 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2127 hammer_unlock(&orig_node->lock); 2128 hammer_rel_node(orig_node); 2129 return (error); 2130 } 2131 2132 /* 2133 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2134 * bound going downward starting at the current cursor position. 2135 * 2136 * This function does not restore the cursor after use. 2137 */ 2138 int 2139 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2140 { 2141 struct hammer_rhb_list rhb_list; 2142 hammer_base_elm_t elm; 2143 hammer_base_elm_t cmp; 2144 struct hammer_rhb *rhb; 2145 struct hammer_mount *hmp; 2146 int error; 2147 2148 TAILQ_INIT(&rhb_list); 2149 hmp = cursor->trans->hmp; 2150 2151 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2152 2153 /* 2154 * Record the node and traverse down the left-hand side for all 2155 * matching records needing a boundary correction. 2156 */ 2157 error = 0; 2158 for (;;) { 2159 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2160 rhb->node = cursor->node; 2161 rhb->index = cursor->index; 2162 hammer_ref_node(rhb->node); 2163 hammer_lock_sh(&rhb->node->lock); 2164 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2165 2166 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2167 /* 2168 * Nothing to traverse down if we are at the right 2169 * boundary of an internal node. 2170 */ 2171 if (cursor->index == cursor->node->ondisk->count) 2172 break; 2173 } else { 2174 elm = &cursor->node->ondisk->elms[cursor->index].base; 2175 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2176 break; 2177 panic("Illegal leaf record type %02x", elm->btype); 2178 } 2179 error = hammer_cursor_down(cursor); 2180 if (error) 2181 break; 2182 2183 elm = &cursor->node->ondisk->elms[cursor->index].base; 2184 if (elm->obj_id != cmp->obj_id || 2185 elm->rec_type != cmp->rec_type || 2186 elm->key != cmp->key) { 2187 break; 2188 } 2189 if (elm->create_tid >= tid) 2190 break; 2191 2192 } 2193 2194 /* 2195 * Now we can safely adjust the left-hand boundary from the bottom-up. 2196 * The last element we remove from the list is the caller's right hand 2197 * boundary, which must also be adjusted. 2198 */ 2199 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2200 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2201 if (error) 2202 break; 2203 TAILQ_REMOVE(&rhb_list, rhb, entry); 2204 hammer_unlock(&rhb->node->lock); 2205 hammer_rel_node(rhb->node); 2206 kfree(rhb, hmp->m_misc); 2207 2208 elm = &cursor->node->ondisk->elms[cursor->index].base; 2209 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2210 hammer_modify_node(cursor->trans, cursor->node, 2211 &elm->create_tid, 2212 sizeof(elm->create_tid)); 2213 elm->create_tid = tid; 2214 hammer_modify_node_done(cursor->node); 2215 } else { 2216 panic("hammer_btree_correct_lhb(): Bad element type"); 2217 } 2218 } 2219 2220 /* 2221 * Cleanup 2222 */ 2223 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2224 TAILQ_REMOVE(&rhb_list, rhb, entry); 2225 hammer_unlock(&rhb->node->lock); 2226 hammer_rel_node(rhb->node); 2227 kfree(rhb, hmp->m_misc); 2228 } 2229 return (error); 2230 } 2231 2232 #endif 2233 2234 /* 2235 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2236 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2237 * the operation due to a deadlock, or some other error. 2238 * 2239 * This routine is initially called with an empty leaf and may be 2240 * recursively called with single-element internal nodes. 2241 * 2242 * It should also be noted that when removing empty leaves we must be sure 2243 * to test and update mirror_tid because another thread may have deadlocked 2244 * against us (or someone) trying to propagate it up and cannot retry once 2245 * the node has been deleted. 2246 * 2247 * On return the cursor may end up pointing to an internal node, suitable 2248 * for further iteration but not for an immediate insertion or deletion. 2249 */ 2250 static int 2251 btree_remove(hammer_cursor_t cursor, int *ndelete) 2252 { 2253 hammer_node_ondisk_t ondisk; 2254 hammer_btree_elm_t elm; 2255 hammer_node_t node; 2256 hammer_node_t parent; 2257 const int esize = sizeof(*elm); 2258 int error; 2259 2260 node = cursor->node; 2261 2262 /* 2263 * When deleting the root of the filesystem convert it to 2264 * an empty leaf node. Internal nodes cannot be empty. 2265 */ 2266 ondisk = node->ondisk; 2267 if (ondisk->parent == 0) { 2268 KKASSERT(cursor->parent == NULL); 2269 hammer_modify_node_all(cursor->trans, node); 2270 KKASSERT(ondisk == node->ondisk); 2271 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2272 ondisk->count = 0; 2273 hammer_modify_node_done(node); 2274 cursor->index = 0; 2275 return(0); 2276 } 2277 2278 parent = cursor->parent; 2279 2280 /* 2281 * Attempt to remove the parent's reference to the child. If the 2282 * parent would become empty we have to recurse. If we fail we 2283 * leave the parent pointing to an empty leaf node. 2284 * 2285 * We have to recurse successfully before we can delete the internal 2286 * node as it is illegal to have empty internal nodes. Even though 2287 * the operation may be aborted we must still fixup any unlocked 2288 * cursors as if we had deleted the element prior to recursing 2289 * (by calling hammer_cursor_deleted_element()) so those cursors 2290 * are properly forced up the chain by the recursion. 2291 */ 2292 if (parent->ondisk->count == 1) { 2293 /* 2294 * This special cursor_up_locked() call leaves the original 2295 * node exclusively locked and referenced, leaves the 2296 * original parent locked (as the new node), and locks the 2297 * new parent. It can return EDEADLK. 2298 * 2299 * We cannot call hammer_cursor_removed_node() until we are 2300 * actually able to remove the node. If we did then tracked 2301 * cursors in the middle of iterations could be repointed 2302 * to a parent node. If this occurs they could end up 2303 * scanning newly inserted records into the node (that could 2304 * not be deleted) when they push down again. 2305 * 2306 * Due to the way the recursion works the final parent is left 2307 * in cursor->parent after the recursion returns. Each 2308 * layer on the way back up is thus able to call 2309 * hammer_cursor_removed_node() and 'jump' the node up to 2310 * the (same) final parent. 2311 * 2312 * NOTE! The local variable 'parent' is invalid after we 2313 * call hammer_cursor_up_locked(). 2314 */ 2315 error = hammer_cursor_up_locked(cursor); 2316 parent = NULL; 2317 2318 if (error == 0) { 2319 hammer_cursor_deleted_element(cursor->node, 0); 2320 error = btree_remove(cursor, ndelete); 2321 if (error == 0) { 2322 KKASSERT(node != cursor->node); 2323 hammer_cursor_removed_node( 2324 node, cursor->node, cursor->index); 2325 hammer_modify_node_all(cursor->trans, node); 2326 ondisk = node->ondisk; 2327 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2328 ondisk->count = 0; 2329 hammer_modify_node_done(node); 2330 hammer_flush_node(node, 0); 2331 hammer_delete_node(cursor->trans, node); 2332 if (ndelete) 2333 (*ndelete)++; 2334 } else { 2335 /* 2336 * Defer parent removal because we could not 2337 * get the lock, just let the leaf remain 2338 * empty. 2339 */ 2340 /**/ 2341 } 2342 hammer_unlock(&node->lock); 2343 hammer_rel_node(node); 2344 } else { 2345 /* 2346 * Defer parent removal because we could not 2347 * get the lock, just let the leaf remain 2348 * empty. 2349 */ 2350 /**/ 2351 } 2352 } else { 2353 KKASSERT(parent->ondisk->count > 1); 2354 2355 hammer_modify_node_all(cursor->trans, parent); 2356 ondisk = parent->ondisk; 2357 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2358 2359 elm = &ondisk->elms[cursor->parent_index]; 2360 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2361 KKASSERT(ondisk->count > 0); 2362 2363 /* 2364 * We must retain the highest mirror_tid. The deleted 2365 * range is now encompassed by the element to the left. 2366 * If we are already at the left edge the new left edge 2367 * inherits mirror_tid. 2368 * 2369 * Note that bounds of the parent to our parent may create 2370 * a gap to the left of our left-most node or to the right 2371 * of our right-most node. The gap is silently included 2372 * in the mirror_tid's area of effect from the point of view 2373 * of the scan. 2374 */ 2375 if (cursor->parent_index) { 2376 if (elm[-1].internal.mirror_tid < 2377 elm[0].internal.mirror_tid) { 2378 elm[-1].internal.mirror_tid = 2379 elm[0].internal.mirror_tid; 2380 } 2381 } else { 2382 if (elm[1].internal.mirror_tid < 2383 elm[0].internal.mirror_tid) { 2384 elm[1].internal.mirror_tid = 2385 elm[0].internal.mirror_tid; 2386 } 2387 } 2388 2389 /* 2390 * Delete the subtree reference in the parent. Include 2391 * boundary element at end. 2392 */ 2393 bcopy(&elm[1], &elm[0], 2394 (ondisk->count - cursor->parent_index) * esize); 2395 --ondisk->count; 2396 hammer_modify_node_done(parent); 2397 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2398 hammer_cursor_deleted_element(parent, cursor->parent_index); 2399 hammer_flush_node(node, 0); 2400 hammer_delete_node(cursor->trans, node); 2401 2402 /* 2403 * cursor->node is invalid, cursor up to make the cursor 2404 * valid again. We have to flag the condition in case 2405 * another thread wiggles an insertion in during an 2406 * iteration. 2407 */ 2408 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 2409 error = hammer_cursor_up(cursor); 2410 if (ndelete) 2411 (*ndelete)++; 2412 } 2413 return (error); 2414 } 2415 2416 /* 2417 * Propagate cursor->trans->tid up the B-Tree starting at the current 2418 * cursor position using pseudofs info gleaned from the passed inode. 2419 * 2420 * The passed inode has no relationship to the cursor position other 2421 * then being in the same pseudofs as the insertion or deletion we 2422 * are propagating the mirror_tid for. 2423 * 2424 * WARNING! Because we push and pop the passed cursor, it may be 2425 * modified by other B-Tree operations while it is unlocked 2426 * and things like the node & leaf pointers, and indexes might 2427 * change. 2428 */ 2429 void 2430 hammer_btree_do_propagation(hammer_cursor_t cursor, 2431 hammer_pseudofs_inmem_t pfsm, 2432 hammer_btree_leaf_elm_t leaf) 2433 { 2434 hammer_cursor_t ncursor; 2435 hammer_tid_t mirror_tid; 2436 int error __debugvar; 2437 2438 /* 2439 * We do not propagate a mirror_tid if the filesystem was mounted 2440 * in no-mirror mode. 2441 */ 2442 if (cursor->trans->hmp->master_id < 0) 2443 return; 2444 2445 /* 2446 * This is a bit of a hack because we cannot deadlock or return 2447 * EDEADLK here. The related operation has already completed and 2448 * we must propagate the mirror_tid now regardless. 2449 * 2450 * Generate a new cursor which inherits the original's locks and 2451 * unlock the original. Use the new cursor to propagate the 2452 * mirror_tid. Then clean up the new cursor and reacquire locks 2453 * on the original. 2454 * 2455 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2456 * original's locks and the original is tracked and must be 2457 * re-locked. 2458 */ 2459 mirror_tid = cursor->node->ondisk->mirror_tid; 2460 KKASSERT(mirror_tid != 0); 2461 ncursor = hammer_push_cursor(cursor); 2462 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2463 KKASSERT(error == 0); 2464 hammer_pop_cursor(cursor, ncursor); 2465 /* WARNING: cursor's leaf pointer may change after pop */ 2466 } 2467 2468 2469 /* 2470 * Propagate a mirror TID update upwards through the B-Tree to the root. 2471 * 2472 * A locked internal node must be passed in. The node will remain locked 2473 * on return. 2474 * 2475 * This function syncs mirror_tid at the specified internal node's element, 2476 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2477 */ 2478 static int 2479 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2480 { 2481 hammer_btree_internal_elm_t elm; 2482 hammer_node_t node; 2483 int error; 2484 2485 for (;;) { 2486 error = hammer_cursor_up(cursor); 2487 if (error == 0) 2488 error = hammer_cursor_upgrade(cursor); 2489 2490 /* 2491 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2492 * cursor will still be properly positioned for 2493 * mirror propagation, just not for iterations. 2494 */ 2495 while (error == EDEADLK) { 2496 hammer_recover_cursor(cursor); 2497 error = hammer_cursor_upgrade(cursor); 2498 } 2499 if (error) 2500 break; 2501 2502 /* 2503 * If the cursor deadlocked it could end up at a leaf 2504 * after we lost the lock. 2505 */ 2506 node = cursor->node; 2507 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2508 continue; 2509 2510 /* 2511 * Adjust the node's element 2512 */ 2513 elm = &node->ondisk->elms[cursor->index].internal; 2514 if (elm->mirror_tid >= mirror_tid) 2515 break; 2516 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2517 sizeof(elm->mirror_tid)); 2518 elm->mirror_tid = mirror_tid; 2519 hammer_modify_node_done(node); 2520 if (hammer_debug_general & 0x0002) { 2521 kprintf("mirror_propagate: propagate " 2522 "%016llx @%016llx:%d\n", 2523 (long long)mirror_tid, 2524 (long long)node->node_offset, 2525 cursor->index); 2526 } 2527 2528 2529 /* 2530 * Adjust the node's mirror_tid aggregator 2531 */ 2532 if (node->ondisk->mirror_tid >= mirror_tid) 2533 return(0); 2534 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2535 node->ondisk->mirror_tid = mirror_tid; 2536 hammer_modify_node_done(node); 2537 if (hammer_debug_general & 0x0002) { 2538 kprintf("mirror_propagate: propagate " 2539 "%016llx @%016llx\n", 2540 (long long)mirror_tid, 2541 (long long)node->node_offset); 2542 } 2543 } 2544 if (error == ENOENT) 2545 error = 0; 2546 return(error); 2547 } 2548 2549 hammer_node_t 2550 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2551 int *parent_indexp, int *errorp, int try_exclusive) 2552 { 2553 hammer_node_t parent; 2554 hammer_btree_elm_t elm; 2555 int i; 2556 2557 /* 2558 * Get the node 2559 */ 2560 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2561 if (*errorp) { 2562 KKASSERT(parent == NULL); 2563 return(NULL); 2564 } 2565 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2566 2567 /* 2568 * Lock the node 2569 */ 2570 if (try_exclusive) { 2571 if (hammer_lock_ex_try(&parent->lock)) { 2572 hammer_rel_node(parent); 2573 *errorp = EDEADLK; 2574 return(NULL); 2575 } 2576 } else { 2577 hammer_lock_sh(&parent->lock); 2578 } 2579 2580 /* 2581 * Figure out which element in the parent is pointing to the 2582 * child. 2583 */ 2584 if (node->ondisk->count) { 2585 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2586 parent->ondisk); 2587 } else { 2588 i = 0; 2589 } 2590 while (i < parent->ondisk->count) { 2591 elm = &parent->ondisk->elms[i]; 2592 if (elm->internal.subtree_offset == node->node_offset) 2593 break; 2594 ++i; 2595 } 2596 if (i == parent->ondisk->count) { 2597 hammer_unlock(&parent->lock); 2598 panic("Bad B-Tree link: parent %p node %p", parent, node); 2599 } 2600 *parent_indexp = i; 2601 KKASSERT(*errorp == 0); 2602 return(parent); 2603 } 2604 2605 /* 2606 * The element (elm) has been moved to a new internal node (node). 2607 * 2608 * If the element represents a pointer to an internal node that node's 2609 * parent must be adjusted to the element's new location. 2610 * 2611 * XXX deadlock potential here with our exclusive locks 2612 */ 2613 int 2614 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2615 hammer_btree_elm_t elm) 2616 { 2617 hammer_node_t child; 2618 int error; 2619 2620 error = 0; 2621 2622 switch(elm->base.btype) { 2623 case HAMMER_BTREE_TYPE_INTERNAL: 2624 case HAMMER_BTREE_TYPE_LEAF: 2625 child = hammer_get_node(trans, elm->internal.subtree_offset, 2626 0, &error); 2627 if (error == 0) { 2628 hammer_modify_node_field(trans, child, parent); 2629 child->ondisk->parent = node->node_offset; 2630 hammer_modify_node_done(child); 2631 hammer_rel_node(child); 2632 } 2633 break; 2634 default: 2635 break; 2636 } 2637 return(error); 2638 } 2639 2640 /* 2641 * Initialize the root of a recursive B-Tree node lock list structure. 2642 */ 2643 void 2644 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2645 { 2646 TAILQ_INIT(&parent->list); 2647 parent->parent = NULL; 2648 parent->node = node; 2649 parent->index = -1; 2650 parent->count = node->ondisk->count; 2651 parent->copy = NULL; 2652 parent->flags = 0; 2653 } 2654 2655 /* 2656 * Initialize a cache of hammer_node_lock's including space allocated 2657 * for node copies. 2658 * 2659 * This is used by the rebalancing code to preallocate the copy space 2660 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2661 * locks, otherwise we can blow out the pageout daemon's emergency 2662 * reserve and deadlock it. 2663 * 2664 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2665 * The flag is set when the item is pulled off the cache for use. 2666 */ 2667 void 2668 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2669 int depth) 2670 { 2671 hammer_node_lock_t item; 2672 int count; 2673 2674 for (count = 1; depth; --depth) 2675 count *= HAMMER_BTREE_LEAF_ELMS; 2676 bzero(lcache, sizeof(*lcache)); 2677 TAILQ_INIT(&lcache->list); 2678 while (count) { 2679 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2680 item->copy = kmalloc(sizeof(*item->copy), 2681 hmp->m_misc, M_WAITOK); 2682 TAILQ_INIT(&item->list); 2683 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2684 --count; 2685 } 2686 } 2687 2688 void 2689 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2690 { 2691 hammer_node_lock_t item; 2692 2693 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2694 TAILQ_REMOVE(&lcache->list, item, entry); 2695 KKASSERT(item->copy); 2696 KKASSERT(TAILQ_EMPTY(&item->list)); 2697 kfree(item->copy, hmp->m_misc); 2698 kfree(item, hmp->m_misc); 2699 } 2700 KKASSERT(lcache->copy == NULL); 2701 } 2702 2703 /* 2704 * Exclusively lock all the children of node. This is used by the split 2705 * code to prevent anyone from accessing the children of a cursor node 2706 * while we fix-up its parent offset. 2707 * 2708 * If we don't lock the children we can really mess up cursors which block 2709 * trying to cursor-up into our node. 2710 * 2711 * On failure EDEADLK (or some other error) is returned. If a deadlock 2712 * error is returned the cursor is adjusted to block on termination. 2713 * 2714 * The caller is responsible for managing parent->node, the root's node 2715 * is usually aliased from a cursor. 2716 */ 2717 int 2718 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2719 hammer_node_lock_t parent, 2720 hammer_node_lock_t lcache) 2721 { 2722 hammer_node_t node; 2723 hammer_node_lock_t item; 2724 hammer_node_ondisk_t ondisk; 2725 hammer_btree_elm_t elm; 2726 hammer_node_t child; 2727 struct hammer_mount *hmp; 2728 int error; 2729 int i; 2730 2731 node = parent->node; 2732 ondisk = node->ondisk; 2733 error = 0; 2734 hmp = cursor->trans->hmp; 2735 2736 /* 2737 * We really do not want to block on I/O with exclusive locks held, 2738 * pre-get the children before trying to lock the mess. This is 2739 * only done one-level deep for now. 2740 */ 2741 for (i = 0; i < ondisk->count; ++i) { 2742 ++hammer_stats_btree_elements; 2743 elm = &ondisk->elms[i]; 2744 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2745 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2746 continue; 2747 } 2748 child = hammer_get_node(cursor->trans, 2749 elm->internal.subtree_offset, 2750 0, &error); 2751 if (child) 2752 hammer_rel_node(child); 2753 } 2754 2755 /* 2756 * Do it for real 2757 */ 2758 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2759 ++hammer_stats_btree_elements; 2760 elm = &ondisk->elms[i]; 2761 2762 switch(elm->base.btype) { 2763 case HAMMER_BTREE_TYPE_INTERNAL: 2764 case HAMMER_BTREE_TYPE_LEAF: 2765 KKASSERT(elm->internal.subtree_offset != 0); 2766 child = hammer_get_node(cursor->trans, 2767 elm->internal.subtree_offset, 2768 0, &error); 2769 break; 2770 default: 2771 child = NULL; 2772 break; 2773 } 2774 if (child) { 2775 if (hammer_lock_ex_try(&child->lock) != 0) { 2776 if (cursor->deadlk_node == NULL) { 2777 cursor->deadlk_node = child; 2778 hammer_ref_node(cursor->deadlk_node); 2779 } 2780 error = EDEADLK; 2781 hammer_rel_node(child); 2782 } else { 2783 if (lcache) { 2784 item = TAILQ_FIRST(&lcache->list); 2785 KKASSERT(item != NULL); 2786 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2787 TAILQ_REMOVE(&lcache->list, item, entry); 2788 } else { 2789 item = kmalloc(sizeof(*item), 2790 hmp->m_misc, 2791 M_WAITOK|M_ZERO); 2792 TAILQ_INIT(&item->list); 2793 } 2794 2795 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2796 item->parent = parent; 2797 item->node = child; 2798 item->index = i; 2799 item->count = child->ondisk->count; 2800 2801 /* 2802 * Recurse (used by the rebalancing code) 2803 */ 2804 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2805 error = hammer_btree_lock_children( 2806 cursor, 2807 depth - 1, 2808 item, 2809 lcache); 2810 } 2811 } 2812 } 2813 } 2814 if (error) 2815 hammer_btree_unlock_children(hmp, parent, lcache); 2816 return(error); 2817 } 2818 2819 /* 2820 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2821 * including the parent. 2822 */ 2823 void 2824 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2825 { 2826 hammer_mount_t hmp = cursor->trans->hmp; 2827 hammer_node_lock_t item; 2828 2829 if (parent->copy == NULL) { 2830 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2831 parent->copy = kmalloc(sizeof(*parent->copy), 2832 hmp->m_misc, M_WAITOK); 2833 } 2834 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2835 *parent->copy = *parent->node->ondisk; 2836 TAILQ_FOREACH(item, &parent->list, entry) { 2837 hammer_btree_lock_copy(cursor, item); 2838 } 2839 } 2840 2841 /* 2842 * Recursively sync modified copies to the media. 2843 */ 2844 int 2845 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2846 { 2847 hammer_node_lock_t item; 2848 int count = 0; 2849 2850 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2851 ++count; 2852 hammer_modify_node_all(cursor->trans, parent->node); 2853 *parent->node->ondisk = *parent->copy; 2854 hammer_modify_node_done(parent->node); 2855 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2856 hammer_flush_node(parent->node, 0); 2857 hammer_delete_node(cursor->trans, parent->node); 2858 } 2859 } 2860 TAILQ_FOREACH(item, &parent->list, entry) { 2861 count += hammer_btree_sync_copy(cursor, item); 2862 } 2863 return(count); 2864 } 2865 2866 /* 2867 * Release previously obtained node locks. The caller is responsible for 2868 * cleaning up parent->node itself (its usually just aliased from a cursor), 2869 * but this function will take care of the copies. 2870 * 2871 * NOTE: The root node is not placed in the lcache and node->copy is not 2872 * deallocated when lcache != NULL. 2873 */ 2874 void 2875 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent, 2876 hammer_node_lock_t lcache) 2877 { 2878 hammer_node_lock_t item; 2879 hammer_node_ondisk_t copy; 2880 2881 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2882 TAILQ_REMOVE(&parent->list, item, entry); 2883 hammer_btree_unlock_children(hmp, item, lcache); 2884 hammer_unlock(&item->node->lock); 2885 hammer_rel_node(item->node); 2886 if (lcache) { 2887 /* 2888 * NOTE: When placing the item back in the lcache 2889 * the flag is cleared by the bzero(). 2890 * Remaining fields are cleared as a safety 2891 * measure. 2892 */ 2893 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE); 2894 KKASSERT(TAILQ_EMPTY(&item->list)); 2895 copy = item->copy; 2896 bzero(item, sizeof(*item)); 2897 TAILQ_INIT(&item->list); 2898 item->copy = copy; 2899 if (copy) 2900 bzero(copy, sizeof(*copy)); 2901 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2902 } else { 2903 kfree(item, hmp->m_misc); 2904 } 2905 } 2906 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) { 2907 kfree(parent->copy, hmp->m_misc); 2908 parent->copy = NULL; /* safety */ 2909 } 2910 } 2911 2912 /************************************************************************ 2913 * MISCELLANIOUS SUPPORT * 2914 ************************************************************************/ 2915 2916 /* 2917 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2918 * 2919 * Note that for this particular function a return value of -1, 0, or +1 2920 * can denote a match if create_tid is otherwise discounted. A create_tid 2921 * of zero is considered to be 'infinity' in comparisons. 2922 * 2923 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2924 */ 2925 int 2926 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2927 { 2928 if (key1->localization < key2->localization) 2929 return(-5); 2930 if (key1->localization > key2->localization) 2931 return(5); 2932 2933 if (key1->obj_id < key2->obj_id) 2934 return(-4); 2935 if (key1->obj_id > key2->obj_id) 2936 return(4); 2937 2938 if (key1->rec_type < key2->rec_type) 2939 return(-3); 2940 if (key1->rec_type > key2->rec_type) 2941 return(3); 2942 2943 if (key1->key < key2->key) 2944 return(-2); 2945 if (key1->key > key2->key) 2946 return(2); 2947 2948 /* 2949 * A create_tid of zero indicates a record which is undeletable 2950 * and must be considered to have a value of positive infinity. 2951 */ 2952 if (key1->create_tid == 0) { 2953 if (key2->create_tid == 0) 2954 return(0); 2955 return(1); 2956 } 2957 if (key2->create_tid == 0) 2958 return(-1); 2959 if (key1->create_tid < key2->create_tid) 2960 return(-1); 2961 if (key1->create_tid > key2->create_tid) 2962 return(1); 2963 return(0); 2964 } 2965 2966 /* 2967 * Test a timestamp against an element to determine whether the 2968 * element is visible. A timestamp of 0 means 'infinity'. 2969 */ 2970 int 2971 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2972 { 2973 if (asof == 0) { 2974 if (base->delete_tid) 2975 return(1); 2976 return(0); 2977 } 2978 if (asof < base->create_tid) 2979 return(-1); 2980 if (base->delete_tid && asof >= base->delete_tid) 2981 return(1); 2982 return(0); 2983 } 2984 2985 /* 2986 * Create a separator half way inbetween key1 and key2. For fields just 2987 * one unit apart, the separator will match key2. key1 is on the left-hand 2988 * side and key2 is on the right-hand side. 2989 * 2990 * key2 must be >= the separator. It is ok for the separator to match key2. 2991 * 2992 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2993 * key2. 2994 * 2995 * NOTE: It might be beneficial to just scrap this whole mess and just 2996 * set the separator to key2. 2997 */ 2998 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 2999 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 3000 3001 static void 3002 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 3003 hammer_base_elm_t dest) 3004 { 3005 bzero(dest, sizeof(*dest)); 3006 3007 dest->rec_type = key2->rec_type; 3008 dest->key = key2->key; 3009 dest->obj_id = key2->obj_id; 3010 dest->create_tid = key2->create_tid; 3011 3012 MAKE_SEPARATOR(key1, key2, dest, localization); 3013 if (key1->localization == key2->localization) { 3014 MAKE_SEPARATOR(key1, key2, dest, obj_id); 3015 if (key1->obj_id == key2->obj_id) { 3016 MAKE_SEPARATOR(key1, key2, dest, rec_type); 3017 if (key1->rec_type == key2->rec_type) { 3018 MAKE_SEPARATOR(key1, key2, dest, key); 3019 /* 3020 * Don't bother creating a separator for 3021 * create_tid, which also conveniently avoids 3022 * having to handle the create_tid == 0 3023 * (infinity) case. Just leave create_tid 3024 * set to key2. 3025 * 3026 * Worst case, dest matches key2 exactly, 3027 * which is acceptable. 3028 */ 3029 } 3030 } 3031 } 3032 } 3033 3034 #undef MAKE_SEPARATOR 3035 3036 /* 3037 * Return whether a generic internal or leaf node is full 3038 */ 3039 static __inline 3040 int 3041 btree_node_is_full(hammer_node_ondisk_t node) 3042 { 3043 return(btree_max_elements(node->type) == node->count); 3044 } 3045 3046 static __inline 3047 int 3048 btree_max_elements(u_int8_t type) 3049 { 3050 int n; 3051 3052 n = hammer_node_max_elements(type); 3053 if (n == -1) 3054 panic("btree_max_elements: bad type %d", type); 3055 return(n); 3056 } 3057 3058 void 3059 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3060 { 3061 hammer_btree_elm_t elm; 3062 int i; 3063 3064 kprintf("node %p count=%d parent=%016llx type=%c\n", 3065 ondisk, ondisk->count, 3066 (long long)ondisk->parent, ondisk->type); 3067 3068 /* 3069 * Dump both boundary elements if an internal node 3070 */ 3071 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3072 for (i = 0; i <= ondisk->count; ++i) { 3073 elm = &ondisk->elms[i]; 3074 hammer_print_btree_elm(elm, ondisk->type, i); 3075 } 3076 } else { 3077 for (i = 0; i < ondisk->count; ++i) { 3078 elm = &ondisk->elms[i]; 3079 hammer_print_btree_elm(elm, ondisk->type, i); 3080 } 3081 } 3082 } 3083 3084 void 3085 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3086 { 3087 kprintf(" %2d", i); 3088 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3089 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3090 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3091 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3092 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3093 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3094 kprintf("\tbtype = %02x (%c)\n", 3095 elm->base.btype, 3096 (elm->base.btype ? elm->base.btype : '?')); 3097 kprintf("\tlocalization = %02x\n", elm->base.localization); 3098 3099 switch(type) { 3100 case HAMMER_BTREE_TYPE_INTERNAL: 3101 kprintf("\tsubtree_off = %016llx\n", 3102 (long long)elm->internal.subtree_offset); 3103 break; 3104 case HAMMER_BTREE_TYPE_RECORD: 3105 kprintf("\tdata_offset = %016llx\n", 3106 (long long)elm->leaf.data_offset); 3107 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3108 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3109 break; 3110 } 3111 } 3112