1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 40 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 41 * of the tree), but adds two additional boundary elements which describe 42 * the left-most and right-most element a node is able to represent. In 43 * otherwords, we have boundary elements at the two ends of a B-Tree node 44 * with no valid sub-tree pointer for the right-most element. 45 * 46 * A B-Tree internal node looks like this: 47 * 48 * B N N N N N N B <-- boundary and internal elements 49 * S S S S S S S <-- subtree pointers 50 * 51 * A B-Tree leaf node basically looks like this: 52 * 53 * L L L L L L L L <-- leaf elemenets 54 * 55 * The radix for an internal node is 1 less then a leaf but we get a 56 * number of significant benefits for our troubles. 57 * The left-hand boundary (B in the left) is integrated into the first 58 * element so it doesn't require 2 elements to accomodate boundaries. 59 * 60 * The big benefit to using a B-Tree containing boundary information 61 * is that it is possible to cache pointers into the middle of the tree 62 * and not have to start searches, insertions, OR deletions at the root 63 * node. In particular, searches are able to progress in a definitive 64 * direction from any point in the tree without revisting nodes. This 65 * greatly improves the efficiency of many operations, most especially 66 * record appends. 67 * 68 * B-Trees also make the stacking of trees fairly straightforward. 69 * 70 * INSERTIONS: A search performed with the intention of doing 71 * an insert will guarantee that the terminal leaf node is not full by 72 * splitting full nodes. Splits occur top-down during the dive down the 73 * B-Tree. 74 * 75 * DELETIONS: A deletion makes no attempt to proactively balance the 76 * tree and will recursively remove nodes that become empty. If a 77 * deadlock occurs a deletion may not be able to remove an empty leaf. 78 * Deletions never allow internal nodes to become empty (that would blow 79 * up the boundaries). 80 */ 81 #include "hammer.h" 82 #include <sys/buf.h> 83 #include <sys/buf2.h> 84 85 static int btree_search(hammer_cursor_t cursor, int flags); 86 static int btree_split_internal(hammer_cursor_t cursor); 87 static int btree_split_leaf(hammer_cursor_t cursor); 88 static int btree_remove(hammer_cursor_t cursor, int *ndelete); 89 static __inline int btree_node_is_full(hammer_node_ondisk_t node); 90 static __inline int btree_max_elements(u_int8_t type); 91 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 92 hammer_tid_t mirror_tid); 93 static void hammer_make_separator(hammer_base_elm_t key1, 94 hammer_base_elm_t key2, hammer_base_elm_t dest); 95 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 96 97 /* 98 * Iterate records after a search. The cursor is iterated forwards past 99 * the current record until a record matching the key-range requirements 100 * is found. ENOENT is returned if the iteration goes past the ending 101 * key. 102 * 103 * The iteration is inclusive of key_beg and can be inclusive or exclusive 104 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 105 * 106 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 107 * may be modified by B-Tree functions. 108 * 109 * cursor->key_beg may or may not be modified by this function during 110 * the iteration. XXX future - in case of an inverted lock we may have 111 * to reinitiate the lookup and set key_beg to properly pick up where we 112 * left off. 113 * 114 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor 115 * was reverse indexed due to being moved to a parent while unlocked, 116 * and something else might have inserted an element outside the iteration 117 * range. When this case occurs the iterator just keeps iterating until 118 * it gets back into the iteration range (instead of asserting). 119 * 120 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 121 */ 122 int 123 hammer_btree_iterate(hammer_cursor_t cursor) 124 { 125 hammer_node_ondisk_t node; 126 hammer_btree_elm_t elm; 127 hammer_mount_t hmp; 128 int error = 0; 129 int r; 130 int s; 131 132 /* 133 * Skip past the current record 134 */ 135 hmp = cursor->trans->hmp; 136 node = cursor->node->ondisk; 137 if (node == NULL) 138 return(ENOENT); 139 if (cursor->index < node->count && 140 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 141 ++cursor->index; 142 } 143 144 /* 145 * HAMMER can wind up being cpu-bound. 146 */ 147 if (++hmp->check_yield > hammer_yield_check) { 148 hmp->check_yield = 0; 149 lwkt_user_yield(); 150 } 151 152 153 /* 154 * Loop until an element is found or we are done. 155 */ 156 for (;;) { 157 /* 158 * We iterate up the tree and then index over one element 159 * while we are at the last element in the current node. 160 * 161 * If we are at the root of the filesystem, cursor_up 162 * returns ENOENT. 163 * 164 * XXX this could be optimized by storing the information in 165 * the parent reference. 166 * 167 * XXX we can lose the node lock temporarily, this could mess 168 * up our scan. 169 */ 170 ++hammer_stats_btree_iterations; 171 hammer_flusher_clean_loose_ios(hmp); 172 173 if (cursor->index == node->count) { 174 if (hammer_debug_btree) { 175 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n", 176 (long long)cursor->node->node_offset, 177 cursor->index, 178 (long long)(cursor->parent ? cursor->parent->node_offset : -1), 179 cursor->parent_index, 180 curthread); 181 } 182 KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 183 error = hammer_cursor_up(cursor); 184 if (error) 185 break; 186 /* reload stale pointer */ 187 node = cursor->node->ondisk; 188 KKASSERT(cursor->index != node->count); 189 190 /* 191 * If we are reblocking we want to return internal 192 * nodes. Note that the internal node will be 193 * returned multiple times, on each upward recursion 194 * from its children. The caller selects which 195 * revisit it cares about (usually first or last only). 196 */ 197 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 198 cursor->flags |= HAMMER_CURSOR_ATEDISK; 199 return(0); 200 } 201 ++cursor->index; 202 continue; 203 } 204 205 /* 206 * Check internal or leaf element. Determine if the record 207 * at the cursor has gone beyond the end of our range. 208 * 209 * We recurse down through internal nodes. 210 */ 211 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 212 elm = &node->elms[cursor->index]; 213 214 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 215 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 216 if (hammer_debug_btree) { 217 kprintf("BRACKETL %016llx[%d] %016llx %02x " 218 "key=%016llx lo=%02x %d (td=%p)\n", 219 (long long)cursor->node->node_offset, 220 cursor->index, 221 (long long)elm[0].internal.base.obj_id, 222 elm[0].internal.base.rec_type, 223 (long long)elm[0].internal.base.key, 224 elm[0].internal.base.localization, 225 r, 226 curthread 227 ); 228 kprintf("BRACKETR %016llx[%d] %016llx %02x " 229 "key=%016llx lo=%02x %d\n", 230 (long long)cursor->node->node_offset, 231 cursor->index + 1, 232 (long long)elm[1].internal.base.obj_id, 233 elm[1].internal.base.rec_type, 234 (long long)elm[1].internal.base.key, 235 elm[1].internal.base.localization, 236 s 237 ); 238 } 239 240 if (r < 0) { 241 error = ENOENT; 242 break; 243 } 244 if (r == 0 && (cursor->flags & 245 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 246 error = ENOENT; 247 break; 248 } 249 250 /* 251 * Better not be zero 252 */ 253 KKASSERT(elm->internal.subtree_offset != 0); 254 255 if (s <= 0) { 256 /* 257 * If running the mirror filter see if we 258 * can skip one or more entire sub-trees. 259 * If we can we return the internal node 260 * and the caller processes the skipped 261 * range (see mirror_read). 262 */ 263 if (cursor->flags & 264 HAMMER_CURSOR_MIRROR_FILTERED) { 265 if (elm->internal.mirror_tid < 266 cursor->cmirror->mirror_tid) { 267 hammer_cursor_mirror_filter(cursor); 268 return(0); 269 } 270 } 271 } else { 272 /* 273 * Normally it would be impossible for the 274 * cursor to have gotten back-indexed, 275 * but it can happen if a node is deleted 276 * and the cursor is moved to its parent 277 * internal node. ITERATE_CHECK will be set. 278 */ 279 KKASSERT(cursor->flags & 280 HAMMER_CURSOR_ITERATE_CHECK); 281 kprintf("hammer_btree_iterate: " 282 "DEBUG: Caught parent seek " 283 "in internal iteration\n"); 284 } 285 286 error = hammer_cursor_down(cursor); 287 if (error) 288 break; 289 KKASSERT(cursor->index == 0); 290 /* reload stale pointer */ 291 node = cursor->node->ondisk; 292 continue; 293 } else { 294 elm = &node->elms[cursor->index]; 295 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 296 if (hammer_debug_btree) { 297 kprintf("ELEMENT %016llx:%d %c %016llx %02x " 298 "key=%016llx lo=%02x %d\n", 299 (long long)cursor->node->node_offset, 300 cursor->index, 301 (elm[0].leaf.base.btype ? 302 elm[0].leaf.base.btype : '?'), 303 (long long)elm[0].leaf.base.obj_id, 304 elm[0].leaf.base.rec_type, 305 (long long)elm[0].leaf.base.key, 306 elm[0].leaf.base.localization, 307 r 308 ); 309 } 310 if (r < 0) { 311 error = ENOENT; 312 break; 313 } 314 315 /* 316 * We support both end-inclusive and 317 * end-exclusive searches. 318 */ 319 if (r == 0 && 320 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 321 error = ENOENT; 322 break; 323 } 324 325 /* 326 * If ITERATE_CHECK is set an unlocked cursor may 327 * have been moved to a parent and the iterate can 328 * happen upon elements that are not in the requested 329 * range. 330 */ 331 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) { 332 s = hammer_btree_cmp(&cursor->key_beg, 333 &elm->base); 334 if (s > 0) { 335 kprintf("hammer_btree_iterate: " 336 "DEBUG: Caught parent seek " 337 "in leaf iteration\n"); 338 ++cursor->index; 339 continue; 340 } 341 } 342 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 343 344 /* 345 * Return the element 346 */ 347 switch(elm->leaf.base.btype) { 348 case HAMMER_BTREE_TYPE_RECORD: 349 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 350 hammer_btree_chkts(cursor->asof, &elm->base)) { 351 ++cursor->index; 352 continue; 353 } 354 error = 0; 355 break; 356 default: 357 error = EINVAL; 358 break; 359 } 360 if (error) 361 break; 362 } 363 364 /* 365 * Return entry 366 */ 367 if (hammer_debug_btree) { 368 int i = cursor->index; 369 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 370 kprintf("ITERATE %p:%d %016llx %02x " 371 "key=%016llx lo=%02x\n", 372 cursor->node, i, 373 (long long)elm->internal.base.obj_id, 374 elm->internal.base.rec_type, 375 (long long)elm->internal.base.key, 376 elm->internal.base.localization 377 ); 378 } 379 return(0); 380 } 381 return(error); 382 } 383 384 /* 385 * We hit an internal element that we could skip as part of a mirroring 386 * scan. Calculate the entire range being skipped. 387 * 388 * It is important to include any gaps between the parent's left_bound 389 * and the node's left_bound, and same goes for the right side. 390 */ 391 static void 392 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 393 { 394 struct hammer_cmirror *cmirror; 395 hammer_node_ondisk_t ondisk; 396 hammer_btree_elm_t elm; 397 398 ondisk = cursor->node->ondisk; 399 cmirror = cursor->cmirror; 400 401 /* 402 * Calculate the skipped range 403 */ 404 elm = &ondisk->elms[cursor->index]; 405 if (cursor->index == 0) 406 cmirror->skip_beg = *cursor->left_bound; 407 else 408 cmirror->skip_beg = elm->internal.base; 409 while (cursor->index < ondisk->count) { 410 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 411 break; 412 ++cursor->index; 413 ++elm; 414 } 415 if (cursor->index == ondisk->count) 416 cmirror->skip_end = *cursor->right_bound; 417 else 418 cmirror->skip_end = elm->internal.base; 419 420 /* 421 * clip the returned result. 422 */ 423 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 424 cmirror->skip_beg = cursor->key_beg; 425 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 426 cmirror->skip_end = cursor->key_end; 427 } 428 429 /* 430 * Iterate in the reverse direction. This is used by the pruning code to 431 * avoid overlapping records. 432 */ 433 int 434 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 435 { 436 hammer_node_ondisk_t node; 437 hammer_btree_elm_t elm; 438 hammer_mount_t hmp; 439 int error = 0; 440 int r; 441 int s; 442 443 /* mirror filtering not supported for reverse iteration */ 444 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 445 446 /* 447 * Skip past the current record. For various reasons the cursor 448 * may end up set to -1 or set to point at the end of the current 449 * node. These cases must be addressed. 450 */ 451 node = cursor->node->ondisk; 452 if (node == NULL) 453 return(ENOENT); 454 if (cursor->index != -1 && 455 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 456 --cursor->index; 457 } 458 if (cursor->index == cursor->node->ondisk->count) 459 --cursor->index; 460 461 /* 462 * HAMMER can wind up being cpu-bound. 463 */ 464 hmp = cursor->trans->hmp; 465 if (++hmp->check_yield > hammer_yield_check) { 466 hmp->check_yield = 0; 467 lwkt_user_yield(); 468 } 469 470 /* 471 * Loop until an element is found or we are done. 472 */ 473 for (;;) { 474 ++hammer_stats_btree_iterations; 475 hammer_flusher_clean_loose_ios(hmp); 476 477 /* 478 * We iterate up the tree and then index over one element 479 * while we are at the last element in the current node. 480 */ 481 if (cursor->index == -1) { 482 error = hammer_cursor_up(cursor); 483 if (error) { 484 cursor->index = 0; /* sanity */ 485 break; 486 } 487 /* reload stale pointer */ 488 node = cursor->node->ondisk; 489 KKASSERT(cursor->index != node->count); 490 --cursor->index; 491 continue; 492 } 493 494 /* 495 * Check internal or leaf element. Determine if the record 496 * at the cursor has gone beyond the end of our range. 497 * 498 * We recurse down through internal nodes. 499 */ 500 KKASSERT(cursor->index != node->count); 501 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 502 elm = &node->elms[cursor->index]; 503 504 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 505 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 506 if (hammer_debug_btree) { 507 kprintf("BRACKETL %016llx[%d] %016llx %02x " 508 "key=%016llx lo=%02x %d (td=%p)\n", 509 (long long)cursor->node->node_offset, 510 cursor->index, 511 (long long)elm[0].internal.base.obj_id, 512 elm[0].internal.base.rec_type, 513 (long long)elm[0].internal.base.key, 514 elm[0].internal.base.localization, 515 r, 516 curthread 517 ); 518 kprintf("BRACKETR %016llx[%d] %016llx %02x " 519 "key=%016llx lo=%02x %d\n", 520 (long long)cursor->node->node_offset, 521 cursor->index + 1, 522 (long long)elm[1].internal.base.obj_id, 523 elm[1].internal.base.rec_type, 524 (long long)elm[1].internal.base.key, 525 elm[1].internal.base.localization, 526 s 527 ); 528 } 529 530 if (s >= 0) { 531 error = ENOENT; 532 break; 533 } 534 535 /* 536 * It shouldn't be possible to be seeked past key_end, 537 * even if the cursor got moved to a parent. 538 */ 539 KKASSERT(r >= 0); 540 541 /* 542 * Better not be zero 543 */ 544 KKASSERT(elm->internal.subtree_offset != 0); 545 546 error = hammer_cursor_down(cursor); 547 if (error) 548 break; 549 KKASSERT(cursor->index == 0); 550 /* reload stale pointer */ 551 node = cursor->node->ondisk; 552 553 /* this can assign -1 if the leaf was empty */ 554 cursor->index = node->count - 1; 555 continue; 556 } else { 557 elm = &node->elms[cursor->index]; 558 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 559 if (hammer_debug_btree) { 560 kprintf("ELEMENTR %016llx:%d %c %016llx %02x " 561 "key=%016llx lo=%02x %d\n", 562 (long long)cursor->node->node_offset, 563 cursor->index, 564 (elm[0].leaf.base.btype ? 565 elm[0].leaf.base.btype : '?'), 566 (long long)elm[0].leaf.base.obj_id, 567 elm[0].leaf.base.rec_type, 568 (long long)elm[0].leaf.base.key, 569 elm[0].leaf.base.localization, 570 s 571 ); 572 } 573 if (s > 0) { 574 error = ENOENT; 575 break; 576 } 577 578 /* 579 * It shouldn't be possible to be seeked past key_end, 580 * even if the cursor got moved to a parent. 581 */ 582 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 583 584 /* 585 * Return the element 586 */ 587 switch(elm->leaf.base.btype) { 588 case HAMMER_BTREE_TYPE_RECORD: 589 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 590 hammer_btree_chkts(cursor->asof, &elm->base)) { 591 --cursor->index; 592 continue; 593 } 594 error = 0; 595 break; 596 default: 597 error = EINVAL; 598 break; 599 } 600 if (error) 601 break; 602 } 603 604 /* 605 * Return entry 606 */ 607 if (hammer_debug_btree) { 608 int i = cursor->index; 609 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 610 kprintf("ITERATER %p:%d %016llx %02x " 611 "key=%016llx lo=%02x\n", 612 cursor->node, i, 613 (long long)elm->internal.base.obj_id, 614 elm->internal.base.rec_type, 615 (long long)elm->internal.base.key, 616 elm->internal.base.localization 617 ); 618 } 619 return(0); 620 } 621 return(error); 622 } 623 624 /* 625 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 626 * could not be found, EDEADLK if inserting and a retry is needed, and a 627 * fatal error otherwise. When retrying, the caller must terminate the 628 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 629 * 630 * The cursor is suitably positioned for a deletion on success, and suitably 631 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 632 * specified. 633 * 634 * The cursor may begin anywhere, the search will traverse the tree in 635 * either direction to locate the requested element. 636 * 637 * Most of the logic implementing historical searches is handled here. We 638 * do an initial lookup with create_tid set to the asof TID. Due to the 639 * way records are laid out, a backwards iteration may be required if 640 * ENOENT is returned to locate the historical record. Here's the 641 * problem: 642 * 643 * create_tid: 10 15 20 644 * LEAF1 LEAF2 645 * records: (11) (18) 646 * 647 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 648 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 649 * not visible and thus causes ENOENT to be returned. We really need 650 * to check record 11 in LEAF1. If it also fails then the search fails 651 * (e.g. it might represent the range 11-16 and thus still not match our 652 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 653 * further iterations. 654 * 655 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 656 * and the cursor->create_check TID if an iteration might be needed. 657 * In the above example create_check would be set to 14. 658 */ 659 int 660 hammer_btree_lookup(hammer_cursor_t cursor) 661 { 662 int error; 663 664 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 665 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 666 cursor->trans->sync_lock_refs > 0); 667 ++hammer_stats_btree_lookups; 668 if (cursor->flags & HAMMER_CURSOR_ASOF) { 669 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 670 cursor->key_beg.create_tid = cursor->asof; 671 for (;;) { 672 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 673 error = btree_search(cursor, 0); 674 if (error != ENOENT || 675 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 676 /* 677 * Stop if no error. 678 * Stop if error other then ENOENT. 679 * Stop if ENOENT and not special case. 680 */ 681 break; 682 } 683 if (hammer_debug_btree) { 684 kprintf("CREATE_CHECK %016llx\n", 685 (long long)cursor->create_check); 686 } 687 cursor->key_beg.create_tid = cursor->create_check; 688 /* loop */ 689 } 690 } else { 691 error = btree_search(cursor, 0); 692 } 693 if (error == 0) 694 error = hammer_btree_extract(cursor, cursor->flags); 695 return(error); 696 } 697 698 /* 699 * Execute the logic required to start an iteration. The first record 700 * located within the specified range is returned and iteration control 701 * flags are adjusted for successive hammer_btree_iterate() calls. 702 * 703 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 704 * in a loop without worrying about it. Higher-level merged searches will 705 * adjust the flag appropriately. 706 */ 707 int 708 hammer_btree_first(hammer_cursor_t cursor) 709 { 710 int error; 711 712 error = hammer_btree_lookup(cursor); 713 if (error == ENOENT) { 714 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 715 error = hammer_btree_iterate(cursor); 716 } 717 cursor->flags |= HAMMER_CURSOR_ATEDISK; 718 return(error); 719 } 720 721 /* 722 * Similarly but for an iteration in the reverse direction. 723 * 724 * Set ATEDISK when iterating backwards to skip the current entry, 725 * which after an ENOENT lookup will be pointing beyond our end point. 726 * 727 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 728 * in a loop without worrying about it. Higher-level merged searches will 729 * adjust the flag appropriately. 730 */ 731 int 732 hammer_btree_last(hammer_cursor_t cursor) 733 { 734 struct hammer_base_elm save; 735 int error; 736 737 save = cursor->key_beg; 738 cursor->key_beg = cursor->key_end; 739 error = hammer_btree_lookup(cursor); 740 cursor->key_beg = save; 741 if (error == ENOENT || 742 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 743 cursor->flags |= HAMMER_CURSOR_ATEDISK; 744 error = hammer_btree_iterate_reverse(cursor); 745 } 746 cursor->flags |= HAMMER_CURSOR_ATEDISK; 747 return(error); 748 } 749 750 /* 751 * Extract the record and/or data associated with the cursor's current 752 * position. Any prior record or data stored in the cursor is replaced. 753 * 754 * NOTE: All extractions occur at the leaf of the B-Tree. 755 */ 756 int 757 hammer_btree_extract(hammer_cursor_t cursor, int flags) 758 { 759 hammer_node_ondisk_t node; 760 hammer_btree_elm_t elm; 761 hammer_off_t data_off; 762 hammer_mount_t hmp; 763 int32_t data_len; 764 int error; 765 766 /* 767 * Certain types of corruption can result in a NULL node pointer. 768 */ 769 if (cursor->node == NULL) { 770 kprintf("hammer: NULL cursor->node, filesystem might " 771 "have gotten corrupted\n"); 772 return (EINVAL); 773 } 774 775 /* 776 * The case where the data reference resolves to the same buffer 777 * as the record reference must be handled. 778 */ 779 node = cursor->node->ondisk; 780 elm = &node->elms[cursor->index]; 781 cursor->data = NULL; 782 hmp = cursor->node->hmp; 783 784 /* 785 * There is nothing to extract for an internal element. 786 */ 787 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 788 return(EINVAL); 789 790 /* 791 * Only record types have data. 792 */ 793 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 794 cursor->leaf = &elm->leaf; 795 796 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 797 return(0); 798 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 799 return(0); 800 data_off = elm->leaf.data_offset; 801 data_len = elm->leaf.data_len; 802 if (data_off == 0) 803 return(0); 804 805 /* 806 * Load the data 807 */ 808 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 809 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 810 &error, &cursor->data_buffer); 811 812 /* 813 * Mark the data buffer as not being meta-data if it isn't 814 * meta-data (sometimes bulk data is accessed via a volume 815 * block device). 816 */ 817 if (error == 0) { 818 switch(elm->leaf.base.rec_type) { 819 case HAMMER_RECTYPE_DATA: 820 case HAMMER_RECTYPE_DB: 821 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0) 822 break; 823 if (hammer_double_buffer == 0 || 824 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) { 825 hammer_io_notmeta(cursor->data_buffer); 826 } 827 break; 828 default: 829 break; 830 } 831 } 832 833 /* 834 * Deal with CRC errors on the extracted data. 835 */ 836 if (error == 0 && 837 hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) { 838 kprintf("CRC DATA @ %016llx/%d FAILED\n", 839 (long long)elm->leaf.data_offset, elm->leaf.data_len); 840 if (hammer_debug_critical) 841 Debugger("CRC FAILED: DATA"); 842 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 843 error = EDOM; /* less critical (mirroring) */ 844 else 845 error = EIO; /* critical */ 846 } 847 return(error); 848 } 849 850 851 /* 852 * Insert a leaf element into the B-Tree at the current cursor position. 853 * The cursor is positioned such that the element at and beyond the cursor 854 * are shifted to make room for the new record. 855 * 856 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 857 * flag set and that call must return ENOENT before this function can be 858 * called. ENOSPC is returned if there is no room to insert a new record. 859 * 860 * The caller may depend on the cursor's exclusive lock after return to 861 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 862 */ 863 int 864 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 865 int *doprop) 866 { 867 hammer_node_ondisk_t node; 868 int i; 869 int error; 870 871 *doprop = 0; 872 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 873 return(error); 874 ++hammer_stats_btree_inserts; 875 876 /* 877 * Insert the element at the leaf node and update the count in the 878 * parent. It is possible for parent to be NULL, indicating that 879 * the filesystem's ROOT B-Tree node is a leaf itself, which is 880 * possible. The root inode can never be deleted so the leaf should 881 * never be empty. 882 * 883 * Remember that leaf nodes do not have boundaries. 884 */ 885 hammer_modify_node_all(cursor->trans, cursor->node); 886 node = cursor->node->ondisk; 887 i = cursor->index; 888 KKASSERT(elm->base.btype != 0); 889 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 890 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 891 if (i != node->count) { 892 bcopy(&node->elms[i], &node->elms[i+1], 893 (node->count - i) * sizeof(*elm)); 894 } 895 node->elms[i].leaf = *elm; 896 ++node->count; 897 hammer_cursor_inserted_element(cursor->node, i); 898 899 /* 900 * Update the leaf node's aggregate mirror_tid for mirroring 901 * support. 902 */ 903 if (node->mirror_tid < elm->base.delete_tid) { 904 node->mirror_tid = elm->base.delete_tid; 905 *doprop = 1; 906 } 907 if (node->mirror_tid < elm->base.create_tid) { 908 node->mirror_tid = elm->base.create_tid; 909 *doprop = 1; 910 } 911 hammer_modify_node_done(cursor->node); 912 913 /* 914 * Debugging sanity checks. 915 */ 916 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 917 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 918 if (i) { 919 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 920 } 921 if (i != node->count - 1) 922 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 923 924 return(0); 925 } 926 927 /* 928 * Delete a record from the B-Tree at the current cursor position. 929 * The cursor is positioned such that the current element is the one 930 * to be deleted. 931 * 932 * On return the cursor will be positioned after the deleted element and 933 * MAY point to an internal node. It will be suitable for the continuation 934 * of an iteration but not for an insertion or deletion. 935 * 936 * Deletions will attempt to partially rebalance the B-Tree in an upward 937 * direction, but will terminate rather then deadlock. Empty internal nodes 938 * are never allowed by a deletion which deadlocks may end up giving us an 939 * empty leaf. The pruner will clean up and rebalance the tree. 940 * 941 * This function can return EDEADLK, requiring the caller to retry the 942 * operation after clearing the deadlock. 943 * 944 * This function will store the number of deleted btree nodes in *ndelete 945 * if ndelete is not NULL. 946 */ 947 int 948 hammer_btree_delete(hammer_cursor_t cursor, int *ndelete) 949 { 950 hammer_node_ondisk_t ondisk; 951 hammer_node_t node; 952 hammer_node_t parent __debugvar; 953 int error; 954 int i; 955 956 KKASSERT (cursor->trans->sync_lock_refs > 0); 957 if (ndelete) 958 *ndelete = 0; 959 if ((error = hammer_cursor_upgrade(cursor)) != 0) 960 return(error); 961 ++hammer_stats_btree_deletes; 962 963 /* 964 * Delete the element from the leaf node. 965 * 966 * Remember that leaf nodes do not have boundaries. 967 */ 968 node = cursor->node; 969 ondisk = node->ondisk; 970 i = cursor->index; 971 972 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 973 KKASSERT(i >= 0 && i < ondisk->count); 974 hammer_modify_node_all(cursor->trans, node); 975 if (i + 1 != ondisk->count) { 976 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 977 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 978 } 979 --ondisk->count; 980 hammer_modify_node_done(node); 981 hammer_cursor_deleted_element(node, i); 982 983 /* 984 * Validate local parent 985 */ 986 if (ondisk->parent) { 987 parent = cursor->parent; 988 989 KKASSERT(parent != NULL); 990 KKASSERT(parent->node_offset == ondisk->parent); 991 } 992 993 /* 994 * If the leaf becomes empty it must be detached from the parent, 995 * potentially recursing through to the filesystem root. 996 * 997 * This may reposition the cursor at one of the parent's of the 998 * current node. 999 * 1000 * Ignore deadlock errors, that simply means that btree_remove 1001 * was unable to recurse and had to leave us with an empty leaf. 1002 */ 1003 KKASSERT(cursor->index <= ondisk->count); 1004 if (ondisk->count == 0) { 1005 error = btree_remove(cursor, ndelete); 1006 if (error == EDEADLK) 1007 error = 0; 1008 } else { 1009 error = 0; 1010 } 1011 KKASSERT(cursor->parent == NULL || 1012 cursor->parent_index < cursor->parent->ondisk->count); 1013 return(error); 1014 } 1015 1016 /* 1017 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE 1018 * 1019 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 1020 * 1021 * The search can begin ANYWHERE in the B-Tree. As a first step the search 1022 * iterates up the tree as necessary to properly position itself prior to 1023 * actually doing the sarch. 1024 * 1025 * INSERTIONS: The search will split full nodes and leaves on its way down 1026 * and guarentee that the leaf it ends up on is not full. If we run out 1027 * of space the search continues to the leaf, but ENOSPC is returned. 1028 * 1029 * The search is only guarenteed to end up on a leaf if an error code of 0 1030 * is returned, or if inserting and an error code of ENOENT is returned. 1031 * Otherwise it can stop at an internal node. On success a search returns 1032 * a leaf node. 1033 * 1034 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 1035 * filesystem, and it is not simple code. Please note the following facts: 1036 * 1037 * - Internal node recursions have a boundary on the left AND right. The 1038 * right boundary is non-inclusive. The create_tid is a generic part 1039 * of the key for internal nodes. 1040 * 1041 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 1042 * historical search. ASOF and INSERT are mutually exclusive. When 1043 * doing an as-of lookup btree_search() checks for a right-edge boundary 1044 * case. If while recursing down the left-edge differs from the key 1045 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 1046 * with cursor->create_check. This is used by btree_lookup() to iterate. 1047 * The iteration backwards because as-of searches can wind up going 1048 * down the wrong branch of the B-Tree. 1049 */ 1050 static 1051 int 1052 btree_search(hammer_cursor_t cursor, int flags) 1053 { 1054 hammer_node_ondisk_t node; 1055 hammer_btree_elm_t elm; 1056 int error; 1057 int enospc = 0; 1058 int i; 1059 int r; 1060 int s; 1061 1062 flags |= cursor->flags; 1063 ++hammer_stats_btree_searches; 1064 1065 if (hammer_debug_btree) { 1066 kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td=%p)\n", 1067 (long long)cursor->node->node_offset, 1068 cursor->index, 1069 (long long)cursor->key_beg.obj_id, 1070 cursor->key_beg.rec_type, 1071 (long long)cursor->key_beg.key, 1072 (long long)cursor->key_beg.create_tid, 1073 cursor->key_beg.localization, 1074 curthread 1075 ); 1076 if (cursor->parent) 1077 kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n", 1078 (long long)cursor->parent->node_offset, 1079 cursor->parent_index, 1080 (long long)cursor->left_bound->obj_id, 1081 (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id, 1082 (long long)cursor->right_bound->obj_id, 1083 (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id, 1084 cursor->left_bound, 1085 &cursor->parent->ondisk->elms[cursor->parent_index], 1086 cursor->right_bound, 1087 &cursor->parent->ondisk->elms[cursor->parent_index+1] 1088 ); 1089 } 1090 1091 /* 1092 * Move our cursor up the tree until we find a node whos range covers 1093 * the key we are trying to locate. 1094 * 1095 * The left bound is inclusive, the right bound is non-inclusive. 1096 * It is ok to cursor up too far. 1097 */ 1098 for (;;) { 1099 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 1100 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 1101 if (r >= 0 && s < 0) 1102 break; 1103 KKASSERT(cursor->parent); 1104 ++hammer_stats_btree_iterations; 1105 error = hammer_cursor_up(cursor); 1106 if (error) 1107 goto done; 1108 } 1109 1110 /* 1111 * The delete-checks below are based on node, not parent. Set the 1112 * initial delete-check based on the parent. 1113 */ 1114 if (r == 1) { 1115 KKASSERT(cursor->left_bound->create_tid != 1); 1116 cursor->create_check = cursor->left_bound->create_tid - 1; 1117 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1118 } 1119 1120 /* 1121 * We better have ended up with a node somewhere. 1122 */ 1123 KKASSERT(cursor->node != NULL); 1124 1125 /* 1126 * If we are inserting we can't start at a full node if the parent 1127 * is also full (because there is no way to split the node), 1128 * continue running up the tree until the requirement is satisfied 1129 * or we hit the root of the filesystem. 1130 * 1131 * (If inserting we aren't doing an as-of search so we don't have 1132 * to worry about create_check). 1133 */ 1134 while (flags & HAMMER_CURSOR_INSERT) { 1135 if (btree_node_is_full(cursor->node->ondisk) == 0) 1136 break; 1137 if (cursor->node->ondisk->parent == 0 || 1138 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1139 break; 1140 } 1141 ++hammer_stats_btree_iterations; 1142 error = hammer_cursor_up(cursor); 1143 /* node may have become stale */ 1144 if (error) 1145 goto done; 1146 } 1147 1148 /* 1149 * Push down through internal nodes to locate the requested key. 1150 */ 1151 node = cursor->node->ondisk; 1152 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1153 /* 1154 * Scan the node to find the subtree index to push down into. 1155 * We go one-past, then back-up. 1156 * 1157 * We must proactively remove deleted elements which may 1158 * have been left over from a deadlocked btree_remove(). 1159 * 1160 * The left and right boundaries are included in the loop 1161 * in order to detect edge cases. 1162 * 1163 * If the separator only differs by create_tid (r == 1) 1164 * and we are doing an as-of search, we may end up going 1165 * down a branch to the left of the one containing the 1166 * desired key. This requires numerous special cases. 1167 */ 1168 ++hammer_stats_btree_iterations; 1169 if (hammer_debug_btree) { 1170 kprintf("SEARCH-I %016llx count=%d\n", 1171 (long long)cursor->node->node_offset, 1172 node->count); 1173 } 1174 1175 /* 1176 * Try to shortcut the search before dropping into the 1177 * linear loop. Locate the first node where r <= 1. 1178 */ 1179 i = hammer_btree_search_node(&cursor->key_beg, node); 1180 while (i <= node->count) { 1181 ++hammer_stats_btree_elements; 1182 elm = &node->elms[i]; 1183 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1184 if (hammer_debug_btree > 2) { 1185 kprintf(" IELM %p %d r=%d\n", 1186 &node->elms[i], i, r); 1187 } 1188 if (r < 0) 1189 break; 1190 if (r == 1) { 1191 KKASSERT(elm->base.create_tid != 1); 1192 cursor->create_check = elm->base.create_tid - 1; 1193 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1194 } 1195 ++i; 1196 } 1197 if (hammer_debug_btree) { 1198 kprintf("SEARCH-I preI=%d/%d r=%d\n", 1199 i, node->count, r); 1200 } 1201 1202 /* 1203 * These cases occur when the parent's idea of the boundary 1204 * is wider then the child's idea of the boundary, and 1205 * require special handling. If not inserting we can 1206 * terminate the search early for these cases but the 1207 * child's boundaries cannot be unconditionally modified. 1208 */ 1209 if (i == 0) { 1210 /* 1211 * If i == 0 the search terminated to the LEFT of the 1212 * left_boundary but to the RIGHT of the parent's left 1213 * boundary. 1214 */ 1215 u_int8_t save; 1216 1217 elm = &node->elms[0]; 1218 1219 /* 1220 * If we aren't inserting we can stop here. 1221 */ 1222 if ((flags & (HAMMER_CURSOR_INSERT | 1223 HAMMER_CURSOR_PRUNING)) == 0) { 1224 cursor->index = 0; 1225 return(ENOENT); 1226 } 1227 1228 /* 1229 * Correct a left-hand boundary mismatch. 1230 * 1231 * We can only do this if we can upgrade the lock, 1232 * and synchronized as a background cursor (i.e. 1233 * inserting or pruning). 1234 * 1235 * WARNING: We can only do this if inserting, i.e. 1236 * we are running on the backend. 1237 */ 1238 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1239 return(error); 1240 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1241 hammer_modify_node_field(cursor->trans, cursor->node, 1242 elms[0]); 1243 save = node->elms[0].base.btype; 1244 node->elms[0].base = *cursor->left_bound; 1245 node->elms[0].base.btype = save; 1246 hammer_modify_node_done(cursor->node); 1247 } else if (i == node->count + 1) { 1248 /* 1249 * If i == node->count + 1 the search terminated to 1250 * the RIGHT of the right boundary but to the LEFT 1251 * of the parent's right boundary. If we aren't 1252 * inserting we can stop here. 1253 * 1254 * Note that the last element in this case is 1255 * elms[i-2] prior to adjustments to 'i'. 1256 */ 1257 --i; 1258 if ((flags & (HAMMER_CURSOR_INSERT | 1259 HAMMER_CURSOR_PRUNING)) == 0) { 1260 cursor->index = i; 1261 return (ENOENT); 1262 } 1263 1264 /* 1265 * Correct a right-hand boundary mismatch. 1266 * (actual push-down record is i-2 prior to 1267 * adjustments to i). 1268 * 1269 * We can only do this if we can upgrade the lock, 1270 * and synchronized as a background cursor (i.e. 1271 * inserting or pruning). 1272 * 1273 * WARNING: We can only do this if inserting, i.e. 1274 * we are running on the backend. 1275 */ 1276 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1277 return(error); 1278 elm = &node->elms[i]; 1279 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1280 hammer_modify_node(cursor->trans, cursor->node, 1281 &elm->base, sizeof(elm->base)); 1282 elm->base = *cursor->right_bound; 1283 hammer_modify_node_done(cursor->node); 1284 --i; 1285 } else { 1286 /* 1287 * The push-down index is now i - 1. If we had 1288 * terminated on the right boundary this will point 1289 * us at the last element. 1290 */ 1291 --i; 1292 } 1293 cursor->index = i; 1294 elm = &node->elms[i]; 1295 1296 if (hammer_debug_btree) { 1297 kprintf("RESULT-I %016llx[%d] %016llx %02x " 1298 "key=%016llx cre=%016llx lo=%02x\n", 1299 (long long)cursor->node->node_offset, 1300 i, 1301 (long long)elm->internal.base.obj_id, 1302 elm->internal.base.rec_type, 1303 (long long)elm->internal.base.key, 1304 (long long)elm->internal.base.create_tid, 1305 elm->internal.base.localization 1306 ); 1307 } 1308 1309 /* 1310 * We better have a valid subtree offset. 1311 */ 1312 KKASSERT(elm->internal.subtree_offset != 0); 1313 1314 /* 1315 * Handle insertion and deletion requirements. 1316 * 1317 * If inserting split full nodes. The split code will 1318 * adjust cursor->node and cursor->index if the current 1319 * index winds up in the new node. 1320 * 1321 * If inserting and a left or right edge case was detected, 1322 * we cannot correct the left or right boundary and must 1323 * prepend and append an empty leaf node in order to make 1324 * the boundary correction. 1325 * 1326 * If we run out of space we set enospc but continue on 1327 * to a leaf. 1328 */ 1329 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1330 if (btree_node_is_full(node)) { 1331 error = btree_split_internal(cursor); 1332 if (error) { 1333 if (error != ENOSPC) 1334 goto done; 1335 enospc = 1; 1336 } 1337 /* 1338 * reload stale pointers 1339 */ 1340 i = cursor->index; 1341 node = cursor->node->ondisk; 1342 } 1343 } 1344 1345 /* 1346 * Push down (push into new node, existing node becomes 1347 * the parent) and continue the search. 1348 */ 1349 error = hammer_cursor_down(cursor); 1350 /* node may have become stale */ 1351 if (error) 1352 goto done; 1353 node = cursor->node->ondisk; 1354 } 1355 1356 /* 1357 * We are at a leaf, do a linear search of the key array. 1358 * 1359 * On success the index is set to the matching element and 0 1360 * is returned. 1361 * 1362 * On failure the index is set to the insertion point and ENOENT 1363 * is returned. 1364 * 1365 * Boundaries are not stored in leaf nodes, so the index can wind 1366 * up to the left of element 0 (index == 0) or past the end of 1367 * the array (index == node->count). It is also possible that the 1368 * leaf might be empty. 1369 */ 1370 ++hammer_stats_btree_iterations; 1371 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1372 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1373 if (hammer_debug_btree) { 1374 kprintf("SEARCH-L %016llx count=%d\n", 1375 (long long)cursor->node->node_offset, 1376 node->count); 1377 } 1378 1379 /* 1380 * Try to shortcut the search before dropping into the 1381 * linear loop. Locate the first node where r <= 1. 1382 */ 1383 i = hammer_btree_search_node(&cursor->key_beg, node); 1384 while (i < node->count) { 1385 ++hammer_stats_btree_elements; 1386 elm = &node->elms[i]; 1387 1388 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1389 1390 if (hammer_debug_btree > 1) 1391 kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r); 1392 1393 /* 1394 * We are at a record element. Stop if we've flipped past 1395 * key_beg, not counting the create_tid test. Allow the 1396 * r == 1 case (key_beg > element but differs only by its 1397 * create_tid) to fall through to the AS-OF check. 1398 */ 1399 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1400 1401 if (r < 0) 1402 goto failed; 1403 if (r > 1) { 1404 ++i; 1405 continue; 1406 } 1407 1408 /* 1409 * Check our as-of timestamp against the element. 1410 */ 1411 if (flags & HAMMER_CURSOR_ASOF) { 1412 if (hammer_btree_chkts(cursor->asof, 1413 &node->elms[i].base) != 0) { 1414 ++i; 1415 continue; 1416 } 1417 /* success */ 1418 } else { 1419 if (r > 0) { /* can only be +1 */ 1420 ++i; 1421 continue; 1422 } 1423 /* success */ 1424 } 1425 cursor->index = i; 1426 error = 0; 1427 if (hammer_debug_btree) { 1428 kprintf("RESULT-L %016llx[%d] (SUCCESS)\n", 1429 (long long)cursor->node->node_offset, i); 1430 } 1431 goto done; 1432 } 1433 1434 /* 1435 * The search of the leaf node failed. i is the insertion point. 1436 */ 1437 failed: 1438 if (hammer_debug_btree) { 1439 kprintf("RESULT-L %016llx[%d] (FAILED)\n", 1440 (long long)cursor->node->node_offset, i); 1441 } 1442 1443 /* 1444 * No exact match was found, i is now at the insertion point. 1445 * 1446 * If inserting split a full leaf before returning. This 1447 * may have the side effect of adjusting cursor->node and 1448 * cursor->index. 1449 */ 1450 cursor->index = i; 1451 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1452 btree_node_is_full(node)) { 1453 error = btree_split_leaf(cursor); 1454 if (error) { 1455 if (error != ENOSPC) 1456 goto done; 1457 enospc = 1; 1458 } 1459 /* 1460 * reload stale pointers 1461 */ 1462 /* NOT USED 1463 i = cursor->index; 1464 node = &cursor->node->internal; 1465 */ 1466 } 1467 1468 /* 1469 * We reached a leaf but did not find the key we were looking for. 1470 * If this is an insert we will be properly positioned for an insert 1471 * (ENOENT) or unable to insert (ENOSPC). 1472 */ 1473 error = enospc ? ENOSPC : ENOENT; 1474 done: 1475 return(error); 1476 } 1477 1478 /* 1479 * Heuristical search for the first element whos comparison is <= 1. May 1480 * return an index whos compare result is > 1 but may only return an index 1481 * whos compare result is <= 1 if it is the first element with that result. 1482 */ 1483 int 1484 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1485 { 1486 int b; 1487 int s; 1488 int i; 1489 int r; 1490 1491 /* 1492 * Don't bother if the node does not have very many elements 1493 */ 1494 b = 0; 1495 s = node->count; 1496 while (s - b > 4) { 1497 i = b + (s - b) / 2; 1498 ++hammer_stats_btree_elements; 1499 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1500 if (r <= 1) { 1501 s = i; 1502 } else { 1503 b = i; 1504 } 1505 } 1506 return(b); 1507 } 1508 1509 1510 /************************************************************************ 1511 * SPLITTING AND MERGING * 1512 ************************************************************************ 1513 * 1514 * These routines do all the dirty work required to split and merge nodes. 1515 */ 1516 1517 /* 1518 * Split an internal node into two nodes and move the separator at the split 1519 * point to the parent. 1520 * 1521 * (cursor->node, cursor->index) indicates the element the caller intends 1522 * to push into. We will adjust node and index if that element winds 1523 * up in the split node. 1524 * 1525 * If we are at the root of the filesystem a new root must be created with 1526 * two elements, one pointing to the original root and one pointing to the 1527 * newly allocated split node. 1528 */ 1529 static 1530 int 1531 btree_split_internal(hammer_cursor_t cursor) 1532 { 1533 hammer_node_ondisk_t ondisk; 1534 hammer_node_t node; 1535 hammer_node_t parent; 1536 hammer_node_t new_node; 1537 hammer_btree_elm_t elm; 1538 hammer_btree_elm_t parent_elm; 1539 struct hammer_node_lock lockroot; 1540 hammer_mount_t hmp = cursor->trans->hmp; 1541 int parent_index; 1542 int made_root; 1543 int split; 1544 int error; 1545 int i; 1546 const int esize = sizeof(*elm); 1547 1548 hammer_node_lock_init(&lockroot, cursor->node); 1549 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL); 1550 if (error) 1551 goto done; 1552 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1553 goto done; 1554 ++hammer_stats_btree_splits; 1555 1556 /* 1557 * Calculate the split point. If the insertion point is at the 1558 * end of the leaf we adjust the split point significantly to the 1559 * right to try to optimize node fill and flag it. If we hit 1560 * that same leaf again our heuristic failed and we don't try 1561 * to optimize node fill (it could lead to a degenerate case). 1562 */ 1563 node = cursor->node; 1564 ondisk = node->ondisk; 1565 KKASSERT(ondisk->count > 4); 1566 if (cursor->index == ondisk->count && 1567 (node->flags & HAMMER_NODE_NONLINEAR) == 0) { 1568 split = (ondisk->count + 1) * 3 / 4; 1569 node->flags |= HAMMER_NODE_NONLINEAR; 1570 } else { 1571 /* 1572 * We are splitting but elms[split] will be promoted to 1573 * the parent, leaving the right hand node with one less 1574 * element. If the insertion point will be on the 1575 * left-hand side adjust the split point to give the 1576 * right hand side one additional node. 1577 */ 1578 split = (ondisk->count + 1) / 2; 1579 if (cursor->index <= split) 1580 --split; 1581 } 1582 1583 /* 1584 * If we are at the root of the filesystem, create a new root node 1585 * with 1 element and split normally. Avoid making major 1586 * modifications until we know the whole operation will work. 1587 */ 1588 if (ondisk->parent == 0) { 1589 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1590 if (parent == NULL) 1591 goto done; 1592 hammer_lock_ex(&parent->lock); 1593 hammer_modify_node_noundo(cursor->trans, parent); 1594 ondisk = parent->ondisk; 1595 ondisk->count = 1; 1596 ondisk->parent = 0; 1597 ondisk->mirror_tid = node->ondisk->mirror_tid; 1598 ondisk->signature = node->ondisk->signature; 1599 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1600 ondisk->elms[0].base = hmp->root_btree_beg; 1601 ondisk->elms[0].base.btype = node->ondisk->type; 1602 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1603 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1604 ondisk->elms[1].base = hmp->root_btree_end; 1605 hammer_modify_node_done(parent); 1606 made_root = 1; 1607 parent_index = 0; /* index of current node in parent */ 1608 } else { 1609 made_root = 0; 1610 parent = cursor->parent; 1611 parent_index = cursor->parent_index; 1612 } 1613 1614 /* 1615 * Split node into new_node at the split point. 1616 * 1617 * B O O O P N N B <-- P = node->elms[split] (index 4) 1618 * 0 1 2 3 4 5 6 <-- subtree indices 1619 * 1620 * x x P x x 1621 * s S S s 1622 * / \ 1623 * B O O O B B N N B <--- inner boundary points are 'P' 1624 * 0 1 2 3 4 5 6 1625 */ 1626 new_node = hammer_alloc_btree(cursor->trans, 0, &error); 1627 if (new_node == NULL) { 1628 if (made_root) { 1629 hammer_unlock(&parent->lock); 1630 hammer_delete_node(cursor->trans, parent); 1631 hammer_rel_node(parent); 1632 } 1633 goto done; 1634 } 1635 hammer_lock_ex(&new_node->lock); 1636 1637 /* 1638 * Create the new node. P becomes the left-hand boundary in the 1639 * new node. Copy the right-hand boundary as well. 1640 * 1641 * elm is the new separator. 1642 */ 1643 hammer_modify_node_noundo(cursor->trans, new_node); 1644 hammer_modify_node_all(cursor->trans, node); 1645 ondisk = node->ondisk; 1646 elm = &ondisk->elms[split]; 1647 bcopy(elm, &new_node->ondisk->elms[0], 1648 (ondisk->count - split + 1) * esize); 1649 new_node->ondisk->count = ondisk->count - split; 1650 new_node->ondisk->parent = parent->node_offset; 1651 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1652 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1653 KKASSERT(ondisk->type == new_node->ondisk->type); 1654 hammer_cursor_split_node(node, new_node, split); 1655 1656 /* 1657 * Cleanup the original node. Elm (P) becomes the new boundary, 1658 * its subtree_offset was moved to the new node. If we had created 1659 * a new root its parent pointer may have changed. 1660 */ 1661 elm->internal.subtree_offset = 0; 1662 ondisk->count = split; 1663 1664 /* 1665 * Insert the separator into the parent, fixup the parent's 1666 * reference to the original node, and reference the new node. 1667 * The separator is P. 1668 * 1669 * Remember that base.count does not include the right-hand boundary. 1670 */ 1671 hammer_modify_node_all(cursor->trans, parent); 1672 ondisk = parent->ondisk; 1673 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1674 parent_elm = &ondisk->elms[parent_index+1]; 1675 bcopy(parent_elm, parent_elm + 1, 1676 (ondisk->count - parent_index) * esize); 1677 parent_elm->internal.base = elm->base; /* separator P */ 1678 parent_elm->internal.base.btype = new_node->ondisk->type; 1679 parent_elm->internal.subtree_offset = new_node->node_offset; 1680 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1681 ++ondisk->count; 1682 hammer_modify_node_done(parent); 1683 hammer_cursor_inserted_element(parent, parent_index + 1); 1684 1685 /* 1686 * The children of new_node need their parent pointer set to new_node. 1687 * The children have already been locked by 1688 * hammer_btree_lock_children(). 1689 */ 1690 for (i = 0; i < new_node->ondisk->count; ++i) { 1691 elm = &new_node->ondisk->elms[i]; 1692 error = btree_set_parent(cursor->trans, new_node, elm); 1693 if (error) { 1694 panic("btree_split_internal: btree-fixup problem"); 1695 } 1696 } 1697 hammer_modify_node_done(new_node); 1698 1699 /* 1700 * The filesystem's root B-Tree pointer may have to be updated. 1701 */ 1702 if (made_root) { 1703 hammer_volume_t volume; 1704 1705 volume = hammer_get_root_volume(hmp, &error); 1706 KKASSERT(error == 0); 1707 1708 hammer_modify_volume_field(cursor->trans, volume, 1709 vol0_btree_root); 1710 volume->ondisk->vol0_btree_root = parent->node_offset; 1711 hammer_modify_volume_done(volume); 1712 node->ondisk->parent = parent->node_offset; 1713 /* node->ondisk->signature = 0; */ 1714 if (cursor->parent) { 1715 hammer_unlock(&cursor->parent->lock); 1716 hammer_rel_node(cursor->parent); 1717 } 1718 cursor->parent = parent; /* lock'd and ref'd */ 1719 hammer_rel_volume(volume, 0); 1720 } 1721 hammer_modify_node_done(node); 1722 1723 /* 1724 * Ok, now adjust the cursor depending on which element the original 1725 * index was pointing at. If we are >= the split point the push node 1726 * is now in the new node. 1727 * 1728 * NOTE: If we are at the split point itself we cannot stay with the 1729 * original node because the push index will point at the right-hand 1730 * boundary, which is illegal. 1731 * 1732 * NOTE: The cursor's parent or parent_index must be adjusted for 1733 * the case where a new parent (new root) was created, and the case 1734 * where the cursor is now pointing at the split node. 1735 */ 1736 if (cursor->index >= split) { 1737 cursor->parent_index = parent_index + 1; 1738 cursor->index -= split; 1739 hammer_unlock(&cursor->node->lock); 1740 hammer_rel_node(cursor->node); 1741 cursor->node = new_node; /* locked and ref'd */ 1742 } else { 1743 cursor->parent_index = parent_index; 1744 hammer_unlock(&new_node->lock); 1745 hammer_rel_node(new_node); 1746 } 1747 1748 /* 1749 * Fixup left and right bounds 1750 */ 1751 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1752 cursor->left_bound = &parent_elm[0].internal.base; 1753 cursor->right_bound = &parent_elm[1].internal.base; 1754 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1755 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1756 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1757 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1758 1759 done: 1760 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL); 1761 hammer_cursor_downgrade(cursor); 1762 return (error); 1763 } 1764 1765 /* 1766 * Same as the above, but splits a full leaf node. 1767 */ 1768 static 1769 int 1770 btree_split_leaf(hammer_cursor_t cursor) 1771 { 1772 hammer_node_ondisk_t ondisk; 1773 hammer_node_t parent; 1774 hammer_node_t leaf; 1775 hammer_mount_t hmp; 1776 hammer_node_t new_leaf; 1777 hammer_btree_elm_t elm; 1778 hammer_btree_elm_t parent_elm; 1779 hammer_base_elm_t mid_boundary; 1780 int parent_index; 1781 int made_root; 1782 int split; 1783 int error; 1784 const size_t esize = sizeof(*elm); 1785 1786 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1787 return(error); 1788 ++hammer_stats_btree_splits; 1789 1790 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1791 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1792 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1793 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1794 1795 /* 1796 * Calculate the split point. If the insertion point is at the 1797 * end of the leaf we adjust the split point significantly to the 1798 * right to try to optimize node fill and flag it. If we hit 1799 * that same leaf again our heuristic failed and we don't try 1800 * to optimize node fill (it could lead to a degenerate case). 1801 */ 1802 leaf = cursor->node; 1803 ondisk = leaf->ondisk; 1804 KKASSERT(ondisk->count > 4); 1805 if (cursor->index == ondisk->count && 1806 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) { 1807 split = (ondisk->count + 1) * 3 / 4; 1808 leaf->flags |= HAMMER_NODE_NONLINEAR; 1809 } else { 1810 split = (ondisk->count + 1) / 2; 1811 } 1812 1813 #if 0 1814 /* 1815 * If the insertion point is at the split point shift the 1816 * split point left so we don't have to worry about 1817 */ 1818 if (cursor->index == split) 1819 --split; 1820 #endif 1821 KKASSERT(split > 0 && split < ondisk->count); 1822 1823 error = 0; 1824 hmp = leaf->hmp; 1825 1826 elm = &ondisk->elms[split]; 1827 1828 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1829 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1830 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1831 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1832 1833 /* 1834 * If we are at the root of the tree, create a new root node with 1835 * 1 element and split normally. Avoid making major modifications 1836 * until we know the whole operation will work. 1837 */ 1838 if (ondisk->parent == 0) { 1839 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1840 if (parent == NULL) 1841 goto done; 1842 hammer_lock_ex(&parent->lock); 1843 hammer_modify_node_noundo(cursor->trans, parent); 1844 ondisk = parent->ondisk; 1845 ondisk->count = 1; 1846 ondisk->parent = 0; 1847 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1848 ondisk->signature = leaf->ondisk->signature; 1849 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1850 ondisk->elms[0].base = hmp->root_btree_beg; 1851 ondisk->elms[0].base.btype = leaf->ondisk->type; 1852 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1853 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid; 1854 ondisk->elms[1].base = hmp->root_btree_end; 1855 hammer_modify_node_done(parent); 1856 made_root = 1; 1857 parent_index = 0; /* insertion point in parent */ 1858 } else { 1859 made_root = 0; 1860 parent = cursor->parent; 1861 parent_index = cursor->parent_index; 1862 } 1863 1864 /* 1865 * Split leaf into new_leaf at the split point. Select a separator 1866 * value in-between the two leafs but with a bent towards the right 1867 * leaf since comparisons use an 'elm >= separator' inequality. 1868 * 1869 * L L L L L L L L 1870 * 1871 * x x P x x 1872 * s S S s 1873 * / \ 1874 * L L L L L L L L 1875 */ 1876 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error); 1877 if (new_leaf == NULL) { 1878 if (made_root) { 1879 hammer_unlock(&parent->lock); 1880 hammer_delete_node(cursor->trans, parent); 1881 hammer_rel_node(parent); 1882 } 1883 goto done; 1884 } 1885 hammer_lock_ex(&new_leaf->lock); 1886 1887 /* 1888 * Create the new node and copy the leaf elements from the split 1889 * point on to the new node. 1890 */ 1891 hammer_modify_node_all(cursor->trans, leaf); 1892 hammer_modify_node_noundo(cursor->trans, new_leaf); 1893 ondisk = leaf->ondisk; 1894 elm = &ondisk->elms[split]; 1895 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1896 new_leaf->ondisk->count = ondisk->count - split; 1897 new_leaf->ondisk->parent = parent->node_offset; 1898 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1899 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1900 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1901 hammer_modify_node_done(new_leaf); 1902 hammer_cursor_split_node(leaf, new_leaf, split); 1903 1904 /* 1905 * Cleanup the original node. Because this is a leaf node and 1906 * leaf nodes do not have a right-hand boundary, there 1907 * aren't any special edge cases to clean up. We just fixup the 1908 * count. 1909 */ 1910 ondisk->count = split; 1911 1912 /* 1913 * Insert the separator into the parent, fixup the parent's 1914 * reference to the original node, and reference the new node. 1915 * The separator is P. 1916 * 1917 * Remember that base.count does not include the right-hand boundary. 1918 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1919 */ 1920 hammer_modify_node_all(cursor->trans, parent); 1921 ondisk = parent->ondisk; 1922 KKASSERT(split != 0); 1923 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1924 parent_elm = &ondisk->elms[parent_index+1]; 1925 bcopy(parent_elm, parent_elm + 1, 1926 (ondisk->count - parent_index) * esize); 1927 1928 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1929 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1930 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1931 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1932 mid_boundary = &parent_elm->base; 1933 ++ondisk->count; 1934 hammer_modify_node_done(parent); 1935 hammer_cursor_inserted_element(parent, parent_index + 1); 1936 1937 /* 1938 * The filesystem's root B-Tree pointer may have to be updated. 1939 */ 1940 if (made_root) { 1941 hammer_volume_t volume; 1942 1943 volume = hammer_get_root_volume(hmp, &error); 1944 KKASSERT(error == 0); 1945 1946 hammer_modify_volume_field(cursor->trans, volume, 1947 vol0_btree_root); 1948 volume->ondisk->vol0_btree_root = parent->node_offset; 1949 hammer_modify_volume_done(volume); 1950 leaf->ondisk->parent = parent->node_offset; 1951 /* leaf->ondisk->signature = 0; */ 1952 if (cursor->parent) { 1953 hammer_unlock(&cursor->parent->lock); 1954 hammer_rel_node(cursor->parent); 1955 } 1956 cursor->parent = parent; /* lock'd and ref'd */ 1957 hammer_rel_volume(volume, 0); 1958 } 1959 hammer_modify_node_done(leaf); 1960 1961 /* 1962 * Ok, now adjust the cursor depending on which element the original 1963 * index was pointing at. If we are >= the split point the push node 1964 * is now in the new node. 1965 * 1966 * NOTE: If we are at the split point itself we need to select the 1967 * old or new node based on where key_beg's insertion point will be. 1968 * If we pick the wrong side the inserted element will wind up in 1969 * the wrong leaf node and outside that node's bounds. 1970 */ 1971 if (cursor->index > split || 1972 (cursor->index == split && 1973 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1974 cursor->parent_index = parent_index + 1; 1975 cursor->index -= split; 1976 hammer_unlock(&cursor->node->lock); 1977 hammer_rel_node(cursor->node); 1978 cursor->node = new_leaf; 1979 } else { 1980 cursor->parent_index = parent_index; 1981 hammer_unlock(&new_leaf->lock); 1982 hammer_rel_node(new_leaf); 1983 } 1984 1985 /* 1986 * Fixup left and right bounds 1987 */ 1988 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1989 cursor->left_bound = &parent_elm[0].internal.base; 1990 cursor->right_bound = &parent_elm[1].internal.base; 1991 1992 /* 1993 * Assert that the bounds are correct. 1994 */ 1995 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1996 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1997 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1998 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1999 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 2000 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 2001 2002 done: 2003 hammer_cursor_downgrade(cursor); 2004 return (error); 2005 } 2006 2007 #if 0 2008 2009 /* 2010 * Recursively correct the right-hand boundary's create_tid to (tid) as 2011 * long as the rest of the key matches. We have to recurse upward in 2012 * the tree as well as down the left side of each parent's right node. 2013 * 2014 * Return EDEADLK if we were only partially successful, forcing the caller 2015 * to try again. The original cursor is not modified. This routine can 2016 * also fail with EDEADLK if it is forced to throw away a portion of its 2017 * record history. 2018 * 2019 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2020 */ 2021 struct hammer_rhb { 2022 TAILQ_ENTRY(hammer_rhb) entry; 2023 hammer_node_t node; 2024 int index; 2025 }; 2026 2027 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2028 2029 int 2030 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2031 { 2032 struct hammer_mount *hmp; 2033 struct hammer_rhb_list rhb_list; 2034 hammer_base_elm_t elm; 2035 hammer_node_t orig_node; 2036 struct hammer_rhb *rhb; 2037 int orig_index; 2038 int error; 2039 2040 TAILQ_INIT(&rhb_list); 2041 hmp = cursor->trans->hmp; 2042 2043 /* 2044 * Save our position so we can restore it on return. This also 2045 * gives us a stable 'elm'. 2046 */ 2047 orig_node = cursor->node; 2048 hammer_ref_node(orig_node); 2049 hammer_lock_sh(&orig_node->lock); 2050 orig_index = cursor->index; 2051 elm = &orig_node->ondisk->elms[orig_index].base; 2052 2053 /* 2054 * Now build a list of parents going up, allocating a rhb 2055 * structure for each one. 2056 */ 2057 while (cursor->parent) { 2058 /* 2059 * Stop if we no longer have any right-bounds to fix up 2060 */ 2061 if (elm->obj_id != cursor->right_bound->obj_id || 2062 elm->rec_type != cursor->right_bound->rec_type || 2063 elm->key != cursor->right_bound->key) { 2064 break; 2065 } 2066 2067 /* 2068 * Stop if the right-hand bound's create_tid does not 2069 * need to be corrected. 2070 */ 2071 if (cursor->right_bound->create_tid >= tid) 2072 break; 2073 2074 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2075 rhb->node = cursor->parent; 2076 rhb->index = cursor->parent_index; 2077 hammer_ref_node(rhb->node); 2078 hammer_lock_sh(&rhb->node->lock); 2079 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2080 2081 hammer_cursor_up(cursor); 2082 } 2083 2084 /* 2085 * now safely adjust the right hand bound for each rhb. This may 2086 * also require taking the right side of the tree and iterating down 2087 * ITS left side. 2088 */ 2089 error = 0; 2090 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2091 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2092 if (error) 2093 break; 2094 TAILQ_REMOVE(&rhb_list, rhb, entry); 2095 hammer_unlock(&rhb->node->lock); 2096 hammer_rel_node(rhb->node); 2097 kfree(rhb, hmp->m_misc); 2098 2099 switch (cursor->node->ondisk->type) { 2100 case HAMMER_BTREE_TYPE_INTERNAL: 2101 /* 2102 * Right-boundary for parent at internal node 2103 * is one element to the right of the element whos 2104 * right boundary needs adjusting. We must then 2105 * traverse down the left side correcting any left 2106 * bounds (which may now be too far to the left). 2107 */ 2108 ++cursor->index; 2109 error = hammer_btree_correct_lhb(cursor, tid); 2110 break; 2111 default: 2112 panic("hammer_btree_correct_rhb(): Bad node type"); 2113 error = EINVAL; 2114 break; 2115 } 2116 } 2117 2118 /* 2119 * Cleanup 2120 */ 2121 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2122 TAILQ_REMOVE(&rhb_list, rhb, entry); 2123 hammer_unlock(&rhb->node->lock); 2124 hammer_rel_node(rhb->node); 2125 kfree(rhb, hmp->m_misc); 2126 } 2127 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2128 hammer_unlock(&orig_node->lock); 2129 hammer_rel_node(orig_node); 2130 return (error); 2131 } 2132 2133 /* 2134 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2135 * bound going downward starting at the current cursor position. 2136 * 2137 * This function does not restore the cursor after use. 2138 */ 2139 int 2140 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2141 { 2142 struct hammer_rhb_list rhb_list; 2143 hammer_base_elm_t elm; 2144 hammer_base_elm_t cmp; 2145 struct hammer_rhb *rhb; 2146 struct hammer_mount *hmp; 2147 int error; 2148 2149 TAILQ_INIT(&rhb_list); 2150 hmp = cursor->trans->hmp; 2151 2152 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2153 2154 /* 2155 * Record the node and traverse down the left-hand side for all 2156 * matching records needing a boundary correction. 2157 */ 2158 error = 0; 2159 for (;;) { 2160 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2161 rhb->node = cursor->node; 2162 rhb->index = cursor->index; 2163 hammer_ref_node(rhb->node); 2164 hammer_lock_sh(&rhb->node->lock); 2165 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2166 2167 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2168 /* 2169 * Nothing to traverse down if we are at the right 2170 * boundary of an internal node. 2171 */ 2172 if (cursor->index == cursor->node->ondisk->count) 2173 break; 2174 } else { 2175 elm = &cursor->node->ondisk->elms[cursor->index].base; 2176 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2177 break; 2178 panic("Illegal leaf record type %02x", elm->btype); 2179 } 2180 error = hammer_cursor_down(cursor); 2181 if (error) 2182 break; 2183 2184 elm = &cursor->node->ondisk->elms[cursor->index].base; 2185 if (elm->obj_id != cmp->obj_id || 2186 elm->rec_type != cmp->rec_type || 2187 elm->key != cmp->key) { 2188 break; 2189 } 2190 if (elm->create_tid >= tid) 2191 break; 2192 2193 } 2194 2195 /* 2196 * Now we can safely adjust the left-hand boundary from the bottom-up. 2197 * The last element we remove from the list is the caller's right hand 2198 * boundary, which must also be adjusted. 2199 */ 2200 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2201 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2202 if (error) 2203 break; 2204 TAILQ_REMOVE(&rhb_list, rhb, entry); 2205 hammer_unlock(&rhb->node->lock); 2206 hammer_rel_node(rhb->node); 2207 kfree(rhb, hmp->m_misc); 2208 2209 elm = &cursor->node->ondisk->elms[cursor->index].base; 2210 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2211 hammer_modify_node(cursor->trans, cursor->node, 2212 &elm->create_tid, 2213 sizeof(elm->create_tid)); 2214 elm->create_tid = tid; 2215 hammer_modify_node_done(cursor->node); 2216 } else { 2217 panic("hammer_btree_correct_lhb(): Bad element type"); 2218 } 2219 } 2220 2221 /* 2222 * Cleanup 2223 */ 2224 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2225 TAILQ_REMOVE(&rhb_list, rhb, entry); 2226 hammer_unlock(&rhb->node->lock); 2227 hammer_rel_node(rhb->node); 2228 kfree(rhb, hmp->m_misc); 2229 } 2230 return (error); 2231 } 2232 2233 #endif 2234 2235 /* 2236 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2237 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2238 * the operation due to a deadlock, or some other error. 2239 * 2240 * This routine is initially called with an empty leaf and may be 2241 * recursively called with single-element internal nodes. 2242 * 2243 * It should also be noted that when removing empty leaves we must be sure 2244 * to test and update mirror_tid because another thread may have deadlocked 2245 * against us (or someone) trying to propagate it up and cannot retry once 2246 * the node has been deleted. 2247 * 2248 * On return the cursor may end up pointing to an internal node, suitable 2249 * for further iteration but not for an immediate insertion or deletion. 2250 */ 2251 static int 2252 btree_remove(hammer_cursor_t cursor, int *ndelete) 2253 { 2254 hammer_node_ondisk_t ondisk; 2255 hammer_btree_elm_t elm; 2256 hammer_node_t node; 2257 hammer_node_t parent; 2258 const int esize = sizeof(*elm); 2259 int error; 2260 2261 node = cursor->node; 2262 2263 /* 2264 * When deleting the root of the filesystem convert it to 2265 * an empty leaf node. Internal nodes cannot be empty. 2266 */ 2267 ondisk = node->ondisk; 2268 if (ondisk->parent == 0) { 2269 KKASSERT(cursor->parent == NULL); 2270 hammer_modify_node_all(cursor->trans, node); 2271 KKASSERT(ondisk == node->ondisk); 2272 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2273 ondisk->count = 0; 2274 hammer_modify_node_done(node); 2275 cursor->index = 0; 2276 return(0); 2277 } 2278 2279 parent = cursor->parent; 2280 2281 /* 2282 * Attempt to remove the parent's reference to the child. If the 2283 * parent would become empty we have to recurse. If we fail we 2284 * leave the parent pointing to an empty leaf node. 2285 * 2286 * We have to recurse successfully before we can delete the internal 2287 * node as it is illegal to have empty internal nodes. Even though 2288 * the operation may be aborted we must still fixup any unlocked 2289 * cursors as if we had deleted the element prior to recursing 2290 * (by calling hammer_cursor_deleted_element()) so those cursors 2291 * are properly forced up the chain by the recursion. 2292 */ 2293 if (parent->ondisk->count == 1) { 2294 /* 2295 * This special cursor_up_locked() call leaves the original 2296 * node exclusively locked and referenced, leaves the 2297 * original parent locked (as the new node), and locks the 2298 * new parent. It can return EDEADLK. 2299 * 2300 * We cannot call hammer_cursor_removed_node() until we are 2301 * actually able to remove the node. If we did then tracked 2302 * cursors in the middle of iterations could be repointed 2303 * to a parent node. If this occurs they could end up 2304 * scanning newly inserted records into the node (that could 2305 * not be deleted) when they push down again. 2306 * 2307 * Due to the way the recursion works the final parent is left 2308 * in cursor->parent after the recursion returns. Each 2309 * layer on the way back up is thus able to call 2310 * hammer_cursor_removed_node() and 'jump' the node up to 2311 * the (same) final parent. 2312 * 2313 * NOTE! The local variable 'parent' is invalid after we 2314 * call hammer_cursor_up_locked(). 2315 */ 2316 error = hammer_cursor_up_locked(cursor); 2317 parent = NULL; 2318 2319 if (error == 0) { 2320 hammer_cursor_deleted_element(cursor->node, 0); 2321 error = btree_remove(cursor, ndelete); 2322 if (error == 0) { 2323 KKASSERT(node != cursor->node); 2324 hammer_cursor_removed_node( 2325 node, cursor->node, 2326 cursor->index); 2327 hammer_modify_node_all(cursor->trans, node); 2328 ondisk = node->ondisk; 2329 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2330 ondisk->count = 0; 2331 hammer_modify_node_done(node); 2332 hammer_flush_node(node, 0); 2333 hammer_delete_node(cursor->trans, node); 2334 if (ndelete) 2335 (*ndelete)++; 2336 } else { 2337 /* 2338 * Defer parent removal because we could not 2339 * get the lock, just let the leaf remain 2340 * empty. 2341 */ 2342 /**/ 2343 } 2344 hammer_unlock(&node->lock); 2345 hammer_rel_node(node); 2346 } else { 2347 /* 2348 * Defer parent removal because we could not 2349 * get the lock, just let the leaf remain 2350 * empty. 2351 */ 2352 /**/ 2353 } 2354 } else { 2355 KKASSERT(parent->ondisk->count > 1); 2356 2357 hammer_modify_node_all(cursor->trans, parent); 2358 ondisk = parent->ondisk; 2359 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2360 2361 elm = &ondisk->elms[cursor->parent_index]; 2362 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2363 KKASSERT(ondisk->count > 0); 2364 2365 /* 2366 * We must retain the highest mirror_tid. The deleted 2367 * range is now encompassed by the element to the left. 2368 * If we are already at the left edge the new left edge 2369 * inherits mirror_tid. 2370 * 2371 * Note that bounds of the parent to our parent may create 2372 * a gap to the left of our left-most node or to the right 2373 * of our right-most node. The gap is silently included 2374 * in the mirror_tid's area of effect from the point of view 2375 * of the scan. 2376 */ 2377 if (cursor->parent_index) { 2378 if (elm[-1].internal.mirror_tid < 2379 elm[0].internal.mirror_tid) { 2380 elm[-1].internal.mirror_tid = 2381 elm[0].internal.mirror_tid; 2382 } 2383 } else { 2384 if (elm[1].internal.mirror_tid < 2385 elm[0].internal.mirror_tid) { 2386 elm[1].internal.mirror_tid = 2387 elm[0].internal.mirror_tid; 2388 } 2389 } 2390 2391 /* 2392 * Delete the subtree reference in the parent. Include 2393 * boundary element at end. 2394 */ 2395 bcopy(&elm[1], &elm[0], 2396 (ondisk->count - cursor->parent_index) * esize); 2397 --ondisk->count; 2398 hammer_modify_node_done(parent); 2399 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2400 hammer_cursor_deleted_element(parent, cursor->parent_index); 2401 hammer_flush_node(node, 0); 2402 hammer_delete_node(cursor->trans, node); 2403 2404 /* 2405 * cursor->node is invalid, cursor up to make the cursor 2406 * valid again. We have to flag the condition in case 2407 * another thread wiggles an insertion in during an 2408 * iteration. 2409 */ 2410 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 2411 error = hammer_cursor_up(cursor); 2412 if (ndelete) 2413 (*ndelete)++; 2414 } 2415 return (error); 2416 } 2417 2418 /* 2419 * Propagate cursor->trans->tid up the B-Tree starting at the current 2420 * cursor position using pseudofs info gleaned from the passed inode. 2421 * 2422 * The passed inode has no relationship to the cursor position other 2423 * then being in the same pseudofs as the insertion or deletion we 2424 * are propagating the mirror_tid for. 2425 * 2426 * WARNING! Because we push and pop the passed cursor, it may be 2427 * modified by other B-Tree operations while it is unlocked 2428 * and things like the node & leaf pointers, and indexes might 2429 * change. 2430 */ 2431 void 2432 hammer_btree_do_propagation(hammer_cursor_t cursor, 2433 hammer_pseudofs_inmem_t pfsm, 2434 hammer_btree_leaf_elm_t leaf) 2435 { 2436 hammer_cursor_t ncursor; 2437 hammer_tid_t mirror_tid; 2438 int error __debugvar; 2439 2440 /* 2441 * We do not propagate a mirror_tid if the filesystem was mounted 2442 * in no-mirror mode. 2443 */ 2444 if (cursor->trans->hmp->master_id < 0) 2445 return; 2446 2447 /* 2448 * This is a bit of a hack because we cannot deadlock or return 2449 * EDEADLK here. The related operation has already completed and 2450 * we must propagate the mirror_tid now regardless. 2451 * 2452 * Generate a new cursor which inherits the original's locks and 2453 * unlock the original. Use the new cursor to propagate the 2454 * mirror_tid. Then clean up the new cursor and reacquire locks 2455 * on the original. 2456 * 2457 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2458 * original's locks and the original is tracked and must be 2459 * re-locked. 2460 */ 2461 mirror_tid = cursor->node->ondisk->mirror_tid; 2462 KKASSERT(mirror_tid != 0); 2463 ncursor = hammer_push_cursor(cursor); 2464 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2465 KKASSERT(error == 0); 2466 hammer_pop_cursor(cursor, ncursor); 2467 /* WARNING: cursor's leaf pointer may change after pop */ 2468 } 2469 2470 2471 /* 2472 * Propagate a mirror TID update upwards through the B-Tree to the root. 2473 * 2474 * A locked internal node must be passed in. The node will remain locked 2475 * on return. 2476 * 2477 * This function syncs mirror_tid at the specified internal node's element, 2478 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2479 */ 2480 static int 2481 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2482 { 2483 hammer_btree_internal_elm_t elm; 2484 hammer_node_t node; 2485 int error; 2486 2487 for (;;) { 2488 error = hammer_cursor_up(cursor); 2489 if (error == 0) 2490 error = hammer_cursor_upgrade(cursor); 2491 2492 /* 2493 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2494 * cursor will still be properly positioned for 2495 * mirror propagation, just not for iterations. 2496 */ 2497 while (error == EDEADLK) { 2498 hammer_recover_cursor(cursor); 2499 error = hammer_cursor_upgrade(cursor); 2500 } 2501 if (error) 2502 break; 2503 2504 /* 2505 * If the cursor deadlocked it could end up at a leaf 2506 * after we lost the lock. 2507 */ 2508 node = cursor->node; 2509 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2510 continue; 2511 2512 /* 2513 * Adjust the node's element 2514 */ 2515 elm = &node->ondisk->elms[cursor->index].internal; 2516 if (elm->mirror_tid >= mirror_tid) 2517 break; 2518 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2519 sizeof(elm->mirror_tid)); 2520 elm->mirror_tid = mirror_tid; 2521 hammer_modify_node_done(node); 2522 if (hammer_debug_general & 0x0002) { 2523 kprintf("mirror_propagate: propagate " 2524 "%016llx @%016llx:%d\n", 2525 (long long)mirror_tid, 2526 (long long)node->node_offset, 2527 cursor->index); 2528 } 2529 2530 2531 /* 2532 * Adjust the node's mirror_tid aggregator 2533 */ 2534 if (node->ondisk->mirror_tid >= mirror_tid) 2535 return(0); 2536 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2537 node->ondisk->mirror_tid = mirror_tid; 2538 hammer_modify_node_done(node); 2539 if (hammer_debug_general & 0x0002) { 2540 kprintf("mirror_propagate: propagate " 2541 "%016llx @%016llx\n", 2542 (long long)mirror_tid, 2543 (long long)node->node_offset); 2544 } 2545 } 2546 if (error == ENOENT) 2547 error = 0; 2548 return(error); 2549 } 2550 2551 hammer_node_t 2552 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2553 int *parent_indexp, int *errorp, int try_exclusive) 2554 { 2555 hammer_node_t parent; 2556 hammer_btree_elm_t elm; 2557 int i; 2558 2559 /* 2560 * Get the node 2561 */ 2562 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2563 if (*errorp) { 2564 KKASSERT(parent == NULL); 2565 return(NULL); 2566 } 2567 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2568 2569 /* 2570 * Lock the node 2571 */ 2572 if (try_exclusive) { 2573 if (hammer_lock_ex_try(&parent->lock)) { 2574 hammer_rel_node(parent); 2575 *errorp = EDEADLK; 2576 return(NULL); 2577 } 2578 } else { 2579 hammer_lock_sh(&parent->lock); 2580 } 2581 2582 /* 2583 * Figure out which element in the parent is pointing to the 2584 * child. 2585 */ 2586 if (node->ondisk->count) { 2587 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2588 parent->ondisk); 2589 } else { 2590 i = 0; 2591 } 2592 while (i < parent->ondisk->count) { 2593 elm = &parent->ondisk->elms[i]; 2594 if (elm->internal.subtree_offset == node->node_offset) 2595 break; 2596 ++i; 2597 } 2598 if (i == parent->ondisk->count) { 2599 hammer_unlock(&parent->lock); 2600 panic("Bad B-Tree link: parent %p node %p", parent, node); 2601 } 2602 *parent_indexp = i; 2603 KKASSERT(*errorp == 0); 2604 return(parent); 2605 } 2606 2607 /* 2608 * The element (elm) has been moved to a new internal node (node). 2609 * 2610 * If the element represents a pointer to an internal node that node's 2611 * parent must be adjusted to the element's new location. 2612 * 2613 * XXX deadlock potential here with our exclusive locks 2614 */ 2615 int 2616 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2617 hammer_btree_elm_t elm) 2618 { 2619 hammer_node_t child; 2620 int error; 2621 2622 error = 0; 2623 2624 switch(elm->base.btype) { 2625 case HAMMER_BTREE_TYPE_INTERNAL: 2626 case HAMMER_BTREE_TYPE_LEAF: 2627 child = hammer_get_node(trans, elm->internal.subtree_offset, 2628 0, &error); 2629 if (error == 0) { 2630 hammer_modify_node_field(trans, child, parent); 2631 child->ondisk->parent = node->node_offset; 2632 hammer_modify_node_done(child); 2633 hammer_rel_node(child); 2634 } 2635 break; 2636 default: 2637 break; 2638 } 2639 return(error); 2640 } 2641 2642 /* 2643 * Initialize the root of a recursive B-Tree node lock list structure. 2644 */ 2645 void 2646 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2647 { 2648 TAILQ_INIT(&parent->list); 2649 parent->parent = NULL; 2650 parent->node = node; 2651 parent->index = -1; 2652 parent->count = node->ondisk->count; 2653 parent->copy = NULL; 2654 parent->flags = 0; 2655 } 2656 2657 /* 2658 * Initialize a cache of hammer_node_lock's including space allocated 2659 * for node copies. 2660 * 2661 * This is used by the rebalancing code to preallocate the copy space 2662 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2663 * locks, otherwise we can blow out the pageout daemon's emergency 2664 * reserve and deadlock it. 2665 * 2666 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2667 * The flag is set when the item is pulled off the cache for use. 2668 */ 2669 void 2670 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2671 int depth) 2672 { 2673 hammer_node_lock_t item; 2674 int count; 2675 2676 for (count = 1; depth; --depth) 2677 count *= HAMMER_BTREE_LEAF_ELMS; 2678 bzero(lcache, sizeof(*lcache)); 2679 TAILQ_INIT(&lcache->list); 2680 while (count) { 2681 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2682 item->copy = kmalloc(sizeof(*item->copy), 2683 hmp->m_misc, M_WAITOK); 2684 TAILQ_INIT(&item->list); 2685 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2686 --count; 2687 } 2688 } 2689 2690 void 2691 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2692 { 2693 hammer_node_lock_t item; 2694 2695 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2696 TAILQ_REMOVE(&lcache->list, item, entry); 2697 KKASSERT(item->copy); 2698 KKASSERT(TAILQ_EMPTY(&item->list)); 2699 kfree(item->copy, hmp->m_misc); 2700 kfree(item, hmp->m_misc); 2701 } 2702 KKASSERT(lcache->copy == NULL); 2703 } 2704 2705 /* 2706 * Exclusively lock all the children of node. This is used by the split 2707 * code to prevent anyone from accessing the children of a cursor node 2708 * while we fix-up its parent offset. 2709 * 2710 * If we don't lock the children we can really mess up cursors which block 2711 * trying to cursor-up into our node. 2712 * 2713 * On failure EDEADLK (or some other error) is returned. If a deadlock 2714 * error is returned the cursor is adjusted to block on termination. 2715 * 2716 * The caller is responsible for managing parent->node, the root's node 2717 * is usually aliased from a cursor. 2718 */ 2719 int 2720 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2721 hammer_node_lock_t parent, 2722 hammer_node_lock_t lcache) 2723 { 2724 hammer_node_t node; 2725 hammer_node_lock_t item; 2726 hammer_node_ondisk_t ondisk; 2727 hammer_btree_elm_t elm; 2728 hammer_node_t child; 2729 struct hammer_mount *hmp; 2730 int error; 2731 int i; 2732 2733 node = parent->node; 2734 ondisk = node->ondisk; 2735 error = 0; 2736 hmp = cursor->trans->hmp; 2737 2738 /* 2739 * We really do not want to block on I/O with exclusive locks held, 2740 * pre-get the children before trying to lock the mess. This is 2741 * only done one-level deep for now. 2742 */ 2743 for (i = 0; i < ondisk->count; ++i) { 2744 ++hammer_stats_btree_elements; 2745 elm = &ondisk->elms[i]; 2746 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2747 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2748 continue; 2749 } 2750 child = hammer_get_node(cursor->trans, 2751 elm->internal.subtree_offset, 2752 0, &error); 2753 if (child) 2754 hammer_rel_node(child); 2755 } 2756 2757 /* 2758 * Do it for real 2759 */ 2760 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2761 ++hammer_stats_btree_elements; 2762 elm = &ondisk->elms[i]; 2763 2764 switch(elm->base.btype) { 2765 case HAMMER_BTREE_TYPE_INTERNAL: 2766 case HAMMER_BTREE_TYPE_LEAF: 2767 KKASSERT(elm->internal.subtree_offset != 0); 2768 child = hammer_get_node(cursor->trans, 2769 elm->internal.subtree_offset, 2770 0, &error); 2771 break; 2772 default: 2773 child = NULL; 2774 break; 2775 } 2776 if (child) { 2777 if (hammer_lock_ex_try(&child->lock) != 0) { 2778 if (cursor->deadlk_node == NULL) { 2779 cursor->deadlk_node = child; 2780 hammer_ref_node(cursor->deadlk_node); 2781 } 2782 error = EDEADLK; 2783 hammer_rel_node(child); 2784 } else { 2785 if (lcache) { 2786 item = TAILQ_FIRST(&lcache->list); 2787 KKASSERT(item != NULL); 2788 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2789 TAILQ_REMOVE(&lcache->list, 2790 item, entry); 2791 } else { 2792 item = kmalloc(sizeof(*item), 2793 hmp->m_misc, 2794 M_WAITOK|M_ZERO); 2795 TAILQ_INIT(&item->list); 2796 } 2797 2798 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2799 item->parent = parent; 2800 item->node = child; 2801 item->index = i; 2802 item->count = child->ondisk->count; 2803 2804 /* 2805 * Recurse (used by the rebalancing code) 2806 */ 2807 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2808 error = hammer_btree_lock_children( 2809 cursor, 2810 depth - 1, 2811 item, 2812 lcache); 2813 } 2814 } 2815 } 2816 } 2817 if (error) 2818 hammer_btree_unlock_children(hmp, parent, lcache); 2819 return(error); 2820 } 2821 2822 /* 2823 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2824 * including the parent. 2825 */ 2826 void 2827 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2828 { 2829 hammer_mount_t hmp = cursor->trans->hmp; 2830 hammer_node_lock_t item; 2831 2832 if (parent->copy == NULL) { 2833 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2834 parent->copy = kmalloc(sizeof(*parent->copy), 2835 hmp->m_misc, M_WAITOK); 2836 } 2837 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2838 *parent->copy = *parent->node->ondisk; 2839 TAILQ_FOREACH(item, &parent->list, entry) { 2840 hammer_btree_lock_copy(cursor, item); 2841 } 2842 } 2843 2844 /* 2845 * Recursively sync modified copies to the media. 2846 */ 2847 int 2848 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2849 { 2850 hammer_node_lock_t item; 2851 int count = 0; 2852 2853 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2854 ++count; 2855 hammer_modify_node_all(cursor->trans, parent->node); 2856 *parent->node->ondisk = *parent->copy; 2857 hammer_modify_node_done(parent->node); 2858 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2859 hammer_flush_node(parent->node, 0); 2860 hammer_delete_node(cursor->trans, parent->node); 2861 } 2862 } 2863 TAILQ_FOREACH(item, &parent->list, entry) { 2864 count += hammer_btree_sync_copy(cursor, item); 2865 } 2866 return(count); 2867 } 2868 2869 /* 2870 * Release previously obtained node locks. The caller is responsible for 2871 * cleaning up parent->node itself (its usually just aliased from a cursor), 2872 * but this function will take care of the copies. 2873 * 2874 * NOTE: The root node is not placed in the lcache and node->copy is not 2875 * deallocated when lcache != NULL. 2876 */ 2877 void 2878 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent, 2879 hammer_node_lock_t lcache) 2880 { 2881 hammer_node_lock_t item; 2882 hammer_node_ondisk_t copy; 2883 2884 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2885 TAILQ_REMOVE(&parent->list, item, entry); 2886 hammer_btree_unlock_children(hmp, item, lcache); 2887 hammer_unlock(&item->node->lock); 2888 hammer_rel_node(item->node); 2889 if (lcache) { 2890 /* 2891 * NOTE: When placing the item back in the lcache 2892 * the flag is cleared by the bzero(). 2893 * Remaining fields are cleared as a safety 2894 * measure. 2895 */ 2896 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE); 2897 KKASSERT(TAILQ_EMPTY(&item->list)); 2898 copy = item->copy; 2899 bzero(item, sizeof(*item)); 2900 TAILQ_INIT(&item->list); 2901 item->copy = copy; 2902 if (copy) 2903 bzero(copy, sizeof(*copy)); 2904 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2905 } else { 2906 kfree(item, hmp->m_misc); 2907 } 2908 } 2909 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) { 2910 kfree(parent->copy, hmp->m_misc); 2911 parent->copy = NULL; /* safety */ 2912 } 2913 } 2914 2915 /************************************************************************ 2916 * MISCELLANIOUS SUPPORT * 2917 ************************************************************************/ 2918 2919 /* 2920 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2921 * 2922 * Note that for this particular function a return value of -1, 0, or +1 2923 * can denote a match if create_tid is otherwise discounted. A create_tid 2924 * of zero is considered to be 'infinity' in comparisons. 2925 * 2926 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2927 */ 2928 int 2929 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2930 { 2931 if (key1->localization < key2->localization) 2932 return(-5); 2933 if (key1->localization > key2->localization) 2934 return(5); 2935 2936 if (key1->obj_id < key2->obj_id) 2937 return(-4); 2938 if (key1->obj_id > key2->obj_id) 2939 return(4); 2940 2941 if (key1->rec_type < key2->rec_type) 2942 return(-3); 2943 if (key1->rec_type > key2->rec_type) 2944 return(3); 2945 2946 if (key1->key < key2->key) 2947 return(-2); 2948 if (key1->key > key2->key) 2949 return(2); 2950 2951 /* 2952 * A create_tid of zero indicates a record which is undeletable 2953 * and must be considered to have a value of positive infinity. 2954 */ 2955 if (key1->create_tid == 0) { 2956 if (key2->create_tid == 0) 2957 return(0); 2958 return(1); 2959 } 2960 if (key2->create_tid == 0) 2961 return(-1); 2962 if (key1->create_tid < key2->create_tid) 2963 return(-1); 2964 if (key1->create_tid > key2->create_tid) 2965 return(1); 2966 return(0); 2967 } 2968 2969 /* 2970 * Test a timestamp against an element to determine whether the 2971 * element is visible. A timestamp of 0 means 'infinity'. 2972 */ 2973 int 2974 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2975 { 2976 if (asof == 0) { 2977 if (base->delete_tid) 2978 return(1); 2979 return(0); 2980 } 2981 if (asof < base->create_tid) 2982 return(-1); 2983 if (base->delete_tid && asof >= base->delete_tid) 2984 return(1); 2985 return(0); 2986 } 2987 2988 /* 2989 * Create a separator half way inbetween key1 and key2. For fields just 2990 * one unit apart, the separator will match key2. key1 is on the left-hand 2991 * side and key2 is on the right-hand side. 2992 * 2993 * key2 must be >= the separator. It is ok for the separator to match key2. 2994 * 2995 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2996 * key2. 2997 * 2998 * NOTE: It might be beneficial to just scrap this whole mess and just 2999 * set the separator to key2. 3000 */ 3001 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 3002 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 3003 3004 static void 3005 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 3006 hammer_base_elm_t dest) 3007 { 3008 bzero(dest, sizeof(*dest)); 3009 3010 dest->rec_type = key2->rec_type; 3011 dest->key = key2->key; 3012 dest->obj_id = key2->obj_id; 3013 dest->create_tid = key2->create_tid; 3014 3015 MAKE_SEPARATOR(key1, key2, dest, localization); 3016 if (key1->localization == key2->localization) { 3017 MAKE_SEPARATOR(key1, key2, dest, obj_id); 3018 if (key1->obj_id == key2->obj_id) { 3019 MAKE_SEPARATOR(key1, key2, dest, rec_type); 3020 if (key1->rec_type == key2->rec_type) { 3021 MAKE_SEPARATOR(key1, key2, dest, key); 3022 /* 3023 * Don't bother creating a separator for 3024 * create_tid, which also conveniently avoids 3025 * having to handle the create_tid == 0 3026 * (infinity) case. Just leave create_tid 3027 * set to key2. 3028 * 3029 * Worst case, dest matches key2 exactly, 3030 * which is acceptable. 3031 */ 3032 } 3033 } 3034 } 3035 } 3036 3037 #undef MAKE_SEPARATOR 3038 3039 /* 3040 * Return whether a generic internal or leaf node is full 3041 */ 3042 static __inline 3043 int 3044 btree_node_is_full(hammer_node_ondisk_t node) 3045 { 3046 return(btree_max_elements(node->type) == node->count); 3047 } 3048 3049 static __inline 3050 int 3051 btree_max_elements(u_int8_t type) 3052 { 3053 int n; 3054 3055 n = hammer_node_max_elements(type); 3056 if (n == -1) 3057 panic("btree_max_elements: bad type %d", type); 3058 return(n); 3059 } 3060 3061 void 3062 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3063 { 3064 hammer_btree_elm_t elm; 3065 int i; 3066 3067 kprintf("node %p count=%d parent=%016llx type=%c\n", 3068 ondisk, ondisk->count, 3069 (long long)ondisk->parent, ondisk->type); 3070 3071 /* 3072 * Dump both boundary elements if an internal node 3073 */ 3074 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3075 for (i = 0; i <= ondisk->count; ++i) { 3076 elm = &ondisk->elms[i]; 3077 hammer_print_btree_elm(elm, ondisk->type, i); 3078 } 3079 } else { 3080 for (i = 0; i < ondisk->count; ++i) { 3081 elm = &ondisk->elms[i]; 3082 hammer_print_btree_elm(elm, ondisk->type, i); 3083 } 3084 } 3085 } 3086 3087 void 3088 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3089 { 3090 kprintf(" %2d", i); 3091 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3092 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3093 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3094 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3095 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3096 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3097 kprintf("\tbtype = %02x (%c)\n", 3098 elm->base.btype, 3099 (elm->base.btype ? elm->base.btype : '?')); 3100 kprintf("\tlocalization = %02x\n", elm->base.localization); 3101 3102 switch(type) { 3103 case HAMMER_BTREE_TYPE_INTERNAL: 3104 kprintf("\tsubtree_off = %016llx\n", 3105 (long long)elm->internal.subtree_offset); 3106 break; 3107 case HAMMER_BTREE_TYPE_RECORD: 3108 kprintf("\tdata_offset = %016llx\n", 3109 (long long)elm->leaf.data_offset); 3110 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3111 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3112 break; 3113 } 3114 } 3115