1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 40 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 41 * of the tree), but adds two additional boundary elements which describe 42 * the left-most and right-most element a node is able to represent. In 43 * otherwords, we have boundary elements at the two ends of a B-Tree node 44 * instead of sub-tree pointers. 45 * 46 * A B-Tree internal node looks like this: 47 * 48 * B N N N N N N B <-- boundary and internal elements 49 * S S S S S S S <-- subtree pointers 50 * 51 * A B-Tree leaf node basically looks like this: 52 * 53 * L L L L L L L L <-- leaf elemenets 54 * 55 * The radix for an internal node is 1 less then a leaf but we get a 56 * number of significant benefits for our troubles. 57 * 58 * The big benefit to using a B-Tree containing boundary information 59 * is that it is possible to cache pointers into the middle of the tree 60 * and not have to start searches, insertions, OR deletions at the root 61 * node. In particular, searches are able to progress in a definitive 62 * direction from any point in the tree without revisting nodes. This 63 * greatly improves the efficiency of many operations, most especially 64 * record appends. 65 * 66 * B-Trees also make the stacking of trees fairly straightforward. 67 * 68 * INSERTIONS: A search performed with the intention of doing 69 * an insert will guarantee that the terminal leaf node is not full by 70 * splitting full nodes. Splits occur top-down during the dive down the 71 * B-Tree. 72 * 73 * DELETIONS: A deletion makes no attempt to proactively balance the 74 * tree and will recursively remove nodes that become empty. If a 75 * deadlock occurs a deletion may not be able to remove an empty leaf. 76 * Deletions never allow internal nodes to become empty (that would blow 77 * up the boundaries). 78 */ 79 #include "hammer.h" 80 #include <sys/buf.h> 81 #include <sys/buf2.h> 82 83 static int btree_search(hammer_cursor_t cursor, int flags); 84 static int btree_split_internal(hammer_cursor_t cursor); 85 static int btree_split_leaf(hammer_cursor_t cursor); 86 static int btree_remove(hammer_cursor_t cursor); 87 static int btree_node_is_full(hammer_node_ondisk_t node); 88 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 89 hammer_tid_t mirror_tid); 90 static void hammer_make_separator(hammer_base_elm_t key1, 91 hammer_base_elm_t key2, hammer_base_elm_t dest); 92 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 93 94 /* 95 * Iterate records after a search. The cursor is iterated forwards past 96 * the current record until a record matching the key-range requirements 97 * is found. ENOENT is returned if the iteration goes past the ending 98 * key. 99 * 100 * The iteration is inclusive of key_beg and can be inclusive or exclusive 101 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 102 * 103 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 104 * may be modified by B-Tree functions. 105 * 106 * cursor->key_beg may or may not be modified by this function during 107 * the iteration. XXX future - in case of an inverted lock we may have 108 * to reinitiate the lookup and set key_beg to properly pick up where we 109 * left off. 110 * 111 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor 112 * was reverse indexed due to being moved to a parent while unlocked, 113 * and something else might have inserted an element outside the iteration 114 * range. When this case occurs the iterator just keeps iterating until 115 * it gets back into the iteration range (instead of asserting). 116 * 117 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 118 */ 119 int 120 hammer_btree_iterate(hammer_cursor_t cursor) 121 { 122 hammer_node_ondisk_t node; 123 hammer_btree_elm_t elm; 124 hammer_mount_t hmp; 125 int error = 0; 126 int r; 127 int s; 128 129 /* 130 * Skip past the current record 131 */ 132 hmp = cursor->trans->hmp; 133 node = cursor->node->ondisk; 134 if (node == NULL) 135 return(ENOENT); 136 if (cursor->index < node->count && 137 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 138 ++cursor->index; 139 } 140 141 /* 142 * HAMMER can wind up being cpu-bound. 143 */ 144 if (++hmp->check_yield > hammer_yield_check) { 145 hmp->check_yield = 0; 146 lwkt_user_yield(); 147 } 148 149 150 /* 151 * Loop until an element is found or we are done. 152 */ 153 for (;;) { 154 /* 155 * We iterate up the tree and then index over one element 156 * while we are at the last element in the current node. 157 * 158 * If we are at the root of the filesystem, cursor_up 159 * returns ENOENT. 160 * 161 * XXX this could be optimized by storing the information in 162 * the parent reference. 163 * 164 * XXX we can lose the node lock temporarily, this could mess 165 * up our scan. 166 */ 167 ++hammer_stats_btree_iterations; 168 hammer_flusher_clean_loose_ios(hmp); 169 170 if (cursor->index == node->count) { 171 if (hammer_debug_btree) { 172 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n", 173 (long long)cursor->node->node_offset, 174 cursor->index, 175 (long long)(cursor->parent ? cursor->parent->node_offset : -1), 176 cursor->parent_index, 177 curthread); 178 } 179 KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 180 error = hammer_cursor_up(cursor); 181 if (error) 182 break; 183 /* reload stale pointer */ 184 node = cursor->node->ondisk; 185 KKASSERT(cursor->index != node->count); 186 187 /* 188 * If we are reblocking we want to return internal 189 * nodes. Note that the internal node will be 190 * returned multiple times, on each upward recursion 191 * from its children. The caller selects which 192 * revisit it cares about (usually first or last only). 193 */ 194 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 195 cursor->flags |= HAMMER_CURSOR_ATEDISK; 196 return(0); 197 } 198 ++cursor->index; 199 continue; 200 } 201 202 /* 203 * Check internal or leaf element. Determine if the record 204 * at the cursor has gone beyond the end of our range. 205 * 206 * We recurse down through internal nodes. 207 */ 208 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 209 elm = &node->elms[cursor->index]; 210 211 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 212 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 213 if (hammer_debug_btree) { 214 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n", 215 (long long)cursor->node->node_offset, 216 cursor->index, 217 (long long)elm[0].internal.base.obj_id, 218 elm[0].internal.base.rec_type, 219 (long long)elm[0].internal.base.key, 220 elm[0].internal.base.localization, 221 r, 222 curthread 223 ); 224 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 225 (long long)cursor->node->node_offset, 226 cursor->index + 1, 227 (long long)elm[1].internal.base.obj_id, 228 elm[1].internal.base.rec_type, 229 (long long)elm[1].internal.base.key, 230 elm[1].internal.base.localization, 231 s 232 ); 233 } 234 235 if (r < 0) { 236 error = ENOENT; 237 break; 238 } 239 if (r == 0 && (cursor->flags & 240 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 241 error = ENOENT; 242 break; 243 } 244 245 /* 246 * Better not be zero 247 */ 248 KKASSERT(elm->internal.subtree_offset != 0); 249 250 if (s <= 0) { 251 /* 252 * If running the mirror filter see if we 253 * can skip one or more entire sub-trees. 254 * If we can we return the internal node 255 * and the caller processes the skipped 256 * range (see mirror_read). 257 */ 258 if (cursor->flags & 259 HAMMER_CURSOR_MIRROR_FILTERED) { 260 if (elm->internal.mirror_tid < 261 cursor->cmirror->mirror_tid) { 262 hammer_cursor_mirror_filter(cursor); 263 return(0); 264 } 265 } 266 } else { 267 /* 268 * Normally it would be impossible for the 269 * cursor to have gotten back-indexed, 270 * but it can happen if a node is deleted 271 * and the cursor is moved to its parent 272 * internal node. ITERATE_CHECK will be set. 273 */ 274 KKASSERT(cursor->flags & 275 HAMMER_CURSOR_ITERATE_CHECK); 276 kprintf("hammer_btree_iterate: " 277 "DEBUG: Caught parent seek " 278 "in internal iteration\n"); 279 } 280 281 error = hammer_cursor_down(cursor); 282 if (error) 283 break; 284 KKASSERT(cursor->index == 0); 285 /* reload stale pointer */ 286 node = cursor->node->ondisk; 287 continue; 288 } else { 289 elm = &node->elms[cursor->index]; 290 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 291 if (hammer_debug_btree) { 292 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 293 (long long)cursor->node->node_offset, 294 cursor->index, 295 (elm[0].leaf.base.btype ? 296 elm[0].leaf.base.btype : '?'), 297 (long long)elm[0].leaf.base.obj_id, 298 elm[0].leaf.base.rec_type, 299 (long long)elm[0].leaf.base.key, 300 elm[0].leaf.base.localization, 301 r 302 ); 303 } 304 if (r < 0) { 305 error = ENOENT; 306 break; 307 } 308 309 /* 310 * We support both end-inclusive and 311 * end-exclusive searches. 312 */ 313 if (r == 0 && 314 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 315 error = ENOENT; 316 break; 317 } 318 319 /* 320 * If ITERATE_CHECK is set an unlocked cursor may 321 * have been moved to a parent and the iterate can 322 * happen upon elements that are not in the requested 323 * range. 324 */ 325 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) { 326 s = hammer_btree_cmp(&cursor->key_beg, 327 &elm->base); 328 if (s > 0) { 329 kprintf("hammer_btree_iterate: " 330 "DEBUG: Caught parent seek " 331 "in leaf iteration\n"); 332 ++cursor->index; 333 continue; 334 } 335 } 336 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 337 338 /* 339 * Return the element 340 */ 341 switch(elm->leaf.base.btype) { 342 case HAMMER_BTREE_TYPE_RECORD: 343 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 344 hammer_btree_chkts(cursor->asof, &elm->base)) { 345 ++cursor->index; 346 continue; 347 } 348 error = 0; 349 break; 350 default: 351 error = EINVAL; 352 break; 353 } 354 if (error) 355 break; 356 } 357 /* 358 * node pointer invalid after loop 359 */ 360 361 /* 362 * Return entry 363 */ 364 if (hammer_debug_btree) { 365 int i = cursor->index; 366 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 367 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 368 cursor->node, i, 369 (long long)elm->internal.base.obj_id, 370 elm->internal.base.rec_type, 371 (long long)elm->internal.base.key, 372 elm->internal.base.localization 373 ); 374 } 375 return(0); 376 } 377 return(error); 378 } 379 380 /* 381 * We hit an internal element that we could skip as part of a mirroring 382 * scan. Calculate the entire range being skipped. 383 * 384 * It is important to include any gaps between the parent's left_bound 385 * and the node's left_bound, and same goes for the right side. 386 */ 387 static void 388 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 389 { 390 struct hammer_cmirror *cmirror; 391 hammer_node_ondisk_t ondisk; 392 hammer_btree_elm_t elm; 393 394 ondisk = cursor->node->ondisk; 395 cmirror = cursor->cmirror; 396 397 /* 398 * Calculate the skipped range 399 */ 400 elm = &ondisk->elms[cursor->index]; 401 if (cursor->index == 0) 402 cmirror->skip_beg = *cursor->left_bound; 403 else 404 cmirror->skip_beg = elm->internal.base; 405 while (cursor->index < ondisk->count) { 406 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 407 break; 408 ++cursor->index; 409 ++elm; 410 } 411 if (cursor->index == ondisk->count) 412 cmirror->skip_end = *cursor->right_bound; 413 else 414 cmirror->skip_end = elm->internal.base; 415 416 /* 417 * clip the returned result. 418 */ 419 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 420 cmirror->skip_beg = cursor->key_beg; 421 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 422 cmirror->skip_end = cursor->key_end; 423 } 424 425 /* 426 * Iterate in the reverse direction. This is used by the pruning code to 427 * avoid overlapping records. 428 */ 429 int 430 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 431 { 432 hammer_node_ondisk_t node; 433 hammer_btree_elm_t elm; 434 hammer_mount_t hmp; 435 int error = 0; 436 int r; 437 int s; 438 439 /* mirror filtering not supported for reverse iteration */ 440 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 441 442 /* 443 * Skip past the current record. For various reasons the cursor 444 * may end up set to -1 or set to point at the end of the current 445 * node. These cases must be addressed. 446 */ 447 node = cursor->node->ondisk; 448 if (node == NULL) 449 return(ENOENT); 450 if (cursor->index != -1 && 451 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 452 --cursor->index; 453 } 454 if (cursor->index == cursor->node->ondisk->count) 455 --cursor->index; 456 457 /* 458 * HAMMER can wind up being cpu-bound. 459 */ 460 hmp = cursor->trans->hmp; 461 if (++hmp->check_yield > hammer_yield_check) { 462 hmp->check_yield = 0; 463 lwkt_user_yield(); 464 } 465 466 /* 467 * Loop until an element is found or we are done. 468 */ 469 for (;;) { 470 ++hammer_stats_btree_iterations; 471 hammer_flusher_clean_loose_ios(hmp); 472 473 /* 474 * We iterate up the tree and then index over one element 475 * while we are at the last element in the current node. 476 */ 477 if (cursor->index == -1) { 478 error = hammer_cursor_up(cursor); 479 if (error) { 480 cursor->index = 0; /* sanity */ 481 break; 482 } 483 /* reload stale pointer */ 484 node = cursor->node->ondisk; 485 KKASSERT(cursor->index != node->count); 486 --cursor->index; 487 continue; 488 } 489 490 /* 491 * Check internal or leaf element. Determine if the record 492 * at the cursor has gone beyond the end of our range. 493 * 494 * We recurse down through internal nodes. 495 */ 496 KKASSERT(cursor->index != node->count); 497 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 498 elm = &node->elms[cursor->index]; 499 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 500 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 501 if (hammer_debug_btree) { 502 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 503 (long long)cursor->node->node_offset, 504 cursor->index, 505 (long long)elm[0].internal.base.obj_id, 506 elm[0].internal.base.rec_type, 507 (long long)elm[0].internal.base.key, 508 elm[0].internal.base.localization, 509 r 510 ); 511 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 512 (long long)cursor->node->node_offset, 513 cursor->index + 1, 514 (long long)elm[1].internal.base.obj_id, 515 elm[1].internal.base.rec_type, 516 (long long)elm[1].internal.base.key, 517 elm[1].internal.base.localization, 518 s 519 ); 520 } 521 522 if (s >= 0) { 523 error = ENOENT; 524 break; 525 } 526 527 /* 528 * It shouldn't be possible to be seeked past key_end, 529 * even if the cursor got moved to a parent. 530 */ 531 KKASSERT(r >= 0); 532 533 /* 534 * Better not be zero 535 */ 536 KKASSERT(elm->internal.subtree_offset != 0); 537 538 error = hammer_cursor_down(cursor); 539 if (error) 540 break; 541 KKASSERT(cursor->index == 0); 542 /* reload stale pointer */ 543 node = cursor->node->ondisk; 544 545 /* this can assign -1 if the leaf was empty */ 546 cursor->index = node->count - 1; 547 continue; 548 } else { 549 elm = &node->elms[cursor->index]; 550 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 551 if (hammer_debug_btree) { 552 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 553 (long long)cursor->node->node_offset, 554 cursor->index, 555 (elm[0].leaf.base.btype ? 556 elm[0].leaf.base.btype : '?'), 557 (long long)elm[0].leaf.base.obj_id, 558 elm[0].leaf.base.rec_type, 559 (long long)elm[0].leaf.base.key, 560 elm[0].leaf.base.localization, 561 s 562 ); 563 } 564 if (s > 0) { 565 error = ENOENT; 566 break; 567 } 568 569 /* 570 * It shouldn't be possible to be seeked past key_end, 571 * even if the cursor got moved to a parent. 572 */ 573 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 574 575 /* 576 * Return the element 577 */ 578 switch(elm->leaf.base.btype) { 579 case HAMMER_BTREE_TYPE_RECORD: 580 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 581 hammer_btree_chkts(cursor->asof, &elm->base)) { 582 --cursor->index; 583 continue; 584 } 585 error = 0; 586 break; 587 default: 588 error = EINVAL; 589 break; 590 } 591 if (error) 592 break; 593 } 594 /* 595 * node pointer invalid after loop 596 */ 597 598 /* 599 * Return entry 600 */ 601 if (hammer_debug_btree) { 602 int i = cursor->index; 603 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 604 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 605 cursor->node, i, 606 (long long)elm->internal.base.obj_id, 607 elm->internal.base.rec_type, 608 (long long)elm->internal.base.key, 609 elm->internal.base.localization 610 ); 611 } 612 return(0); 613 } 614 return(error); 615 } 616 617 /* 618 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 619 * could not be found, EDEADLK if inserting and a retry is needed, and a 620 * fatal error otherwise. When retrying, the caller must terminate the 621 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 622 * 623 * The cursor is suitably positioned for a deletion on success, and suitably 624 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 625 * specified. 626 * 627 * The cursor may begin anywhere, the search will traverse the tree in 628 * either direction to locate the requested element. 629 * 630 * Most of the logic implementing historical searches is handled here. We 631 * do an initial lookup with create_tid set to the asof TID. Due to the 632 * way records are laid out, a backwards iteration may be required if 633 * ENOENT is returned to locate the historical record. Here's the 634 * problem: 635 * 636 * create_tid: 10 15 20 637 * LEAF1 LEAF2 638 * records: (11) (18) 639 * 640 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 641 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 642 * not visible and thus causes ENOENT to be returned. We really need 643 * to check record 11 in LEAF1. If it also fails then the search fails 644 * (e.g. it might represent the range 11-16 and thus still not match our 645 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 646 * further iterations. 647 * 648 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 649 * and the cursor->create_check TID if an iteration might be needed. 650 * In the above example create_check would be set to 14. 651 */ 652 int 653 hammer_btree_lookup(hammer_cursor_t cursor) 654 { 655 int error; 656 657 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 658 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 659 cursor->trans->sync_lock_refs > 0); 660 ++hammer_stats_btree_lookups; 661 if (cursor->flags & HAMMER_CURSOR_ASOF) { 662 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 663 cursor->key_beg.create_tid = cursor->asof; 664 for (;;) { 665 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 666 error = btree_search(cursor, 0); 667 if (error != ENOENT || 668 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 669 /* 670 * Stop if no error. 671 * Stop if error other then ENOENT. 672 * Stop if ENOENT and not special case. 673 */ 674 break; 675 } 676 if (hammer_debug_btree) { 677 kprintf("CREATE_CHECK %016llx\n", 678 (long long)cursor->create_check); 679 } 680 cursor->key_beg.create_tid = cursor->create_check; 681 /* loop */ 682 } 683 } else { 684 error = btree_search(cursor, 0); 685 } 686 if (error == 0) 687 error = hammer_btree_extract(cursor, cursor->flags); 688 return(error); 689 } 690 691 /* 692 * Execute the logic required to start an iteration. The first record 693 * located within the specified range is returned and iteration control 694 * flags are adjusted for successive hammer_btree_iterate() calls. 695 * 696 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 697 * in a loop without worrying about it. Higher-level merged searches will 698 * adjust the flag appropriately. 699 */ 700 int 701 hammer_btree_first(hammer_cursor_t cursor) 702 { 703 int error; 704 705 error = hammer_btree_lookup(cursor); 706 if (error == ENOENT) { 707 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 708 error = hammer_btree_iterate(cursor); 709 } 710 cursor->flags |= HAMMER_CURSOR_ATEDISK; 711 return(error); 712 } 713 714 /* 715 * Similarly but for an iteration in the reverse direction. 716 * 717 * Set ATEDISK when iterating backwards to skip the current entry, 718 * which after an ENOENT lookup will be pointing beyond our end point. 719 * 720 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 721 * in a loop without worrying about it. Higher-level merged searches will 722 * adjust the flag appropriately. 723 */ 724 int 725 hammer_btree_last(hammer_cursor_t cursor) 726 { 727 struct hammer_base_elm save; 728 int error; 729 730 save = cursor->key_beg; 731 cursor->key_beg = cursor->key_end; 732 error = hammer_btree_lookup(cursor); 733 cursor->key_beg = save; 734 if (error == ENOENT || 735 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 736 cursor->flags |= HAMMER_CURSOR_ATEDISK; 737 error = hammer_btree_iterate_reverse(cursor); 738 } 739 cursor->flags |= HAMMER_CURSOR_ATEDISK; 740 return(error); 741 } 742 743 /* 744 * Extract the record and/or data associated with the cursor's current 745 * position. Any prior record or data stored in the cursor is replaced. 746 * The cursor must be positioned at a leaf node. 747 * 748 * NOTE: All extractions occur at the leaf of the B-Tree. 749 */ 750 int 751 hammer_btree_extract(hammer_cursor_t cursor, int flags) 752 { 753 hammer_node_ondisk_t node; 754 hammer_btree_elm_t elm; 755 hammer_off_t data_off; 756 hammer_mount_t hmp; 757 int32_t data_len; 758 int error; 759 760 /* 761 * The case where the data reference resolves to the same buffer 762 * as the record reference must be handled. 763 */ 764 node = cursor->node->ondisk; 765 elm = &node->elms[cursor->index]; 766 cursor->data = NULL; 767 hmp = cursor->node->hmp; 768 769 /* 770 * There is nothing to extract for an internal element. 771 */ 772 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 773 return(EINVAL); 774 775 /* 776 * Only record types have data. 777 */ 778 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 779 cursor->leaf = &elm->leaf; 780 781 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 782 return(0); 783 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 784 return(0); 785 data_off = elm->leaf.data_offset; 786 data_len = elm->leaf.data_len; 787 if (data_off == 0) 788 return(0); 789 790 /* 791 * Load the data 792 */ 793 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 794 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 795 &error, &cursor->data_buffer); 796 797 /* 798 * Mark the data buffer as not being meta-data if it isn't 799 * meta-data (sometimes bulk data is accessed via a volume 800 * block device). 801 */ 802 if (error == 0) { 803 switch(elm->leaf.base.rec_type) { 804 case HAMMER_RECTYPE_DATA: 805 case HAMMER_RECTYPE_DB: 806 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0) 807 break; 808 if (hammer_double_buffer == 0 || 809 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) { 810 hammer_io_notmeta(cursor->data_buffer); 811 } 812 break; 813 default: 814 break; 815 } 816 } 817 818 /* 819 * Deal with CRC errors on the extracted data. 820 */ 821 if (error == 0 && 822 hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) { 823 kprintf("CRC DATA @ %016llx/%d FAILED\n", 824 (long long)elm->leaf.data_offset, elm->leaf.data_len); 825 if (hammer_debug_critical) 826 Debugger("CRC FAILED: DATA"); 827 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 828 error = EDOM; /* less critical (mirroring) */ 829 else 830 error = EIO; /* critical */ 831 } 832 return(error); 833 } 834 835 836 /* 837 * Insert a leaf element into the B-Tree at the current cursor position. 838 * The cursor is positioned such that the element at and beyond the cursor 839 * are shifted to make room for the new record. 840 * 841 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 842 * flag set and that call must return ENOENT before this function can be 843 * called. 844 * 845 * The caller may depend on the cursor's exclusive lock after return to 846 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 847 * 848 * ENOSPC is returned if there is no room to insert a new record. 849 */ 850 int 851 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 852 int *doprop) 853 { 854 hammer_node_ondisk_t node; 855 int i; 856 int error; 857 858 *doprop = 0; 859 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 860 return(error); 861 ++hammer_stats_btree_inserts; 862 863 /* 864 * Insert the element at the leaf node and update the count in the 865 * parent. It is possible for parent to be NULL, indicating that 866 * the filesystem's ROOT B-Tree node is a leaf itself, which is 867 * possible. The root inode can never be deleted so the leaf should 868 * never be empty. 869 * 870 * Remember that the right-hand boundary is not included in the 871 * count. 872 */ 873 hammer_modify_node_all(cursor->trans, cursor->node); 874 node = cursor->node->ondisk; 875 i = cursor->index; 876 KKASSERT(elm->base.btype != 0); 877 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 878 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 879 if (i != node->count) { 880 bcopy(&node->elms[i], &node->elms[i+1], 881 (node->count - i) * sizeof(*elm)); 882 } 883 node->elms[i].leaf = *elm; 884 ++node->count; 885 hammer_cursor_inserted_element(cursor->node, i); 886 887 /* 888 * Update the leaf node's aggregate mirror_tid for mirroring 889 * support. 890 */ 891 if (node->mirror_tid < elm->base.delete_tid) { 892 node->mirror_tid = elm->base.delete_tid; 893 *doprop = 1; 894 } 895 if (node->mirror_tid < elm->base.create_tid) { 896 node->mirror_tid = elm->base.create_tid; 897 *doprop = 1; 898 } 899 hammer_modify_node_done(cursor->node); 900 901 /* 902 * Debugging sanity checks. 903 */ 904 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 905 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 906 if (i) { 907 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 908 } 909 if (i != node->count - 1) 910 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 911 912 return(0); 913 } 914 915 /* 916 * Delete a record from the B-Tree at the current cursor position. 917 * The cursor is positioned such that the current element is the one 918 * to be deleted. 919 * 920 * On return the cursor will be positioned after the deleted element and 921 * MAY point to an internal node. It will be suitable for the continuation 922 * of an iteration but not for an insertion or deletion. 923 * 924 * Deletions will attempt to partially rebalance the B-Tree in an upward 925 * direction, but will terminate rather then deadlock. Empty internal nodes 926 * are never allowed by a deletion which deadlocks may end up giving us an 927 * empty leaf. The pruner will clean up and rebalance the tree. 928 * 929 * This function can return EDEADLK, requiring the caller to retry the 930 * operation after clearing the deadlock. 931 */ 932 int 933 hammer_btree_delete(hammer_cursor_t cursor) 934 { 935 hammer_node_ondisk_t ondisk; 936 hammer_node_t node; 937 hammer_node_t parent __debugvar; 938 int error; 939 int i; 940 941 KKASSERT (cursor->trans->sync_lock_refs > 0); 942 if ((error = hammer_cursor_upgrade(cursor)) != 0) 943 return(error); 944 ++hammer_stats_btree_deletes; 945 946 /* 947 * Delete the element from the leaf node. 948 * 949 * Remember that leaf nodes do not have boundaries. 950 */ 951 node = cursor->node; 952 ondisk = node->ondisk; 953 i = cursor->index; 954 955 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 956 KKASSERT(i >= 0 && i < ondisk->count); 957 hammer_modify_node_all(cursor->trans, node); 958 if (i + 1 != ondisk->count) { 959 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 960 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 961 } 962 --ondisk->count; 963 hammer_modify_node_done(node); 964 hammer_cursor_deleted_element(node, i); 965 966 /* 967 * Validate local parent 968 */ 969 if (ondisk->parent) { 970 parent = cursor->parent; 971 972 KKASSERT(parent != NULL); 973 KKASSERT(parent->node_offset == ondisk->parent); 974 } 975 976 /* 977 * If the leaf becomes empty it must be detached from the parent, 978 * potentially recursing through to the filesystem root. 979 * 980 * This may reposition the cursor at one of the parent's of the 981 * current node. 982 * 983 * Ignore deadlock errors, that simply means that btree_remove 984 * was unable to recurse and had to leave us with an empty leaf. 985 */ 986 KKASSERT(cursor->index <= ondisk->count); 987 if (ondisk->count == 0) { 988 error = btree_remove(cursor); 989 if (error == EDEADLK) 990 error = 0; 991 } else { 992 error = 0; 993 } 994 KKASSERT(cursor->parent == NULL || 995 cursor->parent_index < cursor->parent->ondisk->count); 996 return(error); 997 } 998 999 /* 1000 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE 1001 * 1002 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 1003 * 1004 * The search can begin ANYWHERE in the B-Tree. As a first step the search 1005 * iterates up the tree as necessary to properly position itself prior to 1006 * actually doing the sarch. 1007 * 1008 * INSERTIONS: The search will split full nodes and leaves on its way down 1009 * and guarentee that the leaf it ends up on is not full. If we run out 1010 * of space the search continues to the leaf (to position the cursor for 1011 * the spike), but ENOSPC is returned. 1012 * 1013 * The search is only guarenteed to end up on a leaf if an error code of 0 1014 * is returned, or if inserting and an error code of ENOENT is returned. 1015 * Otherwise it can stop at an internal node. On success a search returns 1016 * a leaf node. 1017 * 1018 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 1019 * filesystem, and it is not simple code. Please note the following facts: 1020 * 1021 * - Internal node recursions have a boundary on the left AND right. The 1022 * right boundary is non-inclusive. The create_tid is a generic part 1023 * of the key for internal nodes. 1024 * 1025 * - Leaf nodes contain terminal elements only now. 1026 * 1027 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 1028 * historical search. ASOF and INSERT are mutually exclusive. When 1029 * doing an as-of lookup btree_search() checks for a right-edge boundary 1030 * case. If while recursing down the left-edge differs from the key 1031 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 1032 * with cursor->create_check. This is used by btree_lookup() to iterate. 1033 * The iteration backwards because as-of searches can wind up going 1034 * down the wrong branch of the B-Tree. 1035 */ 1036 static 1037 int 1038 btree_search(hammer_cursor_t cursor, int flags) 1039 { 1040 hammer_node_ondisk_t node; 1041 hammer_btree_elm_t elm; 1042 int error; 1043 int enospc = 0; 1044 int i; 1045 int r; 1046 int s; 1047 1048 flags |= cursor->flags; 1049 ++hammer_stats_btree_searches; 1050 1051 if (hammer_debug_btree) { 1052 kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n", 1053 (long long)cursor->node->node_offset, 1054 cursor->index, 1055 (long long)cursor->key_beg.obj_id, 1056 cursor->key_beg.rec_type, 1057 (long long)cursor->key_beg.key, 1058 (long long)cursor->key_beg.create_tid, 1059 cursor->key_beg.localization, 1060 curthread 1061 ); 1062 if (cursor->parent) 1063 kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n", 1064 (long long)cursor->parent->node_offset, 1065 cursor->parent_index, 1066 (long long)cursor->left_bound->obj_id, 1067 (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id, 1068 (long long)cursor->right_bound->obj_id, 1069 (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id, 1070 cursor->left_bound, 1071 &cursor->parent->ondisk->elms[cursor->parent_index], 1072 cursor->right_bound, 1073 &cursor->parent->ondisk->elms[cursor->parent_index+1] 1074 ); 1075 } 1076 1077 /* 1078 * Move our cursor up the tree until we find a node whos range covers 1079 * the key we are trying to locate. 1080 * 1081 * The left bound is inclusive, the right bound is non-inclusive. 1082 * It is ok to cursor up too far. 1083 */ 1084 for (;;) { 1085 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 1086 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 1087 if (r >= 0 && s < 0) 1088 break; 1089 KKASSERT(cursor->parent); 1090 ++hammer_stats_btree_iterations; 1091 error = hammer_cursor_up(cursor); 1092 if (error) 1093 goto done; 1094 } 1095 1096 /* 1097 * The delete-checks below are based on node, not parent. Set the 1098 * initial delete-check based on the parent. 1099 */ 1100 if (r == 1) { 1101 KKASSERT(cursor->left_bound->create_tid != 1); 1102 cursor->create_check = cursor->left_bound->create_tid - 1; 1103 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1104 } 1105 1106 /* 1107 * We better have ended up with a node somewhere. 1108 */ 1109 KKASSERT(cursor->node != NULL); 1110 1111 /* 1112 * If we are inserting we can't start at a full node if the parent 1113 * is also full (because there is no way to split the node), 1114 * continue running up the tree until the requirement is satisfied 1115 * or we hit the root of the filesystem. 1116 * 1117 * (If inserting we aren't doing an as-of search so we don't have 1118 * to worry about create_check). 1119 */ 1120 while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1121 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 1122 if (btree_node_is_full(cursor->node->ondisk) == 0) 1123 break; 1124 } else { 1125 if (btree_node_is_full(cursor->node->ondisk) ==0) 1126 break; 1127 } 1128 if (cursor->node->ondisk->parent == 0 || 1129 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1130 break; 1131 } 1132 ++hammer_stats_btree_iterations; 1133 error = hammer_cursor_up(cursor); 1134 /* node may have become stale */ 1135 if (error) 1136 goto done; 1137 } 1138 1139 /* 1140 * Push down through internal nodes to locate the requested key. 1141 */ 1142 node = cursor->node->ondisk; 1143 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1144 /* 1145 * Scan the node to find the subtree index to push down into. 1146 * We go one-past, then back-up. 1147 * 1148 * We must proactively remove deleted elements which may 1149 * have been left over from a deadlocked btree_remove(). 1150 * 1151 * The left and right boundaries are included in the loop 1152 * in order to detect edge cases. 1153 * 1154 * If the separator only differs by create_tid (r == 1) 1155 * and we are doing an as-of search, we may end up going 1156 * down a branch to the left of the one containing the 1157 * desired key. This requires numerous special cases. 1158 */ 1159 ++hammer_stats_btree_iterations; 1160 if (hammer_debug_btree) { 1161 kprintf("SEARCH-I %016llx count=%d\n", 1162 (long long)cursor->node->node_offset, 1163 node->count); 1164 } 1165 1166 /* 1167 * Try to shortcut the search before dropping into the 1168 * linear loop. Locate the first node where r <= 1. 1169 */ 1170 i = hammer_btree_search_node(&cursor->key_beg, node); 1171 while (i <= node->count) { 1172 ++hammer_stats_btree_elements; 1173 elm = &node->elms[i]; 1174 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1175 if (hammer_debug_btree > 2) { 1176 kprintf(" IELM %p %d r=%d\n", 1177 &node->elms[i], i, r); 1178 } 1179 if (r < 0) 1180 break; 1181 if (r == 1) { 1182 KKASSERT(elm->base.create_tid != 1); 1183 cursor->create_check = elm->base.create_tid - 1; 1184 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1185 } 1186 ++i; 1187 } 1188 if (hammer_debug_btree) { 1189 kprintf("SEARCH-I preI=%d/%d r=%d\n", 1190 i, node->count, r); 1191 } 1192 1193 /* 1194 * These cases occur when the parent's idea of the boundary 1195 * is wider then the child's idea of the boundary, and 1196 * require special handling. If not inserting we can 1197 * terminate the search early for these cases but the 1198 * child's boundaries cannot be unconditionally modified. 1199 */ 1200 if (i == 0) { 1201 /* 1202 * If i == 0 the search terminated to the LEFT of the 1203 * left_boundary but to the RIGHT of the parent's left 1204 * boundary. 1205 */ 1206 u_int8_t save; 1207 1208 elm = &node->elms[0]; 1209 1210 /* 1211 * If we aren't inserting we can stop here. 1212 */ 1213 if ((flags & (HAMMER_CURSOR_INSERT | 1214 HAMMER_CURSOR_PRUNING)) == 0) { 1215 cursor->index = 0; 1216 return(ENOENT); 1217 } 1218 1219 /* 1220 * Correct a left-hand boundary mismatch. 1221 * 1222 * We can only do this if we can upgrade the lock, 1223 * and synchronized as a background cursor (i.e. 1224 * inserting or pruning). 1225 * 1226 * WARNING: We can only do this if inserting, i.e. 1227 * we are running on the backend. 1228 */ 1229 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1230 return(error); 1231 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1232 hammer_modify_node_field(cursor->trans, cursor->node, 1233 elms[0]); 1234 save = node->elms[0].base.btype; 1235 node->elms[0].base = *cursor->left_bound; 1236 node->elms[0].base.btype = save; 1237 hammer_modify_node_done(cursor->node); 1238 } else if (i == node->count + 1) { 1239 /* 1240 * If i == node->count + 1 the search terminated to 1241 * the RIGHT of the right boundary but to the LEFT 1242 * of the parent's right boundary. If we aren't 1243 * inserting we can stop here. 1244 * 1245 * Note that the last element in this case is 1246 * elms[i-2] prior to adjustments to 'i'. 1247 */ 1248 --i; 1249 if ((flags & (HAMMER_CURSOR_INSERT | 1250 HAMMER_CURSOR_PRUNING)) == 0) { 1251 cursor->index = i; 1252 return (ENOENT); 1253 } 1254 1255 /* 1256 * Correct a right-hand boundary mismatch. 1257 * (actual push-down record is i-2 prior to 1258 * adjustments to i). 1259 * 1260 * We can only do this if we can upgrade the lock, 1261 * and synchronized as a background cursor (i.e. 1262 * inserting or pruning). 1263 * 1264 * WARNING: We can only do this if inserting, i.e. 1265 * we are running on the backend. 1266 */ 1267 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1268 return(error); 1269 elm = &node->elms[i]; 1270 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1271 hammer_modify_node(cursor->trans, cursor->node, 1272 &elm->base, sizeof(elm->base)); 1273 elm->base = *cursor->right_bound; 1274 hammer_modify_node_done(cursor->node); 1275 --i; 1276 } else { 1277 /* 1278 * The push-down index is now i - 1. If we had 1279 * terminated on the right boundary this will point 1280 * us at the last element. 1281 */ 1282 --i; 1283 } 1284 cursor->index = i; 1285 elm = &node->elms[i]; 1286 1287 if (hammer_debug_btree) { 1288 kprintf("RESULT-I %016llx[%d] %016llx %02x " 1289 "key=%016llx cre=%016llx lo=%02x\n", 1290 (long long)cursor->node->node_offset, 1291 i, 1292 (long long)elm->internal.base.obj_id, 1293 elm->internal.base.rec_type, 1294 (long long)elm->internal.base.key, 1295 (long long)elm->internal.base.create_tid, 1296 elm->internal.base.localization 1297 ); 1298 } 1299 1300 /* 1301 * We better have a valid subtree offset. 1302 */ 1303 KKASSERT(elm->internal.subtree_offset != 0); 1304 1305 /* 1306 * Handle insertion and deletion requirements. 1307 * 1308 * If inserting split full nodes. The split code will 1309 * adjust cursor->node and cursor->index if the current 1310 * index winds up in the new node. 1311 * 1312 * If inserting and a left or right edge case was detected, 1313 * we cannot correct the left or right boundary and must 1314 * prepend and append an empty leaf node in order to make 1315 * the boundary correction. 1316 * 1317 * If we run out of space we set enospc and continue on 1318 * to a leaf to provide the spike code with a good point 1319 * of entry. 1320 */ 1321 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1322 if (btree_node_is_full(node)) { 1323 error = btree_split_internal(cursor); 1324 if (error) { 1325 if (error != ENOSPC) 1326 goto done; 1327 enospc = 1; 1328 } 1329 /* 1330 * reload stale pointers 1331 */ 1332 i = cursor->index; 1333 node = cursor->node->ondisk; 1334 } 1335 } 1336 1337 /* 1338 * Push down (push into new node, existing node becomes 1339 * the parent) and continue the search. 1340 */ 1341 error = hammer_cursor_down(cursor); 1342 /* node may have become stale */ 1343 if (error) 1344 goto done; 1345 node = cursor->node->ondisk; 1346 } 1347 1348 /* 1349 * We are at a leaf, do a linear search of the key array. 1350 * 1351 * On success the index is set to the matching element and 0 1352 * is returned. 1353 * 1354 * On failure the index is set to the insertion point and ENOENT 1355 * is returned. 1356 * 1357 * Boundaries are not stored in leaf nodes, so the index can wind 1358 * up to the left of element 0 (index == 0) or past the end of 1359 * the array (index == node->count). It is also possible that the 1360 * leaf might be empty. 1361 */ 1362 ++hammer_stats_btree_iterations; 1363 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1364 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1365 if (hammer_debug_btree) { 1366 kprintf("SEARCH-L %016llx count=%d\n", 1367 (long long)cursor->node->node_offset, 1368 node->count); 1369 } 1370 1371 /* 1372 * Try to shortcut the search before dropping into the 1373 * linear loop. Locate the first node where r <= 1. 1374 */ 1375 i = hammer_btree_search_node(&cursor->key_beg, node); 1376 while (i < node->count) { 1377 ++hammer_stats_btree_elements; 1378 elm = &node->elms[i]; 1379 1380 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1381 1382 if (hammer_debug_btree > 1) 1383 kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r); 1384 1385 /* 1386 * We are at a record element. Stop if we've flipped past 1387 * key_beg, not counting the create_tid test. Allow the 1388 * r == 1 case (key_beg > element but differs only by its 1389 * create_tid) to fall through to the AS-OF check. 1390 */ 1391 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1392 1393 if (r < 0) 1394 goto failed; 1395 if (r > 1) { 1396 ++i; 1397 continue; 1398 } 1399 1400 /* 1401 * Check our as-of timestamp against the element. 1402 */ 1403 if (flags & HAMMER_CURSOR_ASOF) { 1404 if (hammer_btree_chkts(cursor->asof, 1405 &node->elms[i].base) != 0) { 1406 ++i; 1407 continue; 1408 } 1409 /* success */ 1410 } else { 1411 if (r > 0) { /* can only be +1 */ 1412 ++i; 1413 continue; 1414 } 1415 /* success */ 1416 } 1417 cursor->index = i; 1418 error = 0; 1419 if (hammer_debug_btree) { 1420 kprintf("RESULT-L %016llx[%d] (SUCCESS)\n", 1421 (long long)cursor->node->node_offset, i); 1422 } 1423 goto done; 1424 } 1425 1426 /* 1427 * The search of the leaf node failed. i is the insertion point. 1428 */ 1429 failed: 1430 if (hammer_debug_btree) { 1431 kprintf("RESULT-L %016llx[%d] (FAILED)\n", 1432 (long long)cursor->node->node_offset, i); 1433 } 1434 1435 /* 1436 * No exact match was found, i is now at the insertion point. 1437 * 1438 * If inserting split a full leaf before returning. This 1439 * may have the side effect of adjusting cursor->node and 1440 * cursor->index. 1441 */ 1442 cursor->index = i; 1443 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1444 btree_node_is_full(node)) { 1445 error = btree_split_leaf(cursor); 1446 if (error) { 1447 if (error != ENOSPC) 1448 goto done; 1449 enospc = 1; 1450 } 1451 /* 1452 * reload stale pointers 1453 */ 1454 /* NOT USED 1455 i = cursor->index; 1456 node = &cursor->node->internal; 1457 */ 1458 } 1459 1460 /* 1461 * We reached a leaf but did not find the key we were looking for. 1462 * If this is an insert we will be properly positioned for an insert 1463 * (ENOENT) or spike (ENOSPC) operation. 1464 */ 1465 error = enospc ? ENOSPC : ENOENT; 1466 done: 1467 return(error); 1468 } 1469 1470 /* 1471 * Heuristical search for the first element whos comparison is <= 1. May 1472 * return an index whos compare result is > 1 but may only return an index 1473 * whos compare result is <= 1 if it is the first element with that result. 1474 */ 1475 int 1476 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1477 { 1478 int b; 1479 int s; 1480 int i; 1481 int r; 1482 1483 /* 1484 * Don't bother if the node does not have very many elements 1485 */ 1486 b = 0; 1487 s = node->count; 1488 while (s - b > 4) { 1489 i = b + (s - b) / 2; 1490 ++hammer_stats_btree_elements; 1491 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1492 if (r <= 1) { 1493 s = i; 1494 } else { 1495 b = i; 1496 } 1497 } 1498 return(b); 1499 } 1500 1501 1502 /************************************************************************ 1503 * SPLITTING AND MERGING * 1504 ************************************************************************ 1505 * 1506 * These routines do all the dirty work required to split and merge nodes. 1507 */ 1508 1509 /* 1510 * Split an internal node into two nodes and move the separator at the split 1511 * point to the parent. 1512 * 1513 * (cursor->node, cursor->index) indicates the element the caller intends 1514 * to push into. We will adjust node and index if that element winds 1515 * up in the split node. 1516 * 1517 * If we are at the root of the filesystem a new root must be created with 1518 * two elements, one pointing to the original root and one pointing to the 1519 * newly allocated split node. 1520 */ 1521 static 1522 int 1523 btree_split_internal(hammer_cursor_t cursor) 1524 { 1525 hammer_node_ondisk_t ondisk; 1526 hammer_node_t node; 1527 hammer_node_t parent; 1528 hammer_node_t new_node; 1529 hammer_btree_elm_t elm; 1530 hammer_btree_elm_t parent_elm; 1531 struct hammer_node_lock lockroot; 1532 hammer_mount_t hmp = cursor->trans->hmp; 1533 int parent_index; 1534 int made_root; 1535 int split; 1536 int error; 1537 int i; 1538 const int esize = sizeof(*elm); 1539 1540 hammer_node_lock_init(&lockroot, cursor->node); 1541 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL); 1542 if (error) 1543 goto done; 1544 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1545 goto done; 1546 ++hammer_stats_btree_splits; 1547 1548 /* 1549 * Calculate the split point. If the insertion point is at the 1550 * end of the leaf we adjust the split point significantly to the 1551 * right to try to optimize node fill and flag it. If we hit 1552 * that same leaf again our heuristic failed and we don't try 1553 * to optimize node fill (it could lead to a degenerate case). 1554 */ 1555 node = cursor->node; 1556 ondisk = node->ondisk; 1557 KKASSERT(ondisk->count > 4); 1558 if (cursor->index == ondisk->count && 1559 (node->flags & HAMMER_NODE_NONLINEAR) == 0) { 1560 split = (ondisk->count + 1) * 3 / 4; 1561 node->flags |= HAMMER_NODE_NONLINEAR; 1562 } else { 1563 /* 1564 * We are splitting but elms[split] will be promoted to 1565 * the parent, leaving the right hand node with one less 1566 * element. If the insertion point will be on the 1567 * left-hand side adjust the split point to give the 1568 * right hand side one additional node. 1569 */ 1570 split = (ondisk->count + 1) / 2; 1571 if (cursor->index <= split) 1572 --split; 1573 } 1574 1575 /* 1576 * If we are at the root of the filesystem, create a new root node 1577 * with 1 element and split normally. Avoid making major 1578 * modifications until we know the whole operation will work. 1579 */ 1580 if (ondisk->parent == 0) { 1581 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1582 if (parent == NULL) 1583 goto done; 1584 hammer_lock_ex(&parent->lock); 1585 hammer_modify_node_noundo(cursor->trans, parent); 1586 ondisk = parent->ondisk; 1587 ondisk->count = 1; 1588 ondisk->parent = 0; 1589 ondisk->mirror_tid = node->ondisk->mirror_tid; 1590 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1591 ondisk->elms[0].base = hmp->root_btree_beg; 1592 ondisk->elms[0].base.btype = node->ondisk->type; 1593 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1594 ondisk->elms[1].base = hmp->root_btree_end; 1595 hammer_modify_node_done(parent); 1596 /* ondisk->elms[1].base.btype - not used */ 1597 made_root = 1; 1598 parent_index = 0; /* index of current node in parent */ 1599 } else { 1600 made_root = 0; 1601 parent = cursor->parent; 1602 parent_index = cursor->parent_index; 1603 } 1604 1605 /* 1606 * Split node into new_node at the split point. 1607 * 1608 * B O O O P N N B <-- P = node->elms[split] (index 4) 1609 * 0 1 2 3 4 5 6 <-- subtree indices 1610 * 1611 * x x P x x 1612 * s S S s 1613 * / \ 1614 * B O O O B B N N B <--- inner boundary points are 'P' 1615 * 0 1 2 3 4 5 6 1616 */ 1617 new_node = hammer_alloc_btree(cursor->trans, 0, &error); 1618 if (new_node == NULL) { 1619 if (made_root) { 1620 hammer_unlock(&parent->lock); 1621 hammer_delete_node(cursor->trans, parent); 1622 hammer_rel_node(parent); 1623 } 1624 goto done; 1625 } 1626 hammer_lock_ex(&new_node->lock); 1627 1628 /* 1629 * Create the new node. P becomes the left-hand boundary in the 1630 * new node. Copy the right-hand boundary as well. 1631 * 1632 * elm is the new separator. 1633 */ 1634 hammer_modify_node_noundo(cursor->trans, new_node); 1635 hammer_modify_node_all(cursor->trans, node); 1636 ondisk = node->ondisk; 1637 elm = &ondisk->elms[split]; 1638 bcopy(elm, &new_node->ondisk->elms[0], 1639 (ondisk->count - split + 1) * esize); 1640 new_node->ondisk->count = ondisk->count - split; 1641 new_node->ondisk->parent = parent->node_offset; 1642 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1643 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1644 KKASSERT(ondisk->type == new_node->ondisk->type); 1645 hammer_cursor_split_node(node, new_node, split); 1646 1647 /* 1648 * Cleanup the original node. Elm (P) becomes the new boundary, 1649 * its subtree_offset was moved to the new node. If we had created 1650 * a new root its parent pointer may have changed. 1651 */ 1652 elm->internal.subtree_offset = 0; 1653 ondisk->count = split; 1654 1655 /* 1656 * Insert the separator into the parent, fixup the parent's 1657 * reference to the original node, and reference the new node. 1658 * The separator is P. 1659 * 1660 * Remember that base.count does not include the right-hand boundary. 1661 */ 1662 hammer_modify_node_all(cursor->trans, parent); 1663 ondisk = parent->ondisk; 1664 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1665 parent_elm = &ondisk->elms[parent_index+1]; 1666 bcopy(parent_elm, parent_elm + 1, 1667 (ondisk->count - parent_index) * esize); 1668 parent_elm->internal.base = elm->base; /* separator P */ 1669 parent_elm->internal.base.btype = new_node->ondisk->type; 1670 parent_elm->internal.subtree_offset = new_node->node_offset; 1671 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1672 ++ondisk->count; 1673 hammer_modify_node_done(parent); 1674 hammer_cursor_inserted_element(parent, parent_index + 1); 1675 1676 /* 1677 * The children of new_node need their parent pointer set to new_node. 1678 * The children have already been locked by 1679 * hammer_btree_lock_children(). 1680 */ 1681 for (i = 0; i < new_node->ondisk->count; ++i) { 1682 elm = &new_node->ondisk->elms[i]; 1683 error = btree_set_parent(cursor->trans, new_node, elm); 1684 if (error) { 1685 panic("btree_split_internal: btree-fixup problem"); 1686 } 1687 } 1688 hammer_modify_node_done(new_node); 1689 1690 /* 1691 * The filesystem's root B-Tree pointer may have to be updated. 1692 */ 1693 if (made_root) { 1694 hammer_volume_t volume; 1695 1696 volume = hammer_get_root_volume(hmp, &error); 1697 KKASSERT(error == 0); 1698 1699 hammer_modify_volume_field(cursor->trans, volume, 1700 vol0_btree_root); 1701 volume->ondisk->vol0_btree_root = parent->node_offset; 1702 hammer_modify_volume_done(volume); 1703 node->ondisk->parent = parent->node_offset; 1704 if (cursor->parent) { 1705 hammer_unlock(&cursor->parent->lock); 1706 hammer_rel_node(cursor->parent); 1707 } 1708 cursor->parent = parent; /* lock'd and ref'd */ 1709 hammer_rel_volume(volume, 0); 1710 } 1711 hammer_modify_node_done(node); 1712 1713 /* 1714 * Ok, now adjust the cursor depending on which element the original 1715 * index was pointing at. If we are >= the split point the push node 1716 * is now in the new node. 1717 * 1718 * NOTE: If we are at the split point itself we cannot stay with the 1719 * original node because the push index will point at the right-hand 1720 * boundary, which is illegal. 1721 * 1722 * NOTE: The cursor's parent or parent_index must be adjusted for 1723 * the case where a new parent (new root) was created, and the case 1724 * where the cursor is now pointing at the split node. 1725 */ 1726 if (cursor->index >= split) { 1727 cursor->parent_index = parent_index + 1; 1728 cursor->index -= split; 1729 hammer_unlock(&cursor->node->lock); 1730 hammer_rel_node(cursor->node); 1731 cursor->node = new_node; /* locked and ref'd */ 1732 } else { 1733 cursor->parent_index = parent_index; 1734 hammer_unlock(&new_node->lock); 1735 hammer_rel_node(new_node); 1736 } 1737 1738 /* 1739 * Fixup left and right bounds 1740 */ 1741 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1742 cursor->left_bound = &parent_elm[0].internal.base; 1743 cursor->right_bound = &parent_elm[1].internal.base; 1744 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1745 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1746 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1747 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1748 1749 done: 1750 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL); 1751 hammer_cursor_downgrade(cursor); 1752 return (error); 1753 } 1754 1755 /* 1756 * Same as the above, but splits a full leaf node. 1757 * 1758 * This function 1759 */ 1760 static 1761 int 1762 btree_split_leaf(hammer_cursor_t cursor) 1763 { 1764 hammer_node_ondisk_t ondisk; 1765 hammer_node_t parent; 1766 hammer_node_t leaf; 1767 hammer_mount_t hmp; 1768 hammer_node_t new_leaf; 1769 hammer_btree_elm_t elm; 1770 hammer_btree_elm_t parent_elm; 1771 hammer_base_elm_t mid_boundary; 1772 int parent_index; 1773 int made_root; 1774 int split; 1775 int error; 1776 const size_t esize = sizeof(*elm); 1777 1778 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1779 return(error); 1780 ++hammer_stats_btree_splits; 1781 1782 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1783 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1784 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1785 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1786 1787 /* 1788 * Calculate the split point. If the insertion point is at the 1789 * end of the leaf we adjust the split point significantly to the 1790 * right to try to optimize node fill and flag it. If we hit 1791 * that same leaf again our heuristic failed and we don't try 1792 * to optimize node fill (it could lead to a degenerate case). 1793 * 1794 * Spikes are made up of two leaf elements which cannot be 1795 * safely split. 1796 */ 1797 leaf = cursor->node; 1798 ondisk = leaf->ondisk; 1799 KKASSERT(ondisk->count > 4); 1800 if (cursor->index == ondisk->count && 1801 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) { 1802 split = (ondisk->count + 1) * 3 / 4; 1803 leaf->flags |= HAMMER_NODE_NONLINEAR; 1804 } else { 1805 split = (ondisk->count + 1) / 2; 1806 } 1807 1808 #if 0 1809 /* 1810 * If the insertion point is at the split point shift the 1811 * split point left so we don't have to worry about 1812 */ 1813 if (cursor->index == split) 1814 --split; 1815 #endif 1816 KKASSERT(split > 0 && split < ondisk->count); 1817 1818 error = 0; 1819 hmp = leaf->hmp; 1820 1821 elm = &ondisk->elms[split]; 1822 1823 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1824 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1825 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1826 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1827 1828 /* 1829 * If we are at the root of the tree, create a new root node with 1830 * 1 element and split normally. Avoid making major modifications 1831 * until we know the whole operation will work. 1832 */ 1833 if (ondisk->parent == 0) { 1834 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1835 if (parent == NULL) 1836 goto done; 1837 hammer_lock_ex(&parent->lock); 1838 hammer_modify_node_noundo(cursor->trans, parent); 1839 ondisk = parent->ondisk; 1840 ondisk->count = 1; 1841 ondisk->parent = 0; 1842 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1843 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1844 ondisk->elms[0].base = hmp->root_btree_beg; 1845 ondisk->elms[0].base.btype = leaf->ondisk->type; 1846 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1847 ondisk->elms[1].base = hmp->root_btree_end; 1848 /* ondisk->elms[1].base.btype = not used */ 1849 hammer_modify_node_done(parent); 1850 made_root = 1; 1851 parent_index = 0; /* insertion point in parent */ 1852 } else { 1853 made_root = 0; 1854 parent = cursor->parent; 1855 parent_index = cursor->parent_index; 1856 } 1857 1858 /* 1859 * Split leaf into new_leaf at the split point. Select a separator 1860 * value in-between the two leafs but with a bent towards the right 1861 * leaf since comparisons use an 'elm >= separator' inequality. 1862 * 1863 * L L L L L L L L 1864 * 1865 * x x P x x 1866 * s S S s 1867 * / \ 1868 * L L L L L L L L 1869 */ 1870 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error); 1871 if (new_leaf == NULL) { 1872 if (made_root) { 1873 hammer_unlock(&parent->lock); 1874 hammer_delete_node(cursor->trans, parent); 1875 hammer_rel_node(parent); 1876 } 1877 goto done; 1878 } 1879 hammer_lock_ex(&new_leaf->lock); 1880 1881 /* 1882 * Create the new node and copy the leaf elements from the split 1883 * point on to the new node. 1884 */ 1885 hammer_modify_node_all(cursor->trans, leaf); 1886 hammer_modify_node_noundo(cursor->trans, new_leaf); 1887 ondisk = leaf->ondisk; 1888 elm = &ondisk->elms[split]; 1889 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1890 new_leaf->ondisk->count = ondisk->count - split; 1891 new_leaf->ondisk->parent = parent->node_offset; 1892 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1893 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1894 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1895 hammer_modify_node_done(new_leaf); 1896 hammer_cursor_split_node(leaf, new_leaf, split); 1897 1898 /* 1899 * Cleanup the original node. Because this is a leaf node and 1900 * leaf nodes do not have a right-hand boundary, there 1901 * aren't any special edge cases to clean up. We just fixup the 1902 * count. 1903 */ 1904 ondisk->count = split; 1905 1906 /* 1907 * Insert the separator into the parent, fixup the parent's 1908 * reference to the original node, and reference the new node. 1909 * The separator is P. 1910 * 1911 * Remember that base.count does not include the right-hand boundary. 1912 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1913 */ 1914 hammer_modify_node_all(cursor->trans, parent); 1915 ondisk = parent->ondisk; 1916 KKASSERT(split != 0); 1917 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1918 parent_elm = &ondisk->elms[parent_index+1]; 1919 bcopy(parent_elm, parent_elm + 1, 1920 (ondisk->count - parent_index) * esize); 1921 1922 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1923 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1924 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1925 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1926 mid_boundary = &parent_elm->base; 1927 ++ondisk->count; 1928 hammer_modify_node_done(parent); 1929 hammer_cursor_inserted_element(parent, parent_index + 1); 1930 1931 /* 1932 * The filesystem's root B-Tree pointer may have to be updated. 1933 */ 1934 if (made_root) { 1935 hammer_volume_t volume; 1936 1937 volume = hammer_get_root_volume(hmp, &error); 1938 KKASSERT(error == 0); 1939 1940 hammer_modify_volume_field(cursor->trans, volume, 1941 vol0_btree_root); 1942 volume->ondisk->vol0_btree_root = parent->node_offset; 1943 hammer_modify_volume_done(volume); 1944 leaf->ondisk->parent = parent->node_offset; 1945 if (cursor->parent) { 1946 hammer_unlock(&cursor->parent->lock); 1947 hammer_rel_node(cursor->parent); 1948 } 1949 cursor->parent = parent; /* lock'd and ref'd */ 1950 hammer_rel_volume(volume, 0); 1951 } 1952 hammer_modify_node_done(leaf); 1953 1954 /* 1955 * Ok, now adjust the cursor depending on which element the original 1956 * index was pointing at. If we are >= the split point the push node 1957 * is now in the new node. 1958 * 1959 * NOTE: If we are at the split point itself we need to select the 1960 * old or new node based on where key_beg's insertion point will be. 1961 * If we pick the wrong side the inserted element will wind up in 1962 * the wrong leaf node and outside that node's bounds. 1963 */ 1964 if (cursor->index > split || 1965 (cursor->index == split && 1966 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1967 cursor->parent_index = parent_index + 1; 1968 cursor->index -= split; 1969 hammer_unlock(&cursor->node->lock); 1970 hammer_rel_node(cursor->node); 1971 cursor->node = new_leaf; 1972 } else { 1973 cursor->parent_index = parent_index; 1974 hammer_unlock(&new_leaf->lock); 1975 hammer_rel_node(new_leaf); 1976 } 1977 1978 /* 1979 * Fixup left and right bounds 1980 */ 1981 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1982 cursor->left_bound = &parent_elm[0].internal.base; 1983 cursor->right_bound = &parent_elm[1].internal.base; 1984 1985 /* 1986 * Assert that the bounds are correct. 1987 */ 1988 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1989 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1990 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1991 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1992 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1993 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 1994 1995 done: 1996 hammer_cursor_downgrade(cursor); 1997 return (error); 1998 } 1999 2000 #if 0 2001 2002 /* 2003 * Recursively correct the right-hand boundary's create_tid to (tid) as 2004 * long as the rest of the key matches. We have to recurse upward in 2005 * the tree as well as down the left side of each parent's right node. 2006 * 2007 * Return EDEADLK if we were only partially successful, forcing the caller 2008 * to try again. The original cursor is not modified. This routine can 2009 * also fail with EDEADLK if it is forced to throw away a portion of its 2010 * record history. 2011 * 2012 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2013 */ 2014 struct hammer_rhb { 2015 TAILQ_ENTRY(hammer_rhb) entry; 2016 hammer_node_t node; 2017 int index; 2018 }; 2019 2020 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2021 2022 int 2023 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2024 { 2025 struct hammer_mount *hmp; 2026 struct hammer_rhb_list rhb_list; 2027 hammer_base_elm_t elm; 2028 hammer_node_t orig_node; 2029 struct hammer_rhb *rhb; 2030 int orig_index; 2031 int error; 2032 2033 TAILQ_INIT(&rhb_list); 2034 hmp = cursor->trans->hmp; 2035 2036 /* 2037 * Save our position so we can restore it on return. This also 2038 * gives us a stable 'elm'. 2039 */ 2040 orig_node = cursor->node; 2041 hammer_ref_node(orig_node); 2042 hammer_lock_sh(&orig_node->lock); 2043 orig_index = cursor->index; 2044 elm = &orig_node->ondisk->elms[orig_index].base; 2045 2046 /* 2047 * Now build a list of parents going up, allocating a rhb 2048 * structure for each one. 2049 */ 2050 while (cursor->parent) { 2051 /* 2052 * Stop if we no longer have any right-bounds to fix up 2053 */ 2054 if (elm->obj_id != cursor->right_bound->obj_id || 2055 elm->rec_type != cursor->right_bound->rec_type || 2056 elm->key != cursor->right_bound->key) { 2057 break; 2058 } 2059 2060 /* 2061 * Stop if the right-hand bound's create_tid does not 2062 * need to be corrected. 2063 */ 2064 if (cursor->right_bound->create_tid >= tid) 2065 break; 2066 2067 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2068 rhb->node = cursor->parent; 2069 rhb->index = cursor->parent_index; 2070 hammer_ref_node(rhb->node); 2071 hammer_lock_sh(&rhb->node->lock); 2072 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2073 2074 hammer_cursor_up(cursor); 2075 } 2076 2077 /* 2078 * now safely adjust the right hand bound for each rhb. This may 2079 * also require taking the right side of the tree and iterating down 2080 * ITS left side. 2081 */ 2082 error = 0; 2083 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2084 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2085 if (error) 2086 break; 2087 TAILQ_REMOVE(&rhb_list, rhb, entry); 2088 hammer_unlock(&rhb->node->lock); 2089 hammer_rel_node(rhb->node); 2090 kfree(rhb, hmp->m_misc); 2091 2092 switch (cursor->node->ondisk->type) { 2093 case HAMMER_BTREE_TYPE_INTERNAL: 2094 /* 2095 * Right-boundary for parent at internal node 2096 * is one element to the right of the element whos 2097 * right boundary needs adjusting. We must then 2098 * traverse down the left side correcting any left 2099 * bounds (which may now be too far to the left). 2100 */ 2101 ++cursor->index; 2102 error = hammer_btree_correct_lhb(cursor, tid); 2103 break; 2104 default: 2105 panic("hammer_btree_correct_rhb(): Bad node type"); 2106 error = EINVAL; 2107 break; 2108 } 2109 } 2110 2111 /* 2112 * Cleanup 2113 */ 2114 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2115 TAILQ_REMOVE(&rhb_list, rhb, entry); 2116 hammer_unlock(&rhb->node->lock); 2117 hammer_rel_node(rhb->node); 2118 kfree(rhb, hmp->m_misc); 2119 } 2120 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2121 hammer_unlock(&orig_node->lock); 2122 hammer_rel_node(orig_node); 2123 return (error); 2124 } 2125 2126 /* 2127 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2128 * bound going downward starting at the current cursor position. 2129 * 2130 * This function does not restore the cursor after use. 2131 */ 2132 int 2133 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2134 { 2135 struct hammer_rhb_list rhb_list; 2136 hammer_base_elm_t elm; 2137 hammer_base_elm_t cmp; 2138 struct hammer_rhb *rhb; 2139 struct hammer_mount *hmp; 2140 int error; 2141 2142 TAILQ_INIT(&rhb_list); 2143 hmp = cursor->trans->hmp; 2144 2145 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2146 2147 /* 2148 * Record the node and traverse down the left-hand side for all 2149 * matching records needing a boundary correction. 2150 */ 2151 error = 0; 2152 for (;;) { 2153 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2154 rhb->node = cursor->node; 2155 rhb->index = cursor->index; 2156 hammer_ref_node(rhb->node); 2157 hammer_lock_sh(&rhb->node->lock); 2158 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2159 2160 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2161 /* 2162 * Nothing to traverse down if we are at the right 2163 * boundary of an internal node. 2164 */ 2165 if (cursor->index == cursor->node->ondisk->count) 2166 break; 2167 } else { 2168 elm = &cursor->node->ondisk->elms[cursor->index].base; 2169 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2170 break; 2171 panic("Illegal leaf record type %02x", elm->btype); 2172 } 2173 error = hammer_cursor_down(cursor); 2174 if (error) 2175 break; 2176 2177 elm = &cursor->node->ondisk->elms[cursor->index].base; 2178 if (elm->obj_id != cmp->obj_id || 2179 elm->rec_type != cmp->rec_type || 2180 elm->key != cmp->key) { 2181 break; 2182 } 2183 if (elm->create_tid >= tid) 2184 break; 2185 2186 } 2187 2188 /* 2189 * Now we can safely adjust the left-hand boundary from the bottom-up. 2190 * The last element we remove from the list is the caller's right hand 2191 * boundary, which must also be adjusted. 2192 */ 2193 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2194 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2195 if (error) 2196 break; 2197 TAILQ_REMOVE(&rhb_list, rhb, entry); 2198 hammer_unlock(&rhb->node->lock); 2199 hammer_rel_node(rhb->node); 2200 kfree(rhb, hmp->m_misc); 2201 2202 elm = &cursor->node->ondisk->elms[cursor->index].base; 2203 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2204 hammer_modify_node(cursor->trans, cursor->node, 2205 &elm->create_tid, 2206 sizeof(elm->create_tid)); 2207 elm->create_tid = tid; 2208 hammer_modify_node_done(cursor->node); 2209 } else { 2210 panic("hammer_btree_correct_lhb(): Bad element type"); 2211 } 2212 } 2213 2214 /* 2215 * Cleanup 2216 */ 2217 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2218 TAILQ_REMOVE(&rhb_list, rhb, entry); 2219 hammer_unlock(&rhb->node->lock); 2220 hammer_rel_node(rhb->node); 2221 kfree(rhb, hmp->m_misc); 2222 } 2223 return (error); 2224 } 2225 2226 #endif 2227 2228 /* 2229 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2230 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2231 * the operation due to a deadlock, or some other error. 2232 * 2233 * This routine is initially called with an empty leaf and may be 2234 * recursively called with single-element internal nodes. 2235 * 2236 * It should also be noted that when removing empty leaves we must be sure 2237 * to test and update mirror_tid because another thread may have deadlocked 2238 * against us (or someone) trying to propagate it up and cannot retry once 2239 * the node has been deleted. 2240 * 2241 * On return the cursor may end up pointing to an internal node, suitable 2242 * for further iteration but not for an immediate insertion or deletion. 2243 */ 2244 static int 2245 btree_remove(hammer_cursor_t cursor) 2246 { 2247 hammer_node_ondisk_t ondisk; 2248 hammer_btree_elm_t elm; 2249 hammer_node_t node; 2250 hammer_node_t parent; 2251 const int esize = sizeof(*elm); 2252 int error; 2253 2254 node = cursor->node; 2255 2256 /* 2257 * When deleting the root of the filesystem convert it to 2258 * an empty leaf node. Internal nodes cannot be empty. 2259 */ 2260 ondisk = node->ondisk; 2261 if (ondisk->parent == 0) { 2262 KKASSERT(cursor->parent == NULL); 2263 hammer_modify_node_all(cursor->trans, node); 2264 KKASSERT(ondisk == node->ondisk); 2265 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2266 ondisk->count = 0; 2267 hammer_modify_node_done(node); 2268 cursor->index = 0; 2269 return(0); 2270 } 2271 2272 parent = cursor->parent; 2273 2274 /* 2275 * Attempt to remove the parent's reference to the child. If the 2276 * parent would become empty we have to recurse. If we fail we 2277 * leave the parent pointing to an empty leaf node. 2278 * 2279 * We have to recurse successfully before we can delete the internal 2280 * node as it is illegal to have empty internal nodes. Even though 2281 * the operation may be aborted we must still fixup any unlocked 2282 * cursors as if we had deleted the element prior to recursing 2283 * (by calling hammer_cursor_deleted_element()) so those cursors 2284 * are properly forced up the chain by the recursion. 2285 */ 2286 if (parent->ondisk->count == 1) { 2287 /* 2288 * This special cursor_up_locked() call leaves the original 2289 * node exclusively locked and referenced, leaves the 2290 * original parent locked (as the new node), and locks the 2291 * new parent. It can return EDEADLK. 2292 * 2293 * We cannot call hammer_cursor_removed_node() until we are 2294 * actually able to remove the node. If we did then tracked 2295 * cursors in the middle of iterations could be repointed 2296 * to a parent node. If this occurs they could end up 2297 * scanning newly inserted records into the node (that could 2298 * not be deleted) when they push down again. 2299 * 2300 * Due to the way the recursion works the final parent is left 2301 * in cursor->parent after the recursion returns. Each 2302 * layer on the way back up is thus able to call 2303 * hammer_cursor_removed_node() and 'jump' the node up to 2304 * the (same) final parent. 2305 * 2306 * NOTE! The local variable 'parent' is invalid after we 2307 * call hammer_cursor_up_locked(). 2308 */ 2309 error = hammer_cursor_up_locked(cursor); 2310 parent = NULL; 2311 2312 if (error == 0) { 2313 hammer_cursor_deleted_element(cursor->node, 0); 2314 error = btree_remove(cursor); 2315 if (error == 0) { 2316 KKASSERT(node != cursor->node); 2317 hammer_cursor_removed_node( 2318 node, cursor->node, 2319 cursor->index); 2320 hammer_modify_node_all(cursor->trans, node); 2321 ondisk = node->ondisk; 2322 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2323 ondisk->count = 0; 2324 hammer_modify_node_done(node); 2325 hammer_flush_node(node, 0); 2326 hammer_delete_node(cursor->trans, node); 2327 } else { 2328 /* 2329 * Defer parent removal because we could not 2330 * get the lock, just let the leaf remain 2331 * empty. 2332 */ 2333 /**/ 2334 } 2335 hammer_unlock(&node->lock); 2336 hammer_rel_node(node); 2337 } else { 2338 /* 2339 * Defer parent removal because we could not 2340 * get the lock, just let the leaf remain 2341 * empty. 2342 */ 2343 /**/ 2344 } 2345 } else { 2346 KKASSERT(parent->ondisk->count > 1); 2347 2348 hammer_modify_node_all(cursor->trans, parent); 2349 ondisk = parent->ondisk; 2350 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2351 2352 elm = &ondisk->elms[cursor->parent_index]; 2353 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2354 KKASSERT(ondisk->count > 0); 2355 2356 /* 2357 * We must retain the highest mirror_tid. The deleted 2358 * range is now encompassed by the element to the left. 2359 * If we are already at the left edge the new left edge 2360 * inherits mirror_tid. 2361 * 2362 * Note that bounds of the parent to our parent may create 2363 * a gap to the left of our left-most node or to the right 2364 * of our right-most node. The gap is silently included 2365 * in the mirror_tid's area of effect from the point of view 2366 * of the scan. 2367 */ 2368 if (cursor->parent_index) { 2369 if (elm[-1].internal.mirror_tid < 2370 elm[0].internal.mirror_tid) { 2371 elm[-1].internal.mirror_tid = 2372 elm[0].internal.mirror_tid; 2373 } 2374 } else { 2375 if (elm[1].internal.mirror_tid < 2376 elm[0].internal.mirror_tid) { 2377 elm[1].internal.mirror_tid = 2378 elm[0].internal.mirror_tid; 2379 } 2380 } 2381 2382 /* 2383 * Delete the subtree reference in the parent. Include 2384 * boundary element at end. 2385 */ 2386 bcopy(&elm[1], &elm[0], 2387 (ondisk->count - cursor->parent_index) * esize); 2388 --ondisk->count; 2389 hammer_modify_node_done(parent); 2390 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2391 hammer_cursor_deleted_element(parent, cursor->parent_index); 2392 hammer_flush_node(node, 0); 2393 hammer_delete_node(cursor->trans, node); 2394 2395 /* 2396 * cursor->node is invalid, cursor up to make the cursor 2397 * valid again. We have to flag the condition in case 2398 * another thread wiggles an insertion in during an 2399 * iteration. 2400 */ 2401 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 2402 error = hammer_cursor_up(cursor); 2403 } 2404 return (error); 2405 } 2406 2407 /* 2408 * Propagate cursor->trans->tid up the B-Tree starting at the current 2409 * cursor position using pseudofs info gleaned from the passed inode. 2410 * 2411 * The passed inode has no relationship to the cursor position other 2412 * then being in the same pseudofs as the insertion or deletion we 2413 * are propagating the mirror_tid for. 2414 * 2415 * WARNING! Because we push and pop the passed cursor, it may be 2416 * modified by other B-Tree operations while it is unlocked 2417 * and things like the node & leaf pointers, and indexes might 2418 * change. 2419 */ 2420 void 2421 hammer_btree_do_propagation(hammer_cursor_t cursor, 2422 hammer_pseudofs_inmem_t pfsm, 2423 hammer_btree_leaf_elm_t leaf) 2424 { 2425 hammer_cursor_t ncursor; 2426 hammer_tid_t mirror_tid; 2427 int error __debugvar; 2428 2429 /* 2430 * We do not propagate a mirror_tid if the filesystem was mounted 2431 * in no-mirror mode. 2432 */ 2433 if (cursor->trans->hmp->master_id < 0) 2434 return; 2435 2436 /* 2437 * This is a bit of a hack because we cannot deadlock or return 2438 * EDEADLK here. The related operation has already completed and 2439 * we must propagate the mirror_tid now regardless. 2440 * 2441 * Generate a new cursor which inherits the original's locks and 2442 * unlock the original. Use the new cursor to propagate the 2443 * mirror_tid. Then clean up the new cursor and reacquire locks 2444 * on the original. 2445 * 2446 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2447 * original's locks and the original is tracked and must be 2448 * re-locked. 2449 */ 2450 mirror_tid = cursor->node->ondisk->mirror_tid; 2451 KKASSERT(mirror_tid != 0); 2452 ncursor = hammer_push_cursor(cursor); 2453 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2454 KKASSERT(error == 0); 2455 hammer_pop_cursor(cursor, ncursor); 2456 /* WARNING: cursor's leaf pointer may change after pop */ 2457 } 2458 2459 2460 /* 2461 * Propagate a mirror TID update upwards through the B-Tree to the root. 2462 * 2463 * A locked internal node must be passed in. The node will remain locked 2464 * on return. 2465 * 2466 * This function syncs mirror_tid at the specified internal node's element, 2467 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2468 */ 2469 static int 2470 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2471 { 2472 hammer_btree_internal_elm_t elm; 2473 hammer_node_t node; 2474 int error; 2475 2476 for (;;) { 2477 error = hammer_cursor_up(cursor); 2478 if (error == 0) 2479 error = hammer_cursor_upgrade(cursor); 2480 2481 /* 2482 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2483 * cursor will still be properly positioned for 2484 * mirror propagation, just not for iterations. 2485 */ 2486 while (error == EDEADLK) { 2487 hammer_recover_cursor(cursor); 2488 error = hammer_cursor_upgrade(cursor); 2489 } 2490 if (error) 2491 break; 2492 2493 /* 2494 * If the cursor deadlocked it could end up at a leaf 2495 * after we lost the lock. 2496 */ 2497 node = cursor->node; 2498 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2499 continue; 2500 2501 /* 2502 * Adjust the node's element 2503 */ 2504 elm = &node->ondisk->elms[cursor->index].internal; 2505 if (elm->mirror_tid >= mirror_tid) 2506 break; 2507 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2508 sizeof(elm->mirror_tid)); 2509 elm->mirror_tid = mirror_tid; 2510 hammer_modify_node_done(node); 2511 if (hammer_debug_general & 0x0002) { 2512 kprintf("mirror_propagate: propagate " 2513 "%016llx @%016llx:%d\n", 2514 (long long)mirror_tid, 2515 (long long)node->node_offset, 2516 cursor->index); 2517 } 2518 2519 2520 /* 2521 * Adjust the node's mirror_tid aggregator 2522 */ 2523 if (node->ondisk->mirror_tid >= mirror_tid) 2524 return(0); 2525 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2526 node->ondisk->mirror_tid = mirror_tid; 2527 hammer_modify_node_done(node); 2528 if (hammer_debug_general & 0x0002) { 2529 kprintf("mirror_propagate: propagate " 2530 "%016llx @%016llx\n", 2531 (long long)mirror_tid, 2532 (long long)node->node_offset); 2533 } 2534 } 2535 if (error == ENOENT) 2536 error = 0; 2537 return(error); 2538 } 2539 2540 hammer_node_t 2541 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2542 int *parent_indexp, int *errorp, int try_exclusive) 2543 { 2544 hammer_node_t parent; 2545 hammer_btree_elm_t elm; 2546 int i; 2547 2548 /* 2549 * Get the node 2550 */ 2551 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2552 if (*errorp) { 2553 KKASSERT(parent == NULL); 2554 return(NULL); 2555 } 2556 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2557 2558 /* 2559 * Lock the node 2560 */ 2561 if (try_exclusive) { 2562 if (hammer_lock_ex_try(&parent->lock)) { 2563 hammer_rel_node(parent); 2564 *errorp = EDEADLK; 2565 return(NULL); 2566 } 2567 } else { 2568 hammer_lock_sh(&parent->lock); 2569 } 2570 2571 /* 2572 * Figure out which element in the parent is pointing to the 2573 * child. 2574 */ 2575 if (node->ondisk->count) { 2576 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2577 parent->ondisk); 2578 } else { 2579 i = 0; 2580 } 2581 while (i < parent->ondisk->count) { 2582 elm = &parent->ondisk->elms[i]; 2583 if (elm->internal.subtree_offset == node->node_offset) 2584 break; 2585 ++i; 2586 } 2587 if (i == parent->ondisk->count) { 2588 hammer_unlock(&parent->lock); 2589 panic("Bad B-Tree link: parent %p node %p", parent, node); 2590 } 2591 *parent_indexp = i; 2592 KKASSERT(*errorp == 0); 2593 return(parent); 2594 } 2595 2596 /* 2597 * The element (elm) has been moved to a new internal node (node). 2598 * 2599 * If the element represents a pointer to an internal node that node's 2600 * parent must be adjusted to the element's new location. 2601 * 2602 * XXX deadlock potential here with our exclusive locks 2603 */ 2604 int 2605 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2606 hammer_btree_elm_t elm) 2607 { 2608 hammer_node_t child; 2609 int error; 2610 2611 error = 0; 2612 2613 switch(elm->base.btype) { 2614 case HAMMER_BTREE_TYPE_INTERNAL: 2615 case HAMMER_BTREE_TYPE_LEAF: 2616 child = hammer_get_node(trans, elm->internal.subtree_offset, 2617 0, &error); 2618 if (error == 0) { 2619 hammer_modify_node_field(trans, child, parent); 2620 child->ondisk->parent = node->node_offset; 2621 hammer_modify_node_done(child); 2622 hammer_rel_node(child); 2623 } 2624 break; 2625 default: 2626 break; 2627 } 2628 return(error); 2629 } 2630 2631 /* 2632 * Initialize the root of a recursive B-Tree node lock list structure. 2633 */ 2634 void 2635 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2636 { 2637 TAILQ_INIT(&parent->list); 2638 parent->parent = NULL; 2639 parent->node = node; 2640 parent->index = -1; 2641 parent->count = node->ondisk->count; 2642 parent->copy = NULL; 2643 parent->flags = 0; 2644 } 2645 2646 /* 2647 * Initialize a cache of hammer_node_lock's including space allocated 2648 * for node copies. 2649 * 2650 * This is used by the rebalancing code to preallocate the copy space 2651 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2652 * locks, otherwise we can blow out the pageout daemon's emergency 2653 * reserve and deadlock it. 2654 * 2655 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2656 * The flag is set when the item is pulled off the cache for use. 2657 */ 2658 void 2659 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2660 int depth) 2661 { 2662 hammer_node_lock_t item; 2663 int count; 2664 2665 for (count = 1; depth; --depth) 2666 count *= HAMMER_BTREE_LEAF_ELMS; 2667 bzero(lcache, sizeof(*lcache)); 2668 TAILQ_INIT(&lcache->list); 2669 while (count) { 2670 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2671 item->copy = kmalloc(sizeof(*item->copy), 2672 hmp->m_misc, M_WAITOK); 2673 TAILQ_INIT(&item->list); 2674 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2675 --count; 2676 } 2677 } 2678 2679 void 2680 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2681 { 2682 hammer_node_lock_t item; 2683 2684 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2685 TAILQ_REMOVE(&lcache->list, item, entry); 2686 KKASSERT(item->copy); 2687 KKASSERT(TAILQ_EMPTY(&item->list)); 2688 kfree(item->copy, hmp->m_misc); 2689 kfree(item, hmp->m_misc); 2690 } 2691 KKASSERT(lcache->copy == NULL); 2692 } 2693 2694 /* 2695 * Exclusively lock all the children of node. This is used by the split 2696 * code to prevent anyone from accessing the children of a cursor node 2697 * while we fix-up its parent offset. 2698 * 2699 * If we don't lock the children we can really mess up cursors which block 2700 * trying to cursor-up into our node. 2701 * 2702 * On failure EDEADLK (or some other error) is returned. If a deadlock 2703 * error is returned the cursor is adjusted to block on termination. 2704 * 2705 * The caller is responsible for managing parent->node, the root's node 2706 * is usually aliased from a cursor. 2707 */ 2708 int 2709 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2710 hammer_node_lock_t parent, 2711 hammer_node_lock_t lcache) 2712 { 2713 hammer_node_t node; 2714 hammer_node_lock_t item; 2715 hammer_node_ondisk_t ondisk; 2716 hammer_btree_elm_t elm; 2717 hammer_node_t child; 2718 struct hammer_mount *hmp; 2719 int error; 2720 int i; 2721 2722 node = parent->node; 2723 ondisk = node->ondisk; 2724 error = 0; 2725 hmp = cursor->trans->hmp; 2726 2727 /* 2728 * We really do not want to block on I/O with exclusive locks held, 2729 * pre-get the children before trying to lock the mess. This is 2730 * only done one-level deep for now. 2731 */ 2732 for (i = 0; i < ondisk->count; ++i) { 2733 ++hammer_stats_btree_elements; 2734 elm = &ondisk->elms[i]; 2735 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2736 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2737 continue; 2738 } 2739 child = hammer_get_node(cursor->trans, 2740 elm->internal.subtree_offset, 2741 0, &error); 2742 if (child) 2743 hammer_rel_node(child); 2744 } 2745 2746 /* 2747 * Do it for real 2748 */ 2749 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2750 ++hammer_stats_btree_elements; 2751 elm = &ondisk->elms[i]; 2752 2753 switch(elm->base.btype) { 2754 case HAMMER_BTREE_TYPE_INTERNAL: 2755 case HAMMER_BTREE_TYPE_LEAF: 2756 KKASSERT(elm->internal.subtree_offset != 0); 2757 child = hammer_get_node(cursor->trans, 2758 elm->internal.subtree_offset, 2759 0, &error); 2760 break; 2761 default: 2762 child = NULL; 2763 break; 2764 } 2765 if (child) { 2766 if (hammer_lock_ex_try(&child->lock) != 0) { 2767 if (cursor->deadlk_node == NULL) { 2768 cursor->deadlk_node = child; 2769 hammer_ref_node(cursor->deadlk_node); 2770 } 2771 error = EDEADLK; 2772 hammer_rel_node(child); 2773 } else { 2774 if (lcache) { 2775 item = TAILQ_FIRST(&lcache->list); 2776 KKASSERT(item != NULL); 2777 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2778 TAILQ_REMOVE(&lcache->list, 2779 item, entry); 2780 } else { 2781 item = kmalloc(sizeof(*item), 2782 hmp->m_misc, 2783 M_WAITOK|M_ZERO); 2784 TAILQ_INIT(&item->list); 2785 } 2786 2787 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2788 item->parent = parent; 2789 item->node = child; 2790 item->index = i; 2791 item->count = child->ondisk->count; 2792 2793 /* 2794 * Recurse (used by the rebalancing code) 2795 */ 2796 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2797 error = hammer_btree_lock_children( 2798 cursor, 2799 depth - 1, 2800 item, 2801 lcache); 2802 } 2803 } 2804 } 2805 } 2806 if (error) 2807 hammer_btree_unlock_children(hmp, parent, lcache); 2808 return(error); 2809 } 2810 2811 /* 2812 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2813 * including the parent. 2814 */ 2815 void 2816 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2817 { 2818 hammer_mount_t hmp = cursor->trans->hmp; 2819 hammer_node_lock_t item; 2820 2821 if (parent->copy == NULL) { 2822 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2823 parent->copy = kmalloc(sizeof(*parent->copy), 2824 hmp->m_misc, M_WAITOK); 2825 } 2826 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2827 *parent->copy = *parent->node->ondisk; 2828 TAILQ_FOREACH(item, &parent->list, entry) { 2829 hammer_btree_lock_copy(cursor, item); 2830 } 2831 } 2832 2833 /* 2834 * Recursively sync modified copies to the media. 2835 */ 2836 int 2837 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2838 { 2839 hammer_node_lock_t item; 2840 int count = 0; 2841 2842 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2843 ++count; 2844 hammer_modify_node_all(cursor->trans, parent->node); 2845 *parent->node->ondisk = *parent->copy; 2846 hammer_modify_node_done(parent->node); 2847 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2848 hammer_flush_node(parent->node, 0); 2849 hammer_delete_node(cursor->trans, parent->node); 2850 } 2851 } 2852 TAILQ_FOREACH(item, &parent->list, entry) { 2853 count += hammer_btree_sync_copy(cursor, item); 2854 } 2855 return(count); 2856 } 2857 2858 /* 2859 * Release previously obtained node locks. The caller is responsible for 2860 * cleaning up parent->node itself (its usually just aliased from a cursor), 2861 * but this function will take care of the copies. 2862 * 2863 * NOTE: The root node is not placed in the lcache and node->copy is not 2864 * deallocated when lcache != NULL. 2865 */ 2866 void 2867 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent, 2868 hammer_node_lock_t lcache) 2869 { 2870 hammer_node_lock_t item; 2871 hammer_node_ondisk_t copy; 2872 2873 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2874 TAILQ_REMOVE(&parent->list, item, entry); 2875 hammer_btree_unlock_children(hmp, item, lcache); 2876 hammer_unlock(&item->node->lock); 2877 hammer_rel_node(item->node); 2878 if (lcache) { 2879 /* 2880 * NOTE: When placing the item back in the lcache 2881 * the flag is cleared by the bzero(). 2882 * Remaining fields are cleared as a safety 2883 * measure. 2884 */ 2885 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE); 2886 KKASSERT(TAILQ_EMPTY(&item->list)); 2887 copy = item->copy; 2888 bzero(item, sizeof(*item)); 2889 TAILQ_INIT(&item->list); 2890 item->copy = copy; 2891 if (copy) 2892 bzero(copy, sizeof(*copy)); 2893 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2894 } else { 2895 kfree(item, hmp->m_misc); 2896 } 2897 } 2898 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) { 2899 kfree(parent->copy, hmp->m_misc); 2900 parent->copy = NULL; /* safety */ 2901 } 2902 } 2903 2904 /************************************************************************ 2905 * MISCELLANIOUS SUPPORT * 2906 ************************************************************************/ 2907 2908 /* 2909 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2910 * 2911 * Note that for this particular function a return value of -1, 0, or +1 2912 * can denote a match if create_tid is otherwise discounted. A create_tid 2913 * of zero is considered to be 'infinity' in comparisons. 2914 * 2915 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2916 */ 2917 int 2918 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2919 { 2920 if (key1->localization < key2->localization) 2921 return(-5); 2922 if (key1->localization > key2->localization) 2923 return(5); 2924 2925 if (key1->obj_id < key2->obj_id) 2926 return(-4); 2927 if (key1->obj_id > key2->obj_id) 2928 return(4); 2929 2930 if (key1->rec_type < key2->rec_type) 2931 return(-3); 2932 if (key1->rec_type > key2->rec_type) 2933 return(3); 2934 2935 if (key1->key < key2->key) 2936 return(-2); 2937 if (key1->key > key2->key) 2938 return(2); 2939 2940 /* 2941 * A create_tid of zero indicates a record which is undeletable 2942 * and must be considered to have a value of positive infinity. 2943 */ 2944 if (key1->create_tid == 0) { 2945 if (key2->create_tid == 0) 2946 return(0); 2947 return(1); 2948 } 2949 if (key2->create_tid == 0) 2950 return(-1); 2951 if (key1->create_tid < key2->create_tid) 2952 return(-1); 2953 if (key1->create_tid > key2->create_tid) 2954 return(1); 2955 return(0); 2956 } 2957 2958 /* 2959 * Test a timestamp against an element to determine whether the 2960 * element is visible. A timestamp of 0 means 'infinity'. 2961 */ 2962 int 2963 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2964 { 2965 if (asof == 0) { 2966 if (base->delete_tid) 2967 return(1); 2968 return(0); 2969 } 2970 if (asof < base->create_tid) 2971 return(-1); 2972 if (base->delete_tid && asof >= base->delete_tid) 2973 return(1); 2974 return(0); 2975 } 2976 2977 /* 2978 * Create a separator half way inbetween key1 and key2. For fields just 2979 * one unit apart, the separator will match key2. key1 is on the left-hand 2980 * side and key2 is on the right-hand side. 2981 * 2982 * key2 must be >= the separator. It is ok for the separator to match key2. 2983 * 2984 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2985 * key2. 2986 * 2987 * NOTE: It might be beneficial to just scrap this whole mess and just 2988 * set the separator to key2. 2989 */ 2990 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 2991 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 2992 2993 static void 2994 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 2995 hammer_base_elm_t dest) 2996 { 2997 bzero(dest, sizeof(*dest)); 2998 2999 dest->rec_type = key2->rec_type; 3000 dest->key = key2->key; 3001 dest->obj_id = key2->obj_id; 3002 dest->create_tid = key2->create_tid; 3003 3004 MAKE_SEPARATOR(key1, key2, dest, localization); 3005 if (key1->localization == key2->localization) { 3006 MAKE_SEPARATOR(key1, key2, dest, obj_id); 3007 if (key1->obj_id == key2->obj_id) { 3008 MAKE_SEPARATOR(key1, key2, dest, rec_type); 3009 if (key1->rec_type == key2->rec_type) { 3010 MAKE_SEPARATOR(key1, key2, dest, key); 3011 /* 3012 * Don't bother creating a separator for 3013 * create_tid, which also conveniently avoids 3014 * having to handle the create_tid == 0 3015 * (infinity) case. Just leave create_tid 3016 * set to key2. 3017 * 3018 * Worst case, dest matches key2 exactly, 3019 * which is acceptable. 3020 */ 3021 } 3022 } 3023 } 3024 } 3025 3026 #undef MAKE_SEPARATOR 3027 3028 /* 3029 * Return whether a generic internal or leaf node is full 3030 */ 3031 static int 3032 btree_node_is_full(hammer_node_ondisk_t node) 3033 { 3034 switch(node->type) { 3035 case HAMMER_BTREE_TYPE_INTERNAL: 3036 if (node->count == HAMMER_BTREE_INT_ELMS) 3037 return(1); 3038 break; 3039 case HAMMER_BTREE_TYPE_LEAF: 3040 if (node->count == HAMMER_BTREE_LEAF_ELMS) 3041 return(1); 3042 break; 3043 default: 3044 panic("illegal btree subtype"); 3045 } 3046 return(0); 3047 } 3048 3049 #if 0 3050 static int 3051 btree_max_elements(u_int8_t type) 3052 { 3053 if (type == HAMMER_BTREE_TYPE_LEAF) 3054 return(HAMMER_BTREE_LEAF_ELMS); 3055 if (type == HAMMER_BTREE_TYPE_INTERNAL) 3056 return(HAMMER_BTREE_INT_ELMS); 3057 panic("btree_max_elements: bad type %d", type); 3058 } 3059 #endif 3060 3061 void 3062 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3063 { 3064 hammer_btree_elm_t elm; 3065 int i; 3066 3067 kprintf("node %p count=%d parent=%016llx type=%c\n", 3068 ondisk, ondisk->count, 3069 (long long)ondisk->parent, ondisk->type); 3070 3071 /* 3072 * Dump both boundary elements if an internal node 3073 */ 3074 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3075 for (i = 0; i <= ondisk->count; ++i) { 3076 elm = &ondisk->elms[i]; 3077 hammer_print_btree_elm(elm, ondisk->type, i); 3078 } 3079 } else { 3080 for (i = 0; i < ondisk->count; ++i) { 3081 elm = &ondisk->elms[i]; 3082 hammer_print_btree_elm(elm, ondisk->type, i); 3083 } 3084 } 3085 } 3086 3087 void 3088 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3089 { 3090 kprintf(" %2d", i); 3091 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3092 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3093 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3094 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3095 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3096 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3097 kprintf("\tbtype = %02x (%c)\n", 3098 elm->base.btype, 3099 (elm->base.btype ? elm->base.btype : '?')); 3100 kprintf("\tlocalization = %02x\n", elm->base.localization); 3101 3102 switch(type) { 3103 case HAMMER_BTREE_TYPE_INTERNAL: 3104 kprintf("\tsubtree_off = %016llx\n", 3105 (long long)elm->internal.subtree_offset); 3106 break; 3107 case HAMMER_BTREE_TYPE_RECORD: 3108 kprintf("\tdata_offset = %016llx\n", 3109 (long long)elm->leaf.data_offset); 3110 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3111 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3112 break; 3113 } 3114 } 3115