1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.76 2008/08/06 15:38:58 dillon Exp $ 35 */ 36 37 /* 38 * HAMMER B-Tree index 39 * 40 * HAMMER implements a modified B+Tree. In documentation this will 41 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 43 * of the tree), but adds two additional boundary elements which describe 44 * the left-most and right-most element a node is able to represent. In 45 * otherwords, we have boundary elements at the two ends of a B-Tree node 46 * instead of sub-tree pointers. 47 * 48 * A B-Tree internal node looks like this: 49 * 50 * B N N N N N N B <-- boundary and internal elements 51 * S S S S S S S <-- subtree pointers 52 * 53 * A B-Tree leaf node basically looks like this: 54 * 55 * L L L L L L L L <-- leaf elemenets 56 * 57 * The radix for an internal node is 1 less then a leaf but we get a 58 * number of significant benefits for our troubles. 59 * 60 * The big benefit to using a B-Tree containing boundary information 61 * is that it is possible to cache pointers into the middle of the tree 62 * and not have to start searches, insertions, OR deletions at the root 63 * node. In particular, searches are able to progress in a definitive 64 * direction from any point in the tree without revisting nodes. This 65 * greatly improves the efficiency of many operations, most especially 66 * record appends. 67 * 68 * B-Trees also make the stacking of trees fairly straightforward. 69 * 70 * INSERTIONS: A search performed with the intention of doing 71 * an insert will guarantee that the terminal leaf node is not full by 72 * splitting full nodes. Splits occur top-down during the dive down the 73 * B-Tree. 74 * 75 * DELETIONS: A deletion makes no attempt to proactively balance the 76 * tree and will recursively remove nodes that become empty. If a 77 * deadlock occurs a deletion may not be able to remove an empty leaf. 78 * Deletions never allow internal nodes to become empty (that would blow 79 * up the boundaries). 80 */ 81 #include "hammer.h" 82 #include <sys/buf.h> 83 #include <sys/buf2.h> 84 85 static int btree_search(hammer_cursor_t cursor, int flags); 86 static int btree_split_internal(hammer_cursor_t cursor); 87 static int btree_split_leaf(hammer_cursor_t cursor); 88 static int btree_remove(hammer_cursor_t cursor); 89 static int btree_node_is_full(hammer_node_ondisk_t node); 90 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 91 hammer_tid_t mirror_tid); 92 static void hammer_make_separator(hammer_base_elm_t key1, 93 hammer_base_elm_t key2, hammer_base_elm_t dest); 94 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 95 96 /* 97 * Iterate records after a search. The cursor is iterated forwards past 98 * the current record until a record matching the key-range requirements 99 * is found. ENOENT is returned if the iteration goes past the ending 100 * key. 101 * 102 * The iteration is inclusive of key_beg and can be inclusive or exclusive 103 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 104 * 105 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 106 * may be modified by B-Tree functions. 107 * 108 * cursor->key_beg may or may not be modified by this function during 109 * the iteration. XXX future - in case of an inverted lock we may have 110 * to reinitiate the lookup and set key_beg to properly pick up where we 111 * left off. 112 * 113 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 114 */ 115 int 116 hammer_btree_iterate(hammer_cursor_t cursor) 117 { 118 hammer_node_ondisk_t node; 119 hammer_btree_elm_t elm; 120 hammer_mount_t hmp; 121 int error = 0; 122 int r; 123 int s; 124 125 /* 126 * Skip past the current record 127 */ 128 hmp = cursor->trans->hmp; 129 node = cursor->node->ondisk; 130 if (node == NULL) 131 return(ENOENT); 132 if (cursor->index < node->count && 133 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 134 ++cursor->index; 135 } 136 137 /* 138 * HAMMER can wind up being cpu-bound. 139 */ 140 if (++hmp->check_yield > hammer_yield_check) { 141 hmp->check_yield = 0; 142 lwkt_user_yield(); 143 } 144 145 146 /* 147 * Loop until an element is found or we are done. 148 */ 149 for (;;) { 150 /* 151 * We iterate up the tree and then index over one element 152 * while we are at the last element in the current node. 153 * 154 * If we are at the root of the filesystem, cursor_up 155 * returns ENOENT. 156 * 157 * XXX this could be optimized by storing the information in 158 * the parent reference. 159 * 160 * XXX we can lose the node lock temporarily, this could mess 161 * up our scan. 162 */ 163 ++hammer_stats_btree_iterations; 164 hammer_flusher_clean_loose_ios(hmp); 165 166 if (cursor->index == node->count) { 167 if (hammer_debug_btree) { 168 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n", 169 (long long)cursor->node->node_offset, 170 cursor->index, 171 (long long)(cursor->parent ? cursor->parent->node_offset : -1), 172 cursor->parent_index, 173 curthread); 174 } 175 KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 176 error = hammer_cursor_up(cursor); 177 if (error) 178 break; 179 /* reload stale pointer */ 180 node = cursor->node->ondisk; 181 KKASSERT(cursor->index != node->count); 182 183 /* 184 * If we are reblocking we want to return internal 185 * nodes. Note that the internal node will be 186 * returned multiple times, on each upward recursion 187 * from its children. The caller selects which 188 * revisit it cares about (usually first or last only). 189 */ 190 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 191 cursor->flags |= HAMMER_CURSOR_ATEDISK; 192 return(0); 193 } 194 ++cursor->index; 195 continue; 196 } 197 198 /* 199 * Check internal or leaf element. Determine if the record 200 * at the cursor has gone beyond the end of our range. 201 * 202 * We recurse down through internal nodes. 203 */ 204 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 205 elm = &node->elms[cursor->index]; 206 207 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 208 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 209 if (hammer_debug_btree) { 210 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n", 211 (long long)cursor->node->node_offset, 212 cursor->index, 213 (long long)elm[0].internal.base.obj_id, 214 elm[0].internal.base.rec_type, 215 (long long)elm[0].internal.base.key, 216 elm[0].internal.base.localization, 217 r, 218 curthread 219 ); 220 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 221 (long long)cursor->node->node_offset, 222 cursor->index + 1, 223 (long long)elm[1].internal.base.obj_id, 224 elm[1].internal.base.rec_type, 225 (long long)elm[1].internal.base.key, 226 elm[1].internal.base.localization, 227 s 228 ); 229 } 230 231 if (r < 0) { 232 error = ENOENT; 233 break; 234 } 235 if (r == 0 && (cursor->flags & 236 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 237 error = ENOENT; 238 break; 239 } 240 KKASSERT(s <= 0); 241 242 /* 243 * Better not be zero 244 */ 245 KKASSERT(elm->internal.subtree_offset != 0); 246 247 /* 248 * If running the mirror filter see if we can skip 249 * one or more entire sub-trees. If we can we 250 * return the internal mode and the caller processes 251 * the skipped range (see mirror_read) 252 */ 253 if (cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) { 254 if (elm->internal.mirror_tid < 255 cursor->cmirror->mirror_tid) { 256 hammer_cursor_mirror_filter(cursor); 257 return(0); 258 } 259 } 260 261 error = hammer_cursor_down(cursor); 262 if (error) 263 break; 264 KKASSERT(cursor->index == 0); 265 /* reload stale pointer */ 266 node = cursor->node->ondisk; 267 continue; 268 } else { 269 elm = &node->elms[cursor->index]; 270 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 271 if (hammer_debug_btree) { 272 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 273 (long long)cursor->node->node_offset, 274 cursor->index, 275 (elm[0].leaf.base.btype ? 276 elm[0].leaf.base.btype : '?'), 277 (long long)elm[0].leaf.base.obj_id, 278 elm[0].leaf.base.rec_type, 279 (long long)elm[0].leaf.base.key, 280 elm[0].leaf.base.localization, 281 r 282 ); 283 } 284 if (r < 0) { 285 error = ENOENT; 286 break; 287 } 288 289 /* 290 * We support both end-inclusive and 291 * end-exclusive searches. 292 */ 293 if (r == 0 && 294 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 295 error = ENOENT; 296 break; 297 } 298 299 switch(elm->leaf.base.btype) { 300 case HAMMER_BTREE_TYPE_RECORD: 301 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 302 hammer_btree_chkts(cursor->asof, &elm->base)) { 303 ++cursor->index; 304 continue; 305 } 306 error = 0; 307 break; 308 default: 309 error = EINVAL; 310 break; 311 } 312 if (error) 313 break; 314 } 315 /* 316 * node pointer invalid after loop 317 */ 318 319 /* 320 * Return entry 321 */ 322 if (hammer_debug_btree) { 323 int i = cursor->index; 324 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 325 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 326 cursor->node, i, 327 (long long)elm->internal.base.obj_id, 328 elm->internal.base.rec_type, 329 (long long)elm->internal.base.key, 330 elm->internal.base.localization 331 ); 332 } 333 return(0); 334 } 335 return(error); 336 } 337 338 /* 339 * We hit an internal element that we could skip as part of a mirroring 340 * scan. Calculate the entire range being skipped. 341 * 342 * It is important to include any gaps between the parent's left_bound 343 * and the node's left_bound, and same goes for the right side. 344 */ 345 static void 346 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 347 { 348 struct hammer_cmirror *cmirror; 349 hammer_node_ondisk_t ondisk; 350 hammer_btree_elm_t elm; 351 352 ondisk = cursor->node->ondisk; 353 cmirror = cursor->cmirror; 354 355 /* 356 * Calculate the skipped range 357 */ 358 elm = &ondisk->elms[cursor->index]; 359 if (cursor->index == 0) 360 cmirror->skip_beg = *cursor->left_bound; 361 else 362 cmirror->skip_beg = elm->internal.base; 363 while (cursor->index < ondisk->count) { 364 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 365 break; 366 ++cursor->index; 367 ++elm; 368 } 369 if (cursor->index == ondisk->count) 370 cmirror->skip_end = *cursor->right_bound; 371 else 372 cmirror->skip_end = elm->internal.base; 373 374 /* 375 * clip the returned result. 376 */ 377 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 378 cmirror->skip_beg = cursor->key_beg; 379 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 380 cmirror->skip_end = cursor->key_end; 381 } 382 383 /* 384 * Iterate in the reverse direction. This is used by the pruning code to 385 * avoid overlapping records. 386 */ 387 int 388 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 389 { 390 hammer_node_ondisk_t node; 391 hammer_btree_elm_t elm; 392 int error = 0; 393 int r; 394 int s; 395 396 /* mirror filtering not supported for reverse iteration */ 397 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 398 399 /* 400 * Skip past the current record. For various reasons the cursor 401 * may end up set to -1 or set to point at the end of the current 402 * node. These cases must be addressed. 403 */ 404 node = cursor->node->ondisk; 405 if (node == NULL) 406 return(ENOENT); 407 if (cursor->index != -1 && 408 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 409 --cursor->index; 410 } 411 if (cursor->index == cursor->node->ondisk->count) 412 --cursor->index; 413 414 /* 415 * Loop until an element is found or we are done. 416 */ 417 for (;;) { 418 ++hammer_stats_btree_iterations; 419 hammer_flusher_clean_loose_ios(cursor->trans->hmp); 420 421 /* 422 * We iterate up the tree and then index over one element 423 * while we are at the last element in the current node. 424 */ 425 if (cursor->index == -1) { 426 error = hammer_cursor_up(cursor); 427 if (error) { 428 cursor->index = 0; /* sanity */ 429 break; 430 } 431 /* reload stale pointer */ 432 node = cursor->node->ondisk; 433 KKASSERT(cursor->index != node->count); 434 --cursor->index; 435 continue; 436 } 437 438 /* 439 * Check internal or leaf element. Determine if the record 440 * at the cursor has gone beyond the end of our range. 441 * 442 * We recurse down through internal nodes. 443 */ 444 KKASSERT(cursor->index != node->count); 445 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 446 elm = &node->elms[cursor->index]; 447 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 448 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 449 if (hammer_debug_btree) { 450 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 451 (long long)cursor->node->node_offset, 452 cursor->index, 453 (long long)elm[0].internal.base.obj_id, 454 elm[0].internal.base.rec_type, 455 (long long)elm[0].internal.base.key, 456 elm[0].internal.base.localization, 457 r 458 ); 459 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 460 (long long)cursor->node->node_offset, 461 cursor->index + 1, 462 (long long)elm[1].internal.base.obj_id, 463 elm[1].internal.base.rec_type, 464 (long long)elm[1].internal.base.key, 465 elm[1].internal.base.localization, 466 s 467 ); 468 } 469 470 if (s >= 0) { 471 error = ENOENT; 472 break; 473 } 474 KKASSERT(r >= 0); 475 476 /* 477 * Better not be zero 478 */ 479 KKASSERT(elm->internal.subtree_offset != 0); 480 481 error = hammer_cursor_down(cursor); 482 if (error) 483 break; 484 KKASSERT(cursor->index == 0); 485 /* reload stale pointer */ 486 node = cursor->node->ondisk; 487 488 /* this can assign -1 if the leaf was empty */ 489 cursor->index = node->count - 1; 490 continue; 491 } else { 492 elm = &node->elms[cursor->index]; 493 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 494 if (hammer_debug_btree) { 495 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 496 (long long)cursor->node->node_offset, 497 cursor->index, 498 (elm[0].leaf.base.btype ? 499 elm[0].leaf.base.btype : '?'), 500 (long long)elm[0].leaf.base.obj_id, 501 elm[0].leaf.base.rec_type, 502 (long long)elm[0].leaf.base.key, 503 elm[0].leaf.base.localization, 504 s 505 ); 506 } 507 if (s > 0) { 508 error = ENOENT; 509 break; 510 } 511 512 switch(elm->leaf.base.btype) { 513 case HAMMER_BTREE_TYPE_RECORD: 514 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 515 hammer_btree_chkts(cursor->asof, &elm->base)) { 516 --cursor->index; 517 continue; 518 } 519 error = 0; 520 break; 521 default: 522 error = EINVAL; 523 break; 524 } 525 if (error) 526 break; 527 } 528 /* 529 * node pointer invalid after loop 530 */ 531 532 /* 533 * Return entry 534 */ 535 if (hammer_debug_btree) { 536 int i = cursor->index; 537 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 538 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 539 cursor->node, i, 540 (long long)elm->internal.base.obj_id, 541 elm->internal.base.rec_type, 542 (long long)elm->internal.base.key, 543 elm->internal.base.localization 544 ); 545 } 546 return(0); 547 } 548 return(error); 549 } 550 551 /* 552 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 553 * could not be found, EDEADLK if inserting and a retry is needed, and a 554 * fatal error otherwise. When retrying, the caller must terminate the 555 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 556 * 557 * The cursor is suitably positioned for a deletion on success, and suitably 558 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 559 * specified. 560 * 561 * The cursor may begin anywhere, the search will traverse the tree in 562 * either direction to locate the requested element. 563 * 564 * Most of the logic implementing historical searches is handled here. We 565 * do an initial lookup with create_tid set to the asof TID. Due to the 566 * way records are laid out, a backwards iteration may be required if 567 * ENOENT is returned to locate the historical record. Here's the 568 * problem: 569 * 570 * create_tid: 10 15 20 571 * LEAF1 LEAF2 572 * records: (11) (18) 573 * 574 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 575 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 576 * not visible and thus causes ENOENT to be returned. We really need 577 * to check record 11 in LEAF1. If it also fails then the search fails 578 * (e.g. it might represent the range 11-16 and thus still not match our 579 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 580 * further iterations. 581 * 582 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 583 * and the cursor->create_check TID if an iteration might be needed. 584 * In the above example create_check would be set to 14. 585 */ 586 int 587 hammer_btree_lookup(hammer_cursor_t cursor) 588 { 589 int error; 590 591 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 592 cursor->trans->sync_lock_refs > 0); 593 ++hammer_stats_btree_lookups; 594 if (cursor->flags & HAMMER_CURSOR_ASOF) { 595 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 596 cursor->key_beg.create_tid = cursor->asof; 597 for (;;) { 598 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 599 error = btree_search(cursor, 0); 600 if (error != ENOENT || 601 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 602 /* 603 * Stop if no error. 604 * Stop if error other then ENOENT. 605 * Stop if ENOENT and not special case. 606 */ 607 break; 608 } 609 if (hammer_debug_btree) { 610 kprintf("CREATE_CHECK %016llx\n", 611 (long long)cursor->create_check); 612 } 613 cursor->key_beg.create_tid = cursor->create_check; 614 /* loop */ 615 } 616 } else { 617 error = btree_search(cursor, 0); 618 } 619 if (error == 0) 620 error = hammer_btree_extract(cursor, cursor->flags); 621 return(error); 622 } 623 624 /* 625 * Execute the logic required to start an iteration. The first record 626 * located within the specified range is returned and iteration control 627 * flags are adjusted for successive hammer_btree_iterate() calls. 628 * 629 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 630 * in a loop without worrying about it. Higher-level merged searches will 631 * adjust the flag appropriately. 632 */ 633 int 634 hammer_btree_first(hammer_cursor_t cursor) 635 { 636 int error; 637 638 error = hammer_btree_lookup(cursor); 639 if (error == ENOENT) { 640 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 641 error = hammer_btree_iterate(cursor); 642 } 643 cursor->flags |= HAMMER_CURSOR_ATEDISK; 644 return(error); 645 } 646 647 /* 648 * Similarly but for an iteration in the reverse direction. 649 * 650 * Set ATEDISK when iterating backwards to skip the current entry, 651 * which after an ENOENT lookup will be pointing beyond our end point. 652 * 653 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 654 * in a loop without worrying about it. Higher-level merged searches will 655 * adjust the flag appropriately. 656 */ 657 int 658 hammer_btree_last(hammer_cursor_t cursor) 659 { 660 struct hammer_base_elm save; 661 int error; 662 663 save = cursor->key_beg; 664 cursor->key_beg = cursor->key_end; 665 error = hammer_btree_lookup(cursor); 666 cursor->key_beg = save; 667 if (error == ENOENT || 668 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 669 cursor->flags |= HAMMER_CURSOR_ATEDISK; 670 error = hammer_btree_iterate_reverse(cursor); 671 } 672 cursor->flags |= HAMMER_CURSOR_ATEDISK; 673 return(error); 674 } 675 676 /* 677 * Extract the record and/or data associated with the cursor's current 678 * position. Any prior record or data stored in the cursor is replaced. 679 * The cursor must be positioned at a leaf node. 680 * 681 * NOTE: All extractions occur at the leaf of the B-Tree. 682 */ 683 int 684 hammer_btree_extract(hammer_cursor_t cursor, int flags) 685 { 686 hammer_node_ondisk_t node; 687 hammer_btree_elm_t elm; 688 hammer_off_t data_off; 689 hammer_mount_t hmp; 690 int32_t data_len; 691 int error; 692 693 /* 694 * The case where the data reference resolves to the same buffer 695 * as the record reference must be handled. 696 */ 697 node = cursor->node->ondisk; 698 elm = &node->elms[cursor->index]; 699 cursor->data = NULL; 700 hmp = cursor->node->hmp; 701 702 /* 703 * There is nothing to extract for an internal element. 704 */ 705 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 706 return(EINVAL); 707 708 /* 709 * Only record types have data. 710 */ 711 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 712 cursor->leaf = &elm->leaf; 713 714 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 715 return(0); 716 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 717 return(0); 718 data_off = elm->leaf.data_offset; 719 data_len = elm->leaf.data_len; 720 if (data_off == 0) 721 return(0); 722 723 /* 724 * Load the data 725 */ 726 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 727 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 728 &error, &cursor->data_buffer); 729 if (hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) { 730 kprintf("CRC DATA @ %016llx/%d FAILED\n", 731 (long long)elm->leaf.data_offset, elm->leaf.data_len); 732 if (hammer_debug_debug & 0x0001) 733 Debugger("CRC FAILED: DATA"); 734 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 735 error = EDOM; /* less critical (mirroring) */ 736 else 737 error = EIO; /* critical */ 738 } 739 return(error); 740 } 741 742 743 /* 744 * Insert a leaf element into the B-Tree at the current cursor position. 745 * The cursor is positioned such that the element at and beyond the cursor 746 * are shifted to make room for the new record. 747 * 748 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 749 * flag set and that call must return ENOENT before this function can be 750 * called. 751 * 752 * The caller may depend on the cursor's exclusive lock after return to 753 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 754 * 755 * ENOSPC is returned if there is no room to insert a new record. 756 */ 757 int 758 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 759 int *doprop) 760 { 761 hammer_node_ondisk_t node; 762 int i; 763 int error; 764 765 *doprop = 0; 766 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 767 return(error); 768 ++hammer_stats_btree_inserts; 769 770 /* 771 * Insert the element at the leaf node and update the count in the 772 * parent. It is possible for parent to be NULL, indicating that 773 * the filesystem's ROOT B-Tree node is a leaf itself, which is 774 * possible. The root inode can never be deleted so the leaf should 775 * never be empty. 776 * 777 * Remember that the right-hand boundary is not included in the 778 * count. 779 */ 780 hammer_modify_node_all(cursor->trans, cursor->node); 781 node = cursor->node->ondisk; 782 i = cursor->index; 783 KKASSERT(elm->base.btype != 0); 784 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 785 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 786 if (i != node->count) { 787 bcopy(&node->elms[i], &node->elms[i+1], 788 (node->count - i) * sizeof(*elm)); 789 } 790 node->elms[i].leaf = *elm; 791 ++node->count; 792 hammer_cursor_inserted_element(cursor->node, i); 793 794 /* 795 * Update the leaf node's aggregate mirror_tid for mirroring 796 * support. 797 */ 798 if (node->mirror_tid < elm->base.delete_tid) { 799 node->mirror_tid = elm->base.delete_tid; 800 *doprop = 1; 801 } 802 if (node->mirror_tid < elm->base.create_tid) { 803 node->mirror_tid = elm->base.create_tid; 804 *doprop = 1; 805 } 806 hammer_modify_node_done(cursor->node); 807 808 /* 809 * Debugging sanity checks. 810 */ 811 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 812 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 813 if (i) { 814 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 815 } 816 if (i != node->count - 1) 817 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 818 819 return(0); 820 } 821 822 /* 823 * Delete a record from the B-Tree at the current cursor position. 824 * The cursor is positioned such that the current element is the one 825 * to be deleted. 826 * 827 * On return the cursor will be positioned after the deleted element and 828 * MAY point to an internal node. It will be suitable for the continuation 829 * of an iteration but not for an insertion or deletion. 830 * 831 * Deletions will attempt to partially rebalance the B-Tree in an upward 832 * direction, but will terminate rather then deadlock. Empty internal nodes 833 * are never allowed by a deletion which deadlocks may end up giving us an 834 * empty leaf. The pruner will clean up and rebalance the tree. 835 * 836 * This function can return EDEADLK, requiring the caller to retry the 837 * operation after clearing the deadlock. 838 */ 839 int 840 hammer_btree_delete(hammer_cursor_t cursor) 841 { 842 hammer_node_ondisk_t ondisk; 843 hammer_node_t node; 844 hammer_node_t parent; 845 int error; 846 int i; 847 848 KKASSERT (cursor->trans->sync_lock_refs > 0); 849 if ((error = hammer_cursor_upgrade(cursor)) != 0) 850 return(error); 851 ++hammer_stats_btree_deletes; 852 853 /* 854 * Delete the element from the leaf node. 855 * 856 * Remember that leaf nodes do not have boundaries. 857 */ 858 node = cursor->node; 859 ondisk = node->ondisk; 860 i = cursor->index; 861 862 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 863 KKASSERT(i >= 0 && i < ondisk->count); 864 hammer_modify_node_all(cursor->trans, node); 865 if (i + 1 != ondisk->count) { 866 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 867 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 868 } 869 --ondisk->count; 870 hammer_modify_node_done(node); 871 hammer_cursor_deleted_element(node, i); 872 873 /* 874 * Validate local parent 875 */ 876 if (ondisk->parent) { 877 parent = cursor->parent; 878 879 KKASSERT(parent != NULL); 880 KKASSERT(parent->node_offset == ondisk->parent); 881 } 882 883 /* 884 * If the leaf becomes empty it must be detached from the parent, 885 * potentially recursing through to the filesystem root. 886 * 887 * This may reposition the cursor at one of the parent's of the 888 * current node. 889 * 890 * Ignore deadlock errors, that simply means that btree_remove 891 * was unable to recurse and had to leave us with an empty leaf. 892 */ 893 KKASSERT(cursor->index <= ondisk->count); 894 if (ondisk->count == 0) { 895 error = btree_remove(cursor); 896 if (error == EDEADLK) 897 error = 0; 898 } else { 899 error = 0; 900 } 901 KKASSERT(cursor->parent == NULL || 902 cursor->parent_index < cursor->parent->ondisk->count); 903 return(error); 904 } 905 906 /* 907 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE 908 * 909 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 910 * 911 * The search can begin ANYWHERE in the B-Tree. As a first step the search 912 * iterates up the tree as necessary to properly position itself prior to 913 * actually doing the sarch. 914 * 915 * INSERTIONS: The search will split full nodes and leaves on its way down 916 * and guarentee that the leaf it ends up on is not full. If we run out 917 * of space the search continues to the leaf (to position the cursor for 918 * the spike), but ENOSPC is returned. 919 * 920 * The search is only guarenteed to end up on a leaf if an error code of 0 921 * is returned, or if inserting and an error code of ENOENT is returned. 922 * Otherwise it can stop at an internal node. On success a search returns 923 * a leaf node. 924 * 925 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 926 * filesystem, and it is not simple code. Please note the following facts: 927 * 928 * - Internal node recursions have a boundary on the left AND right. The 929 * right boundary is non-inclusive. The create_tid is a generic part 930 * of the key for internal nodes. 931 * 932 * - Leaf nodes contain terminal elements only now. 933 * 934 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 935 * historical search. ASOF and INSERT are mutually exclusive. When 936 * doing an as-of lookup btree_search() checks for a right-edge boundary 937 * case. If while recursing down the left-edge differs from the key 938 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 939 * with cursor->create_check. This is used by btree_lookup() to iterate. 940 * The iteration backwards because as-of searches can wind up going 941 * down the wrong branch of the B-Tree. 942 */ 943 static 944 int 945 btree_search(hammer_cursor_t cursor, int flags) 946 { 947 hammer_node_ondisk_t node; 948 hammer_btree_elm_t elm; 949 int error; 950 int enospc = 0; 951 int i; 952 int r; 953 int s; 954 955 flags |= cursor->flags; 956 ++hammer_stats_btree_searches; 957 958 if (hammer_debug_btree) { 959 kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n", 960 (long long)cursor->node->node_offset, 961 cursor->index, 962 (long long)cursor->key_beg.obj_id, 963 cursor->key_beg.rec_type, 964 (long long)cursor->key_beg.key, 965 (long long)cursor->key_beg.create_tid, 966 cursor->key_beg.localization, 967 curthread 968 ); 969 if (cursor->parent) 970 kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n", 971 (long long)cursor->parent->node_offset, 972 cursor->parent_index, 973 (long long)cursor->left_bound->obj_id, 974 (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id, 975 (long long)cursor->right_bound->obj_id, 976 (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id, 977 cursor->left_bound, 978 &cursor->parent->ondisk->elms[cursor->parent_index], 979 cursor->right_bound, 980 &cursor->parent->ondisk->elms[cursor->parent_index+1] 981 ); 982 } 983 984 /* 985 * Move our cursor up the tree until we find a node whos range covers 986 * the key we are trying to locate. 987 * 988 * The left bound is inclusive, the right bound is non-inclusive. 989 * It is ok to cursor up too far. 990 */ 991 for (;;) { 992 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 993 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 994 if (r >= 0 && s < 0) 995 break; 996 KKASSERT(cursor->parent); 997 ++hammer_stats_btree_iterations; 998 error = hammer_cursor_up(cursor); 999 if (error) 1000 goto done; 1001 } 1002 1003 /* 1004 * The delete-checks below are based on node, not parent. Set the 1005 * initial delete-check based on the parent. 1006 */ 1007 if (r == 1) { 1008 KKASSERT(cursor->left_bound->create_tid != 1); 1009 cursor->create_check = cursor->left_bound->create_tid - 1; 1010 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1011 } 1012 1013 /* 1014 * We better have ended up with a node somewhere. 1015 */ 1016 KKASSERT(cursor->node != NULL); 1017 1018 /* 1019 * If we are inserting we can't start at a full node if the parent 1020 * is also full (because there is no way to split the node), 1021 * continue running up the tree until the requirement is satisfied 1022 * or we hit the root of the filesystem. 1023 * 1024 * (If inserting we aren't doing an as-of search so we don't have 1025 * to worry about create_check). 1026 */ 1027 while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1028 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 1029 if (btree_node_is_full(cursor->node->ondisk) == 0) 1030 break; 1031 } else { 1032 if (btree_node_is_full(cursor->node->ondisk) ==0) 1033 break; 1034 } 1035 if (cursor->node->ondisk->parent == 0 || 1036 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1037 break; 1038 } 1039 ++hammer_stats_btree_iterations; 1040 error = hammer_cursor_up(cursor); 1041 /* node may have become stale */ 1042 if (error) 1043 goto done; 1044 } 1045 1046 /* 1047 * Push down through internal nodes to locate the requested key. 1048 */ 1049 node = cursor->node->ondisk; 1050 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1051 /* 1052 * Scan the node to find the subtree index to push down into. 1053 * We go one-past, then back-up. 1054 * 1055 * We must proactively remove deleted elements which may 1056 * have been left over from a deadlocked btree_remove(). 1057 * 1058 * The left and right boundaries are included in the loop 1059 * in order to detect edge cases. 1060 * 1061 * If the separator only differs by create_tid (r == 1) 1062 * and we are doing an as-of search, we may end up going 1063 * down a branch to the left of the one containing the 1064 * desired key. This requires numerous special cases. 1065 */ 1066 ++hammer_stats_btree_iterations; 1067 if (hammer_debug_btree) { 1068 kprintf("SEARCH-I %016llx count=%d\n", 1069 (long long)cursor->node->node_offset, 1070 node->count); 1071 } 1072 1073 /* 1074 * Try to shortcut the search before dropping into the 1075 * linear loop. Locate the first node where r <= 1. 1076 */ 1077 i = hammer_btree_search_node(&cursor->key_beg, node); 1078 while (i <= node->count) { 1079 ++hammer_stats_btree_elements; 1080 elm = &node->elms[i]; 1081 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1082 if (hammer_debug_btree > 2) { 1083 kprintf(" IELM %p %d r=%d\n", 1084 &node->elms[i], i, r); 1085 } 1086 if (r < 0) 1087 break; 1088 if (r == 1) { 1089 KKASSERT(elm->base.create_tid != 1); 1090 cursor->create_check = elm->base.create_tid - 1; 1091 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1092 } 1093 ++i; 1094 } 1095 if (hammer_debug_btree) { 1096 kprintf("SEARCH-I preI=%d/%d r=%d\n", 1097 i, node->count, r); 1098 } 1099 1100 /* 1101 * These cases occur when the parent's idea of the boundary 1102 * is wider then the child's idea of the boundary, and 1103 * require special handling. If not inserting we can 1104 * terminate the search early for these cases but the 1105 * child's boundaries cannot be unconditionally modified. 1106 */ 1107 if (i == 0) { 1108 /* 1109 * If i == 0 the search terminated to the LEFT of the 1110 * left_boundary but to the RIGHT of the parent's left 1111 * boundary. 1112 */ 1113 u_int8_t save; 1114 1115 elm = &node->elms[0]; 1116 1117 /* 1118 * If we aren't inserting we can stop here. 1119 */ 1120 if ((flags & (HAMMER_CURSOR_INSERT | 1121 HAMMER_CURSOR_PRUNING)) == 0) { 1122 cursor->index = 0; 1123 return(ENOENT); 1124 } 1125 1126 /* 1127 * Correct a left-hand boundary mismatch. 1128 * 1129 * We can only do this if we can upgrade the lock, 1130 * and synchronized as a background cursor (i.e. 1131 * inserting or pruning). 1132 * 1133 * WARNING: We can only do this if inserting, i.e. 1134 * we are running on the backend. 1135 */ 1136 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1137 return(error); 1138 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1139 hammer_modify_node_field(cursor->trans, cursor->node, 1140 elms[0]); 1141 save = node->elms[0].base.btype; 1142 node->elms[0].base = *cursor->left_bound; 1143 node->elms[0].base.btype = save; 1144 hammer_modify_node_done(cursor->node); 1145 } else if (i == node->count + 1) { 1146 /* 1147 * If i == node->count + 1 the search terminated to 1148 * the RIGHT of the right boundary but to the LEFT 1149 * of the parent's right boundary. If we aren't 1150 * inserting we can stop here. 1151 * 1152 * Note that the last element in this case is 1153 * elms[i-2] prior to adjustments to 'i'. 1154 */ 1155 --i; 1156 if ((flags & (HAMMER_CURSOR_INSERT | 1157 HAMMER_CURSOR_PRUNING)) == 0) { 1158 cursor->index = i; 1159 return (ENOENT); 1160 } 1161 1162 /* 1163 * Correct a right-hand boundary mismatch. 1164 * (actual push-down record is i-2 prior to 1165 * adjustments to i). 1166 * 1167 * We can only do this if we can upgrade the lock, 1168 * and synchronized as a background cursor (i.e. 1169 * inserting or pruning). 1170 * 1171 * WARNING: We can only do this if inserting, i.e. 1172 * we are running on the backend. 1173 */ 1174 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1175 return(error); 1176 elm = &node->elms[i]; 1177 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1178 hammer_modify_node(cursor->trans, cursor->node, 1179 &elm->base, sizeof(elm->base)); 1180 elm->base = *cursor->right_bound; 1181 hammer_modify_node_done(cursor->node); 1182 --i; 1183 } else { 1184 /* 1185 * The push-down index is now i - 1. If we had 1186 * terminated on the right boundary this will point 1187 * us at the last element. 1188 */ 1189 --i; 1190 } 1191 cursor->index = i; 1192 elm = &node->elms[i]; 1193 1194 if (hammer_debug_btree) { 1195 kprintf("RESULT-I %016llx[%d] %016llx %02x " 1196 "key=%016llx cre=%016llx lo=%02x\n", 1197 (long long)cursor->node->node_offset, 1198 i, 1199 (long long)elm->internal.base.obj_id, 1200 elm->internal.base.rec_type, 1201 (long long)elm->internal.base.key, 1202 (long long)elm->internal.base.create_tid, 1203 elm->internal.base.localization 1204 ); 1205 } 1206 1207 /* 1208 * We better have a valid subtree offset. 1209 */ 1210 KKASSERT(elm->internal.subtree_offset != 0); 1211 1212 /* 1213 * Handle insertion and deletion requirements. 1214 * 1215 * If inserting split full nodes. The split code will 1216 * adjust cursor->node and cursor->index if the current 1217 * index winds up in the new node. 1218 * 1219 * If inserting and a left or right edge case was detected, 1220 * we cannot correct the left or right boundary and must 1221 * prepend and append an empty leaf node in order to make 1222 * the boundary correction. 1223 * 1224 * If we run out of space we set enospc and continue on 1225 * to a leaf to provide the spike code with a good point 1226 * of entry. 1227 */ 1228 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1229 if (btree_node_is_full(node)) { 1230 error = btree_split_internal(cursor); 1231 if (error) { 1232 if (error != ENOSPC) 1233 goto done; 1234 enospc = 1; 1235 } 1236 /* 1237 * reload stale pointers 1238 */ 1239 i = cursor->index; 1240 node = cursor->node->ondisk; 1241 } 1242 } 1243 1244 /* 1245 * Push down (push into new node, existing node becomes 1246 * the parent) and continue the search. 1247 */ 1248 error = hammer_cursor_down(cursor); 1249 /* node may have become stale */ 1250 if (error) 1251 goto done; 1252 node = cursor->node->ondisk; 1253 } 1254 1255 /* 1256 * We are at a leaf, do a linear search of the key array. 1257 * 1258 * On success the index is set to the matching element and 0 1259 * is returned. 1260 * 1261 * On failure the index is set to the insertion point and ENOENT 1262 * is returned. 1263 * 1264 * Boundaries are not stored in leaf nodes, so the index can wind 1265 * up to the left of element 0 (index == 0) or past the end of 1266 * the array (index == node->count). It is also possible that the 1267 * leaf might be empty. 1268 */ 1269 ++hammer_stats_btree_iterations; 1270 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1271 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1272 if (hammer_debug_btree) { 1273 kprintf("SEARCH-L %016llx count=%d\n", 1274 (long long)cursor->node->node_offset, 1275 node->count); 1276 } 1277 1278 /* 1279 * Try to shortcut the search before dropping into the 1280 * linear loop. Locate the first node where r <= 1. 1281 */ 1282 i = hammer_btree_search_node(&cursor->key_beg, node); 1283 while (i < node->count) { 1284 ++hammer_stats_btree_elements; 1285 elm = &node->elms[i]; 1286 1287 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1288 1289 if (hammer_debug_btree > 1) 1290 kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r); 1291 1292 /* 1293 * We are at a record element. Stop if we've flipped past 1294 * key_beg, not counting the create_tid test. Allow the 1295 * r == 1 case (key_beg > element but differs only by its 1296 * create_tid) to fall through to the AS-OF check. 1297 */ 1298 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1299 1300 if (r < 0) 1301 goto failed; 1302 if (r > 1) { 1303 ++i; 1304 continue; 1305 } 1306 1307 /* 1308 * Check our as-of timestamp against the element. 1309 */ 1310 if (flags & HAMMER_CURSOR_ASOF) { 1311 if (hammer_btree_chkts(cursor->asof, 1312 &node->elms[i].base) != 0) { 1313 ++i; 1314 continue; 1315 } 1316 /* success */ 1317 } else { 1318 if (r > 0) { /* can only be +1 */ 1319 ++i; 1320 continue; 1321 } 1322 /* success */ 1323 } 1324 cursor->index = i; 1325 error = 0; 1326 if (hammer_debug_btree) { 1327 kprintf("RESULT-L %016llx[%d] (SUCCESS)\n", 1328 (long long)cursor->node->node_offset, i); 1329 } 1330 goto done; 1331 } 1332 1333 /* 1334 * The search of the leaf node failed. i is the insertion point. 1335 */ 1336 failed: 1337 if (hammer_debug_btree) { 1338 kprintf("RESULT-L %016llx[%d] (FAILED)\n", 1339 (long long)cursor->node->node_offset, i); 1340 } 1341 1342 /* 1343 * No exact match was found, i is now at the insertion point. 1344 * 1345 * If inserting split a full leaf before returning. This 1346 * may have the side effect of adjusting cursor->node and 1347 * cursor->index. 1348 */ 1349 cursor->index = i; 1350 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1351 btree_node_is_full(node)) { 1352 error = btree_split_leaf(cursor); 1353 if (error) { 1354 if (error != ENOSPC) 1355 goto done; 1356 enospc = 1; 1357 } 1358 /* 1359 * reload stale pointers 1360 */ 1361 /* NOT USED 1362 i = cursor->index; 1363 node = &cursor->node->internal; 1364 */ 1365 } 1366 1367 /* 1368 * We reached a leaf but did not find the key we were looking for. 1369 * If this is an insert we will be properly positioned for an insert 1370 * (ENOENT) or spike (ENOSPC) operation. 1371 */ 1372 error = enospc ? ENOSPC : ENOENT; 1373 done: 1374 return(error); 1375 } 1376 1377 /* 1378 * Heuristical search for the first element whos comparison is <= 1. May 1379 * return an index whos compare result is > 1 but may only return an index 1380 * whos compare result is <= 1 if it is the first element with that result. 1381 */ 1382 int 1383 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1384 { 1385 int b; 1386 int s; 1387 int i; 1388 int r; 1389 1390 /* 1391 * Don't bother if the node does not have very many elements 1392 */ 1393 b = 0; 1394 s = node->count; 1395 while (s - b > 4) { 1396 i = b + (s - b) / 2; 1397 ++hammer_stats_btree_elements; 1398 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1399 if (r <= 1) { 1400 s = i; 1401 } else { 1402 b = i; 1403 } 1404 } 1405 return(b); 1406 } 1407 1408 1409 /************************************************************************ 1410 * SPLITTING AND MERGING * 1411 ************************************************************************ 1412 * 1413 * These routines do all the dirty work required to split and merge nodes. 1414 */ 1415 1416 /* 1417 * Split an internal node into two nodes and move the separator at the split 1418 * point to the parent. 1419 * 1420 * (cursor->node, cursor->index) indicates the element the caller intends 1421 * to push into. We will adjust node and index if that element winds 1422 * up in the split node. 1423 * 1424 * If we are at the root of the filesystem a new root must be created with 1425 * two elements, one pointing to the original root and one pointing to the 1426 * newly allocated split node. 1427 */ 1428 static 1429 int 1430 btree_split_internal(hammer_cursor_t cursor) 1431 { 1432 hammer_node_ondisk_t ondisk; 1433 hammer_node_t node; 1434 hammer_node_t parent; 1435 hammer_node_t new_node; 1436 hammer_btree_elm_t elm; 1437 hammer_btree_elm_t parent_elm; 1438 struct hammer_node_lock lockroot; 1439 hammer_mount_t hmp = cursor->trans->hmp; 1440 hammer_off_t hint; 1441 int parent_index; 1442 int made_root; 1443 int split; 1444 int error; 1445 int i; 1446 const int esize = sizeof(*elm); 1447 1448 hammer_node_lock_init(&lockroot, cursor->node); 1449 error = hammer_btree_lock_children(cursor, 1, &lockroot); 1450 if (error) 1451 goto done; 1452 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1453 goto done; 1454 ++hammer_stats_btree_splits; 1455 1456 /* 1457 * We are splitting but elms[split] will be promoted to the parent, 1458 * leaving the right hand node with one less element. If the 1459 * insertion point will be on the left-hand side adjust the split 1460 * point to give the right hand side one additional node. 1461 */ 1462 node = cursor->node; 1463 ondisk = node->ondisk; 1464 split = (ondisk->count + 1) / 2; 1465 if (cursor->index <= split) 1466 --split; 1467 1468 /* 1469 * If we are at the root of the filesystem, create a new root node 1470 * with 1 element and split normally. Avoid making major 1471 * modifications until we know the whole operation will work. 1472 */ 1473 if (ondisk->parent == 0) { 1474 parent = hammer_alloc_btree(cursor->trans, node->node_offset, 1475 &error); 1476 if (parent == NULL) 1477 goto done; 1478 hammer_lock_ex(&parent->lock); 1479 hammer_modify_node_noundo(cursor->trans, parent); 1480 ondisk = parent->ondisk; 1481 ondisk->count = 1; 1482 ondisk->parent = 0; 1483 ondisk->mirror_tid = node->ondisk->mirror_tid; 1484 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1485 ondisk->elms[0].base = hmp->root_btree_beg; 1486 ondisk->elms[0].base.btype = node->ondisk->type; 1487 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1488 ondisk->elms[1].base = hmp->root_btree_end; 1489 hammer_modify_node_done(parent); 1490 /* ondisk->elms[1].base.btype - not used */ 1491 made_root = 1; 1492 parent_index = 0; /* index of current node in parent */ 1493 } else { 1494 made_root = 0; 1495 parent = cursor->parent; 1496 parent_index = cursor->parent_index; 1497 } 1498 1499 /* 1500 * Calculate a hint for the allocation of the new B-Tree node. 1501 * The most likely expansion is coming from the insertion point 1502 * at cursor->index, so try to localize the allocation of our 1503 * new node to accomodate that sub-tree. 1504 * 1505 * Use the right-most sub-tree when expandinging on the right edge. 1506 * This is a very common case when copying a directory tree. 1507 */ 1508 if (cursor->index == ondisk->count) 1509 hint = ondisk->elms[cursor->index - 1].internal.subtree_offset; 1510 else 1511 hint = ondisk->elms[cursor->index].internal.subtree_offset; 1512 1513 /* 1514 * Split node into new_node at the split point. 1515 * 1516 * B O O O P N N B <-- P = node->elms[split] (index 4) 1517 * 0 1 2 3 4 5 6 <-- subtree indices 1518 * 1519 * x x P x x 1520 * s S S s 1521 * / \ 1522 * B O O O B B N N B <--- inner boundary points are 'P' 1523 * 0 1 2 3 4 5 6 1524 */ 1525 new_node = hammer_alloc_btree(cursor->trans, hint, &error); 1526 if (new_node == NULL) { 1527 if (made_root) { 1528 hammer_unlock(&parent->lock); 1529 hammer_delete_node(cursor->trans, parent); 1530 hammer_rel_node(parent); 1531 } 1532 goto done; 1533 } 1534 hammer_lock_ex(&new_node->lock); 1535 1536 /* 1537 * Create the new node. P becomes the left-hand boundary in the 1538 * new node. Copy the right-hand boundary as well. 1539 * 1540 * elm is the new separator. 1541 */ 1542 hammer_modify_node_noundo(cursor->trans, new_node); 1543 hammer_modify_node_all(cursor->trans, node); 1544 ondisk = node->ondisk; 1545 elm = &ondisk->elms[split]; 1546 bcopy(elm, &new_node->ondisk->elms[0], 1547 (ondisk->count - split + 1) * esize); 1548 new_node->ondisk->count = ondisk->count - split; 1549 new_node->ondisk->parent = parent->node_offset; 1550 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1551 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1552 KKASSERT(ondisk->type == new_node->ondisk->type); 1553 hammer_cursor_split_node(node, new_node, split); 1554 1555 /* 1556 * Cleanup the original node. Elm (P) becomes the new boundary, 1557 * its subtree_offset was moved to the new node. If we had created 1558 * a new root its parent pointer may have changed. 1559 */ 1560 elm->internal.subtree_offset = 0; 1561 ondisk->count = split; 1562 1563 /* 1564 * Insert the separator into the parent, fixup the parent's 1565 * reference to the original node, and reference the new node. 1566 * The separator is P. 1567 * 1568 * Remember that base.count does not include the right-hand boundary. 1569 */ 1570 hammer_modify_node_all(cursor->trans, parent); 1571 ondisk = parent->ondisk; 1572 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1573 parent_elm = &ondisk->elms[parent_index+1]; 1574 bcopy(parent_elm, parent_elm + 1, 1575 (ondisk->count - parent_index) * esize); 1576 parent_elm->internal.base = elm->base; /* separator P */ 1577 parent_elm->internal.base.btype = new_node->ondisk->type; 1578 parent_elm->internal.subtree_offset = new_node->node_offset; 1579 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1580 ++ondisk->count; 1581 hammer_modify_node_done(parent); 1582 hammer_cursor_inserted_element(parent, parent_index + 1); 1583 1584 /* 1585 * The children of new_node need their parent pointer set to new_node. 1586 * The children have already been locked by 1587 * hammer_btree_lock_children(). 1588 */ 1589 for (i = 0; i < new_node->ondisk->count; ++i) { 1590 elm = &new_node->ondisk->elms[i]; 1591 error = btree_set_parent(cursor->trans, new_node, elm); 1592 if (error) { 1593 panic("btree_split_internal: btree-fixup problem"); 1594 } 1595 } 1596 hammer_modify_node_done(new_node); 1597 1598 /* 1599 * The filesystem's root B-Tree pointer may have to be updated. 1600 */ 1601 if (made_root) { 1602 hammer_volume_t volume; 1603 1604 volume = hammer_get_root_volume(hmp, &error); 1605 KKASSERT(error == 0); 1606 1607 hammer_modify_volume_field(cursor->trans, volume, 1608 vol0_btree_root); 1609 volume->ondisk->vol0_btree_root = parent->node_offset; 1610 hammer_modify_volume_done(volume); 1611 node->ondisk->parent = parent->node_offset; 1612 if (cursor->parent) { 1613 hammer_unlock(&cursor->parent->lock); 1614 hammer_rel_node(cursor->parent); 1615 } 1616 cursor->parent = parent; /* lock'd and ref'd */ 1617 hammer_rel_volume(volume, 0); 1618 } 1619 hammer_modify_node_done(node); 1620 1621 /* 1622 * Ok, now adjust the cursor depending on which element the original 1623 * index was pointing at. If we are >= the split point the push node 1624 * is now in the new node. 1625 * 1626 * NOTE: If we are at the split point itself we cannot stay with the 1627 * original node because the push index will point at the right-hand 1628 * boundary, which is illegal. 1629 * 1630 * NOTE: The cursor's parent or parent_index must be adjusted for 1631 * the case where a new parent (new root) was created, and the case 1632 * where the cursor is now pointing at the split node. 1633 */ 1634 if (cursor->index >= split) { 1635 cursor->parent_index = parent_index + 1; 1636 cursor->index -= split; 1637 hammer_unlock(&cursor->node->lock); 1638 hammer_rel_node(cursor->node); 1639 cursor->node = new_node; /* locked and ref'd */ 1640 } else { 1641 cursor->parent_index = parent_index; 1642 hammer_unlock(&new_node->lock); 1643 hammer_rel_node(new_node); 1644 } 1645 1646 /* 1647 * Fixup left and right bounds 1648 */ 1649 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1650 cursor->left_bound = &parent_elm[0].internal.base; 1651 cursor->right_bound = &parent_elm[1].internal.base; 1652 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1653 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1654 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1655 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1656 1657 done: 1658 hammer_btree_unlock_children(cursor, &lockroot); 1659 hammer_cursor_downgrade(cursor); 1660 return (error); 1661 } 1662 1663 /* 1664 * Same as the above, but splits a full leaf node. 1665 * 1666 * This function 1667 */ 1668 static 1669 int 1670 btree_split_leaf(hammer_cursor_t cursor) 1671 { 1672 hammer_node_ondisk_t ondisk; 1673 hammer_node_t parent; 1674 hammer_node_t leaf; 1675 hammer_mount_t hmp; 1676 hammer_node_t new_leaf; 1677 hammer_btree_elm_t elm; 1678 hammer_btree_elm_t parent_elm; 1679 hammer_base_elm_t mid_boundary; 1680 hammer_off_t hint; 1681 int parent_index; 1682 int made_root; 1683 int split; 1684 int error; 1685 const size_t esize = sizeof(*elm); 1686 1687 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1688 return(error); 1689 ++hammer_stats_btree_splits; 1690 1691 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1692 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1693 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1694 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1695 1696 /* 1697 * Calculate the split point. If the insertion point will be on 1698 * the left-hand side adjust the split point to give the right 1699 * hand side one additional node. 1700 * 1701 * Spikes are made up of two leaf elements which cannot be 1702 * safely split. 1703 */ 1704 leaf = cursor->node; 1705 ondisk = leaf->ondisk; 1706 KKASSERT(ondisk->count > 2); 1707 split = (ondisk->count + 1) / 2; 1708 if (cursor->index <= split) 1709 --split; 1710 error = 0; 1711 hmp = leaf->hmp; 1712 1713 elm = &ondisk->elms[split]; 1714 1715 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1716 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1717 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1718 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1719 1720 /* 1721 * If we are at the root of the tree, create a new root node with 1722 * 1 element and split normally. Avoid making major modifications 1723 * until we know the whole operation will work. 1724 */ 1725 if (ondisk->parent == 0) { 1726 parent = hammer_alloc_btree(cursor->trans, leaf->node_offset, 1727 &error); 1728 if (parent == NULL) 1729 goto done; 1730 hammer_lock_ex(&parent->lock); 1731 hammer_modify_node_noundo(cursor->trans, parent); 1732 ondisk = parent->ondisk; 1733 ondisk->count = 1; 1734 ondisk->parent = 0; 1735 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1736 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1737 ondisk->elms[0].base = hmp->root_btree_beg; 1738 ondisk->elms[0].base.btype = leaf->ondisk->type; 1739 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1740 ondisk->elms[1].base = hmp->root_btree_end; 1741 /* ondisk->elms[1].base.btype = not used */ 1742 hammer_modify_node_done(parent); 1743 made_root = 1; 1744 parent_index = 0; /* insertion point in parent */ 1745 } else { 1746 made_root = 0; 1747 parent = cursor->parent; 1748 parent_index = cursor->parent_index; 1749 } 1750 1751 /* 1752 * Calculate a hint for the allocation of the new B-Tree leaf node. 1753 * For now just try to localize it within the same bigblock as 1754 * the current leaf. 1755 * 1756 * If the insertion point is at the end of the leaf we recognize a 1757 * likely append sequence of some sort (data, meta-data, inodes, 1758 * whatever). Set the hint to zero to allocate out of linear space 1759 * instead of trying to completely fill previously hinted space. 1760 * 1761 * This also sets the stage for recursive splits to localize using 1762 * the new space. 1763 */ 1764 ondisk = leaf->ondisk; 1765 if (cursor->index == ondisk->count) 1766 hint = 0; 1767 else 1768 hint = leaf->node_offset; 1769 1770 /* 1771 * Split leaf into new_leaf at the split point. Select a separator 1772 * value in-between the two leafs but with a bent towards the right 1773 * leaf since comparisons use an 'elm >= separator' inequality. 1774 * 1775 * L L L L L L L L 1776 * 1777 * x x P x x 1778 * s S S s 1779 * / \ 1780 * L L L L L L L L 1781 */ 1782 new_leaf = hammer_alloc_btree(cursor->trans, hint, &error); 1783 if (new_leaf == NULL) { 1784 if (made_root) { 1785 hammer_unlock(&parent->lock); 1786 hammer_delete_node(cursor->trans, parent); 1787 hammer_rel_node(parent); 1788 } 1789 goto done; 1790 } 1791 hammer_lock_ex(&new_leaf->lock); 1792 1793 /* 1794 * Create the new node and copy the leaf elements from the split 1795 * point on to the new node. 1796 */ 1797 hammer_modify_node_all(cursor->trans, leaf); 1798 hammer_modify_node_noundo(cursor->trans, new_leaf); 1799 ondisk = leaf->ondisk; 1800 elm = &ondisk->elms[split]; 1801 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1802 new_leaf->ondisk->count = ondisk->count - split; 1803 new_leaf->ondisk->parent = parent->node_offset; 1804 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1805 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1806 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1807 hammer_modify_node_done(new_leaf); 1808 hammer_cursor_split_node(leaf, new_leaf, split); 1809 1810 /* 1811 * Cleanup the original node. Because this is a leaf node and 1812 * leaf nodes do not have a right-hand boundary, there 1813 * aren't any special edge cases to clean up. We just fixup the 1814 * count. 1815 */ 1816 ondisk->count = split; 1817 1818 /* 1819 * Insert the separator into the parent, fixup the parent's 1820 * reference to the original node, and reference the new node. 1821 * The separator is P. 1822 * 1823 * Remember that base.count does not include the right-hand boundary. 1824 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1825 */ 1826 hammer_modify_node_all(cursor->trans, parent); 1827 ondisk = parent->ondisk; 1828 KKASSERT(split != 0); 1829 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1830 parent_elm = &ondisk->elms[parent_index+1]; 1831 bcopy(parent_elm, parent_elm + 1, 1832 (ondisk->count - parent_index) * esize); 1833 1834 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1835 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1836 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1837 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1838 mid_boundary = &parent_elm->base; 1839 ++ondisk->count; 1840 hammer_modify_node_done(parent); 1841 hammer_cursor_inserted_element(parent, parent_index + 1); 1842 1843 /* 1844 * The filesystem's root B-Tree pointer may have to be updated. 1845 */ 1846 if (made_root) { 1847 hammer_volume_t volume; 1848 1849 volume = hammer_get_root_volume(hmp, &error); 1850 KKASSERT(error == 0); 1851 1852 hammer_modify_volume_field(cursor->trans, volume, 1853 vol0_btree_root); 1854 volume->ondisk->vol0_btree_root = parent->node_offset; 1855 hammer_modify_volume_done(volume); 1856 leaf->ondisk->parent = parent->node_offset; 1857 if (cursor->parent) { 1858 hammer_unlock(&cursor->parent->lock); 1859 hammer_rel_node(cursor->parent); 1860 } 1861 cursor->parent = parent; /* lock'd and ref'd */ 1862 hammer_rel_volume(volume, 0); 1863 } 1864 hammer_modify_node_done(leaf); 1865 1866 /* 1867 * Ok, now adjust the cursor depending on which element the original 1868 * index was pointing at. If we are >= the split point the push node 1869 * is now in the new node. 1870 * 1871 * NOTE: If we are at the split point itself we need to select the 1872 * old or new node based on where key_beg's insertion point will be. 1873 * If we pick the wrong side the inserted element will wind up in 1874 * the wrong leaf node and outside that node's bounds. 1875 */ 1876 if (cursor->index > split || 1877 (cursor->index == split && 1878 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1879 cursor->parent_index = parent_index + 1; 1880 cursor->index -= split; 1881 hammer_unlock(&cursor->node->lock); 1882 hammer_rel_node(cursor->node); 1883 cursor->node = new_leaf; 1884 } else { 1885 cursor->parent_index = parent_index; 1886 hammer_unlock(&new_leaf->lock); 1887 hammer_rel_node(new_leaf); 1888 } 1889 1890 /* 1891 * Fixup left and right bounds 1892 */ 1893 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1894 cursor->left_bound = &parent_elm[0].internal.base; 1895 cursor->right_bound = &parent_elm[1].internal.base; 1896 1897 /* 1898 * Assert that the bounds are correct. 1899 */ 1900 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1901 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1902 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1903 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1904 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1905 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 1906 1907 done: 1908 hammer_cursor_downgrade(cursor); 1909 return (error); 1910 } 1911 1912 #if 0 1913 1914 /* 1915 * Recursively correct the right-hand boundary's create_tid to (tid) as 1916 * long as the rest of the key matches. We have to recurse upward in 1917 * the tree as well as down the left side of each parent's right node. 1918 * 1919 * Return EDEADLK if we were only partially successful, forcing the caller 1920 * to try again. The original cursor is not modified. This routine can 1921 * also fail with EDEADLK if it is forced to throw away a portion of its 1922 * record history. 1923 * 1924 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 1925 */ 1926 struct hammer_rhb { 1927 TAILQ_ENTRY(hammer_rhb) entry; 1928 hammer_node_t node; 1929 int index; 1930 }; 1931 1932 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 1933 1934 int 1935 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 1936 { 1937 struct hammer_mount *hmp; 1938 struct hammer_rhb_list rhb_list; 1939 hammer_base_elm_t elm; 1940 hammer_node_t orig_node; 1941 struct hammer_rhb *rhb; 1942 int orig_index; 1943 int error; 1944 1945 TAILQ_INIT(&rhb_list); 1946 hmp = cursor->trans->hmp; 1947 1948 /* 1949 * Save our position so we can restore it on return. This also 1950 * gives us a stable 'elm'. 1951 */ 1952 orig_node = cursor->node; 1953 hammer_ref_node(orig_node); 1954 hammer_lock_sh(&orig_node->lock); 1955 orig_index = cursor->index; 1956 elm = &orig_node->ondisk->elms[orig_index].base; 1957 1958 /* 1959 * Now build a list of parents going up, allocating a rhb 1960 * structure for each one. 1961 */ 1962 while (cursor->parent) { 1963 /* 1964 * Stop if we no longer have any right-bounds to fix up 1965 */ 1966 if (elm->obj_id != cursor->right_bound->obj_id || 1967 elm->rec_type != cursor->right_bound->rec_type || 1968 elm->key != cursor->right_bound->key) { 1969 break; 1970 } 1971 1972 /* 1973 * Stop if the right-hand bound's create_tid does not 1974 * need to be corrected. 1975 */ 1976 if (cursor->right_bound->create_tid >= tid) 1977 break; 1978 1979 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 1980 rhb->node = cursor->parent; 1981 rhb->index = cursor->parent_index; 1982 hammer_ref_node(rhb->node); 1983 hammer_lock_sh(&rhb->node->lock); 1984 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 1985 1986 hammer_cursor_up(cursor); 1987 } 1988 1989 /* 1990 * now safely adjust the right hand bound for each rhb. This may 1991 * also require taking the right side of the tree and iterating down 1992 * ITS left side. 1993 */ 1994 error = 0; 1995 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 1996 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 1997 if (error) 1998 break; 1999 TAILQ_REMOVE(&rhb_list, rhb, entry); 2000 hammer_unlock(&rhb->node->lock); 2001 hammer_rel_node(rhb->node); 2002 kfree(rhb, hmp->m_misc); 2003 2004 switch (cursor->node->ondisk->type) { 2005 case HAMMER_BTREE_TYPE_INTERNAL: 2006 /* 2007 * Right-boundary for parent at internal node 2008 * is one element to the right of the element whos 2009 * right boundary needs adjusting. We must then 2010 * traverse down the left side correcting any left 2011 * bounds (which may now be too far to the left). 2012 */ 2013 ++cursor->index; 2014 error = hammer_btree_correct_lhb(cursor, tid); 2015 break; 2016 default: 2017 panic("hammer_btree_correct_rhb(): Bad node type"); 2018 error = EINVAL; 2019 break; 2020 } 2021 } 2022 2023 /* 2024 * Cleanup 2025 */ 2026 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2027 TAILQ_REMOVE(&rhb_list, rhb, entry); 2028 hammer_unlock(&rhb->node->lock); 2029 hammer_rel_node(rhb->node); 2030 kfree(rhb, hmp->m_misc); 2031 } 2032 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2033 hammer_unlock(&orig_node->lock); 2034 hammer_rel_node(orig_node); 2035 return (error); 2036 } 2037 2038 /* 2039 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2040 * bound going downward starting at the current cursor position. 2041 * 2042 * This function does not restore the cursor after use. 2043 */ 2044 int 2045 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2046 { 2047 struct hammer_rhb_list rhb_list; 2048 hammer_base_elm_t elm; 2049 hammer_base_elm_t cmp; 2050 struct hammer_rhb *rhb; 2051 struct hammer_mount *hmp; 2052 int error; 2053 2054 TAILQ_INIT(&rhb_list); 2055 hmp = cursor->trans->hmp; 2056 2057 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2058 2059 /* 2060 * Record the node and traverse down the left-hand side for all 2061 * matching records needing a boundary correction. 2062 */ 2063 error = 0; 2064 for (;;) { 2065 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2066 rhb->node = cursor->node; 2067 rhb->index = cursor->index; 2068 hammer_ref_node(rhb->node); 2069 hammer_lock_sh(&rhb->node->lock); 2070 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2071 2072 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2073 /* 2074 * Nothing to traverse down if we are at the right 2075 * boundary of an internal node. 2076 */ 2077 if (cursor->index == cursor->node->ondisk->count) 2078 break; 2079 } else { 2080 elm = &cursor->node->ondisk->elms[cursor->index].base; 2081 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2082 break; 2083 panic("Illegal leaf record type %02x", elm->btype); 2084 } 2085 error = hammer_cursor_down(cursor); 2086 if (error) 2087 break; 2088 2089 elm = &cursor->node->ondisk->elms[cursor->index].base; 2090 if (elm->obj_id != cmp->obj_id || 2091 elm->rec_type != cmp->rec_type || 2092 elm->key != cmp->key) { 2093 break; 2094 } 2095 if (elm->create_tid >= tid) 2096 break; 2097 2098 } 2099 2100 /* 2101 * Now we can safely adjust the left-hand boundary from the bottom-up. 2102 * The last element we remove from the list is the caller's right hand 2103 * boundary, which must also be adjusted. 2104 */ 2105 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2106 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2107 if (error) 2108 break; 2109 TAILQ_REMOVE(&rhb_list, rhb, entry); 2110 hammer_unlock(&rhb->node->lock); 2111 hammer_rel_node(rhb->node); 2112 kfree(rhb, hmp->m_misc); 2113 2114 elm = &cursor->node->ondisk->elms[cursor->index].base; 2115 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2116 hammer_modify_node(cursor->trans, cursor->node, 2117 &elm->create_tid, 2118 sizeof(elm->create_tid)); 2119 elm->create_tid = tid; 2120 hammer_modify_node_done(cursor->node); 2121 } else { 2122 panic("hammer_btree_correct_lhb(): Bad element type"); 2123 } 2124 } 2125 2126 /* 2127 * Cleanup 2128 */ 2129 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2130 TAILQ_REMOVE(&rhb_list, rhb, entry); 2131 hammer_unlock(&rhb->node->lock); 2132 hammer_rel_node(rhb->node); 2133 kfree(rhb, hmp->m_misc); 2134 } 2135 return (error); 2136 } 2137 2138 #endif 2139 2140 /* 2141 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2142 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2143 * the operation due to a deadlock, or some other error. 2144 * 2145 * This routine is initially called with an empty leaf and may be 2146 * recursively called with single-element internal nodes. 2147 * 2148 * It should also be noted that when removing empty leaves we must be sure 2149 * to test and update mirror_tid because another thread may have deadlocked 2150 * against us (or someone) trying to propagate it up and cannot retry once 2151 * the node has been deleted. 2152 * 2153 * On return the cursor may end up pointing to an internal node, suitable 2154 * for further iteration but not for an immediate insertion or deletion. 2155 */ 2156 static int 2157 btree_remove(hammer_cursor_t cursor) 2158 { 2159 hammer_node_ondisk_t ondisk; 2160 hammer_btree_elm_t elm; 2161 hammer_node_t node; 2162 hammer_node_t parent; 2163 const int esize = sizeof(*elm); 2164 int error; 2165 2166 node = cursor->node; 2167 2168 /* 2169 * When deleting the root of the filesystem convert it to 2170 * an empty leaf node. Internal nodes cannot be empty. 2171 */ 2172 ondisk = node->ondisk; 2173 if (ondisk->parent == 0) { 2174 KKASSERT(cursor->parent == NULL); 2175 hammer_modify_node_all(cursor->trans, node); 2176 KKASSERT(ondisk == node->ondisk); 2177 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2178 ondisk->count = 0; 2179 hammer_modify_node_done(node); 2180 cursor->index = 0; 2181 return(0); 2182 } 2183 2184 parent = cursor->parent; 2185 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2186 2187 /* 2188 * Attempt to remove the parent's reference to the child. If the 2189 * parent would become empty we have to recurse. If we fail we 2190 * leave the parent pointing to an empty leaf node. 2191 * 2192 * We have to recurse successfully before we can delete the internal 2193 * node as it is illegal to have empty internal nodes. Even though 2194 * the operation may be aborted we must still fixup any unlocked 2195 * cursors as if we had deleted the element prior to recursing 2196 * (by calling hammer_cursor_deleted_element()) so those cursors 2197 * are properly forced up the chain by the recursion. 2198 */ 2199 if (parent->ondisk->count == 1) { 2200 /* 2201 * This special cursor_up_locked() call leaves the original 2202 * node exclusively locked and referenced, leaves the 2203 * original parent locked (as the new node), and locks the 2204 * new parent. It can return EDEADLK. 2205 */ 2206 error = hammer_cursor_up_locked(cursor); 2207 if (error == 0) { 2208 hammer_cursor_deleted_element(cursor->node, 0); 2209 error = btree_remove(cursor); 2210 if (error == 0) { 2211 hammer_modify_node_all(cursor->trans, node); 2212 ondisk = node->ondisk; 2213 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2214 ondisk->count = 0; 2215 hammer_modify_node_done(node); 2216 hammer_flush_node(node); 2217 hammer_delete_node(cursor->trans, node); 2218 } else { 2219 /* 2220 * Defer parent removal because we could not 2221 * get the lock, just let the leaf remain 2222 * empty. 2223 */ 2224 /**/ 2225 } 2226 hammer_unlock(&node->lock); 2227 hammer_rel_node(node); 2228 } else { 2229 /* 2230 * Defer parent removal because we could not 2231 * get the lock, just let the leaf remain 2232 * empty. 2233 */ 2234 /**/ 2235 } 2236 } else { 2237 KKASSERT(parent->ondisk->count > 1); 2238 2239 hammer_modify_node_all(cursor->trans, parent); 2240 ondisk = parent->ondisk; 2241 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2242 2243 elm = &ondisk->elms[cursor->parent_index]; 2244 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2245 KKASSERT(ondisk->count > 0); 2246 2247 /* 2248 * We must retain the highest mirror_tid. The deleted 2249 * range is now encompassed by the element to the left. 2250 * If we are already at the left edge the new left edge 2251 * inherits mirror_tid. 2252 * 2253 * Note that bounds of the parent to our parent may create 2254 * a gap to the left of our left-most node or to the right 2255 * of our right-most node. The gap is silently included 2256 * in the mirror_tid's area of effect from the point of view 2257 * of the scan. 2258 */ 2259 if (cursor->parent_index) { 2260 if (elm[-1].internal.mirror_tid < 2261 elm[0].internal.mirror_tid) { 2262 elm[-1].internal.mirror_tid = 2263 elm[0].internal.mirror_tid; 2264 } 2265 } else { 2266 if (elm[1].internal.mirror_tid < 2267 elm[0].internal.mirror_tid) { 2268 elm[1].internal.mirror_tid = 2269 elm[0].internal.mirror_tid; 2270 } 2271 } 2272 2273 /* 2274 * Delete the subtree reference in the parent 2275 */ 2276 bcopy(&elm[1], &elm[0], 2277 (ondisk->count - cursor->parent_index) * esize); 2278 --ondisk->count; 2279 hammer_modify_node_done(parent); 2280 hammer_cursor_deleted_element(parent, cursor->parent_index); 2281 hammer_flush_node(node); 2282 hammer_delete_node(cursor->trans, node); 2283 2284 /* 2285 * cursor->node is invalid, cursor up to make the cursor 2286 * valid again. 2287 */ 2288 error = hammer_cursor_up(cursor); 2289 } 2290 return (error); 2291 } 2292 2293 /* 2294 * Propagate cursor->trans->tid up the B-Tree starting at the current 2295 * cursor position using pseudofs info gleaned from the passed inode. 2296 * 2297 * The passed inode has no relationship to the cursor position other 2298 * then being in the same pseudofs as the insertion or deletion we 2299 * are propagating the mirror_tid for. 2300 * 2301 * WARNING! Because we push and pop the passed cursor, it may be 2302 * modified by other B-Tree operations while it is unlocked 2303 * and things like the node & leaf pointers, and indexes might 2304 * change. 2305 */ 2306 void 2307 hammer_btree_do_propagation(hammer_cursor_t cursor, 2308 hammer_pseudofs_inmem_t pfsm, 2309 hammer_btree_leaf_elm_t leaf) 2310 { 2311 hammer_cursor_t ncursor; 2312 hammer_tid_t mirror_tid; 2313 int error; 2314 2315 /* 2316 * We do not propagate a mirror_tid if the filesystem was mounted 2317 * in no-mirror mode. 2318 */ 2319 if (cursor->trans->hmp->master_id < 0) 2320 return; 2321 2322 /* 2323 * This is a bit of a hack because we cannot deadlock or return 2324 * EDEADLK here. The related operation has already completed and 2325 * we must propagate the mirror_tid now regardless. 2326 * 2327 * Generate a new cursor which inherits the original's locks and 2328 * unlock the original. Use the new cursor to propagate the 2329 * mirror_tid. Then clean up the new cursor and reacquire locks 2330 * on the original. 2331 * 2332 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2333 * original's locks and the original is tracked and must be 2334 * re-locked. 2335 */ 2336 mirror_tid = cursor->node->ondisk->mirror_tid; 2337 KKASSERT(mirror_tid != 0); 2338 ncursor = hammer_push_cursor(cursor); 2339 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2340 KKASSERT(error == 0); 2341 hammer_pop_cursor(cursor, ncursor); 2342 /* WARNING: cursor's leaf pointer may change after pop */ 2343 } 2344 2345 2346 /* 2347 * Propagate a mirror TID update upwards through the B-Tree to the root. 2348 * 2349 * A locked internal node must be passed in. The node will remain locked 2350 * on return. 2351 * 2352 * This function syncs mirror_tid at the specified internal node's element, 2353 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2354 */ 2355 static int 2356 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2357 { 2358 hammer_btree_internal_elm_t elm; 2359 hammer_node_t node; 2360 int error; 2361 2362 for (;;) { 2363 error = hammer_cursor_up(cursor); 2364 if (error == 0) 2365 error = hammer_cursor_upgrade(cursor); 2366 while (error == EDEADLK) { 2367 hammer_recover_cursor(cursor); 2368 error = hammer_cursor_upgrade(cursor); 2369 } 2370 if (error) 2371 break; 2372 node = cursor->node; 2373 KKASSERT (node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2374 2375 /* 2376 * Adjust the node's element 2377 */ 2378 elm = &node->ondisk->elms[cursor->index].internal; 2379 if (elm->mirror_tid >= mirror_tid) 2380 break; 2381 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2382 sizeof(elm->mirror_tid)); 2383 elm->mirror_tid = mirror_tid; 2384 hammer_modify_node_done(node); 2385 if (hammer_debug_general & 0x0002) { 2386 kprintf("mirror_propagate: propagate " 2387 "%016llx @%016llx:%d\n", 2388 (long long)mirror_tid, 2389 (long long)node->node_offset, 2390 cursor->index); 2391 } 2392 2393 2394 /* 2395 * Adjust the node's mirror_tid aggregator 2396 */ 2397 if (node->ondisk->mirror_tid >= mirror_tid) 2398 return(0); 2399 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2400 node->ondisk->mirror_tid = mirror_tid; 2401 hammer_modify_node_done(node); 2402 if (hammer_debug_general & 0x0002) { 2403 kprintf("mirror_propagate: propagate " 2404 "%016llx @%016llx\n", 2405 (long long)mirror_tid, 2406 (long long)node->node_offset); 2407 } 2408 } 2409 if (error == ENOENT) 2410 error = 0; 2411 return(error); 2412 } 2413 2414 hammer_node_t 2415 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2416 int *parent_indexp, int *errorp, int try_exclusive) 2417 { 2418 hammer_node_t parent; 2419 hammer_btree_elm_t elm; 2420 int i; 2421 2422 /* 2423 * Get the node 2424 */ 2425 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2426 if (*errorp) { 2427 KKASSERT(parent == NULL); 2428 return(NULL); 2429 } 2430 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2431 2432 /* 2433 * Lock the node 2434 */ 2435 if (try_exclusive) { 2436 if (hammer_lock_ex_try(&parent->lock)) { 2437 hammer_rel_node(parent); 2438 *errorp = EDEADLK; 2439 return(NULL); 2440 } 2441 } else { 2442 hammer_lock_sh(&parent->lock); 2443 } 2444 2445 /* 2446 * Figure out which element in the parent is pointing to the 2447 * child. 2448 */ 2449 if (node->ondisk->count) { 2450 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2451 parent->ondisk); 2452 } else { 2453 i = 0; 2454 } 2455 while (i < parent->ondisk->count) { 2456 elm = &parent->ondisk->elms[i]; 2457 if (elm->internal.subtree_offset == node->node_offset) 2458 break; 2459 ++i; 2460 } 2461 if (i == parent->ondisk->count) { 2462 hammer_unlock(&parent->lock); 2463 panic("Bad B-Tree link: parent %p node %p\n", parent, node); 2464 } 2465 *parent_indexp = i; 2466 KKASSERT(*errorp == 0); 2467 return(parent); 2468 } 2469 2470 /* 2471 * The element (elm) has been moved to a new internal node (node). 2472 * 2473 * If the element represents a pointer to an internal node that node's 2474 * parent must be adjusted to the element's new location. 2475 * 2476 * XXX deadlock potential here with our exclusive locks 2477 */ 2478 int 2479 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2480 hammer_btree_elm_t elm) 2481 { 2482 hammer_node_t child; 2483 int error; 2484 2485 error = 0; 2486 2487 switch(elm->base.btype) { 2488 case HAMMER_BTREE_TYPE_INTERNAL: 2489 case HAMMER_BTREE_TYPE_LEAF: 2490 child = hammer_get_node(trans, elm->internal.subtree_offset, 2491 0, &error); 2492 if (error == 0) { 2493 hammer_modify_node_field(trans, child, parent); 2494 child->ondisk->parent = node->node_offset; 2495 hammer_modify_node_done(child); 2496 hammer_rel_node(child); 2497 } 2498 break; 2499 default: 2500 break; 2501 } 2502 return(error); 2503 } 2504 2505 /* 2506 * Initialize the root of a recursive B-Tree node lock list structure. 2507 */ 2508 void 2509 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2510 { 2511 TAILQ_INIT(&parent->list); 2512 parent->parent = NULL; 2513 parent->node = node; 2514 parent->index = -1; 2515 parent->count = node->ondisk->count; 2516 parent->copy = NULL; 2517 parent->flags = 0; 2518 } 2519 2520 /* 2521 * Exclusively lock all the children of node. This is used by the split 2522 * code to prevent anyone from accessing the children of a cursor node 2523 * while we fix-up its parent offset. 2524 * 2525 * If we don't lock the children we can really mess up cursors which block 2526 * trying to cursor-up into our node. 2527 * 2528 * On failure EDEADLK (or some other error) is returned. If a deadlock 2529 * error is returned the cursor is adjusted to block on termination. 2530 * 2531 * The caller is responsible for managing parent->node, the root's node 2532 * is usually aliased from a cursor. 2533 */ 2534 int 2535 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2536 hammer_node_lock_t parent) 2537 { 2538 hammer_node_t node; 2539 hammer_node_lock_t item; 2540 hammer_node_ondisk_t ondisk; 2541 hammer_btree_elm_t elm; 2542 hammer_node_t child; 2543 struct hammer_mount *hmp; 2544 int error; 2545 int i; 2546 2547 node = parent->node; 2548 ondisk = node->ondisk; 2549 error = 0; 2550 hmp = cursor->trans->hmp; 2551 2552 /* 2553 * We really do not want to block on I/O with exclusive locks held, 2554 * pre-get the children before trying to lock the mess. This is 2555 * only done one-level deep for now. 2556 */ 2557 for (i = 0; i < ondisk->count; ++i) { 2558 ++hammer_stats_btree_elements; 2559 elm = &ondisk->elms[i]; 2560 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2561 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2562 continue; 2563 } 2564 child = hammer_get_node(cursor->trans, 2565 elm->internal.subtree_offset, 2566 0, &error); 2567 if (child) 2568 hammer_rel_node(child); 2569 } 2570 2571 /* 2572 * Do it for real 2573 */ 2574 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2575 ++hammer_stats_btree_elements; 2576 elm = &ondisk->elms[i]; 2577 2578 switch(elm->base.btype) { 2579 case HAMMER_BTREE_TYPE_INTERNAL: 2580 case HAMMER_BTREE_TYPE_LEAF: 2581 KKASSERT(elm->internal.subtree_offset != 0); 2582 child = hammer_get_node(cursor->trans, 2583 elm->internal.subtree_offset, 2584 0, &error); 2585 break; 2586 default: 2587 child = NULL; 2588 break; 2589 } 2590 if (child) { 2591 if (hammer_lock_ex_try(&child->lock) != 0) { 2592 if (cursor->deadlk_node == NULL) { 2593 cursor->deadlk_node = child; 2594 hammer_ref_node(cursor->deadlk_node); 2595 } 2596 error = EDEADLK; 2597 hammer_rel_node(child); 2598 } else { 2599 item = kmalloc(sizeof(*item), hmp->m_misc, 2600 M_WAITOK|M_ZERO); 2601 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2602 TAILQ_INIT(&item->list); 2603 item->parent = parent; 2604 item->node = child; 2605 item->index = i; 2606 item->count = child->ondisk->count; 2607 2608 /* 2609 * Recurse (used by the rebalancing code) 2610 */ 2611 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2612 error = hammer_btree_lock_children( 2613 cursor, 2614 depth - 1, 2615 item); 2616 } 2617 } 2618 } 2619 } 2620 if (error) 2621 hammer_btree_unlock_children(cursor, parent); 2622 return(error); 2623 } 2624 2625 /* 2626 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2627 * including the parent. 2628 */ 2629 void 2630 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2631 { 2632 hammer_mount_t hmp = cursor->trans->hmp; 2633 hammer_node_lock_t item; 2634 2635 if (parent->copy == NULL) { 2636 parent->copy = kmalloc(sizeof(*parent->copy), hmp->m_misc, 2637 M_WAITOK); 2638 *parent->copy = *parent->node->ondisk; 2639 } 2640 TAILQ_FOREACH(item, &parent->list, entry) { 2641 hammer_btree_lock_copy(cursor, item); 2642 } 2643 } 2644 2645 /* 2646 * Recursively sync modified copies to the media. 2647 */ 2648 int 2649 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2650 { 2651 hammer_node_lock_t item; 2652 int count = 0; 2653 2654 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2655 ++count; 2656 hammer_modify_node_all(cursor->trans, parent->node); 2657 *parent->node->ondisk = *parent->copy; 2658 hammer_modify_node_done(parent->node); 2659 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2660 hammer_flush_node(parent->node); 2661 hammer_delete_node(cursor->trans, parent->node); 2662 } 2663 } 2664 TAILQ_FOREACH(item, &parent->list, entry) { 2665 count += hammer_btree_sync_copy(cursor, item); 2666 } 2667 return(count); 2668 } 2669 2670 /* 2671 * Release previously obtained node locks. The caller is responsible for 2672 * cleaning up parent->node itself (its usually just aliased from a cursor), 2673 * but this function will take care of the copies. 2674 */ 2675 void 2676 hammer_btree_unlock_children(hammer_cursor_t cursor, hammer_node_lock_t parent) 2677 { 2678 hammer_node_lock_t item; 2679 2680 if (parent->copy) { 2681 kfree(parent->copy, cursor->trans->hmp->m_misc); 2682 parent->copy = NULL; /* safety */ 2683 } 2684 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2685 TAILQ_REMOVE(&parent->list, item, entry); 2686 hammer_btree_unlock_children(cursor, item); 2687 hammer_unlock(&item->node->lock); 2688 hammer_rel_node(item->node); 2689 kfree(item, cursor->trans->hmp->m_misc); 2690 } 2691 } 2692 2693 /************************************************************************ 2694 * MISCELLANIOUS SUPPORT * 2695 ************************************************************************/ 2696 2697 /* 2698 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2699 * 2700 * Note that for this particular function a return value of -1, 0, or +1 2701 * can denote a match if create_tid is otherwise discounted. A create_tid 2702 * of zero is considered to be 'infinity' in comparisons. 2703 * 2704 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2705 */ 2706 int 2707 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2708 { 2709 if (key1->localization < key2->localization) 2710 return(-5); 2711 if (key1->localization > key2->localization) 2712 return(5); 2713 2714 if (key1->obj_id < key2->obj_id) 2715 return(-4); 2716 if (key1->obj_id > key2->obj_id) 2717 return(4); 2718 2719 if (key1->rec_type < key2->rec_type) 2720 return(-3); 2721 if (key1->rec_type > key2->rec_type) 2722 return(3); 2723 2724 if (key1->key < key2->key) 2725 return(-2); 2726 if (key1->key > key2->key) 2727 return(2); 2728 2729 /* 2730 * A create_tid of zero indicates a record which is undeletable 2731 * and must be considered to have a value of positive infinity. 2732 */ 2733 if (key1->create_tid == 0) { 2734 if (key2->create_tid == 0) 2735 return(0); 2736 return(1); 2737 } 2738 if (key2->create_tid == 0) 2739 return(-1); 2740 if (key1->create_tid < key2->create_tid) 2741 return(-1); 2742 if (key1->create_tid > key2->create_tid) 2743 return(1); 2744 return(0); 2745 } 2746 2747 /* 2748 * Test a timestamp against an element to determine whether the 2749 * element is visible. A timestamp of 0 means 'infinity'. 2750 */ 2751 int 2752 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2753 { 2754 if (asof == 0) { 2755 if (base->delete_tid) 2756 return(1); 2757 return(0); 2758 } 2759 if (asof < base->create_tid) 2760 return(-1); 2761 if (base->delete_tid && asof >= base->delete_tid) 2762 return(1); 2763 return(0); 2764 } 2765 2766 /* 2767 * Create a separator half way inbetween key1 and key2. For fields just 2768 * one unit apart, the separator will match key2. key1 is on the left-hand 2769 * side and key2 is on the right-hand side. 2770 * 2771 * key2 must be >= the separator. It is ok for the separator to match key2. 2772 * 2773 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2774 * key2. 2775 * 2776 * NOTE: It might be beneficial to just scrap this whole mess and just 2777 * set the separator to key2. 2778 */ 2779 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 2780 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 2781 2782 static void 2783 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 2784 hammer_base_elm_t dest) 2785 { 2786 bzero(dest, sizeof(*dest)); 2787 2788 dest->rec_type = key2->rec_type; 2789 dest->key = key2->key; 2790 dest->obj_id = key2->obj_id; 2791 dest->create_tid = key2->create_tid; 2792 2793 MAKE_SEPARATOR(key1, key2, dest, localization); 2794 if (key1->localization == key2->localization) { 2795 MAKE_SEPARATOR(key1, key2, dest, obj_id); 2796 if (key1->obj_id == key2->obj_id) { 2797 MAKE_SEPARATOR(key1, key2, dest, rec_type); 2798 if (key1->rec_type == key2->rec_type) { 2799 MAKE_SEPARATOR(key1, key2, dest, key); 2800 /* 2801 * Don't bother creating a separator for 2802 * create_tid, which also conveniently avoids 2803 * having to handle the create_tid == 0 2804 * (infinity) case. Just leave create_tid 2805 * set to key2. 2806 * 2807 * Worst case, dest matches key2 exactly, 2808 * which is acceptable. 2809 */ 2810 } 2811 } 2812 } 2813 } 2814 2815 #undef MAKE_SEPARATOR 2816 2817 /* 2818 * Return whether a generic internal or leaf node is full 2819 */ 2820 static int 2821 btree_node_is_full(hammer_node_ondisk_t node) 2822 { 2823 switch(node->type) { 2824 case HAMMER_BTREE_TYPE_INTERNAL: 2825 if (node->count == HAMMER_BTREE_INT_ELMS) 2826 return(1); 2827 break; 2828 case HAMMER_BTREE_TYPE_LEAF: 2829 if (node->count == HAMMER_BTREE_LEAF_ELMS) 2830 return(1); 2831 break; 2832 default: 2833 panic("illegal btree subtype"); 2834 } 2835 return(0); 2836 } 2837 2838 #if 0 2839 static int 2840 btree_max_elements(u_int8_t type) 2841 { 2842 if (type == HAMMER_BTREE_TYPE_LEAF) 2843 return(HAMMER_BTREE_LEAF_ELMS); 2844 if (type == HAMMER_BTREE_TYPE_INTERNAL) 2845 return(HAMMER_BTREE_INT_ELMS); 2846 panic("btree_max_elements: bad type %d\n", type); 2847 } 2848 #endif 2849 2850 void 2851 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 2852 { 2853 hammer_btree_elm_t elm; 2854 int i; 2855 2856 kprintf("node %p count=%d parent=%016llx type=%c\n", 2857 ondisk, ondisk->count, 2858 (long long)ondisk->parent, ondisk->type); 2859 2860 /* 2861 * Dump both boundary elements if an internal node 2862 */ 2863 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2864 for (i = 0; i <= ondisk->count; ++i) { 2865 elm = &ondisk->elms[i]; 2866 hammer_print_btree_elm(elm, ondisk->type, i); 2867 } 2868 } else { 2869 for (i = 0; i < ondisk->count; ++i) { 2870 elm = &ondisk->elms[i]; 2871 hammer_print_btree_elm(elm, ondisk->type, i); 2872 } 2873 } 2874 } 2875 2876 void 2877 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 2878 { 2879 kprintf(" %2d", i); 2880 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 2881 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 2882 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 2883 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 2884 kprintf("\trec_type = %04x\n", elm->base.rec_type); 2885 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 2886 kprintf("\tbtype = %02x (%c)\n", 2887 elm->base.btype, 2888 (elm->base.btype ? elm->base.btype : '?')); 2889 kprintf("\tlocalization = %02x\n", elm->base.localization); 2890 2891 switch(type) { 2892 case HAMMER_BTREE_TYPE_INTERNAL: 2893 kprintf("\tsubtree_off = %016llx\n", 2894 (long long)elm->internal.subtree_offset); 2895 break; 2896 case HAMMER_BTREE_TYPE_RECORD: 2897 kprintf("\tdata_offset = %016llx\n", 2898 (long long)elm->leaf.data_offset); 2899 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 2900 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 2901 break; 2902 } 2903 } 2904