1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index 37 * 38 * HAMMER implements a modified B+Tree. In documentation this will 39 * simply be refered to as the HAMMER B-Tree. Basically a HAMMER B-Tree 40 * looks like a B+Tree (A B-Tree which stores its records only at the leafs 41 * of the tree), but adds two additional boundary elements which describe 42 * the left-most and right-most element a node is able to represent. In 43 * otherwords, we have boundary elements at the two ends of a B-Tree node 44 * instead of sub-tree pointers. 45 * 46 * A B-Tree internal node looks like this: 47 * 48 * B N N N N N N B <-- boundary and internal elements 49 * S S S S S S S <-- subtree pointers 50 * 51 * A B-Tree leaf node basically looks like this: 52 * 53 * L L L L L L L L <-- leaf elemenets 54 * 55 * The radix for an internal node is 1 less then a leaf but we get a 56 * number of significant benefits for our troubles. 57 * 58 * The big benefit to using a B-Tree containing boundary information 59 * is that it is possible to cache pointers into the middle of the tree 60 * and not have to start searches, insertions, OR deletions at the root 61 * node. In particular, searches are able to progress in a definitive 62 * direction from any point in the tree without revisting nodes. This 63 * greatly improves the efficiency of many operations, most especially 64 * record appends. 65 * 66 * B-Trees also make the stacking of trees fairly straightforward. 67 * 68 * INSERTIONS: A search performed with the intention of doing 69 * an insert will guarantee that the terminal leaf node is not full by 70 * splitting full nodes. Splits occur top-down during the dive down the 71 * B-Tree. 72 * 73 * DELETIONS: A deletion makes no attempt to proactively balance the 74 * tree and will recursively remove nodes that become empty. If a 75 * deadlock occurs a deletion may not be able to remove an empty leaf. 76 * Deletions never allow internal nodes to become empty (that would blow 77 * up the boundaries). 78 */ 79 #include "hammer.h" 80 #include <sys/buf.h> 81 #include <sys/buf2.h> 82 83 static int btree_search(hammer_cursor_t cursor, int flags); 84 static int btree_split_internal(hammer_cursor_t cursor); 85 static int btree_split_leaf(hammer_cursor_t cursor); 86 static int btree_remove(hammer_cursor_t cursor); 87 static int btree_node_is_full(hammer_node_ondisk_t node); 88 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor, 89 hammer_tid_t mirror_tid); 90 static void hammer_make_separator(hammer_base_elm_t key1, 91 hammer_base_elm_t key2, hammer_base_elm_t dest); 92 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor); 93 94 /* 95 * Iterate records after a search. The cursor is iterated forwards past 96 * the current record until a record matching the key-range requirements 97 * is found. ENOENT is returned if the iteration goes past the ending 98 * key. 99 * 100 * The iteration is inclusive of key_beg and can be inclusive or exclusive 101 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. 102 * 103 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid 104 * may be modified by B-Tree functions. 105 * 106 * cursor->key_beg may or may not be modified by this function during 107 * the iteration. XXX future - in case of an inverted lock we may have 108 * to reinitiate the lookup and set key_beg to properly pick up where we 109 * left off. 110 * 111 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor 112 * was reverse indexed due to being moved to a parent while unlocked, 113 * and something else might have inserted an element outside the iteration 114 * range. When this case occurs the iterator just keeps iterating until 115 * it gets back into the iteration range (instead of asserting). 116 * 117 * NOTE! EDEADLK *CANNOT* be returned by this procedure. 118 */ 119 int 120 hammer_btree_iterate(hammer_cursor_t cursor) 121 { 122 hammer_node_ondisk_t node; 123 hammer_btree_elm_t elm; 124 hammer_mount_t hmp; 125 int error = 0; 126 int r; 127 int s; 128 129 /* 130 * Skip past the current record 131 */ 132 hmp = cursor->trans->hmp; 133 node = cursor->node->ondisk; 134 if (node == NULL) 135 return(ENOENT); 136 if (cursor->index < node->count && 137 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 138 ++cursor->index; 139 } 140 141 /* 142 * HAMMER can wind up being cpu-bound. 143 */ 144 if (++hmp->check_yield > hammer_yield_check) { 145 hmp->check_yield = 0; 146 lwkt_user_yield(); 147 } 148 149 150 /* 151 * Loop until an element is found or we are done. 152 */ 153 for (;;) { 154 /* 155 * We iterate up the tree and then index over one element 156 * while we are at the last element in the current node. 157 * 158 * If we are at the root of the filesystem, cursor_up 159 * returns ENOENT. 160 * 161 * XXX this could be optimized by storing the information in 162 * the parent reference. 163 * 164 * XXX we can lose the node lock temporarily, this could mess 165 * up our scan. 166 */ 167 ++hammer_stats_btree_iterations; 168 hammer_flusher_clean_loose_ios(hmp); 169 170 if (cursor->index == node->count) { 171 if (hammer_debug_btree) { 172 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n", 173 (long long)cursor->node->node_offset, 174 cursor->index, 175 (long long)(cursor->parent ? cursor->parent->node_offset : -1), 176 cursor->parent_index, 177 curthread); 178 } 179 KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset); 180 error = hammer_cursor_up(cursor); 181 if (error) 182 break; 183 /* reload stale pointer */ 184 node = cursor->node->ondisk; 185 KKASSERT(cursor->index != node->count); 186 187 /* 188 * If we are reblocking we want to return internal 189 * nodes. Note that the internal node will be 190 * returned multiple times, on each upward recursion 191 * from its children. The caller selects which 192 * revisit it cares about (usually first or last only). 193 */ 194 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) { 195 cursor->flags |= HAMMER_CURSOR_ATEDISK; 196 return(0); 197 } 198 ++cursor->index; 199 continue; 200 } 201 202 /* 203 * Check internal or leaf element. Determine if the record 204 * at the cursor has gone beyond the end of our range. 205 * 206 * We recurse down through internal nodes. 207 */ 208 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 209 elm = &node->elms[cursor->index]; 210 211 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 212 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 213 if (hammer_debug_btree) { 214 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n", 215 (long long)cursor->node->node_offset, 216 cursor->index, 217 (long long)elm[0].internal.base.obj_id, 218 elm[0].internal.base.rec_type, 219 (long long)elm[0].internal.base.key, 220 elm[0].internal.base.localization, 221 r, 222 curthread 223 ); 224 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 225 (long long)cursor->node->node_offset, 226 cursor->index + 1, 227 (long long)elm[1].internal.base.obj_id, 228 elm[1].internal.base.rec_type, 229 (long long)elm[1].internal.base.key, 230 elm[1].internal.base.localization, 231 s 232 ); 233 } 234 235 if (r < 0) { 236 error = ENOENT; 237 break; 238 } 239 if (r == 0 && (cursor->flags & 240 HAMMER_CURSOR_END_INCLUSIVE) == 0) { 241 error = ENOENT; 242 break; 243 } 244 245 /* 246 * Better not be zero 247 */ 248 KKASSERT(elm->internal.subtree_offset != 0); 249 250 if (s <= 0) { 251 /* 252 * If running the mirror filter see if we 253 * can skip one or more entire sub-trees. 254 * If we can we return the internal node 255 * and the caller processes the skipped 256 * range (see mirror_read). 257 */ 258 if (cursor->flags & 259 HAMMER_CURSOR_MIRROR_FILTERED) { 260 if (elm->internal.mirror_tid < 261 cursor->cmirror->mirror_tid) { 262 hammer_cursor_mirror_filter(cursor); 263 return(0); 264 } 265 } 266 } else { 267 /* 268 * Normally it would be impossible for the 269 * cursor to have gotten back-indexed, 270 * but it can happen if a node is deleted 271 * and the cursor is moved to its parent 272 * internal node. ITERATE_CHECK will be set. 273 */ 274 KKASSERT(cursor->flags & 275 HAMMER_CURSOR_ITERATE_CHECK); 276 kprintf("hammer_btree_iterate: " 277 "DEBUG: Caught parent seek " 278 "in internal iteration\n"); 279 } 280 281 error = hammer_cursor_down(cursor); 282 if (error) 283 break; 284 KKASSERT(cursor->index == 0); 285 /* reload stale pointer */ 286 node = cursor->node->ondisk; 287 continue; 288 } else { 289 elm = &node->elms[cursor->index]; 290 r = hammer_btree_cmp(&cursor->key_end, &elm->base); 291 if (hammer_debug_btree) { 292 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 293 (long long)cursor->node->node_offset, 294 cursor->index, 295 (elm[0].leaf.base.btype ? 296 elm[0].leaf.base.btype : '?'), 297 (long long)elm[0].leaf.base.obj_id, 298 elm[0].leaf.base.rec_type, 299 (long long)elm[0].leaf.base.key, 300 elm[0].leaf.base.localization, 301 r 302 ); 303 } 304 if (r < 0) { 305 error = ENOENT; 306 break; 307 } 308 309 /* 310 * We support both end-inclusive and 311 * end-exclusive searches. 312 */ 313 if (r == 0 && 314 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 315 error = ENOENT; 316 break; 317 } 318 319 /* 320 * If ITERATE_CHECK is set an unlocked cursor may 321 * have been moved to a parent and the iterate can 322 * happen upon elements that are not in the requested 323 * range. 324 */ 325 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) { 326 s = hammer_btree_cmp(&cursor->key_beg, 327 &elm->base); 328 if (s > 0) { 329 kprintf("hammer_btree_iterate: " 330 "DEBUG: Caught parent seek " 331 "in leaf iteration\n"); 332 ++cursor->index; 333 continue; 334 } 335 } 336 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 337 338 /* 339 * Return the element 340 */ 341 switch(elm->leaf.base.btype) { 342 case HAMMER_BTREE_TYPE_RECORD: 343 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 344 hammer_btree_chkts(cursor->asof, &elm->base)) { 345 ++cursor->index; 346 continue; 347 } 348 error = 0; 349 break; 350 default: 351 error = EINVAL; 352 break; 353 } 354 if (error) 355 break; 356 } 357 /* 358 * node pointer invalid after loop 359 */ 360 361 /* 362 * Return entry 363 */ 364 if (hammer_debug_btree) { 365 int i = cursor->index; 366 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 367 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 368 cursor->node, i, 369 (long long)elm->internal.base.obj_id, 370 elm->internal.base.rec_type, 371 (long long)elm->internal.base.key, 372 elm->internal.base.localization 373 ); 374 } 375 return(0); 376 } 377 return(error); 378 } 379 380 /* 381 * We hit an internal element that we could skip as part of a mirroring 382 * scan. Calculate the entire range being skipped. 383 * 384 * It is important to include any gaps between the parent's left_bound 385 * and the node's left_bound, and same goes for the right side. 386 */ 387 static void 388 hammer_cursor_mirror_filter(hammer_cursor_t cursor) 389 { 390 struct hammer_cmirror *cmirror; 391 hammer_node_ondisk_t ondisk; 392 hammer_btree_elm_t elm; 393 394 ondisk = cursor->node->ondisk; 395 cmirror = cursor->cmirror; 396 397 /* 398 * Calculate the skipped range 399 */ 400 elm = &ondisk->elms[cursor->index]; 401 if (cursor->index == 0) 402 cmirror->skip_beg = *cursor->left_bound; 403 else 404 cmirror->skip_beg = elm->internal.base; 405 while (cursor->index < ondisk->count) { 406 if (elm->internal.mirror_tid >= cmirror->mirror_tid) 407 break; 408 ++cursor->index; 409 ++elm; 410 } 411 if (cursor->index == ondisk->count) 412 cmirror->skip_end = *cursor->right_bound; 413 else 414 cmirror->skip_end = elm->internal.base; 415 416 /* 417 * clip the returned result. 418 */ 419 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0) 420 cmirror->skip_beg = cursor->key_beg; 421 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0) 422 cmirror->skip_end = cursor->key_end; 423 } 424 425 /* 426 * Iterate in the reverse direction. This is used by the pruning code to 427 * avoid overlapping records. 428 */ 429 int 430 hammer_btree_iterate_reverse(hammer_cursor_t cursor) 431 { 432 hammer_node_ondisk_t node; 433 hammer_btree_elm_t elm; 434 hammer_mount_t hmp; 435 int error = 0; 436 int r; 437 int s; 438 439 /* mirror filtering not supported for reverse iteration */ 440 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0); 441 442 /* 443 * Skip past the current record. For various reasons the cursor 444 * may end up set to -1 or set to point at the end of the current 445 * node. These cases must be addressed. 446 */ 447 node = cursor->node->ondisk; 448 if (node == NULL) 449 return(ENOENT); 450 if (cursor->index != -1 && 451 (cursor->flags & HAMMER_CURSOR_ATEDISK)) { 452 --cursor->index; 453 } 454 if (cursor->index == cursor->node->ondisk->count) 455 --cursor->index; 456 457 /* 458 * HAMMER can wind up being cpu-bound. 459 */ 460 hmp = cursor->trans->hmp; 461 if (++hmp->check_yield > hammer_yield_check) { 462 hmp->check_yield = 0; 463 lwkt_user_yield(); 464 } 465 466 /* 467 * Loop until an element is found or we are done. 468 */ 469 for (;;) { 470 ++hammer_stats_btree_iterations; 471 hammer_flusher_clean_loose_ios(hmp); 472 473 /* 474 * We iterate up the tree and then index over one element 475 * while we are at the last element in the current node. 476 */ 477 if (cursor->index == -1) { 478 error = hammer_cursor_up(cursor); 479 if (error) { 480 cursor->index = 0; /* sanity */ 481 break; 482 } 483 /* reload stale pointer */ 484 node = cursor->node->ondisk; 485 KKASSERT(cursor->index != node->count); 486 --cursor->index; 487 continue; 488 } 489 490 /* 491 * Check internal or leaf element. Determine if the record 492 * at the cursor has gone beyond the end of our range. 493 * 494 * We recurse down through internal nodes. 495 */ 496 KKASSERT(cursor->index != node->count); 497 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 498 elm = &node->elms[cursor->index]; 499 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base); 500 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base); 501 if (hammer_debug_btree) { 502 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 503 (long long)cursor->node->node_offset, 504 cursor->index, 505 (long long)elm[0].internal.base.obj_id, 506 elm[0].internal.base.rec_type, 507 (long long)elm[0].internal.base.key, 508 elm[0].internal.base.localization, 509 r 510 ); 511 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n", 512 (long long)cursor->node->node_offset, 513 cursor->index + 1, 514 (long long)elm[1].internal.base.obj_id, 515 elm[1].internal.base.rec_type, 516 (long long)elm[1].internal.base.key, 517 elm[1].internal.base.localization, 518 s 519 ); 520 } 521 522 if (s >= 0) { 523 error = ENOENT; 524 break; 525 } 526 527 /* 528 * It shouldn't be possible to be seeked past key_end, 529 * even if the cursor got moved to a parent. 530 */ 531 KKASSERT(r >= 0); 532 533 /* 534 * Better not be zero 535 */ 536 KKASSERT(elm->internal.subtree_offset != 0); 537 538 error = hammer_cursor_down(cursor); 539 if (error) 540 break; 541 KKASSERT(cursor->index == 0); 542 /* reload stale pointer */ 543 node = cursor->node->ondisk; 544 545 /* this can assign -1 if the leaf was empty */ 546 cursor->index = node->count - 1; 547 continue; 548 } else { 549 elm = &node->elms[cursor->index]; 550 s = hammer_btree_cmp(&cursor->key_beg, &elm->base); 551 if (hammer_debug_btree) { 552 kprintf("ELEMENT %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n", 553 (long long)cursor->node->node_offset, 554 cursor->index, 555 (elm[0].leaf.base.btype ? 556 elm[0].leaf.base.btype : '?'), 557 (long long)elm[0].leaf.base.obj_id, 558 elm[0].leaf.base.rec_type, 559 (long long)elm[0].leaf.base.key, 560 elm[0].leaf.base.localization, 561 s 562 ); 563 } 564 if (s > 0) { 565 error = ENOENT; 566 break; 567 } 568 569 /* 570 * It shouldn't be possible to be seeked past key_end, 571 * even if the cursor got moved to a parent. 572 */ 573 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 574 575 /* 576 * Return the element 577 */ 578 switch(elm->leaf.base.btype) { 579 case HAMMER_BTREE_TYPE_RECORD: 580 if ((cursor->flags & HAMMER_CURSOR_ASOF) && 581 hammer_btree_chkts(cursor->asof, &elm->base)) { 582 --cursor->index; 583 continue; 584 } 585 error = 0; 586 break; 587 default: 588 error = EINVAL; 589 break; 590 } 591 if (error) 592 break; 593 } 594 /* 595 * node pointer invalid after loop 596 */ 597 598 /* 599 * Return entry 600 */ 601 if (hammer_debug_btree) { 602 int i = cursor->index; 603 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i]; 604 kprintf("ITERATE %p:%d %016llx %02x %016llx lo=%02x\n", 605 cursor->node, i, 606 (long long)elm->internal.base.obj_id, 607 elm->internal.base.rec_type, 608 (long long)elm->internal.base.key, 609 elm->internal.base.localization 610 ); 611 } 612 return(0); 613 } 614 return(error); 615 } 616 617 /* 618 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry 619 * could not be found, EDEADLK if inserting and a retry is needed, and a 620 * fatal error otherwise. When retrying, the caller must terminate the 621 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting. 622 * 623 * The cursor is suitably positioned for a deletion on success, and suitably 624 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was 625 * specified. 626 * 627 * The cursor may begin anywhere, the search will traverse the tree in 628 * either direction to locate the requested element. 629 * 630 * Most of the logic implementing historical searches is handled here. We 631 * do an initial lookup with create_tid set to the asof TID. Due to the 632 * way records are laid out, a backwards iteration may be required if 633 * ENOENT is returned to locate the historical record. Here's the 634 * problem: 635 * 636 * create_tid: 10 15 20 637 * LEAF1 LEAF2 638 * records: (11) (18) 639 * 640 * Lets say we want to do a lookup AS-OF timestamp 17. We will traverse 641 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is 642 * not visible and thus causes ENOENT to be returned. We really need 643 * to check record 11 in LEAF1. If it also fails then the search fails 644 * (e.g. it might represent the range 11-16 and thus still not match our 645 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring 646 * further iterations. 647 * 648 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK 649 * and the cursor->create_check TID if an iteration might be needed. 650 * In the above example create_check would be set to 14. 651 */ 652 int 653 hammer_btree_lookup(hammer_cursor_t cursor) 654 { 655 int error; 656 657 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK; 658 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || 659 cursor->trans->sync_lock_refs > 0); 660 ++hammer_stats_btree_lookups; 661 if (cursor->flags & HAMMER_CURSOR_ASOF) { 662 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); 663 cursor->key_beg.create_tid = cursor->asof; 664 for (;;) { 665 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK; 666 error = btree_search(cursor, 0); 667 if (error != ENOENT || 668 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) { 669 /* 670 * Stop if no error. 671 * Stop if error other then ENOENT. 672 * Stop if ENOENT and not special case. 673 */ 674 break; 675 } 676 if (hammer_debug_btree) { 677 kprintf("CREATE_CHECK %016llx\n", 678 (long long)cursor->create_check); 679 } 680 cursor->key_beg.create_tid = cursor->create_check; 681 /* loop */ 682 } 683 } else { 684 error = btree_search(cursor, 0); 685 } 686 if (error == 0) 687 error = hammer_btree_extract(cursor, cursor->flags); 688 return(error); 689 } 690 691 /* 692 * Execute the logic required to start an iteration. The first record 693 * located within the specified range is returned and iteration control 694 * flags are adjusted for successive hammer_btree_iterate() calls. 695 * 696 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate 697 * in a loop without worrying about it. Higher-level merged searches will 698 * adjust the flag appropriately. 699 */ 700 int 701 hammer_btree_first(hammer_cursor_t cursor) 702 { 703 int error; 704 705 error = hammer_btree_lookup(cursor); 706 if (error == ENOENT) { 707 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 708 error = hammer_btree_iterate(cursor); 709 } 710 cursor->flags |= HAMMER_CURSOR_ATEDISK; 711 return(error); 712 } 713 714 /* 715 * Similarly but for an iteration in the reverse direction. 716 * 717 * Set ATEDISK when iterating backwards to skip the current entry, 718 * which after an ENOENT lookup will be pointing beyond our end point. 719 * 720 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse 721 * in a loop without worrying about it. Higher-level merged searches will 722 * adjust the flag appropriately. 723 */ 724 int 725 hammer_btree_last(hammer_cursor_t cursor) 726 { 727 struct hammer_base_elm save; 728 int error; 729 730 save = cursor->key_beg; 731 cursor->key_beg = cursor->key_end; 732 error = hammer_btree_lookup(cursor); 733 cursor->key_beg = save; 734 if (error == ENOENT || 735 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) { 736 cursor->flags |= HAMMER_CURSOR_ATEDISK; 737 error = hammer_btree_iterate_reverse(cursor); 738 } 739 cursor->flags |= HAMMER_CURSOR_ATEDISK; 740 return(error); 741 } 742 743 /* 744 * Extract the record and/or data associated with the cursor's current 745 * position. Any prior record or data stored in the cursor is replaced. 746 * The cursor must be positioned at a leaf node. 747 * 748 * NOTE: All extractions occur at the leaf of the B-Tree. 749 */ 750 int 751 hammer_btree_extract(hammer_cursor_t cursor, int flags) 752 { 753 hammer_node_ondisk_t node; 754 hammer_btree_elm_t elm; 755 hammer_off_t data_off; 756 hammer_mount_t hmp; 757 int32_t data_len; 758 int error; 759 760 /* 761 * The case where the data reference resolves to the same buffer 762 * as the record reference must be handled. 763 */ 764 node = cursor->node->ondisk; 765 elm = &node->elms[cursor->index]; 766 cursor->data = NULL; 767 hmp = cursor->node->hmp; 768 769 /* 770 * There is nothing to extract for an internal element. 771 */ 772 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) 773 return(EINVAL); 774 775 /* 776 * Only record types have data. 777 */ 778 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 779 cursor->leaf = &elm->leaf; 780 781 if ((flags & HAMMER_CURSOR_GET_DATA) == 0) 782 return(0); 783 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD) 784 return(0); 785 data_off = elm->leaf.data_offset; 786 data_len = elm->leaf.data_len; 787 if (data_off == 0) 788 return(0); 789 790 /* 791 * Load the data 792 */ 793 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE); 794 cursor->data = hammer_bread_ext(hmp, data_off, data_len, 795 &error, &cursor->data_buffer); 796 797 /* 798 * Mark the data buffer as not being meta-data if it isn't 799 * meta-data (sometimes bulk data is accessed via a volume 800 * block device). 801 */ 802 if (error == 0) { 803 switch(elm->leaf.base.rec_type) { 804 case HAMMER_RECTYPE_DATA: 805 case HAMMER_RECTYPE_DB: 806 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0) 807 break; 808 if (hammer_double_buffer == 0 || 809 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) { 810 hammer_io_notmeta(cursor->data_buffer); 811 } 812 break; 813 default: 814 break; 815 } 816 } 817 818 /* 819 * Deal with CRC errors on the extracted data. 820 */ 821 if (error == 0 && 822 hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) { 823 kprintf("CRC DATA @ %016llx/%d FAILED\n", 824 (long long)elm->leaf.data_offset, elm->leaf.data_len); 825 if (hammer_debug_critical) 826 Debugger("CRC FAILED: DATA"); 827 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM) 828 error = EDOM; /* less critical (mirroring) */ 829 else 830 error = EIO; /* critical */ 831 } 832 return(error); 833 } 834 835 836 /* 837 * Insert a leaf element into the B-Tree at the current cursor position. 838 * The cursor is positioned such that the element at and beyond the cursor 839 * are shifted to make room for the new record. 840 * 841 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT 842 * flag set and that call must return ENOENT before this function can be 843 * called. 844 * 845 * The caller may depend on the cursor's exclusive lock after return to 846 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). 847 * 848 * ENOSPC is returned if there is no room to insert a new record. 849 */ 850 int 851 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm, 852 int *doprop) 853 { 854 hammer_node_ondisk_t node; 855 int i; 856 int error; 857 858 *doprop = 0; 859 if ((error = hammer_cursor_upgrade_node(cursor)) != 0) 860 return(error); 861 ++hammer_stats_btree_inserts; 862 863 /* 864 * Insert the element at the leaf node and update the count in the 865 * parent. It is possible for parent to be NULL, indicating that 866 * the filesystem's ROOT B-Tree node is a leaf itself, which is 867 * possible. The root inode can never be deleted so the leaf should 868 * never be empty. 869 * 870 * Remember that the right-hand boundary is not included in the 871 * count. 872 */ 873 hammer_modify_node_all(cursor->trans, cursor->node); 874 node = cursor->node->ondisk; 875 i = cursor->index; 876 KKASSERT(elm->base.btype != 0); 877 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); 878 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); 879 if (i != node->count) { 880 bcopy(&node->elms[i], &node->elms[i+1], 881 (node->count - i) * sizeof(*elm)); 882 } 883 node->elms[i].leaf = *elm; 884 ++node->count; 885 hammer_cursor_inserted_element(cursor->node, i); 886 887 /* 888 * Update the leaf node's aggregate mirror_tid for mirroring 889 * support. 890 */ 891 if (node->mirror_tid < elm->base.delete_tid) { 892 node->mirror_tid = elm->base.delete_tid; 893 *doprop = 1; 894 } 895 if (node->mirror_tid < elm->base.create_tid) { 896 node->mirror_tid = elm->base.create_tid; 897 *doprop = 1; 898 } 899 hammer_modify_node_done(cursor->node); 900 901 /* 902 * Debugging sanity checks. 903 */ 904 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0); 905 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0); 906 if (i) { 907 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0); 908 } 909 if (i != node->count - 1) 910 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0); 911 912 return(0); 913 } 914 915 /* 916 * Delete a record from the B-Tree at the current cursor position. 917 * The cursor is positioned such that the current element is the one 918 * to be deleted. 919 * 920 * On return the cursor will be positioned after the deleted element and 921 * MAY point to an internal node. It will be suitable for the continuation 922 * of an iteration but not for an insertion or deletion. 923 * 924 * Deletions will attempt to partially rebalance the B-Tree in an upward 925 * direction, but will terminate rather then deadlock. Empty internal nodes 926 * are never allowed by a deletion which deadlocks may end up giving us an 927 * empty leaf. The pruner will clean up and rebalance the tree. 928 * 929 * This function can return EDEADLK, requiring the caller to retry the 930 * operation after clearing the deadlock. 931 */ 932 int 933 hammer_btree_delete(hammer_cursor_t cursor) 934 { 935 hammer_node_ondisk_t ondisk; 936 hammer_node_t node; 937 hammer_node_t parent __debugvar; 938 int error; 939 int i; 940 941 KKASSERT (cursor->trans->sync_lock_refs > 0); 942 if ((error = hammer_cursor_upgrade(cursor)) != 0) 943 return(error); 944 ++hammer_stats_btree_deletes; 945 946 /* 947 * Delete the element from the leaf node. 948 * 949 * Remember that leaf nodes do not have boundaries. 950 */ 951 node = cursor->node; 952 ondisk = node->ondisk; 953 i = cursor->index; 954 955 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); 956 KKASSERT(i >= 0 && i < ondisk->count); 957 hammer_modify_node_all(cursor->trans, node); 958 if (i + 1 != ondisk->count) { 959 bcopy(&ondisk->elms[i+1], &ondisk->elms[i], 960 (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); 961 } 962 --ondisk->count; 963 hammer_modify_node_done(node); 964 hammer_cursor_deleted_element(node, i); 965 966 /* 967 * Validate local parent 968 */ 969 if (ondisk->parent) { 970 parent = cursor->parent; 971 972 KKASSERT(parent != NULL); 973 KKASSERT(parent->node_offset == ondisk->parent); 974 } 975 976 /* 977 * If the leaf becomes empty it must be detached from the parent, 978 * potentially recursing through to the filesystem root. 979 * 980 * This may reposition the cursor at one of the parent's of the 981 * current node. 982 * 983 * Ignore deadlock errors, that simply means that btree_remove 984 * was unable to recurse and had to leave us with an empty leaf. 985 */ 986 KKASSERT(cursor->index <= ondisk->count); 987 if (ondisk->count == 0) { 988 error = btree_remove(cursor); 989 if (error == EDEADLK) 990 error = 0; 991 } else { 992 error = 0; 993 } 994 KKASSERT(cursor->parent == NULL || 995 cursor->parent_index < cursor->parent->ondisk->count); 996 return(error); 997 } 998 999 /* 1000 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE 1001 * 1002 * Search the filesystem B-Tree for cursor->key_beg, return the matching node. 1003 * 1004 * The search can begin ANYWHERE in the B-Tree. As a first step the search 1005 * iterates up the tree as necessary to properly position itself prior to 1006 * actually doing the sarch. 1007 * 1008 * INSERTIONS: The search will split full nodes and leaves on its way down 1009 * and guarentee that the leaf it ends up on is not full. If we run out 1010 * of space the search continues to the leaf, but ENOSPC is returned. 1011 * 1012 * The search is only guarenteed to end up on a leaf if an error code of 0 1013 * is returned, or if inserting and an error code of ENOENT is returned. 1014 * Otherwise it can stop at an internal node. On success a search returns 1015 * a leaf node. 1016 * 1017 * COMPLEXITY WARNING! This is the core B-Tree search code for the entire 1018 * filesystem, and it is not simple code. Please note the following facts: 1019 * 1020 * - Internal node recursions have a boundary on the left AND right. The 1021 * right boundary is non-inclusive. The create_tid is a generic part 1022 * of the key for internal nodes. 1023 * 1024 * - Leaf nodes contain terminal elements only now. 1025 * 1026 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a 1027 * historical search. ASOF and INSERT are mutually exclusive. When 1028 * doing an as-of lookup btree_search() checks for a right-edge boundary 1029 * case. If while recursing down the left-edge differs from the key 1030 * by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along 1031 * with cursor->create_check. This is used by btree_lookup() to iterate. 1032 * The iteration backwards because as-of searches can wind up going 1033 * down the wrong branch of the B-Tree. 1034 */ 1035 static 1036 int 1037 btree_search(hammer_cursor_t cursor, int flags) 1038 { 1039 hammer_node_ondisk_t node; 1040 hammer_btree_elm_t elm; 1041 int error; 1042 int enospc = 0; 1043 int i; 1044 int r; 1045 int s; 1046 1047 flags |= cursor->flags; 1048 ++hammer_stats_btree_searches; 1049 1050 if (hammer_debug_btree) { 1051 kprintf("SEARCH %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n", 1052 (long long)cursor->node->node_offset, 1053 cursor->index, 1054 (long long)cursor->key_beg.obj_id, 1055 cursor->key_beg.rec_type, 1056 (long long)cursor->key_beg.key, 1057 (long long)cursor->key_beg.create_tid, 1058 cursor->key_beg.localization, 1059 curthread 1060 ); 1061 if (cursor->parent) 1062 kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n", 1063 (long long)cursor->parent->node_offset, 1064 cursor->parent_index, 1065 (long long)cursor->left_bound->obj_id, 1066 (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id, 1067 (long long)cursor->right_bound->obj_id, 1068 (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id, 1069 cursor->left_bound, 1070 &cursor->parent->ondisk->elms[cursor->parent_index], 1071 cursor->right_bound, 1072 &cursor->parent->ondisk->elms[cursor->parent_index+1] 1073 ); 1074 } 1075 1076 /* 1077 * Move our cursor up the tree until we find a node whos range covers 1078 * the key we are trying to locate. 1079 * 1080 * The left bound is inclusive, the right bound is non-inclusive. 1081 * It is ok to cursor up too far. 1082 */ 1083 for (;;) { 1084 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound); 1085 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound); 1086 if (r >= 0 && s < 0) 1087 break; 1088 KKASSERT(cursor->parent); 1089 ++hammer_stats_btree_iterations; 1090 error = hammer_cursor_up(cursor); 1091 if (error) 1092 goto done; 1093 } 1094 1095 /* 1096 * The delete-checks below are based on node, not parent. Set the 1097 * initial delete-check based on the parent. 1098 */ 1099 if (r == 1) { 1100 KKASSERT(cursor->left_bound->create_tid != 1); 1101 cursor->create_check = cursor->left_bound->create_tid - 1; 1102 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1103 } 1104 1105 /* 1106 * We better have ended up with a node somewhere. 1107 */ 1108 KKASSERT(cursor->node != NULL); 1109 1110 /* 1111 * If we are inserting we can't start at a full node if the parent 1112 * is also full (because there is no way to split the node), 1113 * continue running up the tree until the requirement is satisfied 1114 * or we hit the root of the filesystem. 1115 * 1116 * (If inserting we aren't doing an as-of search so we don't have 1117 * to worry about create_check). 1118 */ 1119 while (flags & HAMMER_CURSOR_INSERT) { 1120 if (btree_node_is_full(cursor->node->ondisk) == 0) 1121 break; 1122 if (cursor->node->ondisk->parent == 0 || 1123 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) { 1124 break; 1125 } 1126 ++hammer_stats_btree_iterations; 1127 error = hammer_cursor_up(cursor); 1128 /* node may have become stale */ 1129 if (error) 1130 goto done; 1131 } 1132 1133 /* 1134 * Push down through internal nodes to locate the requested key. 1135 */ 1136 node = cursor->node->ondisk; 1137 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { 1138 /* 1139 * Scan the node to find the subtree index to push down into. 1140 * We go one-past, then back-up. 1141 * 1142 * We must proactively remove deleted elements which may 1143 * have been left over from a deadlocked btree_remove(). 1144 * 1145 * The left and right boundaries are included in the loop 1146 * in order to detect edge cases. 1147 * 1148 * If the separator only differs by create_tid (r == 1) 1149 * and we are doing an as-of search, we may end up going 1150 * down a branch to the left of the one containing the 1151 * desired key. This requires numerous special cases. 1152 */ 1153 ++hammer_stats_btree_iterations; 1154 if (hammer_debug_btree) { 1155 kprintf("SEARCH-I %016llx count=%d\n", 1156 (long long)cursor->node->node_offset, 1157 node->count); 1158 } 1159 1160 /* 1161 * Try to shortcut the search before dropping into the 1162 * linear loop. Locate the first node where r <= 1. 1163 */ 1164 i = hammer_btree_search_node(&cursor->key_beg, node); 1165 while (i <= node->count) { 1166 ++hammer_stats_btree_elements; 1167 elm = &node->elms[i]; 1168 r = hammer_btree_cmp(&cursor->key_beg, &elm->base); 1169 if (hammer_debug_btree > 2) { 1170 kprintf(" IELM %p %d r=%d\n", 1171 &node->elms[i], i, r); 1172 } 1173 if (r < 0) 1174 break; 1175 if (r == 1) { 1176 KKASSERT(elm->base.create_tid != 1); 1177 cursor->create_check = elm->base.create_tid - 1; 1178 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK; 1179 } 1180 ++i; 1181 } 1182 if (hammer_debug_btree) { 1183 kprintf("SEARCH-I preI=%d/%d r=%d\n", 1184 i, node->count, r); 1185 } 1186 1187 /* 1188 * These cases occur when the parent's idea of the boundary 1189 * is wider then the child's idea of the boundary, and 1190 * require special handling. If not inserting we can 1191 * terminate the search early for these cases but the 1192 * child's boundaries cannot be unconditionally modified. 1193 */ 1194 if (i == 0) { 1195 /* 1196 * If i == 0 the search terminated to the LEFT of the 1197 * left_boundary but to the RIGHT of the parent's left 1198 * boundary. 1199 */ 1200 u_int8_t save; 1201 1202 elm = &node->elms[0]; 1203 1204 /* 1205 * If we aren't inserting we can stop here. 1206 */ 1207 if ((flags & (HAMMER_CURSOR_INSERT | 1208 HAMMER_CURSOR_PRUNING)) == 0) { 1209 cursor->index = 0; 1210 return(ENOENT); 1211 } 1212 1213 /* 1214 * Correct a left-hand boundary mismatch. 1215 * 1216 * We can only do this if we can upgrade the lock, 1217 * and synchronized as a background cursor (i.e. 1218 * inserting or pruning). 1219 * 1220 * WARNING: We can only do this if inserting, i.e. 1221 * we are running on the backend. 1222 */ 1223 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1224 return(error); 1225 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1226 hammer_modify_node_field(cursor->trans, cursor->node, 1227 elms[0]); 1228 save = node->elms[0].base.btype; 1229 node->elms[0].base = *cursor->left_bound; 1230 node->elms[0].base.btype = save; 1231 hammer_modify_node_done(cursor->node); 1232 } else if (i == node->count + 1) { 1233 /* 1234 * If i == node->count + 1 the search terminated to 1235 * the RIGHT of the right boundary but to the LEFT 1236 * of the parent's right boundary. If we aren't 1237 * inserting we can stop here. 1238 * 1239 * Note that the last element in this case is 1240 * elms[i-2] prior to adjustments to 'i'. 1241 */ 1242 --i; 1243 if ((flags & (HAMMER_CURSOR_INSERT | 1244 HAMMER_CURSOR_PRUNING)) == 0) { 1245 cursor->index = i; 1246 return (ENOENT); 1247 } 1248 1249 /* 1250 * Correct a right-hand boundary mismatch. 1251 * (actual push-down record is i-2 prior to 1252 * adjustments to i). 1253 * 1254 * We can only do this if we can upgrade the lock, 1255 * and synchronized as a background cursor (i.e. 1256 * inserting or pruning). 1257 * 1258 * WARNING: We can only do this if inserting, i.e. 1259 * we are running on the backend. 1260 */ 1261 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1262 return(error); 1263 elm = &node->elms[i]; 1264 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 1265 hammer_modify_node(cursor->trans, cursor->node, 1266 &elm->base, sizeof(elm->base)); 1267 elm->base = *cursor->right_bound; 1268 hammer_modify_node_done(cursor->node); 1269 --i; 1270 } else { 1271 /* 1272 * The push-down index is now i - 1. If we had 1273 * terminated on the right boundary this will point 1274 * us at the last element. 1275 */ 1276 --i; 1277 } 1278 cursor->index = i; 1279 elm = &node->elms[i]; 1280 1281 if (hammer_debug_btree) { 1282 kprintf("RESULT-I %016llx[%d] %016llx %02x " 1283 "key=%016llx cre=%016llx lo=%02x\n", 1284 (long long)cursor->node->node_offset, 1285 i, 1286 (long long)elm->internal.base.obj_id, 1287 elm->internal.base.rec_type, 1288 (long long)elm->internal.base.key, 1289 (long long)elm->internal.base.create_tid, 1290 elm->internal.base.localization 1291 ); 1292 } 1293 1294 /* 1295 * We better have a valid subtree offset. 1296 */ 1297 KKASSERT(elm->internal.subtree_offset != 0); 1298 1299 /* 1300 * Handle insertion and deletion requirements. 1301 * 1302 * If inserting split full nodes. The split code will 1303 * adjust cursor->node and cursor->index if the current 1304 * index winds up in the new node. 1305 * 1306 * If inserting and a left or right edge case was detected, 1307 * we cannot correct the left or right boundary and must 1308 * prepend and append an empty leaf node in order to make 1309 * the boundary correction. 1310 * 1311 * If we run out of space we set enospc but continue on 1312 * to a leaf. 1313 */ 1314 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) { 1315 if (btree_node_is_full(node)) { 1316 error = btree_split_internal(cursor); 1317 if (error) { 1318 if (error != ENOSPC) 1319 goto done; 1320 enospc = 1; 1321 } 1322 /* 1323 * reload stale pointers 1324 */ 1325 i = cursor->index; 1326 node = cursor->node->ondisk; 1327 } 1328 } 1329 1330 /* 1331 * Push down (push into new node, existing node becomes 1332 * the parent) and continue the search. 1333 */ 1334 error = hammer_cursor_down(cursor); 1335 /* node may have become stale */ 1336 if (error) 1337 goto done; 1338 node = cursor->node->ondisk; 1339 } 1340 1341 /* 1342 * We are at a leaf, do a linear search of the key array. 1343 * 1344 * On success the index is set to the matching element and 0 1345 * is returned. 1346 * 1347 * On failure the index is set to the insertion point and ENOENT 1348 * is returned. 1349 * 1350 * Boundaries are not stored in leaf nodes, so the index can wind 1351 * up to the left of element 0 (index == 0) or past the end of 1352 * the array (index == node->count). It is also possible that the 1353 * leaf might be empty. 1354 */ 1355 ++hammer_stats_btree_iterations; 1356 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF); 1357 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); 1358 if (hammer_debug_btree) { 1359 kprintf("SEARCH-L %016llx count=%d\n", 1360 (long long)cursor->node->node_offset, 1361 node->count); 1362 } 1363 1364 /* 1365 * Try to shortcut the search before dropping into the 1366 * linear loop. Locate the first node where r <= 1. 1367 */ 1368 i = hammer_btree_search_node(&cursor->key_beg, node); 1369 while (i < node->count) { 1370 ++hammer_stats_btree_elements; 1371 elm = &node->elms[i]; 1372 1373 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base); 1374 1375 if (hammer_debug_btree > 1) 1376 kprintf(" ELM %p %d r=%d\n", &node->elms[i], i, r); 1377 1378 /* 1379 * We are at a record element. Stop if we've flipped past 1380 * key_beg, not counting the create_tid test. Allow the 1381 * r == 1 case (key_beg > element but differs only by its 1382 * create_tid) to fall through to the AS-OF check. 1383 */ 1384 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); 1385 1386 if (r < 0) 1387 goto failed; 1388 if (r > 1) { 1389 ++i; 1390 continue; 1391 } 1392 1393 /* 1394 * Check our as-of timestamp against the element. 1395 */ 1396 if (flags & HAMMER_CURSOR_ASOF) { 1397 if (hammer_btree_chkts(cursor->asof, 1398 &node->elms[i].base) != 0) { 1399 ++i; 1400 continue; 1401 } 1402 /* success */ 1403 } else { 1404 if (r > 0) { /* can only be +1 */ 1405 ++i; 1406 continue; 1407 } 1408 /* success */ 1409 } 1410 cursor->index = i; 1411 error = 0; 1412 if (hammer_debug_btree) { 1413 kprintf("RESULT-L %016llx[%d] (SUCCESS)\n", 1414 (long long)cursor->node->node_offset, i); 1415 } 1416 goto done; 1417 } 1418 1419 /* 1420 * The search of the leaf node failed. i is the insertion point. 1421 */ 1422 failed: 1423 if (hammer_debug_btree) { 1424 kprintf("RESULT-L %016llx[%d] (FAILED)\n", 1425 (long long)cursor->node->node_offset, i); 1426 } 1427 1428 /* 1429 * No exact match was found, i is now at the insertion point. 1430 * 1431 * If inserting split a full leaf before returning. This 1432 * may have the side effect of adjusting cursor->node and 1433 * cursor->index. 1434 */ 1435 cursor->index = i; 1436 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 && 1437 btree_node_is_full(node)) { 1438 error = btree_split_leaf(cursor); 1439 if (error) { 1440 if (error != ENOSPC) 1441 goto done; 1442 enospc = 1; 1443 } 1444 /* 1445 * reload stale pointers 1446 */ 1447 /* NOT USED 1448 i = cursor->index; 1449 node = &cursor->node->internal; 1450 */ 1451 } 1452 1453 /* 1454 * We reached a leaf but did not find the key we were looking for. 1455 * If this is an insert we will be properly positioned for an insert 1456 * (ENOENT) or unable to insert (ENOSPC). 1457 */ 1458 error = enospc ? ENOSPC : ENOENT; 1459 done: 1460 return(error); 1461 } 1462 1463 /* 1464 * Heuristical search for the first element whos comparison is <= 1. May 1465 * return an index whos compare result is > 1 but may only return an index 1466 * whos compare result is <= 1 if it is the first element with that result. 1467 */ 1468 int 1469 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node) 1470 { 1471 int b; 1472 int s; 1473 int i; 1474 int r; 1475 1476 /* 1477 * Don't bother if the node does not have very many elements 1478 */ 1479 b = 0; 1480 s = node->count; 1481 while (s - b > 4) { 1482 i = b + (s - b) / 2; 1483 ++hammer_stats_btree_elements; 1484 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base); 1485 if (r <= 1) { 1486 s = i; 1487 } else { 1488 b = i; 1489 } 1490 } 1491 return(b); 1492 } 1493 1494 1495 /************************************************************************ 1496 * SPLITTING AND MERGING * 1497 ************************************************************************ 1498 * 1499 * These routines do all the dirty work required to split and merge nodes. 1500 */ 1501 1502 /* 1503 * Split an internal node into two nodes and move the separator at the split 1504 * point to the parent. 1505 * 1506 * (cursor->node, cursor->index) indicates the element the caller intends 1507 * to push into. We will adjust node and index if that element winds 1508 * up in the split node. 1509 * 1510 * If we are at the root of the filesystem a new root must be created with 1511 * two elements, one pointing to the original root and one pointing to the 1512 * newly allocated split node. 1513 */ 1514 static 1515 int 1516 btree_split_internal(hammer_cursor_t cursor) 1517 { 1518 hammer_node_ondisk_t ondisk; 1519 hammer_node_t node; 1520 hammer_node_t parent; 1521 hammer_node_t new_node; 1522 hammer_btree_elm_t elm; 1523 hammer_btree_elm_t parent_elm; 1524 struct hammer_node_lock lockroot; 1525 hammer_mount_t hmp = cursor->trans->hmp; 1526 int parent_index; 1527 int made_root; 1528 int split; 1529 int error; 1530 int i; 1531 const int esize = sizeof(*elm); 1532 1533 hammer_node_lock_init(&lockroot, cursor->node); 1534 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL); 1535 if (error) 1536 goto done; 1537 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1538 goto done; 1539 ++hammer_stats_btree_splits; 1540 1541 /* 1542 * Calculate the split point. If the insertion point is at the 1543 * end of the leaf we adjust the split point significantly to the 1544 * right to try to optimize node fill and flag it. If we hit 1545 * that same leaf again our heuristic failed and we don't try 1546 * to optimize node fill (it could lead to a degenerate case). 1547 */ 1548 node = cursor->node; 1549 ondisk = node->ondisk; 1550 KKASSERT(ondisk->count > 4); 1551 if (cursor->index == ondisk->count && 1552 (node->flags & HAMMER_NODE_NONLINEAR) == 0) { 1553 split = (ondisk->count + 1) * 3 / 4; 1554 node->flags |= HAMMER_NODE_NONLINEAR; 1555 } else { 1556 /* 1557 * We are splitting but elms[split] will be promoted to 1558 * the parent, leaving the right hand node with one less 1559 * element. If the insertion point will be on the 1560 * left-hand side adjust the split point to give the 1561 * right hand side one additional node. 1562 */ 1563 split = (ondisk->count + 1) / 2; 1564 if (cursor->index <= split) 1565 --split; 1566 } 1567 1568 /* 1569 * If we are at the root of the filesystem, create a new root node 1570 * with 1 element and split normally. Avoid making major 1571 * modifications until we know the whole operation will work. 1572 */ 1573 if (ondisk->parent == 0) { 1574 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1575 if (parent == NULL) 1576 goto done; 1577 hammer_lock_ex(&parent->lock); 1578 hammer_modify_node_noundo(cursor->trans, parent); 1579 ondisk = parent->ondisk; 1580 ondisk->count = 1; 1581 ondisk->parent = 0; 1582 ondisk->mirror_tid = node->ondisk->mirror_tid; 1583 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1584 ondisk->elms[0].base = hmp->root_btree_beg; 1585 ondisk->elms[0].base.btype = node->ondisk->type; 1586 ondisk->elms[0].internal.subtree_offset = node->node_offset; 1587 ondisk->elms[1].base = hmp->root_btree_end; 1588 hammer_modify_node_done(parent); 1589 /* ondisk->elms[1].base.btype - not used */ 1590 made_root = 1; 1591 parent_index = 0; /* index of current node in parent */ 1592 } else { 1593 made_root = 0; 1594 parent = cursor->parent; 1595 parent_index = cursor->parent_index; 1596 } 1597 1598 /* 1599 * Split node into new_node at the split point. 1600 * 1601 * B O O O P N N B <-- P = node->elms[split] (index 4) 1602 * 0 1 2 3 4 5 6 <-- subtree indices 1603 * 1604 * x x P x x 1605 * s S S s 1606 * / \ 1607 * B O O O B B N N B <--- inner boundary points are 'P' 1608 * 0 1 2 3 4 5 6 1609 */ 1610 new_node = hammer_alloc_btree(cursor->trans, 0, &error); 1611 if (new_node == NULL) { 1612 if (made_root) { 1613 hammer_unlock(&parent->lock); 1614 hammer_delete_node(cursor->trans, parent); 1615 hammer_rel_node(parent); 1616 } 1617 goto done; 1618 } 1619 hammer_lock_ex(&new_node->lock); 1620 1621 /* 1622 * Create the new node. P becomes the left-hand boundary in the 1623 * new node. Copy the right-hand boundary as well. 1624 * 1625 * elm is the new separator. 1626 */ 1627 hammer_modify_node_noundo(cursor->trans, new_node); 1628 hammer_modify_node_all(cursor->trans, node); 1629 ondisk = node->ondisk; 1630 elm = &ondisk->elms[split]; 1631 bcopy(elm, &new_node->ondisk->elms[0], 1632 (ondisk->count - split + 1) * esize); 1633 new_node->ondisk->count = ondisk->count - split; 1634 new_node->ondisk->parent = parent->node_offset; 1635 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1636 new_node->ondisk->mirror_tid = ondisk->mirror_tid; 1637 KKASSERT(ondisk->type == new_node->ondisk->type); 1638 hammer_cursor_split_node(node, new_node, split); 1639 1640 /* 1641 * Cleanup the original node. Elm (P) becomes the new boundary, 1642 * its subtree_offset was moved to the new node. If we had created 1643 * a new root its parent pointer may have changed. 1644 */ 1645 elm->internal.subtree_offset = 0; 1646 ondisk->count = split; 1647 1648 /* 1649 * Insert the separator into the parent, fixup the parent's 1650 * reference to the original node, and reference the new node. 1651 * The separator is P. 1652 * 1653 * Remember that base.count does not include the right-hand boundary. 1654 */ 1655 hammer_modify_node_all(cursor->trans, parent); 1656 ondisk = parent->ondisk; 1657 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1658 parent_elm = &ondisk->elms[parent_index+1]; 1659 bcopy(parent_elm, parent_elm + 1, 1660 (ondisk->count - parent_index) * esize); 1661 parent_elm->internal.base = elm->base; /* separator P */ 1662 parent_elm->internal.base.btype = new_node->ondisk->type; 1663 parent_elm->internal.subtree_offset = new_node->node_offset; 1664 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid; 1665 ++ondisk->count; 1666 hammer_modify_node_done(parent); 1667 hammer_cursor_inserted_element(parent, parent_index + 1); 1668 1669 /* 1670 * The children of new_node need their parent pointer set to new_node. 1671 * The children have already been locked by 1672 * hammer_btree_lock_children(). 1673 */ 1674 for (i = 0; i < new_node->ondisk->count; ++i) { 1675 elm = &new_node->ondisk->elms[i]; 1676 error = btree_set_parent(cursor->trans, new_node, elm); 1677 if (error) { 1678 panic("btree_split_internal: btree-fixup problem"); 1679 } 1680 } 1681 hammer_modify_node_done(new_node); 1682 1683 /* 1684 * The filesystem's root B-Tree pointer may have to be updated. 1685 */ 1686 if (made_root) { 1687 hammer_volume_t volume; 1688 1689 volume = hammer_get_root_volume(hmp, &error); 1690 KKASSERT(error == 0); 1691 1692 hammer_modify_volume_field(cursor->trans, volume, 1693 vol0_btree_root); 1694 volume->ondisk->vol0_btree_root = parent->node_offset; 1695 hammer_modify_volume_done(volume); 1696 node->ondisk->parent = parent->node_offset; 1697 if (cursor->parent) { 1698 hammer_unlock(&cursor->parent->lock); 1699 hammer_rel_node(cursor->parent); 1700 } 1701 cursor->parent = parent; /* lock'd and ref'd */ 1702 hammer_rel_volume(volume, 0); 1703 } 1704 hammer_modify_node_done(node); 1705 1706 /* 1707 * Ok, now adjust the cursor depending on which element the original 1708 * index was pointing at. If we are >= the split point the push node 1709 * is now in the new node. 1710 * 1711 * NOTE: If we are at the split point itself we cannot stay with the 1712 * original node because the push index will point at the right-hand 1713 * boundary, which is illegal. 1714 * 1715 * NOTE: The cursor's parent or parent_index must be adjusted for 1716 * the case where a new parent (new root) was created, and the case 1717 * where the cursor is now pointing at the split node. 1718 */ 1719 if (cursor->index >= split) { 1720 cursor->parent_index = parent_index + 1; 1721 cursor->index -= split; 1722 hammer_unlock(&cursor->node->lock); 1723 hammer_rel_node(cursor->node); 1724 cursor->node = new_node; /* locked and ref'd */ 1725 } else { 1726 cursor->parent_index = parent_index; 1727 hammer_unlock(&new_node->lock); 1728 hammer_rel_node(new_node); 1729 } 1730 1731 /* 1732 * Fixup left and right bounds 1733 */ 1734 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1735 cursor->left_bound = &parent_elm[0].internal.base; 1736 cursor->right_bound = &parent_elm[1].internal.base; 1737 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1738 &cursor->node->ondisk->elms[0].internal.base) <= 0); 1739 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1740 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); 1741 1742 done: 1743 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL); 1744 hammer_cursor_downgrade(cursor); 1745 return (error); 1746 } 1747 1748 /* 1749 * Same as the above, but splits a full leaf node. 1750 * 1751 * This function 1752 */ 1753 static 1754 int 1755 btree_split_leaf(hammer_cursor_t cursor) 1756 { 1757 hammer_node_ondisk_t ondisk; 1758 hammer_node_t parent; 1759 hammer_node_t leaf; 1760 hammer_mount_t hmp; 1761 hammer_node_t new_leaf; 1762 hammer_btree_elm_t elm; 1763 hammer_btree_elm_t parent_elm; 1764 hammer_base_elm_t mid_boundary; 1765 int parent_index; 1766 int made_root; 1767 int split; 1768 int error; 1769 const size_t esize = sizeof(*elm); 1770 1771 if ((error = hammer_cursor_upgrade(cursor)) != 0) 1772 return(error); 1773 ++hammer_stats_btree_splits; 1774 1775 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1776 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1777 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1778 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1779 1780 /* 1781 * Calculate the split point. If the insertion point is at the 1782 * end of the leaf we adjust the split point significantly to the 1783 * right to try to optimize node fill and flag it. If we hit 1784 * that same leaf again our heuristic failed and we don't try 1785 * to optimize node fill (it could lead to a degenerate case). 1786 */ 1787 leaf = cursor->node; 1788 ondisk = leaf->ondisk; 1789 KKASSERT(ondisk->count > 4); 1790 if (cursor->index == ondisk->count && 1791 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) { 1792 split = (ondisk->count + 1) * 3 / 4; 1793 leaf->flags |= HAMMER_NODE_NONLINEAR; 1794 } else { 1795 split = (ondisk->count + 1) / 2; 1796 } 1797 1798 #if 0 1799 /* 1800 * If the insertion point is at the split point shift the 1801 * split point left so we don't have to worry about 1802 */ 1803 if (cursor->index == split) 1804 --split; 1805 #endif 1806 KKASSERT(split > 0 && split < ondisk->count); 1807 1808 error = 0; 1809 hmp = leaf->hmp; 1810 1811 elm = &ondisk->elms[split]; 1812 1813 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0); 1814 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0); 1815 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0); 1816 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0); 1817 1818 /* 1819 * If we are at the root of the tree, create a new root node with 1820 * 1 element and split normally. Avoid making major modifications 1821 * until we know the whole operation will work. 1822 */ 1823 if (ondisk->parent == 0) { 1824 parent = hammer_alloc_btree(cursor->trans, 0, &error); 1825 if (parent == NULL) 1826 goto done; 1827 hammer_lock_ex(&parent->lock); 1828 hammer_modify_node_noundo(cursor->trans, parent); 1829 ondisk = parent->ondisk; 1830 ondisk->count = 1; 1831 ondisk->parent = 0; 1832 ondisk->mirror_tid = leaf->ondisk->mirror_tid; 1833 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; 1834 ondisk->elms[0].base = hmp->root_btree_beg; 1835 ondisk->elms[0].base.btype = leaf->ondisk->type; 1836 ondisk->elms[0].internal.subtree_offset = leaf->node_offset; 1837 ondisk->elms[1].base = hmp->root_btree_end; 1838 /* ondisk->elms[1].base.btype = not used */ 1839 hammer_modify_node_done(parent); 1840 made_root = 1; 1841 parent_index = 0; /* insertion point in parent */ 1842 } else { 1843 made_root = 0; 1844 parent = cursor->parent; 1845 parent_index = cursor->parent_index; 1846 } 1847 1848 /* 1849 * Split leaf into new_leaf at the split point. Select a separator 1850 * value in-between the two leafs but with a bent towards the right 1851 * leaf since comparisons use an 'elm >= separator' inequality. 1852 * 1853 * L L L L L L L L 1854 * 1855 * x x P x x 1856 * s S S s 1857 * / \ 1858 * L L L L L L L L 1859 */ 1860 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error); 1861 if (new_leaf == NULL) { 1862 if (made_root) { 1863 hammer_unlock(&parent->lock); 1864 hammer_delete_node(cursor->trans, parent); 1865 hammer_rel_node(parent); 1866 } 1867 goto done; 1868 } 1869 hammer_lock_ex(&new_leaf->lock); 1870 1871 /* 1872 * Create the new node and copy the leaf elements from the split 1873 * point on to the new node. 1874 */ 1875 hammer_modify_node_all(cursor->trans, leaf); 1876 hammer_modify_node_noundo(cursor->trans, new_leaf); 1877 ondisk = leaf->ondisk; 1878 elm = &ondisk->elms[split]; 1879 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1880 new_leaf->ondisk->count = ondisk->count - split; 1881 new_leaf->ondisk->parent = parent->node_offset; 1882 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1883 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1884 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1885 hammer_modify_node_done(new_leaf); 1886 hammer_cursor_split_node(leaf, new_leaf, split); 1887 1888 /* 1889 * Cleanup the original node. Because this is a leaf node and 1890 * leaf nodes do not have a right-hand boundary, there 1891 * aren't any special edge cases to clean up. We just fixup the 1892 * count. 1893 */ 1894 ondisk->count = split; 1895 1896 /* 1897 * Insert the separator into the parent, fixup the parent's 1898 * reference to the original node, and reference the new node. 1899 * The separator is P. 1900 * 1901 * Remember that base.count does not include the right-hand boundary. 1902 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1903 */ 1904 hammer_modify_node_all(cursor->trans, parent); 1905 ondisk = parent->ondisk; 1906 KKASSERT(split != 0); 1907 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1908 parent_elm = &ondisk->elms[parent_index+1]; 1909 bcopy(parent_elm, parent_elm + 1, 1910 (ondisk->count - parent_index) * esize); 1911 1912 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1913 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1914 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1915 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1916 mid_boundary = &parent_elm->base; 1917 ++ondisk->count; 1918 hammer_modify_node_done(parent); 1919 hammer_cursor_inserted_element(parent, parent_index + 1); 1920 1921 /* 1922 * The filesystem's root B-Tree pointer may have to be updated. 1923 */ 1924 if (made_root) { 1925 hammer_volume_t volume; 1926 1927 volume = hammer_get_root_volume(hmp, &error); 1928 KKASSERT(error == 0); 1929 1930 hammer_modify_volume_field(cursor->trans, volume, 1931 vol0_btree_root); 1932 volume->ondisk->vol0_btree_root = parent->node_offset; 1933 hammer_modify_volume_done(volume); 1934 leaf->ondisk->parent = parent->node_offset; 1935 if (cursor->parent) { 1936 hammer_unlock(&cursor->parent->lock); 1937 hammer_rel_node(cursor->parent); 1938 } 1939 cursor->parent = parent; /* lock'd and ref'd */ 1940 hammer_rel_volume(volume, 0); 1941 } 1942 hammer_modify_node_done(leaf); 1943 1944 /* 1945 * Ok, now adjust the cursor depending on which element the original 1946 * index was pointing at. If we are >= the split point the push node 1947 * is now in the new node. 1948 * 1949 * NOTE: If we are at the split point itself we need to select the 1950 * old or new node based on where key_beg's insertion point will be. 1951 * If we pick the wrong side the inserted element will wind up in 1952 * the wrong leaf node and outside that node's bounds. 1953 */ 1954 if (cursor->index > split || 1955 (cursor->index == split && 1956 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1957 cursor->parent_index = parent_index + 1; 1958 cursor->index -= split; 1959 hammer_unlock(&cursor->node->lock); 1960 hammer_rel_node(cursor->node); 1961 cursor->node = new_leaf; 1962 } else { 1963 cursor->parent_index = parent_index; 1964 hammer_unlock(&new_leaf->lock); 1965 hammer_rel_node(new_leaf); 1966 } 1967 1968 /* 1969 * Fixup left and right bounds 1970 */ 1971 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1972 cursor->left_bound = &parent_elm[0].internal.base; 1973 cursor->right_bound = &parent_elm[1].internal.base; 1974 1975 /* 1976 * Assert that the bounds are correct. 1977 */ 1978 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1979 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1980 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1981 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1982 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1983 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 1984 1985 done: 1986 hammer_cursor_downgrade(cursor); 1987 return (error); 1988 } 1989 1990 #if 0 1991 1992 /* 1993 * Recursively correct the right-hand boundary's create_tid to (tid) as 1994 * long as the rest of the key matches. We have to recurse upward in 1995 * the tree as well as down the left side of each parent's right node. 1996 * 1997 * Return EDEADLK if we were only partially successful, forcing the caller 1998 * to try again. The original cursor is not modified. This routine can 1999 * also fail with EDEADLK if it is forced to throw away a portion of its 2000 * record history. 2001 * 2002 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2003 */ 2004 struct hammer_rhb { 2005 TAILQ_ENTRY(hammer_rhb) entry; 2006 hammer_node_t node; 2007 int index; 2008 }; 2009 2010 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2011 2012 int 2013 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2014 { 2015 struct hammer_mount *hmp; 2016 struct hammer_rhb_list rhb_list; 2017 hammer_base_elm_t elm; 2018 hammer_node_t orig_node; 2019 struct hammer_rhb *rhb; 2020 int orig_index; 2021 int error; 2022 2023 TAILQ_INIT(&rhb_list); 2024 hmp = cursor->trans->hmp; 2025 2026 /* 2027 * Save our position so we can restore it on return. This also 2028 * gives us a stable 'elm'. 2029 */ 2030 orig_node = cursor->node; 2031 hammer_ref_node(orig_node); 2032 hammer_lock_sh(&orig_node->lock); 2033 orig_index = cursor->index; 2034 elm = &orig_node->ondisk->elms[orig_index].base; 2035 2036 /* 2037 * Now build a list of parents going up, allocating a rhb 2038 * structure for each one. 2039 */ 2040 while (cursor->parent) { 2041 /* 2042 * Stop if we no longer have any right-bounds to fix up 2043 */ 2044 if (elm->obj_id != cursor->right_bound->obj_id || 2045 elm->rec_type != cursor->right_bound->rec_type || 2046 elm->key != cursor->right_bound->key) { 2047 break; 2048 } 2049 2050 /* 2051 * Stop if the right-hand bound's create_tid does not 2052 * need to be corrected. 2053 */ 2054 if (cursor->right_bound->create_tid >= tid) 2055 break; 2056 2057 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2058 rhb->node = cursor->parent; 2059 rhb->index = cursor->parent_index; 2060 hammer_ref_node(rhb->node); 2061 hammer_lock_sh(&rhb->node->lock); 2062 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2063 2064 hammer_cursor_up(cursor); 2065 } 2066 2067 /* 2068 * now safely adjust the right hand bound for each rhb. This may 2069 * also require taking the right side of the tree and iterating down 2070 * ITS left side. 2071 */ 2072 error = 0; 2073 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2074 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2075 if (error) 2076 break; 2077 TAILQ_REMOVE(&rhb_list, rhb, entry); 2078 hammer_unlock(&rhb->node->lock); 2079 hammer_rel_node(rhb->node); 2080 kfree(rhb, hmp->m_misc); 2081 2082 switch (cursor->node->ondisk->type) { 2083 case HAMMER_BTREE_TYPE_INTERNAL: 2084 /* 2085 * Right-boundary for parent at internal node 2086 * is one element to the right of the element whos 2087 * right boundary needs adjusting. We must then 2088 * traverse down the left side correcting any left 2089 * bounds (which may now be too far to the left). 2090 */ 2091 ++cursor->index; 2092 error = hammer_btree_correct_lhb(cursor, tid); 2093 break; 2094 default: 2095 panic("hammer_btree_correct_rhb(): Bad node type"); 2096 error = EINVAL; 2097 break; 2098 } 2099 } 2100 2101 /* 2102 * Cleanup 2103 */ 2104 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2105 TAILQ_REMOVE(&rhb_list, rhb, entry); 2106 hammer_unlock(&rhb->node->lock); 2107 hammer_rel_node(rhb->node); 2108 kfree(rhb, hmp->m_misc); 2109 } 2110 error = hammer_cursor_seek(cursor, orig_node, orig_index); 2111 hammer_unlock(&orig_node->lock); 2112 hammer_rel_node(orig_node); 2113 return (error); 2114 } 2115 2116 /* 2117 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand 2118 * bound going downward starting at the current cursor position. 2119 * 2120 * This function does not restore the cursor after use. 2121 */ 2122 int 2123 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid) 2124 { 2125 struct hammer_rhb_list rhb_list; 2126 hammer_base_elm_t elm; 2127 hammer_base_elm_t cmp; 2128 struct hammer_rhb *rhb; 2129 struct hammer_mount *hmp; 2130 int error; 2131 2132 TAILQ_INIT(&rhb_list); 2133 hmp = cursor->trans->hmp; 2134 2135 cmp = &cursor->node->ondisk->elms[cursor->index].base; 2136 2137 /* 2138 * Record the node and traverse down the left-hand side for all 2139 * matching records needing a boundary correction. 2140 */ 2141 error = 0; 2142 for (;;) { 2143 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO); 2144 rhb->node = cursor->node; 2145 rhb->index = cursor->index; 2146 hammer_ref_node(rhb->node); 2147 hammer_lock_sh(&rhb->node->lock); 2148 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry); 2149 2150 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2151 /* 2152 * Nothing to traverse down if we are at the right 2153 * boundary of an internal node. 2154 */ 2155 if (cursor->index == cursor->node->ondisk->count) 2156 break; 2157 } else { 2158 elm = &cursor->node->ondisk->elms[cursor->index].base; 2159 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2160 break; 2161 panic("Illegal leaf record type %02x", elm->btype); 2162 } 2163 error = hammer_cursor_down(cursor); 2164 if (error) 2165 break; 2166 2167 elm = &cursor->node->ondisk->elms[cursor->index].base; 2168 if (elm->obj_id != cmp->obj_id || 2169 elm->rec_type != cmp->rec_type || 2170 elm->key != cmp->key) { 2171 break; 2172 } 2173 if (elm->create_tid >= tid) 2174 break; 2175 2176 } 2177 2178 /* 2179 * Now we can safely adjust the left-hand boundary from the bottom-up. 2180 * The last element we remove from the list is the caller's right hand 2181 * boundary, which must also be adjusted. 2182 */ 2183 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2184 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2185 if (error) 2186 break; 2187 TAILQ_REMOVE(&rhb_list, rhb, entry); 2188 hammer_unlock(&rhb->node->lock); 2189 hammer_rel_node(rhb->node); 2190 kfree(rhb, hmp->m_misc); 2191 2192 elm = &cursor->node->ondisk->elms[cursor->index].base; 2193 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2194 hammer_modify_node(cursor->trans, cursor->node, 2195 &elm->create_tid, 2196 sizeof(elm->create_tid)); 2197 elm->create_tid = tid; 2198 hammer_modify_node_done(cursor->node); 2199 } else { 2200 panic("hammer_btree_correct_lhb(): Bad element type"); 2201 } 2202 } 2203 2204 /* 2205 * Cleanup 2206 */ 2207 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2208 TAILQ_REMOVE(&rhb_list, rhb, entry); 2209 hammer_unlock(&rhb->node->lock); 2210 hammer_rel_node(rhb->node); 2211 kfree(rhb, hmp->m_misc); 2212 } 2213 return (error); 2214 } 2215 2216 #endif 2217 2218 /* 2219 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2220 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2221 * the operation due to a deadlock, or some other error. 2222 * 2223 * This routine is initially called with an empty leaf and may be 2224 * recursively called with single-element internal nodes. 2225 * 2226 * It should also be noted that when removing empty leaves we must be sure 2227 * to test and update mirror_tid because another thread may have deadlocked 2228 * against us (or someone) trying to propagate it up and cannot retry once 2229 * the node has been deleted. 2230 * 2231 * On return the cursor may end up pointing to an internal node, suitable 2232 * for further iteration but not for an immediate insertion or deletion. 2233 */ 2234 static int 2235 btree_remove(hammer_cursor_t cursor) 2236 { 2237 hammer_node_ondisk_t ondisk; 2238 hammer_btree_elm_t elm; 2239 hammer_node_t node; 2240 hammer_node_t parent; 2241 const int esize = sizeof(*elm); 2242 int error; 2243 2244 node = cursor->node; 2245 2246 /* 2247 * When deleting the root of the filesystem convert it to 2248 * an empty leaf node. Internal nodes cannot be empty. 2249 */ 2250 ondisk = node->ondisk; 2251 if (ondisk->parent == 0) { 2252 KKASSERT(cursor->parent == NULL); 2253 hammer_modify_node_all(cursor->trans, node); 2254 KKASSERT(ondisk == node->ondisk); 2255 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2256 ondisk->count = 0; 2257 hammer_modify_node_done(node); 2258 cursor->index = 0; 2259 return(0); 2260 } 2261 2262 parent = cursor->parent; 2263 2264 /* 2265 * Attempt to remove the parent's reference to the child. If the 2266 * parent would become empty we have to recurse. If we fail we 2267 * leave the parent pointing to an empty leaf node. 2268 * 2269 * We have to recurse successfully before we can delete the internal 2270 * node as it is illegal to have empty internal nodes. Even though 2271 * the operation may be aborted we must still fixup any unlocked 2272 * cursors as if we had deleted the element prior to recursing 2273 * (by calling hammer_cursor_deleted_element()) so those cursors 2274 * are properly forced up the chain by the recursion. 2275 */ 2276 if (parent->ondisk->count == 1) { 2277 /* 2278 * This special cursor_up_locked() call leaves the original 2279 * node exclusively locked and referenced, leaves the 2280 * original parent locked (as the new node), and locks the 2281 * new parent. It can return EDEADLK. 2282 * 2283 * We cannot call hammer_cursor_removed_node() until we are 2284 * actually able to remove the node. If we did then tracked 2285 * cursors in the middle of iterations could be repointed 2286 * to a parent node. If this occurs they could end up 2287 * scanning newly inserted records into the node (that could 2288 * not be deleted) when they push down again. 2289 * 2290 * Due to the way the recursion works the final parent is left 2291 * in cursor->parent after the recursion returns. Each 2292 * layer on the way back up is thus able to call 2293 * hammer_cursor_removed_node() and 'jump' the node up to 2294 * the (same) final parent. 2295 * 2296 * NOTE! The local variable 'parent' is invalid after we 2297 * call hammer_cursor_up_locked(). 2298 */ 2299 error = hammer_cursor_up_locked(cursor); 2300 parent = NULL; 2301 2302 if (error == 0) { 2303 hammer_cursor_deleted_element(cursor->node, 0); 2304 error = btree_remove(cursor); 2305 if (error == 0) { 2306 KKASSERT(node != cursor->node); 2307 hammer_cursor_removed_node( 2308 node, cursor->node, 2309 cursor->index); 2310 hammer_modify_node_all(cursor->trans, node); 2311 ondisk = node->ondisk; 2312 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2313 ondisk->count = 0; 2314 hammer_modify_node_done(node); 2315 hammer_flush_node(node, 0); 2316 hammer_delete_node(cursor->trans, node); 2317 } else { 2318 /* 2319 * Defer parent removal because we could not 2320 * get the lock, just let the leaf remain 2321 * empty. 2322 */ 2323 /**/ 2324 } 2325 hammer_unlock(&node->lock); 2326 hammer_rel_node(node); 2327 } else { 2328 /* 2329 * Defer parent removal because we could not 2330 * get the lock, just let the leaf remain 2331 * empty. 2332 */ 2333 /**/ 2334 } 2335 } else { 2336 KKASSERT(parent->ondisk->count > 1); 2337 2338 hammer_modify_node_all(cursor->trans, parent); 2339 ondisk = parent->ondisk; 2340 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2341 2342 elm = &ondisk->elms[cursor->parent_index]; 2343 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2344 KKASSERT(ondisk->count > 0); 2345 2346 /* 2347 * We must retain the highest mirror_tid. The deleted 2348 * range is now encompassed by the element to the left. 2349 * If we are already at the left edge the new left edge 2350 * inherits mirror_tid. 2351 * 2352 * Note that bounds of the parent to our parent may create 2353 * a gap to the left of our left-most node or to the right 2354 * of our right-most node. The gap is silently included 2355 * in the mirror_tid's area of effect from the point of view 2356 * of the scan. 2357 */ 2358 if (cursor->parent_index) { 2359 if (elm[-1].internal.mirror_tid < 2360 elm[0].internal.mirror_tid) { 2361 elm[-1].internal.mirror_tid = 2362 elm[0].internal.mirror_tid; 2363 } 2364 } else { 2365 if (elm[1].internal.mirror_tid < 2366 elm[0].internal.mirror_tid) { 2367 elm[1].internal.mirror_tid = 2368 elm[0].internal.mirror_tid; 2369 } 2370 } 2371 2372 /* 2373 * Delete the subtree reference in the parent. Include 2374 * boundary element at end. 2375 */ 2376 bcopy(&elm[1], &elm[0], 2377 (ondisk->count - cursor->parent_index) * esize); 2378 --ondisk->count; 2379 hammer_modify_node_done(parent); 2380 hammer_cursor_removed_node(node, parent, cursor->parent_index); 2381 hammer_cursor_deleted_element(parent, cursor->parent_index); 2382 hammer_flush_node(node, 0); 2383 hammer_delete_node(cursor->trans, node); 2384 2385 /* 2386 * cursor->node is invalid, cursor up to make the cursor 2387 * valid again. We have to flag the condition in case 2388 * another thread wiggles an insertion in during an 2389 * iteration. 2390 */ 2391 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 2392 error = hammer_cursor_up(cursor); 2393 } 2394 return (error); 2395 } 2396 2397 /* 2398 * Propagate cursor->trans->tid up the B-Tree starting at the current 2399 * cursor position using pseudofs info gleaned from the passed inode. 2400 * 2401 * The passed inode has no relationship to the cursor position other 2402 * then being in the same pseudofs as the insertion or deletion we 2403 * are propagating the mirror_tid for. 2404 * 2405 * WARNING! Because we push and pop the passed cursor, it may be 2406 * modified by other B-Tree operations while it is unlocked 2407 * and things like the node & leaf pointers, and indexes might 2408 * change. 2409 */ 2410 void 2411 hammer_btree_do_propagation(hammer_cursor_t cursor, 2412 hammer_pseudofs_inmem_t pfsm, 2413 hammer_btree_leaf_elm_t leaf) 2414 { 2415 hammer_cursor_t ncursor; 2416 hammer_tid_t mirror_tid; 2417 int error __debugvar; 2418 2419 /* 2420 * We do not propagate a mirror_tid if the filesystem was mounted 2421 * in no-mirror mode. 2422 */ 2423 if (cursor->trans->hmp->master_id < 0) 2424 return; 2425 2426 /* 2427 * This is a bit of a hack because we cannot deadlock or return 2428 * EDEADLK here. The related operation has already completed and 2429 * we must propagate the mirror_tid now regardless. 2430 * 2431 * Generate a new cursor which inherits the original's locks and 2432 * unlock the original. Use the new cursor to propagate the 2433 * mirror_tid. Then clean up the new cursor and reacquire locks 2434 * on the original. 2435 * 2436 * hammer_dup_cursor() cannot dup locks. The dup inherits the 2437 * original's locks and the original is tracked and must be 2438 * re-locked. 2439 */ 2440 mirror_tid = cursor->node->ondisk->mirror_tid; 2441 KKASSERT(mirror_tid != 0); 2442 ncursor = hammer_push_cursor(cursor); 2443 error = hammer_btree_mirror_propagate(ncursor, mirror_tid); 2444 KKASSERT(error == 0); 2445 hammer_pop_cursor(cursor, ncursor); 2446 /* WARNING: cursor's leaf pointer may change after pop */ 2447 } 2448 2449 2450 /* 2451 * Propagate a mirror TID update upwards through the B-Tree to the root. 2452 * 2453 * A locked internal node must be passed in. The node will remain locked 2454 * on return. 2455 * 2456 * This function syncs mirror_tid at the specified internal node's element, 2457 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2458 */ 2459 static int 2460 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2461 { 2462 hammer_btree_internal_elm_t elm; 2463 hammer_node_t node; 2464 int error; 2465 2466 for (;;) { 2467 error = hammer_cursor_up(cursor); 2468 if (error == 0) 2469 error = hammer_cursor_upgrade(cursor); 2470 2471 /* 2472 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2473 * cursor will still be properly positioned for 2474 * mirror propagation, just not for iterations. 2475 */ 2476 while (error == EDEADLK) { 2477 hammer_recover_cursor(cursor); 2478 error = hammer_cursor_upgrade(cursor); 2479 } 2480 if (error) 2481 break; 2482 2483 /* 2484 * If the cursor deadlocked it could end up at a leaf 2485 * after we lost the lock. 2486 */ 2487 node = cursor->node; 2488 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2489 continue; 2490 2491 /* 2492 * Adjust the node's element 2493 */ 2494 elm = &node->ondisk->elms[cursor->index].internal; 2495 if (elm->mirror_tid >= mirror_tid) 2496 break; 2497 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2498 sizeof(elm->mirror_tid)); 2499 elm->mirror_tid = mirror_tid; 2500 hammer_modify_node_done(node); 2501 if (hammer_debug_general & 0x0002) { 2502 kprintf("mirror_propagate: propagate " 2503 "%016llx @%016llx:%d\n", 2504 (long long)mirror_tid, 2505 (long long)node->node_offset, 2506 cursor->index); 2507 } 2508 2509 2510 /* 2511 * Adjust the node's mirror_tid aggregator 2512 */ 2513 if (node->ondisk->mirror_tid >= mirror_tid) 2514 return(0); 2515 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2516 node->ondisk->mirror_tid = mirror_tid; 2517 hammer_modify_node_done(node); 2518 if (hammer_debug_general & 0x0002) { 2519 kprintf("mirror_propagate: propagate " 2520 "%016llx @%016llx\n", 2521 (long long)mirror_tid, 2522 (long long)node->node_offset); 2523 } 2524 } 2525 if (error == ENOENT) 2526 error = 0; 2527 return(error); 2528 } 2529 2530 hammer_node_t 2531 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2532 int *parent_indexp, int *errorp, int try_exclusive) 2533 { 2534 hammer_node_t parent; 2535 hammer_btree_elm_t elm; 2536 int i; 2537 2538 /* 2539 * Get the node 2540 */ 2541 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2542 if (*errorp) { 2543 KKASSERT(parent == NULL); 2544 return(NULL); 2545 } 2546 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2547 2548 /* 2549 * Lock the node 2550 */ 2551 if (try_exclusive) { 2552 if (hammer_lock_ex_try(&parent->lock)) { 2553 hammer_rel_node(parent); 2554 *errorp = EDEADLK; 2555 return(NULL); 2556 } 2557 } else { 2558 hammer_lock_sh(&parent->lock); 2559 } 2560 2561 /* 2562 * Figure out which element in the parent is pointing to the 2563 * child. 2564 */ 2565 if (node->ondisk->count) { 2566 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2567 parent->ondisk); 2568 } else { 2569 i = 0; 2570 } 2571 while (i < parent->ondisk->count) { 2572 elm = &parent->ondisk->elms[i]; 2573 if (elm->internal.subtree_offset == node->node_offset) 2574 break; 2575 ++i; 2576 } 2577 if (i == parent->ondisk->count) { 2578 hammer_unlock(&parent->lock); 2579 panic("Bad B-Tree link: parent %p node %p", parent, node); 2580 } 2581 *parent_indexp = i; 2582 KKASSERT(*errorp == 0); 2583 return(parent); 2584 } 2585 2586 /* 2587 * The element (elm) has been moved to a new internal node (node). 2588 * 2589 * If the element represents a pointer to an internal node that node's 2590 * parent must be adjusted to the element's new location. 2591 * 2592 * XXX deadlock potential here with our exclusive locks 2593 */ 2594 int 2595 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2596 hammer_btree_elm_t elm) 2597 { 2598 hammer_node_t child; 2599 int error; 2600 2601 error = 0; 2602 2603 switch(elm->base.btype) { 2604 case HAMMER_BTREE_TYPE_INTERNAL: 2605 case HAMMER_BTREE_TYPE_LEAF: 2606 child = hammer_get_node(trans, elm->internal.subtree_offset, 2607 0, &error); 2608 if (error == 0) { 2609 hammer_modify_node_field(trans, child, parent); 2610 child->ondisk->parent = node->node_offset; 2611 hammer_modify_node_done(child); 2612 hammer_rel_node(child); 2613 } 2614 break; 2615 default: 2616 break; 2617 } 2618 return(error); 2619 } 2620 2621 /* 2622 * Initialize the root of a recursive B-Tree node lock list structure. 2623 */ 2624 void 2625 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2626 { 2627 TAILQ_INIT(&parent->list); 2628 parent->parent = NULL; 2629 parent->node = node; 2630 parent->index = -1; 2631 parent->count = node->ondisk->count; 2632 parent->copy = NULL; 2633 parent->flags = 0; 2634 } 2635 2636 /* 2637 * Initialize a cache of hammer_node_lock's including space allocated 2638 * for node copies. 2639 * 2640 * This is used by the rebalancing code to preallocate the copy space 2641 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2642 * locks, otherwise we can blow out the pageout daemon's emergency 2643 * reserve and deadlock it. 2644 * 2645 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2646 * The flag is set when the item is pulled off the cache for use. 2647 */ 2648 void 2649 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2650 int depth) 2651 { 2652 hammer_node_lock_t item; 2653 int count; 2654 2655 for (count = 1; depth; --depth) 2656 count *= HAMMER_BTREE_LEAF_ELMS; 2657 bzero(lcache, sizeof(*lcache)); 2658 TAILQ_INIT(&lcache->list); 2659 while (count) { 2660 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2661 item->copy = kmalloc(sizeof(*item->copy), 2662 hmp->m_misc, M_WAITOK); 2663 TAILQ_INIT(&item->list); 2664 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2665 --count; 2666 } 2667 } 2668 2669 void 2670 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2671 { 2672 hammer_node_lock_t item; 2673 2674 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2675 TAILQ_REMOVE(&lcache->list, item, entry); 2676 KKASSERT(item->copy); 2677 KKASSERT(TAILQ_EMPTY(&item->list)); 2678 kfree(item->copy, hmp->m_misc); 2679 kfree(item, hmp->m_misc); 2680 } 2681 KKASSERT(lcache->copy == NULL); 2682 } 2683 2684 /* 2685 * Exclusively lock all the children of node. This is used by the split 2686 * code to prevent anyone from accessing the children of a cursor node 2687 * while we fix-up its parent offset. 2688 * 2689 * If we don't lock the children we can really mess up cursors which block 2690 * trying to cursor-up into our node. 2691 * 2692 * On failure EDEADLK (or some other error) is returned. If a deadlock 2693 * error is returned the cursor is adjusted to block on termination. 2694 * 2695 * The caller is responsible for managing parent->node, the root's node 2696 * is usually aliased from a cursor. 2697 */ 2698 int 2699 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2700 hammer_node_lock_t parent, 2701 hammer_node_lock_t lcache) 2702 { 2703 hammer_node_t node; 2704 hammer_node_lock_t item; 2705 hammer_node_ondisk_t ondisk; 2706 hammer_btree_elm_t elm; 2707 hammer_node_t child; 2708 struct hammer_mount *hmp; 2709 int error; 2710 int i; 2711 2712 node = parent->node; 2713 ondisk = node->ondisk; 2714 error = 0; 2715 hmp = cursor->trans->hmp; 2716 2717 /* 2718 * We really do not want to block on I/O with exclusive locks held, 2719 * pre-get the children before trying to lock the mess. This is 2720 * only done one-level deep for now. 2721 */ 2722 for (i = 0; i < ondisk->count; ++i) { 2723 ++hammer_stats_btree_elements; 2724 elm = &ondisk->elms[i]; 2725 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2726 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2727 continue; 2728 } 2729 child = hammer_get_node(cursor->trans, 2730 elm->internal.subtree_offset, 2731 0, &error); 2732 if (child) 2733 hammer_rel_node(child); 2734 } 2735 2736 /* 2737 * Do it for real 2738 */ 2739 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2740 ++hammer_stats_btree_elements; 2741 elm = &ondisk->elms[i]; 2742 2743 switch(elm->base.btype) { 2744 case HAMMER_BTREE_TYPE_INTERNAL: 2745 case HAMMER_BTREE_TYPE_LEAF: 2746 KKASSERT(elm->internal.subtree_offset != 0); 2747 child = hammer_get_node(cursor->trans, 2748 elm->internal.subtree_offset, 2749 0, &error); 2750 break; 2751 default: 2752 child = NULL; 2753 break; 2754 } 2755 if (child) { 2756 if (hammer_lock_ex_try(&child->lock) != 0) { 2757 if (cursor->deadlk_node == NULL) { 2758 cursor->deadlk_node = child; 2759 hammer_ref_node(cursor->deadlk_node); 2760 } 2761 error = EDEADLK; 2762 hammer_rel_node(child); 2763 } else { 2764 if (lcache) { 2765 item = TAILQ_FIRST(&lcache->list); 2766 KKASSERT(item != NULL); 2767 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2768 TAILQ_REMOVE(&lcache->list, 2769 item, entry); 2770 } else { 2771 item = kmalloc(sizeof(*item), 2772 hmp->m_misc, 2773 M_WAITOK|M_ZERO); 2774 TAILQ_INIT(&item->list); 2775 } 2776 2777 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2778 item->parent = parent; 2779 item->node = child; 2780 item->index = i; 2781 item->count = child->ondisk->count; 2782 2783 /* 2784 * Recurse (used by the rebalancing code) 2785 */ 2786 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2787 error = hammer_btree_lock_children( 2788 cursor, 2789 depth - 1, 2790 item, 2791 lcache); 2792 } 2793 } 2794 } 2795 } 2796 if (error) 2797 hammer_btree_unlock_children(hmp, parent, lcache); 2798 return(error); 2799 } 2800 2801 /* 2802 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2803 * including the parent. 2804 */ 2805 void 2806 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2807 { 2808 hammer_mount_t hmp = cursor->trans->hmp; 2809 hammer_node_lock_t item; 2810 2811 if (parent->copy == NULL) { 2812 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2813 parent->copy = kmalloc(sizeof(*parent->copy), 2814 hmp->m_misc, M_WAITOK); 2815 } 2816 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2817 *parent->copy = *parent->node->ondisk; 2818 TAILQ_FOREACH(item, &parent->list, entry) { 2819 hammer_btree_lock_copy(cursor, item); 2820 } 2821 } 2822 2823 /* 2824 * Recursively sync modified copies to the media. 2825 */ 2826 int 2827 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2828 { 2829 hammer_node_lock_t item; 2830 int count = 0; 2831 2832 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) { 2833 ++count; 2834 hammer_modify_node_all(cursor->trans, parent->node); 2835 *parent->node->ondisk = *parent->copy; 2836 hammer_modify_node_done(parent->node); 2837 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) { 2838 hammer_flush_node(parent->node, 0); 2839 hammer_delete_node(cursor->trans, parent->node); 2840 } 2841 } 2842 TAILQ_FOREACH(item, &parent->list, entry) { 2843 count += hammer_btree_sync_copy(cursor, item); 2844 } 2845 return(count); 2846 } 2847 2848 /* 2849 * Release previously obtained node locks. The caller is responsible for 2850 * cleaning up parent->node itself (its usually just aliased from a cursor), 2851 * but this function will take care of the copies. 2852 * 2853 * NOTE: The root node is not placed in the lcache and node->copy is not 2854 * deallocated when lcache != NULL. 2855 */ 2856 void 2857 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent, 2858 hammer_node_lock_t lcache) 2859 { 2860 hammer_node_lock_t item; 2861 hammer_node_ondisk_t copy; 2862 2863 while ((item = TAILQ_FIRST(&parent->list)) != NULL) { 2864 TAILQ_REMOVE(&parent->list, item, entry); 2865 hammer_btree_unlock_children(hmp, item, lcache); 2866 hammer_unlock(&item->node->lock); 2867 hammer_rel_node(item->node); 2868 if (lcache) { 2869 /* 2870 * NOTE: When placing the item back in the lcache 2871 * the flag is cleared by the bzero(). 2872 * Remaining fields are cleared as a safety 2873 * measure. 2874 */ 2875 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE); 2876 KKASSERT(TAILQ_EMPTY(&item->list)); 2877 copy = item->copy; 2878 bzero(item, sizeof(*item)); 2879 TAILQ_INIT(&item->list); 2880 item->copy = copy; 2881 if (copy) 2882 bzero(copy, sizeof(*copy)); 2883 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2884 } else { 2885 kfree(item, hmp->m_misc); 2886 } 2887 } 2888 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) { 2889 kfree(parent->copy, hmp->m_misc); 2890 parent->copy = NULL; /* safety */ 2891 } 2892 } 2893 2894 /************************************************************************ 2895 * MISCELLANIOUS SUPPORT * 2896 ************************************************************************/ 2897 2898 /* 2899 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp). 2900 * 2901 * Note that for this particular function a return value of -1, 0, or +1 2902 * can denote a match if create_tid is otherwise discounted. A create_tid 2903 * of zero is considered to be 'infinity' in comparisons. 2904 * 2905 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. 2906 */ 2907 int 2908 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) 2909 { 2910 if (key1->localization < key2->localization) 2911 return(-5); 2912 if (key1->localization > key2->localization) 2913 return(5); 2914 2915 if (key1->obj_id < key2->obj_id) 2916 return(-4); 2917 if (key1->obj_id > key2->obj_id) 2918 return(4); 2919 2920 if (key1->rec_type < key2->rec_type) 2921 return(-3); 2922 if (key1->rec_type > key2->rec_type) 2923 return(3); 2924 2925 if (key1->key < key2->key) 2926 return(-2); 2927 if (key1->key > key2->key) 2928 return(2); 2929 2930 /* 2931 * A create_tid of zero indicates a record which is undeletable 2932 * and must be considered to have a value of positive infinity. 2933 */ 2934 if (key1->create_tid == 0) { 2935 if (key2->create_tid == 0) 2936 return(0); 2937 return(1); 2938 } 2939 if (key2->create_tid == 0) 2940 return(-1); 2941 if (key1->create_tid < key2->create_tid) 2942 return(-1); 2943 if (key1->create_tid > key2->create_tid) 2944 return(1); 2945 return(0); 2946 } 2947 2948 /* 2949 * Test a timestamp against an element to determine whether the 2950 * element is visible. A timestamp of 0 means 'infinity'. 2951 */ 2952 int 2953 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base) 2954 { 2955 if (asof == 0) { 2956 if (base->delete_tid) 2957 return(1); 2958 return(0); 2959 } 2960 if (asof < base->create_tid) 2961 return(-1); 2962 if (base->delete_tid && asof >= base->delete_tid) 2963 return(1); 2964 return(0); 2965 } 2966 2967 /* 2968 * Create a separator half way inbetween key1 and key2. For fields just 2969 * one unit apart, the separator will match key2. key1 is on the left-hand 2970 * side and key2 is on the right-hand side. 2971 * 2972 * key2 must be >= the separator. It is ok for the separator to match key2. 2973 * 2974 * NOTE: Even if key1 does not match key2, the separator may wind up matching 2975 * key2. 2976 * 2977 * NOTE: It might be beneficial to just scrap this whole mess and just 2978 * set the separator to key2. 2979 */ 2980 #define MAKE_SEPARATOR(key1, key2, dest, field) \ 2981 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); 2982 2983 static void 2984 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, 2985 hammer_base_elm_t dest) 2986 { 2987 bzero(dest, sizeof(*dest)); 2988 2989 dest->rec_type = key2->rec_type; 2990 dest->key = key2->key; 2991 dest->obj_id = key2->obj_id; 2992 dest->create_tid = key2->create_tid; 2993 2994 MAKE_SEPARATOR(key1, key2, dest, localization); 2995 if (key1->localization == key2->localization) { 2996 MAKE_SEPARATOR(key1, key2, dest, obj_id); 2997 if (key1->obj_id == key2->obj_id) { 2998 MAKE_SEPARATOR(key1, key2, dest, rec_type); 2999 if (key1->rec_type == key2->rec_type) { 3000 MAKE_SEPARATOR(key1, key2, dest, key); 3001 /* 3002 * Don't bother creating a separator for 3003 * create_tid, which also conveniently avoids 3004 * having to handle the create_tid == 0 3005 * (infinity) case. Just leave create_tid 3006 * set to key2. 3007 * 3008 * Worst case, dest matches key2 exactly, 3009 * which is acceptable. 3010 */ 3011 } 3012 } 3013 } 3014 } 3015 3016 #undef MAKE_SEPARATOR 3017 3018 /* 3019 * Return whether a generic internal or leaf node is full 3020 */ 3021 static int 3022 btree_node_is_full(hammer_node_ondisk_t node) 3023 { 3024 switch(node->type) { 3025 case HAMMER_BTREE_TYPE_INTERNAL: 3026 if (node->count == HAMMER_BTREE_INT_ELMS) 3027 return(1); 3028 break; 3029 case HAMMER_BTREE_TYPE_LEAF: 3030 if (node->count == HAMMER_BTREE_LEAF_ELMS) 3031 return(1); 3032 break; 3033 default: 3034 panic("illegal btree subtype"); 3035 } 3036 return(0); 3037 } 3038 3039 #if 0 3040 static int 3041 btree_max_elements(u_int8_t type) 3042 { 3043 if (type == HAMMER_BTREE_TYPE_LEAF) 3044 return(HAMMER_BTREE_LEAF_ELMS); 3045 if (type == HAMMER_BTREE_TYPE_INTERNAL) 3046 return(HAMMER_BTREE_INT_ELMS); 3047 panic("btree_max_elements: bad type %d", type); 3048 } 3049 #endif 3050 3051 void 3052 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3053 { 3054 hammer_btree_elm_t elm; 3055 int i; 3056 3057 kprintf("node %p count=%d parent=%016llx type=%c\n", 3058 ondisk, ondisk->count, 3059 (long long)ondisk->parent, ondisk->type); 3060 3061 /* 3062 * Dump both boundary elements if an internal node 3063 */ 3064 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3065 for (i = 0; i <= ondisk->count; ++i) { 3066 elm = &ondisk->elms[i]; 3067 hammer_print_btree_elm(elm, ondisk->type, i); 3068 } 3069 } else { 3070 for (i = 0; i < ondisk->count; ++i) { 3071 elm = &ondisk->elms[i]; 3072 hammer_print_btree_elm(elm, ondisk->type, i); 3073 } 3074 } 3075 } 3076 3077 void 3078 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3079 { 3080 kprintf(" %2d", i); 3081 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3082 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3083 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3084 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3085 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3086 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3087 kprintf("\tbtype = %02x (%c)\n", 3088 elm->base.btype, 3089 (elm->base.btype ? elm->base.btype : '?')); 3090 kprintf("\tlocalization = %02x\n", elm->base.localization); 3091 3092 switch(type) { 3093 case HAMMER_BTREE_TYPE_INTERNAL: 3094 kprintf("\tsubtree_off = %016llx\n", 3095 (long long)elm->internal.subtree_offset); 3096 break; 3097 case HAMMER_BTREE_TYPE_RECORD: 3098 kprintf("\tdata_offset = %016llx\n", 3099 (long long)elm->leaf.data_offset); 3100 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3101 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3102 break; 3103 } 3104 } 3105