/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static int rebalance_node(struct hammer_ioc_rebalance *rebal,
                        hammer_cursor_t cursor, hammer_node_lock_t lcache);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                        hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                        hammer_node_lock_t item, hammer_node_lock_t chld_item);

/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forwards iteration is used.
 *
 * All leaves are at the same depth.  We use the B-Tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 */
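
/*
 * Typical userland usage (illustrative sketch only; the HAMMERIOC_REBALANCE
 * ioctl name, the HAMMER_MIN/MAX_OBJID constants, and the 75% fill choice
 * are assumptions based on the usual HAMMER conventions, while the structure
 * fields match the ones consumed below):
 *
 *      struct hammer_ioc_rebalance rebal;
 *
 *      bzero(&rebal, sizeof(rebal));
 *      rebal.key_beg.obj_id = HAMMER_MIN_OBJID;
 *      rebal.key_end.obj_id = HAMMER_MAX_OBJID;
 *      rebal.saturation = HAMMER_BTREE_INT_ELMS * 75 / 100;
 *      if (ioctl(fd, HAMMERIOC_REBALANCE, &rebal) < 0)
 *              err(1, "rebalance");
 *      printf("rebalanced %jd nodes\n", (intmax_t)rebal.stat_nrebal);
 */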

int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_rebalance *rebal)
{
        struct hammer_cursor cursor;
        struct hammer_node_lock lcache;
        hammer_btree_leaf_elm_t elm;
        int error;
        int seq;

        if ((rebal->key_beg.localization | rebal->key_end.localization) &
            HAMMER_LOCALIZE_PSEUDOFS_MASK) {
                return(EINVAL);
        }
        if (rebal->key_beg.localization > rebal->key_end.localization)
                return(EINVAL);
        if (rebal->key_beg.localization == rebal->key_end.localization) {
                if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
                        return(EINVAL);
                /* key-space limitations - no check needed */
        }
        if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
                rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
        if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
                rebal->saturation = HAMMER_BTREE_INT_ELMS;

        rebal->key_cur = rebal->key_beg;
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        rebal->key_cur.localization += ip->obj_localization;

        hammer_btree_lcache_init(trans->hmp, &lcache, 2);

        seq = trans->hmp->flusher.done;

        /*
         * Scan forwards.  Retries typically occur if a deadlock is detected.
         */
retry:
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
        if (error) {
                hammer_done_cursor(&cursor);
                goto failed;
        }
        cursor.key_beg = rebal->key_cur;
        cursor.key_end = rebal->key_end;
        cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
        cursor.key_end.localization += ip->obj_localization;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * Cause internal nodes to be returned on the way up.  Internal nodes
         * are not returned on the way down so we can create a degenerate
         * case to handle internal nodes as a trailing function of their
         * sub-trees.
         *
         * Note that by not setting INSERTING or PRUNING no boundary
         * corrections will be made and a sync lock is not needed for the
         * B-Tree scan itself.
         */
        cursor.flags |= HAMMER_CURSOR_REBLOCKING;

        error = hammer_btree_first(&cursor);

        while (error == 0) {
                /*
                 * Rebalancing can be hard on the memory allocator; make
                 * sure there is enough free memory before doing it.
                 */
                if (vm_test_nominal()) {
                        hammer_unlock_cursor(&cursor);
                        vm_wait_nominal();
                        hammer_lock_cursor(&cursor);
                }

                /*
                 * We only care about internal nodes visited for the last
                 * time on the way up... that is, a trailing scan of the
                 * internal node after all of its children have been recursed
                 * through.
                 */
                if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        /*
                         * Leave cursor.index alone, we want to recurse
                         * through all children of the internal node before
                         * visiting it.
                         *
                         * Process the internal node on the way up after
                         * the last child's sub-tree has been balanced.
                         */
                        if (cursor.index == cursor.node->ondisk->count - 1) {
                                hammer_sync_lock_sh(trans);
                                error = rebalance_node(rebal, &cursor, &lcache);
                                hammer_sync_unlock(trans);
                        }
                } else {
                        /*
                         * We don't need to iterate through all the leaf
                         * elements, we only care about the parent (internal)
                         * node.
                         */
                        cursor.index = cursor.node->ondisk->count - 1;
                }
                if (error)
                        break;
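
                /*
                 * One full sub-tree has now been processed.  Forcing
                 * cursor.index to the last element at a leaf makes the
                 * next iteration pop directly to the parent, so internal
                 * nodes are effectively visited in bottom-up, post-order
                 * fashion.
                 */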

                /*
                 * Update returned scan position and do a flush if
                 * necessary.
                 *
                 * WARNING: We extract the base using the leaf element
                 *          type but this could be an internal node.  The
                 *          base is the same either way.
                 *
                 *          However, due to the rebalancing operation the
                 *          cursor position may have exceeded the right-hand
                 *          boundary.
                 *
                 * WARNING: See warnings in hammer_unlock_cursor()
                 *          function.
                 */
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
                rebal->key_cur = elm->base;
                ++rebal->stat_ncount;

                while (hammer_flusher_meta_halflimit(trans->hmp) ||
                       hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor);
                        seq = hammer_flusher_async_one(trans->hmp);
                }

                /*
                 * Before iterating check if the rebalance operation caused
                 * the cursor to index past the right-hand boundary and make
                 * sure to stop if it does.  Otherwise the iteration may
                 * panic e.g. due to the key maxing out its fields and no
                 * longer being within the strict bounds of the root node.
                 */
                if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
                        rebal->key_cur = cursor.key_end;
                        break;
                }

                /*
                 * Iterate, stop if a signal was received.
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
                error = hammer_btree_iterate(&cursor);
        }
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
        if (error == EDEADLK) {
                ++rebal->stat_collisions;
                goto retry;
        }
        if (error == EINTR) {
                rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
                error = 0;
        }
failed:
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        hammer_btree_lcache_free(trans->hmp, &lcache);
        return(error);
}

/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *      INTERNAL_NODE
 *      / / | | | \ \
 *      C C C C C C C   children (first level) (internal or leaf nodes)
 *                      children's elements (second level)
 *
 *      <<<----------   pack children's elements, possibly remove excess
 *                      children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *       Any live tracked B-Tree nodes must be updated (we worm out of that
 *       by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *       where there are no elements in the leaves.
 *
 * XXX live-tracked
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
               struct hammer_node_lock *lcache)
{
        struct hammer_node_lock lockroot;
        hammer_node_lock_t base_item;
        hammer_node_lock_t chld_item;
        hammer_node_lock_t item;
        hammer_btree_elm_t elm;
        hammer_node_t node;
        hammer_tid_t tid;
        u_int8_t type1 __debugvar;
        int base_count;
        int root_count;
        int avg_elms;
        int count;
        int error;
        int i;
        int n;

        /*
         * Lock the parent node via the cursor, collect and lock our
         * children and children's children.
         *
         * By the way, this is a LOT of locks.
         */
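        /*
         * A two-level collection locks the parent, every child, and every
         * child's child, i.e. on the order of 1 + N + N*N node locks for
         * an N-way fan-out (N = HAMMER_BTREE_INT_ELMS).  The passed-in
         * lcache lets the lock-tracking structures be reused across calls
         * instead of being allocated on every pass.
         */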
        hammer_node_lock_init(&lockroot, cursor->node);
        error = hammer_cursor_upgrade(cursor);
        if (error)
                goto done;
        error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
        if (error)
                goto done;

        /*
         * Make a copy of all the locked on-disk data to simplify the element
         * shifting we are going to have to do.  We will modify the copy
         * first.
         */
        hammer_btree_lock_copy(cursor, &lockroot);

        /*
         * Look at the first child node.
         */
        if (TAILQ_FIRST(&lockroot.list) == NULL)
                goto done;
        type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

        /*
         * Figure out the total number of children's children and
         * calculate the average number of elements per child.
         *
         * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
         * is always greater than or equal to count.
         *
         * If count == 0 we hit a degenerate case which will cause
         * avg_elms to also calculate as 0.
         */
        if (hammer_debug_general & 0x1000)
                kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
        count = 0;
        TAILQ_FOREACH(item, &lockroot.list, entry) {
                if (hammer_debug_general & 0x1000)
                        kprintf("add count %d\n", item->count);
                count += item->count;
                KKASSERT(item->node->ondisk->type == type1);
        }
        avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
        KKASSERT(avg_elms >= 0);

        /*
         * If the average number of elements per child is too low then
         * calculate the desired number of children (n) such that the
         * average number of elements is reasonable.
         *
         * If the desired number of children is 1 then avg_elms will
         * wind up being count, which may still be smaller than saturation
         * but that is ok.
         */
        if (count && avg_elms < rebal->saturation) {
                n = (count + (rebal->saturation - 1)) / rebal->saturation;
                avg_elms = (count + (n - 1)) / n;
        }
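
        /*
         * Worked example of the arithmetic above: count = 100 elements
         * spread over five children gives an initial avg_elms of
         * (100+4)/5 = 20.  With a saturation of 32 that is too low, so
         * n = (100+31)/32 = 4 and avg_elms is raised to (100+3)/4 = 25;
         * packing then fills four children with 25 elements each and
         * empties the fifth for deletion below.
         */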

        /*
         * Pack the elements in the children.  Elements for each item are
         * packed into base_item until avg_elms is reached, then base_item
         * iterates.
         *
         * hammer_cursor_moved_element() is called for each element moved
         * to update tracked cursors, including the index beyond the last
         * element (at count).
         *
         * Any cursors tracking the internal node itself must also be
         * updated, potentially repointing them at a leaf and clearing
         * ATEDISK.
         */
        base_item = TAILQ_FIRST(&lockroot.list);
        base_count = 0;
        root_count = 0;

        TAILQ_FOREACH(item, &lockroot.list, entry) {
                node = item->node;
                KKASSERT(item->count == node->ondisk->count);
                chld_item = TAILQ_FIRST(&item->list);
                for (i = 0; i < item->count; ++i) {
                        /*
                         * Closeout.  If the next element is at index 0
                         * just use the existing separator in the parent.
                         */
                        if (base_count == avg_elms) {
                                if (i == 0) {
                                        elm = &lockroot.node->ondisk->elms[
                                                item->index];
                                } else {
                                        elm = &node->ondisk->elms[i];
                                }
                                rebalance_closeout(base_item, base_count, elm);
                                base_item = TAILQ_NEXT(base_item, entry);
                                KKASSERT(base_item);
                                base_count = 0;
                                ++root_count;
                        }

                        /*
                         * Check degenerate no-work case.  Otherwise pack
                         * the element.
                         *
                         * All changes are made to the copy.
                         */
                        if (item == base_item && i == base_count) {
                                ++base_count;
                                if (chld_item)
                                        chld_item = TAILQ_NEXT(chld_item,
                                                               entry);
                                continue;
                        }

                        /*
                         * Pack element.
                         */
                        elm = &base_item->copy->elms[base_count];
                        *elm = node->ondisk->elms[i];
                        base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

                        /*
                         * Adjust the mirror_tid of the target and the
                         * internal element linkage.
                         *
                         * The parent node (lockroot.node) should already
                         * have an aggregate mirror_tid so we do not have
                         * to update that.  However, it is possible for us
                         * to catch a hammer_btree_mirror_propagate() with
                         * its pants down.  Update the parent if necessary.
                         */
                        tid = node->ondisk->mirror_tid;

                        if (base_item->copy->mirror_tid < tid) {
                                base_item->copy->mirror_tid = tid;
                                if (lockroot.copy->mirror_tid < tid) {
                                        lockroot.copy->mirror_tid = tid;
                                        lockroot.flags |=
                                                HAMMER_NODE_LOCK_UPDATED;
                                }
                                if (lockroot.copy->elms[root_count].
                                    internal.mirror_tid < tid) {
                                        lockroot.copy->elms[root_count].
                                                internal.mirror_tid = tid;
                                        lockroot.flags |=
                                                HAMMER_NODE_LOCK_UPDATED;
                                }
                                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                        }

                        /*
                         * We moved elm.  The parent pointers for any
                         * children of elm must be repointed.
                         */
                        if (item != base_item &&
                            node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                                KKASSERT(chld_item);
                                rebalance_parent_ptrs(base_item, base_count,
                                                      item, chld_item);
                        }
                        hammer_cursor_moved_element(item->parent->node,
                                                    item->index,
                                                    node, i,
                                                    base_item->node,
                                                    base_count);
                        ++base_count;
                        if (chld_item)
                                chld_item = TAILQ_NEXT(chld_item, entry);
                }

                /*
                 * Always call at the end (i == number of elements) in
                 * case a cursor is sitting indexed there.
                 */
                hammer_cursor_moved_element(item->parent->node, item->index,
                                            node, i,
                                            base_item->node, base_count);
        }
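
        /*
         * At this point every element has been copied into its target
         * child via the ->copy snapshots.  base_item points at the last
         * child still holding elements (base_count of them) and
         * root_count counts the children already closed out.
         */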

        /*
         * Packing complete, close out base_item using the right-hand
         * boundary of the original parent.
         *
         * If we will be deleting nodes from the root shift the old
         * right-hand boundary to the new ending index.
         */
        elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
        rebalance_closeout(base_item, base_count, elm);
        ++root_count;
        if (lockroot.copy->count != root_count) {
                lockroot.copy->count = root_count;
                lockroot.copy->elms[root_count] = *elm;
                lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
        }

        /*
         * Any extra items beyond base_item are now completely empty and
         * can be destroyed.  Queue the destruction up in the copy.  Note
         * that none of the destroyed nodes are part of our cursor.
         *
         * The cursor is locked so it isn't on the tracking list.  It
         * should have been pointing at the boundary element (at root_count).
         * When deleting elements from the root (which is cursor.node), we
         * have to update cursor.index manually to keep it in bounds.
         */
        while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
                hammer_cursor_removed_node(base_item->node, lockroot.node,
                                           base_count);
                hammer_cursor_deleted_element(lockroot.node, base_count);
                base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
                base_item->copy->count = 0;
                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                if (cursor->index > lockroot.copy->count)
                        --cursor->index;
                ++rebal->stat_deletions;
        }

        /*
         * All done, sync the locked child tree to disk.  This will also
         * flush and delete deleted nodes.
         */
        rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
        hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
        hammer_cursor_downgrade(cursor);
        return (error);
}

/*
 * Close out the child base_item.  This node contains base_count
 * elements.
 *
 * If the node is an internal node the right-hand boundary must be
 * set to elm.
 */
static
void
rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                   hammer_btree_elm_t elm)
{
        hammer_node_lock_t parent;
        hammer_btree_elm_t base_elm;
        hammer_btree_elm_t rbound_elm;
        u_int8_t save;

        /*
         * Update the count.  NOTE:  base_count can be 0 for the
         * degenerate leaf case.
         */
        if (hammer_debug_general & 0x1000) {
                kprintf("rebalance_closeout %016llx:",
                        (long long)base_item->node->node_offset);
        }
        if (base_item->copy->count != base_count) {
                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                base_item->copy->count = base_count;
                if (hammer_debug_general & 0x1000)
                        kprintf(" (count update)");
        }

        /*
         * If we are closing out an internal node we must assign
         * a right-hand boundary.  Use the element contents as the
         * right-hand boundary.
         *
         * Internal nodes are required to have at least one child,
         * otherwise the left and right boundary would end up being
         * the same element.  Only leaf nodes can be empty.
         *
         * Rebalancing may cut off an internal node such that the
         * new right-hand boundary is the next element anyway, but
         * we still have to make sure that subtree_offset, btype,
         * and mirror_tid are all 0.
         */
        if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
                KKASSERT(base_count != 0);
                base_elm = &base_item->copy->elms[base_count];

                if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
                    elm->internal.subtree_offset ||
                    elm->internal.mirror_tid ||
                    elm->base.btype) {
                        *base_elm = *elm;
                        base_elm->internal.subtree_offset = 0;
                        base_elm->internal.mirror_tid = 0;
                        base_elm->base.btype = 0;
                        base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                        if (hammer_debug_general & 0x1000)
                                kprintf(" (rhs update)");
                } else {
                        if (hammer_debug_general & 0x1000)
                                kprintf(" (rhs same)");
                }
        }

        /*
         * The parent's boundary must be updated.  Be careful to retain
         * the btype and non-base internal fields as that information is
         * unrelated.
         */
        parent = base_item->parent;
        rbound_elm = &parent->copy->elms[base_item->index + 1];
        if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
                save = rbound_elm->base.btype;
                rbound_elm->base = elm->base;
                rbound_elm->base.btype = save;
                parent->flags |= HAMMER_NODE_LOCK_UPDATED;
                if (hammer_debug_general & 0x1000) {
                        kprintf(" (parent bound update %d)",
                                base_item->index + 1);
                }
        }
        if (hammer_debug_general & 0x1000)
                kprintf("\n");
}
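
/*
 * Resulting boundary layout for an internal child closed out with
 * base_count elements (illustrative):
 *
 *      child:  elms[0] ... elms[base_count-1] | elms[base_count] = RHB
 *      parent: elms[index] (left bound) ..... | elms[index+1]    = RHB
 *
 * The right-hand boundary (RHB) carries key information only;
 * subtree_offset, mirror_tid and btype are zeroed in the child's
 * boundary element.
 */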

/*
 * An element in item has moved to base_item.  We must update the parent
 * pointer of the node the element points to (which is chld_item).
 */
static
void
rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                      hammer_node_lock_t item, hammer_node_lock_t chld_item)
{
        KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
        chld_item->copy->parent = base_item->node->node_offset;
        chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
        hammer_cursor_parent_changed(chld_item->node,
                                     item->node, base_item->node, index);
}