/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static int rebalance_node(struct hammer_ioc_rebalance *rebal,
			hammer_cursor_t cursor, hammer_node_lock_t lcache);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
			hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
			hammer_node_lock_t item, hammer_node_lock_t chld_item);

/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forwards iteration is used.
 *
 * All leaves are at the same depth.  We use the B-Tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 */

int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_rebalance *rebal)
{
	struct hammer_cursor cursor;
	struct hammer_node_lock lcache;
	hammer_btree_leaf_elm_t elm;
	int error;
	int seq;
	uint32_t key_end_localization;

	if ((rebal->key_beg.localization | rebal->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (rebal->key_beg.localization > rebal->key_end.localization)
		return(EINVAL);
	if (rebal->key_beg.localization == rebal->key_end.localization) {
		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
			return(EINVAL);
		/* key-space limitations - no check needed */
	}
	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
		rebal->saturation = HAMMER_BTREE_INT_ELMS;
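
	/*
	 * Example of the clamp above (illustrative value only): if
	 * HAMMER_BTREE_INT_ELMS were 64, a caller-supplied saturation
	 * would be coerced into the range 32..64, i.e. packed nodes
	 * are kept at least half full but never overfilled.
	 */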

	/*
	 * Ioctl caller has only set localization type to rebalance.
	 * Initialize cursor key localization with ip localization.
	 */
	rebal->key_cur = rebal->key_beg;
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		rebal->key_cur.localization |= ip->obj_localization;

	key_end_localization = rebal->key_end.localization;
	key_end_localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		key_end_localization |= ip->obj_localization;
	else
		key_end_localization |= pfs_to_lo(HAMMER_MAX_PFSID);
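
	/*
	 * Layout note (assuming the usual HAMMER localization encoding:
	 * PFS id in the upper 16 bits, record type in the lower bits):
	 * the masking above keeps only the type bits of the caller's
	 * keys, and the OR folds in either ip's own PFS or, for an
	 * allpfs scan, the highest possible PFS id as the end bound.
	 */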

	hammer_btree_lcache_init(trans->hmp, &lcache, 2);

	seq = trans->hmp->flusher.done;

	/*
	 * Scan forwards.  Retries typically occur if a deadlock is detected.
	 */
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = rebal->key_cur;
	cursor.key_end = rebal->key_end;
	cursor.key_end.localization = key_end_localization;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * Cause internal nodes to be returned on the way up.  Internal nodes
	 * are not returned on the way down so we can create a degenerate
	 * case to handle internal nodes as a trailing function of their
	 * sub-trees.
	 *
	 * Note that by not setting INSERTING or PRUNING no boundary
	 * corrections will be made and a sync lock is not needed for the
	 * B-Tree scan itself.
	 */
	cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);

	while (error == 0) {
		/*
		 * Rebalancing can be hard on the memory allocator; make
		 * sure there is enough free memory before doing it.
		 */
		if (vm_test_nominal()) {
			hammer_unlock_cursor(&cursor);
			vm_wait_nominal();
			hammer_lock_cursor(&cursor);
		}

		/*
		 * Filesystem went read-only during rebalancing
		 */
		if (trans->hmp->ronly) {
			error = EROFS;
			break;
		}

		/*
		 * We only care about internal nodes visited for the last
		 * time on the way up... that is, a trailing scan of the
		 * internal node after all of its children have been recursed
		 * through.
		 */
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Leave cursor.index alone, we want to recurse
			 * through all children of the internal node before
			 * visiting it.
			 *
			 * Process the internal node on the way up after
			 * the last child's sub-tree has been balanced.
			 */
			if (cursor.index == cursor.node->ondisk->count - 1) {
				hammer_sync_lock_sh(trans);
				error = rebalance_node(rebal, &cursor, &lcache);
				hammer_sync_unlock(trans);
			}
		} else {
			/*
			 * We don't need to iterate through all the leaf
			 * elements, we only care about the parent (internal)
			 * node.
			 */
			cursor.index = cursor.node->ondisk->count - 1;
		}
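
		/*
		 * Iteration sketch (summarizing the two cases above):
		 * for a leaf, forcing cursor.index to the last element
		 * makes the next hammer_btree_iterate() step past the
		 * remaining leaf elements and pop up to the parent; an
		 * internal node is acted on only when the index sits on
		 * its last element, i.e. on the final trailing visit
		 * after every child sub-tree has been processed.
		 */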
		if (error)
			break;

		/*
		 * Update returned scan position and do a flush if
		 * necessary.
		 *
		 * WARNING: We extract the base using the leaf element
		 *	    type but this could be an internal node.  The
		 *	    base is the same either way.
		 *
		 *	    However, due to the rebalancing operation the
		 *	    cursor position may have exceeded the right-hand
		 *	    boundary.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		rebal->key_cur = elm->base;
		++rebal->stat_ncount;

		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * Before iterating check if the rebalance operation caused
		 * the cursor to index past the right-hand boundary and make
		 * sure to stop if it does.  Otherwise the iteration may
		 * panic e.g. due to the key maxing out its fields and no
		 * longer being within the strict bounds of the root node.
		 */
		if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
			rebal->key_cur = cursor.key_end;
			break;
		}

		/*
		 * Iterate, stop if a signal was received.
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;
		error = hammer_btree_iterate(&cursor);
	}
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EDEADLK) {
		++rebal->stat_collisions;
		goto retry;
	}
	if (error == EINTR) {
		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	hammer_btree_lcache_free(trans->hmp, &lcache);
	return(error);
}

/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *	INTERNAL_NODE
 *	/ / | | | \ \
 *	C C C C C C C	children (first level) (internal or leaf nodes)
 *			children's elements (second level)
 *
 *	<<<---------- pack children's elements, possibly remove excess
 *		      children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *	 Any live tracked B-Tree nodes must be updated (we worm out of that
 *	 by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *	 where there are no elements in the leaves.
 *
 * XXX live-tracked
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
	       hammer_node_lock_t lcache)
{
	struct hammer_node_lock lockroot;
	hammer_node_lock_t base_item;
	hammer_node_lock_t chld_item;
	hammer_node_lock_t item;
	hammer_btree_elm_t elm;
	hammer_node_t node;
	hammer_tid_t tid;
	uint8_t type1 __debugvar;
	int base_count;
	int root_count;
	int avg_elms;
	int count;
	int error;
	int i;
	int n;

	/*
	 * Lock the parent node via the cursor, collect and lock our
	 * children and children's children.
	 *
	 * By the way, this is a LOT of locks.
	 */
	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_cursor_upgrade(cursor);
	if (error)
		goto done;
	error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
	if (error)
		goto done;

	/*
	 * Make a copy of all the locked on-disk data to simplify the element
	 * shifting we are going to have to do.  We will modify the copy
	 * first.
	 */
	hammer_btree_lock_copy(cursor, &lockroot);

	/*
	 * Look at the first child node.
	 */
	if (TAILQ_FIRST(&lockroot.list) == NULL)
		goto done;
	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

	/*
	 * Figure out the total number of children's children and
	 * calculate the average number of elements per child.
	 *
	 * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
	 * is always greater than or equal to count.
	 *
	 * If count == 0 we hit a degenerate case which will cause
	 * avg_elms to also calculate as 0.
	 */
	if (hammer_debug_general & 0x1000)
		hdkprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
	count = 0;
	TAILQ_FOREACH(item, &lockroot.list, entry) {
		if (hammer_debug_general & 0x1000)
			hdkprintf("add count %d\n", item->count);
		count += item->count;
		KKASSERT(item->node->ondisk->type == type1);
	}
	avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
	KKASSERT(avg_elms >= 0);

	/*
	 * If the average number of elements per child is too low then
	 * calculate the desired number of children (n) such that the
	 * average number of elements is reasonable.
	 *
	 * If the desired number of children is 1 then avg_elms will
	 * wind up being count, which may still be smaller than the
	 * saturation but that is ok.
	 */
	if (count && avg_elms < rebal->saturation) {
		n = (count + (rebal->saturation - 1)) / rebal->saturation;
		avg_elms = (count + (n - 1)) / n;
	}
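
	/*
	 * Worked example of the ceiling divisions above (hypothetical
	 * numbers): with count = 20 elements spread over lockroot.count
	 * = 10 children, the first pass yields avg_elms = (20+9)/10 = 2.
	 * With saturation = 8 that is too sparse, so n = (20+7)/8 = 3
	 * target children and avg_elms = (20+2)/3 = 7; the packing loop
	 * below then fills the children 7, 7 and 6, leaving 7 children
	 * empty to be destroyed.
	 */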

	/*
	 * Pack the elements in the children.  Elements for each item are
	 * packed into base_item until avg_elms is reached, then base_item
	 * iterates.
	 *
	 * hammer_cursor_moved_element() is called for each element moved
	 * to update tracked cursors, including the index beyond the last
	 * element (at count).
	 *
	 * Any cursors tracking the internal node itself must also be
	 * updated, potentially repointing them at a leaf and clearing
	 * ATEDISK.
	 */
	base_item = TAILQ_FIRST(&lockroot.list);
	base_count = 0;
	root_count = 0;

	TAILQ_FOREACH(item, &lockroot.list, entry) {
		node = item->node;
		KKASSERT(item->count == node->ondisk->count);
		chld_item = TAILQ_FIRST(&item->list);
		for (i = 0; i < item->count; ++i) {
			/*
			 * Closeout.  If the next element is at index 0
			 * just use the existing separator in the parent.
			 */
			if (base_count == avg_elms) {
				if (i == 0) {
					elm = &lockroot.node->ondisk->elms[
						item->index];
				} else {
					elm = &node->ondisk->elms[i];
				}
				rebalance_closeout(base_item, base_count, elm);
				base_item = TAILQ_NEXT(base_item, entry);
				KKASSERT(base_item);
				base_count = 0;
				++root_count;
			}

			/*
			 * Check degenerate no-work case.  Otherwise pack
			 * the element.
			 *
			 * All changes are made to the copy.
			 */
			if (item == base_item && i == base_count) {
				++base_count;
				if (chld_item)
					chld_item = TAILQ_NEXT(chld_item,
							       entry);
				continue;
			}

			/*
			 * Pack element.
			 */
			elm = &base_item->copy->elms[base_count];
			*elm = node->ondisk->elms[i];
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

			/*
			 * Adjust the mirror_tid of the target and the
			 * internal element linkage.
			 *
			 * The parent node (lockroot.node) should already
			 * have an aggregate mirror_tid so we do not have
			 * to update that.  However, it is possible for us
			 * to catch a hammer_btree_mirror_propagate() with
			 * its pants down.  Update the parent if necessary.
			 */
			tid = node->ondisk->mirror_tid;

			if (base_item->copy->mirror_tid < tid) {
				base_item->copy->mirror_tid = tid;
				if (lockroot.copy->mirror_tid < tid) {
					lockroot.copy->mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				if (lockroot.copy->elms[root_count].
				    internal.mirror_tid < tid) {
					lockroot.copy->elms[root_count].
						internal.mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			}

			/*
			 * We moved elm.  The parent pointers for any
			 * children of elm must be repointed.
			 */
			if (item != base_item &&
			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
				KKASSERT(chld_item);
				rebalance_parent_ptrs(base_item, base_count,
						      item, chld_item);
			}
			hammer_cursor_moved_element(item->parent->node,
						    item->index,
						    node, i,
						    base_item->node,
						    base_count);
			++base_count;
			if (chld_item)
				chld_item = TAILQ_NEXT(chld_item, entry);
		}

		/*
		 * Always call at the end (i == number of elements) in
		 * case a cursor is sitting indexed there.
		 */
		hammer_cursor_moved_element(item->parent->node, item->index,
					    node, i,
					    base_item->node, base_count);
	}
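
	/*
	 * Packing illustration (hypothetical shapes): three children
	 * holding 2, 7 and 3 elements with saturation 6 repack as
	 * 6 and 6 (avg_elms = 6); the third child ends up with no
	 * elements, is closed out with count 0, and is queued for
	 * destruction below.
	 */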

	/*
	 * Packing complete, close-out base_item using the right-hand
	 * boundary of the original parent.
	 *
	 * If we will be deleting nodes from the root shift the old
	 * right-hand-boundary to the new ending index.
	 */
	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
	rebalance_closeout(base_item, base_count, elm);
	++root_count;
	if (lockroot.copy->count != root_count) {
		lockroot.copy->count = root_count;
		lockroot.copy->elms[root_count] = *elm;
		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
	}

	/*
	 * Any extra items beyond base_item are now completely empty and
	 * can be destroyed.  Queue the destruction up in the copy.  Note
	 * that none of the destroyed nodes are part of our cursor.
	 *
	 * The cursor is locked so it isn't on the tracking list.  It
	 * should have been pointing at the boundary element (at root_count).
	 * When deleting elements from the root (which is cursor.node), we
	 * have to update cursor->index manually to keep it in bounds.
	 */
	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
		hammer_cursor_removed_node(base_item->node, lockroot.node,
					   base_count);
		hammer_cursor_deleted_element(lockroot.node, base_count);
		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
		base_item->copy->count = 0;
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (cursor->index > lockroot.copy->count)
			--cursor->index;
		++rebal->stat_deletions;
	}

	/*
	 * All done, sync the locked child tree to disk.  This will also
	 * flush and delete deleted nodes.
	 */
	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
	hammer_cursor_downgrade(cursor);
	return(error);
}

/*
 * Close-out the child base_item.  This node contains base_count
 * elements.
 *
 * If the node is an internal node the right-hand boundary must be
 * set to elm.
 */
static
void
rebalance_closeout(hammer_node_lock_t base_item, int base_count,
		   hammer_btree_elm_t elm)
{
	hammer_node_lock_t parent;
	hammer_btree_elm_t base_elm;
	hammer_btree_elm_t rbound_elm;
	uint8_t save;

	/*
	 * Update the count.  NOTE: base_count can be 0 for the
	 * degenerate leaf case.
	 */
	if (hammer_debug_general & 0x1000) {
		hdkprintf("%016jx:", (intmax_t)base_item->node->node_offset);
	}
	if (base_item->copy->count != base_count) {
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		base_item->copy->count = base_count;
		if (hammer_debug_general & 0x1000)
			kprintf(" (count update)");
	}

	/*
	 * If we are closing out an internal node we must assign
	 * a right-hand boundary.  Use the element contents as the
	 * right-hand boundary.
	 *
	 * Internal nodes are required to have at least one child,
	 * otherwise the left and right boundary would end up being
	 * the same element.  Only leaf nodes can be empty.
	 *
	 * Rebalancing may cut off an internal node such that the
	 * new right hand boundary is the next element anyway, but
	 * we still have to make sure that subtree_offset, btype,
	 * and mirror_tid are all 0.
	 */
	if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
		KKASSERT(base_count != 0);
		base_elm = &base_item->copy->elms[base_count];

		if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
		    elm->internal.subtree_offset ||
		    elm->internal.mirror_tid ||
		    elm->base.btype) {
			*base_elm = *elm;
			base_elm->internal.subtree_offset = 0;
			base_elm->internal.mirror_tid = 0;
			base_elm->base.btype = 0;
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			if (hammer_debug_general & 0x1000)
				kprintf(" (rhs update)");
		} else {
			if (hammer_debug_general & 0x1000)
				kprintf(" (rhs same)");
		}
	}

	/*
	 * The parent's boundary must be updated.  Be careful to retain
	 * the btype and non-base internal fields as that information is
	 * unrelated.
	 */
	parent = base_item->parent;
	rbound_elm = &parent->copy->elms[base_item->index + 1];
	if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
		save = rbound_elm->base.btype;
		rbound_elm->base = elm->base;
		rbound_elm->base.btype = save;
		parent->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (hammer_debug_general & 0x1000) {
			kprintf(" (parent bound update %d)",
				base_item->index + 1);
		}
	}
	if (hammer_debug_general & 0x1000)
		kprintf("\n");
}

/*
 * An element in item has moved to base_item.  We must update the parent
 * pointer of the node the element points to (which is chld_item).
 */
static
void
rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
		      hammer_node_lock_t item, hammer_node_lock_t chld_item)
{
	KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
	chld_item->copy->parent = base_item->node->node_offset;
	chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
	hammer_cursor_parent_changed(chld_item->node,
			item->node, base_item->node, index);
}
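
/*
 * Worked example for the two helpers above (hypothetical topology):
 * when an element pointing at child node X moves from item (node B)
 * to base_item (node A), rebalance_parent_ptrs() rewrites X's copied
 * parent offset from B's node_offset to A's, while rebalance_closeout()
 * stamps A's right-hand boundary and the matching separator element in
 * the shared parent, so searches continue to funnel into the packed
 * nodes once the copies are synced by hammer_btree_sync_copy().
 */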