/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static int rebalance_node(struct hammer_ioc_rebalance *rebal,
                        hammer_cursor_t cursor);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                        hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                        hammer_node_lock_t item, hammer_node_lock_t chld_item);
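
/*
 * Illustration only: a minimal userland sketch of how this ioctl might be
 * driven, in the spirit of the hammer(8) rebalance command.  The
 * HAMMER_IOC_REBALANCE command name, the HAMMER_MIN_OBJID/HAMMER_MAX_OBJID
 * constants, and the header path are assumed here rather than taken from
 * this file; the block is disabled and intended purely as a reading aid
 * for the hammer_ioc_rebalance fields used below.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <vfs/hammer/hammer_ioctl.h>    /* assumed header location */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
rebalance_whole_fs(const char *path)
{
        struct hammer_ioc_rebalance rebal;
        int fd;

        /* path is any file or directory on the mounted HAMMER filesystem */
        if ((fd = open(path, O_RDONLY)) < 0)
                return(-1);
        bzero(&rebal, sizeof(rebal));
        rebal.key_beg.obj_id = HAMMER_MIN_OBJID;  /* cover the whole id range */
        rebal.key_end.obj_id = HAMMER_MAX_OBJID;
        rebal.saturation = HAMMER_BTREE_INT_ELMS; /* clamped by the kernel */
        if (ioctl(fd, HAMMER_IOC_REBALANCE, &rebal) < 0) {
                close(fd);
                return(-1);
        }
        printf("scanned %jd, rebalanced %jd, deleted %jd, collisions %jd\n",
               (intmax_t)rebal.stat_ncount, (intmax_t)rebal.stat_nrebal,
               (intmax_t)rebal.stat_deletions,
               (intmax_t)rebal.stat_collisions);
        close(fd);
        return(0);
}
#endif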

/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forward iteration is used.
 *
 * All leaves are at the same depth.  We use the B-Tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 */
int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
                   struct hammer_ioc_rebalance *rebal)
{
        struct hammer_cursor cursor;
        hammer_btree_leaf_elm_t elm;
        int error;
        int seq;

        if ((rebal->key_beg.localization | rebal->key_end.localization) &
            HAMMER_LOCALIZE_PSEUDOFS_MASK) {
                return(EINVAL);
        }
        if (rebal->key_beg.localization > rebal->key_end.localization)
                return(EINVAL);
        if (rebal->key_beg.localization == rebal->key_end.localization) {
                if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
                        return(EINVAL);
                /* key-space limitations - no check needed */
        }
        if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
                rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
        if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
                rebal->saturation = HAMMER_BTREE_INT_ELMS;

        rebal->key_cur = rebal->key_beg;
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        rebal->key_cur.localization += ip->obj_localization;

        seq = trans->hmp->flusher.act;

        /*
         * Scan forwards.  Retries typically occur if a deadlock is
         * detected.
         */
retry:
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
        if (error) {
                hammer_done_cursor(&cursor);
                goto failed;
        }
        cursor.key_beg = rebal->key_cur;
        cursor.key_end = rebal->key_end;
        cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
        cursor.key_end.localization += ip->obj_localization;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * Cause internal nodes to be returned on the way up.  Internal nodes
         * are not returned on the way down so we can create a degenerate
         * case to handle internal nodes as a trailing function of their
         * sub-trees.
         *
         * Note that by not setting INSERTING or PRUNING no boundary
         * corrections will be made and a sync lock is not needed for the
         * B-Tree scan itself.
         */
        cursor.flags |= HAMMER_CURSOR_REBLOCKING;

        error = hammer_btree_first(&cursor);

        while (error == 0) {
                /*
                 * We only care about internal nodes visited for the last
                 * time on the way up... that is, a trailing scan of the
                 * internal node after all of its children have been recursed
                 * through.
                 */
                if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        /*
                         * Leave cursor.index alone, we want to recurse
                         * through all children of the internal node before
                         * visiting it.
                         *
                         * Process the internal node on the way up after
                         * the last child's sub-tree has been balanced.
                         */
                        if (cursor.index == cursor.node->ondisk->count - 1) {
                                hammer_sync_lock_sh(trans);
                                error = rebalance_node(rebal, &cursor);
                                hammer_sync_unlock(trans);
                        }
                } else {
                        /*
                         * We don't need to iterate through all the leaf
                         * elements, we only care about the parent (internal)
                         * node.
                         */
                        cursor.index = cursor.node->ondisk->count - 1;
                }
                if (error)
                        break;

                /*
                 * Update the returned scan position and do a flush if
                 * necessary.
                 *
                 * WARNING: See warnings in hammer_unlock_cursor()
                 *          function.
                 */
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
                rebal->key_cur = elm->base;
                ++rebal->stat_ncount;

                while (hammer_flusher_meta_halflimit(trans->hmp) ||
                       hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor);
                        seq = hammer_flusher_async_one(trans->hmp);
                }

                /*
                 * Iterate, stop if a signal was received.
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
                error = hammer_btree_iterate(&cursor);
        }
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
        if (error == EDEADLK) {
                ++rebal->stat_collisions;
                goto retry;
        }
        if (error == EINTR) {
                rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
                error = 0;
        }
failed:
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        return(error);
}
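
/*
 * Illustrative note on the degenerate case above: forcing cursor.index to
 * count - 1 on a leaf means the next hammer_btree_iterate() call steps off
 * the end of the leaf immediately and pops up to the parent, so a leaf
 * holding dozens of elements is visited once instead of once per element.
 * With HAMMER_CURSOR_REBLOCKING set, that pop-up is also what delivers
 * each internal node to the loop above after its entire sub-tree has been
 * processed.
 */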

/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *          INTERNAL_NODE
 *          / / | | | \ \
 *         C C  C C C  C C      children (first level)
 *                               (internal or leaf nodes)
 *                              children's elements (second level)
 *
 *      <<<----------    pack children's elements, possibly remove excess
 *                       children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *       Any live tracked B-Tree nodes must be updated (we worm out of that
 *       by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *       where there are no elements in the leaves.
 *
 * XXX live-tracked
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
{
        struct hammer_node_lock lockroot;
        hammer_node_lock_t base_item;
        hammer_node_lock_t chld_item;
        hammer_node_lock_t item;
        hammer_btree_elm_t elm;
        hammer_node_t node;
        u_int8_t type1;
        int base_count;
        int root_count;
        int avg_elms;
        int count;
        int error;
        int i;
        int n;

        /*
         * Lock the parent node via the cursor, collect and lock our
         * children and children's children.
         *
         * By the way, this is a LOT of locks.
         */
        hammer_node_lock_init(&lockroot, cursor->node);
        error = hammer_cursor_upgrade(cursor);
        if (error)
                goto done;
        error = hammer_btree_lock_children(cursor, 2, &lockroot);
        if (error)
                goto done;

        /*
         * Make a copy of all the locked on-disk data to simplify the element
         * shifting we are going to have to do.  We will modify the copy
         * first.
         */
        hammer_btree_lock_copy(cursor, &lockroot);

        /*
         * Look at the first child node.
         */
        if (TAILQ_FIRST(&lockroot.list) == NULL)
                goto done;
        type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

        /*
         * Figure out the total number of children's children and
         * calculate the average number of elements per child.
         *
         * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
         * is always greater than or equal to count.
         *
         * If count == 0 we hit a degenerate case which will cause
         * avg_elms to also calculate as 0.
         */
        if (hammer_debug_general & 0x1000)
                kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
        count = 0;
        TAILQ_FOREACH(item, &lockroot.list, entry) {
                if (hammer_debug_general & 0x1000)
                        kprintf("add count %d\n", item->count);
                count += item->count;
                KKASSERT(item->node->ondisk->type == type1);
        }
        avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
        KKASSERT(avg_elms >= 0);

        /*
         * If the average number of elements per child is too low then
         * calculate the desired number of children (n) such that the
         * average number of elements is reasonable.
         *
         * If the desired number of children is 1 then avg_elms will
         * wind up being count, which may still be smaller than the
         * saturation target, but that is ok.
         */
        if (count && avg_elms < rebal->saturation) {
                n = (count + (rebal->saturation - 1)) / rebal->saturation;
                avg_elms = (count + (n - 1)) / n;
        }
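
        /*
         * Worked example with hypothetical numbers: suppose lockroot.count
         * is 10 children holding count = 100 elements in total and
         * rebal->saturation is 32.  The first ceiling division gives
         * avg_elms = (100 + 9) / 10 = 10, which is below the saturation
         * target, so we recompute: n = (100 + 31) / 32 = 4 children and
         * avg_elms = (100 + 3) / 4 = 25.  The packing loop below then
         * fills 4 children with 25 elements each and leaves 6 children
         * empty, to be destroyed later.
         */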

        /*
         * Pack the elements in the children.  Elements for each item are
         * packed into base_item until avg_elms is reached, then base_item
         * iterates.
         *
         * hammer_cursor_moved_element() is called for each element moved
         * to update tracked cursors, including the index beyond the last
         * element (at count).
         */
        base_item = TAILQ_FIRST(&lockroot.list);
        base_count = 0;
        root_count = 0;

        TAILQ_FOREACH(item, &lockroot.list, entry) {
                node = item->node;
                KKASSERT(item->count == node->ondisk->count);
                chld_item = TAILQ_FIRST(&item->list);
                for (i = 0; i < item->count; ++i) {
                        /*
                         * Closeout.  If the next element is at index 0
                         * just use the existing separator in the parent.
                         */
                        if (base_count == avg_elms) {
                                if (i == 0) {
                                        elm = &lockroot.node->ondisk->elms[
                                                item->index];
                                } else {
                                        elm = &node->ondisk->elms[i];
                                }
                                rebalance_closeout(base_item, base_count, elm);
                                base_item = TAILQ_NEXT(base_item, entry);
                                KKASSERT(base_item);
                                base_count = 0;
                                ++root_count;
                        }

                        /*
                         * Check degenerate no-work case.  Otherwise pack
                         * the element.
                         *
                         * All changes are made to the copy.
                         */
                        if (item == base_item && i == base_count) {
                                ++base_count;
                                if (chld_item)
                                        chld_item = TAILQ_NEXT(chld_item,
                                                               entry);
                                continue;
                        }

                        /*
                         * Pack element.
                         */
                        elm = &base_item->copy->elms[base_count];
                        *elm = node->ondisk->elms[i];
                        base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

                        /*
                         * Adjust the mirror_tid of the target.  The parent
                         * node (lockroot.node) should already have an
                         * aggregate mirror_tid so we do not have to update
                         * that.
                         *
                         * However, it is possible for us to catch a
                         * hammer_btree_mirror_propagate() with its pants
                         * down.  Update the parent if necessary.
                         */
                        if (base_item->copy->mirror_tid <
                            node->ondisk->mirror_tid) {
                                base_item->copy->mirror_tid =
                                        node->ondisk->mirror_tid;
                                if (lockroot.copy->mirror_tid <
                                    node->ondisk->mirror_tid) {
                                        kprintf("HAMMER: Warning: rebalance "
                                                "caught race against "
                                                "propagate\n");
                                        lockroot.copy->mirror_tid =
                                                node->ondisk->mirror_tid;
                                        lockroot.flags |=
                                                HAMMER_NODE_LOCK_UPDATED;
                                }
                                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                        }

                        /*
                         * We moved elm.  The parent pointers for any
                         * children of elm must be repointed.
                         */
                        if (item != base_item &&
                            node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                                KKASSERT(chld_item);
                                rebalance_parent_ptrs(base_item, base_count,
                                                      item, chld_item);
                        }
                        hammer_cursor_moved_element(node, base_item->node,
                                                    i, base_count);
                        ++base_count;
                        if (chld_item)
                                chld_item = TAILQ_NEXT(chld_item, entry);
                }

                /*
                 * Always call at the end (i == number of elements) in
                 * case a cursor is sitting indexed there.
                 */
                hammer_cursor_moved_element(node, base_item->node,
                                            i, base_count);
        }

        /*
         * Packing complete, close out base_item using the right-hand
         * boundary of the original parent.
         *
         * If we will be deleting nodes from the root shift the old
         * right-hand boundary to the new ending index.
         */
        elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
        rebalance_closeout(base_item, base_count, elm);
        ++root_count;
        if (lockroot.copy->count != root_count) {
                lockroot.copy->count = root_count;
                lockroot.copy->elms[root_count] = *elm;
                lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
        }
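
        /*
         * Small trace of the packing pass above, with hypothetical numbers:
         * three children holding (2, 2, 2) elements give count = 6.  With
         * a saturation of 31 that yields n = 1 and avg_elms = 6, so the
         * first two elements hit the degenerate no-work case, the remaining
         * four are packed into the first child, and the final closeout
         * leaves one child holding all 6 elements.  The other two children
         * are now empty, root_count is 1, and the loop below queues the
         * empty children for destruction.
         */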
It 424 * should have been pointing at the boundary element (at root_count). 425 * When deleting elements from the root (which is cursor.node), we 426 * have to update the cursor.index manually to keep it in bounds. 427 */ 428 while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) { 429 hammer_cursor_removed_node(base_item->node, lockroot.node, 430 base_count); 431 hammer_cursor_deleted_element(lockroot.node, base_count); 432 base_item->copy->type = HAMMER_BTREE_TYPE_DELETED; 433 base_item->copy->count = 0; 434 base_item->flags |= HAMMER_NODE_LOCK_UPDATED; 435 if (cursor->index > lockroot.copy->count) 436 --cursor->index; 437 ++rebal->stat_deletions; 438 } 439 440 /* 441 * All done, sync the locked child tree to disk. This will also 442 * flush and delete deleted nodes. 443 */ 444 rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot); 445 done: 446 hammer_btree_unlock_children(cursor, &lockroot); 447 hammer_cursor_downgrade(cursor); 448 return (error); 449 } 450 451 /* 452 * Close-out the child base_item. This node contains base_count 453 * elements. 454 * 455 * If the node is an internal node the right-hand boundary must be 456 * set to elm. 457 */ 458 static 459 void 460 rebalance_closeout(hammer_node_lock_t base_item, int base_count, 461 hammer_btree_elm_t elm) 462 { 463 hammer_node_lock_t parent; 464 hammer_btree_elm_t base_elm; 465 hammer_btree_elm_t rbound_elm; 466 u_int8_t save; 467 468 /* 469 * Update the count. NOTE: base_count can be 0 for the 470 * degenerate leaf case. 471 */ 472 if (hammer_debug_general & 0x1000) { 473 kprintf("rebalance_closeout %016llx:", 474 (long long)base_item->node->node_offset); 475 } 476 if (base_item->copy->count != base_count) { 477 base_item->flags |= HAMMER_NODE_LOCK_UPDATED; 478 base_item->copy->count = base_count; 479 if (hammer_debug_general & 0x1000) 480 kprintf(" (count update)"); 481 } 482 483 /* 484 * If we are closing out an internal node we must assign 485 * a right-hand boundary. Use the element contents as the 486 * right-hand boundary. 487 * 488 * Internal nodes are required to have at least one child, 489 * otherwise the left and right boundary would end up being 490 * the same element. Only leaf nodes can be empty. 491 * 492 * Rebalancing may cut-off an internal node such that the 493 * new right hand boundary is the next element anyway, but 494 * we still have to make sure that subtree_offset, btype, 495 * and mirror_tid are all 0. 496 */ 497 if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) { 498 KKASSERT(base_count != 0); 499 base_elm = &base_item->copy->elms[base_count]; 500 501 if (bcmp(base_elm, elm, sizeof(*elm)) != 0 || 502 elm->internal.subtree_offset || 503 elm->internal.mirror_tid || 504 elm->base.btype) { 505 *base_elm = *elm; 506 base_elm->internal.subtree_offset = 0; 507 base_elm->internal.mirror_tid = 0; 508 base_elm->base.btype = 0; 509 base_item->flags |= HAMMER_NODE_LOCK_UPDATED; 510 if (hammer_debug_general & 0x1000) 511 kprintf(" (rhs update)"); 512 } else { 513 if (hammer_debug_general & 0x1000) 514 kprintf(" (rhs same)"); 515 } 516 } 517 518 /* 519 * The parent's boundary must be updated. Be careful to retain 520 * the btype and non-base internal fields as that information is 521 * unrelated. 

        /*
         * The parent's boundary must be updated.  Be careful to retain
         * the btype and non-base internal fields as that information is
         * unrelated.
         */
        parent = base_item->parent;
        rbound_elm = &parent->copy->elms[base_item->index + 1];
        if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
                save = rbound_elm->base.btype;
                rbound_elm->base = elm->base;
                rbound_elm->base.btype = save;
                parent->flags |= HAMMER_NODE_LOCK_UPDATED;
                if (hammer_debug_general & 0x1000) {
                        kprintf(" (parent bound update %d)",
                                base_item->index + 1);
                }
        }
        if (hammer_debug_general & 0x1000)
                kprintf("\n");
}

/*
 * An element in item has moved to base_item.  We must update the parent
 * pointer of the node the element points to (which is chld_item).
 */
static
void
rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                      hammer_node_lock_t item, hammer_node_lock_t chld_item)
{
        KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
        chld_item->copy->parent = base_item->node->node_offset;
        chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
        hammer_cursor_parent_changed(chld_item->node,
                                     item->node, base_item->node, index);
}
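
/*
 * Note on the overall update discipline: every mutation in this file,
 * including the parent repointing above, is staged in the locked copies
 * made by hammer_btree_lock_copy() and flagged with
 * HAMMER_NODE_LOCK_UPDATED.  Nothing reaches the on-media nodes until
 * hammer_btree_sync_copy() runs at the end of rebalance_node(), so an
 * error return before the sync abandons the staged changes rather than
 * leaving a partially rebalanced sub-tree on disk.
 */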