/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER B-Tree index - cursor support routines
 */
#include "hammer.h"

static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);

/*
 * Initialize a fresh cursor using the B-Tree node cache.  If the cache
 * is not available initialize a fresh cursor at the root of the filesystem.
 */
int
hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
		   hammer_node_cache_t cache, hammer_inode_t ip)
{
	hammer_volume_t volume;
	hammer_node_t node;
	hammer_mount_t hmp;
	u_int tticks;
	int error;

	bzero(cursor, sizeof(*cursor));

	cursor->trans = trans;
	hmp = trans->hmp;

	/*
	 * As the number of inodes queued to the flusher increases we use
	 * time-domain multiplexing to control read vs flush performance.
	 * We have to do it here, before acquiring any ip or node locks,
	 * to avoid deadlocking or excessively delaying the flusher.
	 *
	 * The full time period is hammer_tdmux_ticks, typically 1/5 of
	 * a second.
	 *
	 * inode allocation begins to get restrained at 2/4 the limit
	 * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
	 * limiting read activity before that to try to avoid processes
	 * stalling out in "hmrrcm".
	 */
	tticks = hammer_tdmux_ticks;
	if (trans->type != HAMMER_TRANS_FLS && tticks &&
	    hmp->count_reclaims > hammer_limit_reclaims / tticks &&
	    hmp->count_reclaims > hammer_autoflush * 2 &&
	    hammer_flusher_running(hmp)) {
		u_int rticks;
		u_int xticks;
		u_int dummy;

		/*
		 * 0 ... xticks ... tticks
		 *
		 * rticks is the calculated position, xticks is the demarc
		 * where values below xticks are reserved for the flusher
		 * and values >= xticks may be used by the frontend.
		 *
		 * At least one tick is always made available for the
		 * frontend.
		 */
		rticks = (u_int)ticks % tticks;
		xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;

		/*
		 * Ensure rticks and xticks are stable
		 */
		cpu_ccfence();
		if (rticks < xticks) {
			if (hammer_debug_general & 0x0004)
				hdkprintf("rt %3u, xt %3u, tt %3u\n",
					  rticks, xticks, tticks);
			tsleep(&dummy, 0, "htdmux", xticks - rticks);
		}
	}
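
	/*
	 * Worked example of the demarc above (illustrative; assumes
	 * hz=100, making tticks roughly 20): once count_reclaims
	 * reaches half of hammer_limit_reclaims, xticks computes to
	 * ~10, reserving roughly the first half of each 20-tick
	 * period for the flusher.  A frontend cursor arriving at
	 * rticks=4 would sleep for the remaining 6 ticks before
	 * proceeding.
	 */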

	/*
	 * If the cursor operation is on behalf of an inode, lock
	 * the inode.
	 *
	 * When acquiring a shared lock on an inode on which the backend
	 * flusher deadlocked, wait up to hammer_tdmux_ticks (typically
	 * 1/5 second) for the deadlock to clear.
	 */
	if ((cursor->ip = ip) != NULL) {
		++ip->cursor_ip_refs;
		if (trans->type == HAMMER_TRANS_FLS) {
			hammer_lock_ex(&ip->lock);
		} else {
#if 0
			if (ip->cursor_exclreq_count) {
				tsleep(&ip->cursor_exclreq_count, 0,
				       "hstag1", hammer_tdmux_ticks);
			}
#endif
			hammer_lock_sh(&ip->lock);
		}
	}

	/*
	 * Step 1 - acquire a locked node from the cache if possible
	 */
	if (cache && cache->node) {
		node = hammer_ref_node_safe(trans, cache, &error);
		if (error == 0) {
			hammer_lock_sh(&node->lock);
			if (node->flags & HAMMER_NODE_DELETED) {
				hammer_unlock(&node->lock);
				hammer_rel_node(node);
				node = NULL;
			}
		}
		if (node == NULL)
			++hammer_stats_btree_root_iterations;
	} else {
		node = NULL;
		++hammer_stats_btree_root_iterations;
	}

	/*
	 * Step 2 - If we couldn't get a node from the cache, get
	 * the one from the root of the filesystem.
	 */
	while (node == NULL) {
		volume = hammer_get_root_volume(hmp, &error);
		if (error)
			break;
		node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
				       0, &error);
		hammer_rel_volume(volume, 0);
		if (error)
			break;
		/*
		 * When the frontend acquires the root b-tree node while the
		 * backend is deadlocked on it, wait up to hammer_tdmux_ticks
		 * (typically 1/5 second) for the deadlock to clear.
		 */
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag3", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);

		/*
		 * If someone got in before we could lock the node, retry.
		 */
		if (node->flags & HAMMER_NODE_DELETED) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
		if (volume->ondisk->vol0_btree_root != node->node_offset) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
	}

	/*
	 * Step 3 - finish initializing the cursor by acquiring the parent
	 */
	cursor->node = node;
	if (error == 0)
		error = hammer_load_cursor_parent(cursor, 0);
	KKASSERT(error == 0);
	/* if (error) hammer_done_cursor(cursor); */
	return(error);
}

/*
 * Normalize a cursor.  Sometimes cursors can be left in a state
 * where node is NULL.  If the cursor is in this state, cursor up.
 */
void
hammer_normalize_cursor(hammer_cursor_t cursor)
{
	if (cursor->node == NULL) {
		KKASSERT(cursor->parent != NULL);
		hammer_cursor_up(cursor);
	}
}
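
/*
 * Typical cursor lifecycle (an illustrative sketch, not lifted from a
 * specific caller; real callers also set cursor flags and apply their
 * own retry policies):
 *
 *	hammer_init_cursor(trans, &cursor, &ip->cache[0], ip);
 *	cursor.key_beg = ...;		(set up the search key)
 *	error = hammer_btree_lookup(&cursor);
 *	...examine cursor.node / cursor.index / cursor.leaf...
 *	hammer_done_cursor(&cursor);
 *
 * If an operation returns EDEADLK the caller typically calls
 * hammer_done_cursor(), which blocks until the recorded deadlock
 * clears, and then retries from scratch.
 */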

/*
 * We are finished with a cursor.  We NULL out various fields as a
 * sanity check, in case the structure is inappropriately used
 * afterward.
 */
void
hammer_done_cursor(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	if (cursor->node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = NULL;
	}
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
	}
	if ((ip = cursor->ip) != NULL) {
		KKASSERT(ip->cursor_ip_refs > 0);
		--ip->cursor_ip_refs;
		hammer_unlock(&ip->lock);
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * If we deadlocked this node will be referenced.  Do a quick
	 * lock/unlock to wait for the deadlock condition to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}

	cursor->data = NULL;
	cursor->leaf = NULL;
	cursor->left_bound = NULL;
	cursor->right_bound = NULL;
	cursor->trans = NULL;
}
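
/*
 * Illustrative upgrade pattern (a sketch, not a specific caller):
 * operations that must modify the B-Tree attempt the upgrade and
 * propagate a failure to the caller's retry loop:
 *
 *	error = hammer_cursor_upgrade(cursor);
 *	if (error)
 *		return(error);	(usually EDEADLK, resolved when the
 *				 caller runs hammer_done_cursor())
 */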

/*
 * Upgrade cursor->node and cursor->parent to exclusive locks.  This
 * function can return EDEADLK.
 *
 * The lock must already be either held shared or already held exclusively
 * by us.
 *
 * We upgrade the parent first as it is the most likely to collide first
 * with the downward traversal that the frontend typically does.
 *
 * If we fail to upgrade the lock and cursor->deadlk_node is NULL,
 * we add another reference to the node that failed and set
 * cursor->deadlk_node so hammer_done_cursor() can block on it.
 */
int
hammer_cursor_upgrade(hammer_cursor_t cursor)
{
	int error;

	if (cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	} else {
		error = 0;
	}
	if (error == 0) {
		error = hammer_lock_upgrade(&cursor->node->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->node;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#if 0
	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	} else if (error == 0 && cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#endif
	return(error);
}

int
hammer_cursor_upgrade_node(hammer_cursor_t cursor)
{
	int error;

	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	}
	return(error);
}

/*
 * Downgrade cursor->node and cursor->parent to shared locks.
 */
void
hammer_cursor_downgrade(hammer_cursor_t cursor)
{
	if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
		hammer_lock_downgrade(&cursor->node->lock, 1);
	if (cursor->parent &&
	    hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
		hammer_lock_downgrade(&cursor->parent->lock, 1);
	}
}

/*
 * Upgrade and downgrade pairs of cursors.  This is used by the dedup
 * code which must deal with two cursors.  A special function is needed
 * because some of the nodes may be shared between the two cursors,
 * resulting in share counts > 1 which will normally cause an upgrade
 * to fail.
 */
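
/*
 * Example of the shared-count problem handled here (illustrative):
 * if cursor1 and cursor2 sit in sibling leaves under the same parent
 * node, that node's lock is held shared twice, once per cursor.  A
 * plain hammer_lock_upgrade() with a count of 1 could never succeed,
 * so collect_node() tallies the references and the upgrade is done
 * with the combined count.
 */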
static __noinline
int
collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
{
	int i;

	for (i = 0; i < n; ++i) {
		if (array[i] == node)
			break;
	}
	if (i == n) {
		array[i] = node;
		counts[i] = 1;
		++i;
	} else {
		++counts[i];
	}
	return(i);
}

int
hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
{
	hammer_node_t nodes[4];
	int counts[4];
	int error;
	int i;
	int n;

	n = collect_node(nodes, counts, 0, cursor1->node);
	if (cursor1->parent)
		n = collect_node(nodes, counts, n, cursor1->parent);
	n = collect_node(nodes, counts, n, cursor2->node);
	if (cursor2->parent)
		n = collect_node(nodes, counts, n, cursor2->parent);

	error = 0;
	for (i = 0; i < n; ++i) {
		error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
		if (error)
			break;
	}
	if (error) {
		while (--i >= 0)
			hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
	}
	return (error);
}

void
hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
{
	hammer_node_t nodes[4];
	int counts[4];
	int i;
	int n;

	n = collect_node(nodes, counts, 0, cursor1->node);
	if (cursor1->parent)
		n = collect_node(nodes, counts, n, cursor1->parent);
	n = collect_node(nodes, counts, n, cursor2->node);
	if (cursor2->parent)
		n = collect_node(nodes, counts, n, cursor2->parent);

	for (i = 0; i < n; ++i)
		hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
}

/*
 * Seek the cursor to the specified node and index.
 *
 * The caller must ref the node prior to calling this routine and release
 * it after it returns.  If the seek succeeds the cursor will gain its own
 * ref on the node.
 */
int
hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
{
	int error;

	hammer_cursor_downgrade(cursor);
	error = 0;

	if (cursor->node != node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);

		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
			cursor->parent = NULL;
			cursor->parent_index = 0;
		}
		error = hammer_load_cursor_parent(cursor, 0);
	}
	cursor->index = index;
	return (error);
}
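
/*
 * Note on the boundary pointers maintained below (descriptive, derived
 * from the code): for a cursor whose parent element is
 * parent->ondisk->elms[parent_index], left_bound and right_bound point
 * at the base elements bracketing the sub-tree, i.e. elms[parent_index]
 * and elms[parent_index + 1].  A cursor positioned at the root of the
 * B-Tree instead uses the mount-wide root_btree_beg/end bounds.
 */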

/*
 * Load the parent of cursor->node into cursor->parent.
 */
static
int
hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
{
	hammer_mount_t hmp;
	hammer_node_t parent;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;
	int parent_index;

	hmp = cursor->trans->hmp;

	if (cursor->node->ondisk->parent) {
		node = cursor->node;
		parent = hammer_btree_get_parent(cursor->trans, node,
						 &parent_index,
						 &error, try_exclusive);
		if (error == 0) {
			elm = &parent->ondisk->elms[parent_index];
			cursor->parent = parent;
			cursor->parent_index = parent_index;
			cursor->left_bound = &elm[0].internal.base;
			cursor->right_bound = &elm[1].internal.base;
		}
	} else {
		cursor->parent = NULL;
		cursor->parent_index = 0;
		cursor->left_bound = &hmp->root_btree_beg;
		cursor->right_bound = &hmp->root_btree_end;
		error = 0;
	}
	return(error);
}

/*
 * Cursor up to our parent node.  Return ENOENT if we are at the root of
 * the filesystem.
 */
int
hammer_cursor_up(hammer_cursor_t cursor)
{
	int error;

	hammer_cursor_downgrade(cursor);

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/*
	 * Set the node to its parent.
	 */
	hammer_unlock(&cursor->node->lock);
	hammer_rel_node(cursor->node);
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}

/*
 * Special cursor up given a locked cursor.  The original node is not
 * unlocked or released and the cursor is not downgraded.
 *
 * This function can fail with EDEADLK.
 *
 * This function is only run when recursively deleting parent nodes
 * to get rid of an empty leaf.
 */
int
hammer_cursor_up_locked(hammer_cursor_t cursor)
{
	hammer_node_t save;
	int error;
	int save_index;

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	save = cursor->node;
	save_index = cursor->index;

	/*
	 * Set the node to its parent.
	 */
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	/*
	 * Load the new parent, attempt to exclusively lock it.  Note that
	 * we are still holding the old parent (now cursor->node)
	 * exclusively locked.
	 *
	 * This can return EDEADLK.  Undo the operation on any error.
	 * These up sequences can occur during iterations so be sure to
	 * restore the index.
	 */
	error = hammer_load_cursor_parent(cursor, 1);
	if (error) {
		cursor->parent = cursor->node;
		cursor->parent_index = cursor->index;
		cursor->node = save;
		cursor->index = save_index;
	}
	return(error);
}
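
/*
 * Illustrative descent loop (a sketch; the real search lives in
 * hammer_btree.c and also handles boundary and deadlock cases): a
 * top-down lookup positions cursor->index within each internal node
 * and pushes down until it reaches a leaf:
 *
 *	while (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
 *		cursor->index = (result of scanning the node's elms);
 *		error = hammer_cursor_down(cursor);
 *		if (error)
 *			break;
 *	}
 */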

/*
 * Cursor down through the current node, which must be an internal node.
 *
 * This routine adjusts the cursor and sets index to 0.
 */
int
hammer_cursor_down(hammer_cursor_t cursor)
{
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;

	/*
	 * The current node becomes the current parent
	 */
	hammer_cursor_downgrade(cursor);
	node = cursor->node;
	KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
	}
	cursor->parent = node;
	cursor->parent_index = cursor->index;
	cursor->node = NULL;
	cursor->index = 0;

	/*
	 * Extract element to push into at (node,index), set bounds.
	 */
	elm = &node->ondisk->elms[cursor->parent_index];

	/*
	 * Ok, push down into elm of an internal node.
	 */
	KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
	KKASSERT(elm->internal.subtree_offset != 0);
	cursor->left_bound = &elm[0].internal.base;
	cursor->right_bound = &elm[1].internal.base;
	node = hammer_get_node(cursor->trans,
			       elm->internal.subtree_offset, 0, &error);
	if (error == 0) {
		KASSERT(elm->base.btype == node->ondisk->type,
			("BTYPE MISMATCH %c %c NODE %p",
			 elm->base.btype, node->ondisk->type, node));
		if (node->ondisk->parent != cursor->parent->node_offset)
			hpanic("node %p %016jx vs %016jx",
			       node,
			       (intmax_t)node->ondisk->parent,
			       (intmax_t)cursor->parent->node_offset);
		KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
	}

	/*
	 * If no error occurred we can lock the new child node.  If the
	 * node is deadlock flagged wait up to hammer_tdmux_ticks
	 * (typically 1/5 second) for the deadlock to clear.  Otherwise a
	 * large number of concurrent readers can continuously stall the
	 * flusher.
	 *
	 * We specifically do this in the cursor_down() code in order to
	 * deal with frontend top-down searches smashing against bottom-up
	 * flusher-based mirror updates.  These collisions typically occur
	 * above the inode in the B-Tree and are not covered by the
	 * ip->cursor_exclreq_count logic.
	 */
	if (error == 0) {
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag2", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);
		KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
		cursor->node = node;
		cursor->index = 0;
	}
	return(error);
}

/************************************************************************
 *				DEADLOCK RECOVERY			*
 ************************************************************************
 *
 * These are the new deadlock recovery functions.  Currently they are only
 * used for the mirror propagation and physical node removal cases but
 * ultimately the intention is to use them for all deadlock recovery
 * operations.
 *
 * WARNING!  The contents of the cursor may be modified while unlocked.
 *	     Passive modifications include adjusting the node, parent,
 *	     indexes, and leaf pointer.
 *
 *	     An outright removal of the element the cursor was pointing at
 *	     will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
 *	     which chains to causing the HAMMER_CURSOR_RETEST to be set
 *	     when the cursor is locked again.
 */
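
/*
 * Typical tracked-cursor pattern (an illustrative sketch): callers
 * that must block while holding a position unlock the cursor, do the
 * blocking work, and revalidate on relock:
 *
 *	hammer_unlock_cursor(cursor);
 *	...blocking operation...
 *	error = hammer_lock_cursor(cursor);
 *	(cursor->node and cursor->index may have been adjusted by
 *	 other threads while the cursor was tracked)
 */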
void
hammer_unlock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	KKASSERT(cursor->node);

	/*
	 * Release the cursor's locks and track B-Tree operations on node.
	 * While being tracked our cursor can be modified by other threads
	 * and the node may be replaced.
	 */
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	node = cursor->node;
	cursor->flags |= HAMMER_CURSOR_TRACKED;
	TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
	hammer_unlock(&node->lock);
}

/*
 * Get the cursor heated up again.  The cursor's node may have
 * changed and we might have to locate the new parent.
 *
 * If the exact element we were on got deleted RIPOUT will be
 * set and we must clear ATEDISK so an iteration does not skip
 * the element after it.
 */
int
hammer_lock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);

	/*
	 * Relock the node
	 */
	for (;;) {
		node = cursor->node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		if (cursor->node == node) {
			hammer_rel_node(node);
			break;
		}
		hammer_unlock(&node->lock);
		hammer_rel_node(node);
	}

	/*
	 * Untrack the cursor, clean up, and re-establish the parent node.
	 */
	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	cursor->flags &= ~HAMMER_CURSOR_TRACKED;

	/*
	 * If a ripout has occurred iterations must re-test the (new)
	 * current element.  Clearing ATEDISK prevents the element from
	 * being skipped and RETEST causes it to be re-tested.
	 */
	if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
		cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		cursor->flags |= HAMMER_CURSOR_RETEST;
	}
	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}
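
/*
 * Illustrative recovery pattern (a sketch; do_modification() is a
 * hypothetical stand-in for a real modifying B-Tree operation run
 * while holding the sync lock shared):
 *
 *	for (;;) {
 *		error = do_modification(cursor);
 *		if (error != EDEADLK)
 *			break;
 *		error = hammer_recover_cursor(cursor);
 *		if (error)
 *			break;
 *	}
 */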

/*
 * Recover from a deadlocked cursor, tracking any node removals or
 * replacements.  If the cursor's current node is removed by another
 * thread (via btree_remove()) the cursor will be seeked upwards.
 *
 * The caller is working a modifying operation and must be holding the
 * sync lock (shared).  We do not release the sync lock because this
 * would break atomicity.
 */
int
hammer_recover_cursor(hammer_cursor_t cursor)
{
	hammer_transaction_t trans __debugvar;
#if 0
	hammer_inode_t ip;
#endif
	int error;

	hammer_unlock_cursor(cursor);

#if 0
	ip = cursor->ip;
#endif
	trans = cursor->trans;
	KKASSERT(trans->sync_lock_refs > 0);

	/*
	 * Wait for the deadlock to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}
	error = hammer_lock_cursor(cursor);
	return(error);
}

/*
 * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
 * is effectively unlocked and becomes tracked.  If ocursor was not locked
 * then ncursor also inherits the tracking.
 *
 * After the caller finishes working with ncursor it must be cleaned up
 * with hammer_done_cursor(), and the caller must re-lock ocursor.
 */
hammer_cursor_t
hammer_push_cursor(hammer_cursor_t ocursor)
{
	hammer_cursor_t ncursor;
	hammer_inode_t ip;
	hammer_node_t node;
	hammer_mount_t hmp;

	hmp = ocursor->trans->hmp;
	ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
	bcopy(ocursor, ncursor, sizeof(*ocursor));

	node = ocursor->node;
	hammer_ref_node(node);
	if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
		ocursor->flags |= HAMMER_CURSOR_TRACKED;
		TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
	}
	if (ncursor->parent)
		ocursor->parent = NULL;
	ocursor->data_buffer = NULL;
	ocursor->leaf = NULL;
	ocursor->data = NULL;
	if (ncursor->flags & HAMMER_CURSOR_TRACKED)
		TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
	if ((ip = ncursor->ip) != NULL) {
		++ip->cursor_ip_refs;
	}
	if (ncursor->iprec)
		hammer_ref(&ncursor->iprec->lock);
	return(ncursor);
}

/*
 * Destroy ncursor and restore ocursor
 *
 * This is a temporary hack for the release.  We can't afford to lose
 * the IP lock until the IP object scan code is able to deal with it,
 * so have ocursor inherit it back.
 */
void
hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;

	hmp = ncursor->trans->hmp;
	ip = ncursor->ip;
	ncursor->ip = NULL;
	if (ip)
		--ip->cursor_ip_refs;
	hammer_done_cursor(ncursor);
	kfree(ncursor, hmp->m_misc);
	KKASSERT(ocursor->ip == ip);
	hammer_lock_cursor(ocursor);
}
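
/*
 * Push/pop usage sketch (illustrative): used when a nested B-Tree
 * operation must run without losing the caller's position:
 *
 *	ncursor = hammer_push_cursor(ocursor);
 *	...work with ncursor...
 *	hammer_pop_cursor(ocursor, ncursor);
 *
 * hammer_pop_cursor() finishes ncursor via hammer_done_cursor() and
 * re-locks ocursor on behalf of the caller.
 */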

/*
 * onode is being replaced by nnode by the reblocking code.
 */
void
hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

	while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		KKASSERT(cursor->node == onode);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index].leaf;
		cursor->node = nnode;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
	}
}

/*
 * We have removed <node> from the parent and collapsed the parent.
 *
 * Cursors in deadlock recovery are seeked upward to the parent so the
 * btree_remove() recursion works properly even though we have marked
 * the cursor as requiring a reseek.
 *
 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
 * meaning the cursor is no longer definitively pointing at an element
 * within its iteration (if the cursor is being used to iterate).  The
 * iteration code will take this into account instead of asserting if the
 * cursor is outside the iteration range.
 */
void
hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	KKASSERT(parent != NULL);
	ondisk = node->ondisk;

	while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
		KKASSERT(cursor->node == node);
		KKASSERT(cursor->index == 0);
		TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = NULL;
		cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
		cursor->node = parent;
		cursor->index = index;
		hammer_ref_node(parent);
		hammer_rel_node(node);
	}
}

/*
 * node was split at (onode, index) with elements >= index moved to nnode.
 */
void
hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

again:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index < index)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
		cursor->node = nnode;
		cursor->index -= index;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again;
	}
}
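
/*
 * Example for hammer_cursor_split_node() above (illustrative): if
 * onode splits at index 4, a tracked cursor at index 6 is repointed
 * to nnode at index 2 (6 - 4), while a cursor at index 3 remains on
 * onode untouched.
 */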

/*
 * An element was moved from one node to another or within a node.  The
 * index may also represent the end of the node (index == numelements).
 *
 * {oparent,pindex} is the parent node's pointer to onode/oindex.
 *
 * This is used by the rebalancing code.  This is not an insertion or
 * deletion and any additional elements, including the degenerate case at
 * the end of the node, will be dealt with by additional distinct calls.
 */
void
hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
			    hammer_node_t onode, int oindex,
			    hammer_node_t nnode, int nindex)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	/*
	 * Adjust any cursors pointing at the element
	 */
	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;
again1:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index != oindex)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again1;
	}

	/*
	 * When moving the first element of onode to a different node any
	 * cursor which is pointing at (oparent,pindex) must be repointed
	 * to nnode and ATEDISK must be cleared.
	 *
	 * This prevents cursors from losing track due to insertions.
	 * Insertions temporarily release the cursor in order to update
	 * the mirror_tids.  It primarily affects the mirror_write code.
	 * The other code paths generally only do a single insertion and
	 * then relookup or drop the cursor.
	 */
	if (onode == nnode || oindex)
		return;
	ondisk = oparent->ondisk;
again2:
	TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == oparent);
		if (cursor->index != pindex)
			continue;
		hkprintf("debug: shifted cursor pointing at parent\n"
			 "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
			 (intmax_t)oparent->node_offset, pindex,
			 (intmax_t)onode->node_offset, oindex,
			 (intmax_t)nnode->node_offset, nindex);
		TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		hammer_ref_node(nnode);
		hammer_rel_node(oparent);
		goto again2;
	}
}

/*
 * The B-Tree element pointing to the specified node was moved from
 * (oparent) to (nparent, nindex).  We must locate any tracked cursors
 * pointing at node and adjust their parent accordingly.
 *
 * This is used by the rebalancing code when packing elements causes an
 * element to shift from one node to another.
 */
void
hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent,
			     hammer_node_t nparent, int nindex)
{
	hammer_cursor_t cursor;

again:
	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->parent == oparent) {
			cursor->parent = nparent;
			cursor->parent_index = nindex;
			hammer_ref_node(nparent);
			hammer_rel_node(oparent);
			goto again;
		}
	}
}
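
/*
 * Example for the index-shift helpers below (illustrative): deleting
 * element 2 of a node shifts a tracked cursor at index 5 down to 4 and
 * flags a cursor sitting exactly at index 2 with TRACKED_RIPOUT; the
 * mirror-image insertion case shifts indexes up instead.
 */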

/*
 * Deleted element at (node, index)
 *
 * Shift indexes >= index
 */
void
hammer_cursor_deleted_element(hammer_node_t node, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	ondisk = node->ondisk;

	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->index == index) {
			cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf = NULL;
		} else if (cursor->index > index) {
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf =
					&ondisk->elms[cursor->index - 1].leaf;
			--cursor->index;
		}
	}
}

/*
 * Inserted element at (node, index)
 *
 * Shift indexes >= index
 */
void
hammer_cursor_inserted_element(hammer_node_t node, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	ondisk = node->ondisk;

	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->index >= index) {
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf =
					&ondisk->elms[cursor->index + 1].leaf;
			++cursor->index;
		}
	}
}

/*
 * Invalidate the cached data buffer associated with a cursor.
 *
 * This needs to be done when the underlying block is being freed;
 * otherwise the referenced buffer can prevent the related buffer cache
 * buffer from being properly invalidated.
 */
void
hammer_cursor_invalidate_cache(hammer_cursor_t cursor)
{
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
		cursor->data = NULL;
	}
}