1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index - cursor support routines 37 */ 38 #include "hammer.h" 39 40 static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive); 41 42 /* 43 * Initialize a fresh cursor using the B-Tree node cache. If the cache 44 * is not available initialize a fresh cursor at the root of the filesystem. 45 */ 46 int 47 hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, 48 hammer_node_cache_t cache, hammer_inode_t ip) 49 { 50 hammer_volume_t volume; 51 hammer_node_t node; 52 hammer_mount_t hmp; 53 u_int tticks; 54 int error; 55 56 bzero(cursor, sizeof(*cursor)); 57 58 cursor->trans = trans; 59 hmp = trans->hmp; 60 61 /* 62 * As the number of inodes queued to the flusher increases we use 63 * time-domain multiplexing to control read vs flush performance. 64 * We have to do it here, before acquiring any ip or node locks, 65 * to avoid deadlocking or excessively delaying the flusher. 66 * 67 * The full time period is hammer_tdmux_ticks, typically 1/5 of 68 * a second. 69 * 70 * inode allocation begins to get restrained at 2/4 the limit 71 * via the "hmrrcm" mechanism in hammer_inode. We want to begin 72 * limiting read activity before that to try to avoid processes 73 * stalling out in "hmrrcm". 74 */ 75 tticks = hammer_tdmux_ticks; 76 if (trans->type != HAMMER_TRANS_FLS && tticks && 77 hmp->count_reclaims > hammer_limit_reclaims / tticks && 78 hmp->count_reclaims > hammer_autoflush * 2 && 79 hammer_flusher_running(hmp)) { 80 u_int rticks; 81 u_int xticks; 82 u_int dummy; 83 84 /* 85 * 0 ... xticks ... tticks 86 * 87 * rticks is the calculated position, xticks is the demarc 88 * where values below xticks are reserved for the flusher 89 * and values >= to xticks may be used by the frontend. 90 * 91 * At least one tick is always made available for the 92 * frontend. 93 */ 94 rticks = (u_int)ticks % tticks; 95 xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims; 96 97 /* 98 * Ensure rticks and xticks are stable 99 */ 100 cpu_ccfence(); 101 if (rticks < xticks) { 102 if (hammer_debug_general & 0x0004) 103 kprintf("rt %3u, xt %3u, tt %3u\n", 104 rticks, xticks, tticks); 105 tsleep(&dummy, 0, "htdmux", xticks - rticks); 106 } 107 } 108 109 /* 110 * If the cursor operation is on behalf of an inode, lock 111 * the inode. 112 * 113 * When acquiring a shared lock on an inode on which the backend 114 * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second) 115 * for the deadlock to clear. 116 */ 117 if ((cursor->ip = ip) != NULL) { 118 ++ip->cursor_ip_refs; 119 if (trans->type == HAMMER_TRANS_FLS) { 120 hammer_lock_ex(&ip->lock); 121 } else { 122 #if 0 123 if (ip->cursor_exclreq_count) { 124 tsleep(&ip->cursor_exclreq_count, 0, 125 "hstag1", hammer_tdmux_ticks); 126 } 127 #endif 128 hammer_lock_sh(&ip->lock); 129 } 130 } 131 132 /* 133 * Step 1 - acquire a locked node from the cache if possible 134 */ 135 if (cache && cache->node) { 136 node = hammer_ref_node_safe(trans, cache, &error); 137 if (error == 0) { 138 hammer_lock_sh(&node->lock); 139 if (node->flags & HAMMER_NODE_DELETED) { 140 hammer_unlock(&node->lock); 141 hammer_rel_node(node); 142 node = NULL; 143 } 144 } 145 if (node == NULL) 146 ++hammer_stats_btree_root_iterations; 147 } else { 148 node = NULL; 149 ++hammer_stats_btree_root_iterations; 150 } 151 152 /* 153 * Step 2 - If we couldn't get a node from the cache, get 154 * the one from the root of the filesystem. 155 */ 156 while (node == NULL) { 157 volume = hammer_get_root_volume(hmp, &error); 158 if (error) 159 break; 160 node = hammer_get_node(trans, volume->ondisk->vol0_btree_root, 161 0, &error); 162 hammer_rel_volume(volume, 0); 163 if (error) 164 break; 165 /* 166 * When the frontend acquires the root b-tree node while the 167 * backend is deadlocked on it, wait up to hammer_tdmux_ticks 168 * (1 second) for the deadlock to clear. 169 */ 170 #if 0 171 if (node->cursor_exclreq_count && 172 cursor->trans->type != HAMMER_TRANS_FLS) { 173 tsleep(&node->cursor_exclreq_count, 0, 174 "hstag3", hammer_tdmux_ticks); 175 } 176 #endif 177 hammer_lock_sh(&node->lock); 178 179 /* 180 * If someone got in before we could lock the node, retry. 181 */ 182 if (node->flags & HAMMER_NODE_DELETED) { 183 hammer_unlock(&node->lock); 184 hammer_rel_node(node); 185 node = NULL; 186 continue; 187 } 188 if (volume->ondisk->vol0_btree_root != node->node_offset) { 189 hammer_unlock(&node->lock); 190 hammer_rel_node(node); 191 node = NULL; 192 continue; 193 } 194 } 195 196 /* 197 * Step 3 - finish initializing the cursor by acquiring the parent 198 */ 199 cursor->node = node; 200 if (error == 0) 201 error = hammer_load_cursor_parent(cursor, 0); 202 KKASSERT(error == 0); 203 /* if (error) hammer_done_cursor(cursor); */ 204 return(error); 205 } 206 207 /* 208 * Normalize a cursor. Sometimes cursors can be left in a state 209 * where node is NULL. If the cursor is in this state, cursor up. 210 */ 211 void 212 hammer_normalize_cursor(hammer_cursor_t cursor) 213 { 214 if (cursor->node == NULL) { 215 KKASSERT(cursor->parent != NULL); 216 hammer_cursor_up(cursor); 217 } 218 } 219 220 221 /* 222 * We are finished with a cursor. We NULL out various fields as sanity 223 * check, in case the structure is inappropriately used afterwords. 224 */ 225 void 226 hammer_done_cursor(hammer_cursor_t cursor) 227 { 228 hammer_inode_t ip; 229 230 KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0); 231 if (cursor->parent) { 232 hammer_unlock(&cursor->parent->lock); 233 hammer_rel_node(cursor->parent); 234 cursor->parent = NULL; 235 } 236 if (cursor->node) { 237 hammer_unlock(&cursor->node->lock); 238 hammer_rel_node(cursor->node); 239 cursor->node = NULL; 240 } 241 if (cursor->data_buffer) { 242 hammer_rel_buffer(cursor->data_buffer, 0); 243 cursor->data_buffer = NULL; 244 } 245 if ((ip = cursor->ip) != NULL) { 246 KKASSERT(ip->cursor_ip_refs > 0); 247 --ip->cursor_ip_refs; 248 hammer_unlock(&ip->lock); 249 cursor->ip = NULL; 250 } 251 if (cursor->iprec) { 252 hammer_rel_mem_record(cursor->iprec); 253 cursor->iprec = NULL; 254 } 255 256 /* 257 * If we deadlocked this node will be referenced. Do a quick 258 * lock/unlock to wait for the deadlock condition to clear. 259 * 260 * Maintain exclreq_count / wakeup as necessary to notify new 261 * entrants into ip. We continue to hold the fs_token so our 262 * EDEADLK retry loop should get its chance before another thread 263 * steals the lock. 264 */ 265 if (cursor->deadlk_node) { 266 #if 0 267 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) 268 ++ip->cursor_exclreq_count; 269 ++cursor->deadlk_node->cursor_exclreq_count; 270 #endif 271 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk"); 272 hammer_unlock(&cursor->deadlk_node->lock); 273 #if 0 274 if (--cursor->deadlk_node->cursor_exclreq_count == 0) 275 wakeup(&cursor->deadlk_node->cursor_exclreq_count); 276 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) { 277 if (--ip->cursor_exclreq_count == 0) 278 wakeup(&ip->cursor_exclreq_count); 279 } 280 #endif 281 hammer_rel_node(cursor->deadlk_node); 282 cursor->deadlk_node = NULL; 283 } 284 if (cursor->deadlk_rec) { 285 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr"); 286 hammer_rel_mem_record(cursor->deadlk_rec); 287 cursor->deadlk_rec = NULL; 288 } 289 290 cursor->data = NULL; 291 cursor->leaf = NULL; 292 cursor->left_bound = NULL; 293 cursor->right_bound = NULL; 294 cursor->trans = NULL; 295 } 296 297 /* 298 * Upgrade cursor->node and cursor->parent to exclusive locks. This 299 * function can return EDEADLK. 300 * 301 * The lock must already be either held shared or already held exclusively 302 * by us. 303 * 304 * We upgrade the parent first as it is the most likely to collide first 305 * with the downward traversal that the frontend typically does. 306 * 307 * If we fail to upgrade the lock and cursor->deadlk_node is NULL, 308 * we add another reference to the node that failed and set 309 * cursor->deadlk_node so hammer_done_cursor() can block on it. 310 */ 311 int 312 hammer_cursor_upgrade(hammer_cursor_t cursor) 313 { 314 int error; 315 316 if (cursor->parent) { 317 error = hammer_lock_upgrade(&cursor->parent->lock, 1); 318 if (error && cursor->deadlk_node == NULL) { 319 cursor->deadlk_node = cursor->parent; 320 hammer_ref_node(cursor->deadlk_node); 321 } 322 } else { 323 error = 0; 324 } 325 if (error == 0) { 326 error = hammer_lock_upgrade(&cursor->node->lock, 1); 327 if (error && cursor->deadlk_node == NULL) { 328 cursor->deadlk_node = cursor->node; 329 hammer_ref_node(cursor->deadlk_node); 330 } 331 } 332 #if 0 333 error = hammer_lock_upgrade(&cursor->node->lock, 1); 334 if (error && cursor->deadlk_node == NULL) { 335 cursor->deadlk_node = cursor->node; 336 hammer_ref_node(cursor->deadlk_node); 337 } else if (error == 0 && cursor->parent) { 338 error = hammer_lock_upgrade(&cursor->parent->lock, 1); 339 if (error && cursor->deadlk_node == NULL) { 340 cursor->deadlk_node = cursor->parent; 341 hammer_ref_node(cursor->deadlk_node); 342 } 343 } 344 #endif 345 return(error); 346 } 347 348 int 349 hammer_cursor_upgrade_node(hammer_cursor_t cursor) 350 { 351 int error; 352 353 error = hammer_lock_upgrade(&cursor->node->lock, 1); 354 if (error && cursor->deadlk_node == NULL) { 355 cursor->deadlk_node = cursor->node; 356 hammer_ref_node(cursor->deadlk_node); 357 } 358 return(error); 359 } 360 361 /* 362 * Downgrade cursor->node and cursor->parent to shared locks. This 363 * function can return EDEADLK. 364 */ 365 void 366 hammer_cursor_downgrade(hammer_cursor_t cursor) 367 { 368 if (hammer_lock_excl_owned(&cursor->node->lock, curthread)) 369 hammer_lock_downgrade(&cursor->node->lock, 1); 370 if (cursor->parent && 371 hammer_lock_excl_owned(&cursor->parent->lock, curthread)) { 372 hammer_lock_downgrade(&cursor->parent->lock, 1); 373 } 374 } 375 376 /* 377 * Upgrade and downgrade pairs of cursors. This is used by the dedup 378 * code which must deal with two cursors. A special function is needed 379 * because some of the nodes may be shared between the two cursors, 380 * resulting in share counts > 1 which will normally cause an upgrade 381 * to fail. 382 */ 383 static __noinline 384 int 385 collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node) 386 { 387 int i; 388 389 for (i = 0; i < n; ++i) { 390 if (array[i] == node) 391 break; 392 } 393 if (i == n) { 394 array[i] = node; 395 counts[i] = 1; 396 ++i; 397 } else { 398 ++counts[i]; 399 } 400 return(i); 401 } 402 403 int 404 hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 405 { 406 hammer_node_t nodes[4]; 407 int counts[4]; 408 int error; 409 int i; 410 int n; 411 412 n = collect_node(nodes, counts, 0, cursor1->node); 413 if (cursor1->parent) 414 n = collect_node(nodes, counts, n, cursor1->parent); 415 n = collect_node(nodes, counts, n, cursor2->node); 416 if (cursor2->parent) 417 n = collect_node(nodes, counts, n, cursor2->parent); 418 419 error = 0; 420 for (i = 0; i < n; ++i) { 421 error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]); 422 if (error) 423 break; 424 } 425 if (error) { 426 while (--i >= 0) 427 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 428 } 429 return (error); 430 } 431 432 void 433 hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 434 { 435 hammer_node_t nodes[4]; 436 int counts[4]; 437 int i; 438 int n; 439 440 n = collect_node(nodes, counts, 0, cursor1->node); 441 if (cursor1->parent) 442 n = collect_node(nodes, counts, n, cursor1->parent); 443 n = collect_node(nodes, counts, n, cursor2->node); 444 if (cursor2->parent) 445 n = collect_node(nodes, counts, n, cursor2->parent); 446 447 for (i = 0; i < n; ++i) 448 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 449 } 450 451 /* 452 * Seek the cursor to the specified node and index. 453 * 454 * The caller must ref the node prior to calling this routine and release 455 * it after it returns. If the seek succeeds the cursor will gain its own 456 * ref on the node. 457 */ 458 int 459 hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index) 460 { 461 int error; 462 463 hammer_cursor_downgrade(cursor); 464 error = 0; 465 466 if (cursor->node != node) { 467 hammer_unlock(&cursor->node->lock); 468 hammer_rel_node(cursor->node); 469 cursor->node = node; 470 hammer_ref_node(node); 471 hammer_lock_sh(&node->lock); 472 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0); 473 474 if (cursor->parent) { 475 hammer_unlock(&cursor->parent->lock); 476 hammer_rel_node(cursor->parent); 477 cursor->parent = NULL; 478 cursor->parent_index = 0; 479 } 480 error = hammer_load_cursor_parent(cursor, 0); 481 } 482 cursor->index = index; 483 return (error); 484 } 485 486 /* 487 * Load the parent of cursor->node into cursor->parent. 488 */ 489 static 490 int 491 hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive) 492 { 493 hammer_mount_t hmp; 494 hammer_node_t parent; 495 hammer_node_t node; 496 hammer_btree_elm_t elm; 497 int error; 498 int parent_index; 499 500 hmp = cursor->trans->hmp; 501 502 if (cursor->node->ondisk->parent) { 503 node = cursor->node; 504 parent = hammer_btree_get_parent(cursor->trans, node, 505 &parent_index, 506 &error, try_exclusive); 507 if (error == 0) { 508 elm = &parent->ondisk->elms[parent_index]; 509 cursor->parent = parent; 510 cursor->parent_index = parent_index; 511 cursor->left_bound = &elm[0].internal.base; 512 cursor->right_bound = &elm[1].internal.base; 513 } 514 } else { 515 cursor->parent = NULL; 516 cursor->parent_index = 0; 517 cursor->left_bound = &hmp->root_btree_beg; 518 cursor->right_bound = &hmp->root_btree_end; 519 error = 0; 520 } 521 return(error); 522 } 523 524 /* 525 * Cursor up to our parent node. Return ENOENT if we are at the root of 526 * the filesystem. 527 */ 528 int 529 hammer_cursor_up(hammer_cursor_t cursor) 530 { 531 int error; 532 533 hammer_cursor_downgrade(cursor); 534 535 /* 536 * If the parent is NULL we are at the root of the B-Tree and 537 * return ENOENT. 538 */ 539 if (cursor->parent == NULL) 540 return (ENOENT); 541 542 /* 543 * Set the node to its parent. 544 */ 545 hammer_unlock(&cursor->node->lock); 546 hammer_rel_node(cursor->node); 547 cursor->node = cursor->parent; 548 cursor->index = cursor->parent_index; 549 cursor->parent = NULL; 550 cursor->parent_index = 0; 551 552 error = hammer_load_cursor_parent(cursor, 0); 553 return(error); 554 } 555 556 /* 557 * Special cursor up given a locked cursor. The orignal node is not 558 * unlocked or released and the cursor is not downgraded. 559 * 560 * This function can fail with EDEADLK. 561 * 562 * This function is only run when recursively deleting parent nodes 563 * to get rid of an empty leaf. 564 */ 565 int 566 hammer_cursor_up_locked(hammer_cursor_t cursor) 567 { 568 hammer_node_t save; 569 int error; 570 int save_index; 571 572 /* 573 * If the parent is NULL we are at the root of the B-Tree and 574 * return ENOENT. 575 */ 576 if (cursor->parent == NULL) 577 return (ENOENT); 578 579 save = cursor->node; 580 save_index = cursor->index; 581 582 /* 583 * Set the node to its parent. 584 */ 585 cursor->node = cursor->parent; 586 cursor->index = cursor->parent_index; 587 cursor->parent = NULL; 588 cursor->parent_index = 0; 589 590 /* 591 * load the new parent, attempt to exclusively lock it. Note that 592 * we are still holding the old parent (now cursor->node) exclusively 593 * locked. 594 * 595 * This can return EDEADLK. Undo the operation on any error. These 596 * up sequences can occur during iterations so be sure to restore 597 * the index. 598 */ 599 error = hammer_load_cursor_parent(cursor, 1); 600 if (error) { 601 cursor->parent = cursor->node; 602 cursor->parent_index = cursor->index; 603 cursor->node = save; 604 cursor->index = save_index; 605 } 606 return(error); 607 } 608 609 610 /* 611 * Cursor down through the current node, which must be an internal node. 612 * 613 * This routine adjusts the cursor and sets index to 0. 614 */ 615 int 616 hammer_cursor_down(hammer_cursor_t cursor) 617 { 618 hammer_node_t node; 619 hammer_btree_elm_t elm; 620 int error; 621 622 /* 623 * The current node becomes the current parent 624 */ 625 hammer_cursor_downgrade(cursor); 626 node = cursor->node; 627 KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count); 628 if (cursor->parent) { 629 hammer_unlock(&cursor->parent->lock); 630 hammer_rel_node(cursor->parent); 631 } 632 cursor->parent = node; 633 cursor->parent_index = cursor->index; 634 cursor->node = NULL; 635 cursor->index = 0; 636 637 /* 638 * Extract element to push into at (node,index), set bounds. 639 */ 640 elm = &node->ondisk->elms[cursor->parent_index]; 641 642 /* 643 * Ok, push down into elm. If elm specifies an internal or leaf 644 * node the current node must be an internal node. 645 */ 646 switch(elm->base.btype) { 647 case HAMMER_BTREE_TYPE_INTERNAL: 648 case HAMMER_BTREE_TYPE_LEAF: 649 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 650 KKASSERT(elm->internal.subtree_offset != 0); 651 cursor->left_bound = &elm[0].internal.base; 652 cursor->right_bound = &elm[1].internal.base; 653 node = hammer_get_node(cursor->trans, 654 elm->internal.subtree_offset, 0, &error); 655 if (error == 0) { 656 KASSERT(elm->base.btype == node->ondisk->type, ("BTYPE MISMATCH %c %c NODE %p", elm->base.btype, node->ondisk->type, node)); 657 if (node->ondisk->parent != cursor->parent->node_offset) 658 panic("node %p %016llx vs %016llx", node, (long long)node->ondisk->parent, (long long)cursor->parent->node_offset); 659 KKASSERT(node->ondisk->parent == cursor->parent->node_offset); 660 } 661 break; 662 default: 663 panic("hammer_cursor_down: illegal btype %02x (%c)", 664 elm->base.btype, 665 (elm->base.btype ? elm->base.btype : '?')); 666 break; 667 } 668 669 /* 670 * If no error occured we can lock the new child node. If the 671 * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second) 672 * for the deadlock to clear. Otherwise a large number of concurrent 673 * readers can continuously stall the flusher. 674 * 675 * We specifically do this in the cursor_down() code in order to 676 * deal with frontend top-down searches smashing against bottom-up 677 * flusher-based mirror updates. These collisions typically occur 678 * above the inode in the B-Tree and are not covered by the 679 * ip->cursor_exclreq_count logic. 680 */ 681 if (error == 0) { 682 #if 0 683 if (node->cursor_exclreq_count && 684 cursor->trans->type != HAMMER_TRANS_FLS) { 685 tsleep(&node->cursor_exclreq_count, 0, 686 "hstag2", hammer_tdmux_ticks); 687 } 688 #endif 689 hammer_lock_sh(&node->lock); 690 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0); 691 cursor->node = node; 692 cursor->index = 0; 693 } 694 return(error); 695 } 696 697 /************************************************************************ 698 * DEADLOCK RECOVERY * 699 ************************************************************************ 700 * 701 * These are the new deadlock recovery functions. Currently they are only 702 * used for the mirror propagation and physical node removal cases but 703 * ultimately the intention is to use them for all deadlock recovery 704 * operations. 705 * 706 * WARNING! The contents of the cursor may be modified while unlocked. 707 * passive modifications including adjusting the node, parent, 708 * indexes, and leaf pointer. 709 * 710 * An outright removal of the element the cursor was pointing at 711 * will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set, 712 * which chains to causing the HAMMER_CURSOR_RETEST to be set 713 * when the cursor is locked again. 714 */ 715 void 716 hammer_unlock_cursor(hammer_cursor_t cursor) 717 { 718 hammer_node_t node; 719 720 KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0); 721 KKASSERT(cursor->node); 722 723 /* 724 * Release the cursor's locks and track B-Tree operations on node. 725 * While being tracked our cursor can be modified by other threads 726 * and the node may be replaced. 727 */ 728 if (cursor->parent) { 729 hammer_unlock(&cursor->parent->lock); 730 hammer_rel_node(cursor->parent); 731 cursor->parent = NULL; 732 } 733 node = cursor->node; 734 cursor->flags |= HAMMER_CURSOR_TRACKED; 735 TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry); 736 hammer_unlock(&node->lock); 737 } 738 739 /* 740 * Get the cursor heated up again. The cursor's node may have 741 * changed and we might have to locate the new parent. 742 * 743 * If the exact element we were on got deleted RIPOUT will be 744 * set and we must clear ATEDISK so an iteration does not skip 745 * the element after it. 746 */ 747 int 748 hammer_lock_cursor(hammer_cursor_t cursor) 749 { 750 hammer_node_t node; 751 int error; 752 753 KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED); 754 755 /* 756 * Relock the node 757 */ 758 for (;;) { 759 node = cursor->node; 760 hammer_ref_node(node); 761 hammer_lock_sh(&node->lock); 762 if (cursor->node == node) { 763 hammer_rel_node(node); 764 break; 765 } 766 hammer_unlock(&node->lock); 767 hammer_rel_node(node); 768 } 769 770 /* 771 * Untrack the cursor, clean up, and re-establish the parent node. 772 */ 773 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry); 774 cursor->flags &= ~HAMMER_CURSOR_TRACKED; 775 776 /* 777 * If a ripout has occured iterations must re-test the (new) 778 * current element. Clearing ATEDISK prevents the element from 779 * being skipped and RETEST causes it to be re-tested. 780 */ 781 if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) { 782 cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT; 783 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 784 cursor->flags |= HAMMER_CURSOR_RETEST; 785 } 786 error = hammer_load_cursor_parent(cursor, 0); 787 return(error); 788 } 789 790 /* 791 * Recover from a deadlocked cursor, tracking any node removals or 792 * replacements. If the cursor's current node is removed by another 793 * thread (via btree_remove()) the cursor will be seeked upwards. 794 * 795 * The caller is working a modifying operation and must be holding the 796 * sync lock (shared). We do not release the sync lock because this 797 * would break atomicy. 798 */ 799 int 800 hammer_recover_cursor(hammer_cursor_t cursor) 801 { 802 hammer_transaction_t trans __debugvar; 803 #if 0 804 hammer_inode_t ip; 805 #endif 806 int error; 807 808 hammer_unlock_cursor(cursor); 809 810 #if 0 811 ip = cursor->ip; 812 #endif 813 trans = cursor->trans; 814 KKASSERT(trans->sync_lock_refs > 0); 815 816 /* 817 * Wait for the deadlock to clear. 818 * 819 * Maintain exclreq_count / wakeup as necessary to notify new 820 * entrants into ip. We continue to hold the fs_token so our 821 * EDEADLK retry loop should get its chance before another thread 822 * steals the lock. 823 */ 824 if (cursor->deadlk_node) { 825 #if 0 826 if (ip && trans->type == HAMMER_TRANS_FLS) 827 ++ip->cursor_exclreq_count; 828 ++cursor->deadlk_node->cursor_exclreq_count; 829 #endif 830 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk"); 831 hammer_unlock(&cursor->deadlk_node->lock); 832 #if 0 833 if (--cursor->deadlk_node->cursor_exclreq_count == 0) 834 wakeup(&cursor->deadlk_node->cursor_exclreq_count); 835 if (ip && trans->type == HAMMER_TRANS_FLS) { 836 if (--ip->cursor_exclreq_count == 0) 837 wakeup(&ip->cursor_exclreq_count); 838 } 839 #endif 840 hammer_rel_node(cursor->deadlk_node); 841 cursor->deadlk_node = NULL; 842 } 843 if (cursor->deadlk_rec) { 844 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr"); 845 hammer_rel_mem_record(cursor->deadlk_rec); 846 cursor->deadlk_rec = NULL; 847 } 848 error = hammer_lock_cursor(cursor); 849 return(error); 850 } 851 852 /* 853 * Dup ocursor to ncursor. ncursor inherits ocursor's locks and ocursor 854 * is effectively unlocked and becomes tracked. If ocursor was not locked 855 * then ncursor also inherits the tracking. 856 * 857 * After the caller finishes working with ncursor it must be cleaned up 858 * with hammer_done_cursor(), and the caller must re-lock ocursor. 859 */ 860 hammer_cursor_t 861 hammer_push_cursor(hammer_cursor_t ocursor) 862 { 863 hammer_cursor_t ncursor; 864 hammer_inode_t ip; 865 hammer_node_t node; 866 hammer_mount_t hmp; 867 868 hmp = ocursor->trans->hmp; 869 ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO); 870 bcopy(ocursor, ncursor, sizeof(*ocursor)); 871 872 node = ocursor->node; 873 hammer_ref_node(node); 874 if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) { 875 ocursor->flags |= HAMMER_CURSOR_TRACKED; 876 TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry); 877 } 878 if (ncursor->parent) 879 ocursor->parent = NULL; 880 ocursor->data_buffer = NULL; 881 ocursor->leaf = NULL; 882 ocursor->data = NULL; 883 if (ncursor->flags & HAMMER_CURSOR_TRACKED) 884 TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry); 885 if ((ip = ncursor->ip) != NULL) { 886 ++ip->cursor_ip_refs; 887 } 888 if (ncursor->iprec) 889 hammer_ref(&ncursor->iprec->lock); 890 return(ncursor); 891 } 892 893 /* 894 * Destroy ncursor and restore ocursor 895 * 896 * This is a temporary hack for the release. We can't afford to lose 897 * the IP lock until the IP object scan code is able to deal with it, 898 * so have ocursor inherit it back. 899 */ 900 void 901 hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor) 902 { 903 hammer_mount_t hmp; 904 hammer_inode_t ip; 905 906 hmp = ncursor->trans->hmp; 907 ip = ncursor->ip; 908 ncursor->ip = NULL; 909 if (ip) 910 --ip->cursor_ip_refs; 911 hammer_done_cursor(ncursor); 912 kfree(ncursor, hmp->m_misc); 913 KKASSERT(ocursor->ip == ip); 914 hammer_lock_cursor(ocursor); 915 } 916 917 /* 918 * onode is being replaced by nnode by the reblocking code. 919 */ 920 void 921 hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode) 922 { 923 hammer_cursor_t cursor; 924 hammer_node_ondisk_t ondisk; 925 hammer_node_ondisk_t nndisk; 926 927 ondisk = onode->ondisk; 928 nndisk = nnode->ondisk; 929 930 while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) { 931 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 932 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 933 KKASSERT(cursor->node == onode); 934 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 935 cursor->leaf = &nndisk->elms[cursor->index].leaf; 936 cursor->node = nnode; 937 hammer_ref_node(nnode); 938 hammer_rel_node(onode); 939 } 940 } 941 942 /* 943 * We have removed <node> from the parent and collapsed the parent. 944 * 945 * Cursors in deadlock recovery are seeked upward to the parent so the 946 * btree_remove() recursion works properly even though we have marked 947 * the cursor as requiring a reseek. 948 * 949 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK, 950 * meaning the cursor is no longer definitively pointing at an element 951 * within its iteration (if the cursor is being used to iterate). The 952 * iteration code will take this into account instead of asserting if the 953 * cursor is outside the iteration range. 954 */ 955 void 956 hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index) 957 { 958 hammer_cursor_t cursor; 959 hammer_node_ondisk_t ondisk; 960 961 KKASSERT(parent != NULL); 962 ondisk = node->ondisk; 963 964 while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) { 965 KKASSERT(cursor->node == node); 966 KKASSERT(cursor->index == 0); 967 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry); 968 TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry); 969 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 970 cursor->leaf = NULL; 971 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; 972 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 973 cursor->node = parent; 974 cursor->index = index; 975 hammer_ref_node(parent); 976 hammer_rel_node(node); 977 } 978 } 979 980 /* 981 * node was split at (onode, index) with elements >= index moved to nnode. 982 */ 983 void 984 hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index) 985 { 986 hammer_cursor_t cursor; 987 hammer_node_ondisk_t ondisk; 988 hammer_node_ondisk_t nndisk; 989 990 ondisk = onode->ondisk; 991 nndisk = nnode->ondisk; 992 993 again: 994 TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { 995 KKASSERT(cursor->node == onode); 996 if (cursor->index < index) 997 continue; 998 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 999 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1000 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1001 cursor->leaf = &nndisk->elms[cursor->index - index].leaf; 1002 cursor->node = nnode; 1003 cursor->index -= index; 1004 hammer_ref_node(nnode); 1005 hammer_rel_node(onode); 1006 goto again; 1007 } 1008 } 1009 1010 /* 1011 * An element was moved from one node to another or within a node. The 1012 * index may also represent the end of the node (index == numelements). 1013 * 1014 * {oparent,pindex} is the parent node's pointer to onode/oindex. 1015 * 1016 * This is used by the rebalancing code. This is not an insertion or 1017 * deletion and any additional elements, including the degenerate case at 1018 * the end of the node, will be dealt with by additional distinct calls. 1019 */ 1020 void 1021 hammer_cursor_moved_element(hammer_node_t oparent, int pindex, 1022 hammer_node_t onode, int oindex, 1023 hammer_node_t nnode, int nindex) 1024 { 1025 hammer_cursor_t cursor; 1026 hammer_node_ondisk_t ondisk; 1027 hammer_node_ondisk_t nndisk; 1028 1029 /* 1030 * Adjust any cursors pointing at the element 1031 */ 1032 ondisk = onode->ondisk; 1033 nndisk = nnode->ondisk; 1034 again1: 1035 TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { 1036 KKASSERT(cursor->node == onode); 1037 if (cursor->index != oindex) 1038 continue; 1039 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 1040 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1041 if (cursor->leaf == &ondisk->elms[oindex].leaf) 1042 cursor->leaf = &nndisk->elms[nindex].leaf; 1043 cursor->node = nnode; 1044 cursor->index = nindex; 1045 hammer_ref_node(nnode); 1046 hammer_rel_node(onode); 1047 goto again1; 1048 } 1049 1050 /* 1051 * When moving the first element of onode to a different node any 1052 * cursor which is pointing at (oparent,pindex) must be repointed 1053 * to nnode and ATEDISK must be cleared. 1054 * 1055 * This prevents cursors from losing track due to insertions. 1056 * Insertions temporarily release the cursor in order to update 1057 * the mirror_tids. It primarily effects the mirror_write code. 1058 * The other code paths generally only do a single insertion and 1059 * then relookup or drop the cursor. 1060 */ 1061 if (onode == nnode || oindex) 1062 return; 1063 ondisk = oparent->ondisk; 1064 again2: 1065 TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) { 1066 KKASSERT(cursor->node == oparent); 1067 if (cursor->index != pindex) 1068 continue; 1069 kprintf("HAMMER debug: shifted cursor pointing at parent\n" 1070 "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n", 1071 (intmax_t)oparent->node_offset, pindex, 1072 (intmax_t)onode->node_offset, oindex, 1073 (intmax_t)nnode->node_offset, nindex); 1074 TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry); 1075 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1076 if (cursor->leaf == &ondisk->elms[oindex].leaf) 1077 cursor->leaf = &nndisk->elms[nindex].leaf; 1078 cursor->node = nnode; 1079 cursor->index = nindex; 1080 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1081 hammer_ref_node(nnode); 1082 hammer_rel_node(oparent); 1083 goto again2; 1084 } 1085 } 1086 1087 /* 1088 * The B-Tree element pointing to the specified node was moved from (oparent) 1089 * to (nparent, nindex). We must locate any tracked cursors pointing at 1090 * node and adjust their parent accordingly. 1091 * 1092 * This is used by the rebalancing code when packing elements causes an 1093 * element to shift from one node to another. 1094 */ 1095 void 1096 hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent, 1097 hammer_node_t nparent, int nindex) 1098 { 1099 hammer_cursor_t cursor; 1100 1101 again: 1102 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1103 KKASSERT(cursor->node == node); 1104 if (cursor->parent == oparent) { 1105 cursor->parent = nparent; 1106 cursor->parent_index = nindex; 1107 hammer_ref_node(nparent); 1108 hammer_rel_node(oparent); 1109 goto again; 1110 } 1111 } 1112 } 1113 1114 /* 1115 * Deleted element at (node, index) 1116 * 1117 * Shift indexes >= index 1118 */ 1119 void 1120 hammer_cursor_deleted_element(hammer_node_t node, int index) 1121 { 1122 hammer_cursor_t cursor; 1123 hammer_node_ondisk_t ondisk; 1124 1125 ondisk = node->ondisk; 1126 1127 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1128 KKASSERT(cursor->node == node); 1129 if (cursor->index == index) { 1130 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; 1131 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1132 cursor->leaf = NULL; 1133 } else if (cursor->index > index) { 1134 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1135 cursor->leaf = &ondisk->elms[cursor->index - 1].leaf; 1136 --cursor->index; 1137 } 1138 } 1139 } 1140 1141 /* 1142 * Inserted element at (node, index) 1143 * 1144 * Shift indexes >= index 1145 */ 1146 void 1147 hammer_cursor_inserted_element(hammer_node_t node, int index) 1148 { 1149 hammer_cursor_t cursor; 1150 hammer_node_ondisk_t ondisk; 1151 1152 ondisk = node->ondisk; 1153 1154 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1155 KKASSERT(cursor->node == node); 1156 if (cursor->index >= index) { 1157 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1158 cursor->leaf = &ondisk->elms[cursor->index + 1].leaf; 1159 ++cursor->index; 1160 } 1161 } 1162 } 1163 1164 /* 1165 * Invalidate the cached data buffer associated with a cursor. 1166 * 1167 * This needs to be done when the underlying block is being freed or 1168 * the referenced buffer can prevent the related buffer cache buffer 1169 * from being properly invalidated. 1170 */ 1171 void 1172 hammer_cursor_invalidate_cache(hammer_cursor_t cursor) 1173 { 1174 if (cursor->data_buffer) { 1175 hammer_rel_buffer(cursor->data_buffer, 0); 1176 cursor->data_buffer = NULL; 1177 cursor->data = NULL; 1178 } 1179 } 1180 1181