1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * HAMMER B-Tree index - cursor support routines 37 */ 38 #include "hammer.h" 39 40 static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive); 41 42 /* 43 * Initialize a fresh cursor using the B-Tree node cache. If the cache 44 * is not available initialize a fresh cursor at the root of the filesystem. 45 */ 46 int 47 hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, 48 hammer_node_cache_t cache, hammer_inode_t ip) 49 { 50 hammer_volume_t volume; 51 hammer_node_t node; 52 hammer_mount_t hmp; 53 u_int tticks; 54 int error; 55 56 bzero(cursor, sizeof(*cursor)); 57 58 cursor->trans = trans; 59 hmp = trans->hmp; 60 61 /* 62 * As the number of inodes queued to the flusher increases we use 63 * time-domain multiplexing to control read vs flush performance. 64 * We have to do it here, before acquiring any ip or node locks, 65 * to avoid deadlocking or excessively delaying the flusher. 66 * 67 * The full time period is hammer_tdmux_ticks, typically 1/5 of 68 * a second. 69 * 70 * inode allocation begins to get restrained at 2/4 the limit 71 * via the "hmrrcm" mechanism in hammer_inode. We want to begin 72 * limiting read activity before that to try to avoid processes 73 * stalling out in "hmrrcm". 74 */ 75 tticks = hammer_tdmux_ticks; 76 if (trans->type != HAMMER_TRANS_FLS && tticks && 77 hmp->count_reclaims > hammer_limit_reclaims / tticks && 78 hmp->count_reclaims > hammer_autoflush * 2 && 79 hammer_flusher_running(hmp)) { 80 u_int rticks; 81 u_int xticks; 82 u_int dummy; 83 84 /* 85 * 0 ... xticks ... tticks 86 * 87 * rticks is the calculated position, xticks is the demarc 88 * where values below xticks are reserved for the flusher 89 * and values >= to xticks may be used by the frontend. 90 * 91 * At least one tick is always made available for the 92 * frontend. 93 */ 94 rticks = (u_int)ticks % tticks; 95 xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims; 96 97 /* 98 * Ensure rticks and xticks are stable 99 */ 100 cpu_ccfence(); 101 if (rticks < xticks) { 102 if (hammer_debug_general & 0x0004) 103 kprintf("rt %3u, xt %3u, tt %3u\n", 104 rticks, xticks, tticks); 105 tsleep(&dummy, 0, "htdmux", xticks - rticks); 106 } 107 } 108 109 /* 110 * If the cursor operation is on behalf of an inode, lock 111 * the inode. 112 * 113 * When acquiring a shared lock on an inode on which the backend 114 * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second) 115 * for the deadlock to clear. 116 */ 117 if ((cursor->ip = ip) != NULL) { 118 ++ip->cursor_ip_refs; 119 if (trans->type == HAMMER_TRANS_FLS) { 120 hammer_lock_ex(&ip->lock); 121 } else { 122 #if 0 123 if (ip->cursor_exclreq_count) { 124 tsleep(&ip->cursor_exclreq_count, 0, 125 "hstag1", hammer_tdmux_ticks); 126 } 127 #endif 128 hammer_lock_sh(&ip->lock); 129 } 130 } 131 132 /* 133 * Step 1 - acquire a locked node from the cache if possible 134 */ 135 if (cache && cache->node) { 136 node = hammer_ref_node_safe(trans, cache, &error); 137 if (error == 0) { 138 hammer_lock_sh(&node->lock); 139 if (node->flags & HAMMER_NODE_DELETED) { 140 hammer_unlock(&node->lock); 141 hammer_rel_node(node); 142 node = NULL; 143 } 144 } 145 if (node == NULL) 146 ++hammer_stats_btree_root_iterations; 147 } else { 148 node = NULL; 149 ++hammer_stats_btree_root_iterations; 150 } 151 152 /* 153 * Step 2 - If we couldn't get a node from the cache, get 154 * the one from the root of the filesystem. 155 */ 156 while (node == NULL) { 157 volume = hammer_get_root_volume(hmp, &error); 158 if (error) 159 break; 160 node = hammer_get_node(trans, volume->ondisk->vol0_btree_root, 161 0, &error); 162 hammer_rel_volume(volume, 0); 163 if (error) 164 break; 165 /* 166 * When the frontend acquires the root b-tree node while the 167 * backend is deadlocked on it, wait up to hammer_tdmux_ticks 168 * (1 second) for the deadlock to clear. 169 */ 170 #if 0 171 if (node->cursor_exclreq_count && 172 cursor->trans->type != HAMMER_TRANS_FLS) { 173 tsleep(&node->cursor_exclreq_count, 0, 174 "hstag3", hammer_tdmux_ticks); 175 } 176 #endif 177 hammer_lock_sh(&node->lock); 178 179 /* 180 * If someone got in before we could lock the node, retry. 181 */ 182 if (node->flags & HAMMER_NODE_DELETED) { 183 hammer_unlock(&node->lock); 184 hammer_rel_node(node); 185 node = NULL; 186 continue; 187 } 188 if (volume->ondisk->vol0_btree_root != node->node_offset) { 189 hammer_unlock(&node->lock); 190 hammer_rel_node(node); 191 node = NULL; 192 continue; 193 } 194 } 195 196 /* 197 * Step 3 - finish initializing the cursor by acquiring the parent 198 */ 199 cursor->node = node; 200 if (error == 0) 201 error = hammer_load_cursor_parent(cursor, 0); 202 KKASSERT(error == 0); 203 /* if (error) hammer_done_cursor(cursor); */ 204 return(error); 205 } 206 207 /* 208 * Normalize a cursor. Sometimes cursors can be left in a state 209 * where node is NULL. If the cursor is in this state, cursor up. 210 */ 211 void 212 hammer_normalize_cursor(hammer_cursor_t cursor) 213 { 214 if (cursor->node == NULL) { 215 KKASSERT(cursor->parent != NULL); 216 hammer_cursor_up(cursor); 217 } 218 } 219 220 221 /* 222 * We are finished with a cursor. We NULL out various fields as sanity 223 * check, in case the structure is inappropriately used afterwords. 224 */ 225 void 226 hammer_done_cursor(hammer_cursor_t cursor) 227 { 228 hammer_inode_t ip; 229 230 KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0); 231 if (cursor->parent) { 232 hammer_unlock(&cursor->parent->lock); 233 hammer_rel_node(cursor->parent); 234 cursor->parent = NULL; 235 } 236 if (cursor->node) { 237 hammer_unlock(&cursor->node->lock); 238 hammer_rel_node(cursor->node); 239 cursor->node = NULL; 240 } 241 if (cursor->data_buffer) { 242 hammer_rel_buffer(cursor->data_buffer, 0); 243 cursor->data_buffer = NULL; 244 } 245 if ((ip = cursor->ip) != NULL) { 246 KKASSERT(ip->cursor_ip_refs > 0); 247 --ip->cursor_ip_refs; 248 hammer_unlock(&ip->lock); 249 cursor->ip = NULL; 250 } 251 if (cursor->iprec) { 252 hammer_rel_mem_record(cursor->iprec); 253 cursor->iprec = NULL; 254 } 255 256 /* 257 * If we deadlocked this node will be referenced. Do a quick 258 * lock/unlock to wait for the deadlock condition to clear. 259 * 260 * Maintain exclreq_count / wakeup as necessary to notify new 261 * entrants into ip. We continue to hold the fs_token so our 262 * EDEADLK retry loop should get its chance before another thread 263 * steals the lock. 264 */ 265 if (cursor->deadlk_node) { 266 #if 0 267 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) 268 ++ip->cursor_exclreq_count; 269 ++cursor->deadlk_node->cursor_exclreq_count; 270 #endif 271 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk"); 272 hammer_unlock(&cursor->deadlk_node->lock); 273 #if 0 274 if (--cursor->deadlk_node->cursor_exclreq_count == 0) 275 wakeup(&cursor->deadlk_node->cursor_exclreq_count); 276 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) { 277 if (--ip->cursor_exclreq_count == 0) 278 wakeup(&ip->cursor_exclreq_count); 279 } 280 #endif 281 hammer_rel_node(cursor->deadlk_node); 282 cursor->deadlk_node = NULL; 283 } 284 if (cursor->deadlk_rec) { 285 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr"); 286 hammer_rel_mem_record(cursor->deadlk_rec); 287 cursor->deadlk_rec = NULL; 288 } 289 290 cursor->data = NULL; 291 cursor->leaf = NULL; 292 cursor->left_bound = NULL; 293 cursor->right_bound = NULL; 294 cursor->trans = NULL; 295 } 296 297 /* 298 * Upgrade cursor->node and cursor->parent to exclusive locks. This 299 * function can return EDEADLK. 300 * 301 * The lock must already be either held shared or already held exclusively 302 * by us. 303 * 304 * We upgrade the parent first as it is the most likely to collide first 305 * with the downward traversal that the frontend typically does. 306 * 307 * If we fail to upgrade the lock and cursor->deadlk_node is NULL, 308 * we add another reference to the node that failed and set 309 * cursor->deadlk_node so hammer_done_cursor() can block on it. 310 */ 311 int 312 hammer_cursor_upgrade(hammer_cursor_t cursor) 313 { 314 int error; 315 316 if (cursor->parent) { 317 error = hammer_lock_upgrade(&cursor->parent->lock, 1); 318 if (error && cursor->deadlk_node == NULL) { 319 cursor->deadlk_node = cursor->parent; 320 hammer_ref_node(cursor->deadlk_node); 321 } 322 } else { 323 error = 0; 324 } 325 if (error == 0) { 326 error = hammer_lock_upgrade(&cursor->node->lock, 1); 327 if (error && cursor->deadlk_node == NULL) { 328 cursor->deadlk_node = cursor->node; 329 hammer_ref_node(cursor->deadlk_node); 330 } 331 } 332 #if 0 333 error = hammer_lock_upgrade(&cursor->node->lock, 1); 334 if (error && cursor->deadlk_node == NULL) { 335 cursor->deadlk_node = cursor->node; 336 hammer_ref_node(cursor->deadlk_node); 337 } else if (error == 0 && cursor->parent) { 338 error = hammer_lock_upgrade(&cursor->parent->lock, 1); 339 if (error && cursor->deadlk_node == NULL) { 340 cursor->deadlk_node = cursor->parent; 341 hammer_ref_node(cursor->deadlk_node); 342 } 343 } 344 #endif 345 return(error); 346 } 347 348 int 349 hammer_cursor_upgrade_node(hammer_cursor_t cursor) 350 { 351 int error; 352 353 error = hammer_lock_upgrade(&cursor->node->lock, 1); 354 if (error && cursor->deadlk_node == NULL) { 355 cursor->deadlk_node = cursor->node; 356 hammer_ref_node(cursor->deadlk_node); 357 } 358 return(error); 359 } 360 361 /* 362 * Downgrade cursor->node and cursor->parent to shared locks. This 363 * function can return EDEADLK. 364 */ 365 void 366 hammer_cursor_downgrade(hammer_cursor_t cursor) 367 { 368 if (hammer_lock_excl_owned(&cursor->node->lock, curthread)) 369 hammer_lock_downgrade(&cursor->node->lock, 1); 370 if (cursor->parent && 371 hammer_lock_excl_owned(&cursor->parent->lock, curthread)) { 372 hammer_lock_downgrade(&cursor->parent->lock, 1); 373 } 374 } 375 376 /* 377 * Upgrade and downgrade pairs of cursors. This is used by the dedup 378 * code which must deal with two cursors. A special function is needed 379 * because some of the nodes may be shared between the two cursors, 380 * resulting in share counts > 1 which will normally cause an upgrade 381 * to fail. 382 */ 383 static __noinline 384 int 385 collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node) 386 { 387 int i; 388 389 for (i = 0; i < n; ++i) { 390 if (array[i] == node) 391 break; 392 } 393 if (i == n) { 394 array[i] = node; 395 counts[i] = 1; 396 ++i; 397 } else { 398 ++counts[i]; 399 } 400 return(i); 401 } 402 403 int 404 hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 405 { 406 hammer_node_t nodes[4]; 407 int counts[4]; 408 int error; 409 int i; 410 int n; 411 412 n = collect_node(nodes, counts, 0, cursor1->node); 413 if (cursor1->parent) 414 n = collect_node(nodes, counts, n, cursor1->parent); 415 n = collect_node(nodes, counts, n, cursor2->node); 416 if (cursor2->parent) 417 n = collect_node(nodes, counts, n, cursor2->parent); 418 419 error = 0; 420 for (i = 0; i < n; ++i) { 421 error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]); 422 if (error) 423 break; 424 } 425 if (error) { 426 while (--i >= 0) 427 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 428 } 429 return (error); 430 } 431 432 void 433 hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 434 { 435 hammer_node_t nodes[4]; 436 int counts[4]; 437 int i; 438 int n; 439 440 n = collect_node(nodes, counts, 0, cursor1->node); 441 if (cursor1->parent) 442 n = collect_node(nodes, counts, n, cursor1->parent); 443 n = collect_node(nodes, counts, n, cursor2->node); 444 if (cursor2->parent) 445 n = collect_node(nodes, counts, n, cursor2->parent); 446 447 for (i = 0; i < n; ++i) 448 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 449 } 450 451 /* 452 * Seek the cursor to the specified node and index. 453 * 454 * The caller must ref the node prior to calling this routine and release 455 * it after it returns. If the seek succeeds the cursor will gain its own 456 * ref on the node. 457 */ 458 int 459 hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index) 460 { 461 int error; 462 463 hammer_cursor_downgrade(cursor); 464 error = 0; 465 466 if (cursor->node != node) { 467 hammer_unlock(&cursor->node->lock); 468 hammer_rel_node(cursor->node); 469 cursor->node = node; 470 hammer_ref_node(node); 471 hammer_lock_sh(&node->lock); 472 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0); 473 474 if (cursor->parent) { 475 hammer_unlock(&cursor->parent->lock); 476 hammer_rel_node(cursor->parent); 477 cursor->parent = NULL; 478 cursor->parent_index = 0; 479 } 480 error = hammer_load_cursor_parent(cursor, 0); 481 } 482 cursor->index = index; 483 return (error); 484 } 485 486 /* 487 * Load the parent of cursor->node into cursor->parent. 488 */ 489 static 490 int 491 hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive) 492 { 493 hammer_mount_t hmp; 494 hammer_node_t parent; 495 hammer_node_t node; 496 hammer_btree_elm_t elm; 497 int error; 498 int parent_index; 499 500 hmp = cursor->trans->hmp; 501 502 if (cursor->node->ondisk->parent) { 503 node = cursor->node; 504 parent = hammer_btree_get_parent(cursor->trans, node, 505 &parent_index, 506 &error, try_exclusive); 507 if (error == 0) { 508 elm = &parent->ondisk->elms[parent_index]; 509 cursor->parent = parent; 510 cursor->parent_index = parent_index; 511 cursor->left_bound = &elm[0].internal.base; 512 cursor->right_bound = &elm[1].internal.base; 513 } 514 } else { 515 cursor->parent = NULL; 516 cursor->parent_index = 0; 517 cursor->left_bound = &hmp->root_btree_beg; 518 cursor->right_bound = &hmp->root_btree_end; 519 error = 0; 520 } 521 return(error); 522 } 523 524 /* 525 * Cursor up to our parent node. Return ENOENT if we are at the root of 526 * the filesystem. 527 */ 528 int 529 hammer_cursor_up(hammer_cursor_t cursor) 530 { 531 int error; 532 533 hammer_cursor_downgrade(cursor); 534 535 /* 536 * If the parent is NULL we are at the root of the B-Tree and 537 * return ENOENT. 538 */ 539 if (cursor->parent == NULL) 540 return (ENOENT); 541 542 /* 543 * Set the node to its parent. 544 */ 545 hammer_unlock(&cursor->node->lock); 546 hammer_rel_node(cursor->node); 547 cursor->node = cursor->parent; 548 cursor->index = cursor->parent_index; 549 cursor->parent = NULL; 550 cursor->parent_index = 0; 551 552 error = hammer_load_cursor_parent(cursor, 0); 553 return(error); 554 } 555 556 /* 557 * Special cursor up given a locked cursor. The orignal node is not 558 * unlocked or released and the cursor is not downgraded. 559 * 560 * This function can fail with EDEADLK. 561 * 562 * This function is only run when recursively deleting parent nodes 563 * to get rid of an empty leaf. 564 */ 565 int 566 hammer_cursor_up_locked(hammer_cursor_t cursor) 567 { 568 hammer_node_t save; 569 int error; 570 int save_index; 571 572 /* 573 * If the parent is NULL we are at the root of the B-Tree and 574 * return ENOENT. 575 */ 576 if (cursor->parent == NULL) 577 return (ENOENT); 578 579 save = cursor->node; 580 save_index = cursor->index; 581 582 /* 583 * Set the node to its parent. 584 */ 585 cursor->node = cursor->parent; 586 cursor->index = cursor->parent_index; 587 cursor->parent = NULL; 588 cursor->parent_index = 0; 589 590 /* 591 * load the new parent, attempt to exclusively lock it. Note that 592 * we are still holding the old parent (now cursor->node) exclusively 593 * locked. 594 * 595 * This can return EDEADLK. Undo the operation on any error. These 596 * up sequences can occur during iterations so be sure to restore 597 * the index. 598 */ 599 error = hammer_load_cursor_parent(cursor, 1); 600 if (error) { 601 cursor->parent = cursor->node; 602 cursor->parent_index = cursor->index; 603 cursor->node = save; 604 cursor->index = save_index; 605 } 606 return(error); 607 } 608 609 610 /* 611 * Cursor down through the current node, which must be an internal node. 612 * 613 * This routine adjusts the cursor and sets index to 0. 614 */ 615 int 616 hammer_cursor_down(hammer_cursor_t cursor) 617 { 618 hammer_node_t node; 619 hammer_btree_elm_t elm; 620 int error; 621 622 /* 623 * The current node becomes the current parent 624 */ 625 hammer_cursor_downgrade(cursor); 626 node = cursor->node; 627 KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count); 628 if (cursor->parent) { 629 hammer_unlock(&cursor->parent->lock); 630 hammer_rel_node(cursor->parent); 631 } 632 cursor->parent = node; 633 cursor->parent_index = cursor->index; 634 cursor->node = NULL; 635 cursor->index = 0; 636 637 /* 638 * Extract element to push into at (node,index), set bounds. 639 */ 640 elm = &node->ondisk->elms[cursor->parent_index]; 641 642 /* 643 * Ok, push down into elm. If elm specifies an internal or leaf 644 * node the current node must be an internal node. If elm specifies 645 * a spike then the current node must be a leaf node. 646 */ 647 switch(elm->base.btype) { 648 case HAMMER_BTREE_TYPE_INTERNAL: 649 case HAMMER_BTREE_TYPE_LEAF: 650 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 651 KKASSERT(elm->internal.subtree_offset != 0); 652 cursor->left_bound = &elm[0].internal.base; 653 cursor->right_bound = &elm[1].internal.base; 654 node = hammer_get_node(cursor->trans, 655 elm->internal.subtree_offset, 0, &error); 656 if (error == 0) { 657 KASSERT(elm->base.btype == node->ondisk->type, ("BTYPE MISMATCH %c %c NODE %p", elm->base.btype, node->ondisk->type, node)); 658 if (node->ondisk->parent != cursor->parent->node_offset) 659 panic("node %p %016llx vs %016llx", node, (long long)node->ondisk->parent, (long long)cursor->parent->node_offset); 660 KKASSERT(node->ondisk->parent == cursor->parent->node_offset); 661 } 662 break; 663 default: 664 panic("hammer_cursor_down: illegal btype %02x (%c)", 665 elm->base.btype, 666 (elm->base.btype ? elm->base.btype : '?')); 667 break; 668 } 669 670 /* 671 * If no error occured we can lock the new child node. If the 672 * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second) 673 * for the deadlock to clear. Otherwise a large number of concurrent 674 * readers can continuously stall the flusher. 675 * 676 * We specifically do this in the cursor_down() code in order to 677 * deal with frontend top-down searches smashing against bottom-up 678 * flusher-based mirror updates. These collisions typically occur 679 * above the inode in the B-Tree and are not covered by the 680 * ip->cursor_exclreq_count logic. 681 */ 682 if (error == 0) { 683 #if 0 684 if (node->cursor_exclreq_count && 685 cursor->trans->type != HAMMER_TRANS_FLS) { 686 tsleep(&node->cursor_exclreq_count, 0, 687 "hstag2", hammer_tdmux_ticks); 688 } 689 #endif 690 hammer_lock_sh(&node->lock); 691 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0); 692 cursor->node = node; 693 cursor->index = 0; 694 } 695 return(error); 696 } 697 698 /************************************************************************ 699 * DEADLOCK RECOVERY * 700 ************************************************************************ 701 * 702 * These are the new deadlock recovery functions. Currently they are only 703 * used for the mirror propagation and physical node removal cases but 704 * ultimately the intention is to use them for all deadlock recovery 705 * operations. 706 * 707 * WARNING! The contents of the cursor may be modified while unlocked. 708 * passive modifications including adjusting the node, parent, 709 * indexes, and leaf pointer. 710 * 711 * An outright removal of the element the cursor was pointing at 712 * will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set, 713 * which chains to causing the HAMMER_CURSOR_RETEST to be set 714 * when the cursor is locked again. 715 */ 716 void 717 hammer_unlock_cursor(hammer_cursor_t cursor) 718 { 719 hammer_node_t node; 720 721 KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0); 722 KKASSERT(cursor->node); 723 724 /* 725 * Release the cursor's locks and track B-Tree operations on node. 726 * While being tracked our cursor can be modified by other threads 727 * and the node may be replaced. 728 */ 729 if (cursor->parent) { 730 hammer_unlock(&cursor->parent->lock); 731 hammer_rel_node(cursor->parent); 732 cursor->parent = NULL; 733 } 734 node = cursor->node; 735 cursor->flags |= HAMMER_CURSOR_TRACKED; 736 TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry); 737 hammer_unlock(&node->lock); 738 } 739 740 /* 741 * Get the cursor heated up again. The cursor's node may have 742 * changed and we might have to locate the new parent. 743 * 744 * If the exact element we were on got deleted RIPOUT will be 745 * set and we must clear ATEDISK so an iteration does not skip 746 * the element after it. 747 */ 748 int 749 hammer_lock_cursor(hammer_cursor_t cursor) 750 { 751 hammer_node_t node; 752 int error; 753 754 KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED); 755 756 /* 757 * Relock the node 758 */ 759 for (;;) { 760 node = cursor->node; 761 hammer_ref_node(node); 762 hammer_lock_sh(&node->lock); 763 if (cursor->node == node) { 764 hammer_rel_node(node); 765 break; 766 } 767 hammer_unlock(&node->lock); 768 hammer_rel_node(node); 769 } 770 771 /* 772 * Untrack the cursor, clean up, and re-establish the parent node. 773 */ 774 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry); 775 cursor->flags &= ~HAMMER_CURSOR_TRACKED; 776 777 /* 778 * If a ripout has occured iterations must re-test the (new) 779 * current element. Clearing ATEDISK prevents the element from 780 * being skipped and RETEST causes it to be re-tested. 781 */ 782 if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) { 783 cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT; 784 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 785 cursor->flags |= HAMMER_CURSOR_RETEST; 786 } 787 error = hammer_load_cursor_parent(cursor, 0); 788 return(error); 789 } 790 791 /* 792 * Recover from a deadlocked cursor, tracking any node removals or 793 * replacements. If the cursor's current node is removed by another 794 * thread (via btree_remove()) the cursor will be seeked upwards. 795 * 796 * The caller is working a modifying operation and must be holding the 797 * sync lock (shared). We do not release the sync lock because this 798 * would break atomicy. 799 */ 800 int 801 hammer_recover_cursor(hammer_cursor_t cursor) 802 { 803 hammer_transaction_t trans __debugvar; 804 #if 0 805 hammer_inode_t ip; 806 #endif 807 int error; 808 809 hammer_unlock_cursor(cursor); 810 811 #if 0 812 ip = cursor->ip; 813 #endif 814 trans = cursor->trans; 815 KKASSERT(trans->sync_lock_refs > 0); 816 817 /* 818 * Wait for the deadlock to clear. 819 * 820 * Maintain exclreq_count / wakeup as necessary to notify new 821 * entrants into ip. We continue to hold the fs_token so our 822 * EDEADLK retry loop should get its chance before another thread 823 * steals the lock. 824 */ 825 if (cursor->deadlk_node) { 826 #if 0 827 if (ip && trans->type == HAMMER_TRANS_FLS) 828 ++ip->cursor_exclreq_count; 829 ++cursor->deadlk_node->cursor_exclreq_count; 830 #endif 831 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk"); 832 hammer_unlock(&cursor->deadlk_node->lock); 833 #if 0 834 if (--cursor->deadlk_node->cursor_exclreq_count == 0) 835 wakeup(&cursor->deadlk_node->cursor_exclreq_count); 836 if (ip && trans->type == HAMMER_TRANS_FLS) { 837 if (--ip->cursor_exclreq_count == 0) 838 wakeup(&ip->cursor_exclreq_count); 839 } 840 #endif 841 hammer_rel_node(cursor->deadlk_node); 842 cursor->deadlk_node = NULL; 843 } 844 if (cursor->deadlk_rec) { 845 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr"); 846 hammer_rel_mem_record(cursor->deadlk_rec); 847 cursor->deadlk_rec = NULL; 848 } 849 error = hammer_lock_cursor(cursor); 850 return(error); 851 } 852 853 /* 854 * Dup ocursor to ncursor. ncursor inherits ocursor's locks and ocursor 855 * is effectively unlocked and becomes tracked. If ocursor was not locked 856 * then ncursor also inherits the tracking. 857 * 858 * After the caller finishes working with ncursor it must be cleaned up 859 * with hammer_done_cursor(), and the caller must re-lock ocursor. 860 */ 861 hammer_cursor_t 862 hammer_push_cursor(hammer_cursor_t ocursor) 863 { 864 hammer_cursor_t ncursor; 865 hammer_inode_t ip; 866 hammer_node_t node; 867 hammer_mount_t hmp; 868 869 hmp = ocursor->trans->hmp; 870 ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO); 871 bcopy(ocursor, ncursor, sizeof(*ocursor)); 872 873 node = ocursor->node; 874 hammer_ref_node(node); 875 if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) { 876 ocursor->flags |= HAMMER_CURSOR_TRACKED; 877 TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry); 878 } 879 if (ncursor->parent) 880 ocursor->parent = NULL; 881 ocursor->data_buffer = NULL; 882 ocursor->leaf = NULL; 883 ocursor->data = NULL; 884 if (ncursor->flags & HAMMER_CURSOR_TRACKED) 885 TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry); 886 if ((ip = ncursor->ip) != NULL) { 887 ++ip->cursor_ip_refs; 888 } 889 if (ncursor->iprec) 890 hammer_ref(&ncursor->iprec->lock); 891 return(ncursor); 892 } 893 894 /* 895 * Destroy ncursor and restore ocursor 896 * 897 * This is a temporary hack for the release. We can't afford to lose 898 * the IP lock until the IP object scan code is able to deal with it, 899 * so have ocursor inherit it back. 900 */ 901 void 902 hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor) 903 { 904 hammer_mount_t hmp; 905 hammer_inode_t ip; 906 907 hmp = ncursor->trans->hmp; 908 ip = ncursor->ip; 909 ncursor->ip = NULL; 910 if (ip) 911 --ip->cursor_ip_refs; 912 hammer_done_cursor(ncursor); 913 kfree(ncursor, hmp->m_misc); 914 KKASSERT(ocursor->ip == ip); 915 hammer_lock_cursor(ocursor); 916 } 917 918 /* 919 * onode is being replaced by nnode by the reblocking code. 920 */ 921 void 922 hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode) 923 { 924 hammer_cursor_t cursor; 925 hammer_node_ondisk_t ondisk; 926 hammer_node_ondisk_t nndisk; 927 928 ondisk = onode->ondisk; 929 nndisk = nnode->ondisk; 930 931 while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) { 932 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 933 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 934 KKASSERT(cursor->node == onode); 935 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 936 cursor->leaf = &nndisk->elms[cursor->index].leaf; 937 cursor->node = nnode; 938 hammer_ref_node(nnode); 939 hammer_rel_node(onode); 940 } 941 } 942 943 /* 944 * We have removed <node> from the parent and collapsed the parent. 945 * 946 * Cursors in deadlock recovery are seeked upward to the parent so the 947 * btree_remove() recursion works properly even though we have marked 948 * the cursor as requiring a reseek. 949 * 950 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK, 951 * meaning the cursor is no longer definitively pointing at an element 952 * within its iteration (if the cursor is being used to iterate). The 953 * iteration code will take this into account instead of asserting if the 954 * cursor is outside the iteration range. 955 */ 956 void 957 hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index) 958 { 959 hammer_cursor_t cursor; 960 hammer_node_ondisk_t ondisk; 961 962 KKASSERT(parent != NULL); 963 ondisk = node->ondisk; 964 965 while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) { 966 KKASSERT(cursor->node == node); 967 KKASSERT(cursor->index == 0); 968 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry); 969 TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry); 970 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 971 cursor->leaf = NULL; 972 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; 973 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK; 974 cursor->node = parent; 975 cursor->index = index; 976 hammer_ref_node(parent); 977 hammer_rel_node(node); 978 } 979 } 980 981 /* 982 * node was split at (onode, index) with elements >= index moved to nnode. 983 */ 984 void 985 hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index) 986 { 987 hammer_cursor_t cursor; 988 hammer_node_ondisk_t ondisk; 989 hammer_node_ondisk_t nndisk; 990 991 ondisk = onode->ondisk; 992 nndisk = nnode->ondisk; 993 994 again: 995 TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { 996 KKASSERT(cursor->node == onode); 997 if (cursor->index < index) 998 continue; 999 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 1000 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1001 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1002 cursor->leaf = &nndisk->elms[cursor->index - index].leaf; 1003 cursor->node = nnode; 1004 cursor->index -= index; 1005 hammer_ref_node(nnode); 1006 hammer_rel_node(onode); 1007 goto again; 1008 } 1009 } 1010 1011 /* 1012 * An element was moved from one node to another or within a node. The 1013 * index may also represent the end of the node (index == numelements). 1014 * 1015 * {oparent,pindex} is the parent node's pointer to onode/oindex. 1016 * 1017 * This is used by the rebalancing code. This is not an insertion or 1018 * deletion and any additional elements, including the degenerate case at 1019 * the end of the node, will be dealt with by additional distinct calls. 1020 */ 1021 void 1022 hammer_cursor_moved_element(hammer_node_t oparent, int pindex, 1023 hammer_node_t onode, int oindex, 1024 hammer_node_t nnode, int nindex) 1025 { 1026 hammer_cursor_t cursor; 1027 hammer_node_ondisk_t ondisk; 1028 hammer_node_ondisk_t nndisk; 1029 1030 /* 1031 * Adjust any cursors pointing at the element 1032 */ 1033 ondisk = onode->ondisk; 1034 nndisk = nnode->ondisk; 1035 again1: 1036 TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { 1037 KKASSERT(cursor->node == onode); 1038 if (cursor->index != oindex) 1039 continue; 1040 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); 1041 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1042 if (cursor->leaf == &ondisk->elms[oindex].leaf) 1043 cursor->leaf = &nndisk->elms[nindex].leaf; 1044 cursor->node = nnode; 1045 cursor->index = nindex; 1046 hammer_ref_node(nnode); 1047 hammer_rel_node(onode); 1048 goto again1; 1049 } 1050 1051 /* 1052 * When moving the first element of onode to a different node any 1053 * cursor which is pointing at (oparent,pindex) must be repointed 1054 * to nnode and ATEDISK must be cleared. 1055 * 1056 * This prevents cursors from losing track due to insertions. 1057 * Insertions temporarily release the cursor in order to update 1058 * the mirror_tids. It primarily effects the mirror_write code. 1059 * The other code paths generally only do a single insertion and 1060 * then relookup or drop the cursor. 1061 */ 1062 if (onode == nnode || oindex) 1063 return; 1064 ondisk = oparent->ondisk; 1065 again2: 1066 TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) { 1067 KKASSERT(cursor->node == oparent); 1068 if (cursor->index != pindex) 1069 continue; 1070 kprintf("HAMMER debug: shifted cursor pointing at parent\n" 1071 "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n", 1072 (intmax_t)oparent->node_offset, pindex, 1073 (intmax_t)onode->node_offset, oindex, 1074 (intmax_t)nnode->node_offset, nindex); 1075 TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry); 1076 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); 1077 if (cursor->leaf == &ondisk->elms[oindex].leaf) 1078 cursor->leaf = &nndisk->elms[nindex].leaf; 1079 cursor->node = nnode; 1080 cursor->index = nindex; 1081 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1082 hammer_ref_node(nnode); 1083 hammer_rel_node(oparent); 1084 goto again2; 1085 } 1086 } 1087 1088 /* 1089 * The B-Tree element pointing to the specified node was moved from (oparent) 1090 * to (nparent, nindex). We must locate any tracked cursors pointing at 1091 * node and adjust their parent accordingly. 1092 * 1093 * This is used by the rebalancing code when packing elements causes an 1094 * element to shift from one node to another. 1095 */ 1096 void 1097 hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent, 1098 hammer_node_t nparent, int nindex) 1099 { 1100 hammer_cursor_t cursor; 1101 1102 again: 1103 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1104 KKASSERT(cursor->node == node); 1105 if (cursor->parent == oparent) { 1106 cursor->parent = nparent; 1107 cursor->parent_index = nindex; 1108 hammer_ref_node(nparent); 1109 hammer_rel_node(oparent); 1110 goto again; 1111 } 1112 } 1113 } 1114 1115 /* 1116 * Deleted element at (node, index) 1117 * 1118 * Shift indexes >= index 1119 */ 1120 void 1121 hammer_cursor_deleted_element(hammer_node_t node, int index) 1122 { 1123 hammer_cursor_t cursor; 1124 hammer_node_ondisk_t ondisk; 1125 1126 ondisk = node->ondisk; 1127 1128 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1129 KKASSERT(cursor->node == node); 1130 if (cursor->index == index) { 1131 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; 1132 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1133 cursor->leaf = NULL; 1134 } else if (cursor->index > index) { 1135 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1136 cursor->leaf = &ondisk->elms[cursor->index - 1].leaf; 1137 --cursor->index; 1138 } 1139 } 1140 } 1141 1142 /* 1143 * Inserted element at (node, index) 1144 * 1145 * Shift indexes >= index 1146 */ 1147 void 1148 hammer_cursor_inserted_element(hammer_node_t node, int index) 1149 { 1150 hammer_cursor_t cursor; 1151 hammer_node_ondisk_t ondisk; 1152 1153 ondisk = node->ondisk; 1154 1155 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1156 KKASSERT(cursor->node == node); 1157 if (cursor->index >= index) { 1158 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1159 cursor->leaf = &ondisk->elms[cursor->index + 1].leaf; 1160 ++cursor->index; 1161 } 1162 } 1163 } 1164 1165 /* 1166 * Invalidate the cached data buffer associated with a cursor. 1167 * 1168 * This needs to be done when the underlying block is being freed or 1169 * the referenced buffer can prevent the related buffer cache buffer 1170 * from being properly invalidated. 1171 */ 1172 void 1173 hammer_cursor_invalidate_cache(hammer_cursor_t cursor) 1174 { 1175 if (cursor->data_buffer) { 1176 hammer_rel_buffer(cursor->data_buffer, 0); 1177 cursor->data_buffer = NULL; 1178 cursor->data = NULL; 1179 } 1180 } 1181 1182