1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.42 2008/08/06 15:38:58 dillon Exp $
 */

/*
 * HAMMER B-Tree index - cursor support routines
 */
#include "hammer.h"

static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);

/*
 * Initialize a fresh cursor using the B-Tree node cache.  If the cache
 * is not available initialize a fresh cursor at the root of the filesystem.
 *
 * The cursor is zeroed first so any prior contents are discarded.  On
 * success cursor->node is referenced and share-locked, the related inode
 * (if any) is locked, and the parent has been loaded.  Returns 0 or an
 * errno-style error code.
 */
int
hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
		   hammer_node_cache_t cache, hammer_inode_t ip)
{
	hammer_volume_t volume;
	hammer_node_t node;
	hammer_mount_t hmp;
	u_int tticks;
	int error;

	bzero(cursor, sizeof(*cursor));

	cursor->trans = trans;
	hmp = trans->hmp;

	/*
	 * As the number of inodes queued to the flusher increases we use
	 * time-domain multiplexing to control read vs flush performance.
	 * We have to do it here, before acquiring any ip or node locks,
	 * to avoid deadlocking or excessively delaying the flusher.
	 *
	 * The full time period is hammer_tdmux_ticks, typically 1/5 of
	 * a second.
	 *
	 * inode allocation begins to get restrained at 2/4 the limit
	 * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
	 * limiting read activity before that to try to avoid processes
	 * stalling out in "hmrrcm".
	 */
	tticks = hammer_tdmux_ticks;
	if (trans->type != HAMMER_TRANS_FLS && tticks &&
	    hmp->count_reclaims > hammer_limit_reclaims / tticks &&
	    hmp->count_reclaims > hammer_autoflush * 2 &&
	    hammer_flusher_running(hmp)) {
		u_int rticks;
		u_int xticks;
		u_int dummy;

		/*
		 * 0 ... xticks ... tticks
		 *
		 * rticks is the calculated position, xticks is the demarc
		 * where values below xticks are reserved for the flusher
		 * and values >= to xticks may be used by the frontend.
		 *
		 * At least one tick is always made available for the
		 * frontend.
		 */
		rticks = (u_int)ticks % tticks;
		xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;

		/*
		 * Ensure rticks and xticks are stable
		 */
		cpu_ccfence();
		if (rticks < xticks) {
			if (hammer_debug_general & 0x0004)
				kprintf("rt %3u, xt %3u, tt %3u\n",
					rticks, xticks, tticks);
			/*
			 * &dummy is a private wchan; nothing wakes us up
			 * early, only the timeout fires.
			 */
			tsleep(&dummy, 0, "htdmux", xticks - rticks);
		}
	}

	/*
	 * If the cursor operation is on behalf of an inode, lock
	 * the inode.  The flusher takes it exclusively, the frontend
	 * shared.
	 *
	 * When acquiring a shared lock on an inode on which the backend
	 * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second)
	 * for the deadlock to clear.
	 */
	if ((cursor->ip = ip) != NULL) {
		++ip->cursor_ip_refs;
		if (trans->type == HAMMER_TRANS_FLS) {
			hammer_lock_ex(&ip->lock);
		} else {
#if 0
			if (ip->cursor_exclreq_count) {
				tsleep(&ip->cursor_exclreq_count, 0,
				       "hstag1", hammer_tdmux_ticks);
			}
#endif
			hammer_lock_sh(&ip->lock);
		}
	}

	/*
	 * Step 1 - acquire a locked node from the cache if possible.
	 * A cached node which was deleted out from under us is discarded.
	 */
	if (cache && cache->node) {
		node = hammer_ref_node_safe(trans, cache, &error);
		if (error == 0) {
			hammer_lock_sh(&node->lock);
			if (node->flags & HAMMER_NODE_DELETED) {
				hammer_unlock(&node->lock);
				hammer_rel_node(node);
				node = NULL;
			}
		}
		if (node == NULL)
			++hammer_stats_btree_root_iterations;
	} else {
		node = NULL;
		++hammer_stats_btree_root_iterations;
	}

	/*
	 * Step 2 - If we couldn't get a node from the cache, get
	 * the one from the root of the filesystem.
	 */
	while (node == NULL) {
		volume = hammer_get_root_volume(hmp, &error);
		if (error)
			break;
		node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
				       0, &error);
		/*
		 * NOTE(review): volume->ondisk is re-read further below
		 * after this release; presumably the root volume remains
		 * resident for the life of the mount -- verify.
		 */
		hammer_rel_volume(volume, 0);
		if (error)
			break;
		/*
		 * When the frontend acquires the root b-tree node while the
		 * backend is deadlocked on it, wait up to hammer_tdmux_ticks
		 * (1 second) for the deadlock to clear.
		 */
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag3", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);

		/*
		 * If someone got in before we could lock the node, retry.
		 */
		if (node->flags & HAMMER_NODE_DELETED) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
		if (volume->ondisk->vol0_btree_root != node->node_offset) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
	}

	/*
	 * Step 3 - finish initializing the cursor by acquiring the parent
	 */
	cursor->node = node;
	if (error == 0)
		error = hammer_load_cursor_parent(cursor, 0);
	KKASSERT(error == 0);
	/* if (error) hammer_done_cursor(cursor); */
	return(error);
}

/*
 * Normalize a cursor.  Sometimes cursors can be left in a state
 * where node is NULL.  If the cursor is in this state, cursor up.
 */
void
hammer_normalize_cursor(hammer_cursor_t cursor)
{
	if (cursor->node == NULL) {
		KKASSERT(cursor->parent != NULL);
		hammer_cursor_up(cursor);
	}
}


/*
 * We are finished with a cursor.  We NULL out various fields as sanity
 * check, in case the structure is inappropriately used afterwards.
 */
void
hammer_done_cursor(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);

	/*
	 * Release the parent and node locks/refs, the cached data buffer,
	 * the inode lock, and any in-memory record held by the cursor.
	 */
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	if (cursor->node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = NULL;
	}
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
	}
	if ((ip = cursor->ip) != NULL) {
		KKASSERT(ip->cursor_ip_refs > 0);
		--ip->cursor_ip_refs;
		hammer_unlock(&ip->lock);
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * If we deadlocked this node will be referenced.  Do a quick
	 * lock/unlock to wait for the deadlock condition to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}

	cursor->data = NULL;
	cursor->leaf = NULL;
	cursor->left_bound = NULL;
	cursor->right_bound = NULL;
	cursor->trans = NULL;
}

/*
 * Upgrade cursor->node and cursor->parent to exclusive locks.  This
 * function can return EDEADLK.
 *
 * The lock must already be either held shared or already held exclusively
 * by us.
 *
 * We upgrade the parent first as it is the most likely to collide first
 * with the downward traversal that the frontend typically does.
 *
 * If we fail to upgrade the lock and cursor->deadlk_node is NULL,
 * we add another reference to the node that failed and set
 * cursor->deadlk_node so hammer_done_cursor() can block on it.
 */
int
hammer_cursor_upgrade(hammer_cursor_t cursor)
{
	int error;

	/*
	 * Parent first (see comment above), then the node itself.  The
	 * first failure is recorded in deadlk_node (with an extra ref)
	 * for the caller's EDEADLK recovery in hammer_done_cursor().
	 */
	if (cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	} else {
		error = 0;
	}
	if (error == 0) {
		error = hammer_lock_upgrade(&cursor->node->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->node;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#if 0
	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	} else if (error == 0 && cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#endif
	return(error);
}

/*
 * Upgrade only cursor->node to an exclusive lock.  Can return EDEADLK,
 * in which case deadlk_node is set for recovery as above.
 */
int
hammer_cursor_upgrade_node(hammer_cursor_t cursor)
{
	int error;

	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	}
	return(error);
}

/*
 * Downgrade cursor->node and cursor->parent to shared locks.  Locks
 * which we do not exclusively own are left alone.
 */
void
hammer_cursor_downgrade(hammer_cursor_t cursor)
{
	if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
		hammer_lock_downgrade(&cursor->node->lock, 1);
	if (cursor->parent &&
	    hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
		hammer_lock_downgrade(&cursor->parent->lock, 1);
	}
}

/*
 * Upgrade and downgrade pairs of cursors.  This is used by the dedup
 * code which must deal with two cursors.
A special function is needed 381 * because some of the nodes may be shared between the two cursors, 382 * resulting in share counts > 1 which will normally cause an upgrade 383 * to fail. 384 */ 385 static __noinline 386 int 387 collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node) 388 { 389 int i; 390 391 for (i = 0; i < n; ++i) { 392 if (array[i] == node) 393 break; 394 } 395 if (i == n) { 396 array[i] = node; 397 counts[i] = 1; 398 ++i; 399 } else { 400 ++counts[i]; 401 } 402 return(i); 403 } 404 405 int 406 hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 407 { 408 hammer_node_t nodes[4]; 409 int counts[4]; 410 int error; 411 int i; 412 int n; 413 414 n = collect_node(nodes, counts, 0, cursor1->node); 415 if (cursor1->parent) 416 n = collect_node(nodes, counts, n, cursor1->parent); 417 n = collect_node(nodes, counts, n, cursor2->node); 418 if (cursor2->parent) 419 n = collect_node(nodes, counts, n, cursor2->parent); 420 421 error = 0; 422 for (i = 0; i < n; ++i) { 423 error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]); 424 if (error) 425 break; 426 } 427 if (error) { 428 while (--i >= 0) 429 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 430 } 431 return (error); 432 } 433 434 void 435 hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) 436 { 437 hammer_node_t nodes[4]; 438 int counts[4]; 439 int i; 440 int n; 441 442 n = collect_node(nodes, counts, 0, cursor1->node); 443 if (cursor1->parent) 444 n = collect_node(nodes, counts, n, cursor1->parent); 445 n = collect_node(nodes, counts, n, cursor2->node); 446 if (cursor2->parent) 447 n = collect_node(nodes, counts, n, cursor2->parent); 448 449 for (i = 0; i < n; ++i) 450 hammer_lock_downgrade(&nodes[i]->lock, counts[i]); 451 } 452 453 /* 454 * Seek the cursor to the specified node and index. 455 * 456 * The caller must ref the node prior to calling this routine and release 457 * it after it returns. 
 * If the seek succeeds the cursor will gain its own
 * ref on the node.
 */
int
hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
{
	int error;

	hammer_cursor_downgrade(cursor);
	error = 0;

	if (cursor->node != node) {
		/*
		 * Swap in the new node, dropping our lock and ref on the
		 * old one, then reload the parent for the new position.
		 */
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);

		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
			cursor->parent = NULL;
			cursor->parent_index = 0;
		}
		error = hammer_load_cursor_parent(cursor, 0);
	}
	cursor->index = index;
	return (error);
}

/*
 * Load the parent of cursor->node into cursor->parent and set the
 * cursor's left/right boundary elements.  A node at the root of the
 * B-Tree has no parent and uses the mount-wide root bounds instead.
 */
static
int
hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
{
	hammer_mount_t hmp;
	hammer_node_t parent;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;
	int parent_index;

	hmp = cursor->trans->hmp;

	if (cursor->node->ondisk->parent) {
		node = cursor->node;
		parent = hammer_btree_get_parent(cursor->trans, node,
						 &parent_index,
						 &error, try_exclusive);
		if (error == 0) {
			elm = &parent->ondisk->elms[parent_index];
			cursor->parent = parent;
			cursor->parent_index = parent_index;
			cursor->left_bound = &elm[0].internal.base;
			cursor->right_bound = &elm[1].internal.base;
		}
	} else {
		cursor->parent = NULL;
		cursor->parent_index = 0;
		cursor->left_bound = &hmp->root_btree_beg;
		cursor->right_bound = &hmp->root_btree_end;
		error = 0;
	}
	return(error);
}

/*
 * Cursor up to our parent node.  Return ENOENT if we are at the root of
 * the filesystem.
 */
int
hammer_cursor_up(hammer_cursor_t cursor)
{
	int error;

	hammer_cursor_downgrade(cursor);

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/*
	 * Set the node to its parent.
	 */
	hammer_unlock(&cursor->node->lock);
	hammer_rel_node(cursor->node);
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}

/*
 * Special cursor up given a locked cursor.  The original node is not
 * unlocked or released and the cursor is not downgraded.
 *
 * This function can fail with EDEADLK.
 *
 * This function is only run when recursively deleting parent nodes
 * to get rid of an empty leaf.
 */
int
hammer_cursor_up_locked(hammer_cursor_t cursor)
{
	hammer_node_t save;
	int error;
	int save_index;

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/* remember the current position so we can back out on error */
	save = cursor->node;
	save_index = cursor->index;

	/*
	 * Set the node to its parent.
	 */
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	/*
	 * load the new parent, attempt to exclusively lock it.  Note that
	 * we are still holding the old parent (now cursor->node) exclusively
	 * locked.
	 *
	 * This can return EDEADLK.  Undo the operation on any error.  These
	 * up sequences can occur during iterations so be sure to restore
	 * the index.
	 */
	error = hammer_load_cursor_parent(cursor, 1);
	if (error) {
		cursor->parent = cursor->node;
		cursor->parent_index = cursor->index;
		cursor->node = save;
		cursor->index = save_index;
	}
	return(error);
}


/*
 * Cursor down through the current node, which must be an internal node.
 *
 * This routine adjusts the cursor and sets index to 0.
 */
int
hammer_cursor_down(hammer_cursor_t cursor)
{
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;

	/*
	 * The current node becomes the current parent
	 */
	hammer_cursor_downgrade(cursor);
	node = cursor->node;
	KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
	}
	cursor->parent = node;
	cursor->parent_index = cursor->index;
	cursor->node = NULL;
	cursor->index = 0;

	/*
	 * Extract element to push into at (node,index), set bounds.
	 */
	elm = &node->ondisk->elms[cursor->parent_index];

	/*
	 * Ok, push down into elm.  If elm specifies an internal or leaf
	 * node the current node must be an internal node.  If elm specifies
	 * a spike then the current node must be a leaf node.
	 */
	switch(elm->base.btype) {
	case HAMMER_BTREE_TYPE_INTERNAL:
	case HAMMER_BTREE_TYPE_LEAF:
		KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
		KKASSERT(elm->internal.subtree_offset != 0);
		cursor->left_bound = &elm[0].internal.base;
		cursor->right_bound = &elm[1].internal.base;
		node = hammer_get_node(cursor->trans,
				       elm->internal.subtree_offset, 0, &error);
		if (error == 0) {
			/* the child's recorded type and parent must agree */
			KASSERT(elm->base.btype == node->ondisk->type,
				("BTYPE MISMATCH %c %c NODE %p\n",
				 elm->base.btype, node->ondisk->type, node));
			if (node->ondisk->parent != cursor->parent->node_offset)
				panic("node %p %016llx vs %016llx\n",
				      node,
				      (long long)node->ondisk->parent,
				      (long long)cursor->parent->node_offset);
			KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
		}
		break;
	default:
		panic("hammer_cursor_down: illegal btype %02x (%c)\n",
		      elm->base.btype,
		      (elm->base.btype ? elm->base.btype : '?'));
		break;
	}

	/*
	 * If no error occurred we can lock the new child node.  If the
	 * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second)
	 * for the deadlock to clear.  Otherwise a large number of concurrent
	 * readers can continuously stall the flusher.
	 *
	 * We specifically do this in the cursor_down() code in order to
	 * deal with frontend top-down searches smashing against bottom-up
	 * flusher-based mirror updates.  These collisions typically occur
	 * above the inode in the B-Tree and are not covered by the
	 * ip->cursor_exclreq_count logic.
	 */
	if (error == 0) {
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag2", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);
		KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
		cursor->node = node;
		cursor->index = 0;
	}
	return(error);
}

/************************************************************************
 *				DEADLOCK RECOVERY			*
 ************************************************************************
 *
 * These are the new deadlock recovery functions.  Currently they are only
 * used for the mirror propagation and physical node removal cases but
 * ultimately the intention is to use them for all deadlock recovery
 * operations.
 *
 * WARNING!  The contents of the cursor may be modified while unlocked.
 *	     passive modifications including adjusting the node, parent,
 *	     indexes, and leaf pointer.
 *
 *	     An outright removal of the element the cursor was pointing at
 *	     will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
 *	     which chains to causing the HAMMER_CURSOR_RETEST to be set
 *	     when the cursor is locked again.
 */
void
hammer_unlock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	KKASSERT(cursor->node);

	/*
	 * Release the cursor's locks and track B-Tree operations on node.
	 * While being tracked our cursor can be modified by other threads
	 * and the node may be replaced.
	 */
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	node = cursor->node;
	cursor->flags |= HAMMER_CURSOR_TRACKED;
	TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
	hammer_unlock(&node->lock);
}

/*
 * Get the cursor heated up again.
 * The cursor's node may have
 * changed and we might have to locate the new parent.
 *
 * If the exact element we were on got deleted RIPOUT will be
 * set and we must clear ATEDISK so an iteration does not skip
 * the element after it.
 */
int
hammer_lock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);

	/*
	 * Relock the node.  cursor->node can be changed by other threads
	 * while we sleep on the lock, so loop until the pointer is stable
	 * across the lock acquisition.
	 */
	for (;;) {
		node = cursor->node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		if (cursor->node == node) {
			/* drop our temporary ref, keep the lock */
			hammer_rel_node(node);
			break;
		}
		hammer_unlock(&node->lock);
		hammer_rel_node(node);
	}

	/*
	 * Untrack the cursor, clean up, and re-establish the parent node.
	 */
	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	cursor->flags &= ~HAMMER_CURSOR_TRACKED;

	/*
	 * If a ripout has occurred iterations must re-test the (new)
	 * current element.  Clearing ATEDISK prevents the element from
	 * being skipped and RETEST causes it to be re-tested.
	 */
	if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
		cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		cursor->flags |= HAMMER_CURSOR_RETEST;
	}
	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}

/*
 * Recover from a deadlocked cursor, tracking any node removals or
 * replacements.  If the cursor's current node is removed by another
 * thread (via btree_remove()) the cursor will be seeked upwards.
 *
 * The caller is working a modifying operation and must be holding the
 * sync lock (shared).  We do not release the sync lock because this
 * would break atomicity.
 */
int
hammer_recover_cursor(hammer_cursor_t cursor)
{
	hammer_transaction_t trans;
	hammer_inode_t ip;
	int error;

	/* drop our locks and become tracked while we wait */
	hammer_unlock_cursor(cursor);

	ip = cursor->ip;
	trans = cursor->trans;
	KKASSERT(trans->sync_lock_refs > 0);

	/*
	 * Wait for the deadlock to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}
	error = hammer_lock_cursor(cursor);
	return(error);
}

/*
 * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
 * is effectively unlocked and becomes tracked.  If ocursor was not locked
 * then ncursor also inherits the tracking.
 *
 * After the caller finishes working with ncursor it must be cleaned up
 * with hammer_done_cursor(), and the caller must re-lock ocursor.
 */
hammer_cursor_t
hammer_push_cursor(hammer_cursor_t ocursor)
{
	hammer_cursor_t ncursor;
	hammer_inode_t ip;
	hammer_node_t node;
	hammer_mount_t hmp;

	hmp = ocursor->trans->hmp;
	ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
	bcopy(ocursor, ncursor, sizeof(*ocursor));

	/*
	 * ncursor inherited ocursor's locks and cached state via the
	 * bcopy, so strip those fields from ocursor and leave it tracked
	 * on the node until the caller re-locks it.
	 */
	node = ocursor->node;
	hammer_ref_node(node);
	if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
		ocursor->flags |= HAMMER_CURSOR_TRACKED;
		TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
	}
	if (ncursor->parent)
		ocursor->parent = NULL;
	ocursor->data_buffer = NULL;
	ocursor->leaf = NULL;
	ocursor->data = NULL;
	/* if ocursor was already tracked, ncursor inherited that too */
	if (ncursor->flags & HAMMER_CURSOR_TRACKED)
		TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
	if ((ip = ncursor->ip) != NULL) {
		++ip->cursor_ip_refs;
	}
	if (ncursor->iprec)
		hammer_ref(&ncursor->iprec->lock);
	return(ncursor);
}

/*
 * Destroy ncursor and restore ocursor
 *
 * This is a temporary hack for the release.  We can't afford to lose
 * the IP lock until the IP object scan code is able to deal with it,
 * so have ocursor inherit it back.
 */
void
hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;

	hmp = ncursor->trans->hmp;
	ip = ncursor->ip;
	/*
	 * Detach the ip from ncursor before hammer_done_cursor() runs so
	 * the inode lock is not dropped; ocursor keeps it (see above).
	 */
	ncursor->ip = NULL;
	if (ip)
		--ip->cursor_ip_refs;
	hammer_done_cursor(ncursor);
	kfree(ncursor, hmp->m_misc);
	KKASSERT(ocursor->ip == ip);
	hammer_lock_cursor(ocursor);
}

/*
 * onode is being replaced by nnode by the reblocking code.
 */
void
hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

	/*
	 * Migrate every tracked cursor from onode to nnode, repointing
	 * any cached leaf pointer at the same index in the new node and
	 * moving the node reference over.
	 */
	while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		KKASSERT(cursor->node == onode);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index].leaf;
		cursor->node = nnode;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
	}
}

/*
 * We have removed <node> from the parent and collapsed the parent.
 *
 * Cursors in deadlock recovery are seeked upward to the parent so the
 * btree_remove() recursion works properly even though we have marked
 * the cursor as requiring a reseek.
 *
 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
 * meaning the cursor is no longer definitively pointing at an element
 * within its iteration (if the cursor is being used to iterate).  The
 * iteration code will take this into account instead of asserting if the
 * cursor is outside the iteration range.
 */
void
hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	KKASSERT(parent != NULL);
	ondisk = node->ondisk;

	/*
	 * Each tracked cursor must be sitting at index 0 of the removed
	 * (empty) node.  Seek it to (parent, index) and flag it for a
	 * re-test when it is relocked.
	 */
	while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
		KKASSERT(cursor->node == node);
		KKASSERT(cursor->index == 0);
		TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = NULL;
		cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
		cursor->node = parent;
		cursor->index = index;
		hammer_ref_node(parent);
		hammer_rel_node(node);
	}
}

/*
 * node was split at (onode, index) with elements >= index moved to nnode.
 */
void
hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

	/*
	 * Cursors at or beyond the split point migrate to nnode with
	 * their index rebased.  The scan restarts after each migration
	 * because the list linkage changes underneath TAILQ_FOREACH.
	 */
again:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index < index)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
		cursor->node = nnode;
		cursor->index -= index;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again;
	}
}

/*
 * An element was moved from one node to another or within a node.  The
 * index may also represent the end of the node (index == numelements).
 *
 * {oparent,pindex} is the parent node's pointer to onode/oindex.
 *
 * This is used by the rebalancing code.
 * This is not an insertion or
 * deletion and any additional elements, including the degenerate case at
 * the end of the node, will be dealt with by additional distinct calls.
 */
void
hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
			    hammer_node_t onode, int oindex,
			    hammer_node_t nnode, int nindex)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	/*
	 * Adjust any cursors pointing at the element
	 */
	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;
again1:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index != oindex)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again1;
	}

	/*
	 * When moving the first element of onode to a different node any
	 * cursor which is pointing at (oparent,pindex) must be repointed
	 * to nnode and ATEDISK must be cleared.
	 *
	 * This prevents cursors from losing track due to insertions.
	 * Insertions temporarily release the cursor in order to update
	 * the mirror_tids.  It primarily affects the mirror_write code.
	 * The other code paths generally only do a single insertion and
	 * then relookup or drop the cursor.
	 */
	if (onode == nnode || oindex)
		return;
	ondisk = oparent->ondisk;
again2:
	TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == oparent);
		if (cursor->index != pindex)
			continue;
		kprintf("HAMMER debug: shifted cursor pointing at parent\n"
			"parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
			(intmax_t)oparent->node_offset, pindex,
			(intmax_t)onode->node_offset, oindex,
			(intmax_t)nnode->node_offset, nindex);
		TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		/*
		 * NOTE(review): oindex is 0 here so this compares against
		 * oparent's elms[0] even though the cursor sits at pindex;
		 * cursor->leaf may simply never match on an internal
		 * element -- verify.
		 */
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		hammer_ref_node(nnode);
		hammer_rel_node(oparent);
		goto again2;
	}
}

/*
 * The B-Tree element pointing to the specified node was moved from (oparent)
 * to (nparent, nindex).  We must locate any tracked cursors pointing at
 * node and adjust their parent accordingly.
 *
 * This is used by the rebalancing code when packing elements causes an
 * element to shift from one node to another.
1093 */ 1094 void 1095 hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent, 1096 hammer_node_t nparent, int nindex) 1097 { 1098 hammer_cursor_t cursor; 1099 1100 again: 1101 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1102 KKASSERT(cursor->node == node); 1103 if (cursor->parent == oparent) { 1104 cursor->parent = nparent; 1105 cursor->parent_index = nindex; 1106 hammer_ref_node(nparent); 1107 hammer_rel_node(oparent); 1108 goto again; 1109 } 1110 } 1111 } 1112 1113 /* 1114 * Deleted element at (node, index) 1115 * 1116 * Shift indexes >= index 1117 */ 1118 void 1119 hammer_cursor_deleted_element(hammer_node_t node, int index) 1120 { 1121 hammer_cursor_t cursor; 1122 hammer_node_ondisk_t ondisk; 1123 1124 ondisk = node->ondisk; 1125 1126 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1127 KKASSERT(cursor->node == node); 1128 if (cursor->index == index) { 1129 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; 1130 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1131 cursor->leaf = NULL; 1132 } else if (cursor->index > index) { 1133 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1134 cursor->leaf = &ondisk->elms[cursor->index - 1].leaf; 1135 --cursor->index; 1136 } 1137 } 1138 } 1139 1140 /* 1141 * Inserted element at (node, index) 1142 * 1143 * Shift indexes >= index 1144 */ 1145 void 1146 hammer_cursor_inserted_element(hammer_node_t node, int index) 1147 { 1148 hammer_cursor_t cursor; 1149 hammer_node_ondisk_t ondisk; 1150 1151 ondisk = node->ondisk; 1152 1153 TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { 1154 KKASSERT(cursor->node == node); 1155 if (cursor->index >= index) { 1156 if (cursor->leaf == &ondisk->elms[cursor->index].leaf) 1157 cursor->leaf = &ondisk->elms[cursor->index + 1].leaf; 1158 ++cursor->index; 1159 } 1160 } 1161 } 1162 1163 /* 1164 * Invalidate the cached data buffer associated with a cursor. 
1165 * 1166 * This needs to be done when the underlying block is being freed or 1167 * the referenced buffer can prevent the related buffer cache buffer 1168 * from being properly invalidated. 1169 */ 1170 void 1171 hammer_cursor_invalidate_cache(hammer_cursor_t cursor) 1172 { 1173 if (cursor->data_buffer) { 1174 hammer_rel_buffer(cursor->data_buffer, 0); 1175 cursor->data_buffer = NULL; 1176 cursor->data = NULL; 1177 } 1178 } 1179 1180