1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 
89 * 90 * - Must disconnect node on failures (also not flush), remount, and 91 * resynchronize. 92 * 93 * - Network links (via kdmsg) are relatively easy to issue as the 94 * complex underworkings of hammer2_chain.c don't have to messed 95 * with (the protocol is at a higher level than block-level). 96 * 97 * - Multiple local disk nodes (i.e. block devices) are another matter. 98 * Chain operations have to be dispatched to per-node threads (xN) 99 * because we can't asynchronize potentially very complex chain 100 * operations in hammer2_chain.c (it would be a huge mess). 101 * 102 * (these threads are also used to terminate incoming kdmsg ops from 103 * other machines). 104 * 105 * - Single-node filesystems do not use threads and will simply call 106 * hammer2_chain.c functions directly. This short-cut is handled 107 * at the base of each cluster function. 108 */ 109 #include <sys/cdefs.h> 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/types.h> 113 #include <sys/lock.h> 114 #include <sys/uuid.h> 115 116 #include "hammer2.h" 117 118 /* 119 * Returns TRUE if any chain in the cluster needs to be resized. 120 */ 121 int 122 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 123 { 124 hammer2_chain_t *chain; 125 int i; 126 127 for (i = 0; i < cluster->nchains; ++i) { 128 chain = cluster->array[i].chain; 129 if (chain && chain->bytes != bytes) 130 return 1; 131 } 132 return 0; 133 } 134 135 uint8_t 136 hammer2_cluster_type(hammer2_cluster_t *cluster) 137 { 138 return(cluster->focus->bref.type); 139 } 140 141 int 142 hammer2_cluster_modified(hammer2_cluster_t *cluster) 143 { 144 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 145 } 146 147 /* 148 * Return a bref representative of the cluster. Any data offset is removed 149 * (since it would only be applicable to a particular chain in the cluster). 150 * 151 * However, the radix portion of data_off is used for many purposes and will 152 * be retained. 
153 */ 154 void 155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 156 { 157 *bref = cluster->focus->bref; 158 bref->data_off &= HAMMER2_OFF_MASK_RADIX; 159 } 160 161 /* 162 * Return non-zero if the chain representing an inode has been flagged 163 * as having been unlinked. Allows the vnode reclaim to avoid loading 164 * the inode data from disk e.g. when unmount or recycling old, clean 165 * vnodes. 166 */ 167 int 168 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster) 169 { 170 hammer2_chain_t *chain; 171 int flags; 172 int i; 173 174 flags = 0; 175 for (i = 0; i < cluster->nchains; ++i) { 176 chain = cluster->array[i].chain; 177 if (chain) 178 flags |= chain->flags; 179 } 180 return (flags & HAMMER2_CHAIN_UNLINKED); 181 } 182 183 void 184 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 185 { 186 hammer2_chain_t *chain; 187 int i; 188 189 for (i = 0; i < cluster->nchains; ++i) { 190 chain = cluster->array[i].chain; 191 if (chain) 192 atomic_set_int(&chain->flags, flags); 193 } 194 } 195 196 void 197 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 198 { 199 hammer2_chain_t *chain; 200 int i; 201 202 for (i = 0; i < cluster->nchains; ++i) { 203 chain = cluster->array[i].chain; 204 if (chain) 205 atomic_clear_int(&chain->flags, flags); 206 } 207 } 208 209 void 210 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 211 { 212 hammer2_chain_t *chain; 213 int i; 214 215 for (i = 0; i < cluster->nchains; ++i) { 216 chain = cluster->array[i].chain; 217 if (chain) 218 hammer2_chain_setflush(trans, chain); 219 } 220 } 221 222 void 223 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 224 hammer2_cluster_t *cluster, 225 int check_algo) 226 { 227 hammer2_chain_t *chain; 228 int i; 229 230 for (i = 0; i < cluster->nchains; ++i) { 231 chain = cluster->array[i].chain; 232 if (chain) { 233 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 234 chain->bref.methods &= 
~HAMMER2_ENC_CHECK(-1); 235 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 236 } 237 } 238 } 239 240 /* 241 * Create a cluster with one ref from the specified chain. The chain 242 * is not further referenced. The caller typically supplies a locked 243 * chain and transfers ownership to the cluster. 244 * 245 * The returned cluster will be focused on the chain (strictly speaking, 246 * the focus should be NULL if the chain is not locked but we do not check 247 * for this condition). 248 */ 249 hammer2_cluster_t * 250 hammer2_cluster_from_chain(hammer2_chain_t *chain) 251 { 252 hammer2_cluster_t *cluster; 253 254 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 255 cluster->array[0].chain = chain; 256 cluster->nchains = 1; 257 cluster->focus = chain; 258 cluster->pmp = chain->pmp; 259 cluster->refs = 1; 260 261 return cluster; 262 } 263 264 /* 265 * Allocates a cluster and its underlying chain structures. The underlying 266 * chains will be locked. The cluster and underlying chains will have one 267 * ref and will be focused on the first chain. 268 * 269 * XXX focus on first chain. 270 */ 271 hammer2_cluster_t * 272 hammer2_cluster_alloc(hammer2_pfs_t *pmp, 273 hammer2_trans_t *trans, hammer2_blockref_t *bref) 274 { 275 hammer2_cluster_t *cluster; 276 hammer2_cluster_t *rcluster; 277 hammer2_chain_t *chain; 278 hammer2_chain_t *rchain; 279 #if 0 280 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 281 #endif 282 int i; 283 284 KKASSERT(pmp != NULL); 285 286 /* 287 * Construct the appropriate system structure. 288 */ 289 switch(bref->type) { 290 case HAMMER2_BREF_TYPE_INODE: 291 case HAMMER2_BREF_TYPE_INDIRECT: 292 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 293 case HAMMER2_BREF_TYPE_DATA: 294 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 295 /* 296 * Chain's are really only associated with the hmp but we 297 * maintain a pmp association for per-mount memory tracking 298 * purposes. The pmp can be NULL. 
299 */ 300 break; 301 case HAMMER2_BREF_TYPE_VOLUME: 302 case HAMMER2_BREF_TYPE_FREEMAP: 303 chain = NULL; 304 panic("hammer2_cluster_alloc volume type illegal for op"); 305 default: 306 chain = NULL; 307 panic("hammer2_cluster_alloc: unrecognized blockref type: %d", 308 bref->type); 309 } 310 311 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 312 cluster->refs = 1; 313 314 rcluster = &pmp->iroot->cluster; 315 for (i = 0; i < rcluster->nchains; ++i) { 316 rchain = rcluster->array[i].chain; 317 chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref); 318 #if 0 319 chain->hmp = rchain->hmp; 320 chain->bref = *bref; 321 chain->bytes = bytes; 322 chain->refs = 1; 323 chain->flags = HAMMER2_CHAIN_ALLOCATED; 324 #endif 325 326 /* 327 * NOTE: When loading a chain from backing store or creating a 328 * snapshot, trans will be NULL and the caller is 329 * responsible for setting these fields. 330 */ 331 cluster->array[i].chain = chain; 332 } 333 cluster->nchains = i; 334 cluster->pmp = pmp; 335 cluster->focus = cluster->array[0].chain; 336 337 return (cluster); 338 } 339 340 /* 341 * Add a reference to a cluster. 342 * 343 * We must also ref the underlying chains in order to allow ref/unlock 344 * sequences to later re-lock. 345 */ 346 void 347 hammer2_cluster_ref(hammer2_cluster_t *cluster) 348 { 349 hammer2_chain_t *chain; 350 int i; 351 352 atomic_add_int(&cluster->refs, 1); 353 for (i = 0; i < cluster->nchains; ++i) { 354 chain = cluster->array[i].chain; 355 if (chain) 356 hammer2_chain_ref(chain); 357 } 358 } 359 360 /* 361 * Drop the caller's reference to the cluster. When the ref count drops to 362 * zero this function frees the cluster and drops all underlying chains. 363 * 364 * In-progress read I/Os are typically detached from the cluster once the 365 * first one returns (the remaining stay attached to the DIOs but are then 366 * ignored and drop naturally). 
367 */ 368 void 369 hammer2_cluster_drop(hammer2_cluster_t *cluster) 370 { 371 hammer2_chain_t *chain; 372 int i; 373 374 KKASSERT(cluster->refs > 0); 375 for (i = 0; i < cluster->nchains; ++i) { 376 chain = cluster->array[i].chain; 377 if (chain) { 378 hammer2_chain_drop(chain); 379 if (cluster->refs == 1) 380 cluster->array[i].chain = NULL; 381 } 382 } 383 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 384 cluster->focus = NULL; /* safety */ 385 kfree(cluster, M_HAMMER2); 386 /* cluster is invalid */ 387 } 388 } 389 390 void 391 hammer2_cluster_wait(hammer2_cluster_t *cluster) 392 { 393 tsleep(cluster->focus, 0, "h2clcw", 1); 394 } 395 396 /* 397 * Lock and ref a cluster. This adds a ref to the cluster and its chains 398 * and then locks them. 399 * 400 * The act of locking a cluster sets its focus if not already set. 401 */ 402 int 403 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 404 { 405 hammer2_chain_t *chain; 406 hammer2_chain_t *tmp; 407 int i; 408 int error; 409 410 if ((how & HAMMER2_RESOLVE_NOREF) == 0) 411 atomic_add_int(&cluster->refs, 1); 412 413 error = 0; 414 415 for (i = 0; i < cluster->nchains; ++i) { 416 chain = cluster->array[i].chain; 417 if (chain) { 418 error = hammer2_chain_lock(chain, how); 419 if (error) { 420 while (--i >= 0) { 421 tmp = cluster->array[i].chain; 422 hammer2_chain_unlock(tmp); 423 } 424 atomic_add_int(&cluster->refs, -1); 425 break; 426 } 427 if (cluster->focus == NULL) 428 cluster->focus = chain; 429 } 430 } 431 return error; 432 } 433 434 /* 435 * Replace the contents of dst with src, adding a reference to src's chains. 436 * dst is assumed to already have a ref and any chains present in dst are 437 * assumed to be locked and will be unlocked. 438 * 439 * If the chains in src are locked, only one of (src) or (dst) should be 440 * considered locked by the caller after return, not both. 
441 */ 442 void 443 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src) 444 { 445 hammer2_chain_t *chain; 446 hammer2_chain_t *tmp; 447 int i; 448 449 KKASSERT(dst->refs == 1); 450 dst->focus = NULL; 451 452 for (i = 0; i < src->nchains; ++i) { 453 chain = src->array[i].chain; 454 if (chain) { 455 hammer2_chain_ref(chain); 456 if (i < dst->nchains && 457 (tmp = dst->array[i].chain) != NULL) { 458 hammer2_chain_unlock(tmp); 459 } 460 dst->array[i].chain = chain; 461 if (dst->focus == NULL) 462 dst->focus = chain; 463 } 464 } 465 while (i < dst->nchains) { 466 chain = dst->array[i].chain; 467 if (chain) { 468 hammer2_chain_unlock(chain); 469 dst->array[i].chain = NULL; 470 } 471 ++i; 472 } 473 dst->nchains = src->nchains; 474 } 475 476 /* 477 * Replace the contents of the locked destination with the contents of the 478 * locked source. Destination must have one ref. 479 * 480 * Returns with the destination still with one ref and the copied chains 481 * with an additional lock (representing their state on the destination). 482 * The original chains associated with the destination are unlocked. 483 */ 484 void 485 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src) 486 { 487 hammer2_chain_t *chain; 488 hammer2_chain_t *tmp; 489 int i; 490 491 KKASSERT(dst->refs == 1); 492 493 dst->focus = NULL; 494 for (i = 0; i < src->nchains; ++i) { 495 chain = src->array[i].chain; 496 if (chain) { 497 hammer2_chain_lock(chain, 0); 498 if (i < dst->nchains && 499 (tmp = dst->array[i].chain) != NULL) { 500 hammer2_chain_unlock(tmp); 501 } 502 dst->array[i].chain = chain; 503 if (dst->focus == NULL) 504 dst->focus = chain; 505 } 506 } 507 while (i < dst->nchains) { 508 chain = dst->array[i].chain; 509 if (chain) { 510 hammer2_chain_unlock(chain); 511 dst->array[i].chain = NULL; 512 } 513 ++i; 514 } 515 dst->nchains = src->nchains; 516 } 517 518 /* 519 * Copy a cluster, returned a ref'd cluster. 
All underlying chains 520 * are also ref'd, but not locked. The cluster focus is not set because 521 * the cluster is not yet locked (and the originating cluster does not 522 * have to be locked either). 523 */ 524 hammer2_cluster_t * 525 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 526 { 527 hammer2_pfs_t *pmp = ocluster->pmp; 528 hammer2_cluster_t *ncluster; 529 hammer2_chain_t *chain; 530 int i; 531 532 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 533 ncluster->pmp = pmp; 534 ncluster->nchains = ocluster->nchains; 535 ncluster->refs = 1; 536 537 for (i = 0; i < ocluster->nchains; ++i) { 538 chain = ocluster->array[i].chain; 539 ncluster->array[i].chain = chain; 540 if (chain) 541 hammer2_chain_ref(chain); 542 } 543 return (ncluster); 544 } 545 546 /* 547 * Unlock and deref a cluster. The cluster is destroyed if this is the 548 * last ref. 549 */ 550 void 551 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 552 { 553 hammer2_chain_t *chain; 554 int i; 555 556 KKASSERT(cluster->refs > 0); 557 for (i = 0; i < cluster->nchains; ++i) { 558 chain = cluster->array[i].chain; 559 if (chain) { 560 hammer2_chain_unlock(chain); 561 if (cluster->refs == 1) 562 cluster->array[i].chain = NULL; /* safety */ 563 } 564 } 565 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 566 cluster->focus = NULL; 567 kfree(cluster, M_HAMMER2); 568 /* cluster = NULL; safety */ 569 } 570 } 571 572 /* 573 * Resize the cluster's physical storage allocation in-place. This may 574 * replace the cluster's chains. 
575 */ 576 void 577 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 578 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 579 int nradix, int flags) 580 { 581 hammer2_chain_t *chain; 582 int i; 583 584 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 585 KKASSERT(cparent->nchains == cluster->nchains); 586 587 cluster->focus = NULL; 588 for (i = 0; i < cluster->nchains; ++i) { 589 chain = cluster->array[i].chain; 590 if (chain) { 591 KKASSERT(cparent->array[i].chain); 592 hammer2_chain_resize(trans, ip, 593 cparent->array[i].chain, chain, 594 nradix, flags); 595 if (cluster->focus == NULL) 596 cluster->focus = chain; 597 } 598 } 599 } 600 601 /* 602 * Set an inode's cluster modified, marking the related chains RW and 603 * duplicating them if necessary. 604 * 605 * The passed-in chain is a localized copy of the chain previously acquired 606 * when the inode was locked (and possilby replaced in the mean time), and 607 * must also be updated. In fact, we update it first and then synchronize 608 * the inode's cluster cache. 609 */ 610 hammer2_inode_data_t * 611 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 612 hammer2_cluster_t *cluster, int flags) 613 { 614 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 615 hammer2_cluster_modify(trans, cluster, flags); 616 617 hammer2_inode_repoint(ip, NULL, cluster); 618 if (ip->vp) 619 vsetisdirty(ip->vp); 620 return (&hammer2_cluster_wdata(cluster)->ipdata); 621 } 622 623 /* 624 * Adjust the cluster's chains to allow modification and adjust the 625 * focus. Data will be accessible on return. 
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	int i;

	/* refocus on the first chain successfully made modifiable */
	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_modify(trans, chain, flags);
			if (cluster->focus == NULL)
				cluster->focus = chain;
		}
	}
}

/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain. This code copies such modifications to the other chains in the
 * cluster. Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Copy only the portion preceding the u
				 * union; the union holds chain-specific
				 * block-table data which must not be
				 * blindly copied.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through to full copy (DIRECTDATA inode) */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}

/*
 * Lookup initialization/completion API
 *
 * Returns a new cluster mirroring cparent's chains, independently locked.
 * NOTE: refs starts at 0 from the zeroing kmalloc; the cluster_lock call
 * below supplies the cluster's initial ref.
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;
	int i;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = cparent->pmp;			/* can be NULL */
	/* cluster->focus = NULL; already null */

	for (i = 0; i < cparent->nchains; ++i) {
		cluster->array[i].chain = cparent->array[i].chain;
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i].chain;
	}
	cluster->nchains = cparent->nchains;

	/*
	 * Independently lock (this will also give cluster 1 ref)
	 */
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
					      HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	return (cluster);
}

/*
 * Complete a lookup sequence started by hammer2_cluster_lookup_init(),
 * unlocking (and possibly destroying) the parent cluster.
 */
void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
	if (cparent)
		hammer2_cluster_unlock(cparent);
}

/*
 * Locate first match or overlap under parent, return a new cluster
 *
 * Returns NULL (after dropping the partially-built cluster) when every
 * per-node lookup came up empty. *key_nextp is folded to the minimum
 * key_next reported by the per-chain lookups.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end,
		       int flags, int *ddflagp)
{
	hammer2_pfs_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i].chain == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i].chain,
					     &key_next,
					     key_beg, key_end,
					     &cparent->array[i].cache_index,
					     flags, &ddflag);
		/*
		 * The parent lookup may have replaced the parent chain,
		 * so refocus cparent on the first surviving slot.
		 */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit establishes the representative bref
			 * identity; all other nodes must agree exactly
			 * (quorum consistency asserts below).
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* fold to the minimum next-key over all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}

/*
 * Locate next match or overlap under parent, replace cluster
 *
 * NOTE(review): unlike hammer2_cluster_lookup(), the folded key_accum is
 * never written back to *key_nextp here — confirm whether that is
 * intentional or an omission.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/*
			 * Parent slot disappeared; dispose of our chain
			 * according to how it was held (ref vs lock).
			 */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* all nodes exhausted: destroy and return NULL */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved. The returned
 * chain will be locked. Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock.
894 */ 895 hammer2_cluster_t * 896 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 897 int flags) 898 { 899 hammer2_chain_t *chain; 900 int null_count; 901 int i; 902 903 null_count = 0; 904 905 for (i = 0; i < cparent->nchains; ++i) { 906 chain = cluster->array[i].chain; 907 if (chain == NULL) { 908 ++null_count; 909 continue; 910 } 911 if (cparent->array[i].chain == NULL) { 912 if (flags & HAMMER2_LOOKUP_NOLOCK) 913 hammer2_chain_drop(chain); 914 else 915 hammer2_chain_unlock(chain); 916 ++null_count; 917 continue; 918 } 919 920 chain = hammer2_chain_scan(cparent->array[i].chain, chain, 921 &cparent->array[i].cache_index, 922 flags); 923 cluster->array[i].chain = chain; 924 if (chain == NULL) 925 ++null_count; 926 } 927 928 if (null_count == i) { 929 hammer2_cluster_drop(cluster); 930 cluster = NULL; 931 } 932 return(cluster); 933 } 934 935 #endif 936 937 /* 938 * Create a new cluster using the specified key 939 */ 940 int 941 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 942 hammer2_cluster_t **clusterp, 943 hammer2_key_t key, int keybits, 944 int type, size_t bytes, int flags) 945 { 946 hammer2_cluster_t *cluster; 947 hammer2_pfs_t *pmp; 948 int error; 949 int i; 950 951 pmp = trans->pmp; /* can be NULL */ 952 953 if ((cluster = *clusterp) == NULL) { 954 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 955 M_WAITOK | M_ZERO); 956 cluster->pmp = pmp; /* can be NULL */ 957 cluster->refs = 1; 958 } 959 cluster->focus = NULL; 960 cparent->focus = NULL; 961 962 /* 963 * NOTE: cluster->array[] entries can initially be NULL. If 964 * *clusterp is supplied, skip NULL entries, otherwise 965 * create new chains. 
966 */ 967 for (i = 0; i < cparent->nchains; ++i) { 968 if (*clusterp && cluster->array[i].chain == NULL) { 969 if (cparent->focus == NULL) 970 cparent->focus = cparent->array[i].chain; 971 continue; 972 } 973 error = hammer2_chain_create(trans, &cparent->array[i].chain, 974 &cluster->array[i].chain, pmp, 975 key, keybits, 976 type, bytes, flags); 977 KKASSERT(error == 0); 978 if (cparent->focus == NULL) 979 cparent->focus = cparent->array[i].chain; 980 if (cluster->focus == NULL) 981 cluster->focus = cluster->array[i].chain; 982 } 983 cluster->nchains = i; 984 *clusterp = cluster; 985 986 return error; 987 } 988 989 /* 990 * Rename a cluster to a new parent. 991 * 992 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields 993 * are used from a passed-in non-NULL bref pointer. All other fields 994 * are extracted from the original chain for each chain in the 995 * iteration. 996 */ 997 void 998 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 999 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1000 int flags) 1001 { 1002 hammer2_chain_t *chain; 1003 hammer2_blockref_t xbref; 1004 int i; 1005 1006 cluster->focus = NULL; 1007 cparent->focus = NULL; 1008 1009 for (i = 0; i < cluster->nchains; ++i) { 1010 chain = cluster->array[i].chain; 1011 if (chain) { 1012 if (bref) { 1013 xbref = chain->bref; 1014 xbref.key = bref->key; 1015 xbref.keybits = bref->keybits; 1016 hammer2_chain_rename(trans, &xbref, 1017 &cparent->array[i].chain, 1018 chain, flags); 1019 } else { 1020 hammer2_chain_rename(trans, NULL, 1021 &cparent->array[i].chain, 1022 chain, flags); 1023 } 1024 cluster->array[i].chain = chain; 1025 if (cluster->focus == NULL) 1026 cluster->focus = chain; 1027 if (cparent->focus == NULL) 1028 cparent->focus = cparent->array[i].chain; 1029 } else { 1030 if (cparent->focus == NULL) 1031 cparent->focus = cparent->array[i].chain; 1032 } 1033 } 1034 } 1035 1036 /* 1037 * Mark a cluster deleted 1038 */ 1039 void 1040 
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	/*
	 * Delete the chains representing (cluster) under (cparent) on a
	 * node-by-node basis.  A chain whose recorded parent does not match
	 * the corresponding cparent element is reported and skipped rather
	 * than deleted (that node's topology is presumably out of sync).
	 */
	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		/* cparent may have fewer elements than cluster */
		parent = (i < cparent->nchains) ?
			 cparent->array[i].chain : NULL;
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		if (chain->parent != parent) {
			/* topology mismatch on this node, do not delete */
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 *
 * Returns 0 on success or the error code reported by hammer2_inode_create()
 * through its &error out-parameter.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_dev_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *nchain;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
#if 0
	uuid_t opfs_clid;
#endif
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	/*
	 * NOTE(review): lhc is computed here but not consumed anywhere in
	 * this function's visible code (hammer2_inode_create() is passed the
	 * name, not the hash) -- confirm whether it can be removed.
	 */
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
#if 0
	opfs_clid = ripdata->pfs_clid;
#endif
	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster,
				   HAMMER2_INSERT_PFSROOT, &error);

	if (nip) {
		/* mark the new inode as a snapshot PFS root */
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);

		/*
		 * Give the snapshot its own private cluster.  As a snapshot
		 * no further synchronization with the original cluster will
		 * be done.
		 */
#if 0
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);
#endif
		kern_uuidgen(&wipdata->pfs_clid, 1);

		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		hammer2_cluster_modsync(ncluster);
		/* flush each node's chain so the snapshot hits the media */
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				hammer2_flush(trans, nchain);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.  The new parent's focus follows the child's focus
 * and the parent is always resolved.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	cparent = hammer2_cluster_copy(cluster);
	for (i = 0; i < cparent->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		/*
		 * Calculate parent for each element.  Old chain has an extra
		 * ref for cparent but the lock remains with cluster.
		 */
		chain = cparent->array[i].chain;
		if (chain == NULL)
			continue;
		/*
		 * Lock-order dance: we cannot lock the parent while holding
		 * the child's lock, so ref the parent, drop the child lock,
		 * lock parent then child, and retry if the child was
		 * reparented while unlocked.  rchain ends up NULL only if
		 * the chain has no parent at all.
		 */
		while ((rchain = chain->parent) != NULL) {
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			/* temp ref no longer needed, lock now holds rchain */
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			/* raced a reparenting, retry against new parent */
			hammer2_chain_unlock(rchain);
		}
		if (cluster->focus == chain)
			cparent->focus = rchain;
		cparent->array[i].chain = rchain;
		hammer2_chain_drop(chain);
	}
	return cparent;
}

/************************************************************************
 *			       CLUSTER I/O				*
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *	    only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *	    complete if necessary.  The I/O's should have already been
 *	    initiated by the cluster_lock/chain_lock operation.
 *
 *	    The cluster must already be in a modified state before wdata
 *	    is called.  The data will already be available for this case.
 */

/*
 * Return a read-only pointer to the focus chain's media data.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	return(cluster->focus->data);
}

/*
 * Return a writable pointer to the focus chain's media data.  The cluster
 * must already be in a modified state (asserted).
 */
hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	KKASSERT(hammer2_cluster_modified(cluster));
	return(cluster->focus->data);
}

/*
 * Load async into independent buffer - used to load logical buffers from
 * underlying device data.  The callback is made for the first validated
 * data found, or NULL if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_dev_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whose data is already resolved.  If none can
	 * be found, start with the first chain.
	 *
	 * NOTE(review): if no chain has resolved data and array[0].chain is
	 * NULL, the chain->data test below dereferences NULL -- confirm that
	 * callers guarantee slot 0 is always populated.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		chain = cluster->array[0].chain;
		i = 0;
	}

	/* set up the embedded iocb describing this request */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;		/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated, make the callback synchronously.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read; hammer2_io_getblk() will invoke the
	 * callback when the I/O completes.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}

/************************************************************************
 *			     NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
 *
 * When a read failure or an unexpected chain state is found, the chain and
 * parent chain at the failure point for the nodes involved (the nodes
 * which we determine to be in error) are flagged as failed and removed
 * from the cluster.  The node itself is allowed to remain active.  The
 * highest common point (usually a parent chain) is queued to the
 * resynchronization thread for action.
 *
 * When a write I/O fails or a node runs out of space, we first adjust
 * as if a read failure occurs but we further disable flushes on the
 * ENTIRE node.  Concurrent modifying transactions are allowed to complete
 * but any new modifying transactions will automatically remove the node
 * from consideration in all related cluster structures and not generate
 * any new modified chains.  The ROOT chain for the failed node(s) is queued
 * to the resynchronization thread for action.
 *
 * A temporary disconnect is handled as if a write failure occurred.
 *
 * Any of these failures might or might not stall related high level VNOPS,
 * depending on what has failed, what nodes remain, the type of cluster,
 * and the operating state of the cluster.
 *
 * FLUSH ON WRITE-DISABLED NODES
 *
 * A flush on a write-disabled node is not allowed to write anything because
 * we cannot safely update the mirror_tid anywhere on the failed node.  The
 * synchronization thread uses mirror_tid to calculate incremental resyncs.
 * Dirty meta-data related to the failed node is thrown away.
 *
 * Dirty buffer cache buffers and inodes are only thrown away if they can be
 * retired... that is, if the filesystem still has enough nodes to complete
 * the operation.
 */

/************************************************************************
 *			SYNCHRONIZATION THREAD				*
 ************************************************************************
 *
 * This thread is responsible for [re]synchronizing the cluster representing
 * a PFS.  Any out-of-sync or failed node starts this thread on a
 * node-by-node basis when the failure is detected.
 *
 * Clusters needing resynchronization are queued at the highest point
 * where the parent on the failed node is still valid, or a special
 * incremental scan from the ROOT is queued if no parent exists.  This
 * thread is also responsible for waiting for reconnections of the failed
 * node if the cause was due to a disconnect, and waiting for space to be
 * freed up if the cause was due to running out of space.
 *
 * If the cause is due to a node running out of space, this thread will also
 * remove older (unlocked) snapshots to make new space, recover space, and
 * then start resynchronization.
 *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
 *
 * Since these are per-node threads it is possible to resynchronize several
 * nodes at once.
 */