1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * 59 * - Most complex functions, quorum management on transaction ids. 60 * 61 * - Locking and data accesses must be internally asynchronous. 62 * 63 * - Validate and manage cache coherency primitives (cache state 64 * is stored in chain topologies but must be validated by these 65 * functions). 66 * 67 * (2) Lookups and Scans 68 * hammer2_cluster_lookup() 69 * hammer2_cluster_next() 70 * 71 * - Depend on locking & data retrieval functions, but still complex. 72 * 73 * - Must do quorum management on transaction ids. 74 * 75 * - Lookup and Iteration ops Must be internally asynchronous. 76 * 77 * (3) Modifying Operations 78 * hammer2_cluster_create() 79 * 80 * - Can usually punt on failures, operation continues unless quorum 81 * is lost. If quorum is lost, must wait for resynchronization 82 * (depending on the management mode). 83 * 84 * - Must disconnect node on failures (also not flush), remount, and 85 * resynchronize. 
86 * 87 * - Network links (via kdmsg) are relatively easy to issue as the 88 * complex underworkings of hammer2_chain.c don't have to messed 89 * with (the protocol is at a higher level than block-level). 90 * 91 * - Multiple local disk nodes (i.e. block devices) are another matter. 92 * Chain operations have to be dispatched to per-node threads (xN) 93 * because we can't asynchronize potentially very complex chain 94 * operations in hammer2_chain.c (it would be a huge mess). 95 * 96 * (these threads are also used to terminate incoming kdmsg ops from 97 * other machines). 98 * 99 * - Single-node filesystems do not use threads and will simply call 100 * hammer2_chain.c functions directly. This short-cut is handled 101 * at the base of each cluster function. 102 */ 103 #include <sys/cdefs.h> 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/types.h> 107 108 #include "hammer2.h" 109 110 /* 111 * Returns the bref type of the cluster's foucs. 112 * 113 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 114 * The cluster must be locked. 115 */ 116 uint8_t 117 hammer2_cluster_type(hammer2_cluster_t *cluster) 118 { 119 if (cluster->error == 0) { 120 KKASSERT(cluster->focus != NULL); 121 return(cluster->focus->bref.type); 122 } 123 return 0; 124 } 125 126 /* 127 * Returns the bref of the cluster's focus, sans any data-offset information 128 * (since offset information is per-node and wouldn't be useful). 129 * 130 * Callers use this function to access modify_tid, mirror_tid, type, 131 * key, and keybits. 132 * 133 * If the cluster is errored, returns an empty bref. 134 * The cluster must be locked. 
135 */ 136 void 137 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 138 { 139 if (cluster->error == 0) { 140 KKASSERT(cluster->focus != NULL); 141 *bref = cluster->focus->bref; 142 bref->data_off = 0; 143 } else { 144 bzero(bref, sizeof(*bref)); 145 } 146 } 147 148 /* 149 * Create a degenerate cluster with one ref from a single locked chain. 150 * The returned cluster will be focused on the chain and inherit its 151 * error state. 152 * 153 * The chain's lock and reference are transfered to the new cluster, so 154 * the caller should not try to unlock the chain separately. 155 * 156 * We fake the flags. 157 */ 158 void 159 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain) 160 { 161 hammer2_cluster_t *cluster; 162 163 bzero(xop, sizeof(*xop)); 164 165 cluster = &xop->cluster; 166 cluster->array[0].chain = chain; 167 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 168 cluster->nchains = 1; 169 cluster->focus = chain; 170 cluster->focus_index = 0; 171 cluster->pmp = chain->pmp; 172 cluster->refs = 1; 173 cluster->error = chain->error; 174 cluster->flags = HAMMER2_CLUSTER_LOCKED | 175 HAMMER2_CLUSTER_WRHARD | 176 HAMMER2_CLUSTER_RDHARD | 177 HAMMER2_CLUSTER_MSYNCED | 178 HAMMER2_CLUSTER_SSYNCED; 179 } 180 181 /* 182 * Add a reference to a cluster and its underlying chains. 183 * 184 * We must also ref the underlying chains in order to allow ref/unlock 185 * sequences to later re-lock. 186 */ 187 void 188 hammer2_cluster_ref(hammer2_cluster_t *cluster) 189 { 190 atomic_add_int(&cluster->refs, 1); 191 } 192 193 /* 194 * Drop the caller's reference to the cluster. When the ref count drops to 195 * zero this function frees the cluster and drops all underlying chains. 196 * 197 * In-progress read I/Os are typically detached from the cluster once the 198 * first one returns (the remaining stay attached to the DIOs but are then 199 * ignored and drop naturally). 
200 */ 201 void 202 hammer2_cluster_drop(hammer2_cluster_t *cluster) 203 { 204 hammer2_chain_t *chain; 205 int i; 206 207 KKASSERT(cluster->refs > 0); 208 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 209 cluster->focus = NULL; /* safety XXX chg to assert */ 210 cluster->focus_index = 0; 211 212 for (i = 0; i < cluster->nchains; ++i) { 213 chain = cluster->array[i].chain; 214 if (chain) { 215 hammer2_chain_drop(chain); 216 cluster->array[i].chain = NULL; /* safety */ 217 } 218 } 219 cluster->nchains = 0; /* safety */ 220 221 kfree(cluster, M_HAMMER2); 222 /* cluster is invalid */ 223 } 224 } 225 226 /* 227 * Lock a cluster. Cluster must already be referenced. Focus is maintained. 228 * 229 * WARNING! This function expects the caller to handle resolution of the 230 * cluster. We never re-resolve the cluster in this function, 231 * because it might be used to temporarily unlock/relock a cparent 232 * in an iteration or recursrion, and the cparents elements do not 233 * necessarily match. 234 */ 235 void 236 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 237 { 238 hammer2_chain_t *chain; 239 int i; 240 241 /* cannot be on inode-embedded cluster template, must be on copy */ 242 KKASSERT(cluster->refs > 0); 243 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 244 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 245 panic("hammer2_cluster_lock: cluster %p already locked!\n", 246 cluster); 247 } 248 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 249 250 /* 251 * Lock chains and resolve state. 
252 */ 253 for (i = 0; i < cluster->nchains; ++i) { 254 chain = cluster->array[i].chain; 255 if (chain == NULL) 256 continue; 257 hammer2_chain_lock(chain, how); 258 } 259 } 260 261 void 262 hammer2_cluster_unhold(hammer2_cluster_t *cluster) 263 { 264 hammer2_chain_t *chain; 265 int i; 266 267 for (i = 0; i < cluster->nchains; ++i) { 268 chain = cluster->array[i].chain; 269 if (chain == NULL) 270 continue; 271 hammer2_chain_unhold(chain); 272 } 273 } 274 275 void 276 hammer2_cluster_rehold(hammer2_cluster_t *cluster) 277 { 278 hammer2_chain_t *chain; 279 int i; 280 281 for (i = 0; i < cluster->nchains; ++i) { 282 chain = cluster->array[i].chain; 283 if (chain == NULL) 284 continue; 285 hammer2_chain_rehold(chain); 286 } 287 } 288 289 /* 290 * Calculate the clustering state for the cluster and set its focus. 291 * This routine must be called with care. For example, it should not 292 * normally be called after relocking a non-leaf cluster because parent 293 * clusters help iterations and each element might be at a slightly different 294 * indirect node (each node's topology is independently indexed). 295 * 296 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 297 * operations. Typically this is only set on a quorum of MASTERs or 298 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 299 * is present, this bit is *not* set on a quorum of MASTERs. The 300 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 301 * that create/modify/delete elements use it. 302 * 303 * The chains making up the cluster may be narrowed down based on quorum 304 * acceptability. 305 * 306 * If a failure occurs the operation must be aborted by higher-level code and 307 * retried. 
 *          XXX
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *focus;
        hammer2_pfs_t *pmp;
        hammer2_tid_t quorum_tid;
        hammer2_tid_t last_best_quorum_tid;
        int focus_pfs_type;
        uint32_t nflags;
        int ttlmasters;
        int ttlslaves;
        int nmasters;
        int nslaves;
        int nquorum;
        int smpresent;
        int i;

        cluster->error = 0;
        cluster->focus = NULL;

        focus_pfs_type = 0;
        nflags = 0;
        ttlmasters = 0;
        ttlslaves = 0;
        nmasters = 0;
        nslaves = 0;

        /*
         * Calculate quorum.  A simple majority of the configured masters
         * is required (nmasters/2 + 1).
         */
        pmp = cluster->pmp;
        KKASSERT(pmp != NULL || cluster->nchains == 0);
        nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
        smpresent = 0;

        /*
         * Pass 1
         *
         * Count total masters/slaves and accumulate soft flags.  Errored
         * chains still contribute to the totals but are otherwise skipped.
         *
         * NOTE: A NULL chain is not necessarily an error, it could be
         *       e.g. a lookup failure or the end of an iteration.
         *       Process normally.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain && chain->error) {
                        if (cluster->focus == NULL || cluster->focus == chain) {
                                /* error will be overridden by valid focus */
                                cluster->error = chain->error;
                        }

                        /*
                         * Must count total masters and slaves whether the
                         * chain is errored or not.
                         */
                        switch (cluster->pmp->pfs_types[i]) {
                        case HAMMER2_PFSTYPE_SUPROOT:
                        case HAMMER2_PFSTYPE_MASTER:
                                ++ttlmasters;
                                break;
                        case HAMMER2_PFSTYPE_SLAVE:
                                ++ttlslaves;
                                break;
                        }
                        continue;
                }
                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        ++ttlmasters;
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        ++ttlslaves;
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        nflags |= HAMMER2_CLUSTER_WRSOFT;
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        smpresent = 1;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        break;
                case HAMMER2_PFSTYPE_SUPROOT:
                        /*
                         * Degenerate cluster representing the super-root
                         * topology on a single device.  Fake stuff so
                         * cluster ops work as expected.
                         */
                        nflags |= HAMMER2_CLUSTER_WRHARD;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        cluster->focus_index = i;
                        cluster->focus = chain;
                        cluster->error = chain ? chain->error : 0;
                        ++ttlmasters;
                        break;
                default:
                        break;
                }
        }

        /*
         * Pass 2
         *
         * Resolve masters.  Calculate nmasters for the highest matching
         * TID, if a quorum cannot be attained try the next lower matching
         * TID until we exhaust TIDs.  Each outer iteration considers only
         * modify_tids strictly below last_best_quorum_tid.
         *
         * NOTE: A NULL chain is not necessarily an error, it could be
         *       e.g. a lookup failure or the end of an iteration.
         *       Process normally.
         */
        last_best_quorum_tid = HAMMER2_TID_MAX;
        quorum_tid = 0;         /* fix gcc warning */

        while (nmasters < nquorum && last_best_quorum_tid != 0) {
                nmasters = 0;
                quorum_tid = 0;

                for (i = 0; i < cluster->nchains; ++i) {
                        /* only masters (and SUPROOT) participate in quorum */
                        switch (cluster->pmp->pfs_types[i]) {
                        case HAMMER2_PFSTYPE_SUPROOT:
                        case HAMMER2_PFSTYPE_MASTER:
                                break;
                        default:
                                continue;
                        }
                        chain = cluster->array[i].chain;

                        if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
                                /*
                                 * Invalid as in unsynchronized, cannot be
                                 * used to calculate the quorum.
                                 */
                        } else if (chain == NULL && quorum_tid == 0) {
                                /*
                                 * NULL chain on master matches NULL chains
                                 * on other masters.
                                 */
                                ++nmasters;
                        } else if (quorum_tid < last_best_quorum_tid &&
                                   chain != NULL &&
                                   (quorum_tid < chain->bref.modify_tid ||
                                    nmasters == 0)) {
                                /*
                                 * Better TID located, reset nmasters count.
                                 */
                                nmasters = 1;
                                quorum_tid = chain->bref.modify_tid;
                        } else if (chain &&
                                   quorum_tid == chain->bref.modify_tid) {
                                /*
                                 * TID matches current collection.
                                 */
                                ++nmasters;
                        }
                }
                if (nmasters >= nquorum)
                        break;
                last_best_quorum_tid = quorum_tid;
        }

        /*
         * Pass 3
         *
         * Classify each element against the quorum TID resolved above,
         * set FEMOD eligibility and choose the focus.
         *
         * NOTE: A NULL chain is not necessarily an error, it could be
         *       e.g. a lookup failure or the end of an iteration.
         *       Process normally.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
                chain = cluster->array[i].chain;
                if (chain && chain->error) {
                        if (cluster->focus == NULL || cluster->focus == chain) {
                                /* error will be overridden by valid focus */
                                cluster->error = chain->error;
                        }
                        continue;
                }

                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        /*
                         * We must have enough up-to-date masters to reach
                         * a quorum and the master modify_tid must match
                         * the quorum's modify_tid.
                         *
                         * Do not select an errored or out-of-sync master.
                         */
                        if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
                                nflags |= HAMMER2_CLUSTER_UNHARD;
                        } else if (nmasters >= nquorum &&
                                   (chain == NULL || chain->error == 0) &&
                                   ((chain == NULL && quorum_tid == 0) ||
                                    (chain != NULL && quorum_tid ==
                                     chain->bref.modify_tid))) {
                                nflags |= HAMMER2_CLUSTER_WRHARD;
                                nflags |= HAMMER2_CLUSTER_RDHARD;
                                if (!smpresent) {
                                        cluster->array[i].flags |=
                                                        HAMMER2_CITEM_FEMOD;
                                }
                                if (cluster->focus == NULL ||
                                    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
                                        focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
                                        cluster->focus_index = i;
                                        cluster->focus = chain; /* NULL ok */
                                        cluster->error = chain ? chain->error :
                                                                 0;
                                }
                        } else if (chain == NULL || chain->error == 0) {
                                nflags |= HAMMER2_CLUSTER_UNHARD;
                        }
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        /*
                         * We must have enough up-to-date masters to reach
                         * a quorum and the slave modify_tid must match the
                         * quorum's modify_tid.
                         *
                         * Do not select an errored slave.
                         */
                        if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
                                nflags |= HAMMER2_CLUSTER_UNHARD;
                        } else if (nmasters >= nquorum &&
                                   (chain == NULL || chain->error == 0) &&
                                   ((chain == NULL && quorum_tid == 0) ||
                                    (chain && quorum_tid ==
                                     chain->bref.modify_tid))) {
                                ++nslaves;
                                nflags |= HAMMER2_CLUSTER_RDHARD;
                        } else if (chain == NULL || chain->error == 0) {
                                nflags |= HAMMER2_CLUSTER_UNSOFT;
                        }
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        /*
                         * Directly mounted soft master always wins.  There
                         * should be only one.
                         */
                        KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
                        cluster->focus_index = i;
                        cluster->focus = chain;
                        cluster->error = chain ? chain->error : 0;
                        focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
                        cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        /*
                         * Directly mounted soft slave always wins.  There
                         * should be only one.
                         */
                        KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
                        if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
                                cluster->focus_index = i;
                                cluster->focus = chain;
                                cluster->error = chain ? chain->error : 0;
                                focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
                        }
                        break;
                case HAMMER2_PFSTYPE_SUPROOT:
                        /*
                         * spmp (degenerate case)
                         */
                        KKASSERT(i == 0);
                        cluster->focus_index = i;
                        cluster->focus = chain;
                        cluster->error = chain ? chain->error : 0;
                        focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
                        cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
                        break;
                default:
                        break;
                }
        }

        /*
         * Focus now set, adjust ddflag.  Skip this pass if the focus
         * is bad or if we are at the PFS root (the bref won't match at
         * the PFS root, obviously).
         */
        focus = cluster->focus;
        if (focus) {
                cluster->ddflag =
                        (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
        } else {
                cluster->ddflag = 0;
                goto skip4;
        }
        if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                goto skip4;

        /*
         * Pass 4
         *
         * Validate the elements that were not marked invalid.  They should
         * match the focus; mismatching elements are marked INVALID.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                int ddflag;

                chain = cluster->array[i].chain;

                if (chain == NULL)
                        continue;
                if (chain == focus)
                        continue;
                if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
                        continue;

                ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
                if (chain->bref.type != focus->bref.type ||
                    chain->bref.key != focus->bref.key ||
                    chain->bref.keybits != focus->bref.keybits ||
                    chain->bref.modify_tid != focus->bref.modify_tid ||
                    chain->bytes != focus->bytes ||
                    ddflag != cluster->ddflag) {
                        cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
                        if (hammer2_debug & 1)
                        kprintf("cluster_resolve: matching modify_tid failed "
                                "bref test: idx=%d type=%02x/%02x "
                                "key=%016jx/%d-%016jx/%d "
                                "mod=%016jx/%016jx bytes=%u/%u\n",
                                i,
                                chain->bref.type, focus->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                focus->bref.key, focus->bref.keybits,
                                chain->bref.modify_tid, focus->bref.modify_tid,
                                chain->bytes, focus->bytes);
                        if (hammer2_debug & 0x4000)
                                panic("cluster_resolve");
                        /* flag issue and force resync? */
                }
        }
skip4:

        if (ttlslaves == 0)
                nflags |= HAMMER2_CLUSTER_NOSOFT;
        if (ttlmasters == 0)
                nflags |= HAMMER2_CLUSTER_NOHARD;

        /*
         * Set SSYNCED or MSYNCED for slaves and masters respectively if
         * all available nodes (even if 0 are available) are fully
         * synchronized.  This is used by the synchronization thread to
         * determine if there is work it could potentially accomplish.
         */
        if (nslaves == ttlslaves)
                nflags |= HAMMER2_CLUSTER_SSYNCED;
        if (nmasters == ttlmasters)
                nflags |= HAMMER2_CLUSTER_MSYNCED;

        /*
         * Determine if the cluster was successfully locked for the
         * requested operation and generate an error code.  The cluster
         * will not be locked (or ref'd) if an error is returned.
         */
        atomic_set_int(&cluster->flags, nflags);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}

/*
 * This is used by the XOPS subsystem to calculate the state of
 * the collection and tell hammer2_xop_collect() what to do with it.
 * The collection can be in various states of desynchronization, the
 * caller specifically wants to resolve the passed-in key.
 *
 * Return values:
 *      0               - Quorum agreement, key is valid
 *
 *      ENOENT          - Quorum agreement, end of scan
 *
 *      ESRCH           - Quorum agreement, key is INVALID (caller should
 *                        skip key).
 *
 *      EIO             - Quorum agreement but all elements had errors.
 *
 *      EDEADLK         - No quorum agreement possible for key, a repair
 *                        may be needed.  Caller has to decide what to do,
 *                        possibly iterating the key or generating an EIO.
 *
 *      EINPROGRESS     - No quorum agreement yet, but agreement is still
 *                        possible if caller waits for more responses.  Caller
 *                        should not iterate key.
 *
 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
 *       (NOTE(review): no HMNT2_LOCAL short-circuit is visible in this
 *       function itself -- presumably handled by the caller; confirm.)
 *
 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
 */
int
hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *focus;
        hammer2_pfs_t *pmp;
        hammer2_tid_t quorum_tid;
        hammer2_tid_t last_best_quorum_tid;
        uint32_t nflags;
        int ttlmasters;
        int ttlslaves;
        int nmasters;
        int nmasters_keymatch;
        int nslaves;
        int nquorum;
        int umasters;   /* unknown masters (still in progress) */
        int smpresent;
        int error;
        int i;

        cluster->error = 0;
        cluster->focus = NULL;

        pmp = cluster->pmp;
        KKASSERT(pmp != NULL || cluster->nchains == 0);

        /*
         * Calculate quorum (simple majority of configured masters).
         */
        nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
        smpresent = 0;
        nflags = 0;
        ttlmasters = 0;
        ttlslaves = 0;

        /*
         * Pass 1
         *
         * Start every element FEMOD-clear and INVALID; later passes
         * validate the elements that agree with the quorum.
         *
         * NOTE: A NULL chain is not necessarily an error, it could be
         *       e.g. a lookup failure or the end of an iteration.
         *       Process normally.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
                cluster->array[i].flags |= HAMMER2_CITEM_INVALID;

                chain = cluster->array[i].chain;
                error = cluster->array[i].error;
                if (chain && error) {
                        if (cluster->focus == NULL || cluster->focus == chain) {
                                /* error will be overridden by valid focus */
                                /* XXX */
                        }

                        /*
                         * Must count total masters and slaves whether the
                         * chain is errored or not.
                         */
                        switch (cluster->pmp->pfs_types[i]) {
                        case HAMMER2_PFSTYPE_SUPROOT:
                        case HAMMER2_PFSTYPE_MASTER:
                                ++ttlmasters;
                                break;
                        case HAMMER2_PFSTYPE_SLAVE:
                                ++ttlslaves;
                                break;
                        }
                        continue;
                }
                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        ++ttlmasters;
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        ++ttlslaves;
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        nflags |= HAMMER2_CLUSTER_WRSOFT;
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        smpresent = 1;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        nflags |= HAMMER2_CLUSTER_RDSOFT;
                        break;
                case HAMMER2_PFSTYPE_SUPROOT:
                        /*
                         * Degenerate cluster representing the super-root
                         * topology on a single device.  Fake stuff so
                         * cluster ops work as expected.
                         */
                        ++ttlmasters;
                        nflags |= HAMMER2_CLUSTER_WRHARD;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        cluster->focus_index = i;
                        cluster->focus = chain;
                        cluster->error = error;
                        break;
                default:
                        break;
                }
        }

        /*
         * Pass 2
         *
         * Resolve nmasters             - master nodes fully match
         *
         * Resolve umasters             - master nodes operation still
         *                                in progress
         *
         * Resolve nmasters_keymatch    - master nodes match the passed-in
         *                                key and may or may not match
         *                                the quorum-agreed tid.
         *
         * The quorum-agreed TID is the highest matching TID.  The outer
         * loop retries with successively lower TIDs until a quorum is
         * reached or the TIDs are exhausted.
         */
        last_best_quorum_tid = HAMMER2_TID_MAX;
        umasters = 0;
        nmasters = 0;
        nmasters_keymatch = 0;
        quorum_tid = 0;         /* fix gcc warning */

        while (nmasters < nquorum && last_best_quorum_tid != 0) {
                umasters = 0;
                nmasters = 0;
                nmasters_keymatch = 0;
                quorum_tid = 0;

                for (i = 0; i < cluster->nchains; ++i) {
                        /* XXX SOFT smpresent handling */
                        switch(cluster->pmp->pfs_types[i]) {
                        case HAMMER2_PFSTYPE_MASTER:
                        case HAMMER2_PFSTYPE_SUPROOT:
                                break;
                        default:
                                continue;
                        }

                        chain = cluster->array[i].chain;
                        error = cluster->array[i].error;

                        /*
                         * Skip elements still in progress.  umasters keeps
                         * track of masters that might still be in-progress.
                         */
                        if (chain == NULL && (cluster->array[i].flags &
                                              HAMMER2_CITEM_NULL) == 0) {
                                ++umasters;
                                continue;
                        }

                        /*
                         * Key match?
                         */
                        if (flags & HAMMER2_CHECK_NULL) {
                                if (chain == NULL) {
                                        ++nmasters;
                                        ++nmasters_keymatch;
                                        if (cluster->error == 0)
                                                cluster->error = error;
                                }
                        } else if (chain &&
                                   (key == (hammer2_key_t)-1 ||
                                    chain->bref.key == key)) {
                                ++nmasters_keymatch;

                                if (chain->bref.modify_tid <
                                     last_best_quorum_tid &&
                                    quorum_tid < chain->bref.modify_tid) {
                                        /*
                                         * Select new TID as master if better
                                         * than any found so far in this loop,
                                         * as long as it does not reach the
                                         * best tid found in the previous loop.
                                         */
                                        nmasters = 0;
                                        quorum_tid = chain->bref.modify_tid;
                                }
                                if (quorum_tid == chain->bref.modify_tid) {
                                        /*
                                         * TID matches current collection.
                                         *
                                         * (error handled in next pass)
                                         */
                                        ++nmasters;
                                        if (chain->error == 0) {
                                                cluster->focus = chain;
                                                cluster->focus_index = i;
                                        }
                                }
                        }
                }
                if (nmasters >= nquorum)
                        break;
                last_best_quorum_tid = quorum_tid;
        }

        /*
        kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
                nmasters, nquorum, nmasters_keymatch, umasters);
        */

        /*
         * Early return if we do not have enough masters.
         */
        if (nmasters < nquorum) {
                if (nmasters + umasters >= nquorum)
                        return HAMMER2_ERROR_EINPROGRESS;
                if (nmasters_keymatch < nquorum)
                        return HAMMER2_ERROR_ESRCH;
                return HAMMER2_ERROR_EDEADLK;
        }

        /*
         * Validated end of scan.
         */
        if (flags & HAMMER2_CHECK_NULL) {
                if (cluster->error == 0)
                        cluster->error = HAMMER2_ERROR_ENOENT;
                return cluster->error;
        }

        /*
         * If we have a NULL focus at this point the agreeing quorum all
         * had chain errors.
         */
        if (cluster->focus == NULL)
                return HAMMER2_ERROR_EIO;

        /*
         * Pass 3
         *
         * We have quorum agreement, validate elements, not end of scan.
         */
        nslaves = 0;
        cluster->error = 0;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                error = cluster->array[i].error;
                if (chain == NULL ||
                    chain->bref.key != key ||
                    chain->bref.modify_tid != quorum_tid) {
                        continue;
                }

                /*
                 * Quorum Match
                 *
                 * XXX for now, cumulative error.
                 */
                if (cluster->error == 0)
                        cluster->error = error;

                switch (cluster->pmp->pfs_types[i]) {
                case HAMMER2_PFSTYPE_MASTER:
                        cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
                        nflags |= HAMMER2_CLUSTER_WRHARD;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        break;
                case HAMMER2_PFSTYPE_SLAVE:
                        /*
                         * We must have enough up-to-date masters to reach
                         * a quorum and the slave modify_tid must match the
                         * quorum's modify_tid.
                         *
                         * Do not select an errored slave.
                         */
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        ++nslaves;
                        break;
                case HAMMER2_PFSTYPE_SOFT_MASTER:
                        /*
                         * Directly mounted soft master always wins.  There
                         * should be only one.
                         */
                        cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
                        break;
                case HAMMER2_PFSTYPE_SOFT_SLAVE:
                        /*
                         * Directly mounted soft slave always wins.  There
                         * should be only one.
                         *
                         * XXX
                         */
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
                        break;
                case HAMMER2_PFSTYPE_SUPROOT:
                        /*
                         * spmp (degenerate case)
                         */
                        cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
                        nflags |= HAMMER2_CLUSTER_WRHARD;
                        nflags |= HAMMER2_CLUSTER_RDHARD;
                        break;
                default:
                        break;
                }
        }

        /*
         * Focus now set, adjust ddflag.  Skip this pass if the focus
         * is bad or if we are at the PFS root (the bref won't match at
         * the PFS root, obviously).
         *
         * focus is probably not locked and it isn't safe to test its
         * content (e.g. focus->data, focus->dio, other content).  We
         * do not synchronize the dio to the cpu here.  In fact, in numerous
         * situations the frontend doesn't even need to access its dio/data,
         * so synchronizing it here would be wasteful.
         */
        focus = cluster->focus;
        if (focus) {
                cluster->ddflag =
                        (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
        } else {
                cluster->ddflag = 0;
                goto skip4;
        }
        if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                goto skip4;

        /*
         * Pass 4
         *
         * Validate the elements that were not marked invalid.  They should
         * match the focus; mismatching elements are marked INVALID.
         */
        for (i = 0; i < cluster->nchains; ++i) {
                int ddflag;

                chain = cluster->array[i].chain;

                if (chain == NULL)
                        continue;
                if (chain == focus)
                        continue;
                if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
                        continue;

                ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
                if (chain->bref.type != focus->bref.type ||
                    chain->bref.key != focus->bref.key ||
                    chain->bref.keybits != focus->bref.keybits ||
                    chain->bref.modify_tid != focus->bref.modify_tid ||
                    chain->bytes != focus->bytes ||
                    ddflag != cluster->ddflag) {
                        cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
                        if (hammer2_debug & 1)
                        kprintf("cluster_resolve: matching modify_tid failed "
                                "bref test: idx=%d type=%02x/%02x "
                                "key=%016jx/%d-%016jx/%d "
                                "mod=%016jx/%016jx bytes=%u/%u\n",
                                i,
                                chain->bref.type, focus->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                focus->bref.key, focus->bref.keybits,
                                chain->bref.modify_tid, focus->bref.modify_tid,
                                chain->bytes, focus->bytes);
                        if (hammer2_debug & 0x4000)
                                panic("cluster_resolve");
                        /* flag issue and force resync? */
                }
        }
skip4:

        if (ttlslaves == 0)
                nflags |= HAMMER2_CLUSTER_NOSOFT;
        if (ttlmasters == 0)
                nflags |= HAMMER2_CLUSTER_NOHARD;

        /*
         * Set SSYNCED or MSYNCED for slaves and masters respectively if
         * all available nodes (even if 0 are available) are fully
         * synchronized.  This is used by the synchronization thread to
         * determine if there is work it could potentially accomplish.
         */
        if (nslaves == ttlslaves)
                nflags |= HAMMER2_CLUSTER_SSYNCED;
        if (nmasters == ttlmasters)
                nflags |= HAMMER2_CLUSTER_MSYNCED;

        /*
         * Determine if the cluster was successfully locked for the
         * requested operation and generate an error code.  The cluster
         * will not be locked (or ref'd) if an error is returned.
         */
        atomic_set_int(&cluster->flags, nflags);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);

        return cluster->error;
}

/*
 * This is used by the sync thread to force non-NULL elements of a copy
 * of the pmp->iroot cluster to be good which is required to prime the
 * sync.
 */
void
hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
{
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                if (cluster->array[i].chain)
                        cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
        }
}

/*
 * Unlock a cluster.  Refcount and focus is maintained.
1111 */ 1112 void 1113 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 1114 { 1115 hammer2_chain_t *chain; 1116 int i; 1117 1118 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 1119 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 1120 cluster); 1121 } 1122 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 1123 KKASSERT(cluster->refs > 0); 1124 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 1125 1126 for (i = 0; i < cluster->nchains; ++i) { 1127 chain = cluster->array[i].chain; 1128 if (chain) 1129 hammer2_chain_unlock(chain); 1130 } 1131 } 1132