1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * 59 * - Most complex functions, quorum management on transaction ids. 60 * 61 * - Locking and data accesses must be internally asynchronous. 62 * 63 * - Validate and manage cache coherency primitives (cache state 64 * is stored in chain topologies but must be validated by these 65 * functions). 66 * 67 * (2) Lookups and Scans 68 * hammer2_cluster_lookup() 69 * hammer2_cluster_next() 70 * 71 * - Depend on locking & data retrieval functions, but still complex. 72 * 73 * - Must do quorum management on transaction ids. 74 * 75 * - Lookup and Iteration ops Must be internally asynchronous. 76 * 77 * (3) Modifying Operations 78 * hammer2_cluster_create() 79 * 80 * - Can usually punt on failures, operation continues unless quorum 81 * is lost. If quorum is lost, must wait for resynchronization 82 * (depending on the management mode). 83 * 84 * - Must disconnect node on failures (also not flush), remount, and 85 * resynchronize. 86 * 87 * - Network links (via kdmsg) are relatively easy to issue as the 88 * complex underworkings of hammer2_chain.c don't have to messed 89 * with (the protocol is at a higher level than block-level). 90 * 91 * - Multiple local disk nodes (i.e. block devices) are another matter. 92 * Chain operations have to be dispatched to per-node threads (xN) 93 * because we can't asynchronize potentially very complex chain 94 * operations in hammer2_chain.c (it would be a huge mess). 95 * 96 * (these threads are also used to terminate incoming kdmsg ops from 97 * other machines). 98 * 99 * - Single-node filesystems do not use threads and will simply call 100 * hammer2_chain.c functions directly. This short-cut is handled 101 * at the base of each cluster function. 102 */ 103 #include <sys/cdefs.h> 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/types.h> 107 #include <sys/lock.h> 108 #include <sys/uuid.h> 109 110 #include "hammer2.h" 111 112 /* 113 * Returns the bref type of the cluster's foucs. 114 * 115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 116 * The cluster must be locked. 117 */ 118 uint8_t 119 hammer2_cluster_type(hammer2_cluster_t *cluster) 120 { 121 if (cluster->error == 0) { 122 KKASSERT(cluster->focus != NULL); 123 return(cluster->focus->bref.type); 124 } 125 return 0; 126 } 127 128 /* 129 * Returns the bref of the cluster's focus, sans any data-offset information 130 * (since offset information is per-node and wouldn't be useful). 131 * 132 * Callers use this function to access modify_tid, mirror_tid, type, 133 * key, and keybits. 134 * 135 * If the cluster is errored, returns an empty bref. 136 * The cluster must be locked. 137 */ 138 void 139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 140 { 141 if (cluster->error == 0) { 142 KKASSERT(cluster->focus != NULL); 143 *bref = cluster->focus->bref; 144 bref->data_off = 0; 145 } else { 146 bzero(bref, sizeof(*bref)); 147 } 148 } 149 150 /* 151 * Create a degenerate cluster with one ref from a single locked chain. 152 * The returned cluster will be focused on the chain and inherit its 153 * error state. 154 * 155 * The chain's lock and reference are transfered to the new cluster, so 156 * the caller should not try to unlock the chain separately. 157 * 158 * We fake the flags. 159 */ 160 void 161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain) 162 { 163 hammer2_cluster_t *cluster; 164 165 bzero(xop, sizeof(*xop)); 166 167 cluster = &xop->cluster; 168 cluster->array[0].chain = chain; 169 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 170 cluster->nchains = 1; 171 cluster->focus = chain; 172 cluster->focus_index = 0; 173 cluster->pmp = chain->pmp; 174 cluster->refs = 1; 175 cluster->error = chain->error; 176 cluster->flags = HAMMER2_CLUSTER_LOCKED | 177 HAMMER2_CLUSTER_WRHARD | 178 HAMMER2_CLUSTER_RDHARD | 179 HAMMER2_CLUSTER_MSYNCED | 180 HAMMER2_CLUSTER_SSYNCED; 181 } 182 183 /* 184 * Add a reference to a cluster and its underlying chains. 185 * 186 * We must also ref the underlying chains in order to allow ref/unlock 187 * sequences to later re-lock. 188 */ 189 void 190 hammer2_cluster_ref(hammer2_cluster_t *cluster) 191 { 192 atomic_add_int(&cluster->refs, 1); 193 } 194 195 /* 196 * Drop the caller's reference to the cluster. When the ref count drops to 197 * zero this function frees the cluster and drops all underlying chains. 198 * 199 * In-progress read I/Os are typically detached from the cluster once the 200 * first one returns (the remaining stay attached to the DIOs but are then 201 * ignored and drop naturally). 202 */ 203 void 204 hammer2_cluster_drop(hammer2_cluster_t *cluster) 205 { 206 hammer2_chain_t *chain; 207 int i; 208 209 KKASSERT(cluster->refs > 0); 210 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 211 cluster->focus = NULL; /* safety XXX chg to assert */ 212 cluster->focus_index = 0; 213 214 for (i = 0; i < cluster->nchains; ++i) { 215 chain = cluster->array[i].chain; 216 if (chain) { 217 hammer2_chain_drop(chain); 218 cluster->array[i].chain = NULL; /* safety */ 219 } 220 } 221 cluster->nchains = 0; /* safety */ 222 223 kfree(cluster, M_HAMMER2); 224 /* cluster is invalid */ 225 } 226 } 227 228 /* 229 * Lock a cluster. Cluster must already be referenced. Focus is maintained. 230 * 231 * WARNING! This function expects the caller to handle resolution of the 232 * cluster. We never re-resolve the cluster in this function, 233 * because it might be used to temporarily unlock/relock a cparent 234 * in an iteration or recursrion, and the cparents elements do not 235 * necessarily match. 236 */ 237 void 238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 239 { 240 hammer2_chain_t *chain; 241 int i; 242 243 /* cannot be on inode-embedded cluster template, must be on copy */ 244 KKASSERT(cluster->refs > 0); 245 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 246 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 247 panic("hammer2_cluster_lock: cluster %p already locked!\n", 248 cluster); 249 } 250 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 251 252 /* 253 * Lock chains and resolve state. 254 */ 255 for (i = 0; i < cluster->nchains; ++i) { 256 chain = cluster->array[i].chain; 257 if (chain == NULL) 258 continue; 259 hammer2_chain_lock(chain, how); 260 } 261 } 262 263 void 264 hammer2_cluster_unhold(hammer2_cluster_t *cluster) 265 { 266 hammer2_chain_t *chain; 267 int i; 268 269 for (i = 0; i < cluster->nchains; ++i) { 270 chain = cluster->array[i].chain; 271 if (chain == NULL) 272 continue; 273 hammer2_chain_unhold(chain); 274 } 275 } 276 277 void 278 hammer2_cluster_rehold(hammer2_cluster_t *cluster) 279 { 280 hammer2_chain_t *chain; 281 int i; 282 283 for (i = 0; i < cluster->nchains; ++i) { 284 chain = cluster->array[i].chain; 285 if (chain == NULL) 286 continue; 287 hammer2_chain_rehold(chain); 288 } 289 } 290 291 /* 292 * Calculate the clustering state for the cluster and set its focus. 293 * This routine must be called with care. For example, it should not 294 * normally be called after relocking a non-leaf cluster because parent 295 * clusters help iterations and each element might be at a slightly different 296 * indirect node (each node's topology is independently indexed). 297 * 298 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 299 * operations. Typically this is only set on a quorum of MASTERs or 300 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 301 * is present, this bit is *not* set on a quorum of MASTERs. The 302 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 303 * that create/modify/delete elements use it. 304 * 305 * The chains making up the cluster may be narrowed down based on quorum 306 * acceptability. 307 * 308 * If a failure occurs the operation must be aborted by higher-level code and 309 * retried. XXX 310 */ 311 void 312 hammer2_cluster_resolve(hammer2_cluster_t *cluster) 313 { 314 hammer2_chain_t *chain; 315 hammer2_chain_t *focus; 316 hammer2_pfs_t *pmp; 317 hammer2_tid_t quorum_tid; 318 hammer2_tid_t last_best_quorum_tid; 319 int focus_pfs_type; 320 uint32_t nflags; 321 int ttlmasters; 322 int ttlslaves; 323 int nmasters; 324 int nslaves; 325 int nquorum; 326 int smpresent; 327 int i; 328 329 cluster->error = 0; 330 cluster->focus = NULL; 331 332 focus_pfs_type = 0; 333 nflags = 0; 334 ttlmasters = 0; 335 ttlslaves = 0; 336 nmasters = 0; 337 nslaves = 0; 338 339 /* 340 * Calculate quorum 341 */ 342 pmp = cluster->pmp; 343 KKASSERT(pmp != NULL || cluster->nchains == 0); 344 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 345 smpresent = 0; 346 347 /* 348 * Pass 1 349 * 350 * NOTE: A NULL chain is not necessarily an error, it could be 351 * e.g. a lookup failure or the end of an iteration. 352 * Process normally. 353 */ 354 for (i = 0; i < cluster->nchains; ++i) { 355 chain = cluster->array[i].chain; 356 if (chain && chain->error) { 357 if (cluster->focus == NULL || cluster->focus == chain) { 358 /* error will be overridden by valid focus */ 359 cluster->error = chain->error; 360 } 361 362 /* 363 * Must count total masters and slaves whether the 364 * chain is errored or not. 365 */ 366 switch (cluster->pmp->pfs_types[i]) { 367 case HAMMER2_PFSTYPE_SUPROOT: 368 case HAMMER2_PFSTYPE_MASTER: 369 ++ttlmasters; 370 break; 371 case HAMMER2_PFSTYPE_SLAVE: 372 ++ttlslaves; 373 break; 374 } 375 continue; 376 } 377 switch (cluster->pmp->pfs_types[i]) { 378 case HAMMER2_PFSTYPE_MASTER: 379 ++ttlmasters; 380 break; 381 case HAMMER2_PFSTYPE_SLAVE: 382 ++ttlslaves; 383 break; 384 case HAMMER2_PFSTYPE_SOFT_MASTER: 385 nflags |= HAMMER2_CLUSTER_WRSOFT; 386 nflags |= HAMMER2_CLUSTER_RDSOFT; 387 smpresent = 1; 388 break; 389 case HAMMER2_PFSTYPE_SOFT_SLAVE: 390 nflags |= HAMMER2_CLUSTER_RDSOFT; 391 break; 392 case HAMMER2_PFSTYPE_SUPROOT: 393 /* 394 * Degenerate cluster representing the super-root 395 * topology on a single device. Fake stuff so 396 * cluster ops work as expected. 397 */ 398 nflags |= HAMMER2_CLUSTER_WRHARD; 399 nflags |= HAMMER2_CLUSTER_RDHARD; 400 cluster->focus_index = i; 401 cluster->focus = chain; 402 cluster->error = chain ? chain->error : 0; 403 ++ttlmasters; 404 break; 405 default: 406 break; 407 } 408 } 409 410 /* 411 * Pass 2 412 * 413 * Resolve masters. Calculate nmasters for the highest matching 414 * TID, if a quorum cannot be attained try the next lower matching 415 * TID until we exhaust TIDs. 416 * 417 * NOTE: A NULL chain is not necessarily an error, it could be 418 * e.g. a lookup failure or the end of an iteration. 419 * Process normally. 420 */ 421 last_best_quorum_tid = HAMMER2_TID_MAX; 422 quorum_tid = 0; /* fix gcc warning */ 423 424 while (nmasters < nquorum && last_best_quorum_tid != 0) { 425 nmasters = 0; 426 quorum_tid = 0; 427 428 for (i = 0; i < cluster->nchains; ++i) { 429 switch (cluster->pmp->pfs_types[i]) { 430 case HAMMER2_PFSTYPE_SUPROOT: 431 case HAMMER2_PFSTYPE_MASTER: 432 break; 433 default: 434 continue; 435 } 436 chain = cluster->array[i].chain; 437 438 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 439 /* 440 * Invalid as in unsynchronized, cannot be 441 * used to calculate the quorum. 442 */ 443 } else if (chain == NULL && quorum_tid == 0) { 444 /* 445 * NULL chain on master matches NULL chains 446 * on other masters. 447 */ 448 ++nmasters; 449 } else if (quorum_tid < last_best_quorum_tid && 450 chain != NULL && 451 (quorum_tid < chain->bref.modify_tid || 452 nmasters == 0)) { 453 /* 454 * Better TID located, reset nmasters count. 455 */ 456 nmasters = 1; 457 quorum_tid = chain->bref.modify_tid; 458 } else if (chain && 459 quorum_tid == chain->bref.modify_tid) { 460 /* 461 * TID matches current collection. 462 */ 463 ++nmasters; 464 } 465 } 466 if (nmasters >= nquorum) 467 break; 468 last_best_quorum_tid = quorum_tid; 469 } 470 471 /* 472 * Pass 3 473 * 474 * NOTE: A NULL chain is not necessarily an error, it could be 475 * e.g. a lookup failure or the end of an iteration. 476 * Process normally. 477 */ 478 for (i = 0; i < cluster->nchains; ++i) { 479 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 480 chain = cluster->array[i].chain; 481 if (chain && chain->error) { 482 if (cluster->focus == NULL || cluster->focus == chain) { 483 /* error will be overridden by valid focus */ 484 cluster->error = chain->error; 485 } 486 continue; 487 } 488 489 switch (cluster->pmp->pfs_types[i]) { 490 case HAMMER2_PFSTYPE_MASTER: 491 /* 492 * We must have enough up-to-date masters to reach 493 * a quorum and the master modify_tid must match 494 * the quorum's modify_tid. 495 * 496 * Do not select an errored or out-of-sync master. 497 */ 498 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 499 nflags |= HAMMER2_CLUSTER_UNHARD; 500 } else if (nmasters >= nquorum && 501 (chain == NULL || chain->error == 0) && 502 ((chain == NULL && quorum_tid == 0) || 503 (chain != NULL && quorum_tid == 504 chain->bref.modify_tid))) { 505 nflags |= HAMMER2_CLUSTER_WRHARD; 506 nflags |= HAMMER2_CLUSTER_RDHARD; 507 if (!smpresent) { 508 cluster->array[i].flags |= 509 HAMMER2_CITEM_FEMOD; 510 } 511 if (cluster->focus == NULL || 512 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) { 513 focus_pfs_type = HAMMER2_PFSTYPE_MASTER; 514 cluster->focus_index = i; 515 cluster->focus = chain; /* NULL ok */ 516 cluster->error = chain ? chain->error : 517 0; 518 } 519 } else if (chain == NULL || chain->error == 0) { 520 nflags |= HAMMER2_CLUSTER_UNHARD; 521 } 522 break; 523 case HAMMER2_PFSTYPE_SLAVE: 524 /* 525 * We must have enough up-to-date masters to reach 526 * a quorum and the slave modify_tid must match the 527 * quorum's modify_tid. 528 * 529 * Do not select an errored slave. 530 */ 531 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 532 nflags |= HAMMER2_CLUSTER_UNHARD; 533 } else if (nmasters >= nquorum && 534 (chain == NULL || chain->error == 0) && 535 ((chain == NULL && quorum_tid == 0) || 536 (chain && quorum_tid == 537 chain->bref.modify_tid))) { 538 ++nslaves; 539 nflags |= HAMMER2_CLUSTER_RDHARD; 540 } else if (chain == NULL || chain->error == 0) { 541 nflags |= HAMMER2_CLUSTER_UNSOFT; 542 } 543 break; 544 case HAMMER2_PFSTYPE_SOFT_MASTER: 545 /* 546 * Directly mounted soft master always wins. There 547 * should be only one. 548 */ 549 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER); 550 cluster->focus_index = i; 551 cluster->focus = chain; 552 cluster->error = chain ? chain->error : 0; 553 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER; 554 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 555 break; 556 case HAMMER2_PFSTYPE_SOFT_SLAVE: 557 /* 558 * Directly mounted soft slave always wins. There 559 * should be only one. 560 */ 561 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE); 562 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) { 563 cluster->focus_index = i; 564 cluster->focus = chain; 565 cluster->error = chain ? chain->error : 0; 566 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE; 567 } 568 break; 569 case HAMMER2_PFSTYPE_SUPROOT: 570 /* 571 * spmp (degenerate case) 572 */ 573 KKASSERT(i == 0); 574 cluster->focus_index = i; 575 cluster->focus = chain; 576 cluster->error = chain ? chain->error : 0; 577 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT; 578 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 579 break; 580 default: 581 break; 582 } 583 } 584 585 /* 586 * Focus now set, adjust ddflag. Skip this pass if the focus 587 * is bad or if we are at the PFS root (the bref won't match at 588 * the PFS root, obviously). 589 */ 590 focus = cluster->focus; 591 if (focus) { 592 cluster->ddflag = 593 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 594 } else { 595 cluster->ddflag = 0; 596 goto skip4; 597 } 598 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 599 goto skip4; 600 601 /* 602 * Pass 4 603 * 604 * Validate the elements that were not marked invalid. They should 605 * match. 606 */ 607 for (i = 0; i < cluster->nchains; ++i) { 608 int ddflag; 609 610 chain = cluster->array[i].chain; 611 612 if (chain == NULL) 613 continue; 614 if (chain == focus) 615 continue; 616 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 617 continue; 618 619 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 620 if (chain->bref.type != focus->bref.type || 621 chain->bref.key != focus->bref.key || 622 chain->bref.keybits != focus->bref.keybits || 623 chain->bref.modify_tid != focus->bref.modify_tid || 624 chain->bytes != focus->bytes || 625 ddflag != cluster->ddflag) { 626 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 627 if (hammer2_debug & 1) 628 kprintf("cluster_resolve: matching modify_tid failed " 629 "bref test: idx=%d type=%02x/%02x " 630 "key=%016jx/%d-%016jx/%d " 631 "mod=%016jx/%016jx bytes=%u/%u\n", 632 i, 633 chain->bref.type, focus->bref.type, 634 chain->bref.key, chain->bref.keybits, 635 focus->bref.key, focus->bref.keybits, 636 chain->bref.modify_tid, focus->bref.modify_tid, 637 chain->bytes, focus->bytes); 638 if (hammer2_debug & 0x4000) 639 panic("cluster_resolve"); 640 /* flag issue and force resync? */ 641 } 642 } 643 skip4: 644 645 if (ttlslaves == 0) 646 nflags |= HAMMER2_CLUSTER_NOSOFT; 647 if (ttlmasters == 0) 648 nflags |= HAMMER2_CLUSTER_NOHARD; 649 650 /* 651 * Set SSYNCED or MSYNCED for slaves and masters respectively if 652 * all available nodes (even if 0 are available) are fully 653 * synchronized. This is used by the synchronization thread to 654 * determine if there is work it could potentially accomplish. 655 */ 656 if (nslaves == ttlslaves) 657 nflags |= HAMMER2_CLUSTER_SSYNCED; 658 if (nmasters == ttlmasters) 659 nflags |= HAMMER2_CLUSTER_MSYNCED; 660 661 /* 662 * Determine if the cluster was successfully locked for the 663 * requested operation and generate an error code. The cluster 664 * will not be locked (or ref'd) if an error is returned. 665 */ 666 atomic_set_int(&cluster->flags, nflags); 667 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 668 } 669 670 /* 671 * This is used by the XOPS subsystem to calculate the state of 672 * the collection and tell hammer2_xop_collect() what to do with it. 673 * The collection can be in various states of desynchronization, the 674 * caller specifically wants to resolve the passed-in key. 675 * 676 * Return values: 677 * 0 - Quorum agreement, key is valid 678 * 679 * ENOENT - Quorum agreement, end of scan 680 * 681 * ESRCH - Quorum agreement, key is INVALID (caller should 682 * skip key). 683 * 684 * EIO - Quorum agreement but all elements had errors. 685 * 686 * EDEADLK - No quorum agreement possible for key, a repair 687 * may be needed. Caller has to decide what to do, 688 * possibly iterating the key or generating an EIO. 689 * 690 * EINPROGRESS - No quorum agreement yet, but agreement is still 691 * possible if caller waits for more responses. Caller 692 * should not iterate key. 693 * 694 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds. 695 * 696 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE 697 */ 698 int 699 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags) 700 { 701 hammer2_chain_t *chain; 702 hammer2_chain_t *focus; 703 hammer2_pfs_t *pmp; 704 hammer2_tid_t quorum_tid; 705 hammer2_tid_t last_best_quorum_tid; 706 uint32_t nflags; 707 int ttlmasters; 708 int ttlslaves; 709 int nmasters; 710 int nmasters_keymatch; 711 int nslaves; 712 int nquorum; 713 int umasters; /* unknown masters (still in progress) */ 714 int smpresent; 715 int error; 716 int i; 717 718 cluster->error = 0; 719 cluster->focus = NULL; 720 721 pmp = cluster->pmp; 722 KKASSERT(pmp != NULL || cluster->nchains == 0); 723 724 /* 725 * Calculate quorum 726 */ 727 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 728 smpresent = 0; 729 nflags = 0; 730 ttlmasters = 0; 731 ttlslaves = 0; 732 733 /* 734 * Pass 1 735 * 736 * NOTE: A NULL chain is not necessarily an error, it could be 737 * e.g. a lookup failure or the end of an iteration. 738 * Process normally. 739 */ 740 for (i = 0; i < cluster->nchains; ++i) { 741 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 742 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 743 744 chain = cluster->array[i].chain; 745 error = cluster->array[i].error; 746 if (chain && error) { 747 if (cluster->focus == NULL || cluster->focus == chain) { 748 /* error will be overridden by valid focus */ 749 /* XXX */ 750 } 751 752 /* 753 * Must count total masters and slaves whether the 754 * chain is errored or not. 755 */ 756 switch (cluster->pmp->pfs_types[i]) { 757 case HAMMER2_PFSTYPE_SUPROOT: 758 case HAMMER2_PFSTYPE_MASTER: 759 ++ttlmasters; 760 break; 761 case HAMMER2_PFSTYPE_SLAVE: 762 ++ttlslaves; 763 break; 764 } 765 continue; 766 } 767 switch (cluster->pmp->pfs_types[i]) { 768 case HAMMER2_PFSTYPE_MASTER: 769 ++ttlmasters; 770 break; 771 case HAMMER2_PFSTYPE_SLAVE: 772 ++ttlslaves; 773 break; 774 case HAMMER2_PFSTYPE_SOFT_MASTER: 775 nflags |= HAMMER2_CLUSTER_WRSOFT; 776 nflags |= HAMMER2_CLUSTER_RDSOFT; 777 smpresent = 1; 778 break; 779 case HAMMER2_PFSTYPE_SOFT_SLAVE: 780 nflags |= HAMMER2_CLUSTER_RDSOFT; 781 break; 782 case HAMMER2_PFSTYPE_SUPROOT: 783 /* 784 * Degenerate cluster representing the super-root 785 * topology on a single device. Fake stuff so 786 * cluster ops work as expected. 787 */ 788 ++ttlmasters; 789 nflags |= HAMMER2_CLUSTER_WRHARD; 790 nflags |= HAMMER2_CLUSTER_RDHARD; 791 cluster->focus_index = i; 792 cluster->focus = chain; 793 cluster->error = error; 794 break; 795 default: 796 break; 797 } 798 } 799 800 /* 801 * Pass 2 802 * 803 * Resolve nmasters - master nodes fully match 804 * 805 * Resolve umasters - master nodes operation still 806 * in progress 807 * 808 * Resolve nmasters_keymatch - master nodes match the passed-in 809 * key and may or may not match 810 * the quorum-agreed tid. 811 * 812 * The quorum-agreed TID is the highest matching TID. 813 */ 814 last_best_quorum_tid = HAMMER2_TID_MAX; 815 umasters = 0; 816 nmasters = 0; 817 nmasters_keymatch = 0; 818 quorum_tid = 0; /* fix gcc warning */ 819 820 while (nmasters < nquorum && last_best_quorum_tid != 0) { 821 umasters = 0; 822 nmasters = 0; 823 nmasters_keymatch = 0; 824 quorum_tid = 0; 825 826 for (i = 0; i < cluster->nchains; ++i) { 827 /* XXX SOFT smpresent handling */ 828 switch(cluster->pmp->pfs_types[i]) { 829 case HAMMER2_PFSTYPE_MASTER: 830 case HAMMER2_PFSTYPE_SUPROOT: 831 break; 832 default: 833 continue; 834 } 835 836 chain = cluster->array[i].chain; 837 error = cluster->array[i].error; 838 839 /* 840 * Skip elements still in progress. umasters keeps 841 * track of masters that might still be in-progress. 842 */ 843 if (chain == NULL && (cluster->array[i].flags & 844 HAMMER2_CITEM_NULL) == 0) { 845 ++umasters; 846 continue; 847 } 848 849 /* 850 * Key match? 851 */ 852 if (flags & HAMMER2_CHECK_NULL) { 853 if (chain == NULL) { 854 ++nmasters; 855 ++nmasters_keymatch; 856 if (cluster->error == 0) 857 cluster->error = error; 858 } 859 } else if (chain && 860 (key == (hammer2_key_t)-1 || 861 chain->bref.key == key)) { 862 ++nmasters_keymatch; 863 864 if (chain->bref.modify_tid < 865 last_best_quorum_tid && 866 quorum_tid < chain->bref.modify_tid) { 867 /* 868 * Select new TID as master if better 869 * than any found so far in this loop, 870 * as long as it does not reach the 871 * best tid found in the previous loop. 872 */ 873 nmasters = 0; 874 quorum_tid = chain->bref.modify_tid; 875 } 876 if (quorum_tid == chain->bref.modify_tid) { 877 /* 878 * TID matches current collection. 879 * 880 * (error handled in next pass) 881 */ 882 ++nmasters; 883 if (chain->error == 0) { 884 cluster->focus = chain; 885 cluster->focus_index = i; 886 } 887 } 888 } 889 } 890 if (nmasters >= nquorum) 891 break; 892 last_best_quorum_tid = quorum_tid; 893 } 894 895 /* 896 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n", 897 nmasters, nquorum, nmasters_keymatch, umasters); 898 */ 899 900 /* 901 * Early return if we do not have enough masters. 902 */ 903 if (nmasters < nquorum) { 904 if (nmasters + umasters >= nquorum) 905 return HAMMER2_ERROR_EINPROGRESS; 906 if (nmasters_keymatch < nquorum) 907 return HAMMER2_ERROR_ESRCH; 908 return HAMMER2_ERROR_EDEADLK; 909 } 910 911 /* 912 * Validated end of scan. 913 */ 914 if (flags & HAMMER2_CHECK_NULL) { 915 if (cluster->error == 0) 916 cluster->error = HAMMER2_ERROR_ENOENT; 917 return cluster->error; 918 } 919 920 /* 921 * If we have a NULL focus at this point the agreeing quorum all 922 * had chain errors. 923 */ 924 if (cluster->focus == NULL) 925 return HAMMER2_ERROR_EIO; 926 927 /* 928 * Pass 3 929 * 930 * We have quorum agreement, validate elements, not end of scan. 931 */ 932 nslaves = 0; 933 cluster->error = 0; 934 935 for (i = 0; i < cluster->nchains; ++i) { 936 chain = cluster->array[i].chain; 937 error = cluster->array[i].error; 938 if (chain == NULL || 939 chain->bref.key != key || 940 chain->bref.modify_tid != quorum_tid) { 941 continue; 942 } 943 944 /* 945 * Quorum Match 946 * 947 * XXX for now, cumulative error. 948 */ 949 if (cluster->error == 0) 950 cluster->error = error; 951 952 switch (cluster->pmp->pfs_types[i]) { 953 case HAMMER2_PFSTYPE_MASTER: 954 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 955 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 956 nflags |= HAMMER2_CLUSTER_WRHARD; 957 nflags |= HAMMER2_CLUSTER_RDHARD; 958 break; 959 case HAMMER2_PFSTYPE_SLAVE: 960 /* 961 * We must have enough up-to-date masters to reach 962 * a quorum and the slave modify_tid must match the 963 * quorum's modify_tid. 964 * 965 * Do not select an errored slave. 966 */ 967 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 968 nflags |= HAMMER2_CLUSTER_RDHARD; 969 ++nslaves; 970 break; 971 case HAMMER2_PFSTYPE_SOFT_MASTER: 972 /* 973 * Directly mounted soft master always wins. There 974 * should be only one. 975 */ 976 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 977 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 978 break; 979 case HAMMER2_PFSTYPE_SOFT_SLAVE: 980 /* 981 * Directly mounted soft slave always wins. There 982 * should be only one. 983 * 984 * XXX 985 */ 986 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 987 break; 988 case HAMMER2_PFSTYPE_SUPROOT: 989 /* 990 * spmp (degenerate case) 991 */ 992 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 993 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 994 nflags |= HAMMER2_CLUSTER_WRHARD; 995 nflags |= HAMMER2_CLUSTER_RDHARD; 996 break; 997 default: 998 break; 999 } 1000 } 1001 1002 /* 1003 * Focus now set, adjust ddflag. Skip this pass if the focus 1004 * is bad or if we are at the PFS root (the bref won't match at 1005 * the PFS root, obviously). 1006 * 1007 * focus is probably not locked and it isn't safe to test its 1008 * content (e.g. focus->data, focus->dio, other content). We 1009 * do not synchronize the dio to the cpu here. In fact, in numerous 1010 * situations the frontend doesn't even need to access its dio/data, 1011 * so synchronizing it here would be wasteful. 1012 */ 1013 focus = cluster->focus; 1014 if (focus) { 1015 cluster->ddflag = 1016 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 1017 } else { 1018 cluster->ddflag = 0; 1019 goto skip4; 1020 } 1021 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1022 goto skip4; 1023 1024 /* 1025 * Pass 4 1026 * 1027 * Validate the elements that were not marked invalid. They should 1028 * match. 1029 */ 1030 for (i = 0; i < cluster->nchains; ++i) { 1031 int ddflag; 1032 1033 chain = cluster->array[i].chain; 1034 1035 if (chain == NULL) 1036 continue; 1037 if (chain == focus) 1038 continue; 1039 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1040 continue; 1041 1042 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 1043 if (chain->bref.type != focus->bref.type || 1044 chain->bref.key != focus->bref.key || 1045 chain->bref.keybits != focus->bref.keybits || 1046 chain->bref.modify_tid != focus->bref.modify_tid || 1047 chain->bytes != focus->bytes || 1048 ddflag != cluster->ddflag) { 1049 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1050 if (hammer2_debug & 1) 1051 kprintf("cluster_resolve: matching modify_tid failed " 1052 "bref test: idx=%d type=%02x/%02x " 1053 "key=%016jx/%d-%016jx/%d " 1054 "mod=%016jx/%016jx bytes=%u/%u\n", 1055 i, 1056 chain->bref.type, focus->bref.type, 1057 chain->bref.key, chain->bref.keybits, 1058 focus->bref.key, focus->bref.keybits, 1059 chain->bref.modify_tid, focus->bref.modify_tid, 1060 chain->bytes, focus->bytes); 1061 if (hammer2_debug & 0x4000) 1062 panic("cluster_resolve"); 1063 /* flag issue and force resync? */ 1064 } 1065 } 1066 skip4: 1067 1068 if (ttlslaves == 0) 1069 nflags |= HAMMER2_CLUSTER_NOSOFT; 1070 if (ttlmasters == 0) 1071 nflags |= HAMMER2_CLUSTER_NOHARD; 1072 1073 /* 1074 * Set SSYNCED or MSYNCED for slaves and masters respectively if 1075 * all available nodes (even if 0 are available) are fully 1076 * synchronized. This is used by the synchronization thread to 1077 * determine if there is work it could potentially accomplish. 1078 */ 1079 if (nslaves == ttlslaves) 1080 nflags |= HAMMER2_CLUSTER_SSYNCED; 1081 if (nmasters == ttlmasters) 1082 nflags |= HAMMER2_CLUSTER_MSYNCED; 1083 1084 /* 1085 * Determine if the cluster was successfully locked for the 1086 * requested operation and generate an error code. The cluster 1087 * will not be locked (or ref'd) if an error is returned. 1088 */ 1089 atomic_set_int(&cluster->flags, nflags); 1090 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 1091 1092 return cluster->error; 1093 } 1094 1095 /* 1096 * This is used by the sync thread to force non-NULL elements of a copy 1097 * of the pmp->iroot cluster to be good which is required to prime the 1098 * sync. 1099 */ 1100 void 1101 hammer2_cluster_forcegood(hammer2_cluster_t *cluster) 1102 { 1103 int i; 1104 1105 for (i = 0; i < cluster->nchains; ++i) { 1106 if (cluster->array[i].chain) 1107 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1108 } 1109 } 1110 1111 /* 1112 * Unlock a cluster. Refcount and focus is maintained. 1113 */ 1114 void 1115 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 1116 { 1117 hammer2_chain_t *chain; 1118 int i; 1119 1120 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 1121 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 1122 cluster); 1123 } 1124 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 1125 KKASSERT(cluster->refs > 0); 1126 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 1127 1128 for (i = 0; i < cluster->nchains; ++i) { 1129 chain = cluster->array[i].chain; 1130 if (chain) 1131 hammer2_chain_unlock(chain); 1132 } 1133 } 1134