/*
 * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The cluster module collects multiple chains representing the same
 * information from different nodes into a single entity.  It allows direct
 * access to media data as long as it is not blockref array data (which
 * will obviously have to be different at each node).
 *
 * This module also handles I/O dispatch, status rollup, and various
 * mastership arrangements including quorum operations.  It effectively
 * presents one topology to the vnops layer.
 *
 * Many of the API calls mimic chain API calls but operate on clusters
 * instead of chains.  Please see hammer2_chain.c for more complete code
 * documentation of the API functions.
 *
 * WARNING! This module is *extremely* complex.  It must issue asynchronous
 *	    locks and I/O, do quorum and/or master-slave processing, and
 *	    it must operate properly even if some nodes are broken (which
 *	    can also mean indefinite locks).
 *
 *				CLUSTER OPERATIONS
 *
 * Cluster operations can be broken down into three pieces:
 *
 * (1) Chain locking and data retrieval.
 *
 *	- Most complex functions, quorum management on transaction ids.
 *
 *	- Locking and data accesses must be internally asynchronous.
 *
 *	- Validate and manage cache coherency primitives (cache state
 *	  is stored in chain topologies but must be validated by these
 *	  functions).
 *
 * (2) Lookups and Scans
 *		hammer2_cluster_lookup()
 *		hammer2_cluster_next()
 *
 *	- Depend on locking & data retrieval functions, but still complex.
 *
 *	- Must do quorum management on transaction ids.
 *
 *	- Lookup and iteration ops must be internally asynchronous.
 *
 * (3) Modifying Operations
 *		hammer2_cluster_create()
 *
 *	- Can usually punt on failures, operation continues unless quorum
 *	  is lost.  If quorum is lost, must wait for resynchronization
 *	  (depending on the management mode).
 *
 *	- Must disconnect node on failures (and not flush it), remount,
 *	  and resynchronize.
 *
 *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
 *	  with (the protocol is at a higher level than block-level).
 *
 *	- Multiple local disk nodes (i.e. block devices) are another matter.
 *	  Chain operations have to be dispatched to per-node threads (xN)
 *	  because we can't asynchronize potentially very complex chain
 *	  operations in hammer2_chain.c (it would be a huge mess).
 *
 *	  (these threads are also used to terminate incoming kdmsg ops from
 *	  other machines).
 *
 *	- Single-node filesystems do not use threads and will simply call
 *	  hammer2_chain.c functions directly.  This short-cut is handled
 *	  at the base of each cluster function.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns the bref type of the cluster's focus.
 *
 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
 * The cluster must be locked.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
	if (cluster->error == 0) {
		KKASSERT(cluster->focus != NULL);
		return(cluster->focus->bref.type);
	}
	return 0;
}

#ifdef INVARIANTS
/*
 * Returns non-zero if the cluster's focus is flagged as being modified.
 *
 * If the cluster is errored, returns 0.
 */
static
int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
	if (cluster->error == 0) {
		KKASSERT(cluster->focus != NULL);
		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
	}
	return 0;
}
#endif

/*
 * Returns the bref of the cluster's focus, sans any data-offset information
 * (since offset information is per-node and wouldn't be useful).
 *
 * Callers use this function to access modify_tid, mirror_tid, type,
 * key, and keybits.
 *
 * If the cluster is errored, returns an empty bref.
 * The cluster must be locked.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
	if (cluster->error == 0) {
		KKASSERT(cluster->focus != NULL);
		*bref = cluster->focus->bref;
		bref->data_off = 0;
	} else {
		bzero(bref, sizeof(*bref));
	}
}
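
/*
 * Illustrative only: a minimal sketch of how a caller might use the
 * accessors above on a cluster that is already referenced and has been
 * resolved to a valid focus.  The helper below is hypothetical and not
 * part of this file, so it is compiled out.
 */
#if 0
static void
example_print_focus(hammer2_cluster_t *cluster)
{
	hammer2_blockref_t bref;

	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_EMPTY) {
		/* focus is valid; data_off is zeroed by design */
		hammer2_cluster_bref(cluster, &bref);
		kprintf("focus key=%016jx/%d modify_tid=%016jx\n",
			(intmax_t)bref.key, bref.keybits,
			(intmax_t)bref.modify_tid);
	}
	hammer2_cluster_unlock(cluster);
}
#endif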

/*
 * Create a degenerate cluster with one ref from a single locked chain.
 * The returned cluster will be focused on the chain and inherit its
 * error state.
 *
 * The chain's lock and reference are transferred to the new cluster, so
 * the caller should not try to unlock the chain separately.
 *
 * We fake the flags.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->array[0].chain = chain;
	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
	cluster->nchains = 1;
	cluster->focus = chain;
	cluster->focus_index = 0;
	cluster->pmp = chain->pmp;
	cluster->refs = 1;
	cluster->error = chain->error;
	cluster->flags = HAMMER2_CLUSTER_LOCKED |
			 HAMMER2_CLUSTER_WRHARD |
			 HAMMER2_CLUSTER_RDHARD |
			 HAMMER2_CLUSTER_MSYNCED |
			 HAMMER2_CLUSTER_SSYNCED;

	return cluster;
}

/*
 * Add a reference to a cluster and its underlying chains.
 *
 * We must also ref the underlying chains in order to allow ref/unlock
 * sequences to later re-lock.
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
	atomic_add_int(&cluster->refs, 1);
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;		/* safety XXX chg to assert */
		cluster->focus_index = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			chain = cluster->array[i].chain;
			if (chain) {
				hammer2_chain_drop(chain);
				cluster->array[i].chain = NULL;	/* safety */
			}
		}
		cluster->nchains = 0;		/* safety */

		kfree(cluster, M_HAMMER2);
		/* cluster is invalid */
	}
}

/*
 * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
 *
 * WARNING! This function expects the caller to handle resolution of the
 *	    cluster.  We never re-resolve the cluster in this function,
 *	    because it might be used to temporarily unlock/relock a cparent
 *	    in an iteration or recursion, and the cparent's elements do not
 *	    necessarily match.
 */
void
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
	hammer2_chain_t *chain;
	int i;

	/* cannot be on inode-embedded cluster template, must be on copy */
	KKASSERT(cluster->refs > 0);
	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
		panic("hammer2_cluster_lock: cluster %p already locked!\n",
			cluster);
	}
	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	/*
	 * Lock chains and resolve state.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		hammer2_chain_lock(chain, how);
	}
}
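
/*
 * Illustrative only: a minimal sketch of the degenerate single-chain path
 * described above.  The chain's existing lock and reference are handed to
 * the cluster, which is then unlocked and dropped as a unit.  The helper
 * is hypothetical and compiled out.
 */
#if 0
static void
example_single_chain(hammer2_chain_t *chain)	/* chain is locked + ref'd */
{
	hammer2_cluster_t *cluster;

	cluster = hammer2_cluster_from_chain(chain);
	/* ... operate on the one-element cluster via its focus ... */
	hammer2_cluster_unlock(cluster);	/* unlocks the chain */
	hammer2_cluster_drop(cluster);		/* drops the chain's ref */
}
#endif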

/*
 * Calculate the clustering state for the cluster and set its focus.
 * This routine must be called with care.  For example, it should not
 * normally be called after relocking a non-leaf cluster because parent
 * clusters help iterations and each element might be at a slightly different
 * indirect node (each node's topology is independently indexed).
 *
 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
 * operations.  Typically this is only set on a quorum of MASTERs or
 * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
 * is present, this bit is *not* set on a quorum of MASTERs.  The
 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
 * that create/modify/delete elements use it.
 *
 * The chains making up the cluster may be narrowed down based on quorum
 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
 * narrowed down to a single chain as long as the entire subtopology is known
 * to be intact.  So, for example, we can narrow a read-only op to a single
 * fast SLAVE but if we focus a CACHE chain we must still retain at least
 * a SLAVE to ensure that the subtopology can be accessed.
 *
 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
 * to be maintained once the topology is validated as-of the top level of
 * the operation.
 *
 * If a failure occurs the operation must be aborted by higher-level code and
 * retried. XXX
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int smpresent;
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;

	/*
	 * Pass 1
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			++ttlmasters;
			break;
		default:
			break;
		}
	}
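
	/*
	 * Illustration (not from the original source): with
	 * pmp->pfs_nmasters == 3 the quorum computed above is
	 * 3 / 2 + 1 == 2.  In the pass 2 loop below, masters reporting
	 * modify_tids 50, 50 and 40 agree on tid 50 with nmasters == 2,
	 * which satisfies the quorum on the first iteration.
	 */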

	/*
	 * Pass 2
	 *
	 * Resolve masters.  Calculate nmasters for the highest matching
	 * TID, if a quorum cannot be attained try the next lower matching
	 * TID until we exhaust TIDs.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		nmasters = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				break;
			default:
				continue;
			}
			chain = cluster->array[i].chain;

			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				/*
				 * Invalid as in unsynchronized, cannot be
				 * used to calculate the quorum.
				 */
			} else if (chain == NULL && quorum_tid == 0) {
				/*
				 * NULL chain on master matches NULL chains
				 * on other masters.
				 */
				++nmasters;
			} else if (quorum_tid < last_best_quorum_tid &&
				   chain != NULL &&
				   (quorum_tid < chain->bref.modify_tid ||
				    nmasters == 0)) {
				/*
				 * Better TID located, reset nmasters count.
				 */
				nmasters = 1;
				quorum_tid = chain->bref.modify_tid;
			} else if (chain &&
				   quorum_tid == chain->bref.modify_tid) {
				/*
				 * TID matches current collection.
				 */
				++nmasters;
			}
		}
		if (nmasters >= nquorum)
			break;
		last_best_quorum_tid = quorum_tid;
	}

	/*
	 * Pass 3
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}
			continue;
		}

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master modify_tid must match
			 * the quorum's modify_tid.
			 *
			 * Do not select an errored or out-of-sync master.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain != NULL && quorum_tid ==
				     chain->bref.modify_tid))) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (!smpresent) {
					cluster->array[i].flags |=
						HAMMER2_CITEM_FEMOD;
				}
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus_index = i;
					cluster->focus = chain;	/* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain && quorum_tid ==
				     chain->bref.modify_tid))) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
#if 0
				/* XXX optimize for RESOLVE_RDONLY */
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus_index = i;
					cluster->focus = chain;	/* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
#endif
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNSOFT;
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus_index = i;
				cluster->focus = chain;
				cluster->error = chain ? chain->error : 0;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			KKASSERT(i == 0);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match.
	 */
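	/*
	 * Note that only node-independent bref fields are compared below:
	 * type, key, keybits, modify_tid, the chain's byte size, and the
	 * inode ddflag.  data_off is per-node media layout and is allowed
	 * to differ between elements.
	 */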
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
				kprintf("cluster_resolve: matching modify_tid failed "
					"bref test: idx=%d type=%02x/%02x "
					"key=%016jx/%d-%016jx/%d "
					"mod=%016jx/%016jx bytes=%u/%u\n",
					i,
					chain->bref.type, focus->bref.type,
					chain->bref.key, chain->bref.keybits,
					focus->bref.key, focus->bref.keybits,
					chain->bref.modify_tid, focus->bref.modify_tid,
					chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}
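
/*
 * Illustrative only: a minimal sketch of how a collector-style caller
 * might act on the return values documented for hammer2_cluster_check()
 * below.  The handling shown here is hypothetical; the real consumer is
 * the XOPS collection code (hammer2_xop_collect()).
 */
#if 0
static int
example_collect_one(hammer2_cluster_t *cluster, hammer2_key_t key)
{
	int error;

	error = hammer2_cluster_check(cluster, key, 0);
	switch (error) {
	case 0:
		return 0;			/* key valid, use focus */
	case HAMMER2_ERROR_ENOENT:		/* quorum says end of scan */
	case HAMMER2_ERROR_ESRCH:		/* quorum says skip this key */
	case HAMMER2_ERROR_EIO:			/* quorum reached, all errored */
	case HAMMER2_ERROR_EDEADLK:		/* no agreement, possible repair */
		return error;
	case HAMMER2_ERROR_EINPROGRESS:
		/* wait for more node responses, then re-check */
		return error;
	default:
		return error;
	}
}
#endif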

/*
 * This is used by the XOPS subsystem to calculate the state of
 * the collection and tell hammer2_xop_collect() what to do with it.
 * The collection can be in various states of desynchronization; the
 * caller specifically wants to resolve the passed-in key.
 *
 * Return values:
 *	0		- Quorum agreement, key is valid
 *
 *	ENOENT		- Quorum agreement, end of scan
 *
 *	ESRCH		- Quorum agreement, key is INVALID (caller should
 *			  skip key).
 *
 *	EIO		- Quorum agreement but all elements had errors.
 *
 *	EDEADLK		- No quorum agreement possible for key, a repair
 *			  may be needed.  Caller has to decide what to do,
 *			  possibly iterating the key or generating an EIO.
 *
 *	EINPROGRESS	- No quorum agreement yet, but agreement is still
 *			  possible if caller waits for more responses.  Caller
 *			  should not iterate key.
 *
 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
 *
 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
 */
int
hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nmasters_keymatch;
	int nslaves;
	int nquorum;
	int umasters;	/* unknown masters (still in progress) */
	int smpresent;
	int error;
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);

	/*
	 * Calculate quorum
	 */
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;

	/*
	 * Pass 1
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;

		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain && error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				/* XXX */
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			++ttlmasters;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve nmasters		- master nodes fully match
	 *
	 * Resolve umasters		- master nodes operation still
	 *				  in progress
	 *
	 * Resolve nmasters_keymatch	- master nodes match the passed-in
	 *				  key and may or may not match
	 *				  the quorum-agreed tid.
	 *
	 * The quorum-agreed TID is the highest matching TID.
	 */
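	/*
	 * Illustration (not from the original source): with nquorum == 2,
	 * masters reporting modify_tids 50, 40 and 40 for the key fail to
	 * agree on tid 50 in the first iteration (nmasters == 1), so
	 * last_best_quorum_tid drops to 50 and the second iteration
	 * settles on tid 40 with nmasters == 2.  A master whose chain is
	 * still NULL without CITEM_NULL set only bumps umasters, which
	 * can later turn the result into EINPROGRESS instead of a hard
	 * failure.
	 */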
	last_best_quorum_tid = HAMMER2_TID_MAX;
	umasters = 0;
	nmasters = 0;
	nmasters_keymatch = 0;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		umasters = 0;
		nmasters = 0;
		nmasters_keymatch = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* XXX SOFT smpresent handling */
			switch(cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
			case HAMMER2_PFSTYPE_SUPROOT:
				break;
			default:
				continue;
			}

			chain = cluster->array[i].chain;
			error = cluster->array[i].error;

			/*
			 * Skip elements still in progress.  umasters keeps
			 * track of masters that might still be in-progress.
			 */
			if (chain == NULL && (cluster->array[i].flags &
					      HAMMER2_CITEM_NULL) == 0) {
				++umasters;
				continue;
			}

			/*
			 * Key match?
			 */
			if (flags & HAMMER2_CHECK_NULL) {
				if (chain == NULL) {
					++nmasters;
					++nmasters_keymatch;
					if (cluster->error == 0)
						cluster->error = error;
				}
			} else if (chain &&
				   (key == (hammer2_key_t)-1 ||
				    chain->bref.key == key)) {
				++nmasters_keymatch;

				if (chain->bref.modify_tid <
				     last_best_quorum_tid &&
				    quorum_tid < chain->bref.modify_tid) {
					/*
					 * Select new TID as master if better
					 * than any found so far in this loop,
					 * as long as it does not reach the
					 * best tid found in the previous loop.
					 */
					nmasters = 0;
					quorum_tid = chain->bref.modify_tid;
				}
				if (quorum_tid == chain->bref.modify_tid) {
					/*
					 * TID matches current collection.
					 *
					 * (error handled in next pass)
					 */
					++nmasters;
					if (chain->error == 0) {
						cluster->focus = chain;
						cluster->focus_index = i;
					}
				}
			}
		}
		if (nmasters >= nquorum)
			break;
		last_best_quorum_tid = quorum_tid;
	}

	/*
	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
		nmasters, nquorum, nmasters_keymatch, umasters);
	*/

	/*
	 * Early return if we do not have enough masters.
	 */
	if (nmasters < nquorum) {
		if (nmasters + umasters >= nquorum)
			return HAMMER2_ERROR_EINPROGRESS;
		if (nmasters_keymatch < nquorum)
			return HAMMER2_ERROR_ESRCH;
		return HAMMER2_ERROR_EDEADLK;
	}

	/*
	 * Validated end of scan.
	 */
	if (flags & HAMMER2_CHECK_NULL) {
		if (cluster->error == 0)
			cluster->error = HAMMER2_ERROR_ENOENT;
		return cluster->error;
	}

	/*
	 * If we have a NULL focus at this point the agreeing quorum all
	 * had chain errors.
	 */
	if (cluster->focus == NULL)
		return HAMMER2_ERROR_EIO;

	/*
	 * Pass 3
	 *
	 * We have quorum agreement, validate elements, not end of scan.
	 */
	nslaves = 0;
	cluster->error = 0;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain == NULL ||
		    chain->bref.key != key ||
		    chain->bref.modify_tid != quorum_tid) {
			continue;
		}

		/*
		 * Quorum Match
		 *
		 * XXX for now, cumulative error.
		 */
		if (cluster->error == 0)
			cluster->error = error;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			++nslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 *
			 * XXX
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
				kprintf("cluster_check: matching modify_tid failed "
					"bref test: idx=%d type=%02x/%02x "
					"key=%016jx/%d-%016jx/%d "
					"mod=%016jx/%016jx bytes=%u/%u\n",
					i,
					chain->bref.type, focus->bref.type,
					chain->bref.key, chain->bref.keybits,
					focus->bref.key, focus->bref.keybits,
					chain->bref.modify_tid, focus->bref.modify_tid,
					chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_check");
			/* flag issue and force resync? */
		}
	}
skip4:

	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);

	return cluster->error;
}

/*
 * This is used by the sync thread to force non-NULL elements of a copy
 * of the pmp->iroot cluster to be good which is required to prime the
 * sync.
 */
void
hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
{
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i].chain)
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
	}
}

/*
 * Unlock a cluster.  Refcount and focus are maintained.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
			cluster);
	}
	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
	KKASSERT(cluster->refs > 0);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain)
			hammer2_chain_unlock(chain);
	}
}

/************************************************************************
 *				CLUSTER I/O				*
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *	    only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *	    complete if necessary.  The I/O's should have already been
 *	    initiated by the cluster_lock/chain_lock operation.
 *
 *	    The cluster must already be in a modified state before wdata
 *	    is called.  The data will already be available for this case.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;

	chain = cluster->focus;
	KKASSERT(chain != NULL && chain->lockcnt);
	if (chain->dio)
		hammer2_io_bkvasync(chain->dio);
	return (chain->data);
}

hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;

	chain = cluster->focus;
	KKASSERT(chain != NULL && chain->lockcnt &&
		 hammer2_cluster_modified(cluster));
	if (chain->dio)
		hammer2_io_bkvasync(chain->dio);
	return(chain->data);
}
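
/*
 * Illustrative only: a minimal sketch of reading media data through the
 * cluster focus, per the WARNING above (universal data only, never the
 * blockref arrays).  The helper is hypothetical and compiled out.
 */
#if 0
static void
example_read_focus_data(hammer2_cluster_t *cluster)
{
	const hammer2_media_data_t *data;

	/*
	 * Assumes the cluster is referenced and has already been
	 * resolved to a valid, non-errored focus.  Locking initiates
	 * the underlying chain I/O; rdata may wait for it to complete.
	 */
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	data = hammer2_cluster_rdata(cluster);
	/* ... access universal (non-blockref-array) fields of *data ... */
	hammer2_cluster_unlock(cluster);
}
#endif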