1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * 59 * - Most complex functions, quorum management on transaction ids. 60 * 61 * - Locking and data accesses must be internally asynchronous. 62 * 63 * - Validate and manage cache coherency primitives (cache state 64 * is stored in chain topologies but must be validated by these 65 * functions). 66 * 67 * (2) Lookups and Scans 68 * hammer2_cluster_lookup() 69 * hammer2_cluster_next() 70 * 71 * - Depend on locking & data retrieval functions, but still complex. 72 * 73 * - Must do quorum management on transaction ids. 74 * 75 * - Lookup and Iteration ops Must be internally asynchronous. 76 * 77 * (3) Modifying Operations 78 * hammer2_cluster_create() 79 * 80 * - Can usually punt on failures, operation continues unless quorum 81 * is lost. If quorum is lost, must wait for resynchronization 82 * (depending on the management mode). 83 * 84 * - Must disconnect node on failures (also not flush), remount, and 85 * resynchronize. 86 * 87 * - Network links (via kdmsg) are relatively easy to issue as the 88 * complex underworkings of hammer2_chain.c don't have to messed 89 * with (the protocol is at a higher level than block-level). 90 * 91 * - Multiple local disk nodes (i.e. block devices) are another matter. 92 * Chain operations have to be dispatched to per-node threads (xN) 93 * because we can't asynchronize potentially very complex chain 94 * operations in hammer2_chain.c (it would be a huge mess). 95 * 96 * (these threads are also used to terminate incoming kdmsg ops from 97 * other machines). 98 * 99 * - Single-node filesystems do not use threads and will simply call 100 * hammer2_chain.c functions directly. This short-cut is handled 101 * at the base of each cluster function. 102 */ 103 #include <sys/cdefs.h> 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/types.h> 107 #include <sys/lock.h> 108 #include <sys/uuid.h> 109 110 #include "hammer2.h" 111 112 /* 113 * Returns the bref type of the cluster's foucs. 114 * 115 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 116 * The cluster must be locked. 117 */ 118 uint8_t 119 hammer2_cluster_type(hammer2_cluster_t *cluster) 120 { 121 if (cluster->error == 0) { 122 KKASSERT(cluster->focus != NULL); 123 return(cluster->focus->bref.type); 124 } 125 return 0; 126 } 127 128 /* 129 * Returns non-zero if the cluster's focus is flagged as being modified. 130 * 131 * If the cluster is errored, returns 0. 132 */ 133 static 134 int 135 hammer2_cluster_modified(hammer2_cluster_t *cluster) 136 { 137 if (cluster->error == 0) { 138 KKASSERT(cluster->focus != NULL); 139 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 140 } 141 return 0; 142 } 143 144 /* 145 * Returns the bref of the cluster's focus, sans any data-offset information 146 * (since offset information is per-node and wouldn't be useful). 147 * 148 * Callers use this function to access modify_tid, mirror_tid, type, 149 * key, and keybits. 150 * 151 * If the cluster is errored, returns an empty bref. 152 * The cluster must be locked. 153 */ 154 void 155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 156 { 157 if (cluster->error == 0) { 158 KKASSERT(cluster->focus != NULL); 159 *bref = cluster->focus->bref; 160 bref->data_off = 0; 161 } else { 162 bzero(bref, sizeof(*bref)); 163 } 164 } 165 166 /* 167 * Create a degenerate cluster with one ref from a single locked chain. 168 * The returned cluster will be focused on the chain and inherit its 169 * error state. 170 * 171 * The chain's lock and reference are transfered to the new cluster, so 172 * the caller should not try to unlock the chain separately. 173 * 174 * We fake the flags. 175 */ 176 hammer2_cluster_t * 177 hammer2_cluster_from_chain(hammer2_chain_t *chain) 178 { 179 hammer2_cluster_t *cluster; 180 181 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 182 cluster->array[0].chain = chain; 183 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 184 cluster->nchains = 1; 185 cluster->focus = chain; 186 cluster->focus_index = 0; 187 cluster->pmp = chain->pmp; 188 cluster->refs = 1; 189 cluster->error = chain->error; 190 cluster->flags = HAMMER2_CLUSTER_LOCKED | 191 HAMMER2_CLUSTER_WRHARD | 192 HAMMER2_CLUSTER_RDHARD | 193 HAMMER2_CLUSTER_MSYNCED | 194 HAMMER2_CLUSTER_SSYNCED; 195 196 return cluster; 197 } 198 199 /* 200 * Add a reference to a cluster and its underlying chains. 201 * 202 * We must also ref the underlying chains in order to allow ref/unlock 203 * sequences to later re-lock. 204 */ 205 void 206 hammer2_cluster_ref(hammer2_cluster_t *cluster) 207 { 208 atomic_add_int(&cluster->refs, 1); 209 } 210 211 /* 212 * Drop the caller's reference to the cluster. When the ref count drops to 213 * zero this function frees the cluster and drops all underlying chains. 214 * 215 * In-progress read I/Os are typically detached from the cluster once the 216 * first one returns (the remaining stay attached to the DIOs but are then 217 * ignored and drop naturally). 218 */ 219 void 220 hammer2_cluster_drop(hammer2_cluster_t *cluster) 221 { 222 hammer2_chain_t *chain; 223 int i; 224 225 KKASSERT(cluster->refs > 0); 226 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 227 cluster->focus = NULL; /* safety XXX chg to assert */ 228 cluster->focus_index = 0; 229 230 for (i = 0; i < cluster->nchains; ++i) { 231 chain = cluster->array[i].chain; 232 if (chain) { 233 hammer2_chain_drop(chain); 234 cluster->array[i].chain = NULL; /* safety */ 235 } 236 } 237 cluster->nchains = 0; /* safety */ 238 239 kfree(cluster, M_HAMMER2); 240 /* cluster is invalid */ 241 } 242 } 243 244 /* 245 * Lock a cluster. Cluster must already be referenced. Focus is maintained. 246 * 247 * WARNING! This function expects the caller to handle resolution of the 248 * cluster. We never re-resolve the cluster in this function, 249 * because it might be used to temporarily unlock/relock a cparent 250 * in an iteration or recursrion, and the cparents elements do not 251 * necessarily match. 252 */ 253 void 254 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 255 { 256 hammer2_chain_t *chain; 257 int i; 258 259 /* cannot be on inode-embedded cluster template, must be on copy */ 260 KKASSERT(cluster->refs > 0); 261 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 262 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 263 panic("hammer2_cluster_lock: cluster %p already locked!\n", 264 cluster); 265 } 266 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 267 268 /* 269 * Lock chains and resolve state. 270 */ 271 for (i = 0; i < cluster->nchains; ++i) { 272 chain = cluster->array[i].chain; 273 if (chain == NULL) 274 continue; 275 hammer2_chain_lock(chain, how); 276 } 277 } 278 279 /* 280 * Calculate the clustering state for the cluster and set its focus. 281 * This routine must be called with care. For example, it should not 282 * normally be called after relocking a non-leaf cluster because parent 283 * clusters help iterations and each element might be at a slightly different 284 * indirect node (each node's topology is independently indexed). 285 * 286 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 287 * operations. Typically this is only set on a quorum of MASTERs or 288 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 289 * is present, this bit is *not* set on a quorum of MASTERs. The 290 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 291 * that create/modify/delete elements use it. 292 * 293 * The chains making up the cluster may be narrowed down based on quorum 294 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 295 * narrowed down to a single chain as long as the entire subtopology is known 296 * to be intact. So, for example, we can narrow a read-only op to a single 297 * fast SLAVE but if we focus a CACHE chain we must still retain at least 298 * a SLAVE to ensure that the subtopology can be accessed. 299 * 300 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need 301 * to be maintained once the topology is validated as-of the top level of 302 * the operation. 303 * 304 * If a failure occurs the operation must be aborted by higher-level code and 305 * retried. XXX 306 */ 307 void 308 hammer2_cluster_resolve(hammer2_cluster_t *cluster) 309 { 310 hammer2_chain_t *chain; 311 hammer2_chain_t *focus; 312 hammer2_pfs_t *pmp; 313 hammer2_tid_t quorum_tid; 314 hammer2_tid_t last_best_quorum_tid; 315 int focus_pfs_type; 316 uint32_t nflags; 317 int ttlmasters; 318 int ttlslaves; 319 int nmasters; 320 int nslaves; 321 int nquorum; 322 int smpresent; 323 int i; 324 325 cluster->error = 0; 326 cluster->focus = NULL; 327 328 focus_pfs_type = 0; 329 nflags = 0; 330 ttlmasters = 0; 331 ttlslaves = 0; 332 nmasters = 0; 333 nslaves = 0; 334 335 /* 336 * Calculate quorum 337 */ 338 pmp = cluster->pmp; 339 KKASSERT(pmp != NULL || cluster->nchains == 0); 340 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 341 smpresent = 0; 342 343 /* 344 * Pass 1 345 * 346 * NOTE: A NULL chain is not necessarily an error, it could be 347 * e.g. a lookup failure or the end of an iteration. 348 * Process normally. 349 */ 350 for (i = 0; i < cluster->nchains; ++i) { 351 chain = cluster->array[i].chain; 352 if (chain && chain->error) { 353 if (cluster->focus == NULL || cluster->focus == chain) { 354 /* error will be overridden by valid focus */ 355 cluster->error = chain->error; 356 } 357 358 /* 359 * Must count total masters and slaves whether the 360 * chain is errored or not. 361 */ 362 switch (cluster->pmp->pfs_types[i]) { 363 case HAMMER2_PFSTYPE_SUPROOT: 364 case HAMMER2_PFSTYPE_MASTER: 365 ++ttlmasters; 366 break; 367 case HAMMER2_PFSTYPE_SLAVE: 368 ++ttlslaves; 369 break; 370 } 371 continue; 372 } 373 switch (cluster->pmp->pfs_types[i]) { 374 case HAMMER2_PFSTYPE_MASTER: 375 ++ttlmasters; 376 break; 377 case HAMMER2_PFSTYPE_SLAVE: 378 ++ttlslaves; 379 break; 380 case HAMMER2_PFSTYPE_SOFT_MASTER: 381 nflags |= HAMMER2_CLUSTER_WRSOFT; 382 nflags |= HAMMER2_CLUSTER_RDSOFT; 383 smpresent = 1; 384 break; 385 case HAMMER2_PFSTYPE_SOFT_SLAVE: 386 nflags |= HAMMER2_CLUSTER_RDSOFT; 387 break; 388 case HAMMER2_PFSTYPE_SUPROOT: 389 /* 390 * Degenerate cluster representing the super-root 391 * topology on a single device. Fake stuff so 392 * cluster ops work as expected. 393 */ 394 nflags |= HAMMER2_CLUSTER_WRHARD; 395 nflags |= HAMMER2_CLUSTER_RDHARD; 396 cluster->focus_index = i; 397 cluster->focus = chain; 398 cluster->error = chain ? chain->error : 0; 399 ++ttlmasters; 400 break; 401 default: 402 break; 403 } 404 } 405 406 /* 407 * Pass 2 408 * 409 * Resolve masters. Calculate nmasters for the highest matching 410 * TID, if a quorum cannot be attained try the next lower matching 411 * TID until we exhaust TIDs. 412 * 413 * NOTE: A NULL chain is not necessarily an error, it could be 414 * e.g. a lookup failure or the end of an iteration. 415 * Process normally. 416 */ 417 last_best_quorum_tid = HAMMER2_TID_MAX; 418 quorum_tid = 0; /* fix gcc warning */ 419 420 while (nmasters < nquorum && last_best_quorum_tid != 0) { 421 nmasters = 0; 422 quorum_tid = 0; 423 424 for (i = 0; i < cluster->nchains; ++i) { 425 switch (cluster->pmp->pfs_types[i]) { 426 case HAMMER2_PFSTYPE_SUPROOT: 427 case HAMMER2_PFSTYPE_MASTER: 428 break; 429 default: 430 continue; 431 } 432 chain = cluster->array[i].chain; 433 434 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 435 /* 436 * Invalid as in unsynchronized, cannot be 437 * used to calculate the quorum. 438 */ 439 } else if (chain == NULL && quorum_tid == 0) { 440 /* 441 * NULL chain on master matches NULL chains 442 * on other masters. 443 */ 444 ++nmasters; 445 } else if (quorum_tid < last_best_quorum_tid && 446 chain != NULL && 447 (quorum_tid < chain->bref.modify_tid || 448 nmasters == 0)) { 449 /* 450 * Better TID located, reset nmasters count. 451 */ 452 nmasters = 1; 453 quorum_tid = chain->bref.modify_tid; 454 } else if (chain && 455 quorum_tid == chain->bref.modify_tid) { 456 /* 457 * TID matches current collection. 458 */ 459 ++nmasters; 460 } 461 } 462 if (nmasters >= nquorum) 463 break; 464 last_best_quorum_tid = quorum_tid; 465 } 466 467 /* 468 * Pass 3 469 * 470 * NOTE: A NULL chain is not necessarily an error, it could be 471 * e.g. a lookup failure or the end of an iteration. 472 * Process normally. 473 */ 474 for (i = 0; i < cluster->nchains; ++i) { 475 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 476 chain = cluster->array[i].chain; 477 if (chain && chain->error) { 478 if (cluster->focus == NULL || cluster->focus == chain) { 479 /* error will be overridden by valid focus */ 480 cluster->error = chain->error; 481 } 482 continue; 483 } 484 485 switch (cluster->pmp->pfs_types[i]) { 486 case HAMMER2_PFSTYPE_MASTER: 487 /* 488 * We must have enough up-to-date masters to reach 489 * a quorum and the master modify_tid must match 490 * the quorum's modify_tid. 491 * 492 * Do not select an errored or out-of-sync master. 493 */ 494 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 495 nflags |= HAMMER2_CLUSTER_UNHARD; 496 } else if (nmasters >= nquorum && 497 (chain == NULL || chain->error == 0) && 498 ((chain == NULL && quorum_tid == 0) || 499 (chain != NULL && quorum_tid == 500 chain->bref.modify_tid))) { 501 nflags |= HAMMER2_CLUSTER_WRHARD; 502 nflags |= HAMMER2_CLUSTER_RDHARD; 503 if (!smpresent) { 504 cluster->array[i].flags |= 505 HAMMER2_CITEM_FEMOD; 506 } 507 if (cluster->focus == NULL || 508 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) { 509 focus_pfs_type = HAMMER2_PFSTYPE_MASTER; 510 cluster->focus_index = i; 511 cluster->focus = chain; /* NULL ok */ 512 cluster->error = chain ? chain->error : 513 0; 514 } 515 } else if (chain == NULL || chain->error == 0) { 516 nflags |= HAMMER2_CLUSTER_UNHARD; 517 } 518 break; 519 case HAMMER2_PFSTYPE_SLAVE: 520 /* 521 * We must have enough up-to-date masters to reach 522 * a quorum and the slave modify_tid must match the 523 * quorum's modify_tid. 524 * 525 * Do not select an errored slave. 526 */ 527 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 528 nflags |= HAMMER2_CLUSTER_UNHARD; 529 } else if (nmasters >= nquorum && 530 (chain == NULL || chain->error == 0) && 531 ((chain == NULL && quorum_tid == 0) || 532 (chain && quorum_tid == 533 chain->bref.modify_tid))) { 534 ++nslaves; 535 nflags |= HAMMER2_CLUSTER_RDHARD; 536 #if 0 537 /* XXX optimize for RESOLVE_RDONLY */ 538 if (cluster->focus == NULL) { 539 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE; 540 cluster->focus_index = i; 541 cluster->focus = chain; /* NULL ok */ 542 cluster->error = chain ? chain->error : 543 0; 544 } 545 #endif 546 } else if (chain == NULL || chain->error == 0) { 547 nflags |= HAMMER2_CLUSTER_UNSOFT; 548 } 549 break; 550 case HAMMER2_PFSTYPE_SOFT_MASTER: 551 /* 552 * Directly mounted soft master always wins. There 553 * should be only one. 554 */ 555 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER); 556 cluster->focus_index = i; 557 cluster->focus = chain; 558 cluster->error = chain ? chain->error : 0; 559 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER; 560 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 561 break; 562 case HAMMER2_PFSTYPE_SOFT_SLAVE: 563 /* 564 * Directly mounted soft slave always wins. There 565 * should be only one. 566 */ 567 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE); 568 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) { 569 cluster->focus_index = i; 570 cluster->focus = chain; 571 cluster->error = chain ? chain->error : 0; 572 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE; 573 } 574 break; 575 case HAMMER2_PFSTYPE_SUPROOT: 576 /* 577 * spmp (degenerate case) 578 */ 579 KKASSERT(i == 0); 580 cluster->focus_index = i; 581 cluster->focus = chain; 582 cluster->error = chain ? chain->error : 0; 583 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT; 584 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 585 break; 586 default: 587 break; 588 } 589 } 590 591 /* 592 * Focus now set, adjust ddflag. Skip this pass if the focus 593 * is bad or if we are at the PFS root (the bref won't match at 594 * the PFS root, obviously). 595 */ 596 focus = cluster->focus; 597 if (focus) { 598 cluster->ddflag = 599 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 600 } else { 601 cluster->ddflag = 0; 602 goto skip4; 603 } 604 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 605 goto skip4; 606 607 /* 608 * Pass 4 609 * 610 * Validate the elements that were not marked invalid. They should 611 * match. 612 */ 613 for (i = 0; i < cluster->nchains; ++i) { 614 int ddflag; 615 616 chain = cluster->array[i].chain; 617 618 if (chain == NULL) 619 continue; 620 if (chain == focus) 621 continue; 622 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 623 continue; 624 625 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 626 if (chain->bref.type != focus->bref.type || 627 chain->bref.key != focus->bref.key || 628 chain->bref.keybits != focus->bref.keybits || 629 chain->bref.modify_tid != focus->bref.modify_tid || 630 chain->bytes != focus->bytes || 631 ddflag != cluster->ddflag) { 632 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 633 if (hammer2_debug & 1) 634 kprintf("cluster_resolve: matching modify_tid failed " 635 "bref test: idx=%d type=%02x/%02x " 636 "key=%016jx/%d-%016jx/%d " 637 "mod=%016jx/%016jx bytes=%u/%u\n", 638 i, 639 chain->bref.type, focus->bref.type, 640 chain->bref.key, chain->bref.keybits, 641 focus->bref.key, focus->bref.keybits, 642 chain->bref.modify_tid, focus->bref.modify_tid, 643 chain->bytes, focus->bytes); 644 if (hammer2_debug & 0x4000) 645 panic("cluster_resolve"); 646 /* flag issue and force resync? */ 647 } 648 } 649 skip4: 650 651 if (ttlslaves == 0) 652 nflags |= HAMMER2_CLUSTER_NOSOFT; 653 if (ttlmasters == 0) 654 nflags |= HAMMER2_CLUSTER_NOHARD; 655 656 /* 657 * Set SSYNCED or MSYNCED for slaves and masters respectively if 658 * all available nodes (even if 0 are available) are fully 659 * synchronized. This is used by the synchronization thread to 660 * determine if there is work it could potentially accomplish. 661 */ 662 if (nslaves == ttlslaves) 663 nflags |= HAMMER2_CLUSTER_SSYNCED; 664 if (nmasters == ttlmasters) 665 nflags |= HAMMER2_CLUSTER_MSYNCED; 666 667 /* 668 * Determine if the cluster was successfully locked for the 669 * requested operation and generate an error code. The cluster 670 * will not be locked (or ref'd) if an error is returned. 671 */ 672 atomic_set_int(&cluster->flags, nflags); 673 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 674 } 675 676 /* 677 * This is used by the XOPS subsystem to calculate the state of 678 * the collection and tell hammer2_xop_collect() what to do with it. 679 * The collection can be in various states of desynchronization, the 680 * caller specifically wants to resolve the passed-in key. 681 * 682 * Return values: 683 * 0 - Quorum agreement, key is valid 684 * 685 * ENOENT - Quorum agreement, end of scan 686 * 687 * ESRCH - Quorum agreement, key is INVALID (caller should 688 * skip key). 689 * 690 * EIO - Quorum agreement but all elements had errors. 691 * 692 * EDEADLK - No quorum agreement possible for key, a repair 693 * may be needed. Caller has to decide what to do, 694 * possibly iterating the key or generating an EIO. 695 * 696 * EINPROGRESS - No quorum agreement yet, but agreement is still 697 * possible if caller waits for more responses. Caller 698 * should not iterate key. 699 * 700 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds. 701 * 702 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE 703 */ 704 int 705 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags) 706 { 707 hammer2_chain_t *chain; 708 hammer2_chain_t *focus; 709 hammer2_pfs_t *pmp; 710 hammer2_tid_t quorum_tid; 711 hammer2_tid_t last_best_quorum_tid; 712 uint32_t nflags; 713 int ttlmasters; 714 int ttlslaves; 715 int nmasters; 716 int nmasters_keymatch; 717 int nslaves; 718 int nquorum; 719 int umasters; /* unknown masters (still in progress) */ 720 int smpresent; 721 int error; 722 int i; 723 724 cluster->error = 0; 725 cluster->focus = NULL; 726 727 pmp = cluster->pmp; 728 KKASSERT(pmp != NULL || cluster->nchains == 0); 729 730 /* 731 * Calculate quorum 732 */ 733 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 734 smpresent = 0; 735 nflags = 0; 736 ttlmasters = 0; 737 ttlslaves = 0; 738 739 /* 740 * Pass 1 741 * 742 * NOTE: A NULL chain is not necessarily an error, it could be 743 * e.g. a lookup failure or the end of an iteration. 744 * Process normally. 745 */ 746 for (i = 0; i < cluster->nchains; ++i) { 747 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 748 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 749 750 chain = cluster->array[i].chain; 751 error = cluster->array[i].error; 752 if (chain && error) { 753 if (cluster->focus == NULL || cluster->focus == chain) { 754 /* error will be overridden by valid focus */ 755 /* XXX */ 756 } 757 758 /* 759 * Must count total masters and slaves whether the 760 * chain is errored or not. 761 */ 762 switch (cluster->pmp->pfs_types[i]) { 763 case HAMMER2_PFSTYPE_SUPROOT: 764 case HAMMER2_PFSTYPE_MASTER: 765 ++ttlmasters; 766 break; 767 case HAMMER2_PFSTYPE_SLAVE: 768 ++ttlslaves; 769 break; 770 } 771 continue; 772 } 773 switch (cluster->pmp->pfs_types[i]) { 774 case HAMMER2_PFSTYPE_MASTER: 775 ++ttlmasters; 776 break; 777 case HAMMER2_PFSTYPE_SLAVE: 778 ++ttlslaves; 779 break; 780 case HAMMER2_PFSTYPE_SOFT_MASTER: 781 nflags |= HAMMER2_CLUSTER_WRSOFT; 782 nflags |= HAMMER2_CLUSTER_RDSOFT; 783 smpresent = 1; 784 break; 785 case HAMMER2_PFSTYPE_SOFT_SLAVE: 786 nflags |= HAMMER2_CLUSTER_RDSOFT; 787 break; 788 case HAMMER2_PFSTYPE_SUPROOT: 789 /* 790 * Degenerate cluster representing the super-root 791 * topology on a single device. Fake stuff so 792 * cluster ops work as expected. 793 */ 794 ++ttlmasters; 795 nflags |= HAMMER2_CLUSTER_WRHARD; 796 nflags |= HAMMER2_CLUSTER_RDHARD; 797 cluster->focus_index = i; 798 cluster->focus = chain; 799 cluster->error = error; 800 break; 801 default: 802 break; 803 } 804 } 805 806 /* 807 * Pass 2 808 * 809 * Resolve nmasters - master nodes fully match 810 * 811 * Resolve umasters - master nodes operation still 812 * in progress 813 * 814 * Resolve nmasters_keymatch - master nodes match the passed-in 815 * key and may or may not match 816 * the quorum-agreed tid. 817 * 818 * The quorum-agreed TID is the highest matching TID. 819 */ 820 last_best_quorum_tid = HAMMER2_TID_MAX; 821 umasters = 0; 822 nmasters = 0; 823 nmasters_keymatch = 0; 824 quorum_tid = 0; /* fix gcc warning */ 825 826 while (nmasters < nquorum && last_best_quorum_tid != 0) { 827 umasters = 0; 828 nmasters = 0; 829 nmasters_keymatch = 0; 830 quorum_tid = 0; 831 832 for (i = 0; i < cluster->nchains; ++i) { 833 /* XXX SOFT smpresent handling */ 834 switch(cluster->pmp->pfs_types[i]) { 835 case HAMMER2_PFSTYPE_MASTER: 836 case HAMMER2_PFSTYPE_SUPROOT: 837 break; 838 default: 839 continue; 840 } 841 842 chain = cluster->array[i].chain; 843 error = cluster->array[i].error; 844 845 /* 846 * Skip elements still in progress. umasters keeps 847 * track of masters that might still be in-progress. 848 */ 849 if (chain == NULL && (cluster->array[i].flags & 850 HAMMER2_CITEM_NULL) == 0) { 851 ++umasters; 852 continue; 853 } 854 855 /* 856 * Key match? 857 */ 858 if (flags & HAMMER2_CHECK_NULL) { 859 if (chain == NULL) { 860 ++nmasters; 861 ++nmasters_keymatch; 862 if (cluster->error == 0) 863 cluster->error = error; 864 } 865 } else if (chain && 866 (key == (hammer2_key_t)-1 || 867 chain->bref.key == key)) { 868 ++nmasters_keymatch; 869 870 if (chain->bref.modify_tid < 871 last_best_quorum_tid && 872 quorum_tid < chain->bref.modify_tid) { 873 /* 874 * Select new TID as master if better 875 * than any found so far in this loop, 876 * as long as it does not reach the 877 * best tid found in the previous loop. 878 */ 879 nmasters = 0; 880 quorum_tid = chain->bref.modify_tid; 881 } 882 if (quorum_tid == chain->bref.modify_tid) { 883 /* 884 * TID matches current collection. 885 * 886 * (error handled in next pass) 887 */ 888 ++nmasters; 889 if (chain->error == 0) { 890 cluster->focus = chain; 891 cluster->focus_index = i; 892 } 893 } 894 } 895 } 896 if (nmasters >= nquorum) 897 break; 898 last_best_quorum_tid = quorum_tid; 899 } 900 901 /* 902 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n", 903 nmasters, nquorum, nmasters_keymatch, umasters); 904 */ 905 906 /* 907 * Early return if we do not have enough masters. 908 */ 909 if (nmasters < nquorum) { 910 if (nmasters + umasters >= nquorum) 911 return EINPROGRESS; 912 if (nmasters_keymatch < nquorum) 913 return ESRCH; 914 return EDEADLK; 915 } 916 917 /* 918 * Validated end of scan. 919 */ 920 if (flags & HAMMER2_CHECK_NULL) { 921 if (cluster->error == 0) 922 cluster->error = ENOENT; 923 return cluster->error; 924 } 925 926 /* 927 * If we have a NULL focus at this point the agreeing quorum all 928 * had chain errors. 929 */ 930 if (cluster->focus == NULL) 931 return EIO; 932 933 /* 934 * Pass 3 935 * 936 * We have quorum agreement, validate elements, not end of scan. 937 */ 938 nslaves = 0; 939 cluster->error = 0; 940 941 for (i = 0; i < cluster->nchains; ++i) { 942 chain = cluster->array[i].chain; 943 error = cluster->array[i].error; 944 if (chain == NULL || 945 chain->bref.key != key || 946 chain->bref.modify_tid != quorum_tid) { 947 continue; 948 } 949 950 /* 951 * Quorum Match 952 * 953 * XXX for now, cumulative error. 954 */ 955 if (cluster->error == 0) 956 cluster->error = error; 957 958 switch (cluster->pmp->pfs_types[i]) { 959 case HAMMER2_PFSTYPE_MASTER: 960 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 961 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 962 nflags |= HAMMER2_CLUSTER_WRHARD; 963 nflags |= HAMMER2_CLUSTER_RDHARD; 964 break; 965 case HAMMER2_PFSTYPE_SLAVE: 966 /* 967 * We must have enough up-to-date masters to reach 968 * a quorum and the slave modify_tid must match the 969 * quorum's modify_tid. 970 * 971 * Do not select an errored slave. 972 */ 973 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 974 nflags |= HAMMER2_CLUSTER_RDHARD; 975 ++nslaves; 976 break; 977 case HAMMER2_PFSTYPE_SOFT_MASTER: 978 /* 979 * Directly mounted soft master always wins. There 980 * should be only one. 981 */ 982 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 983 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 984 break; 985 case HAMMER2_PFSTYPE_SOFT_SLAVE: 986 /* 987 * Directly mounted soft slave always wins. There 988 * should be only one. 989 * 990 * XXX 991 */ 992 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 993 break; 994 case HAMMER2_PFSTYPE_SUPROOT: 995 /* 996 * spmp (degenerate case) 997 */ 998 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 999 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1000 nflags |= HAMMER2_CLUSTER_WRHARD; 1001 nflags |= HAMMER2_CLUSTER_RDHARD; 1002 break; 1003 default: 1004 break; 1005 } 1006 } 1007 1008 /* 1009 * Focus now set, adjust ddflag. Skip this pass if the focus 1010 * is bad or if we are at the PFS root (the bref won't match at 1011 * the PFS root, obviously). 1012 */ 1013 focus = cluster->focus; 1014 if (focus) { 1015 cluster->ddflag = 1016 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 1017 } else { 1018 cluster->ddflag = 0; 1019 goto skip4; 1020 } 1021 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1022 goto skip4; 1023 1024 /* 1025 * Pass 4 1026 * 1027 * Validate the elements that were not marked invalid. They should 1028 * match. 1029 */ 1030 for (i = 0; i < cluster->nchains; ++i) { 1031 int ddflag; 1032 1033 chain = cluster->array[i].chain; 1034 1035 if (chain == NULL) 1036 continue; 1037 if (chain == focus) 1038 continue; 1039 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1040 continue; 1041 1042 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 1043 if (chain->bref.type != focus->bref.type || 1044 chain->bref.key != focus->bref.key || 1045 chain->bref.keybits != focus->bref.keybits || 1046 chain->bref.modify_tid != focus->bref.modify_tid || 1047 chain->bytes != focus->bytes || 1048 ddflag != cluster->ddflag) { 1049 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1050 if (hammer2_debug & 1) 1051 kprintf("cluster_resolve: matching modify_tid failed " 1052 "bref test: idx=%d type=%02x/%02x " 1053 "key=%016jx/%d-%016jx/%d " 1054 "mod=%016jx/%016jx bytes=%u/%u\n", 1055 i, 1056 chain->bref.type, focus->bref.type, 1057 chain->bref.key, chain->bref.keybits, 1058 focus->bref.key, focus->bref.keybits, 1059 chain->bref.modify_tid, focus->bref.modify_tid, 1060 chain->bytes, focus->bytes); 1061 if (hammer2_debug & 0x4000) 1062 panic("cluster_resolve"); 1063 /* flag issue and force resync? */ 1064 } 1065 } 1066 skip4: 1067 1068 if (ttlslaves == 0) 1069 nflags |= HAMMER2_CLUSTER_NOSOFT; 1070 if (ttlmasters == 0) 1071 nflags |= HAMMER2_CLUSTER_NOHARD; 1072 1073 /* 1074 * Set SSYNCED or MSYNCED for slaves and masters respectively if 1075 * all available nodes (even if 0 are available) are fully 1076 * synchronized. This is used by the synchronization thread to 1077 * determine if there is work it could potentially accomplish. 1078 */ 1079 if (nslaves == ttlslaves) 1080 nflags |= HAMMER2_CLUSTER_SSYNCED; 1081 if (nmasters == ttlmasters) 1082 nflags |= HAMMER2_CLUSTER_MSYNCED; 1083 1084 /* 1085 * Determine if the cluster was successfully locked for the 1086 * requested operation and generate an error code. The cluster 1087 * will not be locked (or ref'd) if an error is returned. 1088 */ 1089 atomic_set_int(&cluster->flags, nflags); 1090 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 1091 1092 return cluster->error; 1093 } 1094 1095 /* 1096 * This is used by the sync thread to force non-NULL elements of a copy 1097 * of the pmp->iroot cluster to be good which is required to prime the 1098 * sync. 1099 */ 1100 void 1101 hammer2_cluster_forcegood(hammer2_cluster_t *cluster) 1102 { 1103 int i; 1104 1105 for (i = 0; i < cluster->nchains; ++i) { 1106 if (cluster->array[i].chain) 1107 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1108 } 1109 } 1110 1111 /* 1112 * Copy a cluster, returned a ref'd cluster. All underlying chains 1113 * are also ref'd, but not locked. Focus state is also copied. 1114 * 1115 * Original cluster does not have to be locked but usually is. 1116 * New cluster will not be flagged as locked. 1117 * 1118 * Callers using this function to initialize a new cluster from an inode 1119 * generally lock and resolve the resulting cluster. 1120 * 1121 * Callers which use this function to save/restore a cluster structure 1122 * generally retain the focus state and do not re-resolve it. Caller should 1123 * not try to re-resolve internal (cparent) node state during an iteration 1124 * as the individual tracking elements of cparent in an iteration may not 1125 * match even though they are correct. 1126 */ 1127 hammer2_cluster_t * 1128 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 1129 { 1130 hammer2_pfs_t *pmp = ocluster->pmp; 1131 hammer2_cluster_t *ncluster; 1132 hammer2_chain_t *chain; 1133 int i; 1134 1135 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 1136 ncluster->pmp = pmp; 1137 ncluster->nchains = ocluster->nchains; 1138 ncluster->refs = 1; 1139 1140 for (i = 0; i < ocluster->nchains; ++i) { 1141 chain = ocluster->array[i].chain; 1142 ncluster->array[i].chain = chain; 1143 ncluster->array[i].flags = ocluster->array[i].flags; 1144 if (chain) 1145 hammer2_chain_ref(chain); 1146 } 1147 ncluster->focus_index = ocluster->focus_index; 1148 ncluster->focus = ocluster->focus; 1149 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED | 1150 HAMMER2_CLUSTER_INODE); 1151 1152 return (ncluster); 1153 } 1154 1155 /* 1156 * Unlock a cluster. Refcount and focus is maintained. 1157 */ 1158 void 1159 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 1160 { 1161 hammer2_chain_t *chain; 1162 int i; 1163 1164 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 1165 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 1166 cluster); 1167 } 1168 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 1169 KKASSERT(cluster->refs > 0); 1170 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 1171 1172 for (i = 0; i < cluster->nchains; ++i) { 1173 chain = cluster->array[i].chain; 1174 if (chain) 1175 hammer2_chain_unlock(chain); 1176 } 1177 } 1178 1179 /************************************************************************ 1180 * CLUSTER I/O * 1181 ************************************************************************ 1182 * 1183 * 1184 * WARNING! blockref[] array data is not universal. These functions should 1185 * only be used to access universal data. 1186 * 1187 * NOTE! The rdata call will wait for at least one of the chain I/Os to 1188 * complete if necessary. The I/O's should have already been 1189 * initiated by the cluster_lock/chain_lock operation. 1190 * 1191 * The cluster must already be in a modified state before wdata 1192 * is called. The data will already be available for this case. 1193 */ 1194 const hammer2_media_data_t * 1195 hammer2_cluster_rdata(hammer2_cluster_t *cluster) 1196 { 1197 KKASSERT(cluster->focus != NULL); 1198 return(cluster->focus->data); 1199 } 1200 1201 hammer2_media_data_t * 1202 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 1203 { 1204 KKASSERT(cluster->focus != NULL); 1205 KKASSERT(hammer2_cluster_modified(cluster)); 1206 return(cluster->focus->data); 1207 } 1208