1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 
89 * 90 * - Must disconnect node on failures (also not flush), remount, and 91 * resynchronize. 92 * 93 * - Network links (via kdmsg) are relatively easy to issue as the 94 * complex underworkings of hammer2_chain.c don't have to messed 95 * with (the protocol is at a higher level than block-level). 96 * 97 * - Multiple local disk nodes (i.e. block devices) are another matter. 98 * Chain operations have to be dispatched to per-node threads (xN) 99 * because we can't asynchronize potentially very complex chain 100 * operations in hammer2_chain.c (it would be a huge mess). 101 * 102 * (these threads are also used to terminate incoming kdmsg ops from 103 * other machines). 104 * 105 * - Single-node filesystems do not use threads and will simply call 106 * hammer2_chain.c functions directly. This short-cut is handled 107 * at the base of each cluster function. 108 */ 109 #include <sys/cdefs.h> 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/types.h> 113 #include <sys/lock.h> 114 #include <sys/uuid.h> 115 116 #include "hammer2.h" 117 118 /* 119 * Returns non-zero if any chain in the cluster needs to be resized. 120 * Errored elements are not used in the calculation. 121 */ 122 int 123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 124 { 125 hammer2_chain_t *chain; 126 int i; 127 128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 129 for (i = 0; i < cluster->nchains; ++i) { 130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 131 continue; 132 chain = cluster->array[i].chain; 133 if (chain == NULL) 134 continue; 135 if (chain->error) 136 continue; 137 if (chain->bytes != bytes) 138 return 1; 139 } 140 return 0; 141 } 142 143 /* 144 * Returns the bref type of the cluster's foucs. 145 * 146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 147 * The cluster must be locked. 
148 */ 149 uint8_t 150 hammer2_cluster_type(hammer2_cluster_t *cluster) 151 { 152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 153 if (cluster->error == 0) 154 return(cluster->focus->bref.type); 155 return 0; 156 } 157 158 /* 159 * Returns non-zero if the cluster's focus is flagged as being modified. 160 * 161 * If the cluster is errored, returns 0. 162 */ 163 int 164 hammer2_cluster_modified(hammer2_cluster_t *cluster) 165 { 166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 167 if (cluster->error == 0) 168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 169 return 0; 170 } 171 172 /* 173 * Returns the bref of the cluster's focus, sans any data-offset information 174 * (since offset information is per-node and wouldn't be useful). 175 * 176 * Callers use this function to access modify_tid, mirror_tid, type, 177 * key, and keybits. 178 * 179 * If the cluster is errored, returns an empty bref. 180 * The cluster must be locked. 181 */ 182 void 183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 184 { 185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 186 if (cluster->error == 0) { 187 *bref = cluster->focus->bref; 188 bref->data_off = 0; 189 } else { 190 bzero(bref, sizeof(*bref)); 191 } 192 } 193 194 /* 195 * Return non-zero if the chain representing an inode has been flagged 196 * as having been unlinked. Allows the vnode reclaim to avoid loading 197 * the inode data from disk e.g. when unmount or recycling old, clean 198 * vnodes. 199 * 200 * The cluster does not need to be locked. 201 * The focus cannot be used since the cluster might not be locked. 
202 */ 203 int 204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster) 205 { 206 hammer2_chain_t *chain; 207 int flags; 208 int i; 209 210 flags = 0; 211 for (i = 0; i < cluster->nchains; ++i) { 212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 213 continue; 214 chain = cluster->array[i].chain; 215 if (chain) 216 flags |= chain->flags; 217 } 218 return (flags & HAMMER2_CHAIN_UNLINKED); 219 } 220 221 /* 222 * Set a bitmask of flags in all chains related to a cluster. 223 * The cluster should probably be locked. 224 * 225 * XXX Only operate on FEMOD elements? 226 */ 227 void 228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 229 { 230 hammer2_chain_t *chain; 231 int i; 232 233 for (i = 0; i < cluster->nchains; ++i) { 234 chain = cluster->array[i].chain; 235 if (chain) 236 atomic_set_int(&chain->flags, flags); 237 } 238 } 239 240 /* 241 * Set a bitmask of flags in all chains related to a cluster. 242 * The cluster should probably be locked. 243 * 244 * XXX Only operate on FEMOD elements? 245 */ 246 void 247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 248 { 249 hammer2_chain_t *chain; 250 int i; 251 252 for (i = 0; i < cluster->nchains; ++i) { 253 chain = cluster->array[i].chain; 254 if (chain) 255 atomic_clear_int(&chain->flags, flags); 256 } 257 } 258 259 /* 260 * Flag the cluster for flushing recursively up to the root. Despite the 261 * work it does, this is relatively benign. It just makes sure that the 262 * flusher has top-down visibility to this cluster. 263 * 264 * Errored chains are not flagged for flushing. 265 * 266 * The cluster should probably be locked. 
267 */ 268 void 269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 270 { 271 hammer2_chain_t *chain; 272 int i; 273 274 for (i = 0; i < cluster->nchains; ++i) { 275 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 276 continue; 277 chain = cluster->array[i].chain; 278 if (chain == NULL) 279 continue; 280 if (chain->error) 281 continue; 282 hammer2_chain_setflush(trans, chain); 283 } 284 } 285 286 /* 287 * Set the check mode for the cluster. 288 * Errored elements of the cluster are ignored. 289 * 290 * The cluster must be locked and modified. 291 */ 292 void 293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 294 hammer2_cluster_t *cluster, 295 int check_algo) 296 { 297 hammer2_chain_t *chain; 298 int i; 299 300 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 301 for (i = 0; i < cluster->nchains; ++i) { 302 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 303 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 304 continue; 305 } 306 chain = cluster->array[i].chain; 307 if (chain == NULL) 308 continue; 309 if (chain->error) 310 continue; 311 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 312 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); 313 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 314 } 315 } 316 317 /* 318 * Create a degenerate cluster with one ref from a single locked chain. 319 * The returned cluster will be focused on the chain and inherit its 320 * error state. 321 * 322 * The chain's lock and reference are transfered to the new cluster, so 323 * the caller should not try to unlock the chain separately. 324 * 325 * We fake the flags. 
326 */ 327 hammer2_cluster_t * 328 hammer2_cluster_from_chain(hammer2_chain_t *chain) 329 { 330 hammer2_cluster_t *cluster; 331 332 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 333 cluster->array[0].chain = chain; 334 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 335 cluster->nchains = 1; 336 cluster->focus = chain; 337 cluster->focus_index = 0; 338 cluster->pmp = chain->pmp; 339 cluster->refs = 1; 340 cluster->error = chain->error; 341 cluster->flags = HAMMER2_CLUSTER_LOCKED | 342 HAMMER2_CLUSTER_WRHARD | 343 HAMMER2_CLUSTER_RDHARD | 344 HAMMER2_CLUSTER_MSYNCED | 345 HAMMER2_CLUSTER_SSYNCED; 346 347 return cluster; 348 } 349 350 /* 351 * Add a reference to a cluster and its underlying chains. 352 * 353 * We must also ref the underlying chains in order to allow ref/unlock 354 * sequences to later re-lock. 355 */ 356 void 357 hammer2_cluster_ref(hammer2_cluster_t *cluster) 358 { 359 atomic_add_int(&cluster->refs, 1); 360 #if 0 361 hammer2_chain_t *chain; 362 int i; 363 364 for (i = 0; i < cluster->nchains; ++i) { 365 chain = cluster->array[i].chain; 366 if (chain) 367 hammer2_chain_ref(chain); 368 } 369 #endif 370 } 371 372 /* 373 * Drop the caller's reference to the cluster. When the ref count drops to 374 * zero this function frees the cluster and drops all underlying chains. 375 * 376 * In-progress read I/Os are typically detached from the cluster once the 377 * first one returns (the remaining stay attached to the DIOs but are then 378 * ignored and drop naturally). 
379 */ 380 void 381 hammer2_cluster_drop(hammer2_cluster_t *cluster) 382 { 383 hammer2_chain_t *chain; 384 int i; 385 386 KKASSERT(cluster->refs > 0); 387 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 388 cluster->focus = NULL; /* safety XXX chg to assert */ 389 cluster->focus_index = 0; 390 391 for (i = 0; i < cluster->nchains; ++i) { 392 chain = cluster->array[i].chain; 393 if (chain) { 394 hammer2_chain_drop(chain); 395 cluster->array[i].chain = NULL; /* safety */ 396 } 397 } 398 cluster->nchains = 0; /* safety */ 399 400 kfree(cluster, M_HAMMER2); 401 /* cluster is invalid */ 402 } 403 } 404 405 void 406 hammer2_cluster_wait(hammer2_cluster_t *cluster) 407 { 408 tsleep(cluster->focus, 0, "h2clcw", 1); 409 } 410 411 /* 412 * Lock and ref a cluster. This adds a ref to the cluster and its chains 413 * and then locks them, modified by various RESOLVE flags. 414 * 415 * The act of locking a cluster sets its focus. Note that cluster elements 416 * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus. Locking a 417 * cluster does not adjust this flag since exact matches only matter for leafs 418 * (parents can depend on minor differences in topology). 419 * 420 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 421 * operations. Typically this is only set on a quorum of MASTERs or 422 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 423 * is present, this bit is *not* set on a quorum of MASTERs. The 424 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 425 * that create/modify/delete elements use it. 426 * 427 * The chains making up the cluster may be narrowed down based on quorum 428 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 429 * narrowed down to a single chain as long as the entire subtopology is known 430 * to be intact. 
So, for example, we can narrow a read-only op to a single
 * fast SLAVE but if we focus a CACHE chain we must still retain at least
 * a SLAVE to ensure that the subtopology can be accessed.
 *
 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
 * to be maintained once the topology is validated as-of the top level of
 * the operation.
 *
 * If a failure occurs the operation must be aborted by higher-level code and
 * retried. XXX
 */
void
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
	hammer2_chain_t *chain;
	int i;

	/* cannot be on inode-embedded cluster template, must be on copy */
	KKASSERT(cluster->refs > 0);
	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
		/* double-locking the same cluster is fatal */
		panic("hammer2_cluster_lock: cluster %p already locked!\n",
		      cluster);
	} else {
		/* an unlocked cluster must not have a focus yet */
		KKASSERT(cluster->focus == NULL);
	}
	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	/*
	 * Lock chains and resolve state.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		hammer2_chain_lock(chain, how);
	}

	hammer2_cluster_resolve(cluster);
}

/*
 * Recompute the cluster's focus, error state and status flags from the
 * current state of its chains.
 *
 * Called at the end of hammer2_cluster_lock() and hammer2_cluster_lookup(),
 * and again from hammer2_cluster_modify() when the focus chain errors.
 *
 * Pass 1 totals the MASTER and SLAVE elements and determines the highest
 * modify_tid among non-errored, non-INVALID masters (quorum_tid) together
 * with the number of masters sitting at that tid (nmasters).
 *
 * Pass 2 clears and re-derives HAMMER2_CITEM_FEMOD on every element and
 * selects the focus according to the element's PFS type.
 *
 * cluster->error is cleared on entry.  cluster->focus is NOT cleared, so a
 * focus already set by the caller is kept unless an element replaces it.
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int smpresent;
	int i;

	cluster->error = 0;

	quorum_tid = 0;
	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum: a simple majority of the PFS's configured
	 * master count, or 0 when the cluster has no pmp (in which case
	 * it must also have no chains).
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;

	/*
	 * Pass 1 - count masters/slaves, find quorum_tid and nmasters,
	 *	    and note the presence of soft masters/slaves.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		if (chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				/*
				 * Invalid as in unsynchronized, cannot be
				 * used to calculate the quorum.
				 */
			} else if (quorum_tid < chain->bref.modify_tid ||
				   nmasters == 0) {
				/* first usable master, or a newer tid */
				nmasters = 1;
				quorum_tid = chain->bref.modify_tid;
			} else if (quorum_tid == chain->bref.modify_tid) {
				/* another master at the current best tid */
				++nmasters;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain->error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2 - re-derive FEMOD per element and choose the focus.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		if (chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}
			continue;
		}

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master modify_tid must match
			 * the quorum's modify_tid.
			 *
			 * Do not select an errored or out-of-sync master.
			 *
			 * A qualifying master becomes the focus only if
			 * there is no focus yet or the current focus was
			 * chosen from a SLAVE.  It is flagged FEMOD only
			 * when no soft master was seen in pass 1.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   chain->error == 0 &&
				   quorum_tid == chain->bref.modify_tid) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (!smpresent) {
					cluster->array[i].flags |=
							HAMMER2_CITEM_FEMOD;
				}
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus_index = i;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			} else if (chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 *
			 * NOTE(review): an INVALID slave sets UNHARD here
			 * while an out-of-sync slave below sets UNSOFT --
			 * confirm the asymmetry is intended.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   chain->error == 0 &&
				   quorum_tid == chain->bref.modify_tid) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus_index = i;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			} else if (chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNSOFT;
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain->error;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 *
			 * (It does not displace a soft master focus.)
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus_index = i;
				cluster->focus = chain;
				cluster->error = chain->error;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			KKASSERT(i == 0);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain->error;
			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		default:
			break;
		}
	}

	/* no slaves / no masters counted at all in pass 1 */
	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Publish the result: set every bit computed in nflags and clear
	 * any HAMMER2_CLUSTER_ZFLAGS bit that was not computed this time.
	 * Nothing is returned; the outcome is left in cluster->flags,
	 * cluster->focus and cluster->error.
	 *
	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
	 * to determine if reading or writing is possible.  If writing, the
	 * cluster still requires a call to hammer2_cluster_modify() first.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}

/*
 * Copy a cluster, returning a ref'd cluster.  All underlying chains
 * are also ref'd, but not locked.
 *
 * Only the pmp, the chain count and the chain pointers are copied.  The
 * new cluster's flags are 0 and, since it is allocated M_ZERO, so are its
 * per-element flags, focus and error.
 *
 * The cluster focus is not set because the cluster is not yet locked
 * (and the originating cluster does not have to be locked either).
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster)
{
	hammer2_pfs_t *pmp = ocluster->pmp;
	hammer2_cluster_t *ncluster;
	hammer2_chain_t *chain;
	int i;

	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
	ncluster->pmp = pmp;
	ncluster->nchains = ocluster->nchains;
	ncluster->refs = 1;
	ncluster->flags = 0;	/* cluster not locked */

	for (i = 0; i < ocluster->nchains; ++i) {
		chain = ocluster->array[i].chain;
		ncluster->array[i].chain = chain;
		if (chain)
			hammer2_chain_ref(chain);
	}
	return (ncluster);
}

/*
 * Unlock a cluster.  Every chain present is unlocked and the focus is
 * cleared.  The caller's ref is retained; release it separately with
 * hammer2_cluster_drop().
751 */ 752 void 753 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 754 { 755 hammer2_chain_t *chain; 756 int i; 757 758 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 759 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 760 cluster); 761 } 762 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 763 KKASSERT(cluster->refs > 0); 764 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 765 766 for (i = 0; i < cluster->nchains; ++i) { 767 chain = cluster->array[i].chain; 768 if (chain) 769 hammer2_chain_unlock(chain); 770 } 771 cluster->focus_index = 0; 772 cluster->focus = NULL; 773 } 774 775 /* 776 * Resize the cluster's physical storage allocation in-place. This may 777 * replace the cluster's chains. 778 */ 779 void 780 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 781 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 782 int nradix, int flags) 783 { 784 hammer2_chain_t *chain; 785 int i; 786 787 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 788 KKASSERT(cparent->nchains == cluster->nchains); 789 790 for (i = 0; i < cluster->nchains; ++i) { 791 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 792 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 793 continue; 794 } 795 chain = cluster->array[i].chain; 796 if (chain) { 797 KKASSERT(cparent->array[i].chain); 798 hammer2_chain_resize(trans, ip, 799 cparent->array[i].chain, chain, 800 nradix, flags); 801 } 802 } 803 } 804 805 /* 806 * Set an inode's cluster modified, marking the related chains RW and 807 * duplicating them if necessary. 808 * 809 * The passed-in chain is a localized copy of the chain previously acquired 810 * when the inode was locked (and possilby replaced in the mean time), and 811 * must also be updated. In fact, we update it first and then synchronize 812 * the inode's cluster cache. 
813 */ 814 hammer2_inode_data_t * 815 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 816 hammer2_cluster_t *cluster, int flags) 817 { 818 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 819 hammer2_cluster_modify(trans, cluster, flags); 820 821 hammer2_inode_repoint(ip, NULL, cluster); 822 if (ip->vp) 823 vsetisdirty(ip->vp); 824 return (&hammer2_cluster_wdata(cluster)->ipdata); 825 } 826 827 /* 828 * Adjust the cluster's chains to allow modification and adjust the 829 * focus. Data will be accessible on return. 830 * 831 * If our focused master errors on modify, re-resolve the cluster to 832 * try to select a different master. 833 */ 834 void 835 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, 836 int flags) 837 { 838 hammer2_chain_t *chain; 839 int resolve_again; 840 int i; 841 842 resolve_again = 0; 843 for (i = 0; i < cluster->nchains; ++i) { 844 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 845 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 846 continue; 847 } 848 chain = cluster->array[i].chain; 849 if (chain == NULL) 850 continue; 851 if (chain->error) 852 continue; 853 hammer2_chain_modify(trans, chain, flags); 854 if (cluster->focus == chain && chain->error) { 855 cluster->error = chain->error; 856 resolve_again = 1; 857 } 858 } 859 if (resolve_again) 860 hammer2_cluster_resolve(cluster); 861 } 862 863 /* 864 * Synchronize modifications from the focus to other chains in a cluster. 865 * Convenient because nominal API users can just modify the contents of the 866 * focus (at least for non-blockref data). 867 * 868 * Nominal front-end operations only edit non-block-table data in a single 869 * chain. This code copies such modifications to the other chains in the 870 * cluster. Blocktable modifications are handled on a chain-by-chain basis 871 * by both the frontend and the backend and will explode in fireworks if 872 * blindly copied. 
873 */ 874 void 875 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 876 { 877 hammer2_chain_t *focus; 878 hammer2_chain_t *scan; 879 const hammer2_inode_data_t *ripdata; 880 hammer2_inode_data_t *wipdata; 881 int i; 882 883 focus = cluster->focus; 884 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 885 886 for (i = 0; i < cluster->nchains; ++i) { 887 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 888 continue; 889 scan = cluster->array[i].chain; 890 if (scan == NULL || scan == focus) 891 continue; 892 if (scan->error) 893 continue; 894 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 895 KKASSERT(focus->bytes == scan->bytes && 896 focus->bref.type == scan->bref.type); 897 switch(focus->bref.type) { 898 case HAMMER2_BREF_TYPE_INODE: 899 ripdata = &focus->data->ipdata; 900 wipdata = &scan->data->ipdata; 901 if ((ripdata->op_flags & 902 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 903 bcopy(ripdata, wipdata, 904 offsetof(hammer2_inode_data_t, u)); 905 break; 906 } 907 /* fall through to full copy */ 908 case HAMMER2_BREF_TYPE_DATA: 909 bcopy(focus->data, scan->data, focus->bytes); 910 break; 911 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 912 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 913 case HAMMER2_BREF_TYPE_FREEMAP: 914 case HAMMER2_BREF_TYPE_VOLUME: 915 panic("hammer2_cluster_modsync: illegal node type"); 916 /* NOT REACHED */ 917 break; 918 default: 919 panic("hammer2_cluster_modsync: unknown node type"); 920 break; 921 } 922 } 923 } 924 925 /* 926 * Lookup initialization/completion API. Returns a locked cluster with 1 ref. 
927 */ 928 hammer2_cluster_t * 929 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 930 { 931 hammer2_cluster_t *cluster; 932 933 cluster = hammer2_cluster_copy(cparent); 934 if (flags & HAMMER2_LOOKUP_SHARED) { 935 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 936 HAMMER2_RESOLVE_SHARED); 937 } else { 938 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 939 } 940 return (cluster); 941 } 942 943 void 944 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 945 { 946 if (cparent) { 947 hammer2_cluster_unlock(cparent); 948 hammer2_cluster_drop(cparent); 949 } 950 } 951 952 /* 953 * Locate first match or overlap under parent, return a new cluster 954 */ 955 hammer2_cluster_t * 956 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 957 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 958 { 959 hammer2_pfs_t *pmp; 960 hammer2_cluster_t *cluster; 961 hammer2_chain_t *chain; 962 hammer2_chain_t *focus; 963 hammer2_key_t key_accum; 964 hammer2_key_t key_next; 965 int null_count; 966 int i; 967 968 pmp = cparent->pmp; /* can be NULL */ 969 key_accum = *key_nextp; 970 null_count = 0; 971 972 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 973 cluster->pmp = pmp; /* can be NULL */ 974 cluster->refs = 1; 975 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 976 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 977 978 /* 979 * Pass-1, issue lookup and find focus. 980 */ 981 for (i = 0; i < cparent->nchains; ++i) { 982 cluster->array[i].flags = cparent->array[i].flags; 983 key_next = *key_nextp; 984 985 /* 986 * Nothing to base the lookup, or parent was not synchronized. 
987 */ 988 if (cparent->array[i].chain == NULL || 989 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) { 990 ++null_count; 991 continue; 992 } 993 994 chain = hammer2_chain_lookup(&cparent->array[i].chain, 995 &key_next, 996 key_beg, key_end, 997 &cparent->array[i].cache_index, 998 flags); 999 cluster->array[i].chain = chain; 1000 if (chain == NULL) { 1001 ++null_count; 1002 } else if (chain->error) { 1003 /* 1004 * Leave errored chain in cluster, but it cannot be 1005 * the cluster's focus. It is still possible for an 1006 * error'd chain to be synchronized (since we have 1007 * the bref), synchronization state will be handled 1008 * in pass-2. 1009 */ 1010 ; 1011 } else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 1012 /* 1013 * Leave unsynchronized chain in cluster, but it cannot 1014 * be the cluster's focus. 1015 */ 1016 ; 1017 } else { 1018 int ddflag = (chain->bref.type == 1019 HAMMER2_BREF_TYPE_INODE); 1020 1021 if (cluster->focus == NULL) { 1022 cluster->focus_index = i; 1023 cluster->focus = chain; 1024 cluster->ddflag = ddflag; 1025 } 1026 if (cparent->focus == cparent->array[i].chain) { 1027 cluster->focus_index = i; 1028 cluster->focus = chain; 1029 cluster->ddflag = ddflag; 1030 } 1031 } 1032 if (key_accum > key_next) 1033 key_accum = key_next; 1034 } 1035 1036 /* 1037 * Pass-2 invalidate mismatches 1038 */ 1039 focus = cluster->focus; 1040 if (focus == NULL) 1041 goto done; 1042 1043 for (i = 0; i < cparent->nchains; ++i) { 1044 int ddflag; 1045 1046 chain = cluster->array[i].chain; 1047 1048 if (chain == NULL) 1049 continue; 1050 if (chain == focus) 1051 continue; 1052 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1053 continue; 1054 1055 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 1056 if (chain->bref.type != focus->bref.type || 1057 chain->bref.key != focus->bref.key || 1058 chain->bref.keybits != focus->bref.keybits || 1059 chain->bref.modify_tid != focus->bref.modify_tid || 1060 chain->bytes != focus->bytes || 1061 
ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
		}
	}

	/*
	 * Resolve cluster flags.  A lookup or locking failure could wind
	 * up changing the cluster.
	 */
done:
	*key_nextp = key_accum;
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}

/*
 * Locate next match or overlap under parent, replace cluster.
 *
 * Every element of (cluster) is advanced to the next chain under the
 * matching element of (cparent) using hammer2_chain_next().  The focus
 * is then re-selected and elements which disagree with the new focus
 * are flagged HAMMER2_CITEM_INVALID.
 *
 * cparent	- Parent cluster.  Its focus and focus_index are cleared
 *		  by this function.
 * cluster	- Cluster being iterated, modified in place.
 * key_nextp	- On entry the starting key_next handed to each element's
 *		  iteration.  On return the smallest key_next reported by
 *		  any iterated element (never larger than the value passed
 *		  in).
 * key_beg,
 * key_end,
 * flags	- Passed through to hammer2_chain_next().  When
 *		  HAMMER2_LOOKUP_NOLOCK is set in flags, chains discarded
 *		  here are dropped without being unlocked.
 *
 * Returns (cluster), or NULL after hammer2_cluster_drop() when every
 * element was counted as empty (no old chain, discarded, or no new chain).
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_chain_t *focus;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	/*
	 * Both clusters lose their focus up-front.  Only cluster's focus
	 * is re-selected by the loop below.
	 */
	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;
	cluster->focus_index = 0;
	cparent->focus_index = 0;

	cluster->ddflag = 0;

	/*
	 * Pass-1 advance each element and pick a focus
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		/* each element restarts from the caller-supplied key */
		key_next = *key_nextp;
		ochain = cluster->array[i].chain;

		/*
		 * Nothing to iterate from.  These cases can occur under
		 * normal operations.  For example, during synchronization
		 * a slave might reach the end of its scan while records
		 * are still left on the master(s).
		 */
		if (ochain == NULL) {
			++null_count;
			continue;
		}

		/*
		 * No usable parent for this element, or either side is
		 * flagged invalid.  Release the old chain and leave the
		 * slot empty instead of iterating it.
		 */
		if (cparent->array[i].chain == NULL ||
		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
				hammer2_chain_unlock(ochain);
			hammer2_chain_drop(ochain);
			cluster->array[i].chain = NULL;
			++null_count;
			continue;
		}

		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
					    &key_next, key_beg, key_end,
					    &cparent->array[i].cache_index,
					    flags);
		/* ochain now invalid but can still be used for focus check */

		cluster->array[i].chain = nchain;
		if (nchain == NULL) {
			++null_count;
		} else if (nchain->error) {
			/*
			 * Leave errored chain in cluster, but it cannot be
			 * the cluster's focus.
			 */
			;
		} else {
			int ddflag = (nchain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);

			/*
			 * Possible new focus.  The first element yielding
			 * a non-errored chain wins and also determines
			 * cluster->ddflag.
			 */
			if (cluster->focus == NULL) {
				cluster->ddflag = ddflag;
				cluster->focus_index = i;
				cluster->focus = nchain;
			}

			/*
			 * Fixup pre-existing focus.
			 *
			 * NOTE(review): cluster->focus was cleared on entry
			 * and is only ever assigned an nchain above, so this
			 * comparison against ochain looks unable to match
			 * the focus the caller passed in -- confirm intent.
			 */
			if (cluster->focus == ochain) {
				cluster->focus_index = i;
				cluster->focus = nchain;
			}
		}

		/* track the smallest key_next seen across all elements */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/*
	 * Pass-2 invalidate mismatches
	 *
	 * Without a focus there is nothing to compare against.  (i) equals
	 * cparent->nchains when jumping to done from here.
	 */
	focus = cluster->focus;
	if (focus == NULL)
		goto done;

	for (i = 0; i < cparent->nchains; ++i) {
		int ddflag;

		nchain = cluster->array[i].chain;

		if (nchain == NULL)
			continue;
		if (nchain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		/*
		 * An element is only considered consistent with the focus
		 * when the bref identity (type, key, keybits, modify_tid),
		 * the chain size, and the inode-ness all match.
		 */
		ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (nchain->bref.type != focus->bref.type ||
		    nchain->bref.key != focus->bref.key ||
		    nchain->bref.keybits != focus->bref.keybits ||
		    nchain->bref.modify_tid != focus->bref.modify_tid ||
		    nchain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
		}
	}

done:
	/*
	 * Publish the accumulated key, size the cluster to the number of
	 * parent elements walked, and re-resolve it.  A cluster in which
	 * every element was counted empty is dropped and NULL is returned.
	 */
	*key_nextp = key_accum;
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

/*
 * Advance just one chain in the cluster and recalculate the invalid bit.
 * (used during synchronization to advance past a chain being deleted).
 *
 * The chain being advanced must not be the focus and the clusters in
 * question must have already passed normal cluster_lookup/cluster_next
 * checks.
 *
 * The cluster always remains intact on return, so void function.
1224 */ 1225 void 1226 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent, 1227 hammer2_cluster_t *cluster, 1228 hammer2_key_t *key_nextp, 1229 hammer2_key_t key_beg, 1230 hammer2_key_t key_end, 1231 int i, int flags) 1232 { 1233 hammer2_chain_t *ochain; 1234 hammer2_chain_t *nchain; 1235 hammer2_chain_t *focus; 1236 hammer2_key_t key_accum; 1237 hammer2_key_t key_next; 1238 int ddflag; 1239 1240 key_accum = *key_nextp; 1241 key_next = *key_nextp; 1242 ochain = cluster->array[i].chain; 1243 if (ochain == NULL) 1244 goto done; 1245 KKASSERT(ochain != cluster->focus); 1246 1247 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1248 &key_next, key_beg, key_end, 1249 &cparent->array[i].cache_index, 1250 flags); 1251 /* ochain now invalid */ 1252 1253 /* 1254 * Install nchain. Note that nchain can be NULL, and can also 1255 * be in an unlocked state depending on flags. 1256 */ 1257 cluster->array[i].chain = nchain; 1258 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1259 1260 if (key_accum > key_next) 1261 key_accum = key_next; 1262 1263 focus = cluster->focus; 1264 if (focus == NULL) 1265 goto done; 1266 if (nchain == NULL) 1267 goto done; 1268 #if 0 1269 if (nchain == focus) /* ASSERTED NOT TRUE */ 1270 ... 1271 #endif 1272 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE); 1273 if (nchain->bref.type != focus->bref.type || 1274 nchain->bref.key != focus->bref.key || 1275 nchain->bref.keybits != focus->bref.keybits || 1276 nchain->bref.modify_tid != focus->bref.modify_tid || 1277 nchain->bytes != focus->bytes || 1278 ddflag != cluster->ddflag) { 1279 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1280 } 1281 1282 done: 1283 *key_nextp = key_accum; 1284 #if 0 1285 /* 1286 * For now don't re-resolve cluster->flags. 
1287 */ 1288 hammer2_cluster_resolve(cluster); 1289 #endif 1290 } 1291 1292 /* 1293 * Create a new cluster using the specified key 1294 */ 1295 int 1296 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1297 hammer2_cluster_t **clusterp, 1298 hammer2_key_t key, int keybits, 1299 int type, size_t bytes, int flags) 1300 { 1301 hammer2_cluster_t *cluster; 1302 hammer2_pfs_t *pmp; 1303 int error; 1304 int i; 1305 1306 pmp = trans->pmp; /* can be NULL */ 1307 1308 if ((cluster = *clusterp) == NULL) { 1309 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 1310 M_WAITOK | M_ZERO); 1311 cluster->pmp = pmp; /* can be NULL */ 1312 cluster->refs = 1; 1313 cluster->flags = HAMMER2_CLUSTER_LOCKED; 1314 } 1315 cluster->focus_index = 0; 1316 cluster->focus = NULL; 1317 1318 /* 1319 * NOTE: cluster->array[] entries can initially be NULL. If 1320 * *clusterp is supplied, skip NULL entries, otherwise 1321 * create new chains. 1322 */ 1323 for (i = 0; i < cparent->nchains; ++i) { 1324 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1325 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1326 continue; 1327 } 1328 if (*clusterp) { 1329 if ((cluster->array[i].flags & 1330 HAMMER2_CITEM_FEMOD) == 0) { 1331 cluster->array[i].flags |= 1332 HAMMER2_CITEM_INVALID; 1333 continue; 1334 } 1335 if (cluster->array[i].chain == NULL) 1336 continue; 1337 } 1338 error = hammer2_chain_create(trans, &cparent->array[i].chain, 1339 &cluster->array[i].chain, pmp, 1340 key, keybits, 1341 type, bytes, flags); 1342 KKASSERT(error == 0); 1343 if (cluster->focus == NULL) { 1344 cluster->focus_index = i; 1345 cluster->focus = cluster->array[i].chain; 1346 } 1347 if (cparent->focus == cparent->array[i].chain) { 1348 cluster->focus_index = i; 1349 cluster->focus = cluster->array[i].chain; 1350 } 1351 } 1352 cluster->nchains = i; 1353 *clusterp = cluster; 1354 hammer2_cluster_resolve(cluster); 1355 1356 return error; 1357 } 1358 1359 /* 1360 * Rename a cluster to a new parent. 
1361 * 1362 * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(), 1363 * So the data_off field is not relevant. Only the key and 1364 * keybits are used. 1365 */ 1366 void 1367 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 1368 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1369 int flags) 1370 { 1371 hammer2_chain_t *chain; 1372 hammer2_blockref_t xbref; 1373 int i; 1374 1375 cluster->focus = NULL; 1376 cparent->focus = NULL; 1377 cluster->focus_index = 0; 1378 cparent->focus_index = 0; 1379 1380 for (i = 0; i < cluster->nchains; ++i) { 1381 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1382 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1383 continue; 1384 } 1385 chain = cluster->array[i].chain; 1386 if (chain) { 1387 if (bref) { 1388 xbref = chain->bref; 1389 xbref.key = bref->key; 1390 xbref.keybits = bref->keybits; 1391 hammer2_chain_rename(trans, &xbref, 1392 &cparent->array[i].chain, 1393 chain, flags); 1394 } else { 1395 hammer2_chain_rename(trans, NULL, 1396 &cparent->array[i].chain, 1397 chain, flags); 1398 } 1399 KKASSERT(cluster->array[i].chain == chain); /*remove*/ 1400 } 1401 } 1402 } 1403 1404 /* 1405 * Mark a cluster deleted 1406 */ 1407 void 1408 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1409 hammer2_cluster_t *cluster, int flags) 1410 { 1411 hammer2_chain_t *chain; 1412 hammer2_chain_t *parent; 1413 int i; 1414 1415 if (cparent == NULL) { 1416 kprintf("cparent is NULL\n"); 1417 return; 1418 } 1419 1420 for (i = 0; i < cluster->nchains; ++i) { 1421 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1422 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1423 continue; 1424 } 1425 parent = cparent->array[i].chain; 1426 chain = cluster->array[i].chain; 1427 if (chain == NULL) 1428 continue; 1429 if (chain->parent != parent) { 1430 kprintf("hammer2_cluster_delete: parent " 1431 "mismatch chain=%p parent=%p against=%p\n", 1432 chain, 
chain->parent, parent); 1433 } else { 1434 hammer2_chain_delete(trans, parent, chain, flags); 1435 } 1436 } 1437 } 1438 1439 /* 1440 * Create a snapshot of the specified {parent, ochain} with the specified 1441 * label. The originating hammer2_inode must be exclusively locked for 1442 * safety. 1443 * 1444 * The ioctl code has already synced the filesystem. 1445 */ 1446 int 1447 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, 1448 hammer2_ioc_pfs_t *pfs) 1449 { 1450 hammer2_dev_t *hmp; 1451 hammer2_cluster_t *ncluster; 1452 const hammer2_inode_data_t *ripdata; 1453 hammer2_inode_data_t *wipdata; 1454 hammer2_chain_t *nchain; 1455 hammer2_inode_t *nip; 1456 size_t name_len; 1457 hammer2_key_t lhc; 1458 struct vattr vat; 1459 #if 0 1460 uuid_t opfs_clid; 1461 #endif 1462 int error; 1463 int i; 1464 1465 kprintf("snapshot %s\n", pfs->name); 1466 1467 name_len = strlen(pfs->name); 1468 lhc = hammer2_dirhash(pfs->name, name_len); 1469 1470 /* 1471 * Get the clid 1472 */ 1473 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; 1474 #if 0 1475 opfs_clid = ripdata->pfs_clid; 1476 #endif 1477 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */ 1478 1479 /* 1480 * Create the snapshot directory under the super-root 1481 * 1482 * Set PFS type, generate a unique filesystem id, and generate 1483 * a cluster id. Use the same clid when snapshotting a PFS root, 1484 * which theoretically allows the snapshot to be used as part of 1485 * the same cluster (perhaps as a cache). 1486 * 1487 * Copy the (flushed) blockref array. Theoretically we could use 1488 * chain_duplicate() but it becomes difficult to disentangle 1489 * the shared core so for now just brute-force it. 
1490 */ 1491 VATTR_NULL(&vat); 1492 vat.va_type = VDIR; 1493 vat.va_mode = 0755; 1494 ncluster = NULL; 1495 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat, 1496 proc0.p_ucred, pfs->name, name_len, 1497 &ncluster, 1498 HAMMER2_INSERT_PFSROOT, &error); 1499 1500 if (nip) { 1501 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0); 1502 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER; 1503 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT; 1504 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT; 1505 kern_uuidgen(&wipdata->pfs_fsid, 1); 1506 1507 /* 1508 * Give the snapshot its own private cluster. As a snapshot 1509 * no further synchronization with the original cluster will 1510 * be done. 1511 */ 1512 #if 0 1513 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1514 wipdata->pfs_clid = opfs_clid; 1515 else 1516 kern_uuidgen(&wipdata->pfs_clid, 1); 1517 #endif 1518 kern_uuidgen(&wipdata->pfs_clid, 1); 1519 1520 for (i = 0; i < ncluster->nchains; ++i) { 1521 if ((ncluster->array[i].flags & 1522 HAMMER2_CITEM_FEMOD) == 0) { 1523 ncluster->array[i].flags |= 1524 HAMMER2_CITEM_INVALID; 1525 continue; 1526 } 1527 nchain = ncluster->array[i].chain; 1528 if (nchain) 1529 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 1530 } 1531 #if 0 1532 /* XXX can't set this unless we do an explicit flush, which 1533 we also need a pmp assigned to do, else the flush code 1534 won't flush ncluster because it thinks it is crossing a 1535 flush boundary */ 1536 hammer2_cluster_set_chainflags(ncluster, 1537 HAMMER2_CHAIN_PFSBOUNDARY); 1538 #endif 1539 1540 /* XXX hack blockset copy */ 1541 /* XXX doesn't work with real cluster */ 1542 KKASSERT(ocluster->nchains == 1); 1543 wipdata->u.blockset = ripdata->u.blockset; 1544 hammer2_cluster_modsync(ncluster); 1545 for (i = 0; i < ncluster->nchains; ++i) { 1546 nchain = ncluster->array[i].chain; 1547 if (nchain) 1548 hammer2_flush(trans, nchain); 1549 } 1550 hammer2_inode_unlock(nip, ncluster); 1551 } 1552 return (error); 1553 } 
1554 1555 /* 1556 * Return locked parent cluster given a locked child. The child remains 1557 * locked on return. The new parent's focus follows the child's focus 1558 * and the parent is always resolved. 1559 */ 1560 hammer2_cluster_t * 1561 hammer2_cluster_parent(hammer2_cluster_t *cluster) 1562 { 1563 hammer2_cluster_t *cparent; 1564 int i; 1565 1566 cparent = hammer2_cluster_copy(cluster); 1567 1568 for (i = 0; i < cparent->nchains; ++i) { 1569 hammer2_chain_t *chain; 1570 hammer2_chain_t *rchain; 1571 1572 /* 1573 * Calculate parent for each element. Old chain has an extra 1574 * ref for cparent but the lock remains with cluster. 1575 */ 1576 chain = cparent->array[i].chain; 1577 if (chain == NULL) 1578 continue; 1579 while ((rchain = chain->parent) != NULL) { 1580 hammer2_chain_ref(rchain); 1581 hammer2_chain_unlock(chain); 1582 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS); 1583 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1584 if (chain->parent == rchain) 1585 break; 1586 hammer2_chain_unlock(rchain); 1587 hammer2_chain_drop(rchain); 1588 } 1589 if (cluster->focus == chain) { 1590 cparent->focus_index = i; 1591 cparent->focus = rchain; 1592 } 1593 cparent->array[i].chain = rchain; 1594 hammer2_chain_drop(chain); 1595 } 1596 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1597 hammer2_cluster_resolve(cparent); 1598 1599 return cparent; 1600 } 1601 1602 /************************************************************************ 1603 * CLUSTER I/O * 1604 ************************************************************************ 1605 * 1606 * 1607 * WARNING! blockref[] array data is not universal. These functions should 1608 * only be used to access universal data. 1609 * 1610 * NOTE! The rdata call will wait for at least one of the chain I/Os to 1611 * complete if necessary. The I/O's should have already been 1612 * initiated by the cluster_lock/chain_lock operation. 1613 * 1614 * The cluster must already be in a modified state before wdata 1615 * is called. 
The data will already be available for this case. 1616 */ 1617 const hammer2_media_data_t * 1618 hammer2_cluster_rdata(hammer2_cluster_t *cluster) 1619 { 1620 return(cluster->focus->data); 1621 } 1622 1623 hammer2_media_data_t * 1624 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 1625 { 1626 KKASSERT(hammer2_cluster_modified(cluster)); 1627 return(cluster->focus->data); 1628 } 1629 1630 /* 1631 * Load cluster data asynchronously with callback. 1632 * 1633 * The callback is made for the first validated data found, or NULL 1634 * if no valid data is available. 1635 * 1636 * NOTE! The cluster structure is either unique or serialized (e.g. embedded 1637 * in the inode with an exclusive lock held), the chain structure may be 1638 * shared. 1639 */ 1640 void 1641 hammer2_cluster_load_async(hammer2_cluster_t *cluster, 1642 void (*callback)(hammer2_iocb_t *iocb), void *ptr) 1643 { 1644 hammer2_chain_t *chain; 1645 hammer2_iocb_t *iocb; 1646 hammer2_dev_t *hmp; 1647 hammer2_blockref_t *bref; 1648 int i; 1649 1650 /* 1651 * Try to find a chain whos data is already resolved. If none can 1652 * be found, start with the first chain. 1653 */ 1654 chain = NULL; 1655 for (i = 0; i < cluster->nchains; ++i) { 1656 chain = cluster->array[i].chain; 1657 if (chain && chain->data) 1658 break; 1659 } 1660 if (i == cluster->nchains) { 1661 chain = cluster->array[0].chain; 1662 i = 0; 1663 } 1664 1665 iocb = &cluster->iocb; 1666 iocb->callback = callback; 1667 iocb->dio = NULL; /* for already-validated case */ 1668 iocb->cluster = cluster; 1669 iocb->chain = chain; 1670 iocb->ptr = ptr; 1671 iocb->lbase = (off_t)i; 1672 iocb->flags = 0; 1673 iocb->error = 0; 1674 1675 /* 1676 * Data already validated 1677 */ 1678 if (chain->data) { 1679 callback(iocb); 1680 return; 1681 } 1682 1683 /* 1684 * We must resolve to a device buffer, either by issuing I/O or 1685 * by creating a zero-fill element. 
We do not mark the buffer 1686 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1687 * API must still be used to do that). 1688 * 1689 * The device buffer is variable-sized in powers of 2 down 1690 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1691 * chunk always contains buffers of the same size. (XXX) 1692 * 1693 * The minimum physical IO size may be larger than the variable 1694 * block size. 1695 * 1696 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes 1697 * matches hammer2_devblksize()? Or does the freemap's 1698 * pre-zeroing handle the case for us? 1699 */ 1700 bref = &chain->bref; 1701 hmp = chain->hmp; 1702 1703 #if 0 1704 /* handled by callback? <- TODO XXX even needed for loads? */ 1705 /* 1706 * The getblk() optimization for a 100% overwrite can only be used 1707 * if the physical block size matches the request. 1708 */ 1709 if ((chain->flags & HAMMER2_CHAIN_INITIAL) && 1710 chain->bytes == hammer2_devblksize(chain->bytes)) { 1711 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio); 1712 KKASSERT(error == 0); 1713 iocb->dio = dio; 1714 callback(iocb); 1715 return; 1716 } 1717 #endif 1718 1719 /* 1720 * Otherwise issue a read 1721 */ 1722 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1723 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb); 1724 } 1725