/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The cluster module collects multiple chains representing the same
 * information into a single entity.  It allows direct access to media
 * data as long as it is not blockref array data.
Meaning, basically, 38 * just inode and file data. 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 */ 48 #include <sys/cdefs.h> 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/types.h> 52 #include <sys/lock.h> 53 #include <sys/uuid.h> 54 55 #include "hammer2.h" 56 57 /* 58 * Returns TRUE if any chain in the cluster needs to be resized. 59 */ 60 int 61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 62 { 63 hammer2_chain_t *chain; 64 int i; 65 66 for (i = 0; i < cluster->nchains; ++i) { 67 chain = cluster->array[i]; 68 if (chain && chain->bytes != bytes) 69 return 1; 70 } 71 return 0; 72 } 73 74 uint8_t 75 hammer2_cluster_type(hammer2_cluster_t *cluster) 76 { 77 return(cluster->focus->bref.type); 78 } 79 80 int 81 hammer2_cluster_modified(hammer2_cluster_t *cluster) 82 { 83 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 84 } 85 86 /* 87 * Return a bref representative of the cluster. Any data offset is removed 88 * (since it would only be applicable to a particular chain in the cluster). 89 * 90 * However, the radix portion of data_off is used for many purposes and will 91 * be retained. 
92 */ 93 void 94 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 95 { 96 *bref = cluster->focus->bref; 97 bref->data_off &= HAMMER2_OFF_MASK_RADIX; 98 } 99 100 void 101 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 102 { 103 hammer2_chain_t *chain; 104 int i; 105 106 for (i = 0; i < cluster->nchains; ++i) { 107 chain = cluster->array[i]; 108 if (chain) 109 atomic_set_int(&chain->flags, flags); 110 } 111 } 112 113 void 114 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 115 { 116 hammer2_chain_t *chain; 117 int i; 118 119 for (i = 0; i < cluster->nchains; ++i) { 120 chain = cluster->array[i]; 121 if (chain) 122 hammer2_chain_setflush(trans, chain); 123 } 124 } 125 126 void 127 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 128 hammer2_cluster_t *cluster, 129 int check_algo) 130 { 131 hammer2_chain_t *chain; 132 int i; 133 134 for (i = 0; i < cluster->nchains; ++i) { 135 chain = cluster->array[i]; 136 if (chain) { 137 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 138 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); 139 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 140 } 141 } 142 } 143 144 /* 145 * Create a cluster with one ref from the specified chain. The chain 146 * is not further referenced. The caller typically supplies a locked 147 * chain and transfers ownership to the cluster. 148 */ 149 hammer2_cluster_t * 150 hammer2_cluster_from_chain(hammer2_chain_t *chain) 151 { 152 hammer2_cluster_t *cluster; 153 154 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 155 cluster->array[0] = chain; 156 cluster->nchains = 1; 157 cluster->focus = chain; 158 cluster->pmp = chain->pmp; 159 cluster->refs = 1; 160 161 return cluster; 162 } 163 164 /* 165 * Allocates a cluster and its underlying chain structures. The underlying 166 * chains will be locked. The cluster and underlying chains will have one 167 * ref. 
168 */ 169 hammer2_cluster_t * 170 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp, 171 hammer2_trans_t *trans, hammer2_blockref_t *bref) 172 { 173 hammer2_cluster_t *cluster; 174 hammer2_cluster_t *rcluster; 175 hammer2_chain_t *chain; 176 #if 0 177 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 178 #endif 179 int i; 180 181 KKASSERT(pmp != NULL); 182 183 /* 184 * Construct the appropriate system structure. 185 */ 186 switch(bref->type) { 187 case HAMMER2_BREF_TYPE_INODE: 188 case HAMMER2_BREF_TYPE_INDIRECT: 189 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 190 case HAMMER2_BREF_TYPE_DATA: 191 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 192 /* 193 * Chain's are really only associated with the hmp but we 194 * maintain a pmp association for per-mount memory tracking 195 * purposes. The pmp can be NULL. 196 */ 197 break; 198 case HAMMER2_BREF_TYPE_VOLUME: 199 case HAMMER2_BREF_TYPE_FREEMAP: 200 chain = NULL; 201 panic("hammer2_cluster_alloc volume type illegal for op"); 202 default: 203 chain = NULL; 204 panic("hammer2_cluster_alloc: unrecognized blockref type: %d", 205 bref->type); 206 } 207 208 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 209 cluster->refs = 1; 210 211 rcluster = &pmp->iroot->cluster; 212 for (i = 0; i < rcluster->nchains; ++i) { 213 chain = hammer2_chain_alloc(rcluster->array[i]->hmp, 214 pmp, trans, bref); 215 #if 0 216 chain->hmp = rcluster->array[i]->hmp; 217 chain->bref = *bref; 218 chain->bytes = bytes; 219 chain->refs = 1; 220 chain->flags = HAMMER2_CHAIN_ALLOCATED; 221 #endif 222 223 /* 224 * NOTE: When loading a chain from backing store or creating a 225 * snapshot, trans will be NULL and the caller is 226 * responsible for setting these fields. 227 */ 228 cluster->array[i] = chain; 229 } 230 cluster->nchains = i; 231 cluster->pmp = pmp; 232 cluster->focus = cluster->array[0]; 233 234 return (cluster); 235 } 236 237 /* 238 * Add a reference to a cluster. 
239 * 240 * We must also ref the underlying chains in order to allow ref/unlock 241 * sequences to later re-lock. 242 */ 243 void 244 hammer2_cluster_ref(hammer2_cluster_t *cluster) 245 { 246 hammer2_chain_t *chain; 247 int i; 248 249 atomic_add_int(&cluster->refs, 1); 250 for (i = 0; i < cluster->nchains; ++i) { 251 chain = cluster->array[i]; 252 if (chain) 253 hammer2_chain_ref(chain); 254 } 255 } 256 257 /* 258 * Drop the caller's reference to the cluster. When the ref count drops to 259 * zero this function frees the cluster and drops all underlying chains. 260 * 261 * In-progress read I/Os are typically detached from the cluster once the 262 * first one returns (the remaining stay attached to the DIOs but are then 263 * ignored and drop naturally). 264 */ 265 void 266 hammer2_cluster_drop(hammer2_cluster_t *cluster) 267 { 268 hammer2_chain_t *chain; 269 int i; 270 271 KKASSERT(cluster->refs > 0); 272 for (i = 0; i < cluster->nchains; ++i) { 273 chain = cluster->array[i]; 274 if (chain) { 275 hammer2_chain_drop(chain); 276 if (cluster->refs == 1) 277 cluster->array[i] = NULL; 278 } 279 } 280 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 281 cluster->focus = NULL; 282 kfree(cluster, M_HAMMER2); 283 /* cluster = NULL; safety */ 284 } 285 } 286 287 void 288 hammer2_cluster_wait(hammer2_cluster_t *cluster) 289 { 290 tsleep(cluster->focus, 0, "h2clcw", 1); 291 } 292 293 /* 294 * Lock and ref a cluster. This adds a ref to the cluster and its chains 295 * and then locks them. 
296 */ 297 int 298 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 299 { 300 hammer2_chain_t *chain; 301 int i; 302 int error; 303 304 error = 0; 305 atomic_add_int(&cluster->refs, 1); 306 for (i = 0; i < cluster->nchains; ++i) { 307 chain = cluster->array[i]; 308 if (chain) { 309 error = hammer2_chain_lock(chain, how); 310 if (error) { 311 while (--i >= 0) 312 hammer2_chain_unlock(cluster->array[i]); 313 atomic_add_int(&cluster->refs, -1); 314 break; 315 } 316 } 317 } 318 return error; 319 } 320 321 /* 322 * Replace the contents of dst with src, adding a reference to src's chains. 323 * dst is assumed to already have a ref and any chains present in dst are 324 * assumed to be locked and will be unlocked. 325 * 326 * If the chains in src are locked, only one of (src) or (dst) should be 327 * considered locked by the caller after return, not both. 328 */ 329 void 330 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src) 331 { 332 hammer2_chain_t *chain; 333 int i; 334 335 KKASSERT(dst->refs == 1); 336 dst->focus = NULL; 337 338 for (i = 0; i < src->nchains; ++i) { 339 chain = src->array[i]; 340 if (chain) { 341 hammer2_chain_ref(chain); 342 if (i < dst->nchains && dst->array[i]) 343 hammer2_chain_unlock(dst->array[i]); 344 dst->array[i] = chain; 345 if (dst->focus == NULL) 346 dst->focus = chain; 347 } 348 } 349 while (i < dst->nchains) { 350 chain = dst->array[i]; 351 if (chain) { 352 hammer2_chain_unlock(chain); 353 dst->array[i] = NULL; 354 } 355 ++i; 356 } 357 dst->nchains = src->nchains; 358 } 359 360 /* 361 * Replace the contents of the locked destination with the contents of the 362 * locked source. Destination must have one ref. 363 * 364 * Returns with the destination still with one ref and the copied chains 365 * with an additional lock (representing their state on the destination). 366 * The original chains associated with the destination are unlocked. 
367 */ 368 void 369 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src) 370 { 371 hammer2_chain_t *chain; 372 int i; 373 374 KKASSERT(dst->refs == 1); 375 376 dst->focus = NULL; 377 for (i = 0; i < src->nchains; ++i) { 378 chain = src->array[i]; 379 if (chain) { 380 hammer2_chain_lock(chain, 0); 381 if (i < dst->nchains && dst->array[i]) 382 hammer2_chain_unlock(dst->array[i]); 383 dst->array[i] = src->array[i]; 384 if (dst->focus == NULL) 385 dst->focus = chain; 386 } 387 } 388 while (i < dst->nchains) { 389 chain = dst->array[i]; 390 if (chain) { 391 hammer2_chain_unlock(chain); 392 dst->array[i] = NULL; 393 } 394 ++i; 395 } 396 dst->nchains = src->nchains; 397 } 398 399 /* 400 * Copy a cluster, returned a ref'd cluster. All underlying chains 401 * are also ref'd, but not locked. 402 * 403 * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied 404 * to the new cluster and a reference is nominally added to them and to 405 * the cluster. The cluster will have 1 ref. 406 * 407 * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains 408 * are copied but no additional references are made and the cluster will have 409 * 0 refs. Callers must ref the cluster and the chains before dropping it 410 * (typically by locking it). 411 * 412 * If flags are passed as 0 the copy is setup as if it contained the chains 413 * but the chains will not be copied over, and the cluster will have 0 refs. 414 * Callers must ref the cluster before dropping it (typically by locking it). 415 */ 416 hammer2_cluster_t * 417 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags) 418 { 419 hammer2_pfsmount_t *pmp = ocluster->pmp; 420 hammer2_cluster_t *ncluster; 421 hammer2_chain_t *chain; 422 int i; 423 424 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 425 ncluster->pmp = pmp; 426 ncluster->nchains = ocluster->nchains; 427 ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 
0 : 1; 428 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) { 429 ncluster->focus = ocluster->focus; 430 for (i = 0; i < ocluster->nchains; ++i) { 431 chain = ocluster->array[i]; 432 ncluster->array[i] = chain; 433 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 && 434 chain) { 435 hammer2_chain_ref(chain); 436 } 437 } 438 } 439 return (ncluster); 440 } 441 442 /* 443 * Unlock and deref a cluster. The cluster is destroyed if this is the 444 * last ref. 445 */ 446 void 447 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 448 { 449 hammer2_chain_t *chain; 450 int i; 451 452 KKASSERT(cluster->refs > 0); 453 for (i = 0; i < cluster->nchains; ++i) { 454 chain = cluster->array[i]; 455 if (chain) { 456 hammer2_chain_unlock(chain); 457 if (cluster->refs == 1) 458 cluster->array[i] = NULL; /* safety */ 459 } 460 } 461 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 462 cluster->focus = NULL; 463 kfree(cluster, M_HAMMER2); 464 /* cluster = NULL; safety */ 465 } 466 } 467 468 /* 469 * Resize the cluster's physical storage allocation in-place. This may 470 * replace the cluster's chains. 471 */ 472 void 473 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 474 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 475 int nradix, int flags) 476 { 477 int i; 478 479 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 480 KKASSERT(cparent->nchains == cluster->nchains); 481 482 cluster->focus = NULL; 483 for (i = 0; i < cluster->nchains; ++i) { 484 if (cluster->array[i]) { 485 KKASSERT(cparent->array[i]); 486 hammer2_chain_resize(trans, ip, 487 cparent->array[i], 488 cluster->array[i], 489 nradix, flags); 490 if (cluster->focus == NULL) 491 cluster->focus = cluster->array[i]; 492 } 493 } 494 } 495 496 /* 497 * Set an inode's cluster modified, marking the related chains RW and 498 * duplicating them if necessary. 
499 * 500 * The passed-in chain is a localized copy of the chain previously acquired 501 * when the inode was locked (and possilby replaced in the mean time), and 502 * must also be updated. In fact, we update it first and then synchronize 503 * the inode's cluster cache. 504 */ 505 hammer2_inode_data_t * 506 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 507 hammer2_cluster_t *cluster, int flags) 508 { 509 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 510 hammer2_cluster_modify(trans, cluster, flags); 511 512 hammer2_inode_repoint(ip, NULL, cluster); 513 if (ip->vp) 514 vsetisdirty(ip->vp); 515 return (&hammer2_cluster_wdata(cluster)->ipdata); 516 } 517 518 /* 519 * Adjust the cluster's chains to allow modification and adjust the 520 * focus. Data will be accessible on return. 521 */ 522 void 523 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, 524 int flags) 525 { 526 int i; 527 528 cluster->focus = NULL; 529 for (i = 0; i < cluster->nchains; ++i) { 530 if (cluster->array[i]) { 531 hammer2_chain_modify(trans, cluster->array[i], flags); 532 if (cluster->focus == NULL) 533 cluster->focus = cluster->array[i]; 534 } 535 } 536 } 537 538 /* 539 * Synchronize modifications from the focus to other chains in a cluster. 540 * Convenient because nominal API users can just modify the contents of the 541 * focus (at least for non-blockref data). 542 * 543 * Nominal front-end operations only edit non-block-table data in a single 544 * chain. This code copies such modifications to the other chains in the 545 * cluster. Blocktable modifications are handled on a chain-by-chain basis 546 * by both the frontend and the backend and will explode in fireworks if 547 * blindly copied. 
548 */ 549 void 550 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 551 { 552 hammer2_chain_t *focus; 553 hammer2_chain_t *scan; 554 const hammer2_inode_data_t *ripdata; 555 hammer2_inode_data_t *wipdata; 556 int i; 557 558 focus = cluster->focus; 559 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 560 561 for (i = 0; i < cluster->nchains; ++i) { 562 scan = cluster->array[i]; 563 if (scan == NULL || scan == focus) 564 continue; 565 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 566 KKASSERT(focus->bytes == scan->bytes && 567 focus->bref.type == scan->bref.type); 568 switch(focus->bref.type) { 569 case HAMMER2_BREF_TYPE_INODE: 570 ripdata = &focus->data->ipdata; 571 wipdata = &scan->data->ipdata; 572 if ((ripdata->op_flags & 573 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 574 bcopy(ripdata, wipdata, 575 offsetof(hammer2_inode_data_t, u)); 576 break; 577 } 578 /* fall through */ 579 case HAMMER2_BREF_TYPE_DATA: 580 bcopy(focus->data, scan->data, focus->bytes); 581 break; 582 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 583 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 584 case HAMMER2_BREF_TYPE_FREEMAP: 585 case HAMMER2_BREF_TYPE_VOLUME: 586 panic("hammer2_cluster_modsync: illegal node type"); 587 /* NOT REACHED */ 588 break; 589 default: 590 panic("hammer2_cluster_modsync: unknown node type"); 591 break; 592 } 593 } 594 } 595 596 /* 597 * Lookup initialization/completion API 598 */ 599 hammer2_cluster_t * 600 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 601 { 602 hammer2_cluster_t *cluster; 603 int i; 604 605 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 606 cluster->pmp = cparent->pmp; /* can be NULL */ 607 /* cluster->focus = NULL; already null */ 608 609 for (i = 0; i < cparent->nchains; ++i) { 610 cluster->array[i] = cparent->array[i]; 611 if (cluster->focus == NULL) 612 cluster->focus = cluster->array[i]; 613 } 614 cluster->nchains = cparent->nchains; 615 616 /* 617 * Independently lock (this will also give cluster 1 ref) 618 */ 619 if 
(flags & HAMMER2_LOOKUP_SHARED) { 620 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 621 HAMMER2_RESOLVE_SHARED); 622 } else { 623 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 624 } 625 return (cluster); 626 } 627 628 void 629 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 630 { 631 if (cparent) 632 hammer2_cluster_unlock(cparent); 633 } 634 635 /* 636 * Locate first match or overlap under parent, return a new cluster 637 */ 638 hammer2_cluster_t * 639 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 640 hammer2_key_t key_beg, hammer2_key_t key_end, 641 int flags, int *ddflagp) 642 { 643 hammer2_pfsmount_t *pmp; 644 hammer2_cluster_t *cluster; 645 hammer2_chain_t *chain; 646 hammer2_key_t key_accum; 647 hammer2_key_t key_next; 648 hammer2_key_t bref_key; 649 int bref_keybits; 650 int null_count; 651 int ddflag; 652 int i; 653 uint8_t bref_type; 654 u_int bytes; 655 656 pmp = cparent->pmp; /* can be NULL */ 657 key_accum = *key_nextp; 658 null_count = 0; 659 bref_type = 0; 660 bref_key = 0; 661 bref_keybits = 0; 662 bytes = 0; 663 664 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 665 cluster->pmp = pmp; /* can be NULL */ 666 cluster->refs = 1; 667 /* cluster->focus = NULL; already null */ 668 cparent->focus = NULL; 669 *ddflagp = 0; 670 671 for (i = 0; i < cparent->nchains; ++i) { 672 key_next = *key_nextp; 673 if (cparent->array[i] == NULL) { 674 ++null_count; 675 continue; 676 } 677 chain = hammer2_chain_lookup(&cparent->array[i], &key_next, 678 key_beg, key_end, 679 &cparent->cache_index[i], 680 flags, &ddflag); 681 if (cparent->focus == NULL) 682 cparent->focus = cparent->array[i]; 683 cluster->array[i] = chain; 684 if (chain == NULL) { 685 ++null_count; 686 } else { 687 if (cluster->focus == NULL) { 688 bref_type = chain->bref.type; 689 bref_key = chain->bref.key; 690 bref_keybits = chain->bref.keybits; 691 bytes = chain->bytes; 692 *ddflagp = ddflag; 693 cluster->focus = chain; 694 } 
695 KKASSERT(bref_type == chain->bref.type); 696 KKASSERT(bref_key == chain->bref.key); 697 KKASSERT(bref_keybits == chain->bref.keybits); 698 KKASSERT(bytes == chain->bytes); 699 KKASSERT(*ddflagp == ddflag); 700 } 701 if (key_accum > key_next) 702 key_accum = key_next; 703 } 704 *key_nextp = key_accum; 705 cluster->nchains = i; 706 707 if (null_count == i) { 708 hammer2_cluster_drop(cluster); 709 cluster = NULL; 710 } 711 712 return (cluster); 713 } 714 715 /* 716 * Locate next match or overlap under parent, replace cluster 717 */ 718 hammer2_cluster_t * 719 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 720 hammer2_key_t *key_nextp, 721 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 722 { 723 hammer2_chain_t *chain; 724 hammer2_key_t key_accum; 725 hammer2_key_t key_next; 726 int null_count; 727 int i; 728 729 key_accum = *key_nextp; 730 null_count = 0; 731 cluster->focus = NULL; 732 cparent->focus = NULL; 733 734 for (i = 0; i < cparent->nchains; ++i) { 735 key_next = *key_nextp; 736 chain = cluster->array[i]; 737 if (chain == NULL) { 738 if (cparent->focus == NULL) 739 cparent->focus = cparent->array[i]; 740 ++null_count; 741 continue; 742 } 743 if (cparent->array[i] == NULL) { 744 if (flags & HAMMER2_LOOKUP_NOLOCK) 745 hammer2_chain_drop(chain); 746 else 747 hammer2_chain_unlock(chain); 748 ++null_count; 749 continue; 750 } 751 chain = hammer2_chain_next(&cparent->array[i], chain, 752 &key_next, key_beg, key_end, 753 &cparent->cache_index[i], flags); 754 if (cparent->focus == NULL) 755 cparent->focus = cparent->array[i]; 756 cluster->array[i] = chain; 757 if (chain == NULL) { 758 ++null_count; 759 } else if (cluster->focus == NULL) { 760 cluster->focus = chain; 761 } 762 if (key_accum > key_next) 763 key_accum = key_next; 764 } 765 766 if (null_count == i) { 767 hammer2_cluster_drop(cluster); 768 cluster = NULL; 769 } 770 return(cluster); 771 } 772 773 #if 0 774 /* 775 * XXX initial NULL cluster needs reworking (pass 
**clusterp ?) 776 * 777 * The raw scan function is similar to lookup/next but does not seek to a key. 778 * Blockrefs are iterated via first_chain = (parent, NULL) and 779 * next_chain = (parent, chain). 780 * 781 * The passed-in parent must be locked and its data resolved. The returned 782 * chain will be locked. Pass chain == NULL to acquire the first sub-chain 783 * under parent and then iterate with the passed-in chain (which this 784 * function will unlock). 785 */ 786 hammer2_cluster_t * 787 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 788 int flags) 789 { 790 hammer2_chain_t *chain; 791 int null_count; 792 int i; 793 794 null_count = 0; 795 796 for (i = 0; i < cparent->nchains; ++i) { 797 chain = cluster->array[i]; 798 if (chain == NULL) { 799 ++null_count; 800 continue; 801 } 802 if (cparent->array[i] == NULL) { 803 if (flags & HAMMER2_LOOKUP_NOLOCK) 804 hammer2_chain_drop(chain); 805 else 806 hammer2_chain_unlock(chain); 807 ++null_count; 808 continue; 809 } 810 811 chain = hammer2_chain_scan(cparent->array[i], chain, 812 &cparent->cache_index[i], flags); 813 cluster->array[i] = chain; 814 if (chain == NULL) 815 ++null_count; 816 } 817 818 if (null_count == i) { 819 hammer2_cluster_drop(cluster); 820 cluster = NULL; 821 } 822 return(cluster); 823 } 824 825 #endif 826 827 /* 828 * Create a new cluster using the specified key 829 */ 830 int 831 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 832 hammer2_cluster_t **clusterp, 833 hammer2_key_t key, int keybits, 834 int type, size_t bytes, int flags) 835 { 836 hammer2_cluster_t *cluster; 837 hammer2_pfsmount_t *pmp; 838 int error; 839 int i; 840 841 pmp = trans->pmp; /* can be NULL */ 842 843 if ((cluster = *clusterp) == NULL) { 844 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 845 M_WAITOK | M_ZERO); 846 cluster->pmp = pmp; /* can be NULL */ 847 cluster->refs = 1; 848 } 849 cluster->focus = NULL; 850 cparent->focus = NULL; 851 852 /* 853 * NOTE: 
cluster->array[] entries can initially be NULL. If 854 * *clusterp is supplied, skip NULL entries, otherwise 855 * create new chains. 856 */ 857 for (i = 0; i < cparent->nchains; ++i) { 858 if (*clusterp && cluster->array[i] == NULL) { 859 if (cparent->focus == NULL) 860 cparent->focus = cparent->array[i]; 861 continue; 862 } 863 error = hammer2_chain_create(trans, &cparent->array[i], 864 &cluster->array[i], pmp, 865 key, keybits, 866 type, bytes, flags); 867 KKASSERT(error == 0); 868 if (cparent->focus == NULL) 869 cparent->focus = cparent->array[i]; 870 if (cluster->focus == NULL) 871 cluster->focus = cluster->array[i]; 872 } 873 cluster->nchains = i; 874 *clusterp = cluster; 875 876 return error; 877 } 878 879 /* 880 * Rename a cluster to a new parent. 881 * 882 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields 883 * are used from a passed-in non-NULL bref pointer. All other fields 884 * are extracted from the original chain for each chain in the 885 * iteration. 
886 */ 887 void 888 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 889 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 890 int flags) 891 { 892 hammer2_chain_t *chain; 893 hammer2_blockref_t xbref; 894 int i; 895 896 cluster->focus = NULL; 897 cparent->focus = NULL; 898 899 for (i = 0; i < cluster->nchains; ++i) { 900 chain = cluster->array[i]; 901 if (chain) { 902 if (bref) { 903 xbref = chain->bref; 904 xbref.key = bref->key; 905 xbref.keybits = bref->keybits; 906 hammer2_chain_rename(trans, &xbref, 907 &cparent->array[i], 908 chain, flags); 909 } else { 910 hammer2_chain_rename(trans, NULL, 911 &cparent->array[i], 912 chain, flags); 913 } 914 cluster->array[i] = chain; 915 if (cluster->focus == NULL) 916 cluster->focus = chain; 917 if (cparent->focus == NULL) 918 cparent->focus = cparent->array[i]; 919 } else { 920 if (cparent->focus == NULL) 921 cparent->focus = cparent->array[i]; 922 } 923 } 924 } 925 926 /* 927 * Mark a cluster deleted 928 */ 929 void 930 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 931 hammer2_cluster_t *cluster, int flags) 932 { 933 hammer2_chain_t *chain; 934 hammer2_chain_t *parent; 935 int i; 936 937 if (cparent == NULL) { 938 kprintf("cparent is NULL\n"); 939 return; 940 } 941 942 for (i = 0; i < cluster->nchains; ++i) { 943 parent = (i < cparent->nchains) ? cparent->array[i] : NULL; 944 chain = cluster->array[i]; 945 if (chain == NULL) 946 continue; 947 if (chain->parent != parent) { 948 kprintf("hammer2_cluster_delete: parent " 949 "mismatch chain=%p parent=%p against=%p\n", 950 chain, chain->parent, parent); 951 } else { 952 hammer2_chain_delete(trans, parent, chain, flags); 953 } 954 } 955 } 956 957 /* 958 * Create a snapshot of the specified {parent, ochain} with the specified 959 * label. The originating hammer2_inode must be exclusively locked for 960 * safety. 961 * 962 * The ioctl code has already synced the filesystem. 
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
                       hammer2_ioc_pfs_t *pfs)
{
        hammer2_mount_t *hmp;
        hammer2_cluster_t *ncluster;
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_inode_t *nip;
        size_t name_len;
        hammer2_key_t lhc;
        struct vattr vat;
        uuid_t opfs_clid;
        int error;
        int i;

        kprintf("snapshot %s\n", pfs->name);

        name_len = strlen(pfs->name);
        lhc = hammer2_dirhash(pfs->name, name_len);

        /*
         * Get the clid
         */
        ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
        opfs_clid = ripdata->pfs_clid;
        hmp = ocluster->focus->hmp;

        /*
         * Create the snapshot directory under the super-root
         *
         * Set PFS type, generate a unique filesystem id, and generate
         * a cluster id.  Use the same clid when snapshotting a PFS root,
         * which theoretically allows the snapshot to be used as part of
         * the same cluster (perhaps as a cache).
         *
         * Copy the (flushed) blockref array.  Theoretically we could use
         * chain_duplicate() but it becomes difficult to disentangle
         * the shared core so for now just brute-force it.
         */
        VATTR_NULL(&vat);
        vat.va_type = VDIR;
        vat.va_mode = 0755;
        ncluster = NULL;
        nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
                                   proc0.p_ucred, pfs->name, name_len,
                                   &ncluster, &error);

        if (nip) {
                /* Mark snapshot inode modified and stamp its identity */
                wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
                wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
                kern_uuidgen(&wipdata->pfs_fsid, 1);
                if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                        wipdata->pfs_clid = opfs_clid;
                else
                        kern_uuidgen(&wipdata->pfs_clid, 1);

                /* Flag every chain of the new cluster as a PFS root */
                for (i = 0; i < ncluster->nchains; ++i) {
                        if (ncluster->array[i]) {
                                ncluster->array[i]->bref.flags |=
                                    HAMMER2_BREF_FLAG_PFSROOT;
                        }
                }
#if 0
                /* XXX can't set this unless we do an explicit flush, which
                   we also need a pmp assigned to do, else the flush code
                   won't flush ncluster because it thinks it is crossing a
                   flush boundary */
                hammer2_cluster_set_chainflags(ncluster,
                                               HAMMER2_CHAIN_PFSBOUNDARY);
#endif

                /* XXX hack blockset copy */
                /* XXX doesn't work with real cluster */
                KKASSERT(ocluster->nchains == 1);
                wipdata->u.blockset = ripdata->u.blockset;
                hammer2_cluster_modsync(ncluster);
                for (i = 0; i < ncluster->nchains; ++i) {
                        if (ncluster->array[i])
                                hammer2_flush(trans, ncluster->array[i]);
                }
                hammer2_inode_unlock_ex(nip, ncluster);
        }
        /* NOTE(review): error is set by hammer2_inode_create(); if nip is
         * NULL it carries the creation failure code. */
        return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.  Parent slots corresponding to NULL child slots
 * remain NULL.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *cparent;
        int i;

        cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
        for (i = 0; i < cluster->nchains; ++i) {
                hammer2_chain_t *chain;
                hammer2_chain_t *rchain;

                chain = cluster->array[i];
                if (chain == NULL)
                        continue;
                hammer2_chain_ref(chain);
                /*
                 * Ratchet up to the parent: lock the parent before
                 * relocking the child (lock-order), then verify the
                 * parent linkage did not change while the child was
                 * unlocked.  Retry if it did.
                 */
                while ((rchain = chain->parent) != NULL) {
                        hammer2_chain_ref(rchain);
                        hammer2_chain_unlock(chain);
                        hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
                        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                        hammer2_chain_drop(rchain);
                        if (chain->parent == rchain)
                                break;
                        hammer2_chain_unlock(rchain);
                }
                hammer2_chain_drop(chain);
                cparent->array[i] = rchain;
        }
        return cparent;
}

/************************************************************************
 *                              CLUSTER I/O                             *
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *          only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *          complete if necessary.  The I/O's should have already been
 *          initiated by the cluster_lock/chain_lock operation.
 *
 *          The cluster must already be in a modified state before wdata
 *          is called.  The data will already be available for this case.
 */

/*
 * Return a read-only pointer to the focus chain's media data.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	return(cluster->focus->data);
}

/*
 * Return a writable pointer to the focus chain's media data.  The cluster
 * must already be in a modified state (see the section comment above).
 */
hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	KKASSERT(hammer2_cluster_modified(cluster));
	return(cluster->focus->data);
}

/*
 * Load async into independent buffer - used to load logical buffers from
 * underlying device data.  The callback is made for the first validated
 * data found, or NULL if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whos data is already resolved.  If none can
	 * be found, start with the first chain.
	 *
	 * NOTE(review): if no chain has resolved data, chain falls back to
	 * array[0], which is dereferenced unconditionally below — this
	 * assumes array[0] is non-NULL for a locked cluster; verify against
	 * callers.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		chain = cluster->array[0];
		i = 0;
	}

	/*
	 * Set up the embedded iocb.  lbase is repurposed here to record
	 * which chain index was selected (presumably for the callback's
	 * benefit — confirm against the callback implementations).
	 */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;		/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated - make the callback synchronously with
	 * no dio attached.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read; the iocb callback fires on completion.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}

/************************************************************************
 *				NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
1217 * 1218 * When a read failure or an unexpected chain state is found, the chain and 1219 * parent chain at the failure point for the nodes involved (the nodes 1220 * which we determine to be in error) are flagged as failed and removed 1221 * from the cluster. The node itself is allowed to remain active. The 1222 * highest common point (usually a parent chain) is queued to the 1223 * resynchronization thread for action. 1224 * 1225 * When a write I/O fails or a node runs out of space, we first adjust 1226 * as if a read failure occurs but we further disable flushes on the 1227 * ENTIRE node. Concurrent modifying transactions are allowed to complete 1228 * but any new modifying transactions will automatically remove the node 1229 * from consideration in all related cluster structures and not generate 1230 * any new modified chains. The ROOT chain for the failed node(s) is queued 1231 * to the resynchronization thread for action. 1232 * 1233 * A temporary disconnect is handled as if a write failure occurred. 1234 * 1235 * Any of these failures might or might not stall related high level VNOPS, 1236 * depending on what has failed, what nodes remain, the type of cluster, 1237 * and the operating state of the cluster. 1238 * 1239 * FLUSH ON WRITE-DISABLED NODES 1240 * 1241 * A flush on a write-disabled node is not allowed to write anything because 1242 * we cannot safely update the mirror_tid anywhere on the failed node. The 1243 * synchronization thread uses mirror_tid to calculate incremental resyncs. 1244 * Dirty meta-data related to the failed node is thrown away. 1245 * 1246 * Dirty buffer cache buffers and inodes are only thrown away if they can be 1247 * retired... that is, if the filesystem still has enough nodes to complete 1248 * the operation. 
 */

/************************************************************************
 *			SYNCHRONIZATION THREAD				*
 ************************************************************************
 *
 * This thread is responsible for [re]synchronizing the cluster representing
 * a PFS.  Any out-of-sync or failed node starts this thread on a
 * node-by-node basis when the failure is detected.
 *
 * Clusters needing resynchronization are queued at the highest point
 * where the parent on the failed node is still valid, or a special
 * incremental scan from the ROOT is queued if no parent exists.  This
 * thread is also responsible for waiting for reconnections of the failed
 * node if the cause was due to a disconnect, and waiting for space to be
 * freed up if the cause was due to running out of space.
 *
 * If the cause is due to a node running out of space, this thread will also
 * remove older (unlocked) snapshots to make new space, recover space, and
 * then start resynchronization.
 *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
 *
 * Since these are per-node threads it is possible to resynchronize several
 * nodes at once.
 */