1 /* 2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information into a single entity. It allows direct access to media 37 * data as long as it is not blockref array data. 
Meaning, basically,
 * just inode and file data.
 *
 * This module also handles I/O dispatch, status rollup, and various
 * mastership arrangements including quorum operations.  It effectively
 * presents one topology to the vnops layer.
 *
 * Many of the API calls mimic chain API calls but operate on clusters
 * instead of chains.  Please see hammer2_chain.c for more complete code
 * documentation of the API functions.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Return the media byte size of the cluster, taken from its focus chain.
 */
u_int
hammer2_cluster_bytes(hammer2_cluster_t *cluster)
{
	return(cluster->focus->bytes);
}

/*
 * Return the blockref type of the cluster, taken from its focus chain.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
	return(cluster->focus->bref.type);
}

/*
 * Return a read-only pointer to the focus chain's media data.
 *
 * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
 *	 and hammer2_cluster_modsync(), remember that block array
 *	 entries are not copied to the elements of the cluster.
 */
const hammer2_media_data_t *
hammer2_cluster_data(hammer2_cluster_t *cluster)
{
	return(cluster->focus->data);
}

/*
 * Return a writable pointer to the focus chain's media data.  See the
 * NOTE above regarding block array entries.
 */
hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	return(cluster->focus->data);
}

/*
 * Return non-zero if the cluster's focus chain is flagged MODIFIED.
 */
int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
}

/*
 * Return non-zero if the cluster's focus chain is flagged UNLINKED.
 */
int
hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
{
	return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
}

/*
 * Return a bref representative of the cluster.  Any data offset is removed
 * (since it would only be applicable to a particular chain in the cluster).
 *
 * However, the radix portion of data_off is used for many purposes and will
 * be retained.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
	*bref = cluster->focus->bref;
	/* keep only the radix portion of data_off */
	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
}

/*
 * Atomically set the specified chain flags on every chain present in the
 * cluster.  NULL array slots are skipped.
 */
void
hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			atomic_set_int(&chain->flags, flags);
	}
}

/*
 * Call hammer2_chain_setsubmod() for every chain present in the cluster,
 * propagating sub-modification state for each.  NULL array slots are
 * skipped.
 */
void
hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			hammer2_chain_setsubmod(trans, chain);
	}
}

/*
 * Create a cluster with one ref from the specified chain.  The chain
 * is not further referenced.  The caller typically supplies a locked
 * chain and transfers ownership to the cluster.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->array[0] = chain;
	cluster->nchains = 1;
	cluster->focus = chain;
	cluster->pmp = chain->pmp;
	cluster->refs = 1;

	return cluster;
}

/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref.  One chain is allocated per element of the PFS root's cluster so
 * the new cluster has the same shape (one chain per backing hmp).
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
	/* media byte count is encoded as a radix in the low bits */
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chains are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
		/* NOT REACHED - panic() does not return */
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
					    pmp, trans, bref);
		chain->hmp = rcluster->array[i]->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
		chain->delete_xid = HAMMER2_XID_MAX;

		/*
		 * Set modify_tid if a transaction is creating the inode.
		 * Enforce update_xlo = 0 so nearby transactions do not think
		 * it has been flushed when it hasn't.
		 *
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		if (trans) {
			chain->modify_xid = trans->sync_xid;
			chain->update_xlo = 0;
		}
		cluster->array[i] = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0];

	return (cluster);
}

/*
 * Associate an existing core with the chain or allocate a new core.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
241 * 242 * When chains are delete-duplicated during flushes we insert nchain on 243 * the ownerq after ochain instead of at the end in order to give the 244 * drop code visibility in the correct order, otherwise drops can be missed. 245 */ 246 void 247 hammer2_cluster_core_alloc(hammer2_trans_t *trans, 248 hammer2_cluster_t *ncluster, 249 hammer2_cluster_t *ocluster) 250 { 251 int i; 252 253 for (i = 0; i < ocluster->nchains; ++i) { 254 if (ncluster->array[i]) { 255 hammer2_chain_core_alloc(trans, 256 ncluster->array[i], 257 ocluster->array[i]); 258 } 259 } 260 } 261 262 /* 263 * Add a reference to a cluster. 264 * 265 * We must also ref the underlying chains in order to allow ref/unlock 266 * sequences to later re-lock. 267 */ 268 void 269 hammer2_cluster_ref(hammer2_cluster_t *cluster) 270 { 271 hammer2_chain_t *chain; 272 int i; 273 274 atomic_add_int(&cluster->refs, 1); 275 for (i = 0; i < cluster->nchains; ++i) { 276 chain = cluster->array[i]; 277 if (chain) 278 hammer2_chain_ref(chain); 279 } 280 } 281 282 /* 283 * Drop the caller's reference to the cluster. When the ref count drops to 284 * zero this function frees the cluster and drops all underlying chains. 285 */ 286 void 287 hammer2_cluster_drop(hammer2_cluster_t *cluster) 288 { 289 hammer2_chain_t *chain; 290 int i; 291 292 KKASSERT(cluster->refs > 0); 293 for (i = 0; i < cluster->nchains; ++i) { 294 chain = cluster->array[i]; 295 if (chain) { 296 hammer2_chain_drop(chain); 297 if (cluster->refs == 1) 298 cluster->array[i] = NULL; 299 } 300 } 301 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 302 cluster->focus = NULL; 303 kfree(cluster, M_HAMMER2); 304 /* cluster = NULL; safety */ 305 } 306 } 307 308 void 309 hammer2_cluster_wait(hammer2_cluster_t *cluster) 310 { 311 tsleep(cluster->focus, 0, "h2clcw", 1); 312 } 313 314 /* 315 * Lock and ref a cluster. This adds a ref to the cluster and its chains 316 * and then locks them. 
317 */ 318 int 319 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 320 { 321 hammer2_chain_t *chain; 322 int i; 323 int error; 324 325 error = 0; 326 atomic_add_int(&cluster->refs, 1); 327 for (i = 0; i < cluster->nchains; ++i) { 328 chain = cluster->array[i]; 329 if (chain) { 330 error = hammer2_chain_lock(chain, how); 331 if (error) { 332 while (--i >= 0) 333 hammer2_chain_unlock(cluster->array[i]); 334 atomic_add_int(&cluster->refs, -1); 335 break; 336 } 337 } 338 } 339 return error; 340 } 341 342 /* 343 * Replace the contents of dst with src, adding a reference to src's chains. 344 * dst is assumed to already have a ref and any chains present in dst are 345 * assumed to be locked and will be unlocked. 346 * 347 * If the chains in src are locked, only one of (src) or (dst) should be 348 * considered locked by the caller after return, not both. 349 */ 350 void 351 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src) 352 { 353 hammer2_chain_t *chain; 354 int i; 355 356 KKASSERT(dst->refs == 1); 357 dst->focus = NULL; 358 359 for (i = 0; i < src->nchains; ++i) { 360 chain = src->array[i]; 361 if (chain) { 362 hammer2_chain_ref(chain); 363 if (i < dst->nchains && dst->array[i]) 364 hammer2_chain_unlock(dst->array[i]); 365 dst->array[i] = chain; 366 if (dst->focus == NULL) 367 dst->focus = chain; 368 } 369 } 370 while (i < dst->nchains) { 371 chain = dst->array[i]; 372 if (chain) { 373 hammer2_chain_unlock(chain); 374 dst->array[i] = NULL; 375 } 376 ++i; 377 } 378 dst->nchains = src->nchains; 379 } 380 381 /* 382 * Replace the contents of the locked destination with the contents of the 383 * locked source. Destination must have one ref. 384 * 385 * Returns with the destination still with one ref and the copied chains 386 * with an additional lock (representing their state on the destination). 387 * The original chains associated with the destination are unlocked. 
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			/* extra lock represents the chain's state on dst */
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/* unlock and clear any leftover dst chains beyond src's count */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}

/*
 * Copy a cluster, returning a new cluster structure.
 *
 * By default (copy_flags == 0) the chains are copied to the new cluster,
 * a reference is added to each copied chain, and the new cluster is
 * returned with 1 ref.
 *
 * If HAMMER2_CLUSTER_COPY_NOREF is specified the chains are still copied
 * but no additional references are made and the cluster will have 0 refs.
 * Callers must ref the cluster and the chains before dropping it
 * (typically by locking it).
 *
 * If HAMMER2_CLUSTER_COPY_NOCHAINS is specified the copy is set up as if
 * it contained the chains (nchains/focus are copied) but the chain array
 * itself is left empty and no chain references are made.
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
{
	hammer2_pfsmount_t *pmp = ocluster->pmp;
	hammer2_cluster_t *ncluster;
	hammer2_chain_t *chain;
	int i;

	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
	ncluster->pmp = pmp;
	ncluster->nchains = ocluster->nchains;
	ncluster->focus = ocluster->focus;
	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
		for (i = 0; i < ocluster->nchains; ++i) {
			chain = ocluster->array[i];
			ncluster->array[i] = chain;
			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
			    chain) {
				hammer2_chain_ref(chain);
			}
		}
	}
	return (ncluster);
}

/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}

/*
 * Refactor the locked chains of a cluster.  The focus is re-selected as
 * the first non-NULL chain after the refactor.
 */
void
hammer2_cluster_refactor(hammer2_cluster_t *cluster)
{
	int i;

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			hammer2_chain_refactor(&cluster->array[i]);
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int nradix, int flags)
{
	int i;

	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
	KKASSERT(cparent->nchains == cluster->nchains);

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			KKASSERT(cparent->array[i]);
			hammer2_chain_resize(trans, ip,
					     cparent->array[i],
					     &cluster->array[i],
					     nradix, flags);
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in cluster is a localized copy of the cluster previously
 * acquired when the inode was locked (and possibly replaced in the mean
 * time), and must also be updated.  In fact, we update it first and then
 * synchronize the inode's cluster cache.
 *
 * Returns a writable pointer to the inode data of the cluster's focus.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification.  The focus is
 * re-selected as the first non-NULL chain after modification.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	int i;

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			hammer2_chain_modify(trans, &cluster->array[i], flags);
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Synchronize modifications with other chains in a cluster.
578 * 579 * Nominal front-end operations only edit non-block-table data in a single 580 * chain. This code copies such modifications to the other chains in the 581 * cluster. 582 */ 583 /* hammer2_cluster_modsync() */ 584 585 void 586 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 587 { 588 hammer2_chain_t *focus; 589 hammer2_chain_t *scan; 590 const hammer2_inode_data_t *ripdata; 591 hammer2_inode_data_t *wipdata; 592 int i; 593 594 focus = cluster->focus; 595 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 596 597 for (i = 0; i < cluster->nchains; ++i) { 598 scan = cluster->array[i]; 599 if (scan == NULL || scan == focus) 600 continue; 601 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 602 KKASSERT(focus->bytes == scan->bytes && 603 focus->bref.type == scan->bref.type); 604 switch(focus->bref.type) { 605 case HAMMER2_BREF_TYPE_INODE: 606 ripdata = &focus->data->ipdata; 607 wipdata = &scan->data->ipdata; 608 if ((ripdata->op_flags & 609 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 610 bcopy(ripdata, wipdata, 611 offsetof(hammer2_inode_data_t, u)); 612 break; 613 } 614 /* fall through */ 615 case HAMMER2_BREF_TYPE_DATA: 616 bcopy(focus->data, scan->data, focus->bytes); 617 break; 618 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 619 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 620 case HAMMER2_BREF_TYPE_FREEMAP: 621 case HAMMER2_BREF_TYPE_VOLUME: 622 panic("hammer2_cluster_modsync: illegal node type"); 623 /* NOT REACHED */ 624 break; 625 default: 626 panic("hammer2_cluster_modsync: unknown node type"); 627 break; 628 } 629 } 630 } 631 632 /* 633 * Lookup initialization/completion API 634 */ 635 hammer2_cluster_t * 636 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 637 { 638 hammer2_cluster_t *cluster; 639 int i; 640 641 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 642 cluster->pmp = cparent->pmp; /* can be NULL */ 643 /* cluster->focus = NULL; already null */ 644 645 for (i = 0; i < cparent->nchains; ++i) { 646 cluster->array[i] = 
cparent->array[i]; 647 if (cluster->focus == NULL) 648 cluster->focus = cluster->array[i]; 649 } 650 cluster->nchains = cparent->nchains; 651 652 /* 653 * Independently lock (this will also give cluster 1 ref) 654 */ 655 if (flags & HAMMER2_LOOKUP_SHARED) { 656 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 657 HAMMER2_RESOLVE_SHARED); 658 } else { 659 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 660 } 661 return (cluster); 662 } 663 664 void 665 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 666 { 667 if (cparent) 668 hammer2_cluster_unlock(cparent); 669 } 670 671 /* 672 * Locate first match or overlap under parent, return a new cluster 673 */ 674 hammer2_cluster_t * 675 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 676 hammer2_key_t key_beg, hammer2_key_t key_end, 677 int flags, int *ddflagp) 678 { 679 hammer2_pfsmount_t *pmp; 680 hammer2_cluster_t *cluster; 681 hammer2_chain_t *chain; 682 hammer2_key_t key_accum; 683 hammer2_key_t key_next; 684 hammer2_key_t bref_key; 685 int bref_keybits; 686 int null_count; 687 int ddflag; 688 int i; 689 uint8_t bref_type; 690 u_int bytes; 691 692 pmp = cparent->pmp; /* can be NULL */ 693 key_accum = *key_nextp; 694 null_count = 0; 695 bref_type = 0; 696 bref_key = 0; 697 bref_keybits = 0; 698 bytes = 0; 699 700 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 701 cluster->pmp = pmp; /* can be NULL */ 702 cluster->refs = 1; 703 /* cluster->focus = NULL; already null */ 704 cparent->focus = NULL; 705 *ddflagp = 0; 706 707 for (i = 0; i < cparent->nchains; ++i) { 708 key_next = *key_nextp; 709 if (cparent->array[i] == NULL) { 710 ++null_count; 711 continue; 712 } 713 chain = hammer2_chain_lookup(&cparent->array[i], &key_next, 714 key_beg, key_end, 715 &cparent->cache_index[i], 716 flags, &ddflag); 717 if (cparent->focus == NULL) 718 cparent->focus = cparent->array[i]; 719 cluster->array[i] = chain; 720 if (chain == NULL) { 721 ++null_count; 722 } else { 
723 if (cluster->focus == NULL) { 724 bref_type = chain->bref.type; 725 bref_key = chain->bref.key; 726 bref_keybits = chain->bref.keybits; 727 bytes = chain->bytes; 728 *ddflagp = ddflag; 729 cluster->focus = chain; 730 } 731 KKASSERT(bref_type == chain->bref.type); 732 KKASSERT(bref_key == chain->bref.key); 733 KKASSERT(bref_keybits == chain->bref.keybits); 734 KKASSERT(bytes == chain->bytes); 735 KKASSERT(*ddflagp == ddflag); 736 } 737 if (key_accum > key_next) 738 key_accum = key_next; 739 } 740 *key_nextp = key_accum; 741 cluster->nchains = i; 742 743 if (null_count == i) { 744 hammer2_cluster_drop(cluster); 745 cluster = NULL; 746 } 747 748 return (cluster); 749 } 750 751 /* 752 * Locate next match or overlap under parent, replace cluster 753 */ 754 hammer2_cluster_t * 755 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 756 hammer2_key_t *key_nextp, 757 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 758 { 759 hammer2_chain_t *chain; 760 hammer2_key_t key_accum; 761 hammer2_key_t key_next; 762 int null_count; 763 int i; 764 765 key_accum = *key_nextp; 766 null_count = 0; 767 cluster->focus = NULL; 768 cparent->focus = NULL; 769 770 for (i = 0; i < cparent->nchains; ++i) { 771 key_next = *key_nextp; 772 chain = cluster->array[i]; 773 if (chain == NULL) { 774 if (cparent->focus == NULL) 775 cparent->focus = cparent->array[i]; 776 ++null_count; 777 continue; 778 } 779 if (cparent->array[i] == NULL) { 780 if (flags & HAMMER2_LOOKUP_NOLOCK) 781 hammer2_chain_drop(chain); 782 else 783 hammer2_chain_unlock(chain); 784 ++null_count; 785 continue; 786 } 787 chain = hammer2_chain_next(&cparent->array[i], chain, 788 &key_next, key_beg, key_end, 789 &cparent->cache_index[i], flags); 790 if (cparent->focus == NULL) 791 cparent->focus = cparent->array[i]; 792 cluster->array[i] = chain; 793 if (chain == NULL) { 794 ++null_count; 795 } else if (cluster->focus == NULL) { 796 cluster->focus = chain; 797 } 798 if (key_accum > key_next) 799 
key_accum = key_next; 800 } 801 802 if (null_count == i) { 803 hammer2_cluster_drop(cluster); 804 cluster = NULL; 805 } 806 return(cluster); 807 } 808 809 #if 0 810 /* 811 * XXX initial NULL cluster needs reworking (pass **clusterp ?) 812 * 813 * The raw scan function is similar to lookup/next but does not seek to a key. 814 * Blockrefs are iterated via first_chain = (parent, NULL) and 815 * next_chain = (parent, chain). 816 * 817 * The passed-in parent must be locked and its data resolved. The returned 818 * chain will be locked. Pass chain == NULL to acquire the first sub-chain 819 * under parent and then iterate with the passed-in chain (which this 820 * function will unlock). 821 */ 822 hammer2_cluster_t * 823 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 824 int flags) 825 { 826 hammer2_chain_t *chain; 827 int null_count; 828 int i; 829 830 null_count = 0; 831 832 for (i = 0; i < cparent->nchains; ++i) { 833 chain = cluster->array[i]; 834 if (chain == NULL) { 835 ++null_count; 836 continue; 837 } 838 if (cparent->array[i] == NULL) { 839 if (flags & HAMMER2_LOOKUP_NOLOCK) 840 hammer2_chain_drop(chain); 841 else 842 hammer2_chain_unlock(chain); 843 ++null_count; 844 continue; 845 } 846 847 chain = hammer2_chain_scan(cparent->array[i], chain, 848 &cparent->cache_index[i], flags); 849 cluster->array[i] = chain; 850 if (chain == NULL) 851 ++null_count; 852 } 853 854 if (null_count == i) { 855 hammer2_cluster_drop(cluster); 856 cluster = NULL; 857 } 858 return(cluster); 859 } 860 861 #endif 862 863 /* 864 * Create a new cluster using the specified key 865 */ 866 int 867 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 868 hammer2_cluster_t **clusterp, 869 hammer2_key_t key, int keybits, int type, size_t bytes) 870 { 871 hammer2_cluster_t *cluster; 872 hammer2_pfsmount_t *pmp; 873 int error; 874 int i; 875 876 pmp = trans->pmp; /* can be NULL */ 877 878 if ((cluster = *clusterp) == NULL) { 879 cluster = 
kmalloc(sizeof(*cluster), M_HAMMER2, 880 M_WAITOK | M_ZERO); 881 cluster->pmp = pmp; /* can be NULL */ 882 cluster->refs = 1; 883 } 884 cluster->focus = NULL; 885 cparent->focus = NULL; 886 887 /* 888 * NOTE: cluster->array[] entries can initially be NULL. If 889 * *clusterp is supplied, skip NULL entries, otherwise 890 * create new chains. 891 */ 892 for (i = 0; i < cparent->nchains; ++i) { 893 if (*clusterp && cluster->array[i] == NULL) { 894 if (cparent->focus == NULL) 895 cparent->focus = cparent->array[i]; 896 continue; 897 } 898 error = hammer2_chain_create(trans, &cparent->array[i], 899 &cluster->array[i], pmp, 900 key, keybits, type, bytes); 901 KKASSERT(error == 0); 902 if (cparent->focus == NULL) 903 cparent->focus = cparent->array[i]; 904 if (cluster->focus == NULL) 905 cluster->focus = cluster->array[i]; 906 } 907 cluster->nchains = i; 908 *clusterp = cluster; 909 910 return error; 911 } 912 913 /* 914 * Duplicate a cluster under a new parent. 915 * 916 * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields 917 * are used from a passed-in non-NULL bref pointer. All other fields 918 * are extracted from the original chain for each chain in the 919 * iteration. 
 */
void
hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
			  hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
			  int snapshot, int duplicate_reason)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			if (bref) {
				/*
				 * Only key/keybits are taken from the
				 * caller-supplied bref; everything else
				 * comes from the chain being duplicated.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_duplicate(trans,
							&cparent->array[i],
							&chain, &xbref,
							snapshot,
							duplicate_reason);
			} else {
				hammer2_chain_duplicate(trans,
							&cparent->array[i],
							&chain, NULL,
							snapshot,
							duplicate_reason);
			}
			cluster->array[i] = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		} else {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		}
	}
}

/*
 * Delete-duplicate a cluster in-place.
 */
void
hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
				 hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	int i;

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_delete_duplicate(trans, &chain, flags);
			cluster->array[i] = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
		}
	}
}

/*
 * Mark a cluster deleted.  Every non-NULL chain in the cluster is
 * individually deleted.
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			hammer2_chain_delete(trans, chain, flags);
	}
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ipdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	/*
	 * NOTE(review): lhc is computed but not used below;
	 * hammer2_inode_create() presumably hashes the name itself —
	 * confirm before removing.
	 */
	lhc = hammer2_dirhash(pfs->name, name_len);

	ipdata = &hammer2_cluster_data(ocluster)->ipdata;
	opfs_clid = ipdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);
		hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
		hammer2_cluster_modsync(ncluster);
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}

/************************************************************************
 *			    NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
1087 * 1088 * When a read failure or an unexpected chain state is found, the chain and 1089 * parent chain at the failure point for the nodes involved (the nodes 1090 * which we determine to be in error) are flagged as failed and removed 1091 * from the cluster. The node itself is allowed to remain active. The 1092 * highest common point (usually a parent chain) is queued to the 1093 * resynchronization thread for action. 1094 * 1095 * When a write I/O fails or a node runs out of space, we first adjust 1096 * as if a read failure occurs but we further disable flushes on the 1097 * ENTIRE node. Concurrent modifying transactions are allowed to complete 1098 * but any new modifying transactions will automatically remove the node 1099 * from consideration in all related cluster structures and not generate 1100 * any new modified chains. The ROOT chain for the failed node(s) is queued 1101 * to the resynchronization thread for action. 1102 * 1103 * A temporary disconnect is handled as if a write failure occurred. 1104 * 1105 * Any of these failures might or might not stall related high level VNOPS, 1106 * depending on what has failed, what nodes remain, the type of cluster, 1107 * and the operating state of the cluster. 1108 * 1109 * FLUSH ON WRITE-DISABLED NODES 1110 * 1111 * A flush on a write-disabled node is not allowed to write anything because 1112 * we cannot safely update the mirror_tid anywhere on the failed node. The 1113 * synchronization thread uses mirror_tid to calculate incremental resyncs. 1114 * Dirty meta-data related to the failed node is thrown away. 1115 * 1116 * Dirty buffer cache buffers and inodes are only thrown away if they can be 1117 * retired... that is, if the filesystem still has enough nodes to complete 1118 * the operation. 
 */

/************************************************************************
 *			SYNCHRONIZATION THREAD				*
 ************************************************************************
 *
 * This thread is responsible for [re]synchronizing the cluster representing
 * a PFS.  Any out-of-sync or failed node starts this thread on a
 * node-by-node basis when the failure is detected.
 *
 * Clusters needing resynchronization are queued at the highest point
 * where the parent on the failed node is still valid, or a special
 * incremental scan from the ROOT is queued if no parent exists.  This
 * thread is also responsible for waiting for reconnections of the failed
 * node if the cause was due to a disconnect, and waiting for space to be
 * freed up if the cause was due to running out of space.
 *
 * If the cause is due to a node running out of space, this thread will also
 * remove older (unlocked) snapshots to make new space, recover space, and
 * then start resynchronization.
 *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
 *
 * Since these are per-node threads it is possible to resynchronize several
 * nodes at once.
 */