1 /* 2 * Copyright (c) 2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module implements the cluster synchronizer. Basically the way 36 * it works is that a thread is created for each cluster node in a PFS. 
 * This thread is responsible for synchronizing the current node using
 * data from other nodes.
 *
 * Any out of sync master or slave can get back into synchronization as
 * long as a quorum of masters agree on the update_tid.  If a quorum is
 * not available it may still be possible to synchronize to the highest
 * available update_tid as a way of trying to catch up as much as possible
 * until a quorum is available.
 *
 * If no quorum is possible (which can happen even if all masters are
 * available, if the update_tid does not match), then manual intervention
 * may be required to resolve discrepancies.
 */
#include "hammer2.h"

/*
 * Single-linked record of an inode whose synchronization had to be
 * deferred (bottom-up retry after its children are synchronized).
 * Holds a reference on ip until the deferral is processed or discarded.
 */
typedef struct hammer2_deferred_ip {
        struct hammer2_deferred_ip *next;
        hammer2_inode_t *ip;
} hammer2_deferred_ip_t;

/*
 * LIFO of deferred inodes plus a count used to bound the list size
 * (see the 1000-entry limit in hammer2_sync_slaves()).
 */
typedef struct hammer2_deferred_list {
        hammer2_deferred_ip_t   *base;
        int                     count;
} hammer2_deferred_list_t;


#define HAMMER2_SYNCHRO_DEBUG 1

static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
                                hammer2_deferred_list_t *list, int isroot);
#if 0
/*
 * NOTE(review): dead code under #if 0 -- the prototype below plus a stray
 * statement fragment left over from an earlier revision; candidate for
 * removal.
 */
static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
                nerror = hammer2_sync_insert(
                                thr, &parent, &chain,
                                focus->bref.modify_tid,
                                idx, focus);
#endif
static int hammer2_sync_insert(hammer2_thread_t *thr,
                        hammer2_chain_t **parentp, hammer2_chain_t **chainp,
                        hammer2_tid_t modify_tid, int idx,
                        hammer2_chain_t *focus);
static int hammer2_sync_destroy(hammer2_thread_t *thr,
                        hammer2_chain_t **parentp, hammer2_chain_t **chainp,
                        hammer2_tid_t mtid, int idx);
static int hammer2_sync_replace(hammer2_thread_t *thr,
                        hammer2_chain_t *parent, hammer2_chain_t *chain,
                        hammer2_tid_t mtid, int idx,
                        hammer2_chain_t *focus, int isroot);

/****************************************************************************
 *                          HAMMER2 SYNC THREADS                            *
 ****************************************************************************/
/*
 * Primary management thread for an element of a node.  A thread will exist
 * for each element requiring management.
 *
 * No management threads are needed for the SPMP or for any PMP with only
 * a single MASTER.
 *
 * On the SPMP - handles bulkfree and dedup operations
 * On a PFS    - handles remastering and synchronization
 */
void
hammer2_primary_sync_thread(void *arg)
{
        hammer2_thread_t *thr = arg;
        hammer2_pfs_t *pmp;
        hammer2_deferred_list_t list;   /* LIFO of deferred inodes */
        hammer2_deferred_ip_t *defer;
        int error;

        pmp = thr->pmp;
        bzero(&list, sizeof(list));

        /* thr->lk is held across the entire loop except while sleeping */
        lockmgr(&thr->lk, LK_EXCLUSIVE);
        while ((thr->flags & HAMMER2_THREAD_STOP) == 0) {
                /*
                 * Handle freeze request
                 */
                if (thr->flags & HAMMER2_THREAD_FREEZE) {
                        atomic_set_int(&thr->flags, HAMMER2_THREAD_FROZEN);
                        atomic_clear_int(&thr->flags, HAMMER2_THREAD_FREEZE);
                }

                /*
                 * Force idle if frozen until unfrozen or stopped.
                 */
                if (thr->flags & HAMMER2_THREAD_FROZEN) {
                        lksleep(thr->xopq, &thr->lk, 0, "frozen", 0);
                        continue;
                }

                /*
                 * Reset state on REMASTER request
                 */
                if (thr->flags & HAMMER2_THREAD_REMASTER) {
                        atomic_clear_int(&thr->flags, HAMMER2_THREAD_REMASTER);
                        /* reset state */
                }

                /*
                 * Synchronization scan.
                 */
                if (hammer2_debug & 0x8000)
                        kprintf("sync_slaves pfs %s clindex %d\n",
                                pmp->pfs_names[thr->clindex], thr->clindex);
                hammer2_trans_init(pmp, 0);

                hammer2_inode_ref(pmp->iroot);

                for (;;) {
                        int didbreak = 0;
                        /* XXX lock synchronize pmp->modify_tid */
                        error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
                        if (hammer2_debug & 0x8000) {
                                kprintf("sync_slaves error %d defer %p\n",
                                        error, list.base);
                        }
                        /* EAGAIN means deferrals were queued; process them */
                        if (error != EAGAIN)
                                break;
                        while ((defer = list.base) != NULL) {
                                hammer2_inode_t *nip;

                                nip = defer->ip;
                                error = hammer2_sync_slaves(thr, nip, &list, 0);
                                if (error && error != EAGAIN && error != ENOENT)
                                        break;
                                if (hammer2_thr_break(thr)) {
                                        didbreak = 1;
                                        break;
                                }

                                /*
                                 * If no additional defers occurred we can
                                 * remove this one, otherwise keep it on
                                 * the list and retry once the additional
                                 * defers have completed.
                                 */
                                if (defer == list.base) {
                                        --list.count;
                                        list.base = defer->next;
                                        kfree(defer, M_HAMMER2);
                                        defer = NULL;   /* safety */
                                        hammer2_inode_drop(nip);
                                }
                        }

                        /*
                         * If the thread is being remastered, frozen, or
                         * stopped, clean up any left-over deferals.
                         * Each deferral holds an inode ref which must be
                         * dropped here.
                         */
                        if (didbreak || (error && error != EAGAIN)) {
                                kprintf("didbreak\n");
                                while ((defer = list.base) != NULL) {
                                        --list.count;
                                        hammer2_inode_drop(defer->ip);
                                        list.base = defer->next;
                                        kfree(defer, M_HAMMER2);
                                }
                                if (error == 0 || error == EAGAIN)
                                        error = EINPROGRESS;
                                break;
                        }
                }

                hammer2_inode_drop(pmp->iroot);
                hammer2_trans_done(pmp);

                if (error && error != EINPROGRESS)
                        kprintf("hammer2_sync_slaves: error %d\n", error);

                /*
                 * Wait for event, or 5-second poll.
                 */
                lksleep(thr->xopq, &thr->lk, 0, "h2idle", hz * 5);
        }
        thr->td = NULL;
        wakeup(thr);
        lockmgr(&thr->lk, LK_RELEASE);
        /* thr structure can go invalid after this point */
}

#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
        hammer2_pfs_t *pmp = thr->pmp;

        flags &= HAMMER2_CLUSTER_ZFLAGS;
        if (pmp->cluster_flags == flags)
                return;
        pmp->cluster_flags = flags;

        kprintf("pfs %p", pmp);
        if (flags & HAMMER2_CLUSTER_MSYNCED)
                kprintf(" masters-all-good");
        if (flags & HAMMER2_CLUSTER_SSYNCED)
                kprintf(" slaves-all-good");

        if (flags & HAMMER2_CLUSTER_WRHARD)
                kprintf(" quorum/rw");
        else if (flags & HAMMER2_CLUSTER_RDHARD)
                kprintf(" quorum/ro");

        if (flags & HAMMER2_CLUSTER_UNHARD)
                kprintf(" out-of-sync-masters");
        else if (flags & HAMMER2_CLUSTER_NOHARD)
                kprintf(" no-masters-visible");

        if (flags & HAMMER2_CLUSTER_WRSOFT)
                kprintf(" soft/rw");
        else if (flags & HAMMER2_CLUSTER_RDSOFT)
                kprintf(" soft/ro");

        if (flags & HAMMER2_CLUSTER_UNSOFT)
                kprintf(" out-of-sync-slaves");
        else if (flags & HAMMER2_CLUSTER_NOSOFT)
                kprintf(" no-slaves-visible");
        kprintf("\n");
}
#endif

#if 0
/*
 * Debug helper: dump parent/child cluster arrays side by side when
 * (hammer2_debug & 1) is set.
 */
static
void
dumpcluster(const char *label,
            hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        if ((hammer2_debug & 1) == 0)
                return;

        kprintf("%s\t", label);
        KKASSERT(cparent->nchains == cluster->nchains);
        for (i = 0; i < cparent->nchains; ++i) {
                if (i)
                        kprintf("\t");
                kprintf("%d ", i);
                if ((chain = cparent->array[i].chain) != NULL) {
                        kprintf("%016jx%s ",
                                chain->bref.key,
                                ((cparent->array[i].flags &
                                  HAMMER2_CITEM_INVALID) ?
"(I)" : " ") 287 ); 288 } else { 289 kprintf(" NULL %s ", " "); 290 } 291 if ((chain = cluster->array[i].chain) != NULL) { 292 kprintf("%016jx%s ", 293 chain->bref.key, 294 ((cluster->array[i].flags & 295 HAMMER2_CITEM_INVALID) ? "(I)" : " ") 296 ); 297 } else { 298 kprintf(" NULL %s ", " "); 299 } 300 kprintf("\n"); 301 } 302 } 303 #endif 304 305 /* 306 * Each out of sync node sync-thread must issue an all-nodes XOP scan of 307 * the inode. This creates a multiplication effect since the XOP scan itself 308 * issues to all nodes. However, this is the only way we can safely 309 * synchronize nodes which might have disparate I/O bandwidths and the only 310 * way we can safely deal with stalled nodes. 311 */ 312 static 313 int 314 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 315 hammer2_deferred_list_t *list, int isroot) 316 { 317 hammer2_xop_scanall_t *xop; 318 hammer2_chain_t *parent; 319 hammer2_chain_t *chain; 320 hammer2_pfs_t *pmp; 321 hammer2_key_t key_next; 322 hammer2_tid_t sync_tid; 323 int cache_index = -1; 324 int needrescan; 325 int want_update; 326 int error; 327 int nerror; 328 int idx; 329 int n; 330 331 pmp = ip->pmp; 332 idx = thr->clindex; /* cluster node we are responsible for */ 333 needrescan = 0; 334 want_update = 0; 335 sync_tid = 0; 336 chain = NULL; 337 parent = NULL; 338 339 #if 0 340 /* 341 * Nothing to do if all slaves are synchronized. 342 * Nothing to do if cluster not authoritatively readable. 343 */ 344 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED) 345 return(0); 346 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0) 347 return(HAMMER2_ERROR_INCOMPLETE); 348 #endif 349 350 error = 0; 351 352 /* 353 * Resolve the root inode of the PFS and determine if synchronization 354 * is needed by checking modify_tid. 
355 */ 356 { 357 hammer2_xop_ipcluster_t *xop2; 358 hammer2_chain_t *focus; 359 360 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 361 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 362 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 363 idx); 364 hammer2_inode_unlock(ip); 365 error = hammer2_xop_collect(&xop2->head, 0); 366 if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) { 367 sync_tid = focus->bref.modify_tid; /* XXX */ 368 chain = hammer2_inode_chain_and_parent(ip, idx, 369 &parent, 370 HAMMER2_RESOLVE_ALWAYS | 371 HAMMER2_RESOLVE_SHARED); 372 want_update = (chain->bref.modify_tid != sync_tid); 373 if (chain) { 374 hammer2_chain_unlock(chain); 375 hammer2_chain_drop(chain); 376 chain = NULL; 377 } 378 if (parent) { 379 hammer2_chain_unlock(parent); 380 hammer2_chain_drop(parent); 381 parent = NULL; 382 } 383 } 384 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 385 } 386 387 if (want_update == 0) 388 return(0); 389 390 /* 391 * The inode is left unlocked during the scan. Issue a XOP 392 * that does *not* include our cluster index to iterate 393 * properly synchronized elements and resolve our cluster index 394 * against it. 
395 */ 396 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 397 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 398 xop->key_beg = HAMMER2_KEY_MIN; 399 xop->key_end = HAMMER2_KEY_MAX; 400 xop->resolve_flags = HAMMER2_RESOLVE_SHARED | 401 HAMMER2_RESOLVE_ALWAYS; 402 xop->lookup_flags = HAMMER2_LOOKUP_SHARED | 403 HAMMER2_LOOKUP_NODIRECT | 404 HAMMER2_LOOKUP_ALWAYS; 405 hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx); 406 parent = hammer2_inode_chain(ip, idx, 407 HAMMER2_RESOLVE_ALWAYS | 408 HAMMER2_RESOLVE_SHARED); 409 hammer2_inode_unlock(ip); 410 411 chain = hammer2_chain_lookup(&parent, &key_next, 412 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 413 &cache_index, 414 HAMMER2_LOOKUP_SHARED | 415 HAMMER2_LOOKUP_NODIRECT | 416 HAMMER2_LOOKUP_NODATA); 417 error = hammer2_xop_collect(&xop->head, 0); 418 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n", 419 ip->meta.name_key, chain, 420 (chain ? chain->bref.key : -1)); 421 422 for (;;) { 423 /* 424 * We are done if our scan is done and the XOP scan is done. 425 * We are done if the XOP scan failed (that is, we don't 426 * have authoritative data to synchronize with). 427 */ 428 int advance_local = 0; 429 int advance_xop = 0; 430 int dodefer = 0; 431 hammer2_chain_t *focus; 432 433 if (chain == NULL && error == ENOENT) 434 break; 435 if (error && error != ENOENT) 436 break; 437 438 /* 439 * Compare 440 */ 441 if (chain && error == ENOENT) { 442 /* 443 * If we have local chains but the XOP scan is done, 444 * the chains need to be deleted. 445 */ 446 n = -1; 447 focus = NULL; 448 } else if (chain == NULL) { 449 /* 450 * If our local scan is done but the XOP scan is not, 451 * we need to create the missing chain(s). 452 */ 453 n = 1; 454 focus = xop->head.cluster.focus; 455 } else { 456 /* 457 * Otherwise compare to determine the action 458 * needed. 459 */ 460 focus = xop->head.cluster.focus; 461 n = hammer2_chain_cmp(chain, focus); 462 } 463 464 /* 465 * Take action based on comparison results. 
466 */ 467 if (n < 0) { 468 /* 469 * Delete extranious local data. This will 470 * automatically advance the chain. 471 */ 472 nerror = hammer2_sync_destroy(thr, &parent, &chain, 473 0, idx); 474 } else if (n == 0 && chain->bref.modify_tid != 475 focus->bref.modify_tid) { 476 /* 477 * Matching key but local data or meta-data requires 478 * updating. If we will recurse, we still need to 479 * update to compatible content first but we do not 480 * synchronize modify_tid until the entire recursion 481 * has completed successfully. 482 * 483 * NOTE: Do not try to access hardlink pointers as if 484 * they were normal inodes, the inode cache will 485 * get seriously confused. 486 */ 487 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE && 488 focus->data->ipdata.meta.type != 489 HAMMER2_OBJTYPE_HARDLINK) { 490 nerror = hammer2_sync_replace( 491 thr, parent, chain, 492 0, 493 idx, focus, 0); 494 dodefer = 1; 495 } else { 496 nerror = hammer2_sync_replace( 497 thr, parent, chain, 498 focus->bref.modify_tid, 499 idx, focus, 0); 500 } 501 advance_local = 1; 502 advance_xop = 1; 503 } else if (n == 0) { 504 /* 505 * 100% match, advance both 506 */ 507 advance_local = 1; 508 advance_xop = 1; 509 nerror = 0; 510 } else if (n > 0) { 511 /* 512 * Insert missing local data. 513 * 514 * If we will recurse, we still need to update to 515 * compatible content first but we do not synchronize 516 * modify_tid until the entire recursion has 517 * completed successfully. 518 * 519 * NOTE: Do not try to access hardlink pointers as if 520 * they were normal inodes, the inode cache will 521 * get seriously confused. 
522 */ 523 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE && 524 focus->data->ipdata.meta.type != 525 HAMMER2_OBJTYPE_HARDLINK) { 526 nerror = hammer2_sync_insert( 527 thr, &parent, &chain, 528 0, 529 idx, focus); 530 dodefer = 2; 531 } else { 532 nerror = hammer2_sync_insert( 533 thr, &parent, &chain, 534 focus->bref.modify_tid, 535 idx, focus); 536 } 537 advance_local = 1; 538 advance_xop = 1; 539 } 540 541 /* 542 * We cannot recurse depth-first because the XOP is still 543 * running in node threads for this scan. Create a placemarker 544 * by obtaining and record the hammer2_inode. 545 * 546 * We excluded our node from the XOP so we must temporarily 547 * add it to xop->head.cluster so it is properly incorporated 548 * into the inode. 549 * 550 * The deferral is pushed onto a LIFO list for bottom-up 551 * synchronization. 552 */ 553 if (error == 0 && dodefer) { 554 hammer2_inode_t *nip; 555 hammer2_deferred_ip_t *defer; 556 557 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE); 558 559 defer = kmalloc(sizeof(*defer), M_HAMMER2, 560 M_WAITOK | M_ZERO); 561 KKASSERT(xop->head.cluster.array[idx].chain == NULL); 562 xop->head.cluster.array[idx].flags = 563 HAMMER2_CITEM_INVALID; 564 xop->head.cluster.array[idx].chain = chain; 565 nip = hammer2_inode_get(pmp, ip, 566 &xop->head.cluster, idx); 567 xop->head.cluster.array[idx].chain = NULL; 568 569 hammer2_inode_ref(nip); 570 hammer2_inode_unlock(nip); 571 572 defer->next = list->base; 573 defer->ip = nip; 574 list->base = defer; 575 ++list->count; 576 needrescan = 1; 577 } 578 579 /* 580 * If at least one deferral was added and the deferral 581 * list has grown too large, stop adding more. This 582 * will trigger an EAGAIN return. 583 */ 584 if (needrescan && list->count > 1000) 585 break; 586 587 /* 588 * Advancements for iteration. 
589 */ 590 if (advance_xop) { 591 error = hammer2_xop_collect(&xop->head, 0); 592 } 593 if (advance_local) { 594 chain = hammer2_chain_next(&parent, chain, &key_next, 595 key_next, HAMMER2_KEY_MAX, 596 &cache_index, 597 HAMMER2_LOOKUP_SHARED | 598 HAMMER2_LOOKUP_NODIRECT | 599 HAMMER2_LOOKUP_NODATA); 600 } 601 } 602 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 603 if (chain) { 604 hammer2_chain_unlock(chain); 605 hammer2_chain_drop(chain); 606 } 607 if (parent) { 608 hammer2_chain_unlock(parent); 609 hammer2_chain_drop(parent); 610 } 611 612 /* 613 * If we added deferrals we want the caller to synchronize them 614 * and then call us again. 615 * 616 * NOTE: In this situation we do not yet want to synchronize our 617 * inode, setting the error code also has that effect. 618 */ 619 if ((error == 0 || error == ENOENT) && needrescan) 620 error = EAGAIN; 621 622 /* 623 * If no error occurred we can synchronize the inode meta-data 624 * and modify_tid. Only limited changes are made to PFSROOTs. 625 * 626 * XXX inode lock was lost 627 */ 628 if (error == 0 || error == ENOENT) { 629 hammer2_xop_ipcluster_t *xop2; 630 hammer2_chain_t *focus; 631 632 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 633 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 634 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 635 idx); 636 hammer2_inode_unlock(ip); 637 error = hammer2_xop_collect(&xop2->head, 0); 638 if (error == 0) { 639 focus = xop2->head.cluster.focus; 640 kprintf("syncthr: update inode %p (%s)\n", 641 focus, 642 (focus ? 
643 (char *)focus->data->ipdata.filename : "?")); 644 chain = hammer2_inode_chain_and_parent(ip, idx, 645 &parent, 646 HAMMER2_RESOLVE_ALWAYS | 647 HAMMER2_RESOLVE_SHARED); 648 649 KKASSERT(parent != NULL); 650 nerror = hammer2_sync_replace( 651 thr, parent, chain, 652 sync_tid, 653 idx, focus, isroot); 654 hammer2_chain_unlock(chain); 655 hammer2_chain_drop(chain); 656 hammer2_chain_unlock(parent); 657 hammer2_chain_drop(parent); 658 /* XXX */ 659 } 660 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 661 } 662 663 return error; 664 } 665 666 /* 667 * Create a missing chain by copying the focus from another device. 668 * 669 * On entry *parentp and focus are both locked shared. The chain will be 670 * created and returned in *chainp also locked shared. 671 */ 672 static 673 int 674 hammer2_sync_insert(hammer2_thread_t *thr, 675 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 676 hammer2_tid_t mtid, int idx, hammer2_chain_t *focus) 677 { 678 hammer2_chain_t *chain; 679 hammer2_key_t dummy; 680 int cache_index = -1; 681 682 #if HAMMER2_SYNCHRO_DEBUG 683 if (hammer2_debug & 1) 684 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n", 685 *parentp, 686 (*parentp)->bref.type, 687 (*parentp)->bref.key, 688 idx, 689 focus->bref.type, focus->bref.key, mtid); 690 #endif 691 692 /* 693 * Parent requires an exclusive lock for the insertion. 694 * We must unlock the child to avoid deadlocks while 695 * relocking the parent. 696 */ 697 if (*chainp) { 698 hammer2_chain_unlock(*chainp); 699 hammer2_chain_drop(*chainp); 700 *chainp = NULL; 701 } 702 hammer2_chain_unlock(*parentp); 703 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 704 705 /* 706 * We must reissue the lookup to properly position (*parentp) 707 * for the insertion. 
708 */ 709 chain = hammer2_chain_lookup(parentp, &dummy, 710 focus->bref.key, focus->bref.key, 711 &cache_index, 712 HAMMER2_LOOKUP_NODIRECT | 713 HAMMER2_LOOKUP_ALWAYS); 714 KKASSERT(chain == NULL); 715 716 chain = NULL; 717 hammer2_chain_create(parentp, &chain, thr->pmp, 718 focus->bref.key, focus->bref.keybits, 719 focus->bref.type, focus->bytes, 720 mtid, 0, 0); 721 hammer2_chain_modify(chain, mtid, 0, 0); 722 723 /* 724 * Copy focus to new chain 725 */ 726 727 /* type already set */ 728 chain->bref.methods = focus->bref.methods; 729 /* keybits already set */ 730 chain->bref.vradix = focus->bref.vradix; 731 /* mirror_tid set by flush */ 732 KKASSERT(chain->bref.modify_tid == mtid); 733 chain->bref.flags = focus->bref.flags; 734 /* key already present */ 735 /* check code will be recalculated */ 736 737 /* 738 * Copy data body. 739 */ 740 switch(chain->bref.type) { 741 case HAMMER2_BREF_TYPE_INODE: 742 if ((focus->data->ipdata.meta.op_flags & 743 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 744 /* do not copy block table */ 745 bcopy(focus->data, chain->data, 746 offsetof(hammer2_inode_data_t, u)); 747 break; 748 } 749 /* fall through copy whole thing */ 750 case HAMMER2_BREF_TYPE_DATA: 751 bcopy(focus->data, chain->data, chain->bytes); 752 hammer2_chain_setcheck(chain, chain->data); 753 break; 754 default: 755 KKASSERT(0); 756 break; 757 } 758 759 hammer2_chain_unlock(chain); /* unlock, leave ref */ 760 *chainp = chain; /* will be returned locked */ 761 762 /* 763 * Avoid ordering deadlock when relocking shared. 764 */ 765 hammer2_chain_unlock(*parentp); 766 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 767 HAMMER2_RESOLVE_ALWAYS); 768 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 769 HAMMER2_RESOLVE_ALWAYS); 770 771 return 0; 772 } 773 774 /* 775 * Destroy an extranious chain. 776 * 777 * Both *parentp and *chainp are locked shared. 778 * 779 * On return, *chainp will be adjusted to point to the next element in the 780 * iteration and locked shared. 
781 */ 782 static 783 int 784 hammer2_sync_destroy(hammer2_thread_t *thr, 785 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 786 hammer2_tid_t mtid, int idx) 787 { 788 hammer2_chain_t *chain; 789 hammer2_chain_t *parent; 790 hammer2_key_t key_next; 791 hammer2_key_t save_key; 792 int cache_index = -1; 793 794 chain = *chainp; 795 796 #if HAMMER2_SYNCHRO_DEBUG 797 if (hammer2_debug & 1) 798 kprintf("destroy rec %p/%p slave %d %d.%016jx\n", 799 *parentp, chain, 800 idx, chain->bref.type, chain->bref.key); 801 #endif 802 803 save_key = chain->bref.key; 804 if (save_key != HAMMER2_KEY_MAX) 805 ++save_key; 806 807 /* 808 * Try to avoid unnecessary I/O. 809 * 810 * XXX accounting not propagated up properly. We might have to do 811 * a RESOLVE_MAYBE here and pass 0 for the flags. 812 */ 813 hammer2_chain_unlock(chain); /* relock exclusive */ 814 hammer2_chain_unlock(*parentp); 815 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 816 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 817 818 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT); 819 hammer2_chain_unlock(chain); 820 hammer2_chain_drop(chain); 821 chain = NULL; /* safety */ 822 823 hammer2_chain_unlock(*parentp); /* relock shared */ 824 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 825 HAMMER2_RESOLVE_ALWAYS); 826 *chainp = hammer2_chain_lookup(&parent, &key_next, 827 save_key, HAMMER2_KEY_MAX, 828 &cache_index, 829 HAMMER2_LOOKUP_SHARED | 830 HAMMER2_LOOKUP_NODIRECT | 831 HAMMER2_LOOKUP_NODATA); 832 return 0; 833 } 834 835 /* 836 * cparent is locked exclusively, with an extra ref, cluster is not locked. 837 * Replace element [i] in the cluster. 
838 */ 839 static 840 int 841 hammer2_sync_replace(hammer2_thread_t *thr, 842 hammer2_chain_t *parent, hammer2_chain_t *chain, 843 hammer2_tid_t mtid, int idx, 844 hammer2_chain_t *focus, int isroot) 845 { 846 int nradix; 847 uint8_t otype; 848 849 #if HAMMER2_SYNCHRO_DEBUG 850 if (hammer2_debug & 1) 851 kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n", 852 chain, 853 idx, 854 focus->bref.type, focus->bref.key, mtid); 855 #endif 856 hammer2_chain_unlock(chain); 857 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 858 if (chain->bytes != focus->bytes) { 859 /* XXX what if compressed? */ 860 nradix = hammer2_getradix(chain->bytes); 861 hammer2_chain_resize(NULL, parent, chain, 862 mtid, 0, 863 nradix, 0); 864 } 865 hammer2_chain_modify(chain, mtid, 0, 0); 866 otype = chain->bref.type; 867 chain->bref.type = focus->bref.type; 868 chain->bref.methods = focus->bref.methods; 869 chain->bref.keybits = focus->bref.keybits; 870 chain->bref.vradix = focus->bref.vradix; 871 /* mirror_tid updated by flush */ 872 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid); 873 chain->bref.flags = focus->bref.flags; 874 /* key already present */ 875 /* check code will be recalculated */ 876 chain->error = 0; 877 878 /* 879 * Copy data body. 880 */ 881 switch(chain->bref.type) { 882 case HAMMER2_BREF_TYPE_INODE: 883 /* 884 * Special case PFSROOTs, only limited changes can be made 885 * since the meta-data contains miscellanious distinguishing 886 * fields. 
887 */ 888 if (isroot) { 889 chain->data->ipdata.meta.uflags = 890 focus->data->ipdata.meta.uflags; 891 chain->data->ipdata.meta.rmajor = 892 focus->data->ipdata.meta.rmajor; 893 chain->data->ipdata.meta.rminor = 894 focus->data->ipdata.meta.rminor; 895 chain->data->ipdata.meta.ctime = 896 focus->data->ipdata.meta.ctime; 897 chain->data->ipdata.meta.mtime = 898 focus->data->ipdata.meta.mtime; 899 chain->data->ipdata.meta.atime = 900 focus->data->ipdata.meta.atime; 901 /* not btime */ 902 chain->data->ipdata.meta.uid = 903 focus->data->ipdata.meta.uid; 904 chain->data->ipdata.meta.gid = 905 focus->data->ipdata.meta.gid; 906 chain->data->ipdata.meta.mode = 907 focus->data->ipdata.meta.mode; 908 chain->data->ipdata.meta.ncopies = 909 focus->data->ipdata.meta.ncopies; 910 chain->data->ipdata.meta.comp_algo = 911 focus->data->ipdata.meta.comp_algo; 912 chain->data->ipdata.meta.check_algo = 913 focus->data->ipdata.meta.check_algo; 914 chain->data->ipdata.meta.data_quota = 915 focus->data->ipdata.meta.data_quota; 916 chain->data->ipdata.meta.inode_quota = 917 focus->data->ipdata.meta.inode_quota; 918 chain->data->ipdata.meta.attr_tid = 919 focus->data->ipdata.meta.attr_tid; 920 chain->data->ipdata.meta.dirent_tid = 921 focus->data->ipdata.meta.dirent_tid; 922 hammer2_chain_setcheck(chain, chain->data); 923 break; 924 } 925 926 /* 927 * Normal replacement. 928 */ 929 if ((focus->data->ipdata.meta.op_flags & 930 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 931 /* 932 * If DIRECTDATA is transitioning to 0 or the old 933 * chain is not an inode we have to initialize 934 * the block table. 
935 */ 936 if (otype != HAMMER2_BREF_TYPE_INODE || 937 (chain->data->ipdata.meta.op_flags & 938 HAMMER2_OPFLAG_DIRECTDATA)) { 939 kprintf("chain inode trans away from dd\n"); 940 bzero(&chain->data->ipdata.u, 941 sizeof(chain->data->ipdata.u)); 942 } 943 bcopy(focus->data, chain->data, 944 offsetof(hammer2_inode_data_t, u)); 945 /* XXX setcheck on inode should not be needed */ 946 hammer2_chain_setcheck(chain, chain->data); 947 break; 948 } 949 /* fall through */ 950 case HAMMER2_BREF_TYPE_DATA: 951 bcopy(focus->data, chain->data, chain->bytes); 952 hammer2_chain_setcheck(chain, chain->data); 953 break; 954 default: 955 KKASSERT(0); 956 break; 957 } 958 959 hammer2_chain_unlock(chain); 960 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 961 HAMMER2_RESOLVE_MAYBE); 962 963 return 0; 964 } 965