1 /* 2 * Copyright (c) 2015-2017 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module implements the cluster synchronizer. Basically the way 36 * it works is that a thread is created for each cluster node in a PFS. 
 * This thread is responsible for synchronizing the current node using
 * data from other nodes.
 *
 * Any out of sync master or slave can get back into synchronization as
 * long as a quorum of masters agree on the update_tid.  If a quorum is
 * not available it may still be possible to synchronize to the highest
 * available update_tid as a way of trying to catch up as much as possible
 * until a quorum is available.
 *
 * If no quorum is possible (which can happen even if all masters are
 * available, if the update_tid does not match), then manual intervention
 * may be required to resolve discrepancies.
 */
#include "hammer2.h"

/*
 * One deferred inode.  When the synchronization scan encounters an inode
 * that must be recursed into, it cannot recurse depth-first (the XOP scan
 * is still running), so the inode is referenced and pushed onto a singly
 * linked LIFO list to be processed later, bottom-up.
 */
typedef struct hammer2_deferred_ip {
	struct hammer2_deferred_ip *next;	/* LIFO link */
	hammer2_inode_t	*ip;			/* referenced inode */
} hammer2_deferred_ip_t;

/*
 * Head of the deferral list plus a count used to cap list growth
 * (see the 1000-entry limit in hammer2_sync_slaves()).
 */
typedef struct hammer2_deferred_list {
	hammer2_deferred_ip_t	*base;		/* LIFO head */
	int			count;		/* number of entries */
} hammer2_deferred_list_t;


#define HAMMER2_SYNCHRO_DEBUG	1

static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
				hammer2_deferred_list_t *list, int isroot);
#if 0
/*
 * NOTE(review): the assignment below looks like a stray paste into this
 * disabled block (it references locals that do not exist at file scope).
 * It is harmless while under #if 0 but should probably be removed.
 */
static void	hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
nerror = hammer2_sync_insert(
		    thr, &parent, &chain,
		    focus->bref.modify_tid,
		    idx, focus);
#endif
static int hammer2_sync_insert(hammer2_thread_t *thr,
			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
			hammer2_tid_t modify_tid, int idx,
			hammer2_chain_t *focus);
static int hammer2_sync_destroy(hammer2_thread_t *thr,
			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
			hammer2_tid_t mtid, int idx);
static int hammer2_sync_replace(hammer2_thread_t *thr,
			hammer2_chain_t *parent, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int idx,
			hammer2_chain_t *focus, int isroot);

/****************************************************************************
 *			    HAMMER2 SYNC THREADS			    *
 ****************************************************************************/
/*
 * Primary management thread for an element of a node.  A thread will exist
 * for each element requiring management.
 *
 * No management threads are needed for the SPMP or for any PMP with only
 * a single MASTER.
 *
 * On the SPMP - handles bulkfree and dedup operations
 * On a PFS    - handles remastering and synchronization
 *
 * The thread loops forever, servicing STOP/FREEZE/UNFREEZE/REMASTER
 * requests posted in thr->flags via atomic compare-and-set, and otherwise
 * running a synchronization scan of the PFS root every 5 seconds (or when
 * explicitly woken).  Deferred inodes produced by the scan are drained
 * bottom-up until the scan no longer returns EAGAIN.
 */
void
hammer2_primary_sync_thread(void *arg)
{
	hammer2_thread_t *thr = arg;
	hammer2_pfs_t *pmp;
	hammer2_deferred_list_t list;		/* LIFO of deferred inodes */
	hammer2_deferred_ip_t *defer;
	int error;
	uint32_t flags;
	uint32_t nflags;

	pmp = thr->pmp;
	bzero(&list, sizeof(list));

	for (;;) {
		/*
		 * Snapshot the flags; cpu_ccfence() prevents the compiler
		 * from re-reading thr->flags below, so the CAS updates
		 * operate against a consistent snapshot.
		 */
		flags = thr->flags;
		cpu_ccfence();

		/*
		 * Handle stop request
		 */
		if (flags & HAMMER2_THREAD_STOP)
			break;

		/*
		 * Handle freeze request.  The CAS transitions FREEZE ->
		 * FROZEN atomically; on CAS failure another updater raced
		 * us and we simply retry from the top.
		 */
		if (flags & HAMMER2_THREAD_FREEZE) {
			nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
					    HAMMER2_THREAD_CLIENTWAIT)) |
				 HAMMER2_THREAD_FROZEN;
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		if (flags & HAMMER2_THREAD_UNFREEZE) {
			nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
					   HAMMER2_THREAD_FROZEN |
					   HAMMER2_THREAD_CLIENTWAIT);
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		/*
		 * Force idle if frozen until unfrozen or stopped.  The
		 * tsleep_interlock/PINTERLOCKED pair closes the race
		 * between setting WAITING and going to sleep.
		 */
		if (flags & HAMMER2_THREAD_FROZEN) {
			nflags = flags | HAMMER2_THREAD_WAITING;
			tsleep_interlock(&thr->flags, 0);
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
				atomic_clear_int(&thr->flags,
						 HAMMER2_THREAD_WAITING);
			}
			continue;
		}

		/*
		 * Reset state on REMASTER request
		 */
		if (thr->flags & HAMMER2_THREAD_REMASTER) {
			nflags = flags & ~HAMMER2_THREAD_REMASTER;
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				/* reset state here */
			}
			continue;
		}

		/*
		 * Synchronization scan.
		 */
		if (hammer2_debug & 0x8000)
			kprintf("sync_slaves pfs %s clindex %d\n",
				pmp->pfs_names[thr->clindex], thr->clindex);
		hammer2_trans_init(pmp, 0);

		hammer2_inode_ref(pmp->iroot);

		for (;;) {
			int didbreak = 0;
			/* XXX lock synchronize pmp->modify_tid */
			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
			if (hammer2_debug & 0x8000) {
				kprintf("sync_slaves error %d defer %p\n",
					error, list.base);
			}
			if (error != EAGAIN)
				break;
			/*
			 * EAGAIN means deferrals were queued; process the
			 * LIFO list (bottom-up order).  Each entry holds an
			 * inode reference which must be dropped here.
			 */
			while ((defer = list.base) != NULL) {
				hammer2_inode_t *nip;

				nip = defer->ip;
				error = hammer2_sync_slaves(thr, nip, &list,
							(nip == pmp->iroot));
				if (error && error != EAGAIN && error != ENOENT)
					break;
				if (hammer2_thr_break(thr)) {
					didbreak = 1;
					break;
				}

				/*
				 * If no additional defers occurred we can
				 * remove this one, otherwise keep it on
				 * the list and retry once the additional
				 * defers have completed.
				 */
				if (defer == list.base) {
					--list.count;
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
					defer = NULL;	/* safety */
					hammer2_inode_drop(nip);
				}
			}

			/*
			 * If the thread is being remastered, frozen, or
			 * stopped, clean up any left-over deferals.
			 */
			if (didbreak || (error && error != EAGAIN)) {
				kprintf("didbreak\n");
				while ((defer = list.base) != NULL) {
					--list.count;
					hammer2_inode_drop(defer->ip);
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
				}
				if (error == 0 || error == EAGAIN)
					error = EINPROGRESS;
				break;
			}
		}

		hammer2_inode_drop(pmp->iroot);
		hammer2_trans_done(pmp);

		if (error && error != EINPROGRESS)
			kprintf("hammer2_sync_slaves: error %d\n", error);

		/*
		 * Wait for event, or 5-second poll.
		 */
		nflags = flags | HAMMER2_THREAD_WAITING;
		tsleep_interlock(&thr->flags, 0);
		if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
			tsleep(&thr->flags, 0, "h2idle", hz * 5);
			atomic_clear_int(&thr->flags, HAMMER2_THREAD_WAITING);
		}
	}
	thr->td = NULL;
	hammer2_thr_return(thr, HAMMER2_THREAD_STOPPED);
	/* thr structure can go invalid after this point */
}

#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 *
 * NOTE(review): this function and dumpcluster() below are compiled out
 * (#if 0).  They are debugging/reporting helpers retained for reference.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;

	/* Only report when the sticky status flags actually change */
	flags &= HAMMER2_CLUSTER_ZFLAGS;
	if (pmp->cluster_flags == flags)
		return;
	pmp->cluster_flags = flags;

	kprintf("pfs %p", pmp);
	if (flags & HAMMER2_CLUSTER_MSYNCED)
		kprintf(" masters-all-good");
	if (flags & HAMMER2_CLUSTER_SSYNCED)
		kprintf(" slaves-all-good");

	if (flags & HAMMER2_CLUSTER_WRHARD)
		kprintf(" quorum/rw");
	else if (flags & HAMMER2_CLUSTER_RDHARD)
		kprintf(" quorum/ro");

	if (flags & HAMMER2_CLUSTER_UNHARD)
		kprintf(" out-of-sync-masters");
	else if (flags & HAMMER2_CLUSTER_NOHARD)
		kprintf(" no-masters-visible");

	if (flags & HAMMER2_CLUSTER_WRSOFT)
		kprintf(" soft/rw");
	else if (flags & HAMMER2_CLUSTER_RDSOFT)
		kprintf(" soft/ro");

	if (flags & HAMMER2_CLUSTER_UNSOFT)
		kprintf(" out-of-sync-slaves");
	else if (flags & HAMMER2_CLUSTER_NOSOFT)
		kprintf(" no-slaves-visible");
	kprintf("\n");
}
#endif

#if 0
/*
 * Debug helper: dump the parallel chains of a cluster and its parent
 * cluster, flagging invalid cluster items.  Disabled (#if 0).
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		if (i)
			kprintf("\t");
		kprintf("%d ", i);
		if ((chain = cparent->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cparent->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		} else {
			kprintf("      NULL %s ", " ");
		}
		if ((chain = cluster->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cluster->array[i].flags &
				  HAMMER2_CITEM_INVALID) ?
				  "(I)" : "   ")
			);
		} else {
			kprintf("      NULL %s ", " ");
		}
		kprintf("\n");
	}
}
#endif

/*
 * Each out of sync node sync-thread must issue an all-nodes XOP scan of
 * the inode.  This creates a multiplication effect since the XOP scan itself
 * issues to all nodes.  However, this is the only way we can safely
 * synchronize nodes which might have disparate I/O bandwidths and the only
 * way we can safely deal with stalled nodes.
 *
 * Returns 0 on success, EAGAIN if deferrals were queued for the caller
 * to process, or another errno on failure.  The inode (ip) must be held
 * referenced by the caller; (list) collects deferred sub-inodes and
 * (isroot) limits what meta-data may be replaced on a PFS root.
 */
static
int
hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
		    hammer2_deferred_list_t *list, int isroot)
{
	hammer2_xop_scanall_t *xop;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_pfs_t *pmp;
	hammer2_key_t key_next;
	hammer2_tid_t sync_tid;
	int cache_index = -1;
	int needrescan;
	int want_update;
	int error;
	int nerror;
	int idx;
	int n;

	pmp = ip->pmp;
	idx = thr->clindex;	/* cluster node we are responsible for */
	needrescan = 0;
	want_update = 0;
	sync_tid = 0;
	chain = NULL;
	parent = NULL;

#if 0
	/*
	 * Nothing to do if all slaves are synchronized.
	 * Nothing to do if cluster not authoritatively readable.
	 */
	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
		return(0);
	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
		return(HAMMER2_ERROR_INCOMPLETE);
#endif

	error = 0;

	/*
	 * Resolve the root inode of the PFS and determine if synchronization
	 * is needed by checking modify_tid.
	 *
	 * Retain the synchronization TID from the focus inode and use it
	 * later to synchronize the focus inode if/when the recursion
	 * succeeds.
402 */ 403 { 404 hammer2_xop_ipcluster_t *xop2; 405 hammer2_chain_t *focus; 406 407 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 408 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 409 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 410 idx); 411 hammer2_inode_unlock(ip); 412 error = hammer2_xop_collect(&xop2->head, 0); 413 if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) { 414 sync_tid = focus->bref.modify_tid; 415 chain = hammer2_inode_chain_and_parent(ip, idx, 416 &parent, 417 HAMMER2_RESOLVE_ALWAYS | 418 HAMMER2_RESOLVE_SHARED); 419 want_update = (chain->bref.modify_tid != sync_tid); 420 if (chain) { 421 hammer2_chain_unlock(chain); 422 hammer2_chain_drop(chain); 423 chain = NULL; 424 } 425 if (parent) { 426 hammer2_chain_unlock(parent); 427 hammer2_chain_drop(parent); 428 parent = NULL; 429 } 430 } 431 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 432 } 433 434 if (want_update == 0) 435 return(0); 436 437 /* 438 * The inode is left unlocked during the scan. Issue a XOP 439 * that does *not* include our cluster index to iterate 440 * properly synchronized elements and resolve our cluster index 441 * against it. 
	 */
	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
	xop->key_beg = HAMMER2_KEY_MIN;
	xop->key_end = HAMMER2_KEY_MAX;
	xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
			     HAMMER2_RESOLVE_ALWAYS;
	xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
			    HAMMER2_LOOKUP_NODIRECT |
			    HAMMER2_LOOKUP_ALWAYS;
	hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
	parent = hammer2_inode_chain(ip, idx,
				     HAMMER2_RESOLVE_ALWAYS |
				     HAMMER2_RESOLVE_SHARED);
	hammer2_inode_unlock(ip);

	/*
	 * Local iteration over our own (excluded) cluster element,
	 * walked in parallel with the XOP scan of the other nodes.
	 */
	chain = hammer2_chain_lookup(&parent, &key_next,
				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
				     &cache_index,
				     HAMMER2_LOOKUP_SHARED |
				     HAMMER2_LOOKUP_NODIRECT |
				     HAMMER2_LOOKUP_NODATA);
	error = hammer2_xop_collect(&xop->head, 0);
	if (hammer2_debug & 0x8000) {
		kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
			ip->meta.name_key, chain,
			(chain ? chain->bref.key : -1));
	}

	/*
	 * Merge loop.  Each pass compares the current local chain against
	 * the XOP focus and deletes, replaces, or inserts local data as
	 * needed, then advances one or both iterators.
	 */
	for (;;) {
		/*
		 * We are done if our scan is done and the XOP scan is done.
		 * We are done if the XOP scan failed (that is, we don't
		 * have authoritative data to synchronize with).
		 */
		int advance_local = 0;
		int advance_xop = 0;
		int dodefer = 0;
		hammer2_chain_t *focus;

		if (chain == NULL && error == ENOENT)
			break;
		if (error && error != ENOENT)
			break;

		/*
		 * Compare.  n < 0: local element is extra; n > 0: local
		 * element is missing; n == 0: keys match.
		 */
		if (chain && error == ENOENT) {
			/*
			 * If we have local chains but the XOP scan is done,
			 * the chains need to be deleted.
			 */
			n = -1;
			focus = NULL;
		} else if (chain == NULL) {
			/*
			 * If our local scan is done but the XOP scan is not,
			 * we need to create the missing chain(s).
			 */
			n = 1;
			focus = xop->head.cluster.focus;
		} else {
			/*
			 * Otherwise compare to determine the action
			 * needed.
			 */
			focus = xop->head.cluster.focus;
			n = hammer2_chain_cmp(chain, focus);
		}

		/*
		 * Take action based on comparison results.
		 */
		if (n < 0) {
			/*
			 * Delete extranious local data.  This will
			 * automatically advance the chain.
			 */
			nerror = hammer2_sync_destroy(thr, &parent, &chain,
						      0, idx);
		} else if (n == 0 && chain->bref.modify_tid !=
				     focus->bref.modify_tid) {
			/*
			 * Matching key but local data or meta-data requires
			 * updating.  If we will recurse, we still need to
			 * update to compatible content first but we do not
			 * synchronize modify_tid until the entire recursion
			 * has completed successfully.
			 *
			 * NOTE: Do not try to access hardlink pointers as if
			 *	 they were normal inodes, the inode cache will
			 *	 get seriously confused.
			 */
			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
			    focus->data->ipdata.meta.type !=
			    HAMMER2_OBJTYPE_HARDLINK) {
				/* mtid 0: defer modify_tid to recursion */
				nerror = hammer2_sync_replace(
						thr, parent, chain,
						0,
						idx, focus, 0);
				dodefer = 1;
			} else {
				nerror = hammer2_sync_replace(
						thr, parent, chain,
						focus->bref.modify_tid,
						idx, focus, 0);
			}
			advance_local = 1;
			advance_xop = 1;
		} else if (n == 0) {
			/*
			 * 100% match, advance both
			 */
			advance_local = 1;
			advance_xop = 1;
			nerror = 0;
		} else if (n > 0) {
			/*
			 * Insert missing local data.
			 *
			 * If we will recurse, we still need to update to
			 * compatible content first but we do not synchronize
			 * modify_tid until the entire recursion has
			 * completed successfully.
			 *
			 * NOTE: Do not try to access hardlink pointers as if
			 *	 they were normal inodes, the inode cache will
			 *	 get seriously confused.
			 */
			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
			    focus->data->ipdata.meta.type !=
			    HAMMER2_OBJTYPE_HARDLINK) {
				/* mtid 0: defer modify_tid to recursion */
				nerror = hammer2_sync_insert(
						thr, &parent, &chain,
						0,
						idx, focus);
				dodefer = 2;
			} else {
				nerror = hammer2_sync_insert(
						thr, &parent, &chain,
						focus->bref.modify_tid,
						idx, focus);
			}
			advance_local = 1;
			advance_xop = 1;
		}

		/*
		 * We cannot recurse depth-first because the XOP is still
		 * running in node threads for this scan.  Create a placemarker
		 * by obtaining and record the hammer2_inode.
		 *
		 * We excluded our node from the XOP so we must temporarily
		 * add it to xop->head.cluster so it is properly incorporated
		 * into the inode.
		 *
		 * The deferral is pushed onto a LIFO list for bottom-up
		 * synchronization.
		 */
		if (error == 0 && dodefer) {
			hammer2_inode_t *nip;
			hammer2_deferred_ip_t *defer;

			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);

			defer = kmalloc(sizeof(*defer), M_HAMMER2,
					M_WAITOK | M_ZERO);
			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
			xop->head.cluster.array[idx].flags =
					HAMMER2_CITEM_INVALID;
			xop->head.cluster.array[idx].chain = chain;
			nip = hammer2_inode_get(pmp, ip,
						&xop->head.cluster, idx);
			xop->head.cluster.array[idx].chain = NULL;

			hammer2_inode_ref(nip);
			hammer2_inode_unlock(nip);

			defer->next = list->base;
			defer->ip = nip;
			list->base = defer;
			++list->count;
			needrescan = 1;
		}

		/*
		 * If at least one deferral was added and the deferral
		 * list has grown too large, stop adding more.  This
		 * will trigger an EAGAIN return.
		 */
		if (needrescan && list->count > 1000)
			break;

		/*
		 * Advancements for iteration.
		 */
		if (advance_xop) {
			error = hammer2_xop_collect(&xop->head, 0);
		}
		if (advance_local) {
			chain = hammer2_chain_next(&parent, chain, &key_next,
						   key_next, HAMMER2_KEY_MAX,
						   &cache_index,
						   HAMMER2_LOOKUP_SHARED |
						   HAMMER2_LOOKUP_NODIRECT |
						   HAMMER2_LOOKUP_NODATA);
		}
	}
	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}

	/*
	 * If we added deferrals we want the caller to synchronize them
	 * and then call us again.
	 *
	 * NOTE: In this situation we do not yet want to synchronize our
	 *	 inode, setting the error code also has that effect.
	 */
	if ((error == 0 || error == ENOENT) && needrescan)
		error = EAGAIN;

	/*
	 * If no error occurred we can synchronize the inode meta-data
	 * and modify_tid.  Only limited changes are made to PFSROOTs.
	 *
	 * XXX inode lock was lost
	 */
	if (error == 0 || error == ENOENT) {
		hammer2_xop_ipcluster_t *xop2;
		hammer2_chain_t *focus;

		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
					 idx);
		hammer2_inode_unlock(ip);
		error = hammer2_xop_collect(&xop2->head, 0);
		if (error == 0) {
			focus = xop2->head.cluster.focus;
			if (hammer2_debug & 0x8000) {
				kprintf("syncthr: update inode %p (%s)\n",
					focus,
					(focus ?
					 (char *)focus->data->
						 ipdata.filename :
					 "?"));
			}
			chain = hammer2_inode_chain_and_parent(ip, idx,
						    &parent,
						    HAMMER2_RESOLVE_ALWAYS |
						    HAMMER2_RESOLVE_SHARED);

			KKASSERT(parent != NULL);
			/* sync_tid was captured before the recursion */
			nerror = hammer2_sync_replace(
					thr, parent, chain,
					sync_tid,
					idx, focus, isroot);
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			hammer2_chain_unlock(parent);
			hammer2_chain_drop(parent);
			/* XXX */
		}
		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
	}

	return error;
}

/*
 * Create a missing chain by copying the focus from another device.
 *
 * On entry *parentp and focus are both locked shared.  The chain will be
 * created and returned in *chainp also locked shared.
 *
 * Returns 0.  mtid is the modify_tid to stamp on the new chain (0 when
 * the caller intends to defer final synchronization to a recursion).
 */
static
int
hammer2_sync_insert(hammer2_thread_t *thr,
		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		    hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
{
	hammer2_chain_t *chain;
	hammer2_key_t dummy;
	int cache_index = -1;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
		*parentp,
		(*parentp)->bref.type,
		(*parentp)->bref.key,
		idx,
		focus->bref.type, focus->bref.key, mtid);
#endif

	/*
	 * Parent requires an exclusive lock for the insertion.
	 * We must unlock the child to avoid deadlocks while
	 * relocking the parent.
	 */
	if (*chainp) {
		hammer2_chain_unlock(*chainp);
		hammer2_chain_drop(*chainp);
		*chainp = NULL;
	}
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);

	/*
	 * We must reissue the lookup to properly position (*parentp)
	 * for the insertion.
	 */
	chain = hammer2_chain_lookup(parentp, &dummy,
				     focus->bref.key, focus->bref.key,
				     &cache_index,
				     HAMMER2_LOOKUP_NODIRECT |
				     HAMMER2_LOOKUP_ALWAYS);
	/* the key must not already exist locally (caller determined n > 0) */
	KKASSERT(chain == NULL);

	chain = NULL;
	hammer2_chain_create(parentp, &chain,
			     thr->pmp, focus->bref.methods,
			     focus->bref.key, focus->bref.keybits,
			     focus->bref.type, focus->bytes,
			     mtid, 0, 0);
	hammer2_chain_modify(chain, mtid, 0, 0);

	/*
	 * Copy focus to new chain
	 */

	/* type already set */
	chain->bref.methods = focus->bref.methods;
	/* keybits already set */
	chain->bref.vradix = focus->bref.vradix;
	/* mirror_tid set by flush */
	KKASSERT(chain->bref.modify_tid == mtid);
	chain->bref.flags = focus->bref.flags;
	/* key already present */
	/* check code will be recalculated */

	/*
	 * Copy data body.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		if ((focus->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			/*
			 * do not copy block table
			 *
			 * NOTE(review): this path does not call
			 * hammer2_chain_setcheck() like the paths below --
			 * presumably the check code is recalculated at
			 * flush; confirm against hammer2_chain_modify().
			 */
			bcopy(focus->data, chain->data,
			      offsetof(hammer2_inode_data_t, u));
			break;
		}
		/* fall through copy whole thing */
	case HAMMER2_BREF_TYPE_DATA:
		bcopy(focus->data, chain->data, chain->bytes);
		hammer2_chain_setcheck(chain, chain->data);
		break;
	default:
		KKASSERT(0);
		break;
	}

	hammer2_chain_unlock(chain);		/* unlock, leave ref */
	*chainp = chain;			/* will be returned locked */

	/*
	 * Avoid ordering deadlock when relocking shared.
	 */
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
				     HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
				  HAMMER2_RESOLVE_ALWAYS);

	return 0;
}

/*
 * Destroy an extranious chain.
 *
 * Both *parentp and *chainp are locked shared.
 *
 * On return, *chainp will be adjusted to point to the next element in the
 * iteration and locked shared.
 *
 * Returns 0.  The deletion is permanent (not moved to a deleted list),
 * and mtid is the modify_tid to record for the deletion.
 */
static
int
hammer2_sync_destroy(hammer2_thread_t *thr,
		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_tid_t mtid, int idx)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_next;
	hammer2_key_t save_key;
	int cache_index = -1;

	chain = *chainp;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
		kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
			*parentp, chain,
			idx, chain->bref.type, chain->bref.key);
#endif

	/*
	 * Remember where to resume the iteration after the delete
	 * (one past the deleted key, saturating at HAMMER2_KEY_MAX).
	 */
	save_key = chain->bref.key;
	if (save_key != HAMMER2_KEY_MAX)
		++save_key;

	/*
	 * Try to avoid unnecessary I/O.
	 *
	 * XXX accounting not propagated up properly.  We might have to do
	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
	 */
	hammer2_chain_unlock(chain);	/* relock exclusive */
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);

	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;			/* safety */

	hammer2_chain_unlock(*parentp);	/* relock shared */
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
				     HAMMER2_RESOLVE_ALWAYS);
	*chainp = hammer2_chain_lookup(parentp, &key_next,
				     save_key, HAMMER2_KEY_MAX,
				     &cache_index,
				     HAMMER2_LOOKUP_SHARED |
				     HAMMER2_LOOKUP_NODIRECT |
				     HAMMER2_LOOKUP_NODATA);
	return 0;
}

/*
 * cparent is locked exclusively, with an extra ref, cluster is not locked.
 * Replace element [i] in the cluster.
 *
 * Overwrites (chain)'s blockref fields and data body with (focus)'s,
 * resizing the chain if the byte counts differ.  When (isroot) is set
 * only limited PFSROOT meta-data fields are copied.  (mtid) of 0 means
 * the final modify_tid synchronization is being deferred by the caller.
 * Returns 0.
 */
static
int
hammer2_sync_replace(hammer2_thread_t *thr,
		     hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int idx,
		     hammer2_chain_t *focus, int isroot)
{
	int nradix;
	uint8_t otype;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
		kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
			chain,
			idx,
			focus->bref.type, focus->bref.key, mtid);
#endif
	/* relock the chain exclusively for modification */
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
	if (chain->bytes != focus->bytes) {
		/* XXX what if compressed? */
		nradix = hammer2_getradix(chain->bytes);
		hammer2_chain_resize(NULL, parent, chain,
				     mtid, 0,
				     nradix, 0);
	}
	hammer2_chain_modify(chain, mtid, 0, 0);
	otype = chain->bref.type;
	chain->bref.type = focus->bref.type;
	chain->bref.methods = focus->bref.methods;
	chain->bref.keybits = focus->bref.keybits;
	chain->bref.vradix = focus->bref.vradix;
	/* mirror_tid updated by flush */
	KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
	chain->bref.flags = focus->bref.flags;
	/* key already present */
	/* check code will be recalculated */
	chain->error = 0;

	/*
	 * Copy data body.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special case PFSROOTs, only limited changes can be made
		 * since the meta-data contains miscellanious distinguishing
		 * fields.
		 */
		if (isroot) {
			chain->data->ipdata.meta.uflags =
				focus->data->ipdata.meta.uflags;
			chain->data->ipdata.meta.rmajor =
				focus->data->ipdata.meta.rmajor;
			chain->data->ipdata.meta.rminor =
				focus->data->ipdata.meta.rminor;
			chain->data->ipdata.meta.ctime =
				focus->data->ipdata.meta.ctime;
			chain->data->ipdata.meta.mtime =
				focus->data->ipdata.meta.mtime;
			chain->data->ipdata.meta.atime =
				focus->data->ipdata.meta.atime;
			/* not btime */
			chain->data->ipdata.meta.uid =
				focus->data->ipdata.meta.uid;
			chain->data->ipdata.meta.gid =
				focus->data->ipdata.meta.gid;
			chain->data->ipdata.meta.mode =
				focus->data->ipdata.meta.mode;
			chain->data->ipdata.meta.ncopies =
				focus->data->ipdata.meta.ncopies;
			chain->data->ipdata.meta.comp_algo =
				focus->data->ipdata.meta.comp_algo;
			chain->data->ipdata.meta.check_algo =
				focus->data->ipdata.meta.check_algo;
			chain->data->ipdata.meta.data_quota =
				focus->data->ipdata.meta.data_quota;
			chain->data->ipdata.meta.inode_quota =
				focus->data->ipdata.meta.inode_quota;

			/*
			 * last snapshot tid controls overwrite
			 */
			if (chain->data->ipdata.meta.pfs_lsnap_tid <
			    focus->data->ipdata.meta.pfs_lsnap_tid) {
				chain->data->ipdata.meta.pfs_lsnap_tid =
					focus->data->ipdata.meta.pfs_lsnap_tid;
			}

			hammer2_chain_setcheck(chain, chain->data);
			break;
		}

		/*
		 * Normal replacement.
		 */
		if ((focus->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			/*
			 * If DIRECTDATA is transitioning to 0 or the old
			 * chain is not an inode we have to initialize
			 * the block table.
			 */
			if (otype != HAMMER2_BREF_TYPE_INODE ||
			    (chain->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA)) {
				kprintf("chain inode trans away from dd\n");
				bzero(&chain->data->ipdata.u,
				      sizeof(chain->data->ipdata.u));
			}
			bcopy(focus->data, chain->data,
			      offsetof(hammer2_inode_data_t, u));
			/* XXX setcheck on inode should not be needed */
			hammer2_chain_setcheck(chain, chain->data);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		bcopy(focus->data, chain->data, chain->bytes);
		hammer2_chain_setcheck(chain, chain->data);
		break;
	default:
		KKASSERT(0);
		break;
	}

	/* return to the caller's shared-lock state */
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
				  HAMMER2_RESOLVE_MAYBE);

	return 0;
}