1 /* 2 * Copyright (c) 2015-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module implements the cluster synchronizer. Basically the way 36 * it works is that a thread is created for each cluster node in a PFS. 37 * This thread is responsible for synchronizing the current node using 38 * data from other nodes. 39 * 40 * Any out of sync master or slave can get back into synchronization as 41 * long as a quorum of masters agree on the update_tid. If a quorum is 42 * not available it may still be possible to synchronize to the highest 43 * available update_tid as a way of trying to catch up as much as possible 44 * until a quorum is available. 45 * 46 * If no quorum is possible (which can happen even if all masters are 47 * available, if the update_tid does not match), then manual intervention 48 * may be required to resolve discrepancies. 49 */ 50 #include "hammer2.h" 51 52 typedef struct hammer2_deferred_ip { 53 struct hammer2_deferred_ip *next; 54 hammer2_inode_t *ip; 55 } hammer2_deferred_ip_t; 56 57 typedef struct hammer2_deferred_list { 58 hammer2_deferred_ip_t *base; 59 int count; 60 } hammer2_deferred_list_t; 61 62 63 #define HAMMER2_SYNCHRO_DEBUG 1 64 65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 66 hammer2_deferred_list_t *list, int isroot); 67 #if 0 68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags); 69 nerror = hammer2_sync_insert( 70 thr, &parent, &chain, 71 focus->bref.modify_tid, 72 idx, focus); 73 #endif 74 static int hammer2_sync_insert(hammer2_thread_t *thr, 75 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 76 hammer2_tid_t modify_tid, int idx, 77 hammer2_chain_t *focus); 78 static int hammer2_sync_destroy(hammer2_thread_t *thr, 79 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 80 hammer2_tid_t mtid, int idx); 81 static int hammer2_sync_replace(hammer2_thread_t *thr, 82 hammer2_chain_t *parent, hammer2_chain_t *chain, 83 hammer2_tid_t mtid, int idx, 84 hammer2_chain_t *focus, int isroot); 85 86 /**************************************************************************** 87 * HAMMER2 SYNC THREADS * 88 ****************************************************************************/ 89 /* 90 * Primary management thread for an element of a node. A thread will exist 91 * for each element requiring management. 92 * 93 * No management threads are needed for the SPMP or for any PMP with only 94 * a single MASTER. 95 * 96 * On the SPMP - handles bulkfree and dedup operations 97 * On a PFS - handles remastering and synchronization 98 */ 99 void 100 hammer2_primary_sync_thread(void *arg) 101 { 102 hammer2_thread_t *thr = arg; 103 hammer2_pfs_t *pmp; 104 hammer2_deferred_list_t list; 105 hammer2_deferred_ip_t *defer; 106 int error; 107 uint32_t flags; 108 uint32_t nflags; 109 110 pmp = thr->pmp; 111 bzero(&list, sizeof(list)); 112 113 for (;;) { 114 flags = thr->flags; 115 cpu_ccfence(); 116 117 /* 118 * Handle stop request 119 */ 120 if (flags & HAMMER2_THREAD_STOP) 121 break; 122 123 /* 124 * Handle freeze request 125 */ 126 if (flags & HAMMER2_THREAD_FREEZE) { 127 nflags = (flags & ~(HAMMER2_THREAD_FREEZE | 128 HAMMER2_THREAD_WAITING)) | 129 HAMMER2_THREAD_FROZEN; 130 if (!atomic_cmpset_int(&thr->flags, flags, nflags)) 131 continue; 132 if (flags & HAMMER2_THREAD_WAITING) 133 wakeup(&thr->flags); 134 continue; 135 } 136 137 if (flags & HAMMER2_THREAD_UNFREEZE) { 138 nflags = flags & ~(HAMMER2_THREAD_UNFREEZE | 139 HAMMER2_THREAD_FROZEN | 140 HAMMER2_THREAD_WAITING); 141 if (!atomic_cmpset_int(&thr->flags, flags, nflags)) 142 continue; 143 if (flags & HAMMER2_THREAD_WAITING) 144 wakeup(&thr->flags); 145 continue; 146 } 147 148 /* 149 * Force idle if frozen until unfrozen or stopped. 150 */ 151 if (flags & HAMMER2_THREAD_FROZEN) { 152 nflags = flags | HAMMER2_THREAD_WAITING; 153 154 tsleep_interlock(&thr->flags, 0); 155 if (atomic_cmpset_int(&thr->flags, flags, nflags)) 156 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0); 157 continue; 158 } 159 160 /* 161 * Reset state on REMASTER request 162 */ 163 if (thr->flags & HAMMER2_THREAD_REMASTER) { 164 nflags = flags & ~HAMMER2_THREAD_REMASTER; 165 if (atomic_cmpset_int(&thr->flags, flags, nflags)) { 166 /* reset state here */ 167 } 168 continue; 169 } 170 171 /* 172 * Synchronization scan. 173 */ 174 if (hammer2_debug & 0x8000) 175 kprintf("sync_slaves pfs %s clindex %d\n", 176 pmp->pfs_names[thr->clindex], thr->clindex); 177 hammer2_trans_init(pmp, 0); 178 179 hammer2_inode_ref(pmp->iroot); 180 181 for (;;) { 182 int didbreak = 0; 183 /* XXX lock synchronize pmp->modify_tid */ 184 error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1); 185 if (hammer2_debug & 0x8000) { 186 kprintf("sync_slaves error %d defer %p\n", 187 error, list.base); 188 } 189 if (error != HAMMER2_ERROR_EAGAIN) 190 break; 191 while ((defer = list.base) != NULL) { 192 hammer2_inode_t *nip; 193 194 nip = defer->ip; 195 error = hammer2_sync_slaves(thr, nip, &list, 196 (nip == pmp->iroot)); 197 if (error && 198 error != HAMMER2_ERROR_EAGAIN && 199 error != HAMMER2_ERROR_ENOENT) { 200 break; 201 } 202 if (hammer2_thr_break(thr)) { 203 didbreak = 1; 204 break; 205 } 206 207 /* 208 * If no additional defers occurred we can 209 * remove this one, otherwise keep it on 210 * the list and retry once the additional 211 * defers have completed. 212 */ 213 if (defer == list.base) { 214 --list.count; 215 list.base = defer->next; 216 kfree(defer, M_HAMMER2); 217 defer = NULL; /* safety */ 218 hammer2_inode_drop(nip); 219 } 220 } 221 222 /* 223 * If the thread is being remastered, frozen, or 224 * stopped, clean up any left-over deferals. 225 */ 226 if (didbreak || 227 (error && error != HAMMER2_ERROR_EAGAIN)) { 228 kprintf("didbreak\n"); 229 while ((defer = list.base) != NULL) { 230 --list.count; 231 hammer2_inode_drop(defer->ip); 232 list.base = defer->next; 233 kfree(defer, M_HAMMER2); 234 } 235 if (error == 0 || error == HAMMER2_ERROR_EAGAIN) 236 error = HAMMER2_ERROR_EINPROGRESS; 237 break; 238 } 239 } 240 241 hammer2_inode_drop(pmp->iroot); 242 hammer2_trans_done(pmp); 243 244 if (error && error != HAMMER2_ERROR_EINPROGRESS) 245 kprintf("hammer2_sync_slaves: error %d\n", error); 246 247 /* 248 * Wait for event, or 5-second poll. 249 */ 250 nflags = flags | HAMMER2_THREAD_WAITING; 251 tsleep_interlock(&thr->flags, 0); 252 if (atomic_cmpset_int(&thr->flags, flags, nflags)) { 253 tsleep(&thr->flags, 0, "h2idle", hz * 5); 254 } 255 } 256 thr->td = NULL; 257 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 258 /* thr structure can go invalid after this point */ 259 } 260 261 #if 0 262 /* 263 * Given a locked cluster created from pmp->iroot, update the PFS's 264 * reporting status. 265 */ 266 static 267 void 268 hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags) 269 { 270 hammer2_pfs_t *pmp = thr->pmp; 271 272 flags &= HAMMER2_CLUSTER_ZFLAGS; 273 if (pmp->cluster_flags == flags) 274 return; 275 pmp->cluster_flags = flags; 276 277 kprintf("pfs %p", pmp); 278 if (flags & HAMMER2_CLUSTER_MSYNCED) 279 kprintf(" masters-all-good"); 280 if (flags & HAMMER2_CLUSTER_SSYNCED) 281 kprintf(" slaves-all-good"); 282 283 if (flags & HAMMER2_CLUSTER_WRHARD) 284 kprintf(" quorum/rw"); 285 else if (flags & HAMMER2_CLUSTER_RDHARD) 286 kprintf(" quorum/ro"); 287 288 if (flags & HAMMER2_CLUSTER_UNHARD) 289 kprintf(" out-of-sync-masters"); 290 else if (flags & HAMMER2_CLUSTER_NOHARD) 291 kprintf(" no-masters-visible"); 292 293 if (flags & HAMMER2_CLUSTER_WRSOFT) 294 kprintf(" soft/rw"); 295 else if (flags & HAMMER2_CLUSTER_RDSOFT) 296 kprintf(" soft/ro"); 297 298 if (flags & HAMMER2_CLUSTER_UNSOFT) 299 kprintf(" out-of-sync-slaves"); 300 else if (flags & HAMMER2_CLUSTER_NOSOFT) 301 kprintf(" no-slaves-visible"); 302 kprintf("\n"); 303 } 304 #endif 305 306 #if 0 307 static 308 void 309 dumpcluster(const char *label, 310 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster) 311 { 312 hammer2_chain_t *chain; 313 int i; 314 315 if ((hammer2_debug & 1) == 0) 316 return; 317 318 kprintf("%s\t", label); 319 KKASSERT(cparent->nchains == cluster->nchains); 320 for (i = 0; i < cparent->nchains; ++i) { 321 if (i) 322 kprintf("\t"); 323 kprintf("%d ", i); 324 if ((chain = cparent->array[i].chain) != NULL) { 325 kprintf("%016jx%s ", 326 chain->bref.key, 327 ((cparent->array[i].flags & 328 HAMMER2_CITEM_INVALID) ? "(I)" : " ") 329 ); 330 } else { 331 kprintf(" NULL %s ", " "); 332 } 333 if ((chain = cluster->array[i].chain) != NULL) { 334 kprintf("%016jx%s ", 335 chain->bref.key, 336 ((cluster->array[i].flags & 337 HAMMER2_CITEM_INVALID) ? "(I)" : " ") 338 ); 339 } else { 340 kprintf(" NULL %s ", " "); 341 } 342 kprintf("\n"); 343 } 344 } 345 #endif 346 347 /* 348 * Each out of sync node sync-thread must issue an all-nodes XOP scan of 349 * the inode. This creates a multiplication effect since the XOP scan itself 350 * issues to all nodes. However, this is the only way we can safely 351 * synchronize nodes which might have disparate I/O bandwidths and the only 352 * way we can safely deal with stalled nodes. 353 * 354 * XXX serror / merror rollup and handling. 355 */ 356 static 357 int 358 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 359 hammer2_deferred_list_t *list, int isroot) 360 { 361 hammer2_xop_scanall_t *xop; 362 hammer2_chain_t *parent; 363 hammer2_chain_t *chain; 364 hammer2_pfs_t *pmp; 365 hammer2_key_t key_next; 366 hammer2_tid_t sync_tid; 367 int needrescan; 368 int want_update; 369 int serror; /* slave error */ 370 int merror; /* master error (from xop_collect) */ 371 int nerror; /* temporary error */ 372 int idx; 373 int n; 374 375 pmp = ip->pmp; 376 idx = thr->clindex; /* cluster node we are responsible for */ 377 needrescan = 0; 378 want_update = 0; 379 sync_tid = 0; 380 chain = NULL; 381 parent = NULL; 382 383 #if 0 384 /* 385 * Nothing to do if all slaves are synchronized. 386 * Nothing to do if cluster not authoritatively readable. 387 */ 388 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED) 389 return(0); 390 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0) 391 return(HAMMER2_ERROR_INCOMPLETE); 392 #endif 393 394 merror = 0; 395 396 /* 397 * Resolve the root inode of the PFS and determine if synchronization 398 * is needed by checking modify_tid. 399 * 400 * Retain the synchronization TID from the focus inode and use it 401 * later to synchronize the focus inode if/when the recursion 402 * succeeds. 403 */ 404 { 405 hammer2_xop_ipcluster_t *xop2; 406 hammer2_chain_t *focus; 407 408 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 409 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 410 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 411 idx); 412 hammer2_inode_unlock(ip); 413 merror = hammer2_xop_collect(&xop2->head, 0); 414 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) { 415 sync_tid = focus->bref.modify_tid; 416 chain = hammer2_inode_chain_and_parent(ip, idx, 417 &parent, 418 HAMMER2_RESOLVE_ALWAYS | 419 HAMMER2_RESOLVE_SHARED); 420 want_update = (chain->bref.modify_tid != sync_tid); 421 if (chain) { 422 hammer2_chain_unlock(chain); 423 hammer2_chain_drop(chain); 424 chain = NULL; 425 } 426 if (parent) { 427 hammer2_chain_unlock(parent); 428 hammer2_chain_drop(parent); 429 parent = NULL; 430 } 431 } 432 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 433 } 434 435 if (want_update == 0) 436 return(0); 437 438 /* 439 * The inode is left unlocked during the scan. Issue a XOP 440 * that does *not* include our cluster index to iterate 441 * properly synchronized elements and resolve our cluster index 442 * against it. 443 */ 444 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 445 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 446 xop->key_beg = HAMMER2_KEY_MIN; 447 xop->key_end = HAMMER2_KEY_MAX; 448 xop->resolve_flags = HAMMER2_RESOLVE_SHARED | 449 HAMMER2_RESOLVE_ALWAYS; 450 xop->lookup_flags = HAMMER2_LOOKUP_SHARED | 451 HAMMER2_LOOKUP_NODIRECT | 452 HAMMER2_LOOKUP_ALWAYS; 453 hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx); 454 parent = hammer2_inode_chain(ip, idx, 455 HAMMER2_RESOLVE_ALWAYS | 456 HAMMER2_RESOLVE_SHARED); 457 hammer2_inode_unlock(ip); 458 459 chain = hammer2_chain_lookup(&parent, &key_next, 460 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 461 &serror, 462 HAMMER2_LOOKUP_SHARED | 463 HAMMER2_LOOKUP_NODIRECT | 464 HAMMER2_LOOKUP_NODATA); 465 merror = hammer2_xop_collect(&xop->head, 0); 466 if (hammer2_debug & 0x8000) { 467 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n", 468 ip->meta.name_key, chain, 469 (chain ? chain->bref.key : -1)); 470 } 471 472 for (;;) { 473 /* 474 * We are done if our scan is done and the XOP scan is done. 475 * We are done if the XOP scan failed (that is, we don't 476 * have authoritative data to synchronize with). 477 */ 478 int advance_local = 0; 479 int advance_xop = 0; 480 int dodefer = 0; 481 hammer2_chain_t *focus; 482 483 if (chain == NULL && merror == HAMMER2_ERROR_ENOENT) 484 break; 485 if (merror && merror != HAMMER2_ERROR_ENOENT) 486 break; 487 488 /* 489 * Compare 490 */ 491 if (chain && merror == HAMMER2_ERROR_ENOENT) { 492 /* 493 * If we have local chains but the XOP scan is done, 494 * the chains need to be deleted. 495 */ 496 n = -1; 497 focus = NULL; 498 } else if (chain == NULL) { 499 /* 500 * If our local scan is done but the XOP scan is not, 501 * we need to create the missing chain(s). 502 */ 503 n = 1; 504 focus = xop->head.cluster.focus; 505 } else { 506 /* 507 * Otherwise compare to determine the action 508 * needed. 509 */ 510 focus = xop->head.cluster.focus; 511 n = hammer2_chain_cmp(chain, focus); 512 } 513 514 /* 515 * Take action based on comparison results. 516 */ 517 if (n < 0) { 518 /* 519 * Delete extranious local data. This will 520 * automatically advance the chain. 521 */ 522 nerror = hammer2_sync_destroy(thr, &parent, &chain, 523 0, idx); 524 } else if (n == 0 && chain->bref.modify_tid != 525 focus->bref.modify_tid) { 526 /* 527 * Matching key but local data or meta-data requires 528 * updating. If we will recurse, we still need to 529 * update to compatible content first but we do not 530 * synchronize modify_tid until the entire recursion 531 * has completed successfully. 532 */ 533 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) { 534 nerror = hammer2_sync_replace( 535 thr, parent, chain, 536 0, 537 idx, focus, 0); 538 dodefer = 1; 539 } else { 540 nerror = hammer2_sync_replace( 541 thr, parent, chain, 542 focus->bref.modify_tid, 543 idx, focus, 0); 544 } 545 advance_local = 1; 546 advance_xop = 1; 547 } else if (n == 0) { 548 /* 549 * 100% match, advance both 550 */ 551 advance_local = 1; 552 advance_xop = 1; 553 nerror = 0; 554 } else if (n > 0) { 555 /* 556 * Insert missing local data. 557 * 558 * If we will recurse, we still need to update to 559 * compatible content first but we do not synchronize 560 * modify_tid until the entire recursion has 561 * completed successfully. 562 */ 563 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) { 564 nerror = hammer2_sync_insert( 565 thr, &parent, &chain, 566 0, 567 idx, focus); 568 dodefer = 2; 569 } else { 570 nerror = hammer2_sync_insert( 571 thr, &parent, &chain, 572 focus->bref.modify_tid, 573 idx, focus); 574 } 575 advance_local = 1; 576 advance_xop = 1; 577 } 578 579 /* 580 * We cannot recurse depth-first because the XOP is still 581 * running in node threads for this scan. Create a placemarker 582 * by obtaining and record the hammer2_inode. 583 * 584 * We excluded our node from the XOP so we must temporarily 585 * add it to xop->head.cluster so it is properly incorporated 586 * into the inode. 587 * 588 * The deferral is pushed onto a LIFO list for bottom-up 589 * synchronization. 590 */ 591 if (merror == 0 && dodefer) { 592 hammer2_inode_t *nip; 593 hammer2_deferred_ip_t *defer; 594 595 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE); 596 597 defer = kmalloc(sizeof(*defer), M_HAMMER2, 598 M_WAITOK | M_ZERO); 599 KKASSERT(xop->head.cluster.array[idx].chain == NULL); 600 xop->head.cluster.array[idx].flags = 601 HAMMER2_CITEM_INVALID; 602 xop->head.cluster.array[idx].chain = chain; 603 nip = hammer2_inode_get(pmp, ip, 604 &xop->head.cluster, idx); 605 xop->head.cluster.array[idx].chain = NULL; 606 607 hammer2_inode_ref(nip); 608 hammer2_inode_unlock(nip); 609 610 defer->next = list->base; 611 defer->ip = nip; 612 list->base = defer; 613 ++list->count; 614 needrescan = 1; 615 } 616 617 /* 618 * If at least one deferral was added and the deferral 619 * list has grown too large, stop adding more. This 620 * will trigger an HAMMER2_ERROR_EAGAIN return. 621 */ 622 if (needrescan && list->count > 1000) 623 break; 624 625 /* 626 * Advancements for iteration. 627 */ 628 if (advance_xop) { 629 merror = hammer2_xop_collect(&xop->head, 0); 630 } 631 if (advance_local) { 632 chain = hammer2_chain_next(&parent, chain, &key_next, 633 key_next, HAMMER2_KEY_MAX, 634 &serror, 635 HAMMER2_LOOKUP_SHARED | 636 HAMMER2_LOOKUP_NODIRECT | 637 HAMMER2_LOOKUP_NODATA); 638 } 639 } 640 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 641 if (chain) { 642 hammer2_chain_unlock(chain); 643 hammer2_chain_drop(chain); 644 } 645 if (parent) { 646 hammer2_chain_unlock(parent); 647 hammer2_chain_drop(parent); 648 } 649 650 /* 651 * If we added deferrals we want the caller to synchronize them 652 * and then call us again. 653 * 654 * NOTE: In this situation we do not yet want to synchronize our 655 * inode, setting the error code also has that effect. 656 */ 657 if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan) 658 merror = HAMMER2_ERROR_EAGAIN; 659 660 /* 661 * If no error occurred we can synchronize the inode meta-data 662 * and modify_tid. Only limited changes are made to PFSROOTs. 663 * 664 * XXX inode lock was lost 665 */ 666 if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) { 667 hammer2_xop_ipcluster_t *xop2; 668 hammer2_chain_t *focus; 669 670 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 671 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 672 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 673 idx); 674 hammer2_inode_unlock(ip); 675 merror = hammer2_xop_collect(&xop2->head, 0); 676 if (merror == 0) { 677 focus = xop2->head.cluster.focus; 678 if (hammer2_debug & 0x8000) { 679 kprintf("syncthr: update inode %p (%s)\n", 680 focus, 681 (focus ? (char *)focus->data-> 682 ipdata.filename : 683 "?")); 684 } 685 chain = hammer2_inode_chain_and_parent(ip, idx, 686 &parent, 687 HAMMER2_RESOLVE_ALWAYS | 688 HAMMER2_RESOLVE_SHARED); 689 690 KKASSERT(parent != NULL); 691 nerror = hammer2_sync_replace( 692 thr, parent, chain, 693 sync_tid, 694 idx, focus, isroot); 695 hammer2_chain_unlock(chain); 696 hammer2_chain_drop(chain); 697 hammer2_chain_unlock(parent); 698 hammer2_chain_drop(parent); 699 /* XXX */ 700 } 701 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 702 } 703 704 return merror; 705 } 706 707 /* 708 * Create a missing chain by copying the focus from another device. 709 * 710 * On entry *parentp and focus are both locked shared. The chain will be 711 * created and returned in *chainp also locked shared. 712 */ 713 static 714 int 715 hammer2_sync_insert(hammer2_thread_t *thr, 716 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 717 hammer2_tid_t mtid, int idx, hammer2_chain_t *focus) 718 { 719 hammer2_chain_t *chain; 720 hammer2_key_t dummy; 721 int error; 722 723 #if HAMMER2_SYNCHRO_DEBUG 724 if (hammer2_debug & 1) 725 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n", 726 *parentp, 727 (*parentp)->bref.type, 728 (*parentp)->bref.key, 729 idx, 730 focus->bref.type, focus->bref.key, mtid); 731 #endif 732 733 /* 734 * Parent requires an exclusive lock for the insertion. 735 * We must unlock the child to avoid deadlocks while 736 * relocking the parent. 737 */ 738 if (*chainp) { 739 hammer2_chain_unlock(*chainp); 740 hammer2_chain_drop(*chainp); 741 *chainp = NULL; 742 } 743 hammer2_chain_unlock(*parentp); 744 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 745 746 /* 747 * We must reissue the lookup to properly position (*parentp) 748 * for the insertion. 749 */ 750 chain = hammer2_chain_lookup(parentp, &dummy, 751 focus->bref.key, focus->bref.key, 752 &error, 753 HAMMER2_LOOKUP_NODIRECT | 754 HAMMER2_LOOKUP_ALWAYS); 755 KKASSERT(chain == NULL); 756 757 chain = NULL; 758 error = hammer2_chain_create(parentp, &chain, 759 thr->pmp, focus->bref.methods, 760 focus->bref.key, focus->bref.keybits, 761 focus->bref.type, focus->bytes, 762 mtid, 0, 0); 763 if (error == 0) { 764 error = hammer2_chain_modify(chain, mtid, 0, 0); 765 if (error) 766 goto failed; 767 768 /* 769 * Copy focus to new chain 770 */ 771 772 /* type already set */ 773 chain->bref.methods = focus->bref.methods; 774 /* keybits already set */ 775 chain->bref.vradix = focus->bref.vradix; 776 /* mirror_tid set by flush */ 777 KKASSERT(chain->bref.modify_tid == mtid); 778 chain->bref.flags = focus->bref.flags; 779 /* key already present */ 780 /* check code will be recalculated */ 781 782 /* 783 * Copy data body. 784 */ 785 switch(chain->bref.type) { 786 case HAMMER2_BREF_TYPE_INODE: 787 if ((focus->data->ipdata.meta.op_flags & 788 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 789 /* do not copy block table */ 790 bcopy(focus->data, chain->data, 791 offsetof(hammer2_inode_data_t, u)); 792 break; 793 } 794 /* fall through copy whole thing */ 795 case HAMMER2_BREF_TYPE_DATA: 796 bcopy(focus->data, chain->data, chain->bytes); 797 hammer2_chain_setcheck(chain, chain->data); 798 break; 799 case HAMMER2_BREF_TYPE_DIRENT: 800 /* 801 * Directory entries embed data in the blockref. 802 */ 803 if (chain->bytes) { 804 bcopy(focus->data, chain->data, chain->bytes); 805 hammer2_chain_setcheck(chain, chain->data); 806 } else { 807 chain->bref.check = focus->bref.check; 808 } 809 chain->bref.embed = focus->bref.embed; 810 break; 811 default: 812 KKASSERT(0); 813 break; 814 } 815 } 816 817 failed: 818 if (chain) 819 hammer2_chain_unlock(chain); /* unlock, leave ref */ 820 *chainp = chain; /* will be returned locked */ 821 822 /* 823 * Avoid an ordering deadlock when relocking shared. 824 */ 825 hammer2_chain_unlock(*parentp); 826 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 827 HAMMER2_RESOLVE_ALWAYS); 828 if (chain) { 829 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 830 HAMMER2_RESOLVE_ALWAYS); 831 error = chain->error; 832 } 833 834 return error; 835 } 836 837 /* 838 * Destroy an extranious chain. 839 * 840 * Both *parentp and *chainp are locked shared. 841 * 842 * On return, *chainp will be adjusted to point to the next element in the 843 * iteration and locked shared. 844 */ 845 static 846 int 847 hammer2_sync_destroy(hammer2_thread_t *thr, 848 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 849 hammer2_tid_t mtid, int idx) 850 { 851 hammer2_chain_t *chain; 852 hammer2_key_t key_next; 853 hammer2_key_t save_key; 854 int error; 855 856 chain = *chainp; 857 858 #if HAMMER2_SYNCHRO_DEBUG 859 if (hammer2_debug & 1) 860 kprintf("destroy rec %p/%p slave %d %d.%016jx\n", 861 *parentp, chain, 862 idx, chain->bref.type, chain->bref.key); 863 #endif 864 865 save_key = chain->bref.key; 866 if (save_key != HAMMER2_KEY_MAX) 867 ++save_key; 868 869 /* 870 * Try to avoid unnecessary I/O. 871 * 872 * XXX accounting not propagated up properly. We might have to do 873 * a RESOLVE_MAYBE here and pass 0 for the flags. 874 */ 875 hammer2_chain_unlock(chain); /* relock exclusive */ 876 hammer2_chain_unlock(*parentp); 877 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 878 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 879 880 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT); 881 hammer2_chain_unlock(chain); 882 hammer2_chain_drop(chain); 883 chain = NULL; /* safety */ 884 885 hammer2_chain_unlock(*parentp); /* relock shared */ 886 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 887 HAMMER2_RESOLVE_ALWAYS); 888 *chainp = hammer2_chain_lookup(parentp, &key_next, 889 save_key, HAMMER2_KEY_MAX, 890 &error, 891 HAMMER2_LOOKUP_SHARED | 892 HAMMER2_LOOKUP_NODIRECT | 893 HAMMER2_LOOKUP_NODATA); 894 return error; 895 } 896 897 /* 898 * cparent is locked exclusively, with an extra ref, cluster is not locked. 899 * Replace element [i] in the cluster. 900 */ 901 static 902 int 903 hammer2_sync_replace(hammer2_thread_t *thr, 904 hammer2_chain_t *parent, hammer2_chain_t *chain, 905 hammer2_tid_t mtid, int idx, 906 hammer2_chain_t *focus, int isroot) 907 { 908 uint8_t otype; 909 int nradix; 910 int error; 911 912 #if HAMMER2_SYNCHRO_DEBUG 913 if (hammer2_debug & 1) 914 kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n", 915 chain, 916 idx, 917 focus->bref.type, focus->bref.key, mtid); 918 #endif 919 hammer2_chain_unlock(chain); 920 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 921 error = chain->error; 922 if (error == 0) { 923 if (chain->bytes != focus->bytes) { 924 /* XXX what if compressed? */ 925 nradix = hammer2_getradix(chain->bytes); 926 error = hammer2_chain_resize(chain, mtid, 0, nradix, 0); 927 if (error) 928 goto failed; 929 } 930 error = hammer2_chain_modify(chain, mtid, 0, 0); 931 if (error) 932 goto failed; 933 otype = chain->bref.type; 934 chain->bref.type = focus->bref.type; 935 chain->bref.methods = focus->bref.methods; 936 chain->bref.keybits = focus->bref.keybits; 937 chain->bref.vradix = focus->bref.vradix; 938 /* mirror_tid updated by flush */ 939 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid); 940 chain->bref.flags = focus->bref.flags; 941 /* key already present */ 942 /* check code will be recalculated */ 943 944 /* 945 * Copy data body. 946 */ 947 switch(chain->bref.type) { 948 case HAMMER2_BREF_TYPE_INODE: 949 /* 950 * Special case PFSROOTs, only limited changes can 951 * be made since the meta-data contains miscellanious 952 * distinguishing fields. 953 */ 954 if (isroot) { 955 chain->data->ipdata.meta.uflags = 956 focus->data->ipdata.meta.uflags; 957 chain->data->ipdata.meta.rmajor = 958 focus->data->ipdata.meta.rmajor; 959 chain->data->ipdata.meta.rminor = 960 focus->data->ipdata.meta.rminor; 961 chain->data->ipdata.meta.ctime = 962 focus->data->ipdata.meta.ctime; 963 chain->data->ipdata.meta.mtime = 964 focus->data->ipdata.meta.mtime; 965 chain->data->ipdata.meta.atime = 966 focus->data->ipdata.meta.atime; 967 /* not btime */ 968 chain->data->ipdata.meta.uid = 969 focus->data->ipdata.meta.uid; 970 chain->data->ipdata.meta.gid = 971 focus->data->ipdata.meta.gid; 972 chain->data->ipdata.meta.mode = 973 focus->data->ipdata.meta.mode; 974 chain->data->ipdata.meta.ncopies = 975 focus->data->ipdata.meta.ncopies; 976 chain->data->ipdata.meta.comp_algo = 977 focus->data->ipdata.meta.comp_algo; 978 chain->data->ipdata.meta.check_algo = 979 focus->data->ipdata.meta.check_algo; 980 chain->data->ipdata.meta.data_quota = 981 focus->data->ipdata.meta.data_quota; 982 chain->data->ipdata.meta.inode_quota = 983 focus->data->ipdata.meta.inode_quota; 984 985 /* 986 * last snapshot tid controls overwrite 987 */ 988 if (chain->data->ipdata.meta.pfs_lsnap_tid < 989 focus->data->ipdata.meta.pfs_lsnap_tid) { 990 chain->data->ipdata.meta.pfs_lsnap_tid = 991 focus->data->ipdata.meta.pfs_lsnap_tid; 992 } 993 994 hammer2_chain_setcheck(chain, chain->data); 995 break; 996 } 997 998 /* 999 * Normal replacement. 1000 */ 1001 if ((focus->data->ipdata.meta.op_flags & 1002 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1003 /* 1004 * If DIRECTDATA is transitioning to 0 or the 1005 * old chain is not an inode we have to 1006 * initialize the block table. 1007 */ 1008 if (otype != HAMMER2_BREF_TYPE_INODE || 1009 (chain->data->ipdata.meta.op_flags & 1010 HAMMER2_OPFLAG_DIRECTDATA)) { 1011 kprintf("chain inode trans " 1012 "away from dd\n"); 1013 bzero(&chain->data->ipdata.u, 1014 sizeof(chain->data->ipdata.u)); 1015 } 1016 bcopy(focus->data, chain->data, 1017 offsetof(hammer2_inode_data_t, u)); 1018 /* XXX setcheck on inode should not be needed */ 1019 hammer2_chain_setcheck(chain, chain->data); 1020 break; 1021 } 1022 /* fall through */ 1023 case HAMMER2_BREF_TYPE_DATA: 1024 bcopy(focus->data, chain->data, chain->bytes); 1025 hammer2_chain_setcheck(chain, chain->data); 1026 break; 1027 case HAMMER2_BREF_TYPE_DIRENT: 1028 /* 1029 * Directory entries embed data in the blockref. 1030 */ 1031 if (chain->bytes) { 1032 bcopy(focus->data, chain->data, chain->bytes); 1033 hammer2_chain_setcheck(chain, chain->data); 1034 } else { 1035 chain->bref.check = focus->bref.check; 1036 } 1037 chain->bref.embed = focus->bref.embed; 1038 break; 1039 default: 1040 KKASSERT(0); 1041 break; 1042 } 1043 } 1044 1045 failed: 1046 hammer2_chain_unlock(chain); 1047 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 1048 HAMMER2_RESOLVE_MAYBE); 1049 1050 return error; 1051 } 1052