1 /* 2 * Copyright (c) 2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module implements the cluster synchronizer. Basically the way 36 * it works is that a thread is created for each cluster node in a PFS. 
 * This thread is responsible for synchronizing the current node using
 * data from other nodes.
 *
 * Any out of sync master or slave can get back into synchronization as
 * long as a quorum of masters agree on the update_tid.  If a quorum is
 * not available it may still be possible to synchronize to the highest
 * available update_tid as a way of trying to catch up as much as possible
 * until a quorum is available.
 *
 * If no quorum is possible (which can happen even if all masters are
 * available, if the update_tid does not match), then manual intervention
 * may be required to resolve discrepancies.
 */
#include "hammer2.h"

/*
 * One entry on a sync thread's deferral list.  Holds a referenced inode
 * whose recursive synchronization was deferred for bottom-up processing.
 */
typedef struct hammer2_deferred_ip {
	struct hammer2_deferred_ip *next;	/* LIFO linkage */
	hammer2_inode_t	*ip;			/* referenced inode */
} hammer2_deferred_ip_t;

/*
 * LIFO list of deferred inodes accumulated during a synchronization scan.
 */
typedef struct hammer2_deferred_list {
	hammer2_deferred_ip_t	*base;		/* top of LIFO */
	int			count;		/* number of entries */
} hammer2_deferred_list_t;


#define HAMMER2_SYNCHRO_DEBUG	1

static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
				hammer2_deferred_list_t *list, int isroot);
#if 0
static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
nerror = hammer2_sync_insert(
		thr, &parent, &chain,
		focus->bref.modify_tid,
		idx, focus);
#endif
static int hammer2_sync_insert(hammer2_thread_t *thr,
			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
			hammer2_tid_t modify_tid, int idx,
			hammer2_chain_t *focus);
static int hammer2_sync_destroy(hammer2_thread_t *thr,
			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
			hammer2_tid_t mtid, int idx);
static int hammer2_sync_replace(hammer2_thread_t *thr,
			hammer2_chain_t *parent, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int idx,
			hammer2_chain_t *focus, int isroot);

/****************************************************************************
 *			    HAMMER2 SYNC THREADS			    *
 ****************************************************************************/
/*
 * Primary management thread for an element of a node.  A thread will exist
 * for each element requiring management.
 *
 * No management threads are needed for the SPMP or for any PMP with only
 * a single MASTER.
 *
 * On the SPMP - handles bulkfree and dedup operations
 * On a PFS    - handles remastering and synchronization
 */
void
hammer2_primary_sync_thread(void *arg)
{
	hammer2_thread_t *thr = arg;
	hammer2_pfs_t *pmp;
	hammer2_deferred_list_t list;	/* LIFO of deferred inodes */
	hammer2_deferred_ip_t *defer;
	int error;
	uint32_t flags;
	uint32_t nflags;

	pmp = thr->pmp;
	bzero(&list, sizeof(list));

	for (;;) {
		/*
		 * Snapshot the control flags.  State transitions below are
		 * performed with atomic_cmpset_int() against this snapshot
		 * and simply retry (continue) if the cmpset races.
		 */
		flags = thr->flags;
		cpu_ccfence();

		/*
		 * Handle stop request
		 */
		if (flags & HAMMER2_THREAD_STOP)
			break;

		/*
		 * Handle freeze request.  Set FROZEN, clear CLIENTWAIT,
		 * and wake any client waiting on the transition.
		 */
		if (flags & HAMMER2_THREAD_FREEZE) {
			nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
					    HAMMER2_THREAD_CLIENTWAIT)) |
				 HAMMER2_THREAD_FROZEN;
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		if (flags & HAMMER2_THREAD_UNFREEZE) {
			nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
					   HAMMER2_THREAD_FROZEN |
					   HAMMER2_THREAD_CLIENTWAIT);
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		/*
		 * Force idle if frozen until unfrozen or stopped.  WAITING
		 * advertises to wakers that we are blocked on &thr->flags;
		 * the tsleep_interlock/cmpset pairing avoids a lost wakeup.
		 */
		if (flags & HAMMER2_THREAD_FROZEN) {
			nflags = flags | HAMMER2_THREAD_WAITING;
			tsleep_interlock(&thr->flags, 0);
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
				atomic_clear_int(&thr->flags,
						 HAMMER2_THREAD_WAITING);
			}
			continue;
		}

		/*
		 * Reset state on REMASTER request
		 *
		 * NOTE(review): this tests thr->flags directly rather than
		 *		 the cached 'flags' snapshot used everywhere
		 *		 else; presumably to notice a late REMASTER,
		 *		 but the cmpset still uses the snapshot -
		 *		 confirm intent.
		 */
		if (thr->flags & HAMMER2_THREAD_REMASTER) {
			nflags = flags & ~HAMMER2_THREAD_REMASTER;
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				/* reset state here */
			}
			continue;
		}

		/*
		 * Synchronization scan.
		 */
		if (hammer2_debug & 0x8000)
			kprintf("sync_slaves pfs %s clindex %d\n",
				pmp->pfs_names[thr->clindex], thr->clindex);
		hammer2_trans_init(pmp, 0);

		hammer2_inode_ref(pmp->iroot);

		for (;;) {
			int didbreak = 0;
			/* XXX lock synchronize pmp->modify_tid */
			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
			if (hammer2_debug & 0x8000) {
				kprintf("sync_slaves error %d defer %p\n",
					error, list.base);
			}
			if (error != EAGAIN)
				break;
			/*
			 * EAGAIN means deferrals were queued; process the
			 * LIFO bottom-up and retry the root scan.
			 */
			while ((defer = list.base) != NULL) {
				hammer2_inode_t *nip;

				nip = defer->ip;
				error = hammer2_sync_slaves(thr, nip, &list, 0);
				if (error && error != EAGAIN && error != ENOENT)
					break;
				if (hammer2_thr_break(thr)) {
					didbreak = 1;
					break;
				}

				/*
				 * If no additional defers occurred we can
				 * remove this one, otherwise keep it on
				 * the list and retry once the additional
				 * defers have completed.
				 */
				if (defer == list.base) {
					--list.count;
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
					defer = NULL;	/* safety */
					hammer2_inode_drop(nip);
				}
			}

			/*
			 * If the thread is being remastered, frozen, or
			 * stopped, clean up any left-over deferals.
			 */
			if (didbreak || (error && error != EAGAIN)) {
				kprintf("didbreak\n");
				while ((defer = list.base) != NULL) {
					--list.count;
					hammer2_inode_drop(defer->ip);
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
				}
				if (error == 0 || error == EAGAIN)
					error = EINPROGRESS;
				break;
			}
		}

		hammer2_inode_drop(pmp->iroot);
		hammer2_trans_done(pmp);

		if (error && error != EINPROGRESS)
			kprintf("hammer2_sync_slaves: error %d\n", error);

		/*
		 * Wait for event, or 5-second poll.
		 */
		nflags = flags | HAMMER2_THREAD_WAITING;
		tsleep_interlock(&thr->flags, 0);
		if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
			tsleep(&thr->flags, 0, "h2idle", hz * 5);
			atomic_clear_int(&thr->flags, HAMMER2_THREAD_WAITING);
		}
	}
	thr->td = NULL;
	hammer2_thr_return(thr, HAMMER2_THREAD_STOPPED);
	/* thr structure can go invalid after this point */
	wakeup(thr);
}

#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;

	/* only the cluster summary bits participate; skip if unchanged */
	flags &= HAMMER2_CLUSTER_ZFLAGS;
	if (pmp->cluster_flags == flags)
		return;
	pmp->cluster_flags = flags;

	kprintf("pfs %p", pmp);
	if (flags & HAMMER2_CLUSTER_MSYNCED)
		kprintf(" masters-all-good");
	if (flags & HAMMER2_CLUSTER_SSYNCED)
		kprintf(" slaves-all-good");

	if (flags & HAMMER2_CLUSTER_WRHARD)
		kprintf(" quorum/rw");
	else if (flags & HAMMER2_CLUSTER_RDHARD)
		kprintf(" quorum/ro");

	if (flags & HAMMER2_CLUSTER_UNHARD)
		kprintf(" out-of-sync-masters");
	else if (flags & HAMMER2_CLUSTER_NOHARD)
		kprintf(" no-masters-visible");

	if (flags & HAMMER2_CLUSTER_WRSOFT)
		kprintf(" soft/rw");
	else if (flags & HAMMER2_CLUSTER_RDSOFT)
		kprintf(" soft/ro");

	if (flags & HAMMER2_CLUSTER_UNSOFT)
		kprintf(" out-of-sync-slaves");
	else if (flags & HAMMER2_CLUSTER_NOSOFT)
		kprintf(" no-slaves-visible");
	kprintf("\n");
}
#endif

#if 0
/*
 * Debug helper: dump cparent/cluster chains side-by-side for each cluster
 * index, flagging CITEM_INVALID entries with "(I)".
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		if (i)
			kprintf("\t");
		kprintf("%d ", i);
		if ((chain = cparent->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cparent->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : " ")
			);
		} else {
			kprintf(" NULL %s ", " ");
		}
		if ((chain = cluster->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cluster->array[i].flags &
				  HAMMER2_CITEM_INVALID) ?
"(I)" : " ") 339 ); 340 } else { 341 kprintf(" NULL %s ", " "); 342 } 343 kprintf("\n"); 344 } 345 } 346 #endif 347 348 /* 349 * Each out of sync node sync-thread must issue an all-nodes XOP scan of 350 * the inode. This creates a multiplication effect since the XOP scan itself 351 * issues to all nodes. However, this is the only way we can safely 352 * synchronize nodes which might have disparate I/O bandwidths and the only 353 * way we can safely deal with stalled nodes. 354 */ 355 static 356 int 357 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 358 hammer2_deferred_list_t *list, int isroot) 359 { 360 hammer2_xop_scanall_t *xop; 361 hammer2_chain_t *parent; 362 hammer2_chain_t *chain; 363 hammer2_pfs_t *pmp; 364 hammer2_key_t key_next; 365 hammer2_tid_t sync_tid; 366 int cache_index = -1; 367 int needrescan; 368 int want_update; 369 int error; 370 int nerror; 371 int idx; 372 int n; 373 374 pmp = ip->pmp; 375 idx = thr->clindex; /* cluster node we are responsible for */ 376 needrescan = 0; 377 want_update = 0; 378 sync_tid = 0; 379 chain = NULL; 380 parent = NULL; 381 382 #if 0 383 /* 384 * Nothing to do if all slaves are synchronized. 385 * Nothing to do if cluster not authoritatively readable. 386 */ 387 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED) 388 return(0); 389 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0) 390 return(HAMMER2_ERROR_INCOMPLETE); 391 #endif 392 393 error = 0; 394 395 /* 396 * Resolve the root inode of the PFS and determine if synchronization 397 * is needed by checking modify_tid. 
398 */ 399 { 400 hammer2_xop_ipcluster_t *xop2; 401 hammer2_chain_t *focus; 402 403 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 404 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 405 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 406 idx); 407 hammer2_inode_unlock(ip); 408 error = hammer2_xop_collect(&xop2->head, 0); 409 if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) { 410 sync_tid = focus->bref.modify_tid; /* XXX */ 411 chain = hammer2_inode_chain_and_parent(ip, idx, 412 &parent, 413 HAMMER2_RESOLVE_ALWAYS | 414 HAMMER2_RESOLVE_SHARED); 415 want_update = (chain->bref.modify_tid != sync_tid); 416 if (chain) { 417 hammer2_chain_unlock(chain); 418 hammer2_chain_drop(chain); 419 chain = NULL; 420 } 421 if (parent) { 422 hammer2_chain_unlock(parent); 423 hammer2_chain_drop(parent); 424 parent = NULL; 425 } 426 } 427 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 428 } 429 430 if (want_update == 0) 431 return(0); 432 433 /* 434 * The inode is left unlocked during the scan. Issue a XOP 435 * that does *not* include our cluster index to iterate 436 * properly synchronized elements and resolve our cluster index 437 * against it. 
438 */ 439 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 440 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 441 xop->key_beg = HAMMER2_KEY_MIN; 442 xop->key_end = HAMMER2_KEY_MAX; 443 xop->resolve_flags = HAMMER2_RESOLVE_SHARED | 444 HAMMER2_RESOLVE_ALWAYS; 445 xop->lookup_flags = HAMMER2_LOOKUP_SHARED | 446 HAMMER2_LOOKUP_NODIRECT | 447 HAMMER2_LOOKUP_ALWAYS; 448 hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx); 449 parent = hammer2_inode_chain(ip, idx, 450 HAMMER2_RESOLVE_ALWAYS | 451 HAMMER2_RESOLVE_SHARED); 452 hammer2_inode_unlock(ip); 453 454 chain = hammer2_chain_lookup(&parent, &key_next, 455 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 456 &cache_index, 457 HAMMER2_LOOKUP_SHARED | 458 HAMMER2_LOOKUP_NODIRECT | 459 HAMMER2_LOOKUP_NODATA); 460 error = hammer2_xop_collect(&xop->head, 0); 461 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n", 462 ip->meta.name_key, chain, 463 (chain ? chain->bref.key : -1)); 464 465 for (;;) { 466 /* 467 * We are done if our scan is done and the XOP scan is done. 468 * We are done if the XOP scan failed (that is, we don't 469 * have authoritative data to synchronize with). 470 */ 471 int advance_local = 0; 472 int advance_xop = 0; 473 int dodefer = 0; 474 hammer2_chain_t *focus; 475 476 if (chain == NULL && error == ENOENT) 477 break; 478 if (error && error != ENOENT) 479 break; 480 481 /* 482 * Compare 483 */ 484 if (chain && error == ENOENT) { 485 /* 486 * If we have local chains but the XOP scan is done, 487 * the chains need to be deleted. 488 */ 489 n = -1; 490 focus = NULL; 491 } else if (chain == NULL) { 492 /* 493 * If our local scan is done but the XOP scan is not, 494 * we need to create the missing chain(s). 495 */ 496 n = 1; 497 focus = xop->head.cluster.focus; 498 } else { 499 /* 500 * Otherwise compare to determine the action 501 * needed. 502 */ 503 focus = xop->head.cluster.focus; 504 n = hammer2_chain_cmp(chain, focus); 505 } 506 507 /* 508 * Take action based on comparison results. 
509 */ 510 if (n < 0) { 511 /* 512 * Delete extranious local data. This will 513 * automatically advance the chain. 514 */ 515 nerror = hammer2_sync_destroy(thr, &parent, &chain, 516 0, idx); 517 } else if (n == 0 && chain->bref.modify_tid != 518 focus->bref.modify_tid) { 519 /* 520 * Matching key but local data or meta-data requires 521 * updating. If we will recurse, we still need to 522 * update to compatible content first but we do not 523 * synchronize modify_tid until the entire recursion 524 * has completed successfully. 525 * 526 * NOTE: Do not try to access hardlink pointers as if 527 * they were normal inodes, the inode cache will 528 * get seriously confused. 529 */ 530 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE && 531 focus->data->ipdata.meta.type != 532 HAMMER2_OBJTYPE_HARDLINK) { 533 nerror = hammer2_sync_replace( 534 thr, parent, chain, 535 0, 536 idx, focus, 0); 537 dodefer = 1; 538 } else { 539 nerror = hammer2_sync_replace( 540 thr, parent, chain, 541 focus->bref.modify_tid, 542 idx, focus, 0); 543 } 544 advance_local = 1; 545 advance_xop = 1; 546 } else if (n == 0) { 547 /* 548 * 100% match, advance both 549 */ 550 advance_local = 1; 551 advance_xop = 1; 552 nerror = 0; 553 } else if (n > 0) { 554 /* 555 * Insert missing local data. 556 * 557 * If we will recurse, we still need to update to 558 * compatible content first but we do not synchronize 559 * modify_tid until the entire recursion has 560 * completed successfully. 561 * 562 * NOTE: Do not try to access hardlink pointers as if 563 * they were normal inodes, the inode cache will 564 * get seriously confused. 
565 */ 566 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE && 567 focus->data->ipdata.meta.type != 568 HAMMER2_OBJTYPE_HARDLINK) { 569 nerror = hammer2_sync_insert( 570 thr, &parent, &chain, 571 0, 572 idx, focus); 573 dodefer = 2; 574 } else { 575 nerror = hammer2_sync_insert( 576 thr, &parent, &chain, 577 focus->bref.modify_tid, 578 idx, focus); 579 } 580 advance_local = 1; 581 advance_xop = 1; 582 } 583 584 /* 585 * We cannot recurse depth-first because the XOP is still 586 * running in node threads for this scan. Create a placemarker 587 * by obtaining and record the hammer2_inode. 588 * 589 * We excluded our node from the XOP so we must temporarily 590 * add it to xop->head.cluster so it is properly incorporated 591 * into the inode. 592 * 593 * The deferral is pushed onto a LIFO list for bottom-up 594 * synchronization. 595 */ 596 if (error == 0 && dodefer) { 597 hammer2_inode_t *nip; 598 hammer2_deferred_ip_t *defer; 599 600 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE); 601 602 defer = kmalloc(sizeof(*defer), M_HAMMER2, 603 M_WAITOK | M_ZERO); 604 KKASSERT(xop->head.cluster.array[idx].chain == NULL); 605 xop->head.cluster.array[idx].flags = 606 HAMMER2_CITEM_INVALID; 607 xop->head.cluster.array[idx].chain = chain; 608 nip = hammer2_inode_get(pmp, ip, 609 &xop->head.cluster, idx); 610 xop->head.cluster.array[idx].chain = NULL; 611 612 hammer2_inode_ref(nip); 613 hammer2_inode_unlock(nip); 614 615 defer->next = list->base; 616 defer->ip = nip; 617 list->base = defer; 618 ++list->count; 619 needrescan = 1; 620 } 621 622 /* 623 * If at least one deferral was added and the deferral 624 * list has grown too large, stop adding more. This 625 * will trigger an EAGAIN return. 626 */ 627 if (needrescan && list->count > 1000) 628 break; 629 630 /* 631 * Advancements for iteration. 
632 */ 633 if (advance_xop) { 634 error = hammer2_xop_collect(&xop->head, 0); 635 } 636 if (advance_local) { 637 chain = hammer2_chain_next(&parent, chain, &key_next, 638 key_next, HAMMER2_KEY_MAX, 639 &cache_index, 640 HAMMER2_LOOKUP_SHARED | 641 HAMMER2_LOOKUP_NODIRECT | 642 HAMMER2_LOOKUP_NODATA); 643 } 644 } 645 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 646 if (chain) { 647 hammer2_chain_unlock(chain); 648 hammer2_chain_drop(chain); 649 } 650 if (parent) { 651 hammer2_chain_unlock(parent); 652 hammer2_chain_drop(parent); 653 } 654 655 /* 656 * If we added deferrals we want the caller to synchronize them 657 * and then call us again. 658 * 659 * NOTE: In this situation we do not yet want to synchronize our 660 * inode, setting the error code also has that effect. 661 */ 662 if ((error == 0 || error == ENOENT) && needrescan) 663 error = EAGAIN; 664 665 /* 666 * If no error occurred we can synchronize the inode meta-data 667 * and modify_tid. Only limited changes are made to PFSROOTs. 668 * 669 * XXX inode lock was lost 670 */ 671 if (error == 0 || error == ENOENT) { 672 hammer2_xop_ipcluster_t *xop2; 673 hammer2_chain_t *focus; 674 675 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 676 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 677 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster, 678 idx); 679 hammer2_inode_unlock(ip); 680 error = hammer2_xop_collect(&xop2->head, 0); 681 if (error == 0) { 682 focus = xop2->head.cluster.focus; 683 kprintf("syncthr: update inode %p (%s)\n", 684 focus, 685 (focus ? 
686 (char *)focus->data->ipdata.filename : "?")); 687 chain = hammer2_inode_chain_and_parent(ip, idx, 688 &parent, 689 HAMMER2_RESOLVE_ALWAYS | 690 HAMMER2_RESOLVE_SHARED); 691 692 KKASSERT(parent != NULL); 693 nerror = hammer2_sync_replace( 694 thr, parent, chain, 695 sync_tid, 696 idx, focus, isroot); 697 hammer2_chain_unlock(chain); 698 hammer2_chain_drop(chain); 699 hammer2_chain_unlock(parent); 700 hammer2_chain_drop(parent); 701 /* XXX */ 702 } 703 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 704 } 705 706 return error; 707 } 708 709 /* 710 * Create a missing chain by copying the focus from another device. 711 * 712 * On entry *parentp and focus are both locked shared. The chain will be 713 * created and returned in *chainp also locked shared. 714 */ 715 static 716 int 717 hammer2_sync_insert(hammer2_thread_t *thr, 718 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 719 hammer2_tid_t mtid, int idx, hammer2_chain_t *focus) 720 { 721 hammer2_chain_t *chain; 722 hammer2_key_t dummy; 723 int cache_index = -1; 724 725 #if HAMMER2_SYNCHRO_DEBUG 726 if (hammer2_debug & 1) 727 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n", 728 *parentp, 729 (*parentp)->bref.type, 730 (*parentp)->bref.key, 731 idx, 732 focus->bref.type, focus->bref.key, mtid); 733 #endif 734 735 /* 736 * Parent requires an exclusive lock for the insertion. 737 * We must unlock the child to avoid deadlocks while 738 * relocking the parent. 739 */ 740 if (*chainp) { 741 hammer2_chain_unlock(*chainp); 742 hammer2_chain_drop(*chainp); 743 *chainp = NULL; 744 } 745 hammer2_chain_unlock(*parentp); 746 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 747 748 /* 749 * We must reissue the lookup to properly position (*parentp) 750 * for the insertion. 
751 */ 752 chain = hammer2_chain_lookup(parentp, &dummy, 753 focus->bref.key, focus->bref.key, 754 &cache_index, 755 HAMMER2_LOOKUP_NODIRECT | 756 HAMMER2_LOOKUP_ALWAYS); 757 KKASSERT(chain == NULL); 758 759 chain = NULL; 760 hammer2_chain_create(parentp, &chain, 761 thr->pmp, focus->bref.methods, 762 focus->bref.key, focus->bref.keybits, 763 focus->bref.type, focus->bytes, 764 mtid, 0, 0); 765 hammer2_chain_modify(chain, mtid, 0, 0); 766 767 /* 768 * Copy focus to new chain 769 */ 770 771 /* type already set */ 772 chain->bref.methods = focus->bref.methods; 773 /* keybits already set */ 774 chain->bref.vradix = focus->bref.vradix; 775 /* mirror_tid set by flush */ 776 KKASSERT(chain->bref.modify_tid == mtid); 777 chain->bref.flags = focus->bref.flags; 778 /* key already present */ 779 /* check code will be recalculated */ 780 781 /* 782 * Copy data body. 783 */ 784 switch(chain->bref.type) { 785 case HAMMER2_BREF_TYPE_INODE: 786 if ((focus->data->ipdata.meta.op_flags & 787 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 788 /* do not copy block table */ 789 bcopy(focus->data, chain->data, 790 offsetof(hammer2_inode_data_t, u)); 791 break; 792 } 793 /* fall through copy whole thing */ 794 case HAMMER2_BREF_TYPE_DATA: 795 bcopy(focus->data, chain->data, chain->bytes); 796 hammer2_chain_setcheck(chain, chain->data); 797 break; 798 default: 799 KKASSERT(0); 800 break; 801 } 802 803 hammer2_chain_unlock(chain); /* unlock, leave ref */ 804 *chainp = chain; /* will be returned locked */ 805 806 /* 807 * Avoid ordering deadlock when relocking shared. 808 */ 809 hammer2_chain_unlock(*parentp); 810 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 811 HAMMER2_RESOLVE_ALWAYS); 812 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 813 HAMMER2_RESOLVE_ALWAYS); 814 815 return 0; 816 } 817 818 /* 819 * Destroy an extranious chain. 820 * 821 * Both *parentp and *chainp are locked shared. 
822 * 823 * On return, *chainp will be adjusted to point to the next element in the 824 * iteration and locked shared. 825 */ 826 static 827 int 828 hammer2_sync_destroy(hammer2_thread_t *thr, 829 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 830 hammer2_tid_t mtid, int idx) 831 { 832 hammer2_chain_t *chain; 833 hammer2_chain_t *parent; 834 hammer2_key_t key_next; 835 hammer2_key_t save_key; 836 int cache_index = -1; 837 838 chain = *chainp; 839 840 #if HAMMER2_SYNCHRO_DEBUG 841 if (hammer2_debug & 1) 842 kprintf("destroy rec %p/%p slave %d %d.%016jx\n", 843 *parentp, chain, 844 idx, chain->bref.type, chain->bref.key); 845 #endif 846 847 save_key = chain->bref.key; 848 if (save_key != HAMMER2_KEY_MAX) 849 ++save_key; 850 851 /* 852 * Try to avoid unnecessary I/O. 853 * 854 * XXX accounting not propagated up properly. We might have to do 855 * a RESOLVE_MAYBE here and pass 0 for the flags. 856 */ 857 hammer2_chain_unlock(chain); /* relock exclusive */ 858 hammer2_chain_unlock(*parentp); 859 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 860 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 861 862 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT); 863 hammer2_chain_unlock(chain); 864 hammer2_chain_drop(chain); 865 chain = NULL; /* safety */ 866 867 hammer2_chain_unlock(*parentp); /* relock shared */ 868 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 869 HAMMER2_RESOLVE_ALWAYS); 870 *chainp = hammer2_chain_lookup(&parent, &key_next, 871 save_key, HAMMER2_KEY_MAX, 872 &cache_index, 873 HAMMER2_LOOKUP_SHARED | 874 HAMMER2_LOOKUP_NODIRECT | 875 HAMMER2_LOOKUP_NODATA); 876 return 0; 877 } 878 879 /* 880 * cparent is locked exclusively, with an extra ref, cluster is not locked. 881 * Replace element [i] in the cluster. 
 */
static
int
hammer2_sync_replace(hammer2_thread_t *thr,
		     hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int idx,
		     hammer2_chain_t *focus, int isroot)
{
	int nradix;
	uint8_t otype;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
		kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
			chain,
			idx,
			focus->bref.type, focus->bref.key, mtid);
#endif
	/* relock the target chain exclusively for modification */
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
	if (chain->bytes != focus->bytes) {
		/* XXX what if compressed? */
		/*
		 * NOTE(review): nradix is derived from chain->bytes, which
		 *		 leaves the size unchanged even though the
		 *		 sizes differ; presumably this should use
		 *		 focus->bytes - confirm.
		 */
		nradix = hammer2_getradix(chain->bytes);
		hammer2_chain_resize(NULL, parent, chain,
				     mtid, 0,
				     nradix, 0);
	}
	hammer2_chain_modify(chain, mtid, 0, 0);
	otype = chain->bref.type;
	chain->bref.type = focus->bref.type;
	chain->bref.methods = focus->bref.methods;
	chain->bref.keybits = focus->bref.keybits;
	chain->bref.vradix = focus->bref.vradix;
	/* mirror_tid updated by flush */
	KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
	chain->bref.flags = focus->bref.flags;
	/* key already present */
	/* check code will be recalculated */
	chain->error = 0;

	/*
	 * Copy data body.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special case PFSROOTs, only limited changes can be made
		 * since the meta-data contains miscellanious distinguishing
		 * fields.
		 */
		if (isroot) {
			chain->data->ipdata.meta.uflags =
				focus->data->ipdata.meta.uflags;
			chain->data->ipdata.meta.rmajor =
				focus->data->ipdata.meta.rmajor;
			chain->data->ipdata.meta.rminor =
				focus->data->ipdata.meta.rminor;
			chain->data->ipdata.meta.ctime =
				focus->data->ipdata.meta.ctime;
			chain->data->ipdata.meta.mtime =
				focus->data->ipdata.meta.mtime;
			chain->data->ipdata.meta.atime =
				focus->data->ipdata.meta.atime;
			/* not btime */
			chain->data->ipdata.meta.uid =
				focus->data->ipdata.meta.uid;
			chain->data->ipdata.meta.gid =
				focus->data->ipdata.meta.gid;
			chain->data->ipdata.meta.mode =
				focus->data->ipdata.meta.mode;
			chain->data->ipdata.meta.ncopies =
				focus->data->ipdata.meta.ncopies;
			chain->data->ipdata.meta.comp_algo =
				focus->data->ipdata.meta.comp_algo;
			chain->data->ipdata.meta.check_algo =
				focus->data->ipdata.meta.check_algo;
			chain->data->ipdata.meta.data_quota =
				focus->data->ipdata.meta.data_quota;
			chain->data->ipdata.meta.inode_quota =
				focus->data->ipdata.meta.inode_quota;

			/*
			 * last snapshot tid controls overwrite
			 */
			if (chain->data->ipdata.meta.pfs_lsnap_tid <
			    focus->data->ipdata.meta.pfs_lsnap_tid) {
				chain->data->ipdata.meta.pfs_lsnap_tid =
					focus->data->ipdata.meta.pfs_lsnap_tid;
			}

			hammer2_chain_setcheck(chain, chain->data);
			break;
		}

		/*
		 * Normal replacement.
		 */
		if ((focus->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			/*
			 * If DIRECTDATA is transitioning to 0 or the old
			 * chain is not an inode we have to initialize
			 * the block table.
			 */
			if (otype != HAMMER2_BREF_TYPE_INODE ||
			    (chain->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA)) {
				kprintf("chain inode trans away from dd\n");
				bzero(&chain->data->ipdata.u,
				      sizeof(chain->data->ipdata.u));
			}
			bcopy(focus->data, chain->data,
			      offsetof(hammer2_inode_data_t, u));
			/* XXX setcheck on inode should not be needed */
			hammer2_chain_setcheck(chain, chain->data);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		bcopy(focus->data, chain->data, chain->bytes);
		hammer2_chain_setcheck(chain, chain->data);
		break;
	default:
		KKASSERT(0);
		break;
	}

	/* return the chain relocked shared, as the caller expects */
	hammer2_chain_unlock(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
				  HAMMER2_RESOLVE_MAYBE);

	return 0;
}