1 /* 2 * Copyright (c) 2015-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module implements the cluster synchronizer. Basically the way 36 * it works is that a thread is created for each cluster node in a PFS. 37 * This thread is responsible for synchronizing the current node using 38 * data from other nodes. 39 * 40 * Any out of sync master or slave can get back into synchronization as 41 * long as a quorum of masters agree on the update_tid. If a quorum is 42 * not available it may still be possible to synchronize to the highest 43 * available update_tid as a way of trying to catch up as much as possible 44 * until a quorum is available. 45 * 46 * If no quorum is possible (which can happen even if all masters are 47 * available, if the update_tid does not match), then manual intervention 48 * may be required to resolve discrepancies. 49 */ 50 #include "hammer2.h" 51 52 typedef struct hammer2_deferred_ip { 53 struct hammer2_deferred_ip *next; 54 hammer2_inode_t *ip; 55 } hammer2_deferred_ip_t; 56 57 typedef struct hammer2_deferred_list { 58 hammer2_deferred_ip_t *base; 59 int count; 60 } hammer2_deferred_list_t; 61 62 63 #define HAMMER2_SYNCHRO_DEBUG 1 64 65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 66 hammer2_deferred_list_t *list, int isroot); 67 #if 0 68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags); 69 nerror = hammer2_sync_insert( 70 thr, &parent, &chain, 71 focus->bref.modify_tid, 72 idx, focus); 73 #endif 74 static int hammer2_sync_insert(hammer2_thread_t *thr, 75 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 76 hammer2_tid_t modify_tid, int idx, 77 hammer2_xop_head_t *xop, hammer2_chain_t *focus); 78 static int hammer2_sync_destroy(hammer2_thread_t *thr, 79 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 80 hammer2_tid_t mtid, int idx); 81 static int hammer2_sync_replace(hammer2_thread_t *thr, 82 hammer2_chain_t *parent, hammer2_chain_t *chain, 83 hammer2_tid_t mtid, int idx, 84 hammer2_xop_head_t *xop, hammer2_chain_t *focus, 85 int isroot); 86 87 /**************************************************************************** 88 * HAMMER2 SYNC THREADS * 89 ****************************************************************************/ 90 /* 91 * Primary management thread for an element of a node. A thread will exist 92 * for each element requiring management. 93 * 94 * No management threads are needed for the SPMP or for any PMP with only 95 * a single MASTER. 96 * 97 * On the SPMP - handles bulkfree and dedup operations 98 * On a PFS - handles remastering and synchronization 99 */ 100 void 101 hammer2_primary_sync_thread(void *arg) 102 { 103 hammer2_thread_t *thr = arg; 104 hammer2_pfs_t *pmp; 105 hammer2_deferred_list_t list; 106 hammer2_deferred_ip_t *defer; 107 int error; 108 uint32_t flags; 109 uint32_t nflags; 110 111 pmp = thr->pmp; 112 bzero(&list, sizeof(list)); 113 114 for (;;) { 115 flags = thr->flags; 116 cpu_ccfence(); 117 118 /* 119 * Handle stop request 120 */ 121 if (flags & HAMMER2_THREAD_STOP) 122 break; 123 124 /* 125 * Handle freeze request 126 */ 127 if (flags & HAMMER2_THREAD_FREEZE) { 128 nflags = (flags & ~(HAMMER2_THREAD_FREEZE | 129 HAMMER2_THREAD_WAITING)) | 130 HAMMER2_THREAD_FROZEN; 131 if (!atomic_cmpset_int(&thr->flags, flags, nflags)) 132 continue; 133 if (flags & HAMMER2_THREAD_WAITING) 134 wakeup(&thr->flags); 135 continue; 136 } 137 138 if (flags & HAMMER2_THREAD_UNFREEZE) { 139 nflags = flags & ~(HAMMER2_THREAD_UNFREEZE | 140 HAMMER2_THREAD_FROZEN | 141 HAMMER2_THREAD_WAITING); 142 if (!atomic_cmpset_int(&thr->flags, flags, nflags)) 143 continue; 144 if (flags & HAMMER2_THREAD_WAITING) 145 wakeup(&thr->flags); 146 continue; 147 } 148 149 /* 150 * Force idle if frozen until unfrozen or stopped. 151 */ 152 if (flags & HAMMER2_THREAD_FROZEN) { 153 nflags = flags | HAMMER2_THREAD_WAITING; 154 155 tsleep_interlock(&thr->flags, 0); 156 if (atomic_cmpset_int(&thr->flags, flags, nflags)) 157 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0); 158 continue; 159 } 160 161 /* 162 * Reset state on REMASTER request 163 */ 164 if (thr->flags & HAMMER2_THREAD_REMASTER) { 165 nflags = flags & ~HAMMER2_THREAD_REMASTER; 166 if (atomic_cmpset_int(&thr->flags, flags, nflags)) { 167 /* reset state here */ 168 } 169 continue; 170 } 171 172 /* 173 * Synchronization scan. 174 */ 175 if (hammer2_debug & 0x8000) 176 kprintf("sync_slaves pfs %s clindex %d\n", 177 pmp->pfs_names[thr->clindex], thr->clindex); 178 hammer2_trans_init(pmp, 0); 179 180 hammer2_inode_ref(pmp->iroot); 181 182 for (;;) { 183 int didbreak = 0; 184 /* XXX lock synchronize pmp->modify_tid */ 185 error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1); 186 if (hammer2_debug & 0x8000) { 187 kprintf("sync_slaves error %d defer %p\n", 188 error, list.base); 189 } 190 if (error != HAMMER2_ERROR_EAGAIN) 191 break; 192 while ((defer = list.base) != NULL) { 193 hammer2_inode_t *nip; 194 195 nip = defer->ip; 196 error = hammer2_sync_slaves(thr, nip, &list, 197 (nip == pmp->iroot)); 198 if (error && 199 error != HAMMER2_ERROR_EAGAIN && 200 error != HAMMER2_ERROR_ENOENT) { 201 break; 202 } 203 if (hammer2_thr_break(thr)) { 204 didbreak = 1; 205 break; 206 } 207 208 /* 209 * If no additional defers occurred we can 210 * remove this one, otherwise keep it on 211 * the list and retry once the additional 212 * defers have completed. 213 */ 214 if (defer == list.base) { 215 --list.count; 216 list.base = defer->next; 217 kfree(defer, M_HAMMER2); 218 defer = NULL; /* safety */ 219 hammer2_inode_drop(nip); 220 } 221 } 222 223 /* 224 * If the thread is being remastered, frozen, or 225 * stopped, clean up any left-over deferals. 226 */ 227 if (didbreak || 228 (error && error != HAMMER2_ERROR_EAGAIN)) { 229 kprintf("didbreak\n"); 230 while ((defer = list.base) != NULL) { 231 --list.count; 232 hammer2_inode_drop(defer->ip); 233 list.base = defer->next; 234 kfree(defer, M_HAMMER2); 235 } 236 if (error == 0 || error == HAMMER2_ERROR_EAGAIN) 237 error = HAMMER2_ERROR_EINPROGRESS; 238 break; 239 } 240 } 241 242 hammer2_inode_drop(pmp->iroot); 243 hammer2_trans_done(pmp, 0); 244 245 if (error && error != HAMMER2_ERROR_EINPROGRESS) 246 kprintf("hammer2_sync_slaves: error %d\n", error); 247 248 /* 249 * Wait for event, or 5-second poll. 250 */ 251 nflags = flags | HAMMER2_THREAD_WAITING; 252 tsleep_interlock(&thr->flags, 0); 253 if (atomic_cmpset_int(&thr->flags, flags, nflags)) { 254 tsleep(&thr->flags, 0, "h2idle", hz * 5); 255 } 256 } 257 thr->td = NULL; 258 hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED); 259 /* thr structure can go invalid after this point */ 260 } 261 262 #if 0 263 /* 264 * Given a locked cluster created from pmp->iroot, update the PFS's 265 * reporting status. 266 */ 267 static 268 void 269 hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags) 270 { 271 hammer2_pfs_t *pmp = thr->pmp; 272 273 flags &= HAMMER2_CLUSTER_ZFLAGS; 274 if (pmp->cluster_flags == flags) 275 return; 276 pmp->cluster_flags = flags; 277 278 kprintf("pfs %p", pmp); 279 if (flags & HAMMER2_CLUSTER_MSYNCED) 280 kprintf(" masters-all-good"); 281 if (flags & HAMMER2_CLUSTER_SSYNCED) 282 kprintf(" slaves-all-good"); 283 284 if (flags & HAMMER2_CLUSTER_WRHARD) 285 kprintf(" quorum/rw"); 286 else if (flags & HAMMER2_CLUSTER_RDHARD) 287 kprintf(" quorum/ro"); 288 289 if (flags & HAMMER2_CLUSTER_UNHARD) 290 kprintf(" out-of-sync-masters"); 291 else if (flags & HAMMER2_CLUSTER_NOHARD) 292 kprintf(" no-masters-visible"); 293 294 if (flags & HAMMER2_CLUSTER_WRSOFT) 295 kprintf(" soft/rw"); 296 else if (flags & HAMMER2_CLUSTER_RDSOFT) 297 kprintf(" soft/ro"); 298 299 if (flags & HAMMER2_CLUSTER_UNSOFT) 300 kprintf(" out-of-sync-slaves"); 301 else if (flags & HAMMER2_CLUSTER_NOSOFT) 302 kprintf(" no-slaves-visible"); 303 kprintf("\n"); 304 } 305 #endif 306 307 #if 0 308 static 309 void 310 dumpcluster(const char *label, 311 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster) 312 { 313 hammer2_chain_t *chain; 314 int i; 315 316 if ((hammer2_debug & 1) == 0) 317 return; 318 319 kprintf("%s\t", label); 320 KKASSERT(cparent->nchains == cluster->nchains); 321 for (i = 0; i < cparent->nchains; ++i) { 322 if (i) 323 kprintf("\t"); 324 kprintf("%d ", i); 325 if ((chain = cparent->array[i].chain) != NULL) { 326 kprintf("%016jx%s ", 327 chain->bref.key, 328 ((cparent->array[i].flags & 329 HAMMER2_CITEM_INVALID) ? "(I)" : " ") 330 ); 331 } else { 332 kprintf(" NULL %s ", " "); 333 } 334 if ((chain = cluster->array[i].chain) != NULL) { 335 kprintf("%016jx%s ", 336 chain->bref.key, 337 ((cluster->array[i].flags & 338 HAMMER2_CITEM_INVALID) ? "(I)" : " ") 339 ); 340 } else { 341 kprintf(" NULL %s ", " "); 342 } 343 kprintf("\n"); 344 } 345 } 346 #endif 347 348 /* 349 * Each out of sync node sync-thread must issue an all-nodes XOP scan of 350 * the inode. This creates a multiplication effect since the XOP scan itself 351 * issues to all nodes. However, this is the only way we can safely 352 * synchronize nodes which might have disparate I/O bandwidths and the only 353 * way we can safely deal with stalled nodes. 354 * 355 * XXX serror / merror rollup and handling. 356 */ 357 static 358 int 359 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip, 360 hammer2_deferred_list_t *list, int isroot) 361 { 362 hammer2_xop_scanall_t *xop; 363 hammer2_chain_t *parent; 364 hammer2_chain_t *chain; 365 hammer2_pfs_t *pmp; 366 hammer2_key_t key_next; 367 hammer2_tid_t sync_tid; 368 int needrescan; 369 int want_update; 370 int serror; /* slave error */ 371 int merror; /* master error (from xop_collect) */ 372 int nerror; /* temporary error */ 373 int idx; 374 int n; 375 376 pmp = ip->pmp; 377 idx = thr->clindex; /* cluster node we are responsible for */ 378 needrescan = 0; 379 want_update = 0; 380 sync_tid = 0; 381 chain = NULL; 382 parent = NULL; 383 384 #if 0 385 /* 386 * Nothing to do if all slaves are synchronized. 387 * Nothing to do if cluster not authoritatively readable. 388 */ 389 if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED) 390 return(0); 391 if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0) 392 return(HAMMER2_ERROR_INCOMPLETE); 393 #endif 394 395 merror = 0; 396 397 /* 398 * Resolve the root inode of the PFS and determine if synchronization 399 * is needed by checking modify_tid. 400 * 401 * Retain the synchronization TID from the focus inode and use it 402 * later to synchronize the focus inode if/when the recursion 403 * succeeds. 404 */ 405 { 406 hammer2_xop_ipcluster_t *xop2; 407 hammer2_chain_t *focus; 408 409 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 410 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 411 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc, 412 idx); 413 hammer2_inode_unlock(ip); 414 merror = hammer2_xop_collect(&xop2->head, 0); 415 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) { 416 sync_tid = focus->bref.modify_tid; 417 chain = hammer2_inode_chain_and_parent(ip, idx, 418 &parent, 419 HAMMER2_RESOLVE_ALWAYS | 420 HAMMER2_RESOLVE_SHARED); 421 want_update = (chain->bref.modify_tid != sync_tid); 422 if (chain) { 423 hammer2_chain_unlock(chain); 424 hammer2_chain_drop(chain); 425 chain = NULL; 426 } 427 if (parent) { 428 hammer2_chain_unlock(parent); 429 hammer2_chain_drop(parent); 430 parent = NULL; 431 } 432 } 433 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 434 } 435 436 if (want_update == 0) 437 return(0); 438 439 /* 440 * The inode is left unlocked during the scan. Issue a XOP 441 * that does *not* include our cluster index to iterate 442 * properly synchronized elements and resolve our cluster index 443 * against it. 444 */ 445 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 446 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 447 xop->key_beg = HAMMER2_KEY_MIN; 448 xop->key_end = HAMMER2_KEY_MAX; 449 xop->resolve_flags = HAMMER2_RESOLVE_SHARED | 450 HAMMER2_RESOLVE_ALWAYS; 451 xop->lookup_flags = HAMMER2_LOOKUP_SHARED | 452 HAMMER2_LOOKUP_NODIRECT | 453 HAMMER2_LOOKUP_ALWAYS; 454 hammer2_xop_start_except(&xop->head, &hammer2_scanall_desc, idx); 455 parent = hammer2_inode_chain(ip, idx, 456 HAMMER2_RESOLVE_ALWAYS | 457 HAMMER2_RESOLVE_SHARED); 458 hammer2_inode_unlock(ip); 459 460 chain = hammer2_chain_lookup(&parent, &key_next, 461 HAMMER2_KEY_MIN, HAMMER2_KEY_MAX, 462 &serror, 463 HAMMER2_LOOKUP_SHARED | 464 HAMMER2_LOOKUP_NODIRECT | 465 HAMMER2_LOOKUP_NODATA); 466 merror = hammer2_xop_collect(&xop->head, 0); 467 if (hammer2_debug & 0x8000) { 468 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n", 469 ip->meta.name_key, chain, 470 (chain ? chain->bref.key : -1)); 471 } 472 473 for (;;) { 474 /* 475 * We are done if our scan is done and the XOP scan is done. 476 * We are done if the XOP scan failed (that is, we don't 477 * have authoritative data to synchronize with). 478 */ 479 int advance_local = 0; 480 int advance_xop = 0; 481 int dodefer = 0; 482 hammer2_chain_t *focus; 483 484 if (chain == NULL && merror == HAMMER2_ERROR_ENOENT) 485 break; 486 if (merror && merror != HAMMER2_ERROR_ENOENT) 487 break; 488 489 /* 490 * Compare 491 */ 492 if (chain && merror == HAMMER2_ERROR_ENOENT) { 493 /* 494 * If we have local chains but the XOP scan is done, 495 * the chains need to be deleted. 496 */ 497 n = -1; 498 focus = NULL; 499 } else if (chain == NULL) { 500 /* 501 * If our local scan is done but the XOP scan is not, 502 * we need to create the missing chain(s). 503 */ 504 n = 1; 505 focus = xop->head.cluster.focus; 506 } else { 507 /* 508 * Otherwise compare to determine the action 509 * needed. 510 */ 511 focus = xop->head.cluster.focus; 512 n = hammer2_chain_cmp(chain, focus); 513 } 514 515 /* 516 * Take action based on comparison results. 517 */ 518 if (n < 0) { 519 /* 520 * Delete extranious local data. This will 521 * automatically advance the chain. 522 */ 523 nerror = hammer2_sync_destroy(thr, &parent, &chain, 524 0, idx); 525 } else if (n == 0 && chain->bref.modify_tid != 526 focus->bref.modify_tid) { 527 /* 528 * Matching key but local data or meta-data requires 529 * updating. If we will recurse, we still need to 530 * update to compatible content first but we do not 531 * synchronize modify_tid until the entire recursion 532 * has completed successfully. 533 */ 534 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) { 535 nerror = hammer2_sync_replace( 536 thr, parent, chain, 537 0, 538 idx, &xop->head, focus, 0); 539 dodefer = 1; 540 } else { 541 nerror = hammer2_sync_replace( 542 thr, parent, chain, 543 focus->bref.modify_tid, 544 idx, &xop->head, focus, 0); 545 } 546 advance_local = 1; 547 advance_xop = 1; 548 } else if (n == 0) { 549 /* 550 * 100% match, advance both 551 */ 552 advance_local = 1; 553 advance_xop = 1; 554 nerror = 0; 555 } else if (n > 0) { 556 /* 557 * Insert missing local data. 558 * 559 * If we will recurse, we still need to update to 560 * compatible content first but we do not synchronize 561 * modify_tid until the entire recursion has 562 * completed successfully. 563 */ 564 if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) { 565 nerror = hammer2_sync_insert( 566 thr, &parent, &chain, 567 0, 568 idx, &xop->head, focus); 569 dodefer = 2; 570 } else { 571 nerror = hammer2_sync_insert( 572 thr, &parent, &chain, 573 focus->bref.modify_tid, 574 idx, &xop->head, focus); 575 } 576 advance_local = 1; 577 advance_xop = 1; 578 } 579 580 /* 581 * We cannot recurse depth-first because the XOP is still 582 * running in node threads for this scan. Create a placemarker 583 * by obtaining and record the hammer2_inode. 584 * 585 * We excluded our node from the XOP so we must temporarily 586 * add it to xop->head.cluster so it is properly incorporated 587 * into the inode. 588 * 589 * The deferral is pushed onto a LIFO list for bottom-up 590 * synchronization. 591 */ 592 if (merror == 0 && dodefer) { 593 hammer2_inode_t *nip; 594 hammer2_deferred_ip_t *defer; 595 596 KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE); 597 598 defer = kmalloc(sizeof(*defer), M_HAMMER2, 599 M_WAITOK | M_ZERO); 600 KKASSERT(xop->head.cluster.array[idx].chain == NULL); 601 xop->head.cluster.array[idx].flags = 602 HAMMER2_CITEM_INVALID; 603 xop->head.cluster.array[idx].chain = chain; 604 nip = hammer2_inode_get(pmp, &xop->head, -1, idx); 605 xop->head.cluster.array[idx].chain = NULL; 606 607 hammer2_inode_ref(nip); 608 hammer2_inode_unlock(nip); 609 610 defer->next = list->base; 611 defer->ip = nip; 612 list->base = defer; 613 ++list->count; 614 needrescan = 1; 615 } 616 617 /* 618 * If at least one deferral was added and the deferral 619 * list has grown too large, stop adding more. This 620 * will trigger an HAMMER2_ERROR_EAGAIN return. 621 */ 622 if (needrescan && list->count > 1000) 623 break; 624 625 /* 626 * Advancements for iteration. 627 */ 628 if (advance_xop) { 629 merror = hammer2_xop_collect(&xop->head, 0); 630 } 631 if (advance_local) { 632 chain = hammer2_chain_next(&parent, chain, &key_next, 633 key_next, HAMMER2_KEY_MAX, 634 &serror, 635 HAMMER2_LOOKUP_SHARED | 636 HAMMER2_LOOKUP_NODIRECT | 637 HAMMER2_LOOKUP_NODATA); 638 } 639 } 640 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 641 if (chain) { 642 hammer2_chain_unlock(chain); 643 hammer2_chain_drop(chain); 644 } 645 if (parent) { 646 hammer2_chain_unlock(parent); 647 hammer2_chain_drop(parent); 648 } 649 650 /* 651 * If we added deferrals we want the caller to synchronize them 652 * and then call us again. 653 * 654 * NOTE: In this situation we do not yet want to synchronize our 655 * inode, setting the error code also has that effect. 656 */ 657 if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan) 658 merror = HAMMER2_ERROR_EAGAIN; 659 660 /* 661 * If no error occurred we can synchronize the inode meta-data 662 * and modify_tid. Only limited changes are made to PFSROOTs. 663 * 664 * XXX inode lock was lost 665 */ 666 if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) { 667 hammer2_xop_ipcluster_t *xop2; 668 hammer2_chain_t *focus; 669 670 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); 671 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 672 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc, 673 idx); 674 hammer2_inode_unlock(ip); 675 merror = hammer2_xop_collect(&xop2->head, 0); 676 if (merror == 0) { 677 focus = xop2->head.cluster.focus; 678 if ((hammer2_debug & 0x8000) && focus) { 679 const char *filename; 680 681 filename = hammer2_xop_gdata(&xop2->head)-> 682 ipdata.filename; 683 kprintf("syncthr: update inode %p (%s)\n", 684 focus, filename); 685 hammer2_xop_pdata(&xop2->head); 686 } 687 chain = hammer2_inode_chain_and_parent(ip, idx, 688 &parent, 689 HAMMER2_RESOLVE_ALWAYS | 690 HAMMER2_RESOLVE_SHARED); 691 692 KKASSERT(parent != NULL); 693 nerror = hammer2_sync_replace( 694 thr, parent, chain, 695 sync_tid, 696 idx, &xop2->head, focus, isroot); 697 hammer2_chain_unlock(chain); 698 hammer2_chain_drop(chain); 699 hammer2_chain_unlock(parent); 700 hammer2_chain_drop(parent); 701 /* XXX */ 702 } 703 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP); 704 } 705 706 return merror; 707 } 708 709 /* 710 * Create a missing chain by copying the focus from another device. 711 * 712 * On entry *parentp and focus are both locked shared. The chain will be 713 * created and returned in *chainp also locked shared. 714 */ 715 static 716 int 717 hammer2_sync_insert(hammer2_thread_t *thr, 718 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 719 hammer2_tid_t mtid, int idx, hammer2_xop_head_t *xop, 720 hammer2_chain_t *focus) 721 { 722 hammer2_chain_t *chain; 723 hammer2_key_t dummy; 724 int error; 725 726 #if HAMMER2_SYNCHRO_DEBUG 727 if (hammer2_debug & 1) 728 kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n", 729 *parentp, 730 (*parentp)->bref.type, 731 (*parentp)->bref.key, 732 idx, 733 focus->bref.type, focus->bref.key, mtid); 734 #endif 735 736 /* 737 * Parent requires an exclusive lock for the insertion. 738 * We must unlock the child to avoid deadlocks while 739 * relocking the parent. 740 */ 741 if (*chainp) { 742 hammer2_chain_unlock(*chainp); 743 hammer2_chain_drop(*chainp); 744 *chainp = NULL; 745 } 746 hammer2_chain_unlock(*parentp); 747 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 748 749 /* 750 * We must reissue the lookup to properly position (*parentp) 751 * for the insertion. 752 */ 753 chain = hammer2_chain_lookup(parentp, &dummy, 754 focus->bref.key, focus->bref.key, 755 &error, 756 HAMMER2_LOOKUP_NODIRECT | 757 HAMMER2_LOOKUP_ALWAYS); 758 KKASSERT(chain == NULL); 759 760 chain = NULL; 761 error = hammer2_chain_create(parentp, &chain, NULL, thr->pmp, 762 focus->bref.methods, 763 focus->bref.key, focus->bref.keybits, 764 focus->bref.type, focus->bytes, 765 mtid, 0, 0); 766 if (error == 0) { 767 const hammer2_media_data_t *data; 768 769 error = hammer2_chain_modify(chain, mtid, 0, 0); 770 if (error) 771 goto failed; 772 773 /* 774 * Copy focus to new chain 775 */ 776 777 /* type already set */ 778 chain->bref.methods = focus->bref.methods; 779 /* keybits already set */ 780 chain->bref.vradix = focus->bref.vradix; 781 /* mirror_tid set by flush */ 782 KKASSERT(chain->bref.modify_tid == mtid); 783 chain->bref.flags = focus->bref.flags; 784 /* key already present */ 785 /* check code will be recalculated */ 786 787 /* 788 * Copy data body. 789 */ 790 switch(chain->bref.type) { 791 case HAMMER2_BREF_TYPE_INODE: 792 data = hammer2_xop_gdata(xop); 793 794 if ((data->ipdata.meta.op_flags & 795 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 796 /* do not copy block table */ 797 bcopy(data, chain->data, 798 offsetof(hammer2_inode_data_t, u)); 799 hammer2_xop_pdata(xop); 800 break; 801 } 802 hammer2_xop_pdata(xop); 803 /* fall through copy whole thing */ 804 case HAMMER2_BREF_TYPE_DATA: 805 data = hammer2_xop_gdata(xop); 806 bcopy(data, chain->data, chain->bytes); 807 hammer2_chain_setcheck(chain, chain->data); 808 hammer2_xop_pdata(xop); 809 break; 810 case HAMMER2_BREF_TYPE_DIRENT: 811 /* 812 * Directory entries embed data in the blockref. 813 */ 814 if (chain->bytes) { 815 data = hammer2_xop_gdata(xop); 816 bcopy(data, chain->data, chain->bytes); 817 hammer2_chain_setcheck(chain, chain->data); 818 hammer2_xop_pdata(xop); 819 } else { 820 chain->bref.check = focus->bref.check; 821 } 822 chain->bref.embed = focus->bref.embed; 823 break; 824 default: 825 KKASSERT(0); 826 break; 827 } 828 } 829 830 failed: 831 if (chain) 832 hammer2_chain_unlock(chain); /* unlock, leave ref */ 833 *chainp = chain; /* will be returned locked */ 834 835 /* 836 * Avoid an ordering deadlock when relocking shared. 837 */ 838 hammer2_chain_unlock(*parentp); 839 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 840 HAMMER2_RESOLVE_ALWAYS); 841 if (chain) { 842 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 843 HAMMER2_RESOLVE_ALWAYS); 844 error = chain->error; 845 } 846 847 return error; 848 } 849 850 /* 851 * Destroy an extranious chain. 852 * 853 * Both *parentp and *chainp are locked shared. 854 * 855 * On return, *chainp will be adjusted to point to the next element in the 856 * iteration and locked shared. 857 */ 858 static 859 int 860 hammer2_sync_destroy(hammer2_thread_t *thr, 861 hammer2_chain_t **parentp, hammer2_chain_t **chainp, 862 hammer2_tid_t mtid, int idx) 863 { 864 hammer2_chain_t *chain; 865 hammer2_key_t key_next; 866 hammer2_key_t save_key; 867 int error; 868 869 chain = *chainp; 870 871 #if HAMMER2_SYNCHRO_DEBUG 872 if (hammer2_debug & 1) 873 kprintf("destroy rec %p/%p slave %d %d.%016jx\n", 874 *parentp, chain, 875 idx, chain->bref.type, chain->bref.key); 876 #endif 877 878 save_key = chain->bref.key; 879 if (save_key != HAMMER2_KEY_MAX) 880 ++save_key; 881 882 /* 883 * Try to avoid unnecessary I/O. 884 * 885 * XXX accounting not propagated up properly. We might have to do 886 * a RESOLVE_MAYBE here and pass 0 for the flags. 887 */ 888 hammer2_chain_unlock(chain); /* relock exclusive */ 889 hammer2_chain_unlock(*parentp); 890 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS); 891 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 892 893 hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT); 894 hammer2_chain_unlock(chain); 895 hammer2_chain_drop(chain); 896 chain = NULL; /* safety */ 897 898 hammer2_chain_unlock(*parentp); /* relock shared */ 899 hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED | 900 HAMMER2_RESOLVE_ALWAYS); 901 *chainp = hammer2_chain_lookup(parentp, &key_next, 902 save_key, HAMMER2_KEY_MAX, 903 &error, 904 HAMMER2_LOOKUP_SHARED | 905 HAMMER2_LOOKUP_NODIRECT | 906 HAMMER2_LOOKUP_NODATA); 907 return error; 908 } 909 910 /* 911 * cparent is locked exclusively, with an extra ref, cluster is not locked. 912 * Replace element [i] in the cluster. 913 */ 914 static 915 int 916 hammer2_sync_replace(hammer2_thread_t *thr, 917 hammer2_chain_t *parent, hammer2_chain_t *chain, 918 hammer2_tid_t mtid, int idx, 919 hammer2_xop_head_t *xop, hammer2_chain_t *focus, 920 int isroot) 921 { 922 uint8_t otype; 923 int nradix; 924 int error; 925 926 #if HAMMER2_SYNCHRO_DEBUG 927 if (hammer2_debug & 1) 928 kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n", 929 chain, 930 idx, 931 focus->bref.type, focus->bref.key, mtid); 932 #endif 933 hammer2_chain_unlock(chain); 934 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 935 error = chain->error; 936 if (error == 0) { 937 const hammer2_media_data_t *data; 938 939 if (chain->bytes != focus->bytes) { 940 /* XXX what if compressed? */ 941 nradix = hammer2_getradix(chain->bytes); 942 error = hammer2_chain_resize(chain, mtid, 0, nradix, 0); 943 if (error) 944 goto failed; 945 } 946 error = hammer2_chain_modify(chain, mtid, 0, 0); 947 if (error) 948 goto failed; 949 otype = chain->bref.type; 950 data = hammer2_xop_gdata(xop); 951 chain->bref.type = focus->bref.type; 952 chain->bref.methods = focus->bref.methods; 953 chain->bref.keybits = focus->bref.keybits; 954 chain->bref.vradix = focus->bref.vradix; 955 /* mirror_tid updated by flush */ 956 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid); 957 chain->bref.flags = focus->bref.flags; 958 /* key already present */ 959 /* check code will be recalculated */ 960 961 /* 962 * Copy data body. 963 */ 964 switch(chain->bref.type) { 965 case HAMMER2_BREF_TYPE_INODE: 966 /* 967 * Special case PFSROOTs, only limited changes can 968 * be made since the meta-data contains miscellanious 969 * distinguishing fields. 970 */ 971 if (isroot) { 972 chain->data->ipdata.meta.uflags = 973 data->ipdata.meta.uflags; 974 chain->data->ipdata.meta.rmajor = 975 data->ipdata.meta.rmajor; 976 chain->data->ipdata.meta.rminor = 977 data->ipdata.meta.rminor; 978 chain->data->ipdata.meta.ctime = 979 data->ipdata.meta.ctime; 980 chain->data->ipdata.meta.mtime = 981 data->ipdata.meta.mtime; 982 chain->data->ipdata.meta.atime = 983 data->ipdata.meta.atime; 984 /* not btime */ 985 chain->data->ipdata.meta.uid = 986 data->ipdata.meta.uid; 987 chain->data->ipdata.meta.gid = 988 data->ipdata.meta.gid; 989 chain->data->ipdata.meta.mode = 990 data->ipdata.meta.mode; 991 chain->data->ipdata.meta.ncopies = 992 data->ipdata.meta.ncopies; 993 chain->data->ipdata.meta.comp_algo = 994 data->ipdata.meta.comp_algo; 995 chain->data->ipdata.meta.check_algo = 996 data->ipdata.meta.check_algo; 997 chain->data->ipdata.meta.data_quota = 998 data->ipdata.meta.data_quota; 999 chain->data->ipdata.meta.inode_quota = 1000 data->ipdata.meta.inode_quota; 1001 1002 /* 1003 * last snapshot tid controls overwrite 1004 */ 1005 if (chain->data->ipdata.meta.pfs_lsnap_tid < 1006 data->ipdata.meta.pfs_lsnap_tid) { 1007 chain->data->ipdata.meta.pfs_lsnap_tid = 1008 data->ipdata.meta.pfs_lsnap_tid; 1009 } 1010 1011 hammer2_chain_setcheck(chain, chain->data); 1012 break; 1013 } 1014 1015 /* 1016 * Normal replacement. 1017 */ 1018 if ((data->ipdata.meta.op_flags & 1019 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1020 /* 1021 * If DIRECTDATA is transitioning to 0 or the 1022 * old chain is not an inode we have to 1023 * initialize the block table. 1024 */ 1025 if (otype != HAMMER2_BREF_TYPE_INODE || 1026 (chain->data->ipdata.meta.op_flags & 1027 HAMMER2_OPFLAG_DIRECTDATA)) { 1028 kprintf("chain inode trans " 1029 "away from dd\n"); 1030 bzero(&chain->data->ipdata.u, 1031 sizeof(chain->data->ipdata.u)); 1032 } 1033 bcopy(data, chain->data, 1034 offsetof(hammer2_inode_data_t, u)); 1035 /* XXX setcheck on inode should not be needed */ 1036 hammer2_chain_setcheck(chain, chain->data); 1037 break; 1038 } 1039 /* fall through */ 1040 case HAMMER2_BREF_TYPE_DATA: 1041 bcopy(data, chain->data, chain->bytes); 1042 hammer2_chain_setcheck(chain, chain->data); 1043 break; 1044 case HAMMER2_BREF_TYPE_DIRENT: 1045 /* 1046 * Directory entries embed data in the blockref. 1047 */ 1048 if (chain->bytes) { 1049 bcopy(data, chain->data, chain->bytes); 1050 hammer2_chain_setcheck(chain, chain->data); 1051 } else { 1052 chain->bref.check = focus->bref.check; 1053 } 1054 chain->bref.embed = focus->bref.embed; 1055 break; 1056 default: 1057 KKASSERT(0); 1058 break; 1059 } 1060 hammer2_xop_pdata(xop); 1061 } 1062 1063 failed: 1064 hammer2_chain_unlock(chain); 1065 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED | 1066 HAMMER2_RESOLVE_MAYBE); 1067 1068 return error; 1069 } 1070