xref: /dragonfly/sys/vfs/hammer2/hammer2_synchro.c (revision e6e77800)
1 /*
2  * Copyright (c) 2015-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51 
52 typedef struct hammer2_deferred_ip {
53 	struct hammer2_deferred_ip *next;
54 	hammer2_inode_t	*ip;
55 } hammer2_deferred_ip_t;
56 
57 typedef struct hammer2_deferred_list {
58 	hammer2_deferred_ip_t	*base;
59 	int			count;
60 } hammer2_deferred_list_t;
61 
62 
63 #define HAMMER2_SYNCHRO_DEBUG 1
64 
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 				hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 				nerror = hammer2_sync_insert(
70 						thr, &parent, &chain,
71 						focus->bref.modify_tid,
72 						idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 			hammer2_tid_t modify_tid, int idx,
77 			hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 			hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 			hammer2_chain_t *parent, hammer2_chain_t *chain,
83 			hammer2_tid_t mtid, int idx,
84 			hammer2_chain_t *focus, int isroot);
85 
86 /****************************************************************************
87  *			    HAMMER2 SYNC THREADS 			    *
88  ****************************************************************************/
89 /*
90  * Primary management thread for an element of a node.  A thread will exist
91  * for each element requiring management.
92  *
93  * No management threads are needed for the SPMP or for any PMP with only
94  * a single MASTER.
95  *
96  * On the SPMP - handles bulkfree and dedup operations
97  * On a PFS    - handles remastering and synchronization
98  */
99 void
100 hammer2_primary_sync_thread(void *arg)
101 {
102 	hammer2_thread_t *thr = arg;
103 	hammer2_pfs_t *pmp;
104 	hammer2_deferred_list_t list;
105 	hammer2_deferred_ip_t *defer;
106 	int error;
107 	uint32_t flags;
108 	uint32_t nflags;
109 
110 	pmp = thr->pmp;
111 	bzero(&list, sizeof(list));
112 
113 	for (;;) {
114 		flags = thr->flags;
115 		cpu_ccfence();
116 
117 		/*
118 		 * Handle stop request
119 		 */
120 		if (flags & HAMMER2_THREAD_STOP)
121 			break;
122 
123 		/*
124 		 * Handle freeze request
125 		 */
126 		if (flags & HAMMER2_THREAD_FREEZE) {
127 			nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
128 					    HAMMER2_THREAD_WAITING)) |
129 				 HAMMER2_THREAD_FROZEN;
130 			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
131 				continue;
132 			if (flags & HAMMER2_THREAD_WAITING)
133 				wakeup(&thr->flags);
134 			continue;
135 		}
136 
137 		if (flags & HAMMER2_THREAD_UNFREEZE) {
138 			nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
139 					   HAMMER2_THREAD_FROZEN |
140 					   HAMMER2_THREAD_WAITING);
141 			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
142 				continue;
143 			if (flags & HAMMER2_THREAD_WAITING)
144 				wakeup(&thr->flags);
145 			continue;
146 		}
147 
148 		/*
149 		 * Force idle if frozen until unfrozen or stopped.
150 		 */
151 		if (flags & HAMMER2_THREAD_FROZEN) {
152 			nflags = flags | HAMMER2_THREAD_WAITING;
153 
154 			tsleep_interlock(&thr->flags, 0);
155 			if (atomic_cmpset_int(&thr->flags, flags, nflags))
156 				tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
157 			continue;
158 		}
159 
160 		/*
161 		 * Reset state on REMASTER request
162 		 */
163 		if (thr->flags & HAMMER2_THREAD_REMASTER) {
164 			nflags = flags & ~HAMMER2_THREAD_REMASTER;
165 			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
166 				/* reset state here */
167 			}
168 			continue;
169 		}
170 
171 		/*
172 		 * Synchronization scan.
173 		 */
174 		if (hammer2_debug & 0x8000)
175 			kprintf("sync_slaves pfs %s clindex %d\n",
176 				pmp->pfs_names[thr->clindex], thr->clindex);
177 		hammer2_trans_init(pmp, 0);
178 
179 		hammer2_inode_ref(pmp->iroot);
180 
181 		for (;;) {
182 			int didbreak = 0;
183 			/* XXX lock synchronize pmp->modify_tid */
184 			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
185 			if (hammer2_debug & 0x8000) {
186 				kprintf("sync_slaves error %d defer %p\n",
187 					error, list.base);
188 			}
189 			if (error != HAMMER2_ERROR_EAGAIN)
190 				break;
191 			while ((defer = list.base) != NULL) {
192 				hammer2_inode_t *nip;
193 
194 				nip = defer->ip;
195 				error = hammer2_sync_slaves(thr, nip, &list,
196 							(nip == pmp->iroot));
197 				if (error &&
198 				    error != HAMMER2_ERROR_EAGAIN &&
199 				    error != HAMMER2_ERROR_ENOENT) {
200 					break;
201 				}
202 				if (hammer2_thr_break(thr)) {
203 					didbreak = 1;
204 					break;
205 				}
206 
207 				/*
208 				 * If no additional defers occurred we can
209 				 * remove this one, otherwise keep it on
210 				 * the list and retry once the additional
211 				 * defers have completed.
212 				 */
213 				if (defer == list.base) {
214 					--list.count;
215 					list.base = defer->next;
216 					kfree(defer, M_HAMMER2);
217 					defer = NULL;	/* safety */
218 					hammer2_inode_drop(nip);
219 				}
220 			}
221 
222 			/*
223 			 * If the thread is being remastered, frozen, or
224 			 * stopped, clean up any left-over deferals.
225 			 */
226 			if (didbreak ||
227 			    (error && error != HAMMER2_ERROR_EAGAIN)) {
228 				kprintf("didbreak\n");
229 				while ((defer = list.base) != NULL) {
230 					--list.count;
231 					hammer2_inode_drop(defer->ip);
232 					list.base = defer->next;
233 					kfree(defer, M_HAMMER2);
234 				}
235 				if (error == 0 || error == HAMMER2_ERROR_EAGAIN)
236 					error = HAMMER2_ERROR_EINPROGRESS;
237 				break;
238 			}
239 		}
240 
241 		hammer2_inode_drop(pmp->iroot);
242 		hammer2_trans_done(pmp);
243 
244 		if (error && error != HAMMER2_ERROR_EINPROGRESS)
245 			kprintf("hammer2_sync_slaves: error %d\n", error);
246 
247 		/*
248 		 * Wait for event, or 5-second poll.
249 		 */
250 		nflags = flags | HAMMER2_THREAD_WAITING;
251 		tsleep_interlock(&thr->flags, 0);
252 		if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
253 			tsleep(&thr->flags, 0, "h2idle", hz * 5);
254 		}
255 	}
256 	thr->td = NULL;
257 	hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
258 	/* thr structure can go invalid after this point */
259 }
260 
261 #if 0
262 /*
263  * Given a locked cluster created from pmp->iroot, update the PFS's
264  * reporting status.
265  */
266 static
267 void
268 hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
269 {
270 	hammer2_pfs_t *pmp = thr->pmp;
271 
272 	flags &= HAMMER2_CLUSTER_ZFLAGS;
273 	if (pmp->cluster_flags == flags)
274 		return;
275 	pmp->cluster_flags = flags;
276 
277 	kprintf("pfs %p", pmp);
278 	if (flags & HAMMER2_CLUSTER_MSYNCED)
279 		kprintf(" masters-all-good");
280 	if (flags & HAMMER2_CLUSTER_SSYNCED)
281 		kprintf(" slaves-all-good");
282 
283 	if (flags & HAMMER2_CLUSTER_WRHARD)
284 		kprintf(" quorum/rw");
285 	else if (flags & HAMMER2_CLUSTER_RDHARD)
286 		kprintf(" quorum/ro");
287 
288 	if (flags & HAMMER2_CLUSTER_UNHARD)
289 		kprintf(" out-of-sync-masters");
290 	else if (flags & HAMMER2_CLUSTER_NOHARD)
291 		kprintf(" no-masters-visible");
292 
293 	if (flags & HAMMER2_CLUSTER_WRSOFT)
294 		kprintf(" soft/rw");
295 	else if (flags & HAMMER2_CLUSTER_RDSOFT)
296 		kprintf(" soft/ro");
297 
298 	if (flags & HAMMER2_CLUSTER_UNSOFT)
299 		kprintf(" out-of-sync-slaves");
300 	else if (flags & HAMMER2_CLUSTER_NOSOFT)
301 		kprintf(" no-slaves-visible");
302 	kprintf("\n");
303 }
304 #endif
305 
306 #if 0
307 static
308 void
309 dumpcluster(const char *label,
310 	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
311 {
312 	hammer2_chain_t *chain;
313 	int i;
314 
315 	if ((hammer2_debug & 1) == 0)
316 		return;
317 
318 	kprintf("%s\t", label);
319 	KKASSERT(cparent->nchains == cluster->nchains);
320 	for (i = 0; i < cparent->nchains; ++i) {
321 		if (i)
322 			kprintf("\t");
323 		kprintf("%d ", i);
324 		if ((chain = cparent->array[i].chain) != NULL) {
325 			kprintf("%016jx%s ",
326 				chain->bref.key,
327 				((cparent->array[i].flags &
328 				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
329 			);
330 		} else {
331 			kprintf("      NULL      %s ", "   ");
332 		}
333 		if ((chain = cluster->array[i].chain) != NULL) {
334 			kprintf("%016jx%s ",
335 				chain->bref.key,
336 				((cluster->array[i].flags &
337 				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
338 			);
339 		} else {
340 			kprintf("      NULL      %s ", "   ");
341 		}
342 		kprintf("\n");
343 	}
344 }
345 #endif
346 
347 /*
348  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
349  * the inode.  This creates a multiplication effect since the XOP scan itself
350  * issues to all nodes.  However, this is the only way we can safely
351  * synchronize nodes which might have disparate I/O bandwidths and the only
352  * way we can safely deal with stalled nodes.
353  *
354  * XXX serror / merror rollup and handling.
355  */
356 static
357 int
358 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
359 		    hammer2_deferred_list_t *list, int isroot)
360 {
361 	hammer2_xop_scanall_t *xop;
362 	hammer2_chain_t *parent;
363 	hammer2_chain_t *chain;
364 	hammer2_pfs_t *pmp;
365 	hammer2_key_t key_next;
366 	hammer2_tid_t sync_tid;
367 	int needrescan;
368 	int want_update;
369 	int serror;		/* slave error */
370 	int merror;		/* master error (from xop_collect) */
371 	int nerror;		/* temporary error */
372 	int idx;
373 	int n;
374 
375 	pmp = ip->pmp;
376 	idx = thr->clindex;	/* cluster node we are responsible for */
377 	needrescan = 0;
378 	want_update = 0;
379 	sync_tid = 0;
380 	chain = NULL;
381 	parent = NULL;
382 
383 #if 0
384 	/*
385 	 * Nothing to do if all slaves are synchronized.
386 	 * Nothing to do if cluster not authoritatively readable.
387 	 */
388 	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
389 		return(0);
390 	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
391 		return(HAMMER2_ERROR_INCOMPLETE);
392 #endif
393 
394 	merror = 0;
395 
396 	/*
397 	 * Resolve the root inode of the PFS and determine if synchronization
398 	 * is needed by checking modify_tid.
399 	 *
400 	 * Retain the synchronization TID from the focus inode and use it
401 	 * later to synchronize the focus inode if/when the recursion
402 	 * succeeds.
403 	 */
404 	{
405 		hammer2_xop_ipcluster_t *xop2;
406 		hammer2_chain_t *focus;
407 
408 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
409 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
410 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
411 					 idx);
412 		hammer2_inode_unlock(ip);
413 		merror = hammer2_xop_collect(&xop2->head, 0);
414 		if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
415 			sync_tid = focus->bref.modify_tid;
416 			chain = hammer2_inode_chain_and_parent(ip, idx,
417 						    &parent,
418 						    HAMMER2_RESOLVE_ALWAYS |
419 						    HAMMER2_RESOLVE_SHARED);
420 			want_update = (chain->bref.modify_tid != sync_tid);
421 			if (chain) {
422 				hammer2_chain_unlock(chain);
423 				hammer2_chain_drop(chain);
424 				chain = NULL;
425 			}
426 			if (parent) {
427 				hammer2_chain_unlock(parent);
428 				hammer2_chain_drop(parent);
429 				parent = NULL;
430 			}
431 		}
432 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
433 	}
434 
435 	if (want_update == 0)
436 		return(0);
437 
438 	/*
439 	 * The inode is left unlocked during the scan.  Issue a XOP
440 	 * that does *not* include our cluster index to iterate
441 	 * properly synchronized elements and resolve our cluster index
442 	 * against it.
443 	 */
444 	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
445 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
446 	xop->key_beg = HAMMER2_KEY_MIN;
447 	xop->key_end = HAMMER2_KEY_MAX;
448 	xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
449 			     HAMMER2_RESOLVE_ALWAYS;
450 	xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
451 			    HAMMER2_LOOKUP_NODIRECT |
452 			    HAMMER2_LOOKUP_ALWAYS;
453 	hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
454 	parent = hammer2_inode_chain(ip, idx,
455 				     HAMMER2_RESOLVE_ALWAYS |
456 				     HAMMER2_RESOLVE_SHARED);
457 	hammer2_inode_unlock(ip);
458 
459 	chain = hammer2_chain_lookup(&parent, &key_next,
460 				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
461 				     &serror,
462 				     HAMMER2_LOOKUP_SHARED |
463 				     HAMMER2_LOOKUP_NODIRECT |
464 				     HAMMER2_LOOKUP_NODATA);
465 	merror = hammer2_xop_collect(&xop->head, 0);
466 	if (hammer2_debug & 0x8000) {
467 		kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
468 			ip->meta.name_key, chain,
469 			(chain ? chain->bref.key : -1));
470 	}
471 
472 	for (;;) {
473 		/*
474 		 * We are done if our scan is done and the XOP scan is done.
475 		 * We are done if the XOP scan failed (that is, we don't
476 		 * have authoritative data to synchronize with).
477 		 */
478 		int advance_local = 0;
479 		int advance_xop = 0;
480 		int dodefer = 0;
481 		hammer2_chain_t *focus;
482 
483 		if (chain == NULL && merror == HAMMER2_ERROR_ENOENT)
484 			break;
485 		if (merror && merror != HAMMER2_ERROR_ENOENT)
486 			break;
487 
488 		/*
489 		 * Compare
490 		 */
491 		if (chain && merror == HAMMER2_ERROR_ENOENT) {
492 			/*
493 			 * If we have local chains but the XOP scan is done,
494 			 * the chains need to be deleted.
495 			 */
496 			n = -1;
497 			focus = NULL;
498 		} else if (chain == NULL) {
499 			/*
500 			 * If our local scan is done but the XOP scan is not,
501 			 * we need to create the missing chain(s).
502 			 */
503 			n = 1;
504 			focus = xop->head.cluster.focus;
505 		} else {
506 			/*
507 			 * Otherwise compare to determine the action
508 			 * needed.
509 			 */
510 			focus = xop->head.cluster.focus;
511 			n = hammer2_chain_cmp(chain, focus);
512 		}
513 
514 		/*
515 		 * Take action based on comparison results.
516 		 */
517 		if (n < 0) {
518 			/*
519 			 * Delete extranious local data.  This will
520 			 * automatically advance the chain.
521 			 */
522 			nerror = hammer2_sync_destroy(thr, &parent, &chain,
523 						      0, idx);
524 		} else if (n == 0 && chain->bref.modify_tid !=
525 				     focus->bref.modify_tid) {
526 			/*
527 			 * Matching key but local data or meta-data requires
528 			 * updating.  If we will recurse, we still need to
529 			 * update to compatible content first but we do not
530 			 * synchronize modify_tid until the entire recursion
531 			 * has completed successfully.
532 			 */
533 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
534 				nerror = hammer2_sync_replace(
535 						thr, parent, chain,
536 						0,
537 						idx, focus, 0);
538 				dodefer = 1;
539 			} else {
540 				nerror = hammer2_sync_replace(
541 						thr, parent, chain,
542 						focus->bref.modify_tid,
543 						idx, focus, 0);
544 			}
545 			advance_local = 1;
546 			advance_xop = 1;
547 		} else if (n == 0) {
548 			/*
549 			 * 100% match, advance both
550 			 */
551 			advance_local = 1;
552 			advance_xop = 1;
553 			nerror = 0;
554 		} else if (n > 0) {
555 			/*
556 			 * Insert missing local data.
557 			 *
558 			 * If we will recurse, we still need to update to
559 			 * compatible content first but we do not synchronize
560 			 * modify_tid until the entire recursion has
561 			 * completed successfully.
562 			 */
563 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
564 				nerror = hammer2_sync_insert(
565 						thr, &parent, &chain,
566 						0,
567 						idx, focus);
568 				dodefer = 2;
569 			} else {
570 				nerror = hammer2_sync_insert(
571 						thr, &parent, &chain,
572 						focus->bref.modify_tid,
573 						idx, focus);
574 			}
575 			advance_local = 1;
576 			advance_xop = 1;
577 		}
578 
579 		/*
580 		 * We cannot recurse depth-first because the XOP is still
581 		 * running in node threads for this scan.  Create a placemarker
582 		 * by obtaining and record the hammer2_inode.
583 		 *
584 		 * We excluded our node from the XOP so we must temporarily
585 		 * add it to xop->head.cluster so it is properly incorporated
586 		 * into the inode.
587 		 *
588 		 * The deferral is pushed onto a LIFO list for bottom-up
589 		 * synchronization.
590 		 */
591 		if (merror == 0 && dodefer) {
592 			hammer2_inode_t *nip;
593 			hammer2_deferred_ip_t *defer;
594 
595 			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
596 
597 			defer = kmalloc(sizeof(*defer), M_HAMMER2,
598 					M_WAITOK | M_ZERO);
599 			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
600 			xop->head.cluster.array[idx].flags =
601 							HAMMER2_CITEM_INVALID;
602 			xop->head.cluster.array[idx].chain = chain;
603 			nip = hammer2_inode_get(pmp, ip,
604 						&xop->head.cluster, idx);
605 			xop->head.cluster.array[idx].chain = NULL;
606 
607 			hammer2_inode_ref(nip);
608 			hammer2_inode_unlock(nip);
609 
610 			defer->next = list->base;
611 			defer->ip = nip;
612 			list->base = defer;
613 			++list->count;
614 			needrescan = 1;
615 		}
616 
617 		/*
618 		 * If at least one deferral was added and the deferral
619 		 * list has grown too large, stop adding more.  This
620 		 * will trigger an HAMMER2_ERROR_EAGAIN return.
621 		 */
622 		if (needrescan && list->count > 1000)
623 			break;
624 
625 		/*
626 		 * Advancements for iteration.
627 		 */
628 		if (advance_xop) {
629 			merror = hammer2_xop_collect(&xop->head, 0);
630 		}
631 		if (advance_local) {
632 			chain = hammer2_chain_next(&parent, chain, &key_next,
633 						   key_next, HAMMER2_KEY_MAX,
634 						   &serror,
635 						   HAMMER2_LOOKUP_SHARED |
636 						   HAMMER2_LOOKUP_NODIRECT |
637 						   HAMMER2_LOOKUP_NODATA);
638 		}
639 	}
640 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
641 	if (chain) {
642 		hammer2_chain_unlock(chain);
643 		hammer2_chain_drop(chain);
644 	}
645 	if (parent) {
646 		hammer2_chain_unlock(parent);
647 		hammer2_chain_drop(parent);
648 	}
649 
650 	/*
651 	 * If we added deferrals we want the caller to synchronize them
652 	 * and then call us again.
653 	 *
654 	 * NOTE: In this situation we do not yet want to synchronize our
655 	 *	 inode, setting the error code also has that effect.
656 	 */
657 	if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan)
658 		merror = HAMMER2_ERROR_EAGAIN;
659 
660 	/*
661 	 * If no error occurred we can synchronize the inode meta-data
662 	 * and modify_tid.  Only limited changes are made to PFSROOTs.
663 	 *
664 	 * XXX inode lock was lost
665 	 */
666 	if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) {
667 		hammer2_xop_ipcluster_t *xop2;
668 		hammer2_chain_t *focus;
669 
670 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
671 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
672 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
673 					 idx);
674 		hammer2_inode_unlock(ip);
675 		merror = hammer2_xop_collect(&xop2->head, 0);
676 		if (merror == 0) {
677 			focus = xop2->head.cluster.focus;
678 			if (hammer2_debug & 0x8000) {
679 				kprintf("syncthr: update inode %p (%s)\n",
680 					focus,
681 					(focus ? (char *)focus->data->
682 							 ipdata.filename :
683 						 "?"));
684 			}
685 			chain = hammer2_inode_chain_and_parent(ip, idx,
686 						    &parent,
687 						    HAMMER2_RESOLVE_ALWAYS |
688 						    HAMMER2_RESOLVE_SHARED);
689 
690 			KKASSERT(parent != NULL);
691 			nerror = hammer2_sync_replace(
692 					thr, parent, chain,
693 					sync_tid,
694 					idx, focus, isroot);
695 			hammer2_chain_unlock(chain);
696 			hammer2_chain_drop(chain);
697 			hammer2_chain_unlock(parent);
698 			hammer2_chain_drop(parent);
699 			/* XXX */
700 		}
701 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
702 	}
703 
704 	return merror;
705 }
706 
707 /*
708  * Create a missing chain by copying the focus from another device.
709  *
710  * On entry *parentp and focus are both locked shared.  The chain will be
711  * created and returned in *chainp also locked shared.
712  */
713 static
714 int
715 hammer2_sync_insert(hammer2_thread_t *thr,
716 		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
717 		    hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
718 {
719 	hammer2_chain_t *chain;
720 	hammer2_key_t dummy;
721 	int error;
722 
723 #if HAMMER2_SYNCHRO_DEBUG
724 	if (hammer2_debug & 1)
725 	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
726 		*parentp,
727 		(*parentp)->bref.type,
728 		(*parentp)->bref.key,
729 		idx,
730 		focus->bref.type, focus->bref.key, mtid);
731 #endif
732 
733 	/*
734 	 * Parent requires an exclusive lock for the insertion.
735 	 * We must unlock the child to avoid deadlocks while
736 	 * relocking the parent.
737 	 */
738 	if (*chainp) {
739 		hammer2_chain_unlock(*chainp);
740 		hammer2_chain_drop(*chainp);
741 		*chainp = NULL;
742 	}
743 	hammer2_chain_unlock(*parentp);
744 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
745 
746 	/*
747 	 * We must reissue the lookup to properly position (*parentp)
748 	 * for the insertion.
749 	 */
750 	chain = hammer2_chain_lookup(parentp, &dummy,
751 				     focus->bref.key, focus->bref.key,
752 				     &error,
753 				     HAMMER2_LOOKUP_NODIRECT |
754 				     HAMMER2_LOOKUP_ALWAYS);
755 	KKASSERT(chain == NULL);
756 
757 	chain = NULL;
758 	error = hammer2_chain_create(parentp, &chain,
759 				     thr->pmp, focus->bref.methods,
760 				     focus->bref.key, focus->bref.keybits,
761 				     focus->bref.type, focus->bytes,
762 				     mtid, 0, 0);
763 	if (error == 0) {
764 		error = hammer2_chain_modify(chain, mtid, 0, 0);
765 		if (error)
766 			goto failed;
767 
768 		/*
769 		 * Copy focus to new chain
770 		 */
771 
772 		/* type already set */
773 		chain->bref.methods = focus->bref.methods;
774 		/* keybits already set */
775 		chain->bref.vradix = focus->bref.vradix;
776 		/* mirror_tid set by flush */
777 		KKASSERT(chain->bref.modify_tid == mtid);
778 		chain->bref.flags = focus->bref.flags;
779 		/* key already present */
780 		/* check code will be recalculated */
781 
782 		/*
783 		 * Copy data body.
784 		 */
785 		switch(chain->bref.type) {
786 		case HAMMER2_BREF_TYPE_INODE:
787 			if ((focus->data->ipdata.meta.op_flags &
788 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
789 				/* do not copy block table */
790 				bcopy(focus->data, chain->data,
791 				      offsetof(hammer2_inode_data_t, u));
792 				break;
793 			}
794 			/* fall through copy whole thing */
795 		case HAMMER2_BREF_TYPE_DATA:
796 			bcopy(focus->data, chain->data, chain->bytes);
797 			hammer2_chain_setcheck(chain, chain->data);
798 			break;
799 		case HAMMER2_BREF_TYPE_DIRENT:
800 			/*
801 			 * Directory entries embed data in the blockref.
802 			 */
803 			if (chain->bytes) {
804 				bcopy(focus->data, chain->data, chain->bytes);
805 				hammer2_chain_setcheck(chain, chain->data);
806 			} else {
807 				chain->bref.check = focus->bref.check;
808 			}
809 			chain->bref.embed = focus->bref.embed;
810 			break;
811 		default:
812 			KKASSERT(0);
813 			break;
814 		}
815 	}
816 
817 failed:
818 	if (chain)
819 		hammer2_chain_unlock(chain);	/* unlock, leave ref */
820 	*chainp = chain;			/* will be returned locked */
821 
822 	/*
823 	 * Avoid an ordering deadlock when relocking shared.
824 	 */
825 	hammer2_chain_unlock(*parentp);
826 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
827 				     HAMMER2_RESOLVE_ALWAYS);
828 	if (chain) {
829 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
830 					  HAMMER2_RESOLVE_ALWAYS);
831 		error = chain->error;
832 	}
833 
834 	return error;
835 }
836 
837 /*
838  * Destroy an extranious chain.
839  *
840  * Both *parentp and *chainp are locked shared.
841  *
842  * On return, *chainp will be adjusted to point to the next element in the
843  * iteration and locked shared.
844  */
845 static
846 int
847 hammer2_sync_destroy(hammer2_thread_t *thr,
848 		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
849 		     hammer2_tid_t mtid, int idx)
850 {
851 	hammer2_chain_t *chain;
852 	hammer2_key_t key_next;
853 	hammer2_key_t save_key;
854 	int error;
855 
856 	chain = *chainp;
857 
858 #if HAMMER2_SYNCHRO_DEBUG
859 	if (hammer2_debug & 1)
860 	kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
861 		*parentp, chain,
862 		idx, chain->bref.type, chain->bref.key);
863 #endif
864 
865 	save_key = chain->bref.key;
866 	if (save_key != HAMMER2_KEY_MAX)
867 		++save_key;
868 
869 	/*
870 	 * Try to avoid unnecessary I/O.
871 	 *
872 	 * XXX accounting not propagated up properly.  We might have to do
873 	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
874 	 */
875 	hammer2_chain_unlock(chain);	/* relock exclusive */
876 	hammer2_chain_unlock(*parentp);
877 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
878 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
879 
880 	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
881 	hammer2_chain_unlock(chain);
882 	hammer2_chain_drop(chain);
883 	chain = NULL;			/* safety */
884 
885 	hammer2_chain_unlock(*parentp);	/* relock shared */
886 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
887 				     HAMMER2_RESOLVE_ALWAYS);
888 	*chainp = hammer2_chain_lookup(parentp, &key_next,
889 				     save_key, HAMMER2_KEY_MAX,
890 				     &error,
891 				     HAMMER2_LOOKUP_SHARED |
892 				     HAMMER2_LOOKUP_NODIRECT |
893 				     HAMMER2_LOOKUP_NODATA);
894 	return error;
895 }
896 
897 /*
898  * cparent is locked exclusively, with an extra ref, cluster is not locked.
899  * Replace element [i] in the cluster.
900  */
901 static
902 int
903 hammer2_sync_replace(hammer2_thread_t *thr,
904 		     hammer2_chain_t *parent, hammer2_chain_t *chain,
905 		     hammer2_tid_t mtid, int idx,
906 		     hammer2_chain_t *focus, int isroot)
907 {
908 	uint8_t otype;
909 	int nradix;
910 	int error;
911 
912 #if HAMMER2_SYNCHRO_DEBUG
913 	if (hammer2_debug & 1)
914 	kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
915 		chain,
916 		idx,
917 		focus->bref.type, focus->bref.key, mtid);
918 #endif
919 	hammer2_chain_unlock(chain);
920 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
921 	error = chain->error;
922 	if (error == 0) {
923 		if (chain->bytes != focus->bytes) {
924 			/* XXX what if compressed? */
925 			nradix = hammer2_getradix(chain->bytes);
926 			error = hammer2_chain_resize(chain, mtid, 0, nradix, 0);
927 			if (error)
928 				goto failed;
929 		}
930 		error = hammer2_chain_modify(chain, mtid, 0, 0);
931 		if (error)
932 			goto failed;
933 		otype = chain->bref.type;
934 		chain->bref.type = focus->bref.type;
935 		chain->bref.methods = focus->bref.methods;
936 		chain->bref.keybits = focus->bref.keybits;
937 		chain->bref.vradix = focus->bref.vradix;
938 		/* mirror_tid updated by flush */
939 		KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
940 		chain->bref.flags = focus->bref.flags;
941 		/* key already present */
942 		/* check code will be recalculated */
943 
944 		/*
945 		 * Copy data body.
946 		 */
947 		switch(chain->bref.type) {
948 		case HAMMER2_BREF_TYPE_INODE:
949 			/*
950 			 * Special case PFSROOTs, only limited changes can
951 			 * be made since the meta-data contains miscellanious
952 			 * distinguishing fields.
953 			 */
954 			if (isroot) {
955 				chain->data->ipdata.meta.uflags =
956 					focus->data->ipdata.meta.uflags;
957 				chain->data->ipdata.meta.rmajor =
958 					focus->data->ipdata.meta.rmajor;
959 				chain->data->ipdata.meta.rminor =
960 					focus->data->ipdata.meta.rminor;
961 				chain->data->ipdata.meta.ctime =
962 					focus->data->ipdata.meta.ctime;
963 				chain->data->ipdata.meta.mtime =
964 					focus->data->ipdata.meta.mtime;
965 				chain->data->ipdata.meta.atime =
966 					focus->data->ipdata.meta.atime;
967 				/* not btime */
968 				chain->data->ipdata.meta.uid =
969 					focus->data->ipdata.meta.uid;
970 				chain->data->ipdata.meta.gid =
971 					focus->data->ipdata.meta.gid;
972 				chain->data->ipdata.meta.mode =
973 					focus->data->ipdata.meta.mode;
974 				chain->data->ipdata.meta.ncopies =
975 					focus->data->ipdata.meta.ncopies;
976 				chain->data->ipdata.meta.comp_algo =
977 					focus->data->ipdata.meta.comp_algo;
978 				chain->data->ipdata.meta.check_algo =
979 					focus->data->ipdata.meta.check_algo;
980 				chain->data->ipdata.meta.data_quota =
981 					focus->data->ipdata.meta.data_quota;
982 				chain->data->ipdata.meta.inode_quota =
983 					focus->data->ipdata.meta.inode_quota;
984 
985 				/*
986 				 * last snapshot tid controls overwrite
987 				 */
988 				if (chain->data->ipdata.meta.pfs_lsnap_tid <
989 				    focus->data->ipdata.meta.pfs_lsnap_tid) {
990 					chain->data->ipdata.meta.pfs_lsnap_tid =
991 					focus->data->ipdata.meta.pfs_lsnap_tid;
992 				}
993 
994 				hammer2_chain_setcheck(chain, chain->data);
995 				break;
996 			}
997 
998 			/*
999 			 * Normal replacement.
1000 			 */
1001 			if ((focus->data->ipdata.meta.op_flags &
1002 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1003 				/*
1004 				 * If DIRECTDATA is transitioning to 0 or the
1005 				 * old chain is not an inode we have to
1006 				 * initialize the block table.
1007 				 */
1008 				if (otype != HAMMER2_BREF_TYPE_INODE ||
1009 				    (chain->data->ipdata.meta.op_flags &
1010 				     HAMMER2_OPFLAG_DIRECTDATA)) {
1011 					kprintf("chain inode trans "
1012 						"away from dd\n");
1013 					bzero(&chain->data->ipdata.u,
1014 					      sizeof(chain->data->ipdata.u));
1015 				}
1016 				bcopy(focus->data, chain->data,
1017 				      offsetof(hammer2_inode_data_t, u));
1018 				/* XXX setcheck on inode should not be needed */
1019 				hammer2_chain_setcheck(chain, chain->data);
1020 				break;
1021 			}
1022 			/* fall through */
1023 		case HAMMER2_BREF_TYPE_DATA:
1024 			bcopy(focus->data, chain->data, chain->bytes);
1025 			hammer2_chain_setcheck(chain, chain->data);
1026 			break;
1027 		case HAMMER2_BREF_TYPE_DIRENT:
1028 			/*
1029 			 * Directory entries embed data in the blockref.
1030 			 */
1031 			if (chain->bytes) {
1032 				bcopy(focus->data, chain->data, chain->bytes);
1033 				hammer2_chain_setcheck(chain, chain->data);
1034 			} else {
1035 				chain->bref.check = focus->bref.check;
1036 			}
1037 			chain->bref.embed = focus->bref.embed;
1038 			break;
1039 		default:
1040 			KKASSERT(0);
1041 			break;
1042 		}
1043 	}
1044 
1045 failed:
1046 	hammer2_chain_unlock(chain);
1047 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1048 				  HAMMER2_RESOLVE_MAYBE);
1049 
1050 	return error;
1051 }
1052