xref: /dragonfly/sys/vfs/hammer2/hammer2_synchro.c (revision 62dc643e)
1 /*
2  * Copyright (c) 2015-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51 
/*
 * Node of the LIFO deferral list built by hammer2_sync_slaves().  Each
 * entry records an inode whose recursive synchronization was postponed.
 * The list owns a reference on ip (dropped when the entry is freed).
 */
typedef struct hammer2_deferred_ip {
	struct hammer2_deferred_ip *next;	/* next entry (LIFO order) */
	hammer2_inode_t	*ip;			/* referenced deferred inode */
} hammer2_deferred_ip_t;
56 
/*
 * Head of the deferral list.  count bounds list growth; the sync scan
 * stops adding deferrals once count exceeds a threshold (see the
 * list->count > 1000 check in hammer2_sync_slaves()).
 */
typedef struct hammer2_deferred_list {
	hammer2_deferred_ip_t	*base;	/* LIFO head, NULL when empty */
	int			count;	/* number of entries on the list */
} hammer2_deferred_list_t;
61 
62 
63 #define HAMMER2_SYNCHRO_DEBUG 1
64 
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 				hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 				nerror = hammer2_sync_insert(
70 						thr, &parent, &chain,
71 						focus->bref.modify_tid,
72 						idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 			hammer2_tid_t modify_tid, int idx,
77 			hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 			hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 			hammer2_chain_t *parent, hammer2_chain_t *chain,
83 			hammer2_tid_t mtid, int idx,
84 			hammer2_chain_t *focus, int isroot);
85 
86 /****************************************************************************
87  *			    HAMMER2 SYNC THREADS 			    *
88  ****************************************************************************/
89 /*
90  * Primary management thread for an element of a node.  A thread will exist
91  * for each element requiring management.
92  *
93  * No management threads are needed for the SPMP or for any PMP with only
94  * a single MASTER.
95  *
96  * On the SPMP - handles bulkfree and dedup operations
97  * On a PFS    - handles remastering and synchronization
98  */
void
hammer2_primary_sync_thread(void *arg)
{
	hammer2_thread_t *thr = arg;
	hammer2_pfs_t *pmp;
	hammer2_deferred_list_t list;	/* LIFO of deferred inode syncs */
	hammer2_deferred_ip_t *defer;
	int error;
	uint32_t flags;			/* snapshot of thr->flags */
	uint32_t nflags;		/* proposed replacement flags */

	pmp = thr->pmp;
	bzero(&list, sizeof(list));

	for (;;) {
		/*
		 * Snapshot the flags.  cpu_ccfence() keeps the compiler
		 * from re-reading thr->flags between the snapshot and the
		 * atomic_cmpset operations below, which all compare
		 * against this snapshot.
		 */
		flags = thr->flags;
		cpu_ccfence();

		/*
		 * Handle stop request
		 */
		if (flags & HAMMER2_THREAD_STOP)
			break;

		/*
		 * Handle freeze request.  On a cmpset failure the flags
		 * changed underneath us, so restart the loop and
		 * re-snapshot.
		 */
		if (flags & HAMMER2_THREAD_FREEZE) {
			nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
					    HAMMER2_THREAD_CLIENTWAIT)) |
				 HAMMER2_THREAD_FROZEN;
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			/* wake any client blocked on the state change */
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		if (flags & HAMMER2_THREAD_UNFREEZE) {
			nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
					   HAMMER2_THREAD_FROZEN |
					   HAMMER2_THREAD_CLIENTWAIT);
			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
				continue;
			if (flags & HAMMER2_THREAD_CLIENTWAIT)
				wakeup(&thr->flags);
			flags = nflags;
			/* fall through */
		}

		/*
		 * Force idle if frozen until unfrozen or stopped.  The
		 * tsleep_interlock/cmpset pairing closes the race where
		 * another thread changes the flags (and wakes us) between
		 * the test and the sleep.
		 */
		if (flags & HAMMER2_THREAD_FROZEN) {
			nflags = flags | HAMMER2_THREAD_WAITING;
			tsleep_interlock(&thr->flags, 0);
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
				atomic_clear_int(&thr->flags,
						 HAMMER2_THREAD_WAITING);
			}
			continue;
		}

		/*
		 * Reset state on REMASTER request
		 */
		if (thr->flags & HAMMER2_THREAD_REMASTER) {
			nflags = flags & ~HAMMER2_THREAD_REMASTER;
			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
				/* reset state here */
			}
			continue;
		}

		/*
		 * Synchronization scan.
		 */
		if (hammer2_debug & 0x8000)
			kprintf("sync_slaves pfs %s clindex %d\n",
				pmp->pfs_names[thr->clindex], thr->clindex);
		hammer2_trans_init(pmp, 0);

		hammer2_inode_ref(pmp->iroot);

		for (;;) {
			int didbreak = 0;
			/* XXX lock synchronize pmp->modify_tid */
			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
			if (hammer2_debug & 0x8000) {
				kprintf("sync_slaves error %d defer %p\n",
					error, list.base);
			}
			/* EAGAIN means deferrals were queued; retry them */
			if (error != EAGAIN)
				break;
			while ((defer = list.base) != NULL) {
				hammer2_inode_t *nip;

				nip = defer->ip;
				error = hammer2_sync_slaves(thr, nip, &list,
							(nip == pmp->iroot));
				if (error && error != EAGAIN && error != ENOENT)
					break;
				if (hammer2_thr_break(thr)) {
					didbreak = 1;
					break;
				}

				/*
				 * If no additional defers occurred we can
				 * remove this one, otherwise keep it on
				 * the list and retry once the additional
				 * defers have completed.
				 */
				if (defer == list.base) {
					--list.count;
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
					defer = NULL;	/* safety */
					hammer2_inode_drop(nip);
				}
			}

			/*
			 * If the thread is being remastered, frozen, or
			 * stopped, clean up any left-over deferals.
			 */
			if (didbreak || (error && error != EAGAIN)) {
				kprintf("didbreak\n");
				while ((defer = list.base) != NULL) {
					--list.count;
					hammer2_inode_drop(defer->ip);
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
				}
				if (error == 0 || error == EAGAIN)
					error = EINPROGRESS;
				break;
			}
		}

		hammer2_inode_drop(pmp->iroot);
		hammer2_trans_done(pmp);

		if (error && error != EINPROGRESS)
			kprintf("hammer2_sync_slaves: error %d\n", error);

		/*
		 * Wait for event, or 5-second poll.
		 */
		nflags = flags | HAMMER2_THREAD_WAITING;
		tsleep_interlock(&thr->flags, 0);
		if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
			tsleep(&thr->flags, 0, "h2idle", hz * 5);
			atomic_clear_int(&thr->flags, HAMMER2_THREAD_WAITING);
		}
	}
	thr->td = NULL;
	hammer2_thr_return(thr, HAMMER2_THREAD_STOPPED);
	/* thr structure can go invalid after this point */
}
261 
#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 *
 * NOTE(review): currently compiled out (#if 0); purely diagnostic —
 * caches the flags in pmp->cluster_flags and kprintf's a human-readable
 * summary when they change.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;

	flags &= HAMMER2_CLUSTER_ZFLAGS;
	if (pmp->cluster_flags == flags)
		return;		/* no change, stay quiet */
	pmp->cluster_flags = flags;

	kprintf("pfs %p", pmp);
	if (flags & HAMMER2_CLUSTER_MSYNCED)
		kprintf(" masters-all-good");
	if (flags & HAMMER2_CLUSTER_SSYNCED)
		kprintf(" slaves-all-good");

	if (flags & HAMMER2_CLUSTER_WRHARD)
		kprintf(" quorum/rw");
	else if (flags & HAMMER2_CLUSTER_RDHARD)
		kprintf(" quorum/ro");

	if (flags & HAMMER2_CLUSTER_UNHARD)
		kprintf(" out-of-sync-masters");
	else if (flags & HAMMER2_CLUSTER_NOHARD)
		kprintf(" no-masters-visible");

	if (flags & HAMMER2_CLUSTER_WRSOFT)
		kprintf(" soft/rw");
	else if (flags & HAMMER2_CLUSTER_RDSOFT)
		kprintf(" soft/ro");

	if (flags & HAMMER2_CLUSTER_UNSOFT)
		kprintf(" out-of-sync-slaves");
	else if (flags & HAMMER2_CLUSTER_NOSOFT)
		kprintf(" no-slaves-visible");
	kprintf("\n");
}
#endif
306 
#if 0
/*
 * Debug helper (currently compiled out): print the chains of cparent and
 * cluster side-by-side, one row per cluster index, marking items flagged
 * HAMMER2_CITEM_INVALID with "(I)".  Only active when (hammer2_debug & 1).
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		if (i)
			kprintf("\t");
		kprintf("%d ", i);
		if ((chain = cparent->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cparent->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		} else {
			kprintf("      NULL      %s ", "   ");
		}
		if ((chain = cluster->array[i].chain) != NULL) {
			kprintf("%016jx%s ",
				chain->bref.key,
				((cluster->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		} else {
			kprintf("      NULL      %s ", "   ");
		}
		kprintf("\n");
	}
}
#endif
347 
348 /*
349  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
350  * the inode.  This creates a multiplication effect since the XOP scan itself
351  * issues to all nodes.  However, this is the only way we can safely
352  * synchronize nodes which might have disparate I/O bandwidths and the only
353  * way we can safely deal with stalled nodes.
354  */
355 static
356 int
357 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
358 		    hammer2_deferred_list_t *list, int isroot)
359 {
360 	hammer2_xop_scanall_t *xop;
361 	hammer2_chain_t *parent;
362 	hammer2_chain_t *chain;
363 	hammer2_pfs_t *pmp;
364 	hammer2_key_t key_next;
365 	hammer2_tid_t sync_tid;
366 	int cache_index = -1;
367 	int needrescan;
368 	int want_update;
369 	int error;
370 	int nerror;
371 	int idx;
372 	int n;
373 
374 	pmp = ip->pmp;
375 	idx = thr->clindex;	/* cluster node we are responsible for */
376 	needrescan = 0;
377 	want_update = 0;
378 	sync_tid = 0;
379 	chain = NULL;
380 	parent = NULL;
381 
382 #if 0
383 	/*
384 	 * Nothing to do if all slaves are synchronized.
385 	 * Nothing to do if cluster not authoritatively readable.
386 	 */
387 	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
388 		return(0);
389 	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
390 		return(HAMMER2_ERROR_INCOMPLETE);
391 #endif
392 
393 	error = 0;
394 
395 	/*
396 	 * Resolve the root inode of the PFS and determine if synchronization
397 	 * is needed by checking modify_tid.
398 	 *
399 	 * Retain the synchronization TID from the focus inode and use it
400 	 * later to synchronize the focus inode if/when the recursion
401 	 * succeeds.
402 	 */
403 	{
404 		hammer2_xop_ipcluster_t *xop2;
405 		hammer2_chain_t *focus;
406 
407 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
408 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
409 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
410 					 idx);
411 		hammer2_inode_unlock(ip);
412 		error = hammer2_xop_collect(&xop2->head, 0);
413 		if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) {
414 			sync_tid = focus->bref.modify_tid;
415 			chain = hammer2_inode_chain_and_parent(ip, idx,
416 						    &parent,
417 						    HAMMER2_RESOLVE_ALWAYS |
418 						    HAMMER2_RESOLVE_SHARED);
419 			want_update = (chain->bref.modify_tid != sync_tid);
420 			if (chain) {
421 				hammer2_chain_unlock(chain);
422 				hammer2_chain_drop(chain);
423 				chain = NULL;
424 			}
425 			if (parent) {
426 				hammer2_chain_unlock(parent);
427 				hammer2_chain_drop(parent);
428 				parent = NULL;
429 			}
430 		}
431 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
432 	}
433 
434 	if (want_update == 0)
435 		return(0);
436 
437 	/*
438 	 * The inode is left unlocked during the scan.  Issue a XOP
439 	 * that does *not* include our cluster index to iterate
440 	 * properly synchronized elements and resolve our cluster index
441 	 * against it.
442 	 */
443 	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
444 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
445 	xop->key_beg = HAMMER2_KEY_MIN;
446 	xop->key_end = HAMMER2_KEY_MAX;
447 	xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
448 			     HAMMER2_RESOLVE_ALWAYS;
449 	xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
450 			    HAMMER2_LOOKUP_NODIRECT |
451 			    HAMMER2_LOOKUP_ALWAYS;
452 	hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
453 	parent = hammer2_inode_chain(ip, idx,
454 				     HAMMER2_RESOLVE_ALWAYS |
455 				     HAMMER2_RESOLVE_SHARED);
456 	hammer2_inode_unlock(ip);
457 
458 	chain = hammer2_chain_lookup(&parent, &key_next,
459 				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
460 				     &cache_index,
461 				     HAMMER2_LOOKUP_SHARED |
462 				     HAMMER2_LOOKUP_NODIRECT |
463 				     HAMMER2_LOOKUP_NODATA);
464 	error = hammer2_xop_collect(&xop->head, 0);
465 	if (hammer2_debug & 0x8000) {
466 		kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
467 			ip->meta.name_key, chain,
468 			(chain ? chain->bref.key : -1));
469 	}
470 
471 	for (;;) {
472 		/*
473 		 * We are done if our scan is done and the XOP scan is done.
474 		 * We are done if the XOP scan failed (that is, we don't
475 		 * have authoritative data to synchronize with).
476 		 */
477 		int advance_local = 0;
478 		int advance_xop = 0;
479 		int dodefer = 0;
480 		hammer2_chain_t *focus;
481 
482 		if (chain == NULL && error == ENOENT)
483 			break;
484 		if (error && error != ENOENT)
485 			break;
486 
487 		/*
488 		 * Compare
489 		 */
490 		if (chain && error == ENOENT) {
491 			/*
492 			 * If we have local chains but the XOP scan is done,
493 			 * the chains need to be deleted.
494 			 */
495 			n = -1;
496 			focus = NULL;
497 		} else if (chain == NULL) {
498 			/*
499 			 * If our local scan is done but the XOP scan is not,
500 			 * we need to create the missing chain(s).
501 			 */
502 			n = 1;
503 			focus = xop->head.cluster.focus;
504 		} else {
505 			/*
506 			 * Otherwise compare to determine the action
507 			 * needed.
508 			 */
509 			focus = xop->head.cluster.focus;
510 			n = hammer2_chain_cmp(chain, focus);
511 		}
512 
513 		/*
514 		 * Take action based on comparison results.
515 		 */
516 		if (n < 0) {
517 			/*
518 			 * Delete extranious local data.  This will
519 			 * automatically advance the chain.
520 			 */
521 			nerror = hammer2_sync_destroy(thr, &parent, &chain,
522 						      0, idx);
523 		} else if (n == 0 && chain->bref.modify_tid !=
524 				     focus->bref.modify_tid) {
525 			/*
526 			 * Matching key but local data or meta-data requires
527 			 * updating.  If we will recurse, we still need to
528 			 * update to compatible content first but we do not
529 			 * synchronize modify_tid until the entire recursion
530 			 * has completed successfully.
531 			 *
532 			 * NOTE: Do not try to access hardlink pointers as if
533 			 *	 they were normal inodes, the inode cache will
534 			 *	 get seriously confused.
535 			 */
536 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
537 			    focus->data->ipdata.meta.type !=
538 			    HAMMER2_OBJTYPE_HARDLINK) {
539 				nerror = hammer2_sync_replace(
540 						thr, parent, chain,
541 						0,
542 						idx, focus, 0);
543 				dodefer = 1;
544 			} else {
545 				nerror = hammer2_sync_replace(
546 						thr, parent, chain,
547 						focus->bref.modify_tid,
548 						idx, focus, 0);
549 			}
550 			advance_local = 1;
551 			advance_xop = 1;
552 		} else if (n == 0) {
553 			/*
554 			 * 100% match, advance both
555 			 */
556 			advance_local = 1;
557 			advance_xop = 1;
558 			nerror = 0;
559 		} else if (n > 0) {
560 			/*
561 			 * Insert missing local data.
562 			 *
563 			 * If we will recurse, we still need to update to
564 			 * compatible content first but we do not synchronize
565 			 * modify_tid until the entire recursion has
566 			 * completed successfully.
567 			 *
568 			 * NOTE: Do not try to access hardlink pointers as if
569 			 *	 they were normal inodes, the inode cache will
570 			 *	 get seriously confused.
571 			 */
572 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
573 			    focus->data->ipdata.meta.type !=
574 			    HAMMER2_OBJTYPE_HARDLINK) {
575 				nerror = hammer2_sync_insert(
576 						thr, &parent, &chain,
577 						0,
578 						idx, focus);
579 				dodefer = 2;
580 			} else {
581 				nerror = hammer2_sync_insert(
582 						thr, &parent, &chain,
583 						focus->bref.modify_tid,
584 						idx, focus);
585 			}
586 			advance_local = 1;
587 			advance_xop = 1;
588 		}
589 
590 		/*
591 		 * We cannot recurse depth-first because the XOP is still
592 		 * running in node threads for this scan.  Create a placemarker
593 		 * by obtaining and record the hammer2_inode.
594 		 *
595 		 * We excluded our node from the XOP so we must temporarily
596 		 * add it to xop->head.cluster so it is properly incorporated
597 		 * into the inode.
598 		 *
599 		 * The deferral is pushed onto a LIFO list for bottom-up
600 		 * synchronization.
601 		 */
602 		if (error == 0 && dodefer) {
603 			hammer2_inode_t *nip;
604 			hammer2_deferred_ip_t *defer;
605 
606 			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
607 
608 			defer = kmalloc(sizeof(*defer), M_HAMMER2,
609 					M_WAITOK | M_ZERO);
610 			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
611 			xop->head.cluster.array[idx].flags =
612 							HAMMER2_CITEM_INVALID;
613 			xop->head.cluster.array[idx].chain = chain;
614 			nip = hammer2_inode_get(pmp, ip,
615 						&xop->head.cluster, idx);
616 			xop->head.cluster.array[idx].chain = NULL;
617 
618 			hammer2_inode_ref(nip);
619 			hammer2_inode_unlock(nip);
620 
621 			defer->next = list->base;
622 			defer->ip = nip;
623 			list->base = defer;
624 			++list->count;
625 			needrescan = 1;
626 		}
627 
628 		/*
629 		 * If at least one deferral was added and the deferral
630 		 * list has grown too large, stop adding more.  This
631 		 * will trigger an EAGAIN return.
632 		 */
633 		if (needrescan && list->count > 1000)
634 			break;
635 
636 		/*
637 		 * Advancements for iteration.
638 		 */
639 		if (advance_xop) {
640 			error = hammer2_xop_collect(&xop->head, 0);
641 		}
642 		if (advance_local) {
643 			chain = hammer2_chain_next(&parent, chain, &key_next,
644 						   key_next, HAMMER2_KEY_MAX,
645 						   &cache_index,
646 						   HAMMER2_LOOKUP_SHARED |
647 						   HAMMER2_LOOKUP_NODIRECT |
648 						   HAMMER2_LOOKUP_NODATA);
649 		}
650 	}
651 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
652 	if (chain) {
653 		hammer2_chain_unlock(chain);
654 		hammer2_chain_drop(chain);
655 	}
656 	if (parent) {
657 		hammer2_chain_unlock(parent);
658 		hammer2_chain_drop(parent);
659 	}
660 
661 	/*
662 	 * If we added deferrals we want the caller to synchronize them
663 	 * and then call us again.
664 	 *
665 	 * NOTE: In this situation we do not yet want to synchronize our
666 	 *	 inode, setting the error code also has that effect.
667 	 */
668 	if ((error == 0 || error == ENOENT) && needrescan)
669 		error = EAGAIN;
670 
671 	/*
672 	 * If no error occurred we can synchronize the inode meta-data
673 	 * and modify_tid.  Only limited changes are made to PFSROOTs.
674 	 *
675 	 * XXX inode lock was lost
676 	 */
677 	if (error == 0 || error == ENOENT) {
678 		hammer2_xop_ipcluster_t *xop2;
679 		hammer2_chain_t *focus;
680 
681 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
682 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
683 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
684 					 idx);
685 		hammer2_inode_unlock(ip);
686 		error = hammer2_xop_collect(&xop2->head, 0);
687 		if (error == 0) {
688 			focus = xop2->head.cluster.focus;
689 			if (hammer2_debug & 0x8000) {
690 				kprintf("syncthr: update inode %p (%s)\n",
691 					focus,
692 					(focus ? (char *)focus->data->
693 							 ipdata.filename :
694 						 "?"));
695 			}
696 			chain = hammer2_inode_chain_and_parent(ip, idx,
697 						    &parent,
698 						    HAMMER2_RESOLVE_ALWAYS |
699 						    HAMMER2_RESOLVE_SHARED);
700 
701 			KKASSERT(parent != NULL);
702 			nerror = hammer2_sync_replace(
703 					thr, parent, chain,
704 					sync_tid,
705 					idx, focus, isroot);
706 			hammer2_chain_unlock(chain);
707 			hammer2_chain_drop(chain);
708 			hammer2_chain_unlock(parent);
709 			hammer2_chain_drop(parent);
710 			/* XXX */
711 		}
712 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
713 	}
714 
715 	return error;
716 }
717 
718 /*
719  * Create a missing chain by copying the focus from another device.
720  *
721  * On entry *parentp and focus are both locked shared.  The chain will be
722  * created and returned in *chainp also locked shared.
723  */
static
int
hammer2_sync_insert(hammer2_thread_t *thr,
		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		    hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
{
	hammer2_chain_t *chain;
	hammer2_key_t dummy;
	int cache_index = -1;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
		*parentp,
		(*parentp)->bref.type,
		(*parentp)->bref.key,
		idx,
		focus->bref.type, focus->bref.key, mtid);
#endif

	/*
	 * Parent requires an exclusive lock for the insertion.
	 * We must unlock the child to avoid deadlocks while
	 * relocking the parent.
	 */
	if (*chainp) {
		hammer2_chain_unlock(*chainp);
		hammer2_chain_drop(*chainp);
		*chainp = NULL;
	}
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);

	/*
	 * We must reissue the lookup to properly position (*parentp)
	 * for the insertion.  The key cannot already be present on
	 * this node (we are inserting a missing element), hence the
	 * KKASSERT below.
	 */
	chain = hammer2_chain_lookup(parentp, &dummy,
				     focus->bref.key, focus->bref.key,
				     &cache_index,
				     HAMMER2_LOOKUP_NODIRECT |
				     HAMMER2_LOOKUP_ALWAYS);
	KKASSERT(chain == NULL);

	/* create the new chain under *parentp and mark it modified */
	chain = NULL;
	hammer2_chain_create(parentp, &chain,
			     thr->pmp, focus->bref.methods,
			     focus->bref.key, focus->bref.keybits,
			     focus->bref.type, focus->bytes,
			     mtid, 0, 0);
	hammer2_chain_modify(chain, mtid, 0, 0);

	/*
	 * Copy focus to new chain
	 */

	/* type already set */
	chain->bref.methods = focus->bref.methods;
	/* keybits already set */
	chain->bref.vradix = focus->bref.vradix;
	/* mirror_tid set by flush */
	KKASSERT(chain->bref.modify_tid == mtid);
	chain->bref.flags = focus->bref.flags;
	/* key already present */
	/* check code will be recalculated */

	/*
	 * Copy data body.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		if ((focus->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			/* do not copy block table */
			bcopy(focus->data, chain->data,
			      offsetof(hammer2_inode_data_t, u));
			break;
		}
		/* fall through copy whole thing */
	case HAMMER2_BREF_TYPE_DATA:
		bcopy(focus->data, chain->data, chain->bytes);
		hammer2_chain_setcheck(chain, chain->data);
		break;
	default:
		/* only inode and data brefs are synchronized here */
		KKASSERT(0);
		break;
	}

	hammer2_chain_unlock(chain);		/* unlock, leave ref */
	*chainp = chain;			/* will be returned locked */

	/*
	 * Avoid ordering deadlock when relocking shared: parent first,
	 * then child, both SHARED+ALWAYS to match the caller's state.
	 */
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
				     HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
				  HAMMER2_RESOLVE_ALWAYS);

	return 0;
}
826 
827 /*
828  * Destroy an extranious chain.
829  *
830  * Both *parentp and *chainp are locked shared.
831  *
832  * On return, *chainp will be adjusted to point to the next element in the
833  * iteration and locked shared.
834  */
static
int
hammer2_sync_destroy(hammer2_thread_t *thr,
		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_tid_t mtid, int idx)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_next;
	hammer2_key_t save_key;	/* restart key for the post-delete lookup */
	int cache_index = -1;

	chain = *chainp;

#if HAMMER2_SYNCHRO_DEBUG
	if (hammer2_debug & 1)
	kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
		*parentp, chain,
		idx, chain->bref.type, chain->bref.key);
#endif

	/*
	 * Remember the key one past the deleted element so the iteration
	 * can be resumed after the delete (clamped at HAMMER2_KEY_MAX).
	 */
	save_key = chain->bref.key;
	if (save_key != HAMMER2_KEY_MAX)
		++save_key;

	/*
	 * Try to avoid unnecessary I/O.
	 *
	 * XXX accounting not propagated up properly.  We might have to do
	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
	 */
	hammer2_chain_unlock(chain);	/* relock exclusive */
	hammer2_chain_unlock(*parentp);
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);

	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;			/* safety */

	/*
	 * Restore the caller's shared lock on the parent and reposition
	 * *chainp on the next element of the iteration.
	 */
	hammer2_chain_unlock(*parentp);	/* relock shared */
	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
				     HAMMER2_RESOLVE_ALWAYS);
	*chainp = hammer2_chain_lookup(parentp, &key_next,
				     save_key, HAMMER2_KEY_MAX,
				     &cache_index,
				     HAMMER2_LOOKUP_SHARED |
				     HAMMER2_LOOKUP_NODIRECT |
				     HAMMER2_LOOKUP_NODATA);
	return 0;
}
886 
887 /*
888  * cparent is locked exclusively, with an extra ref, cluster is not locked.
889  * Replace element [i] in the cluster.
890  */
891 static
892 int
893 hammer2_sync_replace(hammer2_thread_t *thr,
894 		     hammer2_chain_t *parent, hammer2_chain_t *chain,
895 		     hammer2_tid_t mtid, int idx,
896 		     hammer2_chain_t *focus, int isroot)
897 {
898 	int nradix;
899 	uint8_t otype;
900 
901 #if HAMMER2_SYNCHRO_DEBUG
902 	if (hammer2_debug & 1)
903 	kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
904 		chain,
905 		idx,
906 		focus->bref.type, focus->bref.key, mtid);
907 #endif
908 	hammer2_chain_unlock(chain);
909 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
910 	if (chain->bytes != focus->bytes) {
911 		/* XXX what if compressed? */
912 		nradix = hammer2_getradix(chain->bytes);
913 		hammer2_chain_resize(NULL, parent, chain,
914 				     mtid, 0,
915 				     nradix, 0);
916 	}
917 	hammer2_chain_modify(chain, mtid, 0, 0);
918 	otype = chain->bref.type;
919 	chain->bref.type = focus->bref.type;
920 	chain->bref.methods = focus->bref.methods;
921 	chain->bref.keybits = focus->bref.keybits;
922 	chain->bref.vradix = focus->bref.vradix;
923 	/* mirror_tid updated by flush */
924 	KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
925 	chain->bref.flags = focus->bref.flags;
926 	/* key already present */
927 	/* check code will be recalculated */
928 	chain->error = 0;
929 
930 	/*
931 	 * Copy data body.
932 	 */
933 	switch(chain->bref.type) {
934 	case HAMMER2_BREF_TYPE_INODE:
935 		/*
936 		 * Special case PFSROOTs, only limited changes can be made
937 		 * since the meta-data contains miscellanious distinguishing
938 		 * fields.
939 		 */
940 		if (isroot) {
941 			chain->data->ipdata.meta.uflags =
942 				focus->data->ipdata.meta.uflags;
943 			chain->data->ipdata.meta.rmajor =
944 				focus->data->ipdata.meta.rmajor;
945 			chain->data->ipdata.meta.rminor =
946 				focus->data->ipdata.meta.rminor;
947 			chain->data->ipdata.meta.ctime =
948 				focus->data->ipdata.meta.ctime;
949 			chain->data->ipdata.meta.mtime =
950 				focus->data->ipdata.meta.mtime;
951 			chain->data->ipdata.meta.atime =
952 				focus->data->ipdata.meta.atime;
953 			/* not btime */
954 			chain->data->ipdata.meta.uid =
955 				focus->data->ipdata.meta.uid;
956 			chain->data->ipdata.meta.gid =
957 				focus->data->ipdata.meta.gid;
958 			chain->data->ipdata.meta.mode =
959 				focus->data->ipdata.meta.mode;
960 			chain->data->ipdata.meta.ncopies =
961 				focus->data->ipdata.meta.ncopies;
962 			chain->data->ipdata.meta.comp_algo =
963 				focus->data->ipdata.meta.comp_algo;
964 			chain->data->ipdata.meta.check_algo =
965 				focus->data->ipdata.meta.check_algo;
966 			chain->data->ipdata.meta.data_quota =
967 				focus->data->ipdata.meta.data_quota;
968 			chain->data->ipdata.meta.inode_quota =
969 				focus->data->ipdata.meta.inode_quota;
970 
971 			/*
972 			 * last snapshot tid controls overwrite
973 			 */
974 			if (chain->data->ipdata.meta.pfs_lsnap_tid <
975 			    focus->data->ipdata.meta.pfs_lsnap_tid) {
976 				chain->data->ipdata.meta.pfs_lsnap_tid =
977 					focus->data->ipdata.meta.pfs_lsnap_tid;
978 			}
979 
980 			hammer2_chain_setcheck(chain, chain->data);
981 			break;
982 		}
983 
984 		/*
985 		 * Normal replacement.
986 		 */
987 		if ((focus->data->ipdata.meta.op_flags &
988 		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
989 			/*
990 			 * If DIRECTDATA is transitioning to 0 or the old
991 			 * chain is not an inode we have to initialize
992 			 * the block table.
993 			 */
994 			if (otype != HAMMER2_BREF_TYPE_INODE ||
995 			    (chain->data->ipdata.meta.op_flags &
996 			     HAMMER2_OPFLAG_DIRECTDATA)) {
997 				kprintf("chain inode trans away from dd\n");
998 				bzero(&chain->data->ipdata.u,
999 				      sizeof(chain->data->ipdata.u));
1000 			}
1001 			bcopy(focus->data, chain->data,
1002 			      offsetof(hammer2_inode_data_t, u));
1003 			/* XXX setcheck on inode should not be needed */
1004 			hammer2_chain_setcheck(chain, chain->data);
1005 			break;
1006 		}
1007 		/* fall through */
1008 	case HAMMER2_BREF_TYPE_DATA:
1009 		bcopy(focus->data, chain->data, chain->bytes);
1010 		hammer2_chain_setcheck(chain, chain->data);
1011 		break;
1012 	default:
1013 		KKASSERT(0);
1014 		break;
1015 	}
1016 
1017 	hammer2_chain_unlock(chain);
1018 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1019 				  HAMMER2_RESOLVE_MAYBE);
1020 
1021 	return 0;
1022 }
1023