xref: /dragonfly/sys/vfs/hammer2/hammer2_synchro.c (revision 0db87cb7)
1 /*
2  * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51 
/*
 * One entry on the synchronizer's deferred-inode list.  Entries are
 * singly linked through 'next'; each one records an inode whose
 * synchronization scan has been postponed.  The list code in this file
 * takes an inode reference when an entry is queued and drops it when the
 * entry is freed.
 */
typedef struct hammer2_deferred_ip {
	struct hammer2_deferred_ip *next;	/* next entry, NULL at the tail */
	hammer2_inode_t	*ip;			/* inode awaiting a scan */
} hammer2_deferred_ip_t;
56 
/*
 * Head of the deferred-inode list.  New entries are pushed at 'base'
 * (LIFO) and 'count' tracks how many entries are currently linked.
 */
typedef struct hammer2_deferred_list {
	hammer2_deferred_ip_t	*base;		/* most recently queued entry */
	int			count;		/* number of queued entries */
} hammer2_deferred_list_t;
61 
62 
63 #define HAMMER2_SYNCHRO_DEBUG 1
64 
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 				hammer2_deferred_list_t *list);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 				nerror = hammer2_sync_insert(
70 						thr, &parent, &chain,
71 						focus->bref.modify_tid,
72 						idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 			hammer2_tid_t modify_tid, int idx,
77 			hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 			hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 			hammer2_chain_t *parent, hammer2_chain_t *chain,
83 			hammer2_tid_t mtid, int idx,
84 			hammer2_chain_t *focus);
85 
86 /****************************************************************************
87  *			    HAMMER2 SYNC THREADS 			    *
88  ****************************************************************************/
89 /*
90  * Primary management thread for an element of a node.  A thread will exist
91  * for each element requiring management.
92  *
93  * No management threads are needed for the SPMP or for any PMP with only
94  * a single MASTER.
95  *
96  * On the SPMP - handles bulkfree and dedup operations
97  * On a PFS    - handles remastering and synchronization
98  */
99 void
100 hammer2_primary_sync_thread(void *arg)
101 {
102 	hammer2_thread_t *thr = arg;
103 	hammer2_pfs_t *pmp;
104 	hammer2_deferred_list_t list;
105 	hammer2_deferred_ip_t *defer;
106 	int error;
107 
108 	pmp = thr->pmp;
109 	bzero(&list, sizeof(list));
110 
111 	lockmgr(&thr->lk, LK_EXCLUSIVE);
112 	while ((thr->flags & HAMMER2_THREAD_STOP) == 0) {
113 		/*
114 		 * Handle freeze request
115 		 */
116 		if (thr->flags & HAMMER2_THREAD_FREEZE) {
117 			atomic_set_int(&thr->flags, HAMMER2_THREAD_FROZEN);
118 			atomic_clear_int(&thr->flags, HAMMER2_THREAD_FREEZE);
119 		}
120 
121 		/*
122 		 * Force idle if frozen until unfrozen or stopped.
123 		 */
124 		if (thr->flags & HAMMER2_THREAD_FROZEN) {
125 			lksleep(thr->xopq, &thr->lk, 0, "frozen", 0);
126 			continue;
127 		}
128 
129 		/*
130 		 * Reset state on REMASTER request
131 		 */
132 		if (thr->flags & HAMMER2_THREAD_REMASTER) {
133 			atomic_clear_int(&thr->flags, HAMMER2_THREAD_REMASTER);
134 			/* reset state */
135 		}
136 
137 		/*
138 		 * Synchronization scan.
139 		 */
140 		kprintf("sync_slaves pfs %s clindex %d\n",
141 			pmp->pfs_names[thr->clindex], thr->clindex);
142 		hammer2_trans_init(pmp, 0);
143 
144 		hammer2_inode_ref(pmp->iroot);
145 
146 		for (;;) {
147 			int didbreak = 0;
148 			/* XXX lock synchronize pmp->modify_tid */
149 			error = hammer2_sync_slaves(thr, pmp->iroot, &list);
150 			if (error != EAGAIN)
151 				break;
152 			while ((defer = list.base) != NULL) {
153 				hammer2_inode_t *nip;
154 
155 				nip = defer->ip;
156 				error = hammer2_sync_slaves(thr, nip, &list);
157 				if (error && error != EAGAIN)
158 					break;
159 				if (hammer2_thr_break(thr)) {
160 					didbreak = 1;
161 					break;
162 				}
163 
164 				/*
165 				 * If no additional defers occurred we can
166 				 * remove this one, otherwrise keep it on
167 				 * the list and retry once the additional
168 				 * defers have completed.
169 				 */
170 				if (defer == list.base) {
171 					--list.count;
172 					list.base = defer->next;
173 					kfree(defer, M_HAMMER2);
174 					defer = NULL;	/* safety */
175 					hammer2_inode_drop(nip);
176 				}
177 			}
178 
179 			/*
180 			 * If the thread is being remastered, frozen, or
181 			 * stopped, clean up any left-over deferals.
182 			 */
183 			if (didbreak || (error && error != EAGAIN)) {
184 				kprintf("didbreak\n");
185 				while ((defer = list.base) != NULL) {
186 					--list.count;
187 					hammer2_inode_drop(defer->ip);
188 					list.base = defer->next;
189 					kfree(defer, M_HAMMER2);
190 				}
191 				if (error == 0 || error == EAGAIN)
192 					error = EINPROGRESS;
193 				break;
194 			}
195 		}
196 
197 		hammer2_inode_drop(pmp->iroot);
198 		hammer2_trans_done(pmp);
199 
200 		if (error)
201 			kprintf("hammer2_sync_slaves: error %d\n", error);
202 
203 		/*
204 		 * Wait for event, or 5-second poll.
205 		 */
206 		lksleep(thr->xopq, &thr->lk, 0, "h2idle", hz * 5);
207 	}
208 	thr->td = NULL;
209 	wakeup(thr);
210 	lockmgr(&thr->lk, LK_RELEASE);
211 	/* thr structure can go invalid after this point */
212 }
213 
#if 0
/*
 * Record a new set of cluster status flags on the PFS and print a
 * one-line summary of them.  Only the HAMMER2_CLUSTER_ZFLAGS bits of
 * 'flags' are considered; nothing is stored or printed when they match
 * what pmp->cluster_flags already holds.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;

	flags &= HAMMER2_CLUSTER_ZFLAGS;
	if (pmp->cluster_flags != flags) {
		pmp->cluster_flags = flags;

		kprintf("pfs %p", pmp);

		/* synchronization state of masters and slaves */
		if (flags & HAMMER2_CLUSTER_MSYNCED) {
			kprintf(" masters-all-good");
		}
		if (flags & HAMMER2_CLUSTER_SSYNCED) {
			kprintf(" slaves-all-good");
		}

		/* hard (quorum) access, write takes precedence */
		if (flags & HAMMER2_CLUSTER_WRHARD) {
			kprintf(" quorum/rw");
		} else if (flags & HAMMER2_CLUSTER_RDHARD) {
			kprintf(" quorum/ro");
		}

		/* master visibility */
		if (flags & HAMMER2_CLUSTER_UNHARD) {
			kprintf(" out-of-sync-masters");
		} else if (flags & HAMMER2_CLUSTER_NOHARD) {
			kprintf(" no-masters-visible");
		}

		/* soft access, write takes precedence */
		if (flags & HAMMER2_CLUSTER_WRSOFT) {
			kprintf(" soft/rw");
		} else if (flags & HAMMER2_CLUSTER_RDSOFT) {
			kprintf(" soft/ro");
		}

		/* slave visibility */
		if (flags & HAMMER2_CLUSTER_UNSOFT) {
			kprintf(" out-of-sync-slaves");
		} else if (flags & HAMMER2_CLUSTER_NOSOFT) {
			kprintf(" no-slaves-visible");
		}
		kprintf("\n");
	}
}
#endif
258 
#if 0
/*
 * Debug helper: print, for every cluster index, the key of the chain
 * held by 'cparent' followed by the key of the chain held by 'cluster',
 * tagging items flagged HAMMER2_CITEM_INVALID with "(I)".  Output is
 * suppressed unless (hammer2_debug & 1) is set.  Both clusters must
 * have the same nchains.
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *pair[2];
	int i;
	int j;

	if ((hammer2_debug & 1) == 0)
		return;

	pair[0] = cparent;
	pair[1] = cluster;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		/* rows after the first are indented to line up with it */
		if (i)
			kprintf("\t");
		kprintf("%d ", i);

		/* column 0 is cparent, column 1 is cluster */
		for (j = 0; j < 2; ++j) {
			hammer2_chain_t *chain;

			chain = pair[j]->array[i].chain;
			if (chain == NULL) {
				kprintf("      NULL      %s ", "   ");
				continue;
			}
			kprintf("%016jx%s ",
				chain->bref.key,
				((pair[j]->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		}
		kprintf("\n");
	}
}
#endif
299 
300 /*
301  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
302  * the inode.  This creates a multiplication effect since the XOP scan itself
303  * issues to all nodes.  However, this is the only way we can safely
304  * synchronize nodes which might have disparate I/O bandwidths and the only
305  * way we can safely deal with stalled nodes.
306  */
307 static
308 int
309 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
310 		    hammer2_deferred_list_t *list)
311 {
312 	hammer2_xop_scanall_t *xop;
313 	hammer2_chain_t *parent;
314 	hammer2_chain_t *chain;
315 	hammer2_pfs_t *pmp;
316 	hammer2_key_t key_next;
317 	hammer2_tid_t sync_tid;
318 	int cache_index = -1;
319 	int needrescan;
320 	int wantupdate;
321 	int error;
322 	int nerror;
323 	int idx;
324 	int n;
325 
326 	pmp = ip->pmp;
327 	idx = thr->clindex;	/* cluster node we are responsible for */
328 	needrescan = 0;
329 	wantupdate = 0;
330 
331 	if (ip->cluster.focus == NULL)
332 		return (EINPROGRESS);
333 	sync_tid = ip->cluster.focus->bref.modify_tid;
334 
335 #if 0
336 	/*
337 	 * Nothing to do if all slaves are synchronized.
338 	 * Nothing to do if cluster not authoritatively readable.
339 	 */
340 	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
341 		return(0);
342 	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
343 		return(HAMMER2_ERROR_INCOMPLETE);
344 #endif
345 
346 	error = 0;
347 
348 	/*
349 	 * The inode is left unlocked during the scan.  Issue a XOP
350 	 * that does *not* include our cluster index to iterate
351 	 * properly synchronized elements and resolve our cluster index
352 	 * against it.
353 	 */
354 	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
355 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
356 	xop->key_beg = HAMMER2_KEY_MIN;
357 	xop->key_end = HAMMER2_KEY_MAX;
358 	hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
359 	parent = hammer2_inode_chain(ip, idx,
360 				     HAMMER2_RESOLVE_ALWAYS |
361 				     HAMMER2_RESOLVE_SHARED);
362 	if (parent->bref.modify_tid != sync_tid)
363 		wantupdate = 1;
364 
365 	hammer2_inode_unlock(ip);
366 
367 	chain = hammer2_chain_lookup(&parent, &key_next,
368 				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
369 				     &cache_index,
370 				     HAMMER2_LOOKUP_SHARED |
371 				     HAMMER2_LOOKUP_NODIRECT |
372 				     HAMMER2_LOOKUP_NODATA);
373 	error = hammer2_xop_collect(&xop->head, 0);
374 	kprintf("XOP_INITIAL xop=%p clindex %d on %s\n", xop, thr->clindex,
375 		pmp->pfs_names[thr->clindex]);
376 
377 	for (;;) {
378 		/*
379 		 * We are done if our scan is done and the XOP scan is done.
380 		 * We are done if the XOP scan failed (that is, we don't
381 		 * have authoritative data to synchronize with).
382 		 */
383 		int advance_local = 0;
384 		int advance_xop = 0;
385 		int dodefer = 0;
386 		hammer2_chain_t *focus;
387 
388 		kprintf("loop xop=%p chain[1]=%p lockcnt=%d\n",
389 			xop, xop->head.cluster.array[1].chain,
390 			(xop->head.cluster.array[1].chain ?
391 			    xop->head.cluster.array[1].chain->lockcnt : -1)
392 			);
393 
394 		if (chain == NULL && error == ENOENT)
395 			break;
396 		if (error && error != ENOENT)
397 			break;
398 
399 		/*
400 		 * Compare
401 		 */
402 		if (chain && error == ENOENT) {
403 			/*
404 			 * If we have local chains but the XOP scan is done,
405 			 * the chains need to be deleted.
406 			 */
407 			n = -1;
408 			focus = NULL;
409 		} else if (chain == NULL) {
410 			/*
411 			 * If our local scan is done but the XOP scan is not,
412 			 * we need to create the missing chain(s).
413 			 */
414 			n = 1;
415 			focus = xop->head.cluster.focus;
416 		} else {
417 			/*
418 			 * Otherwise compare to determine the action
419 			 * needed.
420 			 */
421 			focus = xop->head.cluster.focus;
422 			n = hammer2_chain_cmp(chain, focus);
423 		}
424 
425 		/*
426 		 * Take action based on comparison results.
427 		 */
428 		if (n < 0) {
429 			/*
430 			 * Delete extranious local data.  This will
431 			 * automatically advance the chain.
432 			 */
433 			nerror = hammer2_sync_destroy(thr, &parent, &chain,
434 						      0, idx);
435 		} else if (n == 0 && chain->bref.modify_tid !=
436 				     focus->bref.modify_tid) {
437 			/*
438 			 * Matching key but local data or meta-data requires
439 			 * updating.  If we will recurse, we still need to
440 			 * update to compatible content first but we do not
441 			 * synchronize modify_tid until the entire recursion
442 			 * has completed successfully.
443 			 */
444 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
445 				nerror = hammer2_sync_replace(
446 						thr, parent, chain,
447 						0,
448 						idx, focus);
449 				dodefer = 1;
450 			} else {
451 				nerror = hammer2_sync_replace(
452 						thr, parent, chain,
453 						focus->bref.modify_tid,
454 						idx, focus);
455 			}
456 		} else if (n == 0) {
457 			/*
458 			 * 100% match, advance both
459 			 */
460 			advance_local = 1;
461 			advance_xop = 1;
462 			nerror = 0;
463 		} else if (n > 0) {
464 			/*
465 			 * Insert missing local data.
466 			 *
467 			 * If we will recurse, we still need to update to
468 			 * compatible content first but we do not synchronize
469 			 * modify_tid until the entire recursion has
470 			 * completed successfully.
471 			 */
472 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
473 				nerror = hammer2_sync_insert(
474 						thr, &parent, &chain,
475 						0,
476 						idx, focus);
477 				dodefer = 2;
478 			} else {
479 				nerror = hammer2_sync_insert(
480 						thr, &parent, &chain,
481 						focus->bref.modify_tid,
482 						idx, focus);
483 			}
484 			advance_local = 1;
485 			advance_xop = 1;
486 		}
487 
488 		/*
489 		 * We cannot recurse depth-first because the XOP is still
490 		 * running in node threads for this scan.  Create a placemarker
491 		 * by obtaining and record the hammer2_inode.
492 		 *
493 		 * We excluded our node from the XOP so we must temporarily
494 		 * add it to xop->head.cluster so it is properly incorporated
495 		 * into the inode.
496 		 *
497 		 * The deferral is pushed onto a LIFO list for bottom-up
498 		 * synchronization.
499 		 */
500 		if (error == 0 && dodefer) {
501 			hammer2_inode_t *nip;
502 			hammer2_deferred_ip_t *defer;
503 
504 			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
505 
506 			defer = kmalloc(sizeof(*defer), M_HAMMER2,
507 					M_WAITOK | M_ZERO);
508 			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
509 			xop->head.cluster.array[idx].flags =
510 							HAMMER2_CITEM_INVALID;
511 			xop->head.cluster.array[idx].chain = chain;
512 			nip = hammer2_inode_get(pmp, ip,
513 						&xop->head.cluster, idx);
514 			xop->head.cluster.array[idx].chain = NULL;
515 
516 			hammer2_inode_ref(nip);
517 			hammer2_inode_unlock(nip);
518 
519 			defer->next = list->base;
520 			defer->ip = nip;
521 			list->base = defer;
522 			++list->count;
523 			needrescan = 1;
524 		}
525 
526 		/*
527 		 * If at least one deferral was added and the deferral
528 		 * list has grown too large, stop adding more.  This
529 		 * will trigger an EAGAIN return.
530 		 */
531 		if (needrescan && list->count > 1000)
532 			break;
533 
534 		/*
535 		 * Advancements for iteration.
536 		 */
537 		if (advance_xop) {
538 			error = hammer2_xop_collect(&xop->head, 0);
539 		}
540 		if (advance_local) {
541 			chain = hammer2_chain_next(&parent, chain, &key_next,
542 						   key_next, HAMMER2_KEY_MAX,
543 						   &cache_index,
544 						   HAMMER2_LOOKUP_SHARED |
545 						   HAMMER2_LOOKUP_NODIRECT |
546 						   HAMMER2_LOOKUP_NODATA);
547 		}
548 	}
549 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
550 	if (chain) {
551 		hammer2_chain_unlock(chain);
552 		hammer2_chain_drop(chain);
553 	}
554 	if (parent) {
555 		hammer2_chain_unlock(parent);
556 		hammer2_chain_drop(parent);
557 	}
558 
559 	/*
560 	 * If we added deferrals we want the caller to synchronize them
561 	 * and then call us again.
562 	 *
563 	 * NOTE: In this situation we do not yet want to synchronize our
564 	 *	 inode, setting the error code also has that effect.
565 	 */
566 	if (error == 0 && needrescan)
567 		error = EAGAIN;
568 
569 	/*
570 	 * If no error occurred and work was performed, synchronize the
571 	 * inode meta-data itself.
572 	 *
573 	 * XXX inode lock was lost
574 	 */
575 	if (error == 0 && wantupdate) {
576 		hammer2_xop_ipcluster_t *xop2;
577 		hammer2_chain_t *focus;
578 
579 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
580 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
581 					 idx);
582 		error = hammer2_xop_collect(&xop2->head, 0);
583 		if (error == 0) {
584 			focus = xop2->head.cluster.focus;
585 			kprintf("syncthr: update inode %p (%s)\n",
586 				focus,
587 				(focus ?
588 				 (char *)focus->data->ipdata.filename : "?"));
589 			chain = hammer2_inode_chain_and_parent(ip, idx,
590 						    &parent,
591 						    HAMMER2_RESOLVE_ALWAYS |
592 						    HAMMER2_RESOLVE_SHARED);
593 
594 			KKASSERT(parent != NULL);
595 			nerror = hammer2_sync_replace(
596 					thr, parent, chain,
597 					sync_tid,
598 					idx, focus);
599 			hammer2_chain_unlock(chain);
600 			hammer2_chain_drop(chain);
601 			hammer2_chain_unlock(parent);
602 			hammer2_chain_drop(parent);
603 			/* XXX */
604 		}
605 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
606 	}
607 
608 	return error;
609 }
610 
611 /*
612  * Create a missing chain by copying the focus from another device.
613  *
614  * On entry *parentp and focus are both locked shared.  The chain will be
615  * created and returned in *chainp also locked shared.
616  */
617 static
618 int
619 hammer2_sync_insert(hammer2_thread_t *thr,
620 		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
621 		    hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
622 {
623 	hammer2_chain_t *chain;
624 
625 #if HAMMER2_SYNCHRO_DEBUG
626 	if (hammer2_debug & 1)
627 	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
628 		*parentp,
629 		(*parentp)->bref.type,
630 		(*parentp)->bref.key,
631 		idx,
632 		focus->bref.type, focus->bref.key, mtid);
633 #endif
634 
635 	/*
636 	 * Create the missing chain.  Exclusive locks are needed.
637 	 *
638 	 * Have to be careful to avoid deadlocks.
639 	 */
640 	if (*chainp)
641 		hammer2_chain_unlock(*chainp);
642 	hammer2_chain_unlock(*parentp);
643 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
644 	/* reissue lookup? */
645 
646 	chain = NULL;
647 	hammer2_chain_create(parentp, &chain, thr->pmp,
648 			     focus->bref.key, focus->bref.keybits,
649 			     focus->bref.type, focus->bytes,
650 			     mtid, 0, 0);
651 	hammer2_chain_modify(chain, mtid, 0, 0);
652 
653 	/*
654 	 * Copy focus to new chain
655 	 */
656 
657 	/* type already set */
658 	chain->bref.methods = focus->bref.methods;
659 	/* keybits already set */
660 	chain->bref.vradix = focus->bref.vradix;
661 	/* mirror_tid set by flush */
662 	KKASSERT(chain->bref.modify_tid == mtid);
663 	chain->bref.flags = focus->bref.flags;
664 	/* key already present */
665 	/* check code will be recalculated */
666 
667 	/*
668 	 * Copy data body.
669 	 */
670 	switch(chain->bref.type) {
671 	case HAMMER2_BREF_TYPE_INODE:
672 		if ((focus->data->ipdata.meta.op_flags &
673 		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
674 			bcopy(focus->data, chain->data,
675 			      offsetof(hammer2_inode_data_t, u));
676 			break;
677 		}
678 		/* fall through */
679 	case HAMMER2_BREF_TYPE_DATA:
680 		bcopy(focus->data, chain->data, chain->bytes);
681 		hammer2_chain_setcheck(chain, chain->data);
682 		break;
683 	default:
684 		KKASSERT(0);
685 		break;
686 	}
687 
688 	hammer2_chain_unlock(chain);		/* unlock, leave ref */
689 	if (*chainp)
690 		hammer2_chain_drop(*chainp);
691 	*chainp = chain;			/* will be returned locked */
692 
693 	/*
694 	 * Avoid ordering deadlock when relocking.
695 	 */
696 	hammer2_chain_unlock(*parentp);
697 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
698 				     HAMMER2_RESOLVE_ALWAYS);
699 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
700 				  HAMMER2_RESOLVE_ALWAYS);
701 
702 	return 0;
703 }
704 
705 /*
706  * Destroy an extranious chain.
707  *
708  * Both *parentp and *chainp are locked shared.
709  *
710  * On return, *chainp will be adjusted to point to the next element in the
711  * iteration and locked shared.
712  */
713 static
714 int
715 hammer2_sync_destroy(hammer2_thread_t *thr,
716 		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
717 		     hammer2_tid_t mtid, int idx)
718 {
719 	hammer2_chain_t *chain;
720 	hammer2_chain_t *parent;
721 	hammer2_key_t key_next;
722 	hammer2_key_t save_key;
723 	int cache_index = -1;
724 
725 	chain = *chainp;
726 
727 #if HAMMER2_SYNCHRO_DEBUG
728 	if (hammer2_debug & 1)
729 	kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
730 		*parentp, chain,
731 		idx, chain->bref.type, chain->bref.key);
732 #endif
733 
734 	save_key = chain->bref.key;
735 	if (save_key != HAMMER2_KEY_MAX)
736 		++save_key;
737 
738 	/*
739 	 * Try to avoid unnecessary I/O.
740 	 *
741 	 * XXX accounting not propagated up properly.  We might have to do
742 	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
743 	 */
744 	hammer2_chain_unlock(chain);	/* relock exclusive */
745 	hammer2_chain_unlock(*parentp);
746 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
747 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
748 
749 	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
750 	hammer2_chain_unlock(chain);
751 	hammer2_chain_drop(chain);
752 	chain = NULL;			/* safety */
753 
754 	hammer2_chain_unlock(*parentp);	/* relock shared */
755 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
756 				     HAMMER2_RESOLVE_ALWAYS);
757 	*chainp = hammer2_chain_lookup(&parent, &key_next,
758 				     save_key, HAMMER2_KEY_MAX,
759 				     &cache_index,
760 				     HAMMER2_LOOKUP_SHARED |
761 				     HAMMER2_LOOKUP_NODIRECT |
762 				     HAMMER2_LOOKUP_NODATA);
763 	return 0;
764 }
765 
766 /*
767  * cparent is locked exclusively, with an extra ref, cluster is not locked.
768  * Replace element [i] in the cluster.
769  */
770 static
771 int
772 hammer2_sync_replace(hammer2_thread_t *thr,
773 		     hammer2_chain_t *parent, hammer2_chain_t *chain,
774 		     hammer2_tid_t mtid, int idx,
775 		     hammer2_chain_t *focus)
776 {
777 	int nradix;
778 	uint8_t otype;
779 
780 #if HAMMER2_SYNCHRO_DEBUG
781 	if (hammer2_debug & 1)
782 	kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
783 		chain,
784 		idx,
785 		focus->bref.type, focus->bref.key, mtid);
786 #endif
787 	hammer2_chain_unlock(chain);
788 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
789 	if (chain->bytes != focus->bytes) {
790 		/* XXX what if compressed? */
791 		nradix = hammer2_getradix(chain->bytes);
792 		hammer2_chain_resize(NULL, parent, chain,
793 				     mtid, 0,
794 				     nradix, 0);
795 	}
796 	hammer2_chain_modify(chain, mtid, 0, 0);
797 	otype = chain->bref.type;
798 	chain->bref.type = focus->bref.type;
799 	chain->bref.methods = focus->bref.methods;
800 	chain->bref.keybits = focus->bref.keybits;
801 	chain->bref.vradix = focus->bref.vradix;
802 	/* mirror_tid updated by flush */
803 	KKASSERT(chain->bref.modify_tid == mtid);
804 	chain->bref.flags = focus->bref.flags;
805 	/* key already present */
806 	/* check code will be recalculated */
807 	chain->error = 0;
808 
809 	/*
810 	 * Copy data body.
811 	 */
812 	switch(chain->bref.type) {
813 	case HAMMER2_BREF_TYPE_INODE:
814 		if ((focus->data->ipdata.meta.op_flags &
815 		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
816 			/*
817 			 * If DIRECTDATA is transitioning to 0 or the old
818 			 * chain is not an inode we have to initialize
819 			 * the block table.
820 			 */
821 			if (otype != HAMMER2_BREF_TYPE_INODE ||
822 			    (chain->data->ipdata.meta.op_flags &
823 			     HAMMER2_OPFLAG_DIRECTDATA)) {
824 				kprintf("chain inode trans away from dd\n");
825 				bzero(&chain->data->ipdata.u,
826 				      sizeof(chain->data->ipdata.u));
827 			}
828 			bcopy(focus->data, chain->data,
829 			      offsetof(hammer2_inode_data_t, u));
830 			/* XXX setcheck on inode should not be needed */
831 			hammer2_chain_setcheck(chain, chain->data);
832 			break;
833 		}
834 		/* fall through */
835 	case HAMMER2_BREF_TYPE_DATA:
836 		bcopy(focus->data, chain->data, chain->bytes);
837 		hammer2_chain_setcheck(chain, chain->data);
838 		break;
839 	default:
840 		KKASSERT(0);
841 		break;
842 	}
843 
844 	hammer2_chain_unlock(chain);
845 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
846 				  HAMMER2_RESOLVE_MAYBE);
847 
848 	return 0;
849 }
850