xref: /dragonfly/sys/vfs/hammer2/hammer2_synchro.c (revision e98bdfd3)
1 /*
2  * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51 
/*
 * One entry on the deferred-inode list: records an inode whose
 * synchronization scan has been postponed by hammer2_sync_slaves().
 */
typedef struct hammer2_deferred_ip {
	struct hammer2_deferred_ip *next;	/* next entry, NULL terminates */
	hammer2_inode_t	*ip;			/* deferred inode (list holds a ref) */
} hammer2_deferred_ip_t;
56 
/*
 * Head of the deferred-inode list.  Entries are pushed and popped at
 * 'base', so the list behaves as a LIFO.
 */
typedef struct hammer2_deferred_list {
	hammer2_deferred_ip_t	*base;	/* first (most recently pushed) entry */
	int			count;	/* number of entries currently linked */
} hammer2_deferred_list_t;
61 
62 
63 #define HAMMER2_SYNCHRO_DEBUG 1
64 
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 				hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 				nerror = hammer2_sync_insert(
70 						thr, &parent, &chain,
71 						focus->bref.modify_tid,
72 						idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 			hammer2_tid_t modify_tid, int idx,
77 			hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 			hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 			hammer2_chain_t *parent, hammer2_chain_t *chain,
83 			hammer2_tid_t mtid, int idx,
84 			hammer2_chain_t *focus, int isroot);
85 
86 /****************************************************************************
87  *			    HAMMER2 SYNC THREADS 			    *
88  ****************************************************************************/
/*
 * Primary management thread for an element of a node.  A thread will exist
 * for each element requiring management.
 *
 * No management threads are needed for the SPMP or for any PMP with only
 * a single MASTER.
 *
 * On the SPMP - handles bulkfree and dedup operations
 * On a PFS    - handles remastering and synchronization
 *
 * 'arg' is the hammer2_thread_t describing this thread.  The thread takes
 * thr->lk exclusively and loops until HAMMER2_THREAD_STOP is set in
 * thr->flags.  Each pass it services FREEZE/REMASTER requests, runs a
 * synchronization scan rooted at pmp->iroot inside a transaction, and
 * then sleeps on thr->xopq for up to five seconds.  On exit it clears
 * thr->td and issues wakeup(thr) before releasing the lock.
 */
void
hammer2_primary_sync_thread(void *arg)
{
	hammer2_thread_t *thr = arg;
	hammer2_pfs_t *pmp;
	hammer2_deferred_list_t list;	/* inodes whose scan was deferred */
	hammer2_deferred_ip_t *defer;
	int error;

	pmp = thr->pmp;
	bzero(&list, sizeof(list));

	lockmgr(&thr->lk, LK_EXCLUSIVE);
	while ((thr->flags & HAMMER2_THREAD_STOP) == 0) {
		/*
		 * Handle freeze request: acknowledge by setting FROZEN and
		 * clearing the FREEZE request bit.
		 */
		if (thr->flags & HAMMER2_THREAD_FREEZE) {
			atomic_set_int(&thr->flags, HAMMER2_THREAD_FROZEN);
			atomic_clear_int(&thr->flags, HAMMER2_THREAD_FREEZE);
		}

		/*
		 * Force idle if frozen until unfrozen or stopped.  The
		 * loop condition re-checks STOP after each wakeup.
		 */
		if (thr->flags & HAMMER2_THREAD_FROZEN) {
			lksleep(thr->xopq, &thr->lk, 0, "frozen", 0);
			continue;
		}

		/*
		 * Reset state on REMASTER request
		 */
		if (thr->flags & HAMMER2_THREAD_REMASTER) {
			atomic_clear_int(&thr->flags, HAMMER2_THREAD_REMASTER);
			/* reset state */
		}

		/*
		 * Synchronization scan.
		 */
		if (hammer2_debug & 0x8000)
			kprintf("sync_slaves pfs %s clindex %d\n",
				pmp->pfs_names[thr->clindex], thr->clindex);
		hammer2_trans_init(pmp, 0);

		/* hold the root inode for the duration of the scan */
		hammer2_inode_ref(pmp->iroot);

		for (;;) {
			int didbreak = 0;
			/* XXX lock synchronize pmp->modify_tid */
			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
			if (hammer2_debug & 0x8000) {
				kprintf("sync_slaves error %d defer %p\n",
					error, list.base);
			}

			/*
			 * EAGAIN means deferrals were queued and the root
			 * must be rescanned after they are processed.  Any
			 * other result ends this pass.
			 */
			if (error != EAGAIN)
				break;
			while ((defer = list.base) != NULL) {
				hammer2_inode_t *nip;

				nip = defer->ip;
				error = hammer2_sync_slaves(thr, nip, &list, 0);
				if (error && error != EAGAIN && error != ENOENT)
					break;
				if (hammer2_thr_break(thr)) {
					didbreak = 1;
					break;
				}

				/*
				 * If no additional defers occurred we can
				 * remove this one, otherwise keep it on
				 * the list and retry once the additional
				 * defers have completed.
				 */
				if (defer == list.base) {
					--list.count;
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
					defer = NULL;	/* safety */
					hammer2_inode_drop(nip);
				}
			}

			/*
			 * If the thread is being remastered, frozen, or
			 * stopped, clean up any left-over deferrals.  The
			 * same cleanup runs when the deferral loop ended
			 * with an error other than EAGAIN.
			 */
			if (didbreak || (error && error != EAGAIN)) {
				kprintf("didbreak\n");
				while ((defer = list.base) != NULL) {
					--list.count;
					hammer2_inode_drop(defer->ip);
					list.base = defer->next;
					kfree(defer, M_HAMMER2);
				}
				/* interrupted without a hard error */
				if (error == 0 || error == EAGAIN)
					error = EINPROGRESS;
				break;
			}
		}

		hammer2_inode_drop(pmp->iroot);
		hammer2_trans_done(pmp);

		/* EINPROGRESS only marks an interrupted pass, do not report */
		if (error && error != EINPROGRESS)
			kprintf("hammer2_sync_slaves: error %d\n", error);

		/*
		 * Wait for event, or 5-second poll.
		 */
		lksleep(thr->xopq, &thr->lk, 0, "h2idle", hz * 5);
	}
	thr->td = NULL;
	wakeup(thr);
	lockmgr(&thr->lk, LK_RELEASE);
	/* thr structure can go invalid after this point */
}
218 
#if 0
/*
 * Record the cluster status bits for the PFS owning 'thr' and, when they
 * differ from the previously recorded value, print a one-line summary.
 *
 * Only the bits covered by HAMMER2_CLUSTER_ZFLAGS are considered.  Nothing
 * is printed or stored when the masked value is unchanged.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp;
	uint32_t zflags;

	pmp = thr->pmp;
	zflags = flags & HAMMER2_CLUSTER_ZFLAGS;
	if (zflags == pmp->cluster_flags)
		return;
	pmp->cluster_flags = zflags;

	kprintf("pfs %p", pmp);

	/* overall synchronization state */
	if (zflags & HAMMER2_CLUSTER_MSYNCED)
		kprintf(" masters-all-good");
	if (zflags & HAMMER2_CLUSTER_SSYNCED)
		kprintf(" slaves-all-good");

	/* hard (quorum) access, read-write takes precedence */
	if (zflags & HAMMER2_CLUSTER_WRHARD) {
		kprintf(" quorum/rw");
	} else if (zflags & HAMMER2_CLUSTER_RDHARD) {
		kprintf(" quorum/ro");
	}

	/* master visibility */
	if (zflags & HAMMER2_CLUSTER_UNHARD) {
		kprintf(" out-of-sync-masters");
	} else if (zflags & HAMMER2_CLUSTER_NOHARD) {
		kprintf(" no-masters-visible");
	}

	/* soft access, read-write takes precedence */
	if (zflags & HAMMER2_CLUSTER_WRSOFT) {
		kprintf(" soft/rw");
	} else if (zflags & HAMMER2_CLUSTER_RDSOFT) {
		kprintf(" soft/ro");
	}

	/* slave visibility */
	if (zflags & HAMMER2_CLUSTER_UNSOFT) {
		kprintf(" out-of-sync-slaves");
	} else if (zflags & HAMMER2_CLUSTER_NOSOFT) {
		kprintf(" no-slaves-visible");
	}
	kprintf("\n");
}
#endif
263 
#if 0
/*
 * Debugging aid: when (hammer2_debug & 1) is set, print one line per
 * cluster index showing the key of the chain in 'cparent' followed by
 * the key of the chain in 'cluster'.  A missing chain prints as NULL and
 * an element flagged HAMMER2_CITEM_INVALID is tagged "(I)".
 *
 * Both clusters must have the same number of chains.
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *pair[2];
	hammer2_chain_t *chain;
	int i;
	int j;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);

	pair[0] = cparent;
	pair[1] = cluster;

	for (i = 0; i < cparent->nchains; ++i) {
		/* the label already supplied the tab for the first row */
		if (i != 0)
			kprintf("\t");
		kprintf("%d ", i);

		/* parent column first, then the cluster column */
		for (j = 0; j < 2; ++j) {
			chain = pair[j]->array[i].chain;
			if (chain == NULL) {
				kprintf("      NULL      %s ", "   ");
				continue;
			}
			kprintf("%016jx%s ",
				chain->bref.key,
				((pair[j]->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		}
		kprintf("\n");
	}
}
#endif
304 
305 /*
306  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
307  * the inode.  This creates a multiplication effect since the XOP scan itself
308  * issues to all nodes.  However, this is the only way we can safely
309  * synchronize nodes which might have disparate I/O bandwidths and the only
310  * way we can safely deal with stalled nodes.
311  */
312 static
313 int
314 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
315 		    hammer2_deferred_list_t *list, int isroot)
316 {
317 	hammer2_xop_scanall_t *xop;
318 	hammer2_chain_t *parent;
319 	hammer2_chain_t *chain;
320 	hammer2_pfs_t *pmp;
321 	hammer2_key_t key_next;
322 	hammer2_tid_t sync_tid;
323 	int cache_index = -1;
324 	int needrescan;
325 	int want_update;
326 	int error;
327 	int nerror;
328 	int idx;
329 	int n;
330 
331 	pmp = ip->pmp;
332 	idx = thr->clindex;	/* cluster node we are responsible for */
333 	needrescan = 0;
334 	want_update = 0;
335 	sync_tid = 0;
336 	chain = NULL;
337 	parent = NULL;
338 
339 #if 0
340 	/*
341 	 * Nothing to do if all slaves are synchronized.
342 	 * Nothing to do if cluster not authoritatively readable.
343 	 */
344 	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
345 		return(0);
346 	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
347 		return(HAMMER2_ERROR_INCOMPLETE);
348 #endif
349 
350 	error = 0;
351 
352 	/*
353 	 * Resolve the root inode of the PFS and determine if synchronization
354 	 * is needed by checking modify_tid.
355 	 */
356 	{
357 		hammer2_xop_ipcluster_t *xop2;
358 		hammer2_chain_t *focus;
359 
360 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
361 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
362 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
363 					 idx);
364 		hammer2_inode_unlock(ip);
365 		error = hammer2_xop_collect(&xop2->head, 0);
366 		if (error == 0 && (focus = xop2->head.cluster.focus) != NULL) {
367 			sync_tid = focus->bref.modify_tid; /* XXX */
368 			chain = hammer2_inode_chain_and_parent(ip, idx,
369 						    &parent,
370 						    HAMMER2_RESOLVE_ALWAYS |
371 						    HAMMER2_RESOLVE_SHARED);
372 			want_update = (chain->bref.modify_tid != sync_tid);
373 			if (chain) {
374 				hammer2_chain_unlock(chain);
375 				hammer2_chain_drop(chain);
376 				chain = NULL;
377 			}
378 			if (parent) {
379 				hammer2_chain_unlock(parent);
380 				hammer2_chain_drop(parent);
381 				parent = NULL;
382 			}
383 		}
384 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
385 	}
386 
387 	if (want_update == 0)
388 		return(0);
389 
390 	/*
391 	 * The inode is left unlocked during the scan.  Issue a XOP
392 	 * that does *not* include our cluster index to iterate
393 	 * properly synchronized elements and resolve our cluster index
394 	 * against it.
395 	 */
396 	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
397 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
398 	xop->key_beg = HAMMER2_KEY_MIN;
399 	xop->key_end = HAMMER2_KEY_MAX;
400 	xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
401 			     HAMMER2_RESOLVE_ALWAYS;
402 	xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
403 			    HAMMER2_LOOKUP_NODIRECT |
404 			    HAMMER2_LOOKUP_ALWAYS;
405 	hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
406 	parent = hammer2_inode_chain(ip, idx,
407 				     HAMMER2_RESOLVE_ALWAYS |
408 				     HAMMER2_RESOLVE_SHARED);
409 	hammer2_inode_unlock(ip);
410 
411 	chain = hammer2_chain_lookup(&parent, &key_next,
412 				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
413 				     &cache_index,
414 				     HAMMER2_LOOKUP_SHARED |
415 				     HAMMER2_LOOKUP_NODIRECT |
416 				     HAMMER2_LOOKUP_NODATA);
417 	error = hammer2_xop_collect(&xop->head, 0);
418 	kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
419 		ip->meta.name_key, chain,
420 		(chain ? chain->bref.key : -1));
421 
422 	for (;;) {
423 		/*
424 		 * We are done if our scan is done and the XOP scan is done.
425 		 * We are done if the XOP scan failed (that is, we don't
426 		 * have authoritative data to synchronize with).
427 		 */
428 		int advance_local = 0;
429 		int advance_xop = 0;
430 		int dodefer = 0;
431 		hammer2_chain_t *focus;
432 
433 		if (chain == NULL && error == ENOENT)
434 			break;
435 		if (error && error != ENOENT)
436 			break;
437 
438 		/*
439 		 * Compare
440 		 */
441 		if (chain && error == ENOENT) {
442 			/*
443 			 * If we have local chains but the XOP scan is done,
444 			 * the chains need to be deleted.
445 			 */
446 			n = -1;
447 			focus = NULL;
448 		} else if (chain == NULL) {
449 			/*
450 			 * If our local scan is done but the XOP scan is not,
451 			 * we need to create the missing chain(s).
452 			 */
453 			n = 1;
454 			focus = xop->head.cluster.focus;
455 		} else {
456 			/*
457 			 * Otherwise compare to determine the action
458 			 * needed.
459 			 */
460 			focus = xop->head.cluster.focus;
461 			n = hammer2_chain_cmp(chain, focus);
462 		}
463 
464 		/*
465 		 * Take action based on comparison results.
466 		 */
467 		if (n < 0) {
468 			/*
469 			 * Delete extranious local data.  This will
470 			 * automatically advance the chain.
471 			 */
472 			nerror = hammer2_sync_destroy(thr, &parent, &chain,
473 						      0, idx);
474 		} else if (n == 0 && chain->bref.modify_tid !=
475 				     focus->bref.modify_tid) {
476 			/*
477 			 * Matching key but local data or meta-data requires
478 			 * updating.  If we will recurse, we still need to
479 			 * update to compatible content first but we do not
480 			 * synchronize modify_tid until the entire recursion
481 			 * has completed successfully.
482 			 *
483 			 * NOTE: Do not try to access hardlink pointers as if
484 			 *	 they were normal inodes, the inode cache will
485 			 *	 get seriously confused.
486 			 */
487 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
488 			    focus->data->ipdata.meta.type !=
489 			    HAMMER2_OBJTYPE_HARDLINK) {
490 				nerror = hammer2_sync_replace(
491 						thr, parent, chain,
492 						0,
493 						idx, focus, 0);
494 				dodefer = 1;
495 			} else {
496 				nerror = hammer2_sync_replace(
497 						thr, parent, chain,
498 						focus->bref.modify_tid,
499 						idx, focus, 0);
500 			}
501 			advance_local = 1;
502 			advance_xop = 1;
503 		} else if (n == 0) {
504 			/*
505 			 * 100% match, advance both
506 			 */
507 			advance_local = 1;
508 			advance_xop = 1;
509 			nerror = 0;
510 		} else if (n > 0) {
511 			/*
512 			 * Insert missing local data.
513 			 *
514 			 * If we will recurse, we still need to update to
515 			 * compatible content first but we do not synchronize
516 			 * modify_tid until the entire recursion has
517 			 * completed successfully.
518 			 *
519 			 * NOTE: Do not try to access hardlink pointers as if
520 			 *	 they were normal inodes, the inode cache will
521 			 *	 get seriously confused.
522 			 */
523 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE &&
524 			    focus->data->ipdata.meta.type !=
525 			    HAMMER2_OBJTYPE_HARDLINK) {
526 				nerror = hammer2_sync_insert(
527 						thr, &parent, &chain,
528 						0,
529 						idx, focus);
530 				dodefer = 2;
531 			} else {
532 				nerror = hammer2_sync_insert(
533 						thr, &parent, &chain,
534 						focus->bref.modify_tid,
535 						idx, focus);
536 			}
537 			advance_local = 1;
538 			advance_xop = 1;
539 		}
540 
541 		/*
542 		 * We cannot recurse depth-first because the XOP is still
543 		 * running in node threads for this scan.  Create a placemarker
544 		 * by obtaining and record the hammer2_inode.
545 		 *
546 		 * We excluded our node from the XOP so we must temporarily
547 		 * add it to xop->head.cluster so it is properly incorporated
548 		 * into the inode.
549 		 *
550 		 * The deferral is pushed onto a LIFO list for bottom-up
551 		 * synchronization.
552 		 */
553 		if (error == 0 && dodefer) {
554 			hammer2_inode_t *nip;
555 			hammer2_deferred_ip_t *defer;
556 
557 			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
558 
559 			defer = kmalloc(sizeof(*defer), M_HAMMER2,
560 					M_WAITOK | M_ZERO);
561 			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
562 			xop->head.cluster.array[idx].flags =
563 							HAMMER2_CITEM_INVALID;
564 			xop->head.cluster.array[idx].chain = chain;
565 			nip = hammer2_inode_get(pmp, ip,
566 						&xop->head.cluster, idx);
567 			xop->head.cluster.array[idx].chain = NULL;
568 
569 			hammer2_inode_ref(nip);
570 			hammer2_inode_unlock(nip);
571 
572 			defer->next = list->base;
573 			defer->ip = nip;
574 			list->base = defer;
575 			++list->count;
576 			needrescan = 1;
577 		}
578 
579 		/*
580 		 * If at least one deferral was added and the deferral
581 		 * list has grown too large, stop adding more.  This
582 		 * will trigger an EAGAIN return.
583 		 */
584 		if (needrescan && list->count > 1000)
585 			break;
586 
587 		/*
588 		 * Advancements for iteration.
589 		 */
590 		if (advance_xop) {
591 			error = hammer2_xop_collect(&xop->head, 0);
592 		}
593 		if (advance_local) {
594 			chain = hammer2_chain_next(&parent, chain, &key_next,
595 						   key_next, HAMMER2_KEY_MAX,
596 						   &cache_index,
597 						   HAMMER2_LOOKUP_SHARED |
598 						   HAMMER2_LOOKUP_NODIRECT |
599 						   HAMMER2_LOOKUP_NODATA);
600 		}
601 	}
602 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
603 	if (chain) {
604 		hammer2_chain_unlock(chain);
605 		hammer2_chain_drop(chain);
606 	}
607 	if (parent) {
608 		hammer2_chain_unlock(parent);
609 		hammer2_chain_drop(parent);
610 	}
611 
612 	/*
613 	 * If we added deferrals we want the caller to synchronize them
614 	 * and then call us again.
615 	 *
616 	 * NOTE: In this situation we do not yet want to synchronize our
617 	 *	 inode, setting the error code also has that effect.
618 	 */
619 	if ((error == 0 || error == ENOENT) && needrescan)
620 		error = EAGAIN;
621 
622 	/*
623 	 * If no error occurred we can synchronize the inode meta-data
624 	 * and modify_tid.  Only limited changes are made to PFSROOTs.
625 	 *
626 	 * XXX inode lock was lost
627 	 */
628 	if (error == 0 || error == ENOENT) {
629 		hammer2_xop_ipcluster_t *xop2;
630 		hammer2_chain_t *focus;
631 
632 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
633 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
634 		hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
635 					 idx);
636 		hammer2_inode_unlock(ip);
637 		error = hammer2_xop_collect(&xop2->head, 0);
638 		if (error == 0) {
639 			focus = xop2->head.cluster.focus;
640 			kprintf("syncthr: update inode %p (%s)\n",
641 				focus,
642 				(focus ?
643 				 (char *)focus->data->ipdata.filename : "?"));
644 			chain = hammer2_inode_chain_and_parent(ip, idx,
645 						    &parent,
646 						    HAMMER2_RESOLVE_ALWAYS |
647 						    HAMMER2_RESOLVE_SHARED);
648 
649 			KKASSERT(parent != NULL);
650 			nerror = hammer2_sync_replace(
651 					thr, parent, chain,
652 					sync_tid,
653 					idx, focus, isroot);
654 			hammer2_chain_unlock(chain);
655 			hammer2_chain_drop(chain);
656 			hammer2_chain_unlock(parent);
657 			hammer2_chain_drop(parent);
658 			/* XXX */
659 		}
660 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
661 	}
662 
663 	return error;
664 }
665 
666 /*
667  * Create a missing chain by copying the focus from another device.
668  *
669  * On entry *parentp and focus are both locked shared.  The chain will be
670  * created and returned in *chainp also locked shared.
671  */
672 static
673 int
674 hammer2_sync_insert(hammer2_thread_t *thr,
675 		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
676 		    hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
677 {
678 	hammer2_chain_t *chain;
679 	hammer2_key_t dummy;
680 	int cache_index = -1;
681 
682 #if HAMMER2_SYNCHRO_DEBUG
683 	if (hammer2_debug & 1)
684 	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
685 		*parentp,
686 		(*parentp)->bref.type,
687 		(*parentp)->bref.key,
688 		idx,
689 		focus->bref.type, focus->bref.key, mtid);
690 #endif
691 
692 	/*
693 	 * Parent requires an exclusive lock for the insertion.
694 	 * We must unlock the child to avoid deadlocks while
695 	 * relocking the parent.
696 	 */
697 	if (*chainp) {
698 		hammer2_chain_unlock(*chainp);
699 		hammer2_chain_drop(*chainp);
700 		*chainp = NULL;
701 	}
702 	hammer2_chain_unlock(*parentp);
703 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
704 
705 	/*
706 	 * We must reissue the lookup to properly position (*parentp)
707 	 * for the insertion.
708 	 */
709 	chain = hammer2_chain_lookup(parentp, &dummy,
710 				     focus->bref.key, focus->bref.key,
711 				     &cache_index,
712 				     HAMMER2_LOOKUP_NODIRECT |
713 				     HAMMER2_LOOKUP_ALWAYS);
714 	KKASSERT(chain == NULL);
715 
716 	chain = NULL;
717 	hammer2_chain_create(parentp, &chain, thr->pmp,
718 			     focus->bref.key, focus->bref.keybits,
719 			     focus->bref.type, focus->bytes,
720 			     mtid, 0, 0);
721 	hammer2_chain_modify(chain, mtid, 0, 0);
722 
723 	/*
724 	 * Copy focus to new chain
725 	 */
726 
727 	/* type already set */
728 	chain->bref.methods = focus->bref.methods;
729 	/* keybits already set */
730 	chain->bref.vradix = focus->bref.vradix;
731 	/* mirror_tid set by flush */
732 	KKASSERT(chain->bref.modify_tid == mtid);
733 	chain->bref.flags = focus->bref.flags;
734 	/* key already present */
735 	/* check code will be recalculated */
736 
737 	/*
738 	 * Copy data body.
739 	 */
740 	switch(chain->bref.type) {
741 	case HAMMER2_BREF_TYPE_INODE:
742 		if ((focus->data->ipdata.meta.op_flags &
743 		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
744 			/* do not copy block table */
745 			bcopy(focus->data, chain->data,
746 			      offsetof(hammer2_inode_data_t, u));
747 			break;
748 		}
749 		/* fall through copy whole thing */
750 	case HAMMER2_BREF_TYPE_DATA:
751 		bcopy(focus->data, chain->data, chain->bytes);
752 		hammer2_chain_setcheck(chain, chain->data);
753 		break;
754 	default:
755 		KKASSERT(0);
756 		break;
757 	}
758 
759 	hammer2_chain_unlock(chain);		/* unlock, leave ref */
760 	*chainp = chain;			/* will be returned locked */
761 
762 	/*
763 	 * Avoid ordering deadlock when relocking shared.
764 	 */
765 	hammer2_chain_unlock(*parentp);
766 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
767 				     HAMMER2_RESOLVE_ALWAYS);
768 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
769 				  HAMMER2_RESOLVE_ALWAYS);
770 
771 	return 0;
772 }
773 
774 /*
775  * Destroy an extranious chain.
776  *
777  * Both *parentp and *chainp are locked shared.
778  *
779  * On return, *chainp will be adjusted to point to the next element in the
780  * iteration and locked shared.
781  */
782 static
783 int
784 hammer2_sync_destroy(hammer2_thread_t *thr,
785 		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
786 		     hammer2_tid_t mtid, int idx)
787 {
788 	hammer2_chain_t *chain;
789 	hammer2_chain_t *parent;
790 	hammer2_key_t key_next;
791 	hammer2_key_t save_key;
792 	int cache_index = -1;
793 
794 	chain = *chainp;
795 
796 #if HAMMER2_SYNCHRO_DEBUG
797 	if (hammer2_debug & 1)
798 	kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
799 		*parentp, chain,
800 		idx, chain->bref.type, chain->bref.key);
801 #endif
802 
803 	save_key = chain->bref.key;
804 	if (save_key != HAMMER2_KEY_MAX)
805 		++save_key;
806 
807 	/*
808 	 * Try to avoid unnecessary I/O.
809 	 *
810 	 * XXX accounting not propagated up properly.  We might have to do
811 	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
812 	 */
813 	hammer2_chain_unlock(chain);	/* relock exclusive */
814 	hammer2_chain_unlock(*parentp);
815 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
816 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
817 
818 	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
819 	hammer2_chain_unlock(chain);
820 	hammer2_chain_drop(chain);
821 	chain = NULL;			/* safety */
822 
823 	hammer2_chain_unlock(*parentp);	/* relock shared */
824 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
825 				     HAMMER2_RESOLVE_ALWAYS);
826 	*chainp = hammer2_chain_lookup(&parent, &key_next,
827 				     save_key, HAMMER2_KEY_MAX,
828 				     &cache_index,
829 				     HAMMER2_LOOKUP_SHARED |
830 				     HAMMER2_LOOKUP_NODIRECT |
831 				     HAMMER2_LOOKUP_NODATA);
832 	return 0;
833 }
834 
835 /*
836  * cparent is locked exclusively, with an extra ref, cluster is not locked.
837  * Replace element [i] in the cluster.
838  */
839 static
840 int
841 hammer2_sync_replace(hammer2_thread_t *thr,
842 		     hammer2_chain_t *parent, hammer2_chain_t *chain,
843 		     hammer2_tid_t mtid, int idx,
844 		     hammer2_chain_t *focus, int isroot)
845 {
846 	int nradix;
847 	uint8_t otype;
848 
849 #if HAMMER2_SYNCHRO_DEBUG
850 	if (hammer2_debug & 1)
851 	kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
852 		chain,
853 		idx,
854 		focus->bref.type, focus->bref.key, mtid);
855 #endif
856 	hammer2_chain_unlock(chain);
857 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
858 	if (chain->bytes != focus->bytes) {
859 		/* XXX what if compressed? */
860 		nradix = hammer2_getradix(chain->bytes);
861 		hammer2_chain_resize(NULL, parent, chain,
862 				     mtid, 0,
863 				     nradix, 0);
864 	}
865 	hammer2_chain_modify(chain, mtid, 0, 0);
866 	otype = chain->bref.type;
867 	chain->bref.type = focus->bref.type;
868 	chain->bref.methods = focus->bref.methods;
869 	chain->bref.keybits = focus->bref.keybits;
870 	chain->bref.vradix = focus->bref.vradix;
871 	/* mirror_tid updated by flush */
872 	KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
873 	chain->bref.flags = focus->bref.flags;
874 	/* key already present */
875 	/* check code will be recalculated */
876 	chain->error = 0;
877 
878 	/*
879 	 * Copy data body.
880 	 */
881 	switch(chain->bref.type) {
882 	case HAMMER2_BREF_TYPE_INODE:
883 		/*
884 		 * Special case PFSROOTs, only limited changes can be made
885 		 * since the meta-data contains miscellanious distinguishing
886 		 * fields.
887 		 */
888 		if (isroot) {
889 			chain->data->ipdata.meta.uflags =
890 				focus->data->ipdata.meta.uflags;
891 			chain->data->ipdata.meta.rmajor =
892 				focus->data->ipdata.meta.rmajor;
893 			chain->data->ipdata.meta.rminor =
894 				focus->data->ipdata.meta.rminor;
895 			chain->data->ipdata.meta.ctime =
896 				focus->data->ipdata.meta.ctime;
897 			chain->data->ipdata.meta.mtime =
898 				focus->data->ipdata.meta.mtime;
899 			chain->data->ipdata.meta.atime =
900 				focus->data->ipdata.meta.atime;
901 			/* not btime */
902 			chain->data->ipdata.meta.uid =
903 				focus->data->ipdata.meta.uid;
904 			chain->data->ipdata.meta.gid =
905 				focus->data->ipdata.meta.gid;
906 			chain->data->ipdata.meta.mode =
907 				focus->data->ipdata.meta.mode;
908 			chain->data->ipdata.meta.ncopies =
909 				focus->data->ipdata.meta.ncopies;
910 			chain->data->ipdata.meta.comp_algo =
911 				focus->data->ipdata.meta.comp_algo;
912 			chain->data->ipdata.meta.check_algo =
913 				focus->data->ipdata.meta.check_algo;
914 			chain->data->ipdata.meta.data_quota =
915 				focus->data->ipdata.meta.data_quota;
916 			chain->data->ipdata.meta.inode_quota =
917 				focus->data->ipdata.meta.inode_quota;
918 			chain->data->ipdata.meta.attr_tid =
919 				focus->data->ipdata.meta.attr_tid;
920 			chain->data->ipdata.meta.dirent_tid =
921 				focus->data->ipdata.meta.dirent_tid;
922 			hammer2_chain_setcheck(chain, chain->data);
923 			break;
924 		}
925 
926 		/*
927 		 * Normal replacement.
928 		 */
929 		if ((focus->data->ipdata.meta.op_flags &
930 		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
931 			/*
932 			 * If DIRECTDATA is transitioning to 0 or the old
933 			 * chain is not an inode we have to initialize
934 			 * the block table.
935 			 */
936 			if (otype != HAMMER2_BREF_TYPE_INODE ||
937 			    (chain->data->ipdata.meta.op_flags &
938 			     HAMMER2_OPFLAG_DIRECTDATA)) {
939 				kprintf("chain inode trans away from dd\n");
940 				bzero(&chain->data->ipdata.u,
941 				      sizeof(chain->data->ipdata.u));
942 			}
943 			bcopy(focus->data, chain->data,
944 			      offsetof(hammer2_inode_data_t, u));
945 			/* XXX setcheck on inode should not be needed */
946 			hammer2_chain_setcheck(chain, chain->data);
947 			break;
948 		}
949 		/* fall through */
950 	case HAMMER2_BREF_TYPE_DATA:
951 		bcopy(focus->data, chain->data, chain->bytes);
952 		hammer2_chain_setcheck(chain, chain->data);
953 		break;
954 	default:
955 		KKASSERT(0);
956 		break;
957 	}
958 
959 	hammer2_chain_unlock(chain);
960 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
961 				  HAMMER2_RESOLVE_MAYBE);
962 
963 	return 0;
964 }
965