xref: /dragonfly/sys/vfs/hammer2/hammer2_synchro.c (revision 9ef1e017)
1 /*
2  * Copyright (c) 2015-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51 
52 typedef struct hammer2_deferred_ip {
53 	struct hammer2_deferred_ip *next;
54 	hammer2_inode_t	*ip;
55 } hammer2_deferred_ip_t;
56 
57 typedef struct hammer2_deferred_list {
58 	hammer2_deferred_ip_t	*base;
59 	int			count;
60 } hammer2_deferred_list_t;
61 
62 
63 #define HAMMER2_SYNCHRO_DEBUG 1
64 
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66 				hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69 				nerror = hammer2_sync_insert(
70 						thr, &parent, &chain,
71 						focus->bref.modify_tid,
72 						idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76 			hammer2_tid_t modify_tid, int idx,
77 			hammer2_xop_head_t *xop, hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79 			hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80 			hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82 			hammer2_chain_t *parent, hammer2_chain_t *chain,
83 			hammer2_tid_t mtid, int idx,
84 			hammer2_xop_head_t *xop, hammer2_chain_t *focus,
85 			int isroot);
86 
87 /****************************************************************************
88  *			    HAMMER2 SYNC THREADS 			    *
89  ****************************************************************************/
90 /*
91  * Primary management thread for an element of a node.  A thread will exist
92  * for each element requiring management.
93  *
94  * No management threads are needed for the SPMP or for any PMP with only
95  * a single MASTER.
96  *
97  * On the SPMP - handles bulkfree and dedup operations
98  * On a PFS    - handles remastering and synchronization
99  */
100 void
101 hammer2_primary_sync_thread(void *arg)
102 {
103 	hammer2_thread_t *thr = arg;
104 	hammer2_pfs_t *pmp;
105 	hammer2_deferred_list_t list;
106 	hammer2_deferred_ip_t *defer;
107 	int error;
108 	uint32_t flags;
109 	uint32_t nflags;
110 
111 	pmp = thr->pmp;
112 	bzero(&list, sizeof(list));
113 
114 	for (;;) {
115 		flags = thr->flags;
116 		cpu_ccfence();
117 
118 		/*
119 		 * Handle stop request
120 		 */
121 		if (flags & HAMMER2_THREAD_STOP)
122 			break;
123 
124 		/*
125 		 * Handle freeze request
126 		 */
127 		if (flags & HAMMER2_THREAD_FREEZE) {
128 			nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
129 					    HAMMER2_THREAD_WAITING)) |
130 				 HAMMER2_THREAD_FROZEN;
131 			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
132 				continue;
133 			if (flags & HAMMER2_THREAD_WAITING)
134 				wakeup(&thr->flags);
135 			continue;
136 		}
137 
138 		if (flags & HAMMER2_THREAD_UNFREEZE) {
139 			nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
140 					   HAMMER2_THREAD_FROZEN |
141 					   HAMMER2_THREAD_WAITING);
142 			if (!atomic_cmpset_int(&thr->flags, flags, nflags))
143 				continue;
144 			if (flags & HAMMER2_THREAD_WAITING)
145 				wakeup(&thr->flags);
146 			continue;
147 		}
148 
149 		/*
150 		 * Force idle if frozen until unfrozen or stopped.
151 		 */
152 		if (flags & HAMMER2_THREAD_FROZEN) {
153 			nflags = flags | HAMMER2_THREAD_WAITING;
154 
155 			tsleep_interlock(&thr->flags, 0);
156 			if (atomic_cmpset_int(&thr->flags, flags, nflags))
157 				tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
158 			continue;
159 		}
160 
161 		/*
162 		 * Reset state on REMASTER request
163 		 */
164 		if (thr->flags & HAMMER2_THREAD_REMASTER) {
165 			nflags = flags & ~HAMMER2_THREAD_REMASTER;
166 			if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
167 				/* reset state here */
168 			}
169 			continue;
170 		}
171 
172 		/*
173 		 * Synchronization scan.
174 		 */
175 		if (hammer2_debug & 0x8000)
176 			kprintf("sync_slaves pfs %s clindex %d\n",
177 				pmp->pfs_names[thr->clindex], thr->clindex);
178 		hammer2_trans_init(pmp, 0);
179 
180 		hammer2_inode_ref(pmp->iroot);
181 
182 		for (;;) {
183 			int didbreak = 0;
184 			/* XXX lock synchronize pmp->modify_tid */
185 			error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
186 			if (hammer2_debug & 0x8000) {
187 				kprintf("sync_slaves error %d defer %p\n",
188 					error, list.base);
189 			}
190 			if (error != HAMMER2_ERROR_EAGAIN)
191 				break;
192 			while ((defer = list.base) != NULL) {
193 				hammer2_inode_t *nip;
194 
195 				nip = defer->ip;
196 				error = hammer2_sync_slaves(thr, nip, &list,
197 							(nip == pmp->iroot));
198 				if (error &&
199 				    error != HAMMER2_ERROR_EAGAIN &&
200 				    error != HAMMER2_ERROR_ENOENT) {
201 					break;
202 				}
203 				if (hammer2_thr_break(thr)) {
204 					didbreak = 1;
205 					break;
206 				}
207 
208 				/*
209 				 * If no additional defers occurred we can
210 				 * remove this one, otherwise keep it on
211 				 * the list and retry once the additional
212 				 * defers have completed.
213 				 */
214 				if (defer == list.base) {
215 					--list.count;
216 					list.base = defer->next;
217 					kfree(defer, M_HAMMER2);
218 					defer = NULL;	/* safety */
219 					hammer2_inode_drop(nip);
220 				}
221 			}
222 
223 			/*
224 			 * If the thread is being remastered, frozen, or
225 			 * stopped, clean up any left-over deferals.
226 			 */
227 			if (didbreak ||
228 			    (error && error != HAMMER2_ERROR_EAGAIN)) {
229 				kprintf("didbreak\n");
230 				while ((defer = list.base) != NULL) {
231 					--list.count;
232 					hammer2_inode_drop(defer->ip);
233 					list.base = defer->next;
234 					kfree(defer, M_HAMMER2);
235 				}
236 				if (error == 0 || error == HAMMER2_ERROR_EAGAIN)
237 					error = HAMMER2_ERROR_EINPROGRESS;
238 				break;
239 			}
240 		}
241 
242 		hammer2_inode_drop(pmp->iroot);
243 		hammer2_trans_done(pmp, 0);
244 
245 		if (error && error != HAMMER2_ERROR_EINPROGRESS)
246 			kprintf("hammer2_sync_slaves: error %d\n", error);
247 
248 		/*
249 		 * Wait for event, or 5-second poll.
250 		 */
251 		nflags = flags | HAMMER2_THREAD_WAITING;
252 		tsleep_interlock(&thr->flags, 0);
253 		if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
254 			tsleep(&thr->flags, 0, "h2idle", hz * 5);
255 		}
256 	}
257 	thr->td = NULL;
258 	hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
259 	/* thr structure can go invalid after this point */
260 }
261 
#if 0
/*
 * Record the cluster status bits (masked by HAMMER2_CLUSTER_ZFLAGS) in
 * the PFS and, when they differ from the previously recorded value,
 * print a one-line human readable summary of the new state.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pfs;
	uint32_t zflags;

	pfs = thr->pmp;
	zflags = flags & HAMMER2_CLUSTER_ZFLAGS;

	if (zflags != pfs->cluster_flags) {
		pfs->cluster_flags = zflags;

		kprintf("pfs %p", pfs);

		/* synchronization summary */
		if (zflags & HAMMER2_CLUSTER_MSYNCED) {
			kprintf(" masters-all-good");
		}
		if (zflags & HAMMER2_CLUSTER_SSYNCED) {
			kprintf(" slaves-all-good");
		}

		/* hard (quorum) state, rw takes precedence over ro */
		if (zflags & HAMMER2_CLUSTER_WRHARD) {
			kprintf(" quorum/rw");
		} else if (zflags & HAMMER2_CLUSTER_RDHARD) {
			kprintf(" quorum/ro");
		}

		if (zflags & HAMMER2_CLUSTER_UNHARD) {
			kprintf(" out-of-sync-masters");
		} else if (zflags & HAMMER2_CLUSTER_NOHARD) {
			kprintf(" no-masters-visible");
		}

		/* soft state, rw takes precedence over ro */
		if (zflags & HAMMER2_CLUSTER_WRSOFT) {
			kprintf(" soft/rw");
		} else if (zflags & HAMMER2_CLUSTER_RDSOFT) {
			kprintf(" soft/ro");
		}

		if (zflags & HAMMER2_CLUSTER_UNSOFT) {
			kprintf(" out-of-sync-slaves");
		} else if (zflags & HAMMER2_CLUSTER_NOSOFT) {
			kprintf(" no-slaves-visible");
		}
		kprintf("\n");
	}
}
#endif
306 
#if 0
/*
 * Debugging aid: when bit 0 of hammer2_debug is set, print one line per
 * cluster index showing the key of the chain at that index in cparent
 * and then in cluster.  Missing chains print as NULL and items flagged
 * HAMMER2_CITEM_INVALID are tagged with "(I)".
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *pair[2];
	hammer2_chain_t *chain;
	int which;
	int i;

	if (!(hammer2_debug & 1))
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);

	pair[0] = cparent;
	pair[1] = cluster;

	for (i = 0; i < cparent->nchains; ++i) {
		if (i != 0)
			kprintf("\t");
		kprintf("%d ", i);

		/* same column layout for the parent and then the cluster */
		for (which = 0; which < 2; ++which) {
			chain = pair[which]->array[i].chain;
			if (chain == NULL) {
				kprintf("      NULL      %s ", "   ");
				continue;
			}
			kprintf("%016jx%s ",
				chain->bref.key,
				((pair[which]->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		}
		kprintf("\n");
	}
}
#endif
347 
348 /*
349  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
350  * the inode.  This creates a multiplication effect since the XOP scan itself
351  * issues to all nodes.  However, this is the only way we can safely
352  * synchronize nodes which might have disparate I/O bandwidths and the only
353  * way we can safely deal with stalled nodes.
354  *
355  * XXX serror / merror rollup and handling.
356  */
357 static
358 int
359 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
360 		    hammer2_deferred_list_t *list, int isroot)
361 {
362 	hammer2_xop_scanall_t *xop;
363 	hammer2_chain_t *parent;
364 	hammer2_chain_t *chain;
365 	hammer2_pfs_t *pmp;
366 	hammer2_key_t key_next;
367 	hammer2_tid_t sync_tid;
368 	int needrescan;
369 	int want_update;
370 	int serror;		/* slave error */
371 	int merror;		/* master error (from xop_collect) */
372 	int nerror;		/* temporary error */
373 	int idx;
374 	int n;
375 
376 	pmp = ip->pmp;
377 	idx = thr->clindex;	/* cluster node we are responsible for */
378 	needrescan = 0;
379 	want_update = 0;
380 	sync_tid = 0;
381 	chain = NULL;
382 	parent = NULL;
383 
384 #if 0
385 	/*
386 	 * Nothing to do if all slaves are synchronized.
387 	 * Nothing to do if cluster not authoritatively readable.
388 	 */
389 	if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
390 		return(0);
391 	if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
392 		return(HAMMER2_ERROR_INCOMPLETE);
393 #endif
394 
395 	merror = 0;
396 
397 	/*
398 	 * Resolve the root inode of the PFS and determine if synchronization
399 	 * is needed by checking modify_tid.
400 	 *
401 	 * Retain the synchronization TID from the focus inode and use it
402 	 * later to synchronize the focus inode if/when the recursion
403 	 * succeeds.
404 	 */
405 	{
406 		hammer2_xop_ipcluster_t *xop2;
407 		hammer2_chain_t *focus;
408 
409 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
410 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
411 		hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
412 					 idx);
413 		hammer2_inode_unlock(ip);
414 		merror = hammer2_xop_collect(&xop2->head, 0);
415 		if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
416 			sync_tid = focus->bref.modify_tid;
417 			chain = hammer2_inode_chain_and_parent(ip, idx,
418 						    &parent,
419 						    HAMMER2_RESOLVE_ALWAYS |
420 						    HAMMER2_RESOLVE_SHARED);
421 			want_update = (chain->bref.modify_tid != sync_tid);
422 			if (chain) {
423 				hammer2_chain_unlock(chain);
424 				hammer2_chain_drop(chain);
425 				chain = NULL;
426 			}
427 			if (parent) {
428 				hammer2_chain_unlock(parent);
429 				hammer2_chain_drop(parent);
430 				parent = NULL;
431 			}
432 		}
433 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
434 	}
435 
436 	if (want_update == 0)
437 		return(0);
438 
439 	/*
440 	 * The inode is left unlocked during the scan.  Issue a XOP
441 	 * that does *not* include our cluster index to iterate
442 	 * properly synchronized elements and resolve our cluster index
443 	 * against it.
444 	 */
445 	hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
446 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
447 	xop->key_beg = HAMMER2_KEY_MIN;
448 	xop->key_end = HAMMER2_KEY_MAX;
449 	xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
450 			     HAMMER2_RESOLVE_ALWAYS;
451 	xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
452 			    HAMMER2_LOOKUP_NODIRECT |
453 			    HAMMER2_LOOKUP_ALWAYS;
454 	hammer2_xop_start_except(&xop->head, &hammer2_scanall_desc, idx);
455 	parent = hammer2_inode_chain(ip, idx,
456 				     HAMMER2_RESOLVE_ALWAYS |
457 				     HAMMER2_RESOLVE_SHARED);
458 	hammer2_inode_unlock(ip);
459 
460 	chain = hammer2_chain_lookup(&parent, &key_next,
461 				     HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
462 				     &serror,
463 				     HAMMER2_LOOKUP_SHARED |
464 				     HAMMER2_LOOKUP_NODIRECT |
465 				     HAMMER2_LOOKUP_NODATA);
466 	merror = hammer2_xop_collect(&xop->head, 0);
467 	if (hammer2_debug & 0x8000) {
468 		kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
469 			ip->meta.name_key, chain,
470 			(chain ? chain->bref.key : -1));
471 	}
472 
473 	for (;;) {
474 		/*
475 		 * We are done if our scan is done and the XOP scan is done.
476 		 * We are done if the XOP scan failed (that is, we don't
477 		 * have authoritative data to synchronize with).
478 		 */
479 		int advance_local = 0;
480 		int advance_xop = 0;
481 		int dodefer = 0;
482 		hammer2_chain_t *focus;
483 
484 		if (chain == NULL && merror == HAMMER2_ERROR_ENOENT)
485 			break;
486 		if (merror && merror != HAMMER2_ERROR_ENOENT)
487 			break;
488 
489 		/*
490 		 * Compare
491 		 */
492 		if (chain && merror == HAMMER2_ERROR_ENOENT) {
493 			/*
494 			 * If we have local chains but the XOP scan is done,
495 			 * the chains need to be deleted.
496 			 */
497 			n = -1;
498 			focus = NULL;
499 		} else if (chain == NULL) {
500 			/*
501 			 * If our local scan is done but the XOP scan is not,
502 			 * we need to create the missing chain(s).
503 			 */
504 			n = 1;
505 			focus = xop->head.cluster.focus;
506 		} else {
507 			/*
508 			 * Otherwise compare to determine the action
509 			 * needed.
510 			 */
511 			focus = xop->head.cluster.focus;
512 			n = hammer2_chain_cmp(chain, focus);
513 		}
514 
515 		/*
516 		 * Take action based on comparison results.
517 		 */
518 		if (n < 0) {
519 			/*
520 			 * Delete extranious local data.  This will
521 			 * automatically advance the chain.
522 			 */
523 			nerror = hammer2_sync_destroy(thr, &parent, &chain,
524 						      0, idx);
525 		} else if (n == 0 && chain->bref.modify_tid !=
526 				     focus->bref.modify_tid) {
527 			/*
528 			 * Matching key but local data or meta-data requires
529 			 * updating.  If we will recurse, we still need to
530 			 * update to compatible content first but we do not
531 			 * synchronize modify_tid until the entire recursion
532 			 * has completed successfully.
533 			 */
534 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
535 				nerror = hammer2_sync_replace(
536 						thr, parent, chain,
537 						0,
538 						idx, &xop->head, focus, 0);
539 				dodefer = 1;
540 			} else {
541 				nerror = hammer2_sync_replace(
542 						thr, parent, chain,
543 						focus->bref.modify_tid,
544 						idx, &xop->head, focus, 0);
545 			}
546 			advance_local = 1;
547 			advance_xop = 1;
548 		} else if (n == 0) {
549 			/*
550 			 * 100% match, advance both
551 			 */
552 			advance_local = 1;
553 			advance_xop = 1;
554 			nerror = 0;
555 		} else if (n > 0) {
556 			/*
557 			 * Insert missing local data.
558 			 *
559 			 * If we will recurse, we still need to update to
560 			 * compatible content first but we do not synchronize
561 			 * modify_tid until the entire recursion has
562 			 * completed successfully.
563 			 */
564 			if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
565 				nerror = hammer2_sync_insert(
566 						thr, &parent, &chain,
567 						0,
568 						idx, &xop->head, focus);
569 				dodefer = 2;
570 			} else {
571 				nerror = hammer2_sync_insert(
572 						thr, &parent, &chain,
573 						focus->bref.modify_tid,
574 						idx, &xop->head, focus);
575 			}
576 			advance_local = 1;
577 			advance_xop = 1;
578 		}
579 
580 		/*
581 		 * We cannot recurse depth-first because the XOP is still
582 		 * running in node threads for this scan.  Create a placemarker
583 		 * by obtaining and record the hammer2_inode.
584 		 *
585 		 * We excluded our node from the XOP so we must temporarily
586 		 * add it to xop->head.cluster so it is properly incorporated
587 		 * into the inode.
588 		 *
589 		 * The deferral is pushed onto a LIFO list for bottom-up
590 		 * synchronization.
591 		 */
592 		if (merror == 0 && dodefer) {
593 			hammer2_inode_t *nip;
594 			hammer2_deferred_ip_t *defer;
595 
596 			KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
597 
598 			defer = kmalloc(sizeof(*defer), M_HAMMER2,
599 					M_WAITOK | M_ZERO);
600 			KKASSERT(xop->head.cluster.array[idx].chain == NULL);
601 			xop->head.cluster.array[idx].flags =
602 							HAMMER2_CITEM_INVALID;
603 			xop->head.cluster.array[idx].chain = chain;
604 			nip = hammer2_inode_get(pmp, &xop->head, -1, idx);
605 			xop->head.cluster.array[idx].chain = NULL;
606 
607 			hammer2_inode_ref(nip);
608 			hammer2_inode_unlock(nip);
609 
610 			defer->next = list->base;
611 			defer->ip = nip;
612 			list->base = defer;
613 			++list->count;
614 			needrescan = 1;
615 		}
616 
617 		/*
618 		 * If at least one deferral was added and the deferral
619 		 * list has grown too large, stop adding more.  This
620 		 * will trigger an HAMMER2_ERROR_EAGAIN return.
621 		 */
622 		if (needrescan && list->count > 1000)
623 			break;
624 
625 		/*
626 		 * Advancements for iteration.
627 		 */
628 		if (advance_xop) {
629 			merror = hammer2_xop_collect(&xop->head, 0);
630 		}
631 		if (advance_local) {
632 			chain = hammer2_chain_next(&parent, chain, &key_next,
633 						   key_next, HAMMER2_KEY_MAX,
634 						   &serror,
635 						   HAMMER2_LOOKUP_SHARED |
636 						   HAMMER2_LOOKUP_NODIRECT |
637 						   HAMMER2_LOOKUP_NODATA);
638 		}
639 	}
640 	hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
641 	if (chain) {
642 		hammer2_chain_unlock(chain);
643 		hammer2_chain_drop(chain);
644 	}
645 	if (parent) {
646 		hammer2_chain_unlock(parent);
647 		hammer2_chain_drop(parent);
648 	}
649 
650 	/*
651 	 * If we added deferrals we want the caller to synchronize them
652 	 * and then call us again.
653 	 *
654 	 * NOTE: In this situation we do not yet want to synchronize our
655 	 *	 inode, setting the error code also has that effect.
656 	 */
657 	if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan)
658 		merror = HAMMER2_ERROR_EAGAIN;
659 
660 	/*
661 	 * If no error occurred we can synchronize the inode meta-data
662 	 * and modify_tid.  Only limited changes are made to PFSROOTs.
663 	 *
664 	 * XXX inode lock was lost
665 	 */
666 	if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) {
667 		hammer2_xop_ipcluster_t *xop2;
668 		hammer2_chain_t *focus;
669 
670 		hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
671 		xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
672 		hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
673 					 idx);
674 		hammer2_inode_unlock(ip);
675 		merror = hammer2_xop_collect(&xop2->head, 0);
676 		if (merror == 0) {
677 			focus = xop2->head.cluster.focus;
678 			if ((hammer2_debug & 0x8000) && focus) {
679 				const char *filename;
680 
681 				filename = hammer2_xop_gdata(&xop2->head)->
682 						ipdata.filename;
683 				kprintf("syncthr: update inode %p (%s)\n",
684 					focus, filename);
685 				hammer2_xop_pdata(&xop2->head);
686 			}
687 			chain = hammer2_inode_chain_and_parent(ip, idx,
688 						    &parent,
689 						    HAMMER2_RESOLVE_ALWAYS |
690 						    HAMMER2_RESOLVE_SHARED);
691 
692 			KKASSERT(parent != NULL);
693 			nerror = hammer2_sync_replace(
694 					thr, parent, chain,
695 					sync_tid,
696 					idx, &xop2->head, focus, isroot);
697 			hammer2_chain_unlock(chain);
698 			hammer2_chain_drop(chain);
699 			hammer2_chain_unlock(parent);
700 			hammer2_chain_drop(parent);
701 			/* XXX */
702 		}
703 		hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
704 	}
705 
706 	return merror;
707 }
708 
709 /*
710  * Create a missing chain by copying the focus from another device.
711  *
712  * On entry *parentp and focus are both locked shared.  The chain will be
713  * created and returned in *chainp also locked shared.
714  */
715 static
716 int
717 hammer2_sync_insert(hammer2_thread_t *thr,
718 		    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
719 		    hammer2_tid_t mtid, int idx, hammer2_xop_head_t *xop,
720 		    hammer2_chain_t *focus)
721 {
722 	hammer2_chain_t *chain;
723 	hammer2_key_t dummy;
724 	int error;
725 
726 #if HAMMER2_SYNCHRO_DEBUG
727 	if (hammer2_debug & 1)
728 	kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
729 		*parentp,
730 		(*parentp)->bref.type,
731 		(*parentp)->bref.key,
732 		idx,
733 		focus->bref.type, focus->bref.key, mtid);
734 #endif
735 
736 	/*
737 	 * Parent requires an exclusive lock for the insertion.
738 	 * We must unlock the child to avoid deadlocks while
739 	 * relocking the parent.
740 	 */
741 	if (*chainp) {
742 		hammer2_chain_unlock(*chainp);
743 		hammer2_chain_drop(*chainp);
744 		*chainp = NULL;
745 	}
746 	hammer2_chain_unlock(*parentp);
747 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
748 
749 	/*
750 	 * We must reissue the lookup to properly position (*parentp)
751 	 * for the insertion.
752 	 */
753 	chain = hammer2_chain_lookup(parentp, &dummy,
754 				     focus->bref.key, focus->bref.key,
755 				     &error,
756 				     HAMMER2_LOOKUP_NODIRECT |
757 				     HAMMER2_LOOKUP_ALWAYS);
758 	KKASSERT(chain == NULL);
759 
760 	chain = NULL;
761 	error = hammer2_chain_create(parentp, &chain, NULL, thr->pmp,
762 				     focus->bref.methods,
763 				     focus->bref.key, focus->bref.keybits,
764 				     focus->bref.type, focus->bytes,
765 				     mtid, 0, 0);
766 	if (error == 0) {
767 		const hammer2_media_data_t *data;
768 
769 		error = hammer2_chain_modify(chain, mtid, 0, 0);
770 		if (error)
771 			goto failed;
772 
773 		/*
774 		 * Copy focus to new chain
775 		 */
776 
777 		/* type already set */
778 		chain->bref.methods = focus->bref.methods;
779 		/* keybits already set */
780 		chain->bref.vradix = focus->bref.vradix;
781 		/* mirror_tid set by flush */
782 		KKASSERT(chain->bref.modify_tid == mtid);
783 		chain->bref.flags = focus->bref.flags;
784 		/* key already present */
785 		/* check code will be recalculated */
786 
787 		/*
788 		 * Copy data body.
789 		 */
790 		switch(chain->bref.type) {
791 		case HAMMER2_BREF_TYPE_INODE:
792 			data = hammer2_xop_gdata(xop);
793 
794 			if ((data->ipdata.meta.op_flags &
795 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
796 				/* do not copy block table */
797 				bcopy(data, chain->data,
798 				      offsetof(hammer2_inode_data_t, u));
799 				hammer2_xop_pdata(xop);
800 				break;
801 			}
802 			hammer2_xop_pdata(xop);
803 			/* fall through copy whole thing */
804 		case HAMMER2_BREF_TYPE_DATA:
805 			data = hammer2_xop_gdata(xop);
806 			bcopy(data, chain->data, chain->bytes);
807 			hammer2_chain_setcheck(chain, chain->data);
808 			hammer2_xop_pdata(xop);
809 			break;
810 		case HAMMER2_BREF_TYPE_DIRENT:
811 			/*
812 			 * Directory entries embed data in the blockref.
813 			 */
814 			if (chain->bytes) {
815 				data = hammer2_xop_gdata(xop);
816 				bcopy(data, chain->data, chain->bytes);
817 				hammer2_chain_setcheck(chain, chain->data);
818 				hammer2_xop_pdata(xop);
819 			} else {
820 				chain->bref.check = focus->bref.check;
821 			}
822 			chain->bref.embed = focus->bref.embed;
823 			break;
824 		default:
825 			KKASSERT(0);
826 			break;
827 		}
828 	}
829 
830 failed:
831 	if (chain)
832 		hammer2_chain_unlock(chain);	/* unlock, leave ref */
833 	*chainp = chain;			/* will be returned locked */
834 
835 	/*
836 	 * Avoid an ordering deadlock when relocking shared.
837 	 */
838 	hammer2_chain_unlock(*parentp);
839 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
840 				     HAMMER2_RESOLVE_ALWAYS);
841 	if (chain) {
842 		hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
843 					  HAMMER2_RESOLVE_ALWAYS);
844 		error = chain->error;
845 	}
846 
847 	return error;
848 }
849 
850 /*
851  * Destroy an extranious chain.
852  *
853  * Both *parentp and *chainp are locked shared.
854  *
855  * On return, *chainp will be adjusted to point to the next element in the
856  * iteration and locked shared.
857  */
858 static
859 int
860 hammer2_sync_destroy(hammer2_thread_t *thr,
861 		     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
862 		     hammer2_tid_t mtid, int idx)
863 {
864 	hammer2_chain_t *chain;
865 	hammer2_key_t key_next;
866 	hammer2_key_t save_key;
867 	int error;
868 
869 	chain = *chainp;
870 
871 #if HAMMER2_SYNCHRO_DEBUG
872 	if (hammer2_debug & 1)
873 	kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
874 		*parentp, chain,
875 		idx, chain->bref.type, chain->bref.key);
876 #endif
877 
878 	save_key = chain->bref.key;
879 	if (save_key != HAMMER2_KEY_MAX)
880 		++save_key;
881 
882 	/*
883 	 * Try to avoid unnecessary I/O.
884 	 *
885 	 * XXX accounting not propagated up properly.  We might have to do
886 	 *     a RESOLVE_MAYBE here and pass 0 for the flags.
887 	 */
888 	hammer2_chain_unlock(chain);	/* relock exclusive */
889 	hammer2_chain_unlock(*parentp);
890 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
891 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
892 
893 	hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
894 	hammer2_chain_unlock(chain);
895 	hammer2_chain_drop(chain);
896 	chain = NULL;			/* safety */
897 
898 	hammer2_chain_unlock(*parentp);	/* relock shared */
899 	hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
900 				     HAMMER2_RESOLVE_ALWAYS);
901 	*chainp = hammer2_chain_lookup(parentp, &key_next,
902 				     save_key, HAMMER2_KEY_MAX,
903 				     &error,
904 				     HAMMER2_LOOKUP_SHARED |
905 				     HAMMER2_LOOKUP_NODIRECT |
906 				     HAMMER2_LOOKUP_NODATA);
907 	return error;
908 }
909 
910 /*
911  * cparent is locked exclusively, with an extra ref, cluster is not locked.
912  * Replace element [i] in the cluster.
913  */
914 static
915 int
916 hammer2_sync_replace(hammer2_thread_t *thr,
917 		     hammer2_chain_t *parent, hammer2_chain_t *chain,
918 		     hammer2_tid_t mtid, int idx,
919 		     hammer2_xop_head_t *xop, hammer2_chain_t *focus,
920 		     int isroot)
921 {
922 	uint8_t otype;
923 	int nradix;
924 	int error;
925 
926 #if HAMMER2_SYNCHRO_DEBUG
927 	if (hammer2_debug & 1)
928 	kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
929 		chain,
930 		idx,
931 		focus->bref.type, focus->bref.key, mtid);
932 #endif
933 	hammer2_chain_unlock(chain);
934 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
935 	error = chain->error;
936 	if (error == 0) {
937 		const hammer2_media_data_t *data;
938 
939 		if (chain->bytes != focus->bytes) {
940 			/* XXX what if compressed? */
941 			nradix = hammer2_getradix(chain->bytes);
942 			error = hammer2_chain_resize(chain, mtid, 0, nradix, 0);
943 			if (error)
944 				goto failed;
945 		}
946 		error = hammer2_chain_modify(chain, mtid, 0, 0);
947 		if (error)
948 			goto failed;
949 		otype = chain->bref.type;
950 		data = hammer2_xop_gdata(xop);
951 		chain->bref.type = focus->bref.type;
952 		chain->bref.methods = focus->bref.methods;
953 		chain->bref.keybits = focus->bref.keybits;
954 		chain->bref.vradix = focus->bref.vradix;
955 		/* mirror_tid updated by flush */
956 		KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
957 		chain->bref.flags = focus->bref.flags;
958 		/* key already present */
959 		/* check code will be recalculated */
960 
961 		/*
962 		 * Copy data body.
963 		 */
964 		switch(chain->bref.type) {
965 		case HAMMER2_BREF_TYPE_INODE:
966 			/*
967 			 * Special case PFSROOTs, only limited changes can
968 			 * be made since the meta-data contains miscellanious
969 			 * distinguishing fields.
970 			 */
971 			if (isroot) {
972 				chain->data->ipdata.meta.uflags =
973 					data->ipdata.meta.uflags;
974 				chain->data->ipdata.meta.rmajor =
975 					data->ipdata.meta.rmajor;
976 				chain->data->ipdata.meta.rminor =
977 					data->ipdata.meta.rminor;
978 				chain->data->ipdata.meta.ctime =
979 					data->ipdata.meta.ctime;
980 				chain->data->ipdata.meta.mtime =
981 					data->ipdata.meta.mtime;
982 				chain->data->ipdata.meta.atime =
983 					data->ipdata.meta.atime;
984 				/* not btime */
985 				chain->data->ipdata.meta.uid =
986 					data->ipdata.meta.uid;
987 				chain->data->ipdata.meta.gid =
988 					data->ipdata.meta.gid;
989 				chain->data->ipdata.meta.mode =
990 					data->ipdata.meta.mode;
991 				chain->data->ipdata.meta.ncopies =
992 					data->ipdata.meta.ncopies;
993 				chain->data->ipdata.meta.comp_algo =
994 					data->ipdata.meta.comp_algo;
995 				chain->data->ipdata.meta.check_algo =
996 					data->ipdata.meta.check_algo;
997 				chain->data->ipdata.meta.data_quota =
998 					data->ipdata.meta.data_quota;
999 				chain->data->ipdata.meta.inode_quota =
1000 					data->ipdata.meta.inode_quota;
1001 
1002 				/*
1003 				 * last snapshot tid controls overwrite
1004 				 */
1005 				if (chain->data->ipdata.meta.pfs_lsnap_tid <
1006 				    data->ipdata.meta.pfs_lsnap_tid) {
1007 					chain->data->ipdata.meta.pfs_lsnap_tid =
1008 					data->ipdata.meta.pfs_lsnap_tid;
1009 				}
1010 
1011 				hammer2_chain_setcheck(chain, chain->data);
1012 				break;
1013 			}
1014 
1015 			/*
1016 			 * Normal replacement.
1017 			 */
1018 			if ((data->ipdata.meta.op_flags &
1019 			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1020 				/*
1021 				 * If DIRECTDATA is transitioning to 0 or the
1022 				 * old chain is not an inode we have to
1023 				 * initialize the block table.
1024 				 */
1025 				if (otype != HAMMER2_BREF_TYPE_INODE ||
1026 				    (chain->data->ipdata.meta.op_flags &
1027 				     HAMMER2_OPFLAG_DIRECTDATA)) {
1028 					kprintf("chain inode trans "
1029 						"away from dd\n");
1030 					bzero(&chain->data->ipdata.u,
1031 					      sizeof(chain->data->ipdata.u));
1032 				}
1033 				bcopy(data, chain->data,
1034 				      offsetof(hammer2_inode_data_t, u));
1035 				/* XXX setcheck on inode should not be needed */
1036 				hammer2_chain_setcheck(chain, chain->data);
1037 				break;
1038 			}
1039 			/* fall through */
1040 		case HAMMER2_BREF_TYPE_DATA:
1041 			bcopy(data, chain->data, chain->bytes);
1042 			hammer2_chain_setcheck(chain, chain->data);
1043 			break;
1044 		case HAMMER2_BREF_TYPE_DIRENT:
1045 			/*
1046 			 * Directory entries embed data in the blockref.
1047 			 */
1048 			if (chain->bytes) {
1049 				bcopy(data, chain->data, chain->bytes);
1050 				hammer2_chain_setcheck(chain, chain->data);
1051 			} else {
1052 				chain->bref.check = focus->bref.check;
1053 			}
1054 			chain->bref.embed = focus->bref.embed;
1055 			break;
1056 		default:
1057 			KKASSERT(0);
1058 			break;
1059 		}
1060 		hammer2_xop_pdata(xop);
1061 	}
1062 
1063 failed:
1064 	hammer2_chain_unlock(chain);
1065 	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1066 				  HAMMER2_RESOLVE_MAYBE);
1067 
1068 	return error;
1069 }
1070