xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision 896f2e3a)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns non-zero if any chain in the cluster needs to be resized.
120  * Errored elements are not used in the calculation.
121  */
122 int
123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
124 {
125 	hammer2_chain_t *chain;
126 	int i;
127 
128 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
129 	for (i = 0; i < cluster->nchains; ++i) {
130 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
131 			continue;
132 		chain = cluster->array[i].chain;
133 		if (chain == NULL)
134 			continue;
135 		if (chain->error)
136 			continue;
137 		if (chain->bytes != bytes)
138 			return 1;
139 	}
140 	return 0;
141 }
142 
143 /*
144  * Returns the bref type of the cluster's foucs.
145  *
146  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
147  * The cluster must be locked.
148  */
149 uint8_t
150 hammer2_cluster_type(hammer2_cluster_t *cluster)
151 {
152 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
153 	if (cluster->error == 0)
154 		return(cluster->focus->bref.type);
155 	return 0;
156 }
157 
158 /*
159  * Returns non-zero if the cluster's focus is flagged as being modified.
160  *
161  * If the cluster is errored, returns 0.
162  */
163 int
164 hammer2_cluster_modified(hammer2_cluster_t *cluster)
165 {
166 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
167 	if (cluster->error == 0)
168 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
169 	return 0;
170 }
171 
172 /*
173  * Returns the bref of the cluster's focus, sans any data-offset information
174  * (since offset information is per-node and wouldn't be useful).
175  *
176  * Callers use this function to access modify_tid, mirror_tid, type,
177  * key, and keybits.
178  *
179  * If the cluster is errored, returns an empty bref.
180  * The cluster must be locked.
181  */
182 void
183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
184 {
185 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
186 	if (cluster->error == 0) {
187 		*bref = cluster->focus->bref;
188 		bref->data_off = 0;
189 	} else {
190 		bzero(bref, sizeof(*bref));
191 	}
192 }
193 
194 /*
195  * Return non-zero if the chain representing an inode has been flagged
196  * as having been unlinked.  Allows the vnode reclaim to avoid loading
197  * the inode data from disk e.g. when unmount or recycling old, clean
198  * vnodes.
199  *
200  * The cluster does not need to be locked.
201  * The focus cannot be used since the cluster might not be locked.
202  */
203 int
204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int flags;
208 	int i;
209 
210 	flags = 0;
211 	for (i = 0; i < cluster->nchains; ++i) {
212 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
213 			continue;
214 		chain = cluster->array[i].chain;
215 		if (chain)
216 			flags |= chain->flags;
217 	}
218 	return (flags & HAMMER2_CHAIN_UNLINKED);
219 }
220 
221 /*
222  * Set a bitmask of flags in all chains related to a cluster.
223  * The cluster should probably be locked.
224  *
225  * XXX Only operate on FEMOD elements?
226  */
227 void
228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
229 {
230 	hammer2_chain_t *chain;
231 	int i;
232 
233 	for (i = 0; i < cluster->nchains; ++i) {
234 		chain = cluster->array[i].chain;
235 		if (chain)
236 			atomic_set_int(&chain->flags, flags);
237 	}
238 }
239 
240 /*
241  * Set a bitmask of flags in all chains related to a cluster.
242  * The cluster should probably be locked.
243  *
244  * XXX Only operate on FEMOD elements?
245  */
246 void
247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
248 {
249 	hammer2_chain_t *chain;
250 	int i;
251 
252 	for (i = 0; i < cluster->nchains; ++i) {
253 		chain = cluster->array[i].chain;
254 		if (chain)
255 			atomic_clear_int(&chain->flags, flags);
256 	}
257 }
258 
259 /*
260  * Flag the cluster for flushing recursively up to the root.  Despite the
261  * work it does, this is relatively benign.  It just makes sure that the
262  * flusher has top-down visibility to this cluster.
263  *
264  * Errored chains are not flagged for flushing.
265  *
266  * The cluster should probably be locked.
267  */
268 void
269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
270 {
271 	hammer2_chain_t *chain;
272 	int i;
273 
274 	for (i = 0; i < cluster->nchains; ++i) {
275 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
276 			continue;
277 		chain = cluster->array[i].chain;
278 		if (chain == NULL)
279 			continue;
280 		if (chain->error)
281 			continue;
282 		hammer2_chain_setflush(trans, chain);
283 	}
284 }
285 
286 /*
287  * Set the check mode for the cluster.
288  * Errored elements of the cluster are ignored.
289  *
290  * The cluster must be locked and modified.
291  */
292 void
293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
294 				hammer2_cluster_t *cluster,
295 				int check_algo)
296 {
297 	hammer2_chain_t *chain;
298 	int i;
299 
300 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
301 	for (i = 0; i < cluster->nchains; ++i) {
302 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
303 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
304 			continue;
305 		}
306 		chain = cluster->array[i].chain;
307 		if (chain == NULL)
308 			continue;
309 		if (chain->error)
310 			continue;
311 		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
312 		chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
313 		chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
314 	}
315 }
316 
317 /*
318  * Create a degenerate cluster with one ref from a single locked chain.
319  * The returned cluster will be focused on the chain and inherit its
320  * error state.
321  *
322  * The chain's lock and reference are transfered to the new cluster, so
323  * the caller should not try to unlock the chain separately.
324  *
325  * We fake the flags.
326  */
327 hammer2_cluster_t *
328 hammer2_cluster_from_chain(hammer2_chain_t *chain)
329 {
330 	hammer2_cluster_t *cluster;
331 
332 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
333 	cluster->array[0].chain = chain;
334 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
335 	cluster->nchains = 1;
336 	cluster->focus = chain;
337 	cluster->focus_index = 0;
338 	cluster->pmp = chain->pmp;
339 	cluster->refs = 1;
340 	cluster->error = chain->error;
341 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
342 			 HAMMER2_CLUSTER_WRHARD |
343 			 HAMMER2_CLUSTER_RDHARD |
344 			 HAMMER2_CLUSTER_MSYNCED |
345 			 HAMMER2_CLUSTER_SSYNCED;
346 
347 	return cluster;
348 }
349 
350 /*
351  * Add a reference to a cluster and its underlying chains.
352  *
353  * We must also ref the underlying chains in order to allow ref/unlock
354  * sequences to later re-lock.
355  */
356 void
357 hammer2_cluster_ref(hammer2_cluster_t *cluster)
358 {
359 	atomic_add_int(&cluster->refs, 1);
360 #if 0
361 	hammer2_chain_t *chain;
362 	int i;
363 
364 	for (i = 0; i < cluster->nchains; ++i) {
365 		chain = cluster->array[i].chain;
366 		if (chain)
367 			hammer2_chain_ref(chain);
368 	}
369 #endif
370 }
371 
372 /*
373  * Drop the caller's reference to the cluster.  When the ref count drops to
374  * zero this function frees the cluster and drops all underlying chains.
375  *
376  * In-progress read I/Os are typically detached from the cluster once the
377  * first one returns (the remaining stay attached to the DIOs but are then
378  * ignored and drop naturally).
379  */
380 void
381 hammer2_cluster_drop(hammer2_cluster_t *cluster)
382 {
383 	hammer2_chain_t *chain;
384 	int i;
385 
386 	KKASSERT(cluster->refs > 0);
387 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
388 		cluster->focus = NULL;		/* safety XXX chg to assert */
389 		cluster->focus_index = 0;
390 
391 		for (i = 0; i < cluster->nchains; ++i) {
392 			chain = cluster->array[i].chain;
393 			if (chain) {
394 				hammer2_chain_drop(chain);
395 				cluster->array[i].chain = NULL; /* safety */
396 			}
397 		}
398 		cluster->nchains = 0;				/* safety */
399 
400 		kfree(cluster, M_HAMMER2);
401 		/* cluster is invalid */
402 	}
403 }
404 
405 void
406 hammer2_cluster_wait(hammer2_cluster_t *cluster)
407 {
408 	tsleep(cluster->focus, 0, "h2clcw", 1);
409 }
410 
411 /*
412  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
413  * and then locks them, modified by various RESOLVE flags.
414  *
415  * The act of locking a cluster sets its focus.  Note that cluster elements
416  * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus.  Locking a
417  * cluster does not adjust this flag since exact matches only matter for leafs
418  * (parents can depend on minor differences in topology).
419  *
420  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
421  * operations.  Typically this is only set on a quorum of MASTERs or
422  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
423  * is present, this bit is *not* set on a quorum of MASTERs.  The
424  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
425  * that create/modify/delete elements use it.
426  *
427  * The chains making up the cluster may be narrowed down based on quorum
428  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
429  * narrowed down to a single chain as long as the entire subtopology is known
430  * to be intact.  So, for example, we can narrow a read-only op to a single
431  * fast SLAVE but if we focus a CACHE chain we must still retain at least
432  * a SLAVE to ensure that the subtopology can be accessed.
433  *
434  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
435  * to be maintained once the topology is validated as-of the top level of
436  * the operation.
437  *
438  * If a failure occurs the operation must be aborted by higher-level code and
439  * retried. XXX
440  */
441 void
442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
443 {
444 	hammer2_chain_t *chain;
445 	int i;
446 
447 	/* cannot be on inode-embedded cluster template, must be on copy */
448 	KKASSERT(cluster->refs > 0);
449 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
450 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
451 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
452 			cluster);
453 	} else {
454 		KKASSERT(cluster->focus == NULL);
455 	}
456 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
457 
458 	/*
459 	 * Lock chains and resolve state.
460 	 */
461 	for (i = 0; i < cluster->nchains; ++i) {
462 		chain = cluster->array[i].chain;
463 		if (chain == NULL)
464 			continue;
465 		hammer2_chain_lock(chain, how);
466 	}
467 
468 	hammer2_cluster_resolve(cluster);
469 }
470 
471 void
472 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
473 {
474 	hammer2_chain_t *chain;
475 	hammer2_pfs_t *pmp;
476 	hammer2_tid_t quorum_tid;
477 	int focus_pfs_type;
478 	uint32_t nflags;
479 	int ttlmasters;
480 	int ttlslaves;
481 	int nmasters;
482 	int nslaves;
483 	int nquorum;
484 	int smpresent;
485 	int i;
486 
487 	cluster->error = 0;
488 
489 	quorum_tid = 0;
490 	focus_pfs_type = 0;
491 	nflags = 0;
492 	ttlmasters = 0;
493 	ttlslaves = 0;
494 	nmasters = 0;
495 	nslaves = 0;
496 
497 	/*
498 	 * Calculate quorum
499 	 */
500 	pmp = cluster->pmp;
501 	KKASSERT(pmp != NULL || cluster->nchains == 0);
502 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
503 	smpresent = 0;
504 
505 	/*
506 	 * Pass 1
507 	 */
508 	for (i = 0; i < cluster->nchains; ++i) {
509 		chain = cluster->array[i].chain;
510 		if (chain == NULL)
511 			continue;
512 		if (chain->error) {
513 			if (cluster->focus == NULL || cluster->focus == chain) {
514 				/* error will be overridden by valid focus */
515 				cluster->error = chain->error;
516 			}
517 
518 			/*
519 			 * Must count total masters and slaves whether the
520 			 * chain is errored or not.
521 			 */
522 			switch (cluster->pmp->pfs_types[i]) {
523 			case HAMMER2_PFSTYPE_MASTER:
524 				++ttlmasters;
525 				break;
526 			case HAMMER2_PFSTYPE_SLAVE:
527 				++ttlslaves;
528 				break;
529 			}
530 			continue;
531 		}
532 		switch (cluster->pmp->pfs_types[i]) {
533 		case HAMMER2_PFSTYPE_MASTER:
534 			++ttlmasters;
535 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
536 				/*
537 				 * Invalid as in unsynchronized, cannot be
538 				 * used to calculate the quorum.
539 				 */
540 			} else if (quorum_tid < chain->bref.modify_tid ||
541 				   nmasters == 0) {
542 				nmasters = 1;
543 				quorum_tid = chain->bref.modify_tid;
544 			} else if (quorum_tid == chain->bref.modify_tid) {
545 				++nmasters;
546 			}
547 			break;
548 		case HAMMER2_PFSTYPE_SLAVE:
549 			++ttlslaves;
550 			break;
551 		case HAMMER2_PFSTYPE_SOFT_MASTER:
552 			nflags |= HAMMER2_CLUSTER_WRSOFT;
553 			nflags |= HAMMER2_CLUSTER_RDSOFT;
554 			smpresent = 1;
555 			break;
556 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
557 			nflags |= HAMMER2_CLUSTER_RDSOFT;
558 			break;
559 		case HAMMER2_PFSTYPE_SUPROOT:
560 			/*
561 			 * Degenerate cluster representing the super-root
562 			 * topology on a single device.  Fake stuff so
563 			 * cluster ops work as expected.
564 			 */
565 			nflags |= HAMMER2_CLUSTER_WRHARD;
566 			nflags |= HAMMER2_CLUSTER_RDHARD;
567 			cluster->focus_index = i;
568 			cluster->focus = chain;
569 			cluster->error = chain->error;
570 			break;
571 		default:
572 			break;
573 		}
574 	}
575 
576 	/*
577 	 * Pass 2
578 	 */
579 	for (i = 0; i < cluster->nchains; ++i) {
580 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
581 		chain = cluster->array[i].chain;
582 		if (chain == NULL)
583 			continue;
584 		if (chain->error) {
585 			if (cluster->focus == NULL || cluster->focus == chain) {
586 				/* error will be overridden by valid focus */
587 				cluster->error = chain->error;
588 			}
589 			continue;
590 		}
591 
592 		switch (cluster->pmp->pfs_types[i]) {
593 		case HAMMER2_PFSTYPE_MASTER:
594 			/*
595 			 * We must have enough up-to-date masters to reach
596 			 * a quorum and the master modify_tid must match
597 			 * the quorum's modify_tid.
598 			 *
599 			 * Do not select an errored or out-of-sync master.
600 			 */
601 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
602 				nflags |= HAMMER2_CLUSTER_UNHARD;
603 			} else if (nmasters >= nquorum &&
604 				   chain->error == 0 &&
605 				   quorum_tid == chain->bref.modify_tid) {
606 				nflags |= HAMMER2_CLUSTER_WRHARD;
607 				nflags |= HAMMER2_CLUSTER_RDHARD;
608 				if (!smpresent) {
609 					cluster->array[i].flags |=
610 							HAMMER2_CITEM_FEMOD;
611 				}
612 				if (cluster->focus == NULL ||
613 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
614 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
615 					cluster->focus_index = i;
616 					cluster->focus = chain;
617 					cluster->error = chain->error;
618 				}
619 			} else if (chain->error == 0) {
620 				nflags |= HAMMER2_CLUSTER_UNHARD;
621 			}
622 			break;
623 		case HAMMER2_PFSTYPE_SLAVE:
624 			/*
625 			 * We must have enough up-to-date masters to reach
626 			 * a quorum and the slave modify_tid must match the
627 			 * quorum's modify_tid.
628 			 *
629 			 * Do not select an errored slave.
630 			 */
631 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
632 				nflags |= HAMMER2_CLUSTER_UNHARD;
633 			} else if (nmasters >= nquorum &&
634 				   chain->error == 0 &&
635 				   quorum_tid == chain->bref.modify_tid) {
636 				++nslaves;
637 				nflags |= HAMMER2_CLUSTER_RDHARD;
638 				if (cluster->focus == NULL) {
639 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
640 					cluster->focus_index = i;
641 					cluster->focus = chain;
642 					cluster->error = chain->error;
643 				}
644 			} else if (chain->error == 0) {
645 				nflags |= HAMMER2_CLUSTER_UNSOFT;
646 			}
647 			break;
648 		case HAMMER2_PFSTYPE_SOFT_MASTER:
649 			/*
650 			 * Directly mounted soft master always wins.  There
651 			 * should be only one.
652 			 */
653 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
654 			cluster->focus_index = i;
655 			cluster->focus = chain;
656 			cluster->error = chain->error;
657 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
658 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
659 			break;
660 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
661 			/*
662 			 * Directly mounted soft slave always wins.  There
663 			 * should be only one.
664 			 */
665 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
666 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
667 				cluster->focus_index = i;
668 				cluster->focus = chain;
669 				cluster->error = chain->error;
670 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
671 			}
672 			break;
673 		case HAMMER2_PFSTYPE_SUPROOT:
674 			/*
675 			 * spmp (degenerate case)
676 			 */
677 			KKASSERT(i == 0);
678 			cluster->focus_index = i;
679 			cluster->focus = chain;
680 			cluster->error = chain->error;
681 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
682 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
683 			break;
684 		default:
685 			break;
686 		}
687 	}
688 
689 	if (ttlslaves == 0)
690 		nflags |= HAMMER2_CLUSTER_NOSOFT;
691 	if (ttlmasters == 0)
692 		nflags |= HAMMER2_CLUSTER_NOHARD;
693 
694 	/*
695 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
696 	 * all available nodes (even if 0 are available) are fully
697 	 * synchronized.  This is used by the synchronization thread to
698 	 * determine if there is work it could potentially accomplish.
699 	 */
700 	if (nslaves == ttlslaves)
701 		nflags |= HAMMER2_CLUSTER_SSYNCED;
702 	if (nmasters == ttlmasters)
703 		nflags |= HAMMER2_CLUSTER_MSYNCED;
704 
705 	/*
706 	 * Determine if the cluster was successfully locked for the
707 	 * requested operation and generate an error code.  The cluster
708 	 * will not be locked (or ref'd) if an error is returned.
709 	 *
710 	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
711 	 * to determine if reading or writing is possible.  If writing, the
712 	 * cluster still requires a call to hammer2_cluster_modify() first.
713 	 */
714 	atomic_set_int(&cluster->flags, nflags);
715 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
716 }
717 
718 /*
719  * Copy a cluster, returned a ref'd cluster.  All underlying chains
720  * are also ref'd, but not locked.
721  *
722  * The cluster focus is not set because the cluster is not yet locked
723  * (and the originating cluster does not have to be locked either).
724  */
725 hammer2_cluster_t *
726 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
727 {
728 	hammer2_pfs_t *pmp = ocluster->pmp;
729 	hammer2_cluster_t *ncluster;
730 	hammer2_chain_t *chain;
731 	int i;
732 
733 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
734 	ncluster->pmp = pmp;
735 	ncluster->nchains = ocluster->nchains;
736 	ncluster->refs = 1;
737 	ncluster->flags = 0;	/* cluster not locked */
738 
739 	for (i = 0; i < ocluster->nchains; ++i) {
740 		chain = ocluster->array[i].chain;
741 		ncluster->array[i].chain = chain;
742 		if (chain)
743 			hammer2_chain_ref(chain);
744 	}
745 	return (ncluster);
746 }
747 
748 /*
749  * Unlock and deref a cluster.  The cluster is destroyed if this is the
750  * last ref.
751  */
752 void
753 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
754 {
755 	hammer2_chain_t *chain;
756 	int i;
757 
758 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
759 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
760 			cluster);
761 	}
762 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
763 	KKASSERT(cluster->refs > 0);
764 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
765 
766 	for (i = 0; i < cluster->nchains; ++i) {
767 		chain = cluster->array[i].chain;
768 		if (chain)
769 			hammer2_chain_unlock(chain);
770 	}
771 	cluster->focus_index = 0;
772 	cluster->focus = NULL;
773 }
774 
775 /*
776  * Resize the cluster's physical storage allocation in-place.  This may
777  * replace the cluster's chains.
778  */
779 void
780 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
781 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
782 		       int nradix, int flags)
783 {
784 	hammer2_chain_t *chain;
785 	int i;
786 
787 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
788 	KKASSERT(cparent->nchains == cluster->nchains);
789 
790 	for (i = 0; i < cluster->nchains; ++i) {
791 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
792 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
793 			continue;
794 		}
795 		chain = cluster->array[i].chain;
796 		if (chain) {
797 			KKASSERT(cparent->array[i].chain);
798 			hammer2_chain_resize(trans, ip,
799 					     cparent->array[i].chain, chain,
800 					     nradix, flags);
801 		}
802 	}
803 }
804 
805 /*
806  * Set an inode's cluster modified, marking the related chains RW and
807  * duplicating them if necessary.
808  *
809  * The passed-in chain is a localized copy of the chain previously acquired
810  * when the inode was locked (and possilby replaced in the mean time), and
811  * must also be updated.  In fact, we update it first and then synchronize
812  * the inode's cluster cache.
813  */
814 hammer2_inode_data_t *
815 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
816 			  hammer2_cluster_t *cluster, int flags)
817 {
818 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
819 	hammer2_cluster_modify(trans, cluster, flags);
820 
821 	hammer2_inode_repoint(ip, NULL, cluster);
822 	if (ip->vp)
823 		vsetisdirty(ip->vp);
824 	return (&hammer2_cluster_wdata(cluster)->ipdata);
825 }
826 
827 /*
828  * Adjust the cluster's chains to allow modification and adjust the
829  * focus.  Data will be accessible on return.
830  *
831  * If our focused master errors on modify, re-resolve the cluster to
832  * try to select a different master.
833  */
834 void
835 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
836 		       int flags)
837 {
838 	hammer2_chain_t *chain;
839 	int resolve_again;
840 	int i;
841 
842 	resolve_again = 0;
843 	for (i = 0; i < cluster->nchains; ++i) {
844 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
845 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
846 			continue;
847 		}
848 		chain = cluster->array[i].chain;
849 		if (chain == NULL)
850 			continue;
851 		if (chain->error)
852 			continue;
853 		hammer2_chain_modify(trans, chain, flags);
854 		if (cluster->focus == chain && chain->error) {
855 			cluster->error = chain->error;
856 			resolve_again = 1;
857 		}
858 	}
859 	if (resolve_again)
860 		hammer2_cluster_resolve(cluster);
861 }
862 
863 /*
864  * Synchronize modifications from the focus to other chains in a cluster.
865  * Convenient because nominal API users can just modify the contents of the
866  * focus (at least for non-blockref data).
867  *
868  * Nominal front-end operations only edit non-block-table data in a single
869  * chain.  This code copies such modifications to the other chains in the
870  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
871  * by both the frontend and the backend and will explode in fireworks if
872  * blindly copied.
873  */
874 void
875 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
876 {
877 	hammer2_chain_t *focus;
878 	hammer2_chain_t *scan;
879 	const hammer2_inode_data_t *ripdata;
880 	hammer2_inode_data_t *wipdata;
881 	int i;
882 
883 	focus = cluster->focus;
884 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
885 
886 	for (i = 0; i < cluster->nchains; ++i) {
887 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
888 			continue;
889 		scan = cluster->array[i].chain;
890 		if (scan == NULL || scan == focus)
891 			continue;
892 		if (scan->error)
893 			continue;
894 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
895 		KKASSERT(focus->bytes == scan->bytes &&
896 			 focus->bref.type == scan->bref.type);
897 		switch(focus->bref.type) {
898 		case HAMMER2_BREF_TYPE_INODE:
899 			ripdata = &focus->data->ipdata;
900 			wipdata = &scan->data->ipdata;
901 			if ((ripdata->op_flags &
902 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
903 				bcopy(ripdata, wipdata,
904 				      offsetof(hammer2_inode_data_t, u));
905 				break;
906 			}
907 			/* fall through to full copy */
908 		case HAMMER2_BREF_TYPE_DATA:
909 			bcopy(focus->data, scan->data, focus->bytes);
910 			break;
911 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
912 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
913 		case HAMMER2_BREF_TYPE_FREEMAP:
914 		case HAMMER2_BREF_TYPE_VOLUME:
915 			panic("hammer2_cluster_modsync: illegal node type");
916 			/* NOT REACHED */
917 			break;
918 		default:
919 			panic("hammer2_cluster_modsync: unknown node type");
920 			break;
921 		}
922 	}
923 }
924 
925 /*
926  * Lookup initialization/completion API.  Returns a locked cluster with 1 ref.
927  */
928 hammer2_cluster_t *
929 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
930 {
931 	hammer2_cluster_t *cluster;
932 
933 	cluster = hammer2_cluster_copy(cparent);
934 	if (flags & HAMMER2_LOOKUP_SHARED) {
935 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
936 					      HAMMER2_RESOLVE_SHARED);
937 	} else {
938 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
939 	}
940 	return (cluster);
941 }
942 
943 void
944 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
945 {
946 	if (cparent) {
947 		hammer2_cluster_unlock(cparent);
948 		hammer2_cluster_drop(cparent);
949 	}
950 }
951 
952 /*
953  * Locate first match or overlap under parent, return a new cluster
954  */
955 hammer2_cluster_t *
956 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
957 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
958 {
959 	hammer2_pfs_t *pmp;
960 	hammer2_cluster_t *cluster;
961 	hammer2_chain_t *chain;
962 	hammer2_chain_t *focus;
963 	hammer2_key_t key_accum;
964 	hammer2_key_t key_next;
965 	int null_count;
966 	int i;
967 
968 	pmp = cparent->pmp;				/* can be NULL */
969 	key_accum = *key_nextp;
970 	null_count = 0;
971 
972 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
973 	cluster->pmp = pmp;				/* can be NULL */
974 	cluster->refs = 1;
975 	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
976 		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
977 
978 	/*
979 	 * Pass-1, issue lookup and find focus.
980 	 */
981 	for (i = 0; i < cparent->nchains; ++i) {
982 		cluster->array[i].flags = cparent->array[i].flags;
983 		key_next = *key_nextp;
984 
985 		/*
986 		 * Nothing to base the lookup, or parent was not synchronized.
987 		 */
988 		if (cparent->array[i].chain == NULL ||
989 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
990 			++null_count;
991 			continue;
992 		}
993 
994 		chain = hammer2_chain_lookup(&cparent->array[i].chain,
995 					     &key_next,
996 					     key_beg, key_end,
997 					     &cparent->array[i].cache_index,
998 					     flags);
999 		cluster->array[i].chain = chain;
1000 		if (chain == NULL) {
1001 			++null_count;
1002 		} else if (chain->error) {
1003 			/*
1004 			 * Leave errored chain in cluster, but it cannot be
1005 			 * the cluster's focus.  It is still possible for an
1006 			 * error'd chain to be synchronized (since we have
1007 			 * the bref), synchronization state will be handled
1008 			 * in pass-2.
1009 			 */
1010 			;
1011 		} else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
1012 			/*
1013 			 * Leave unsynchronized chain in cluster, but it cannot
1014 			 * be the cluster's focus.
1015 			 */
1016 			;
1017 		} else {
1018 			int ddflag = (chain->bref.type ==
1019 				      HAMMER2_BREF_TYPE_INODE);
1020 
1021 			if (cluster->focus == NULL) {
1022 				cluster->focus_index = i;
1023 				cluster->focus = chain;
1024 				cluster->ddflag = ddflag;
1025 			}
1026 			if (cparent->focus == cparent->array[i].chain) {
1027 				cluster->focus_index = i;
1028 				cluster->focus = chain;
1029 				cluster->ddflag = ddflag;
1030 			}
1031 		}
1032 		if (key_accum > key_next)
1033 			key_accum = key_next;
1034 	}
1035 
1036 	/*
1037 	 * Pass-2 invalidate mismatches
1038 	 */
1039 	focus = cluster->focus;
1040 	if (focus == NULL)
1041 		goto done;
1042 
1043 	for (i = 0; i < cparent->nchains; ++i) {
1044 		int ddflag;
1045 
1046 		chain = cluster->array[i].chain;
1047 
1048 		if (chain == NULL)
1049 			continue;
1050 		if (chain == focus)
1051 			continue;
1052 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1053 			continue;
1054 
1055 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1056 		if (chain->bref.type != focus->bref.type ||
1057 		    chain->bref.key != focus->bref.key ||
1058 		    chain->bref.keybits != focus->bref.keybits ||
1059 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1060 		    chain->bytes != focus->bytes ||
1061 		    ddflag != cluster->ddflag) {
1062 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1063 		}
1064 	}
1065 
1066 	/*
1067 	 * Resolve cluster flags.  A lookup or locking failure could wind
1068 	 * up changing the cluster.
1069 	 */
1070 done:
1071 	*key_nextp = key_accum;
1072 	cluster->nchains = i;
1073 	hammer2_cluster_resolve(cluster);
1074 
1075 	if (null_count == i) {
1076 		hammer2_cluster_drop(cluster);
1077 		cluster = NULL;
1078 	}
1079 
1080 	return (cluster);
1081 }
1082 
1083 /*
1084  * Locate next match or overlap under parent, replace cluster
1085  */
1086 hammer2_cluster_t *
1087 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1088 		     hammer2_key_t *key_nextp,
1089 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
1090 {
1091 	hammer2_chain_t *ochain;
1092 	hammer2_chain_t *nchain;
1093 	hammer2_chain_t *focus;
1094 	hammer2_key_t key_accum;
1095 	hammer2_key_t key_next;
1096 	int null_count;
1097 	int i;
1098 
1099 	key_accum = *key_nextp;
1100 	null_count = 0;
1101 	cluster->focus = NULL;
1102 	cparent->focus = NULL;
1103 	cluster->focus_index = 0;
1104 	cparent->focus_index = 0;
1105 
1106 	cluster->ddflag = 0;
1107 
1108 	for (i = 0; i < cparent->nchains; ++i) {
1109 		key_next = *key_nextp;
1110 		ochain = cluster->array[i].chain;
1111 
1112 		/*
1113 		 * Nothing to iterate from.  These cases can occur under
1114 		 * normal operations.  For example, during synchronization
1115 		 * a slave might reach the end of its scan while records
1116 		 * are still left on the master(s).
1117 		 */
1118 		if (ochain == NULL) {
1119 			++null_count;
1120 			continue;
1121 		}
1122 		if (cparent->array[i].chain == NULL ||
1123 		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
1124 		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
1125 			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
1126 				hammer2_chain_unlock(ochain);
1127 			hammer2_chain_drop(ochain);
1128 			cluster->array[i].chain = NULL;
1129 			++null_count;
1130 			continue;
1131 		}
1132 
1133 		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1134 					    &key_next, key_beg, key_end,
1135 					    &cparent->array[i].cache_index,
1136 					    flags);
1137 		/* ochain now invalid but can still be used for focus check */
1138 
1139 		cluster->array[i].chain = nchain;
1140 		if (nchain == NULL) {
1141 			++null_count;
1142 		} else if (nchain->error) {
1143 			/*
1144 			 * Leave errored chain in cluster, but it cannot be
1145 			 * the cluster's focus.
1146 			 */
1147 			;
1148 		} else {
1149 			int ddflag = (nchain->bref.type ==
1150 				      HAMMER2_BREF_TYPE_INODE);
1151 
1152 			/*
1153 			 * Possible new focus.
1154 			 */
1155 			if (cluster->focus == NULL) {
1156 				cluster->ddflag = ddflag;
1157 				cluster->focus_index = i;
1158 				cluster->focus = nchain;
1159 			}
1160 
1161 			/*
1162 			 * Fixup pre-existing focus.
1163 			 */
1164 			if (cluster->focus == ochain) {
1165 				cluster->focus_index = i;
1166 				cluster->focus = nchain;
1167 			}
1168 		}
1169 		if (key_accum > key_next)
1170 			key_accum = key_next;
1171 	}
1172 
1173 	/*
1174 	 * Pass-2 invalidate mismatches
1175 	 */
1176 	focus = cluster->focus;
1177 	if (focus == NULL)
1178 		goto done;
1179 
1180 	for (i = 0; i < cparent->nchains; ++i) {
1181 		int ddflag;
1182 
1183 		nchain = cluster->array[i].chain;
1184 
1185 		if (nchain == NULL)
1186 			continue;
1187 		if (nchain == focus)
1188 			continue;
1189 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1190 			continue;
1191 
1192 		ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1193 		if (nchain->bref.type != focus->bref.type ||
1194 		    nchain->bref.key != focus->bref.key ||
1195 		    nchain->bref.keybits != focus->bref.keybits ||
1196 		    nchain->bref.modify_tid != focus->bref.modify_tid ||
1197 		    nchain->bytes != focus->bytes ||
1198 		    ddflag != cluster->ddflag) {
1199 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1200 		}
1201 	}
1202 
1203 done:
1204 	*key_nextp = key_accum;
1205 	cluster->nchains = i;
1206 	hammer2_cluster_resolve(cluster);
1207 
1208 	if (null_count == i) {
1209 		hammer2_cluster_drop(cluster);
1210 		cluster = NULL;
1211 	}
1212 	return(cluster);
1213 }
1214 
1215 /*
1216  * Advance just one chain in the cluster and recalculate the invalid bit.
1217  * (used during synchronization to advance past a chain being deleted).
1218  *
1219  * The chain being advanced must not be the focus and the clusters in
1220  * question must have already passed normal cluster_lookup/cluster_next
1221  * checks.
1222  *
1223  * The cluster always remains intact on return, so void function.
1224  */
1225 void
1226 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
1227 				  hammer2_cluster_t *cluster,
1228 				  hammer2_key_t *key_nextp,
1229 				  hammer2_key_t key_beg,
1230 				  hammer2_key_t key_end,
1231 				  int i, int flags)
1232 {
1233 	hammer2_chain_t *ochain;
1234 	hammer2_chain_t *nchain;
1235 	hammer2_chain_t *focus;
1236 	hammer2_key_t key_accum;
1237 	hammer2_key_t key_next;
1238 	int ddflag;
1239 
1240 	key_accum = *key_nextp;
1241 	key_next = *key_nextp;
1242 	ochain = cluster->array[i].chain;
1243 	if (ochain == NULL)
1244 		goto done;
1245 	KKASSERT(ochain != cluster->focus);
1246 
1247 	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
1248 				    &key_next, key_beg, key_end,
1249 				    &cparent->array[i].cache_index,
1250 				    flags);
1251 	/* ochain now invalid */
1252 
1253 	/*
1254 	 * Install nchain.  Note that nchain can be NULL, and can also
1255 	 * be in an unlocked state depending on flags.
1256 	 */
1257 	cluster->array[i].chain = nchain;
1258 	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1259 
1260 	if (key_accum > key_next)
1261 		key_accum = key_next;
1262 
1263 	focus = cluster->focus;
1264 	if (focus == NULL)
1265 		goto done;
1266 	if (nchain == NULL)
1267 		goto done;
1268 #if 0
1269 	if (nchain == focus)	/* ASSERTED NOT TRUE */
1270 		...
1271 #endif
1272 	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
1273 	if (nchain->bref.type != focus->bref.type ||
1274 	    nchain->bref.key != focus->bref.key ||
1275 	    nchain->bref.keybits != focus->bref.keybits ||
1276 	    nchain->bref.modify_tid != focus->bref.modify_tid ||
1277 	    nchain->bytes != focus->bytes ||
1278 	    ddflag != cluster->ddflag) {
1279 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1280 	}
1281 
1282 done:
1283 	*key_nextp = key_accum;
1284 #if 0
1285 	/*
1286 	 * For now don't re-resolve cluster->flags.
1287 	 */
1288 	hammer2_cluster_resolve(cluster);
1289 #endif
1290 }
1291 
1292 /*
1293  * Create a new cluster using the specified key
1294  */
1295 int
1296 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1297 		     hammer2_cluster_t **clusterp,
1298 		     hammer2_key_t key, int keybits,
1299 		     int type, size_t bytes, int flags)
1300 {
1301 	hammer2_cluster_t *cluster;
1302 	hammer2_pfs_t *pmp;
1303 	int error;
1304 	int i;
1305 
1306 	pmp = trans->pmp;				/* can be NULL */
1307 
1308 	if ((cluster = *clusterp) == NULL) {
1309 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
1310 				  M_WAITOK | M_ZERO);
1311 		cluster->pmp = pmp;			/* can be NULL */
1312 		cluster->refs = 1;
1313 		cluster->flags = HAMMER2_CLUSTER_LOCKED;
1314 	}
1315 	cluster->focus_index = 0;
1316 	cluster->focus = NULL;
1317 
1318 	/*
1319 	 * NOTE: cluster->array[] entries can initially be NULL.  If
1320 	 *	 *clusterp is supplied, skip NULL entries, otherwise
1321 	 *	 create new chains.
1322 	 */
1323 	for (i = 0; i < cparent->nchains; ++i) {
1324 		if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1325 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1326 			continue;
1327 		}
1328 		if (*clusterp) {
1329 			if ((cluster->array[i].flags &
1330 			     HAMMER2_CITEM_FEMOD) == 0) {
1331 				cluster->array[i].flags |=
1332 						HAMMER2_CITEM_INVALID;
1333 				continue;
1334 			}
1335 			if (cluster->array[i].chain == NULL)
1336 				continue;
1337 		}
1338 		error = hammer2_chain_create(trans, &cparent->array[i].chain,
1339 					     &cluster->array[i].chain, pmp,
1340 					     key, keybits,
1341 					     type, bytes, flags);
1342 		KKASSERT(error == 0);
1343 		if (cluster->focus == NULL) {
1344 			cluster->focus_index = i;
1345 			cluster->focus = cluster->array[i].chain;
1346 		}
1347 		if (cparent->focus == cparent->array[i].chain) {
1348 			cluster->focus_index = i;
1349 			cluster->focus = cluster->array[i].chain;
1350 		}
1351 	}
1352 	cluster->nchains = i;
1353 	*clusterp = cluster;
1354 	hammer2_cluster_resolve(cluster);
1355 
1356 	return error;
1357 }
1358 
1359 /*
1360  * Rename a cluster to a new parent.
1361  *
1362  * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(),
1363  *	    So the data_off field is not relevant.  Only the key and
1364  *	    keybits are used.
1365  */
1366 void
1367 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
1368 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
1369 		       int flags)
1370 {
1371 	hammer2_chain_t *chain;
1372 	hammer2_blockref_t xbref;
1373 	int i;
1374 
1375 	cluster->focus = NULL;
1376 	cparent->focus = NULL;
1377 	cluster->focus_index = 0;
1378 	cparent->focus_index = 0;
1379 
1380 	for (i = 0; i < cluster->nchains; ++i) {
1381 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1382 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1383 			continue;
1384 		}
1385 		chain = cluster->array[i].chain;
1386 		if (chain) {
1387 			if (bref) {
1388 				xbref = chain->bref;
1389 				xbref.key = bref->key;
1390 				xbref.keybits = bref->keybits;
1391 				hammer2_chain_rename(trans, &xbref,
1392 						     &cparent->array[i].chain,
1393 						     chain, flags);
1394 			} else {
1395 				hammer2_chain_rename(trans, NULL,
1396 						     &cparent->array[i].chain,
1397 						     chain, flags);
1398 			}
1399 			KKASSERT(cluster->array[i].chain == chain); /*remove*/
1400 		}
1401 	}
1402 }
1403 
1404 /*
1405  * Mark a cluster deleted
1406  */
1407 void
1408 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
1409 		       hammer2_cluster_t *cluster, int flags)
1410 {
1411 	hammer2_chain_t *chain;
1412 	hammer2_chain_t *parent;
1413 	int i;
1414 
1415 	if (cparent == NULL) {
1416 		kprintf("cparent is NULL\n");
1417 		return;
1418 	}
1419 
1420 	for (i = 0; i < cluster->nchains; ++i) {
1421 		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
1422 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1423 			continue;
1424 		}
1425 		parent = cparent->array[i].chain;
1426 		chain = cluster->array[i].chain;
1427 		if (chain == NULL)
1428 			continue;
1429 		if (chain->parent != parent) {
1430 			kprintf("hammer2_cluster_delete: parent "
1431 				"mismatch chain=%p parent=%p against=%p\n",
1432 				chain, chain->parent, parent);
1433 		} else {
1434 			hammer2_chain_delete(trans, parent, chain, flags);
1435 		}
1436 	}
1437 }
1438 
1439 /*
1440  * Create a snapshot of the specified {parent, ochain} with the specified
1441  * label.  The originating hammer2_inode must be exclusively locked for
1442  * safety.
1443  *
1444  * The ioctl code has already synced the filesystem.
1445  */
1446 int
1447 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1448 		       hammer2_ioc_pfs_t *pfs)
1449 {
1450 	hammer2_dev_t *hmp;
1451 	hammer2_cluster_t *ncluster;
1452 	const hammer2_inode_data_t *ripdata;
1453 	hammer2_inode_data_t *wipdata;
1454 	hammer2_chain_t *nchain;
1455 	hammer2_inode_t *nip;
1456 	size_t name_len;
1457 	hammer2_key_t lhc;
1458 	struct vattr vat;
1459 #if 0
1460 	uuid_t opfs_clid;
1461 #endif
1462 	int error;
1463 	int i;
1464 
1465 	kprintf("snapshot %s\n", pfs->name);
1466 
1467 	name_len = strlen(pfs->name);
1468 	lhc = hammer2_dirhash(pfs->name, name_len);
1469 
1470 	/*
1471 	 * Get the clid
1472 	 */
1473 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1474 #if 0
1475 	opfs_clid = ripdata->pfs_clid;
1476 #endif
1477 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1478 
1479 	/*
1480 	 * Create the snapshot directory under the super-root
1481 	 *
1482 	 * Set PFS type, generate a unique filesystem id, and generate
1483 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1484 	 * which theoretically allows the snapshot to be used as part of
1485 	 * the same cluster (perhaps as a cache).
1486 	 *
1487 	 * Copy the (flushed) blockref array.  Theoretically we could use
1488 	 * chain_duplicate() but it becomes difficult to disentangle
1489 	 * the shared core so for now just brute-force it.
1490 	 */
1491 	VATTR_NULL(&vat);
1492 	vat.va_type = VDIR;
1493 	vat.va_mode = 0755;
1494 	ncluster = NULL;
1495 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1496 				   proc0.p_ucred, pfs->name, name_len,
1497 				   &ncluster,
1498 				   HAMMER2_INSERT_PFSROOT, &error);
1499 
1500 	if (nip) {
1501 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1502 		wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
1503 		wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
1504 		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1505 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1506 
1507 		/*
1508 		 * Give the snapshot its own private cluster.  As a snapshot
1509 		 * no further synchronization with the original cluster will
1510 		 * be done.
1511 		 */
1512 #if 0
1513 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1514 			wipdata->pfs_clid = opfs_clid;
1515 		else
1516 			kern_uuidgen(&wipdata->pfs_clid, 1);
1517 #endif
1518 		kern_uuidgen(&wipdata->pfs_clid, 1);
1519 
1520 		for (i = 0; i < ncluster->nchains; ++i) {
1521 			if ((ncluster->array[i].flags &
1522 			     HAMMER2_CITEM_FEMOD) == 0) {
1523 				ncluster->array[i].flags |=
1524 					HAMMER2_CITEM_INVALID;
1525 				continue;
1526 			}
1527 			nchain = ncluster->array[i].chain;
1528 			if (nchain)
1529 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1530 		}
1531 #if 0
1532 		/* XXX can't set this unless we do an explicit flush, which
1533 		   we also need a pmp assigned to do, else the flush code
1534 		   won't flush ncluster because it thinks it is crossing a
1535 		   flush boundary */
1536 		hammer2_cluster_set_chainflags(ncluster,
1537 					       HAMMER2_CHAIN_PFSBOUNDARY);
1538 #endif
1539 
1540 		/* XXX hack blockset copy */
1541 		/* XXX doesn't work with real cluster */
1542 		KKASSERT(ocluster->nchains == 1);
1543 		wipdata->u.blockset = ripdata->u.blockset;
1544 		hammer2_cluster_modsync(ncluster);
1545 		for (i = 0; i < ncluster->nchains; ++i) {
1546 			nchain = ncluster->array[i].chain;
1547 			if (nchain)
1548 				hammer2_flush(trans, nchain);
1549 		}
1550 		hammer2_inode_unlock(nip, ncluster);
1551 	}
1552 	return (error);
1553 }
1554 
1555 /*
1556  * Return locked parent cluster given a locked child.  The child remains
1557  * locked on return.  The new parent's focus follows the child's focus
1558  * and the parent is always resolved.
1559  */
1560 hammer2_cluster_t *
1561 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1562 {
1563 	hammer2_cluster_t *cparent;
1564 	int i;
1565 
1566 	cparent = hammer2_cluster_copy(cluster);
1567 
1568 	for (i = 0; i < cparent->nchains; ++i) {
1569 		hammer2_chain_t *chain;
1570 		hammer2_chain_t *rchain;
1571 
1572 		/*
1573 		 * Calculate parent for each element.  Old chain has an extra
1574 		 * ref for cparent but the lock remains with cluster.
1575 		 */
1576 		chain = cparent->array[i].chain;
1577 		if (chain == NULL)
1578 			continue;
1579 		while ((rchain = chain->parent) != NULL) {
1580 			hammer2_chain_ref(rchain);
1581 			hammer2_chain_unlock(chain);
1582 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1583 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1584 			if (chain->parent == rchain)
1585 				break;
1586 			hammer2_chain_unlock(rchain);
1587 			hammer2_chain_drop(rchain);
1588 		}
1589 		if (cluster->focus == chain) {
1590 			cparent->focus_index = i;
1591 			cparent->focus = rchain;
1592 		}
1593 		cparent->array[i].chain = rchain;
1594 		hammer2_chain_drop(chain);
1595 	}
1596 	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
1597 	hammer2_cluster_resolve(cparent);
1598 
1599 	return cparent;
1600 }
1601 
1602 /************************************************************************
1603  *			        CLUSTER I/O 				*
1604  ************************************************************************
1605  *
1606  *
1607  * WARNING! blockref[] array data is not universal.  These functions should
1608  *	    only be used to access universal data.
1609  *
1610  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1611  *	    complete if necessary.  The I/O's should have already been
1612  *	    initiated by the cluster_lock/chain_lock operation.
1613  *
1614  *	    The cluster must already be in a modified state before wdata
1615  *	    is called.  The data will already be available for this case.
1616  */
1617 const hammer2_media_data_t *
1618 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1619 {
1620 	return(cluster->focus->data);
1621 }
1622 
1623 hammer2_media_data_t *
1624 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1625 {
1626 	KKASSERT(hammer2_cluster_modified(cluster));
1627 	return(cluster->focus->data);
1628 }
1629 
1630 /*
1631  * Load cluster data asynchronously with callback.
1632  *
1633  * The callback is made for the first validated data found, or NULL
1634  * if no valid data is available.
1635  *
1636  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1637  *	 in the inode with an exclusive lock held), the chain structure may be
1638  *	 shared.
1639  */
1640 void
1641 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1642 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1643 {
1644 	hammer2_chain_t *chain;
1645 	hammer2_iocb_t *iocb;
1646 	hammer2_dev_t *hmp;
1647 	hammer2_blockref_t *bref;
1648 	int i;
1649 
1650 	/*
1651 	 * Try to find a chain whos data is already resolved.  If none can
1652 	 * be found, start with the first chain.
1653 	 */
1654 	chain = NULL;
1655 	for (i = 0; i < cluster->nchains; ++i) {
1656 		chain = cluster->array[i].chain;
1657 		if (chain && chain->data)
1658 			break;
1659 	}
1660 	if (i == cluster->nchains) {
1661 		chain = cluster->array[0].chain;
1662 		i = 0;
1663 	}
1664 
1665 	iocb = &cluster->iocb;
1666 	iocb->callback = callback;
1667 	iocb->dio = NULL;		/* for already-validated case */
1668 	iocb->cluster = cluster;
1669 	iocb->chain = chain;
1670 	iocb->ptr = ptr;
1671 	iocb->lbase = (off_t)i;
1672 	iocb->flags = 0;
1673 	iocb->error = 0;
1674 
1675 	/*
1676 	 * Data already validated
1677 	 */
1678 	if (chain->data) {
1679 		callback(iocb);
1680 		return;
1681 	}
1682 
1683 	/*
1684 	 * We must resolve to a device buffer, either by issuing I/O or
1685 	 * by creating a zero-fill element.  We do not mark the buffer
1686 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1687 	 * API must still be used to do that).
1688 	 *
1689 	 * The device buffer is variable-sized in powers of 2 down
1690 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1691 	 * chunk always contains buffers of the same size. (XXX)
1692 	 *
1693 	 * The minimum physical IO size may be larger than the variable
1694 	 * block size.
1695 	 *
1696 	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
1697 	 *	      matches hammer2_devblksize()?  Or does the freemap's
1698 	 *	      pre-zeroing handle the case for us?
1699 	 */
1700 	bref = &chain->bref;
1701 	hmp = chain->hmp;
1702 
1703 #if 0
1704 	/* handled by callback? <- TODO XXX even needed for loads? */
1705 	/*
1706 	 * The getblk() optimization for a 100% overwrite can only be used
1707 	 * if the physical block size matches the request.
1708 	 */
1709 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1710 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1711 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1712 		KKASSERT(error == 0);
1713 		iocb->dio = dio;
1714 		callback(iocb);
1715 		return;
1716 	}
1717 #endif
1718 
1719 	/*
1720 	 * Otherwise issue a read
1721 	 */
1722 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1723 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1724 }
1725