xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision d4ef6694)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54 
55 #include "hammer2.h"
56 
57 /*
58  * Returns TRUE if any chain in the cluster needs to be resized.
59  */
60 int
61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
62 {
63 	hammer2_chain_t *chain;
64 	int i;
65 
66 	for (i = 0; i < cluster->nchains; ++i) {
67 		chain = cluster->array[i];
68 		if (chain && chain->bytes != bytes)
69 			return 1;
70 	}
71 	return 0;
72 }
73 
74 uint8_t
75 hammer2_cluster_type(hammer2_cluster_t *cluster)
76 {
77 	return(cluster->focus->bref.type);
78 }
79 
80 int
81 hammer2_cluster_modified(hammer2_cluster_t *cluster)
82 {
83 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
84 }
85 
86 /*
87  * Return a bref representative of the cluster.  Any data offset is removed
88  * (since it would only be applicable to a particular chain in the cluster).
89  *
90  * However, the radix portion of data_off is used for many purposes and will
91  * be retained.
92  */
93 void
94 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
95 {
96 	*bref = cluster->focus->bref;
97 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
98 }
99 
100 void
101 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
102 {
103 	hammer2_chain_t *chain;
104 	int i;
105 
106 	for (i = 0; i < cluster->nchains; ++i) {
107 		chain = cluster->array[i];
108 		if (chain)
109 			atomic_set_int(&chain->flags, flags);
110 	}
111 }
112 
113 void
114 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
115 {
116 	hammer2_chain_t *chain;
117 	int i;
118 
119 	for (i = 0; i < cluster->nchains; ++i) {
120 		chain = cluster->array[i];
121 		if (chain)
122 			hammer2_chain_setflush(trans, chain);
123 	}
124 }
125 
126 void
127 hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
128 				hammer2_cluster_t *cluster,
129 				int check_algo)
130 {
131 	hammer2_chain_t *chain;
132 	int i;
133 
134 	for (i = 0; i < cluster->nchains; ++i) {
135 		chain = cluster->array[i];
136 		if (chain) {
137 			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
138 			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
139 			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
140 		}
141 	}
142 }
143 
144 /*
145  * Create a cluster with one ref from the specified chain.  The chain
146  * is not further referenced.  The caller typically supplies a locked
147  * chain and transfers ownership to the cluster.
148  */
149 hammer2_cluster_t *
150 hammer2_cluster_from_chain(hammer2_chain_t *chain)
151 {
152 	hammer2_cluster_t *cluster;
153 
154 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
155 	cluster->array[0] = chain;
156 	cluster->nchains = 1;
157 	cluster->focus = chain;
158 	cluster->pmp = chain->pmp;
159 	cluster->refs = 1;
160 
161 	return cluster;
162 }
163 
164 /*
165  * Allocates a cluster and its underlying chain structures.  The underlying
166  * chains will be locked.  The cluster and underlying chains will have one
167  * ref.
168  */
169 hammer2_cluster_t *
170 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
171 		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
172 {
173 	hammer2_cluster_t *cluster;
174 	hammer2_cluster_t *rcluster;
175 	hammer2_chain_t *chain;
176 #if 0
177 	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
178 #endif
179 	int i;
180 
181 	KKASSERT(pmp != NULL);
182 
183 	/*
184 	 * Construct the appropriate system structure.
185 	 */
186 	switch(bref->type) {
187 	case HAMMER2_BREF_TYPE_INODE:
188 	case HAMMER2_BREF_TYPE_INDIRECT:
189 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
190 	case HAMMER2_BREF_TYPE_DATA:
191 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
192 		/*
193 		 * Chain's are really only associated with the hmp but we
194 		 * maintain a pmp association for per-mount memory tracking
195 		 * purposes.  The pmp can be NULL.
196 		 */
197 		break;
198 	case HAMMER2_BREF_TYPE_VOLUME:
199 	case HAMMER2_BREF_TYPE_FREEMAP:
200 		chain = NULL;
201 		panic("hammer2_cluster_alloc volume type illegal for op");
202 	default:
203 		chain = NULL;
204 		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
205 		      bref->type);
206 	}
207 
208 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
209 	cluster->refs = 1;
210 
211 	rcluster = &pmp->iroot->cluster;
212 	for (i = 0; i < rcluster->nchains; ++i) {
213 		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
214 					    pmp, trans, bref);
215 #if 0
216 		chain->hmp = rcluster->array[i]->hmp;
217 		chain->bref = *bref;
218 		chain->bytes = bytes;
219 		chain->refs = 1;
220 		chain->flags = HAMMER2_CHAIN_ALLOCATED;
221 #endif
222 
223 		/*
224 		 * NOTE: When loading a chain from backing store or creating a
225 		 *	 snapshot, trans will be NULL and the caller is
226 		 *	 responsible for setting these fields.
227 		 */
228 		cluster->array[i] = chain;
229 	}
230 	cluster->nchains = i;
231 	cluster->pmp = pmp;
232 	cluster->focus = cluster->array[0];
233 
234 	return (cluster);
235 }
236 
237 /*
238  * Add a reference to a cluster.
239  *
240  * We must also ref the underlying chains in order to allow ref/unlock
241  * sequences to later re-lock.
242  */
243 void
244 hammer2_cluster_ref(hammer2_cluster_t *cluster)
245 {
246 	hammer2_chain_t *chain;
247 	int i;
248 
249 	atomic_add_int(&cluster->refs, 1);
250 	for (i = 0; i < cluster->nchains; ++i) {
251 		chain = cluster->array[i];
252 		if (chain)
253 			hammer2_chain_ref(chain);
254 	}
255 }
256 
257 /*
258  * Drop the caller's reference to the cluster.  When the ref count drops to
259  * zero this function frees the cluster and drops all underlying chains.
260  *
261  * In-progress read I/Os are typically detached from the cluster once the
262  * first one returns (the remaining stay attached to the DIOs but are then
263  * ignored and drop naturally).
264  */
265 void
266 hammer2_cluster_drop(hammer2_cluster_t *cluster)
267 {
268 	hammer2_chain_t *chain;
269 	int i;
270 
271 	KKASSERT(cluster->refs > 0);
272 	for (i = 0; i < cluster->nchains; ++i) {
273 		chain = cluster->array[i];
274 		if (chain) {
275 			hammer2_chain_drop(chain);
276 			if (cluster->refs == 1)
277 				cluster->array[i] = NULL;
278 		}
279 	}
280 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
281 		cluster->focus = NULL;
282 		kfree(cluster, M_HAMMER2);
283 		/* cluster = NULL; safety */
284 	}
285 }
286 
287 void
288 hammer2_cluster_wait(hammer2_cluster_t *cluster)
289 {
290 	tsleep(cluster->focus, 0, "h2clcw", 1);
291 }
292 
293 /*
294  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
295  * and then locks them.
296  */
297 int
298 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
299 {
300 	hammer2_chain_t *chain;
301 	int i;
302 	int error;
303 
304 	error = 0;
305 	atomic_add_int(&cluster->refs, 1);
306 	for (i = 0; i < cluster->nchains; ++i) {
307 		chain = cluster->array[i];
308 		if (chain) {
309 			error = hammer2_chain_lock(chain, how);
310 			if (error) {
311 				while (--i >= 0)
312 					hammer2_chain_unlock(cluster->array[i]);
313 				atomic_add_int(&cluster->refs, -1);
314 				break;
315 			}
316 		}
317 	}
318 	return error;
319 }
320 
321 /*
322  * Replace the contents of dst with src, adding a reference to src's chains.
323  * dst is assumed to already have a ref and any chains present in dst are
324  * assumed to be locked and will be unlocked.
325  *
326  * If the chains in src are locked, only one of (src) or (dst) should be
327  * considered locked by the caller after return, not both.
328  */
329 void
330 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
331 {
332 	hammer2_chain_t *chain;
333 	int i;
334 
335 	KKASSERT(dst->refs == 1);
336 	dst->focus = NULL;
337 
338 	for (i = 0; i < src->nchains; ++i) {
339 		chain = src->array[i];
340 		if (chain) {
341 			hammer2_chain_ref(chain);
342 			if (i < dst->nchains && dst->array[i])
343 				hammer2_chain_unlock(dst->array[i]);
344 			dst->array[i] = chain;
345 			if (dst->focus == NULL)
346 				dst->focus = chain;
347 		}
348 	}
349 	while (i < dst->nchains) {
350 		chain = dst->array[i];
351 		if (chain) {
352 			hammer2_chain_unlock(chain);
353 			dst->array[i] = NULL;
354 		}
355 		++i;
356 	}
357 	dst->nchains = src->nchains;
358 }
359 
360 /*
361  * Replace the contents of the locked destination with the contents of the
362  * locked source.  Destination must have one ref.
363  *
364  * Returns with the destination still with one ref and the copied chains
365  * with an additional lock (representing their state on the destination).
366  * The original chains associated with the destination are unlocked.
367  */
368 void
369 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
370 {
371 	hammer2_chain_t *chain;
372 	int i;
373 
374 	KKASSERT(dst->refs == 1);
375 
376 	dst->focus = NULL;
377 	for (i = 0; i < src->nchains; ++i) {
378 		chain = src->array[i];
379 		if (chain) {
380 			hammer2_chain_lock(chain, 0);
381 			if (i < dst->nchains && dst->array[i])
382 				hammer2_chain_unlock(dst->array[i]);
383 			dst->array[i] = src->array[i];
384 			if (dst->focus == NULL)
385 				dst->focus = chain;
386 		}
387 	}
388 	while (i < dst->nchains) {
389 		chain = dst->array[i];
390 		if (chain) {
391 			hammer2_chain_unlock(chain);
392 			dst->array[i] = NULL;
393 		}
394 		++i;
395 	}
396 	dst->nchains = src->nchains;
397 }
398 
399 /*
400  * Copy a cluster, returned a ref'd cluster.  All underlying chains
401  * are also ref'd, but not locked.
402  *
403  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
404  * to the new cluster and a reference is nominally added to them and to
405  * the cluster.  The cluster will have 1 ref.
406  *
407  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
408  * are copied but no additional references are made and the cluster will have
409  * 0 refs.  Callers must ref the cluster and the chains before dropping it
410  * (typically by locking it).
411  *
412  * If flags are passed as 0 the copy is setup as if it contained the chains
413  * but the chains will not be copied over, and the cluster will have 0 refs.
414  * Callers must ref the cluster before dropping it (typically by locking it).
415  */
416 hammer2_cluster_t *
417 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
418 {
419 	hammer2_pfsmount_t *pmp = ocluster->pmp;
420 	hammer2_cluster_t *ncluster;
421 	hammer2_chain_t *chain;
422 	int i;
423 
424 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
425 	ncluster->pmp = pmp;
426 	ncluster->nchains = ocluster->nchains;
427 	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
428 	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
429 		ncluster->focus = ocluster->focus;
430 		for (i = 0; i < ocluster->nchains; ++i) {
431 			chain = ocluster->array[i];
432 			ncluster->array[i] = chain;
433 			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
434 			    chain) {
435 				hammer2_chain_ref(chain);
436 			}
437 		}
438 	}
439 	return (ncluster);
440 }
441 
442 /*
443  * Unlock and deref a cluster.  The cluster is destroyed if this is the
444  * last ref.
445  */
446 void
447 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
448 {
449 	hammer2_chain_t *chain;
450 	int i;
451 
452 	KKASSERT(cluster->refs > 0);
453 	for (i = 0; i < cluster->nchains; ++i) {
454 		chain = cluster->array[i];
455 		if (chain) {
456 			hammer2_chain_unlock(chain);
457 			if (cluster->refs == 1)
458 				cluster->array[i] = NULL;	/* safety */
459 		}
460 	}
461 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
462 		cluster->focus = NULL;
463 		kfree(cluster, M_HAMMER2);
464 		/* cluster = NULL; safety */
465 	}
466 }
467 
468 /*
469  * Resize the cluster's physical storage allocation in-place.  This may
470  * replace the cluster's chains.
471  */
472 void
473 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
474 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
475 		       int nradix, int flags)
476 {
477 	int i;
478 
479 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
480 	KKASSERT(cparent->nchains == cluster->nchains);
481 
482 	cluster->focus = NULL;
483 	for (i = 0; i < cluster->nchains; ++i) {
484 		if (cluster->array[i]) {
485 			KKASSERT(cparent->array[i]);
486 			hammer2_chain_resize(trans, ip,
487 					     cparent->array[i],
488 					     cluster->array[i],
489 					     nradix, flags);
490 			if (cluster->focus == NULL)
491 				cluster->focus = cluster->array[i];
492 		}
493 	}
494 }
495 
496 /*
497  * Set an inode's cluster modified, marking the related chains RW and
498  * duplicating them if necessary.
499  *
500  * The passed-in chain is a localized copy of the chain previously acquired
501  * when the inode was locked (and possilby replaced in the mean time), and
502  * must also be updated.  In fact, we update it first and then synchronize
503  * the inode's cluster cache.
504  */
505 hammer2_inode_data_t *
506 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
507 			  hammer2_cluster_t *cluster, int flags)
508 {
509 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
510 	hammer2_cluster_modify(trans, cluster, flags);
511 
512 	hammer2_inode_repoint(ip, NULL, cluster);
513 	if (ip->vp)
514 		vsetisdirty(ip->vp);
515 	return (&hammer2_cluster_wdata(cluster)->ipdata);
516 }
517 
518 /*
519  * Adjust the cluster's chains to allow modification and adjust the
520  * focus.  Data will be accessible on return.
521  */
522 void
523 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
524 		       int flags)
525 {
526 	int i;
527 
528 	cluster->focus = NULL;
529 	for (i = 0; i < cluster->nchains; ++i) {
530 		if (cluster->array[i]) {
531 			hammer2_chain_modify(trans, cluster->array[i], flags);
532 			if (cluster->focus == NULL)
533 				cluster->focus = cluster->array[i];
534 		}
535 	}
536 }
537 
538 /*
539  * Synchronize modifications from the focus to other chains in a cluster.
540  * Convenient because nominal API users can just modify the contents of the
541  * focus (at least for non-blockref data).
542  *
543  * Nominal front-end operations only edit non-block-table data in a single
544  * chain.  This code copies such modifications to the other chains in the
545  * cluster.  Blocktable modifications are handled on a chain-by-chain basis
546  * by both the frontend and the backend and will explode in fireworks if
547  * blindly copied.
548  */
549 void
550 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
551 {
552 	hammer2_chain_t *focus;
553 	hammer2_chain_t *scan;
554 	const hammer2_inode_data_t *ripdata;
555 	hammer2_inode_data_t *wipdata;
556 	int i;
557 
558 	focus = cluster->focus;
559 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
560 
561 	for (i = 0; i < cluster->nchains; ++i) {
562 		scan = cluster->array[i];
563 		if (scan == NULL || scan == focus)
564 			continue;
565 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
566 		KKASSERT(focus->bytes == scan->bytes &&
567 			 focus->bref.type == scan->bref.type);
568 		switch(focus->bref.type) {
569 		case HAMMER2_BREF_TYPE_INODE:
570 			ripdata = &focus->data->ipdata;
571 			wipdata = &scan->data->ipdata;
572 			if ((ripdata->op_flags &
573 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
574 				bcopy(ripdata, wipdata,
575 				      offsetof(hammer2_inode_data_t, u));
576 				break;
577 			}
578 			/* fall through */
579 		case HAMMER2_BREF_TYPE_DATA:
580 			bcopy(focus->data, scan->data, focus->bytes);
581 			break;
582 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
583 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
584 		case HAMMER2_BREF_TYPE_FREEMAP:
585 		case HAMMER2_BREF_TYPE_VOLUME:
586 			panic("hammer2_cluster_modsync: illegal node type");
587 			/* NOT REACHED */
588 			break;
589 		default:
590 			panic("hammer2_cluster_modsync: unknown node type");
591 			break;
592 		}
593 	}
594 }
595 
596 /*
597  * Lookup initialization/completion API
598  */
599 hammer2_cluster_t *
600 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
601 {
602 	hammer2_cluster_t *cluster;
603 	int i;
604 
605 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
606 	cluster->pmp = cparent->pmp;			/* can be NULL */
607 	/* cluster->focus = NULL; already null */
608 
609 	for (i = 0; i < cparent->nchains; ++i) {
610 		cluster->array[i] = cparent->array[i];
611 		if (cluster->focus == NULL)
612 			cluster->focus = cluster->array[i];
613 	}
614 	cluster->nchains = cparent->nchains;
615 
616 	/*
617 	 * Independently lock (this will also give cluster 1 ref)
618 	 */
619 	if (flags & HAMMER2_LOOKUP_SHARED) {
620 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
621 					      HAMMER2_RESOLVE_SHARED);
622 	} else {
623 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
624 	}
625 	return (cluster);
626 }
627 
628 void
629 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
630 {
631 	if (cparent)
632 		hammer2_cluster_unlock(cparent);
633 }
634 
635 /*
636  * Locate first match or overlap under parent, return a new cluster
637  */
638 hammer2_cluster_t *
639 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
640 		     hammer2_key_t key_beg, hammer2_key_t key_end,
641 		     int flags, int *ddflagp)
642 {
643 	hammer2_pfsmount_t *pmp;
644 	hammer2_cluster_t *cluster;
645 	hammer2_chain_t *chain;
646 	hammer2_key_t key_accum;
647 	hammer2_key_t key_next;
648 	hammer2_key_t bref_key;
649 	int bref_keybits;
650 	int null_count;
651 	int ddflag;
652 	int i;
653 	uint8_t bref_type;
654 	u_int bytes;
655 
656 	pmp = cparent->pmp;				/* can be NULL */
657 	key_accum = *key_nextp;
658 	null_count = 0;
659 	bref_type = 0;
660 	bref_key = 0;
661 	bref_keybits = 0;
662 	bytes = 0;
663 
664 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
665 	cluster->pmp = pmp;				/* can be NULL */
666 	cluster->refs = 1;
667 	/* cluster->focus = NULL; already null */
668 	cparent->focus = NULL;
669 	*ddflagp = 0;
670 
671 	for (i = 0; i < cparent->nchains; ++i) {
672 		key_next = *key_nextp;
673 		if (cparent->array[i] == NULL) {
674 			++null_count;
675 			continue;
676 		}
677 		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
678 					     key_beg, key_end,
679 					     &cparent->cache_index[i],
680 					     flags, &ddflag);
681 		if (cparent->focus == NULL)
682 			cparent->focus = cparent->array[i];
683 		cluster->array[i] = chain;
684 		if (chain == NULL) {
685 			++null_count;
686 		} else {
687 			if (cluster->focus == NULL) {
688 				bref_type = chain->bref.type;
689 				bref_key = chain->bref.key;
690 				bref_keybits = chain->bref.keybits;
691 				bytes = chain->bytes;
692 				*ddflagp = ddflag;
693 				cluster->focus = chain;
694 			}
695 			KKASSERT(bref_type == chain->bref.type);
696 			KKASSERT(bref_key == chain->bref.key);
697 			KKASSERT(bref_keybits == chain->bref.keybits);
698 			KKASSERT(bytes == chain->bytes);
699 			KKASSERT(*ddflagp == ddflag);
700 		}
701 		if (key_accum > key_next)
702 			key_accum = key_next;
703 	}
704 	*key_nextp = key_accum;
705 	cluster->nchains = i;
706 
707 	if (null_count == i) {
708 		hammer2_cluster_drop(cluster);
709 		cluster = NULL;
710 	}
711 
712 	return (cluster);
713 }
714 
715 /*
716  * Locate next match or overlap under parent, replace cluster
717  */
718 hammer2_cluster_t *
719 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
720 		     hammer2_key_t *key_nextp,
721 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
722 {
723 	hammer2_chain_t *chain;
724 	hammer2_key_t key_accum;
725 	hammer2_key_t key_next;
726 	int null_count;
727 	int i;
728 
729 	key_accum = *key_nextp;
730 	null_count = 0;
731 	cluster->focus = NULL;
732 	cparent->focus = NULL;
733 
734 	for (i = 0; i < cparent->nchains; ++i) {
735 		key_next = *key_nextp;
736 		chain = cluster->array[i];
737 		if (chain == NULL) {
738 			if (cparent->focus == NULL)
739 				cparent->focus = cparent->array[i];
740 			++null_count;
741 			continue;
742 		}
743 		if (cparent->array[i] == NULL) {
744 			if (flags & HAMMER2_LOOKUP_NOLOCK)
745 				hammer2_chain_drop(chain);
746 			else
747 				hammer2_chain_unlock(chain);
748 			++null_count;
749 			continue;
750 		}
751 		chain = hammer2_chain_next(&cparent->array[i], chain,
752 					   &key_next, key_beg, key_end,
753 					   &cparent->cache_index[i], flags);
754 		if (cparent->focus == NULL)
755 			cparent->focus = cparent->array[i];
756 		cluster->array[i] = chain;
757 		if (chain == NULL) {
758 			++null_count;
759 		} else if (cluster->focus == NULL) {
760 			cluster->focus = chain;
761 		}
762 		if (key_accum > key_next)
763 			key_accum = key_next;
764 	}
765 
766 	if (null_count == i) {
767 		hammer2_cluster_drop(cluster);
768 		cluster = NULL;
769 	}
770 	return(cluster);
771 }
772 
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *nchain;
	int null_count;
	int n;

	null_count = 0;

	for (n = 0; n < cparent->nchains; ++n) {
		nchain = cluster->array[n];
		if (nchain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[n] == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(nchain);
			else
				hammer2_chain_unlock(nchain);
			++null_count;
			continue;
		}

		nchain = hammer2_chain_scan(cparent->array[n], nchain,
					    &cparent->cache_index[n], flags);
		cluster->array[n] = nchain;
		if (nchain == NULL)
			++null_count;
	}

	if (null_count == n) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return cluster;
}

#endif
826 
827 /*
828  * Create a new cluster using the specified key
829  */
830 int
831 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
832 		     hammer2_cluster_t **clusterp,
833 		     hammer2_key_t key, int keybits,
834 		     int type, size_t bytes, int flags)
835 {
836 	hammer2_cluster_t *cluster;
837 	hammer2_pfsmount_t *pmp;
838 	int error;
839 	int i;
840 
841 	pmp = trans->pmp;				/* can be NULL */
842 
843 	if ((cluster = *clusterp) == NULL) {
844 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
845 				  M_WAITOK | M_ZERO);
846 		cluster->pmp = pmp;			/* can be NULL */
847 		cluster->refs = 1;
848 	}
849 	cluster->focus = NULL;
850 	cparent->focus = NULL;
851 
852 	/*
853 	 * NOTE: cluster->array[] entries can initially be NULL.  If
854 	 *	 *clusterp is supplied, skip NULL entries, otherwise
855 	 *	 create new chains.
856 	 */
857 	for (i = 0; i < cparent->nchains; ++i) {
858 		if (*clusterp && cluster->array[i] == NULL) {
859 			if (cparent->focus == NULL)
860 				cparent->focus = cparent->array[i];
861 			continue;
862 		}
863 		error = hammer2_chain_create(trans, &cparent->array[i],
864 					     &cluster->array[i], pmp,
865 					     key, keybits,
866 					     type, bytes, flags);
867 		KKASSERT(error == 0);
868 		if (cparent->focus == NULL)
869 			cparent->focus = cparent->array[i];
870 		if (cluster->focus == NULL)
871 			cluster->focus = cluster->array[i];
872 	}
873 	cluster->nchains = i;
874 	*clusterp = cluster;
875 
876 	return error;
877 }
878 
879 /*
880  * Rename a cluster to a new parent.
881  *
882  * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
883  *	    are used from a passed-in non-NULL bref pointer.  All other fields
884  *	    are extracted from the original chain for each chain in the
885  *	    iteration.
886  */
887 void
888 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
889 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
890 		       int flags)
891 {
892 	hammer2_chain_t *chain;
893 	hammer2_blockref_t xbref;
894 	int i;
895 
896 	cluster->focus = NULL;
897 	cparent->focus = NULL;
898 
899 	for (i = 0; i < cluster->nchains; ++i) {
900 		chain = cluster->array[i];
901 		if (chain) {
902 			if (bref) {
903 				xbref = chain->bref;
904 				xbref.key = bref->key;
905 				xbref.keybits = bref->keybits;
906 				hammer2_chain_rename(trans, &xbref,
907 						     &cparent->array[i],
908 						     chain, flags);
909 			} else {
910 				hammer2_chain_rename(trans, NULL,
911 						     &cparent->array[i],
912 						     chain, flags);
913 			}
914 			cluster->array[i] = chain;
915 			if (cluster->focus == NULL)
916 				cluster->focus = chain;
917 			if (cparent->focus == NULL)
918 				cparent->focus = cparent->array[i];
919 		} else {
920 			if (cparent->focus == NULL)
921 				cparent->focus = cparent->array[i];
922 		}
923 	}
924 }
925 
926 /*
927  * Mark a cluster deleted
928  */
929 void
930 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
931 		       hammer2_cluster_t *cluster, int flags)
932 {
933 	hammer2_chain_t *chain;
934 	hammer2_chain_t *parent;
935 	int i;
936 
937 	if (cparent == NULL) {
938 		kprintf("cparent is NULL\n");
939 		return;
940 	}
941 
942 	for (i = 0; i < cluster->nchains; ++i) {
943 		parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
944 		chain = cluster->array[i];
945 		if (chain == NULL)
946 			continue;
947 		if (chain->parent != parent) {
948 			kprintf("hammer2_cluster_delete: parent "
949 				"mismatch chain=%p parent=%p against=%p\n",
950 				chain, chain->parent, parent);
951 		} else {
952 			hammer2_chain_delete(trans, parent, chain, flags);
953 		}
954 	}
955 }
956 
957 /*
958  * Create a snapshot of the specified {parent, ochain} with the specified
959  * label.  The originating hammer2_inode must be exclusively locked for
960  * safety.
961  *
962  * The ioctl code has already synced the filesystem.
963  */
964 int
965 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
966 		       hammer2_ioc_pfs_t *pfs)
967 {
968 	hammer2_mount_t *hmp;
969 	hammer2_cluster_t *ncluster;
970 	const hammer2_inode_data_t *ripdata;
971 	hammer2_inode_data_t *wipdata;
972 	hammer2_inode_t *nip;
973 	size_t name_len;
974 	hammer2_key_t lhc;
975 	struct vattr vat;
976 	uuid_t opfs_clid;
977 	int error;
978 	int i;
979 
980 	kprintf("snapshot %s\n", pfs->name);
981 
982 	name_len = strlen(pfs->name);
983 	lhc = hammer2_dirhash(pfs->name, name_len);
984 
985 	/*
986 	 * Get the clid
987 	 */
988 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
989 	opfs_clid = ripdata->pfs_clid;
990 	hmp = ocluster->focus->hmp;
991 
992 	/*
993 	 * Create the snapshot directory under the super-root
994 	 *
995 	 * Set PFS type, generate a unique filesystem id, and generate
996 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
997 	 * which theoretically allows the snapshot to be used as part of
998 	 * the same cluster (perhaps as a cache).
999 	 *
1000 	 * Copy the (flushed) blockref array.  Theoretically we could use
1001 	 * chain_duplicate() but it becomes difficult to disentangle
1002 	 * the shared core so for now just brute-force it.
1003 	 */
1004 	VATTR_NULL(&vat);
1005 	vat.va_type = VDIR;
1006 	vat.va_mode = 0755;
1007 	ncluster = NULL;
1008 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1009 				   proc0.p_ucred, pfs->name, name_len,
1010 				   &ncluster, &error);
1011 
1012 	if (nip) {
1013 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1014 		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1015 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1016 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1017 			wipdata->pfs_clid = opfs_clid;
1018 		else
1019 			kern_uuidgen(&wipdata->pfs_clid, 1);
1020 
1021 		for (i = 0; i < ncluster->nchains; ++i) {
1022 			if (ncluster->array[i]) {
1023 				ncluster->array[i]->bref.flags |=
1024 				    HAMMER2_BREF_FLAG_PFSROOT;
1025 			}
1026 		}
1027 #if 0
1028 		/* XXX can't set this unless we do an explicit flush, which
1029 		   we also need a pmp assigned to do, else the flush code
1030 		   won't flush ncluster because it thinks it is crossing a
1031 		   flush boundary */
1032 		hammer2_cluster_set_chainflags(ncluster,
1033 					       HAMMER2_CHAIN_PFSBOUNDARY);
1034 #endif
1035 
1036 		/* XXX hack blockset copy */
1037 		/* XXX doesn't work with real cluster */
1038 		KKASSERT(ocluster->nchains == 1);
1039 		wipdata->u.blockset = ripdata->u.blockset;
1040 		hammer2_cluster_modsync(ncluster);
1041 		for (i = 0; i < ncluster->nchains; ++i) {
1042 			if (ncluster->array[i])
1043 				hammer2_flush(trans, ncluster->array[i]);
1044 		}
1045 		hammer2_inode_unlock_ex(nip, ncluster);
1046 	}
1047 	return (error);
1048 }
1049 
1050 /*
1051  * Return locked parent cluster given a locked child.  The child remains
1052  * locked on return.
1053  */
1054 hammer2_cluster_t *
1055 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1056 {
1057 	hammer2_cluster_t *cparent;
1058 	int i;
1059 
1060 	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
1061 	for (i = 0; i < cluster->nchains; ++i) {
1062 		hammer2_chain_t *chain;
1063 		hammer2_chain_t *rchain;
1064 
1065 		chain = cluster->array[i];
1066 		if (chain == NULL)
1067 			continue;
1068 		hammer2_chain_ref(chain);
1069 		while ((rchain = chain->parent) != NULL) {
1070 			hammer2_chain_ref(rchain);
1071 			hammer2_chain_unlock(chain);
1072 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1073 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1074 			hammer2_chain_drop(rchain);
1075 			if (chain->parent == rchain)
1076 				break;
1077 			hammer2_chain_unlock(rchain);
1078 		}
1079 		hammer2_chain_drop(chain);
1080 		cparent->array[i] = rchain;
1081 	}
1082 	return cparent;
1083 }
1084 
1085 /************************************************************************
1086  *			        CLUSTER I/O 				*
1087  ************************************************************************
1088  *
1089  *
1090  * WARNING! blockref[] array data is not universal.  These functions should
1091  *	    only be used to access universal data.
1092  *
1093  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1094  *	    complete if necessary.  The I/O's should have already been
1095  *	    initiated by the cluster_lock/chain_lock operation.
1096  *
1097  *	    The cluster must already be in a modified state before wdata
1098  *	    is called.  The data will already be available for this case.
1099  */
1100 const hammer2_media_data_t *
1101 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1102 {
1103 	return(cluster->focus->data);
1104 }
1105 
1106 hammer2_media_data_t *
1107 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1108 {
1109 	KKASSERT(hammer2_cluster_modified(cluster));
1110 	return(cluster->focus->data);
1111 }
1112 
1113 /*
1114  * Load async into independent buffer - used to load logical buffers from
1115  * underlying device data.  The callback is made for the first validated
1116  * data found, or NULL if no valid data is available.
1117  *
1118  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1119  *	 in the inode with an exclusive lock held), the chain structure may be
1120  *	 shared.
1121  */
1122 void
1123 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1124 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1125 {
1126 	hammer2_chain_t *chain;
1127 	hammer2_iocb_t *iocb;
1128 	hammer2_mount_t *hmp;
1129 	hammer2_blockref_t *bref;
1130 	int i;
1131 
1132 	/*
1133 	 * Try to find a chain whos data is already resolved.  If none can
1134 	 * be found, start with the first chain.
1135 	 */
1136 	chain = NULL;
1137 	for (i = 0; i < cluster->nchains; ++i) {
1138 		chain = cluster->array[i];
1139 		if (chain && chain->data)
1140 			break;
1141 	}
1142 	if (i == cluster->nchains) {
1143 		chain = cluster->array[0];
1144 		i = 0;
1145 	}
1146 
1147 	iocb = &cluster->iocb;
1148 	iocb->callback = callback;
1149 	iocb->dio = NULL;		/* for already-validated case */
1150 	iocb->cluster = cluster;
1151 	iocb->chain = chain;
1152 	iocb->ptr = ptr;
1153 	iocb->lbase = (off_t)i;
1154 	iocb->flags = 0;
1155 	iocb->error = 0;
1156 
1157 	/*
1158 	 * Data already validated
1159 	 */
1160 	if (chain->data) {
1161 		callback(iocb);
1162 		return;
1163 	}
1164 
1165 	/*
1166 	 * We must resolve to a device buffer, either by issuing I/O or
1167 	 * by creating a zero-fill element.  We do not mark the buffer
1168 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1169 	 * API must still be used to do that).
1170 	 *
1171 	 * The device buffer is variable-sized in powers of 2 down
1172 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1173 	 * chunk always contains buffers of the same size. (XXX)
1174 	 *
1175 	 * The minimum physical IO size may be larger than the variable
1176 	 * block size.
1177 	 */
1178 	bref = &chain->bref;
1179 	hmp = chain->hmp;
1180 
1181 #if 0
1182 	/* handled by callback? <- TODO XXX even needed for loads? */
1183 	/*
1184 	 * The getblk() optimization for a 100% overwrite can only be used
1185 	 * if the physical block size matches the request.
1186 	 */
1187 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1188 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1189 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1190 		KKASSERT(error == 0);
1191 		iocb->dio = dio;
1192 		callback(iocb);
1193 		return;
1194 	}
1195 #endif
1196 
1197 	/*
1198 	 * Otherwise issue a read
1199 	 */
1200 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1201 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1202 }
1203 
1204 /************************************************************************
1205  *			    NODE FAILURES 				*
1206  ************************************************************************
1207  *
1208  * A node failure can occur for numerous reasons.
1209  *
1210  *	- A read I/O may fail
1211  *	- A write I/O may fail
1212  *	- An unexpected chain might be found (or be missing)
1213  *	- A node might disconnect temporarily and reconnect later
1214  *	  (for example, a USB stick could get pulled, or a node might
1215  *	  be programmatically disconnected).
1216  *	- A node might run out of space during a modifying operation.
1217  *
1218  * When a read failure or an unexpected chain state is found, the chain and
1219  * parent chain at the failure point for the nodes involved (the nodes
1220  * which we determine to be in error) are flagged as failed and removed
1221  * from the cluster.  The node itself is allowed to remain active.  The
1222  * highest common point (usually a parent chain) is queued to the
1223  * resynchronization thread for action.
1224  *
1225  * When a write I/O fails or a node runs out of space, we first adjust
1226  * as if a read failure occurs but we further disable flushes on the
1227  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1228  * but any new modifying transactions will automatically remove the node
1229  * from consideration in all related cluster structures and not generate
1230  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1231  * to the resynchronization thread for action.
1232  *
1233  * A temporary disconnect is handled as if a write failure occurred.
1234  *
1235  * Any of these failures might or might not stall related high level VNOPS,
1236  * depending on what has failed, what nodes remain, the type of cluster,
1237  * and the operating state of the cluster.
1238  *
1239  *			    FLUSH ON WRITE-DISABLED NODES
1240  *
1241  * A flush on a write-disabled node is not allowed to write anything because
1242  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1243  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1244  * Dirty meta-data related to the failed node is thrown away.
1245  *
1246  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1247  * retired... that is, if the filesystem still has enough nodes to complete
1248  * the operation.
1249  */
1250 
1251 /************************************************************************
1252  *			SYNCHRONIZATION THREAD				*
1253  ************************************************************************
1254  *
1255  * This thread is responsible for [re]synchronizing the cluster representing
1256  * a PFS.  Any out-of-sync or failed node starts this thread on a
1257  * node-by-node basis when the failure is detected.
1258  *
1259  * Clusters needing resynchronization are queued at the highest point
1260  * where the parent on the failed node is still valid, or a special
1261  * incremental scan from the ROOT is queued if no parent exists.  This
1262  * thread is also responsible for waiting for reconnections of the failed
1263  * node if the cause was due to a disconnect, and waiting for space to be
1264  * freed up if the cause was due to running out of space.
1265  *
1266  * If the cause is due to a node running out of space, this thread will also
1267  * remove older (unlocked) snapshots to make new space, recover space, and
1268  * then start resynchronization.
1269  *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
1274  *
1275  * Since these are per-node threads it is possible to resynchronize several
1276  * nodes at once.
1277  */
1278