xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision e0b1d537)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information into a single entity.  It allows direct access to media
37  * data as long as it is not blockref array data.  Meaning, basically,
38  * just inode and file data.
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  */
48 #include <sys/cdefs.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/lock.h>
53 #include <sys/uuid.h>
54 
55 #include "hammer2.h"
56 
57 u_int
58 hammer2_cluster_bytes(hammer2_cluster_t *cluster)
59 {
60 	return(cluster->focus->bytes);
61 }
62 
63 uint8_t
64 hammer2_cluster_type(hammer2_cluster_t *cluster)
65 {
66 	return(cluster->focus->bref.type);
67 }
68 
69 /*
70  * NOTE: When modifying a cluster object via hammer2_cluster_wdata()
71  *	 and hammer2_cluster_modsync(), remember that block array
72  *	 entries are not copied to the elements of the cluster.
73  */
74 const hammer2_media_data_t *
75 hammer2_cluster_data(hammer2_cluster_t *cluster)
76 {
77 	return(cluster->focus->data);
78 }
79 
80 hammer2_media_data_t *
81 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
82 {
83 	return(cluster->focus->data);
84 }
85 
86 int
87 hammer2_cluster_modified(hammer2_cluster_t *cluster)
88 {
89 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
90 }
91 
92 int
93 hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
94 {
95 	return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
96 }
97 
98 /*
99  * Return a bref representative of the cluster.  Any data offset is removed
100  * (since it would only be applicable to a particular chain in the cluster).
101  *
102  * However, the radix portion of data_off is used for many purposes and will
103  * be retained.
104  */
105 void
106 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
107 {
108 	*bref = cluster->focus->bref;
109 	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
110 }
111 
112 void
113 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
114 {
115 	hammer2_chain_t *chain;
116 	int i;
117 
118 	for (i = 0; i < cluster->nchains; ++i) {
119 		chain = cluster->array[i];
120 		if (chain)
121 			atomic_set_int(&chain->flags, flags);
122 	}
123 }
124 
125 void
126 hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
127 {
128 	hammer2_chain_t *chain;
129 	int i;
130 
131 	for (i = 0; i < cluster->nchains; ++i) {
132 		chain = cluster->array[i];
133 		if (chain)
134 			hammer2_chain_setsubmod(trans, chain);
135 	}
136 }
137 
138 /*
139  * Create a cluster with one ref from the specified chain.  The chain
140  * is not further referenced.  The caller typically supplies a locked
141  * chain and transfers ownership to the cluster.
142  */
143 hammer2_cluster_t *
144 hammer2_cluster_from_chain(hammer2_chain_t *chain)
145 {
146 	hammer2_cluster_t *cluster;
147 
148 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
149 	cluster->array[0] = chain;
150 	cluster->nchains = 1;
151 	cluster->focus = chain;
152 	cluster->pmp = chain->pmp;
153 	cluster->refs = 1;
154 
155 	return cluster;
156 }
157 
158 /*
159  * Allocates a cluster and its underlying chain structures.  The underlying
160  * chains will be locked.  The cluster and underlying chains will have one
161  * ref.
162  */
163 hammer2_cluster_t *
164 hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
165 		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
166 {
167 	hammer2_cluster_t *cluster;
168 	hammer2_cluster_t *rcluster;
169 	hammer2_chain_t *chain;
170 	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
171 	int i;
172 
173 	KKASSERT(pmp != NULL);
174 
175 	/*
176 	 * Construct the appropriate system structure.
177 	 */
178 	switch(bref->type) {
179 	case HAMMER2_BREF_TYPE_INODE:
180 	case HAMMER2_BREF_TYPE_INDIRECT:
181 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
182 	case HAMMER2_BREF_TYPE_DATA:
183 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
184 		/*
185 		 * Chain's are really only associated with the hmp but we
186 		 * maintain a pmp association for per-mount memory tracking
187 		 * purposes.  The pmp can be NULL.
188 		 */
189 		break;
190 	case HAMMER2_BREF_TYPE_VOLUME:
191 	case HAMMER2_BREF_TYPE_FREEMAP:
192 		chain = NULL;
193 		panic("hammer2_cluster_alloc volume type illegal for op");
194 	default:
195 		chain = NULL;
196 		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
197 		      bref->type);
198 	}
199 
200 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
201 	cluster->refs = 1;
202 
203 	rcluster = &pmp->iroot->cluster;
204 	for (i = 0; i < rcluster->nchains; ++i) {
205 		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
206 					    pmp, trans, bref);
207 		chain->hmp = rcluster->array[i]->hmp;
208 		chain->bref = *bref;
209 		chain->bytes = bytes;
210 		chain->refs = 1;
211 		chain->flags = HAMMER2_CHAIN_ALLOCATED;
212 		chain->delete_xid = HAMMER2_XID_MAX;
213 
214 		/*
215 		 * Set modify_tid if a transaction is creating the inode.
216 		 * Enforce update_xlo = 0 so nearby transactions do not think
217 		 * it has been flushed when it hasn't.
218 		 *
219 		 * NOTE: When loading a chain from backing store or creating a
220 		 *	 snapshot, trans will be NULL and the caller is
221 		 *	 responsible for setting these fields.
222 		 */
223 		if (trans) {
224 			chain->modify_xid = trans->sync_xid;
225 			chain->update_xlo = 0;
226 		}
227 		cluster->array[i] = chain;
228 	}
229 	cluster->nchains = i;
230 	cluster->pmp = pmp;
231 	cluster->focus = cluster->array[0];
232 
233 	return (cluster);
234 }
235 
236 /*
237  * Associate an existing core with the chain or allocate a new core.
238  *
239  * The core is not locked.  No additional refs on the chain are made.
240  * (trans) must not be NULL if (core) is not NULL.
241  *
242  * When chains are delete-duplicated during flushes we insert nchain on
243  * the ownerq after ochain instead of at the end in order to give the
244  * drop code visibility in the correct order, otherwise drops can be missed.
245  */
246 void
247 hammer2_cluster_core_alloc(hammer2_trans_t *trans,
248 			   hammer2_cluster_t *ncluster,
249 			   hammer2_cluster_t *ocluster)
250 {
251 	int i;
252 
253 	for (i = 0; i < ocluster->nchains; ++i) {
254 		if (ncluster->array[i]) {
255 			hammer2_chain_core_alloc(trans,
256 						 ncluster->array[i],
257 						 ocluster->array[i]);
258 		}
259 	}
260 }
261 
262 /*
263  * Add a reference to a cluster.
264  *
265  * We must also ref the underlying chains in order to allow ref/unlock
266  * sequences to later re-lock.
267  */
268 void
269 hammer2_cluster_ref(hammer2_cluster_t *cluster)
270 {
271 	hammer2_chain_t *chain;
272 	int i;
273 
274 	atomic_add_int(&cluster->refs, 1);
275 	for (i = 0; i < cluster->nchains; ++i) {
276 		chain = cluster->array[i];
277 		if (chain)
278 			hammer2_chain_ref(chain);
279 	}
280 }
281 
282 /*
283  * Drop the caller's reference to the cluster.  When the ref count drops to
284  * zero this function frees the cluster and drops all underlying chains.
285  */
286 void
287 hammer2_cluster_drop(hammer2_cluster_t *cluster)
288 {
289 	hammer2_chain_t *chain;
290 	int i;
291 
292 	KKASSERT(cluster->refs > 0);
293 	for (i = 0; i < cluster->nchains; ++i) {
294 		chain = cluster->array[i];
295 		if (chain) {
296 			hammer2_chain_drop(chain);
297 			if (cluster->refs == 1)
298 				cluster->array[i] = NULL;
299 		}
300 	}
301 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
302 		cluster->focus = NULL;
303 		kfree(cluster, M_HAMMER2);
304 		/* cluster = NULL; safety */
305 	}
306 }
307 
308 void
309 hammer2_cluster_wait(hammer2_cluster_t *cluster)
310 {
311 	tsleep(cluster->focus, 0, "h2clcw", 1);
312 }
313 
314 /*
315  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
316  * and then locks them.
317  */
318 int
319 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
320 {
321 	hammer2_chain_t *chain;
322 	int i;
323 	int error;
324 
325 	error = 0;
326 	atomic_add_int(&cluster->refs, 1);
327 	for (i = 0; i < cluster->nchains; ++i) {
328 		chain = cluster->array[i];
329 		if (chain) {
330 			error = hammer2_chain_lock(chain, how);
331 			if (error) {
332 				while (--i >= 0)
333 					hammer2_chain_unlock(cluster->array[i]);
334 				atomic_add_int(&cluster->refs, -1);
335 				break;
336 			}
337 		}
338 	}
339 	return error;
340 }
341 
342 /*
343  * Replace the contents of dst with src, adding a reference to src's chains.
344  * dst is assumed to already have a ref and any chains present in dst are
345  * assumed to be locked and will be unlocked.
346  *
347  * If the chains in src are locked, only one of (src) or (dst) should be
348  * considered locked by the caller after return, not both.
349  */
350 void
351 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
352 {
353 	hammer2_chain_t *chain;
354 	int i;
355 
356 	KKASSERT(dst->refs == 1);
357 	dst->focus = NULL;
358 
359 	for (i = 0; i < src->nchains; ++i) {
360 		chain = src->array[i];
361 		if (chain) {
362 			hammer2_chain_ref(chain);
363 			if (i < dst->nchains && dst->array[i])
364 				hammer2_chain_unlock(dst->array[i]);
365 			dst->array[i] = chain;
366 			if (dst->focus == NULL)
367 				dst->focus = chain;
368 		}
369 	}
370 	while (i < dst->nchains) {
371 		chain = dst->array[i];
372 		if (chain) {
373 			hammer2_chain_unlock(chain);
374 			dst->array[i] = NULL;
375 		}
376 		++i;
377 	}
378 	dst->nchains = src->nchains;
379 }
380 
381 /*
382  * Replace the contents of the locked destination with the contents of the
383  * locked source.  Destination must have one ref.
384  *
385  * Returns with the destination still with one ref and the copied chains
386  * with an additional lock (representing their state on the destination).
387  * The original chains associated with the destination are unlocked.
388  */
389 void
390 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
391 {
392 	hammer2_chain_t *chain;
393 	int i;
394 
395 	KKASSERT(dst->refs == 1);
396 
397 	dst->focus = NULL;
398 	for (i = 0; i < src->nchains; ++i) {
399 		chain = src->array[i];
400 		if (chain) {
401 			hammer2_chain_lock(chain, 0);
402 			if (i < dst->nchains && dst->array[i])
403 				hammer2_chain_unlock(dst->array[i]);
404 			dst->array[i] = src->array[i];
405 			if (dst->focus == NULL)
406 				dst->focus = chain;
407 		}
408 	}
409 	while (i < dst->nchains) {
410 		chain = dst->array[i];
411 		if (chain) {
412 			hammer2_chain_unlock(chain);
413 			dst->array[i] = NULL;
414 		}
415 		++i;
416 	}
417 	dst->nchains = src->nchains;
418 }
419 
420 /*
421  * Copy a cluster, returned a ref'd cluster.  All underlying chains
422  * are also ref'd, but not locked.
423  *
424  * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied
425  * to the new cluster and a reference is nominally added to them and to
426  * the cluster.  The cluster will have 1 ref.
427  *
428  * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains
429  * are copied but no additional references are made and the cluster will have
430  * 0 refs.  Callers must ref the cluster and the chains before dropping it
431  * (typically by locking it).
432  *
433  * If flags are passed as 0 the copy is setup as if it contained the chains
434  * but the chains will not be copied over, and the cluster will have 0 refs.
435  * Callers must ref the cluster before dropping it (typically by locking it).
436  */
437 hammer2_cluster_t *
438 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
439 {
440 	hammer2_pfsmount_t *pmp = ocluster->pmp;
441 	hammer2_cluster_t *ncluster;
442 	hammer2_chain_t *chain;
443 	int i;
444 
445 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
446 	ncluster->pmp = pmp;
447 	ncluster->nchains = ocluster->nchains;
448 	ncluster->focus = ocluster->focus;
449 	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
450 	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
451 		for (i = 0; i < ocluster->nchains; ++i) {
452 			chain = ocluster->array[i];
453 			ncluster->array[i] = chain;
454 			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
455 			    chain) {
456 				hammer2_chain_ref(chain);
457 			}
458 		}
459 	}
460 	return (ncluster);
461 }
462 
463 /*
464  * Unlock and deref a cluster.  The cluster is destroyed if this is the
465  * last ref.
466  */
467 void
468 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
469 {
470 	hammer2_chain_t *chain;
471 	int i;
472 
473 	KKASSERT(cluster->refs > 0);
474 	for (i = 0; i < cluster->nchains; ++i) {
475 		chain = cluster->array[i];
476 		if (chain) {
477 			hammer2_chain_unlock(chain);
478 			if (cluster->refs == 1)
479 				cluster->array[i] = NULL;	/* safety */
480 		}
481 	}
482 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
483 		cluster->focus = NULL;
484 		kfree(cluster, M_HAMMER2);
485 		/* cluster = NULL; safety */
486 	}
487 }
488 
489 /*
490  * Refactor the locked chains of a cluster.
491  */
492 void
493 hammer2_cluster_refactor(hammer2_cluster_t *cluster)
494 {
495 	int i;
496 
497 	cluster->focus = NULL;
498 	for (i = 0; i < cluster->nchains; ++i) {
499 		if (cluster->array[i]) {
500 			hammer2_chain_refactor(&cluster->array[i]);
501 			if (cluster->focus == NULL)
502 				cluster->focus = cluster->array[i];
503 		}
504 	}
505 }
506 
507 /*
508  * Resize the cluster's physical storage allocation in-place.  This may
509  * replace the cluster's chains.
510  */
511 void
512 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
513 		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
514 		       int nradix, int flags)
515 {
516 	int i;
517 
518 	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
519 	KKASSERT(cparent->nchains == cluster->nchains);
520 
521 	cluster->focus = NULL;
522 	for (i = 0; i < cluster->nchains; ++i) {
523 		if (cluster->array[i]) {
524 			KKASSERT(cparent->array[i]);
525 			hammer2_chain_resize(trans, ip,
526 					     cparent->array[i],
527 					     &cluster->array[i],
528 					     nradix, flags);
529 			if (cluster->focus == NULL)
530 				cluster->focus = cluster->array[i];
531 		}
532 	}
533 }
534 
535 /*
536  * Set an inode's cluster modified, marking the related chains RW and
537  * duplicating them if necessary.
538  *
539  * The passed-in chain is a localized copy of the chain previously acquired
540  * when the inode was locked (and possilby replaced in the mean time), and
541  * must also be updated.  In fact, we update it first and then synchronize
542  * the inode's cluster cache.
543  */
544 hammer2_inode_data_t *
545 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
546 			  hammer2_cluster_t *cluster, int flags)
547 {
548 	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
549 	hammer2_cluster_modify(trans, cluster, flags);
550 
551 	hammer2_inode_repoint(ip, NULL, cluster);
552 	if (ip->vp)
553 		vsetisdirty(ip->vp);
554 	return (&hammer2_cluster_wdata(cluster)->ipdata);
555 }
556 
557 /*
558  * Adjust the cluster's chains to allow modification.
559  */
560 void
561 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
562 		       int flags)
563 {
564 	int i;
565 
566 	cluster->focus = NULL;
567 	for (i = 0; i < cluster->nchains; ++i) {
568 		if (cluster->array[i]) {
569 			hammer2_chain_modify(trans, &cluster->array[i], flags);
570 			if (cluster->focus == NULL)
571 				cluster->focus = cluster->array[i];
572 		}
573 	}
574 }
575 
576 /*
577  * Synchronize modifications with other chains in a cluster.
578  *
579  * Nominal front-end operations only edit non-block-table data in a single
580  * chain.  This code copies such modifications to the other chains in the
581  * cluster.
582  */
583 /* hammer2_cluster_modsync() */
584 
585 void
586 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
587 {
588 	hammer2_chain_t *focus;
589 	hammer2_chain_t *scan;
590 	const hammer2_inode_data_t *ripdata;
591 	hammer2_inode_data_t *wipdata;
592 	int i;
593 
594 	focus = cluster->focus;
595 	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
596 
597 	for (i = 0; i < cluster->nchains; ++i) {
598 		scan = cluster->array[i];
599 		if (scan == NULL || scan == focus)
600 			continue;
601 		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
602 		KKASSERT(focus->bytes == scan->bytes &&
603 			 focus->bref.type == scan->bref.type);
604 		switch(focus->bref.type) {
605 		case HAMMER2_BREF_TYPE_INODE:
606 			ripdata = &focus->data->ipdata;
607 			wipdata = &scan->data->ipdata;
608 			if ((ripdata->op_flags &
609 			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
610 				bcopy(ripdata, wipdata,
611 				      offsetof(hammer2_inode_data_t, u));
612 				break;
613 			}
614 			/* fall through */
615 		case HAMMER2_BREF_TYPE_DATA:
616 			bcopy(focus->data, scan->data, focus->bytes);
617 			break;
618 		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
619 		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
620 		case HAMMER2_BREF_TYPE_FREEMAP:
621 		case HAMMER2_BREF_TYPE_VOLUME:
622 			panic("hammer2_cluster_modsync: illegal node type");
623 			/* NOT REACHED */
624 			break;
625 		default:
626 			panic("hammer2_cluster_modsync: unknown node type");
627 			break;
628 		}
629 	}
630 }
631 
632 /*
633  * Lookup initialization/completion API
634  */
635 hammer2_cluster_t *
636 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
637 {
638 	hammer2_cluster_t *cluster;
639 	int i;
640 
641 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
642 	cluster->pmp = cparent->pmp;			/* can be NULL */
643 	/* cluster->focus = NULL; already null */
644 
645 	for (i = 0; i < cparent->nchains; ++i) {
646 		cluster->array[i] = cparent->array[i];
647 		if (cluster->focus == NULL)
648 			cluster->focus = cluster->array[i];
649 	}
650 	cluster->nchains = cparent->nchains;
651 
652 	/*
653 	 * Independently lock (this will also give cluster 1 ref)
654 	 */
655 	if (flags & HAMMER2_LOOKUP_SHARED) {
656 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
657 					      HAMMER2_RESOLVE_SHARED);
658 	} else {
659 		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
660 	}
661 	return (cluster);
662 }
663 
664 void
665 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
666 {
667 	if (cparent)
668 		hammer2_cluster_unlock(cparent);
669 }
670 
671 /*
672  * Locate first match or overlap under parent, return a new cluster
673  */
674 hammer2_cluster_t *
675 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
676 		     hammer2_key_t key_beg, hammer2_key_t key_end,
677 		     int flags, int *ddflagp)
678 {
679 	hammer2_pfsmount_t *pmp;
680 	hammer2_cluster_t *cluster;
681 	hammer2_chain_t *chain;
682 	hammer2_key_t key_accum;
683 	hammer2_key_t key_next;
684 	hammer2_key_t bref_key;
685 	int bref_keybits;
686 	int null_count;
687 	int ddflag;
688 	int i;
689 	uint8_t bref_type;
690 	u_int bytes;
691 
692 	pmp = cparent->pmp;				/* can be NULL */
693 	key_accum = *key_nextp;
694 	null_count = 0;
695 	bref_type = 0;
696 	bref_key = 0;
697 	bref_keybits = 0;
698 	bytes = 0;
699 
700 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
701 	cluster->pmp = pmp;				/* can be NULL */
702 	cluster->refs = 1;
703 	/* cluster->focus = NULL; already null */
704 	cparent->focus = NULL;
705 	*ddflagp = 0;
706 
707 	for (i = 0; i < cparent->nchains; ++i) {
708 		key_next = *key_nextp;
709 		if (cparent->array[i] == NULL) {
710 			++null_count;
711 			continue;
712 		}
713 		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
714 					     key_beg, key_end,
715 					     &cparent->cache_index[i],
716 					     flags, &ddflag);
717 		if (cparent->focus == NULL)
718 			cparent->focus = cparent->array[i];
719 		cluster->array[i] = chain;
720 		if (chain == NULL) {
721 			++null_count;
722 		} else {
723 			if (cluster->focus == NULL) {
724 				bref_type = chain->bref.type;
725 				bref_key = chain->bref.key;
726 				bref_keybits = chain->bref.keybits;
727 				bytes = chain->bytes;
728 				*ddflagp = ddflag;
729 				cluster->focus = chain;
730 			}
731 			KKASSERT(bref_type == chain->bref.type);
732 			KKASSERT(bref_key == chain->bref.key);
733 			KKASSERT(bref_keybits == chain->bref.keybits);
734 			KKASSERT(bytes == chain->bytes);
735 			KKASSERT(*ddflagp == ddflag);
736 		}
737 		if (key_accum > key_next)
738 			key_accum = key_next;
739 	}
740 	*key_nextp = key_accum;
741 	cluster->nchains = i;
742 
743 	if (null_count == i) {
744 		hammer2_cluster_drop(cluster);
745 		cluster = NULL;
746 	}
747 
748 	return (cluster);
749 }
750 
751 /*
752  * Locate next match or overlap under parent, replace cluster
753  */
754 hammer2_cluster_t *
755 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
756 		     hammer2_key_t *key_nextp,
757 		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
758 {
759 	hammer2_chain_t *chain;
760 	hammer2_key_t key_accum;
761 	hammer2_key_t key_next;
762 	int null_count;
763 	int i;
764 
765 	key_accum = *key_nextp;
766 	null_count = 0;
767 	cluster->focus = NULL;
768 	cparent->focus = NULL;
769 
770 	for (i = 0; i < cparent->nchains; ++i) {
771 		key_next = *key_nextp;
772 		chain = cluster->array[i];
773 		if (chain == NULL) {
774 			if (cparent->focus == NULL)
775 				cparent->focus = cparent->array[i];
776 			++null_count;
777 			continue;
778 		}
779 		if (cparent->array[i] == NULL) {
780 			if (flags & HAMMER2_LOOKUP_NOLOCK)
781 				hammer2_chain_drop(chain);
782 			else
783 				hammer2_chain_unlock(chain);
784 			++null_count;
785 			continue;
786 		}
787 		chain = hammer2_chain_next(&cparent->array[i], chain,
788 					   &key_next, key_beg, key_end,
789 					   &cparent->cache_index[i], flags);
790 		if (cparent->focus == NULL)
791 			cparent->focus = cparent->array[i];
792 		cluster->array[i] = chain;
793 		if (chain == NULL) {
794 			++null_count;
795 		} else if (cluster->focus == NULL) {
796 			cluster->focus = chain;
797 		}
798 		if (key_accum > key_next)
799 			key_accum = key_next;
800 	}
801 
802 	if (null_count == i) {
803 		hammer2_cluster_drop(cluster);
804 		cluster = NULL;
805 	}
806 	return(cluster);
807 }
808 
809 #if 0
810 /*
811  * XXX initial NULL cluster needs reworking (pass **clusterp ?)
812  *
813  * The raw scan function is similar to lookup/next but does not seek to a key.
814  * Blockrefs are iterated via first_chain = (parent, NULL) and
815  * next_chain = (parent, chain).
816  *
817  * The passed-in parent must be locked and its data resolved.  The returned
818  * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
819  * under parent and then iterate with the passed-in chain (which this
820  * function will unlock).
821  */
822 hammer2_cluster_t *
823 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
824 		     int flags)
825 {
826 	hammer2_chain_t *chain;
827 	int null_count;
828 	int i;
829 
830 	null_count = 0;
831 
832 	for (i = 0; i < cparent->nchains; ++i) {
833 		chain = cluster->array[i];
834 		if (chain == NULL) {
835 			++null_count;
836 			continue;
837 		}
838 		if (cparent->array[i] == NULL) {
839 			if (flags & HAMMER2_LOOKUP_NOLOCK)
840 				hammer2_chain_drop(chain);
841 			else
842 				hammer2_chain_unlock(chain);
843 			++null_count;
844 			continue;
845 		}
846 
847 		chain = hammer2_chain_scan(cparent->array[i], chain,
848 					   &cparent->cache_index[i], flags);
849 		cluster->array[i] = chain;
850 		if (chain == NULL)
851 			++null_count;
852 	}
853 
854 	if (null_count == i) {
855 		hammer2_cluster_drop(cluster);
856 		cluster = NULL;
857 	}
858 	return(cluster);
859 }
860 
861 #endif
862 
863 /*
864  * Create a new cluster using the specified key
865  */
866 int
867 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
868 		     hammer2_cluster_t **clusterp,
869 		     hammer2_key_t key, int keybits, int type, size_t bytes)
870 {
871 	hammer2_cluster_t *cluster;
872 	hammer2_pfsmount_t *pmp;
873 	int error;
874 	int i;
875 
876 	pmp = trans->pmp;				/* can be NULL */
877 
878 	if ((cluster = *clusterp) == NULL) {
879 		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
880 				  M_WAITOK | M_ZERO);
881 		cluster->pmp = pmp;			/* can be NULL */
882 		cluster->refs = 1;
883 	}
884 	cluster->focus = NULL;
885 	cparent->focus = NULL;
886 
887 	/*
888 	 * NOTE: cluster->array[] entries can initially be NULL.  If
889 	 *	 *clusterp is supplied, skip NULL entries, otherwise
890 	 *	 create new chains.
891 	 */
892 	for (i = 0; i < cparent->nchains; ++i) {
893 		if (*clusterp && cluster->array[i] == NULL) {
894 			if (cparent->focus == NULL)
895 				cparent->focus = cparent->array[i];
896 			continue;
897 		}
898 		error = hammer2_chain_create(trans, &cparent->array[i],
899 					     &cluster->array[i], pmp,
900 					     key, keybits, type, bytes);
901 		KKASSERT(error == 0);
902 		if (cparent->focus == NULL)
903 			cparent->focus = cparent->array[i];
904 		if (cluster->focus == NULL)
905 			cluster->focus = cluster->array[i];
906 	}
907 	cluster->nchains = i;
908 	*clusterp = cluster;
909 
910 	return error;
911 }
912 
913 /*
914  * Duplicate a cluster under a new parent.
915  *
916  * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
917  *	    are used from a passed-in non-NULL bref pointer.  All other fields
918  *	    are extracted from the original chain for each chain in the
919  *	    iteration.
920  */
921 void
922 hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
923 			  hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
924 			  int snapshot, int duplicate_reason)
925 {
926 	hammer2_chain_t *chain;
927 	hammer2_blockref_t xbref;
928 	int i;
929 
930 	cluster->focus = NULL;
931 	cparent->focus = NULL;
932 
933 	for (i = 0; i < cluster->nchains; ++i) {
934 		chain = cluster->array[i];
935 		if (chain) {
936 			if (bref) {
937 				xbref = chain->bref;
938 				xbref.key = bref->key;
939 				xbref.keybits = bref->keybits;
940 				hammer2_chain_duplicate(trans,
941 							&cparent->array[i],
942 							&chain, &xbref,
943 							snapshot,
944 							duplicate_reason);
945 			} else {
946 				hammer2_chain_duplicate(trans,
947 							&cparent->array[i],
948 							&chain, NULL,
949 							snapshot,
950 							duplicate_reason);
951 			}
952 			cluster->array[i] = chain;
953 			if (cluster->focus == NULL)
954 				cluster->focus = chain;
955 			if (cparent->focus == NULL)
956 				cparent->focus = cparent->array[i];
957 		} else {
958 			if (cparent->focus == NULL)
959 				cparent->focus = cparent->array[i];
960 		}
961 	}
962 }
963 
964 /*
965  * Delete-duplicate a cluster in-place.
966  */
967 void
968 hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
969 				 hammer2_cluster_t *cluster, int flags)
970 {
971 	hammer2_chain_t *chain;
972 	int i;
973 
974 	cluster->focus = NULL;
975 	for (i = 0; i < cluster->nchains; ++i) {
976 		chain = cluster->array[i];
977 		if (chain) {
978 			hammer2_chain_delete_duplicate(trans, &chain, flags);
979 			cluster->array[i] = chain;
980 			if (cluster->focus == NULL)
981 				cluster->focus = chain;
982 		}
983 	}
984 }
985 
986 /*
987  * Mark a cluster deleted
988  */
989 void
990 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
991 		       int flags)
992 {
993 	hammer2_chain_t *chain;
994 	int i;
995 
996 	for (i = 0; i < cluster->nchains; ++i) {
997 		chain = cluster->array[i];
998 		if (chain)
999 			hammer2_chain_delete(trans, chain, flags);
1000 	}
1001 }
1002 
1003 /*
1004  * Create a snapshot of the specified {parent, ochain} with the specified
1005  * label.  The originating hammer2_inode must be exclusively locked for
1006  * safety.
1007  *
1008  * The ioctl code has already synced the filesystem.
1009  */
1010 int
1011 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1012 		       hammer2_ioc_pfs_t *pfs)
1013 {
1014 	hammer2_mount_t *hmp;
1015 	hammer2_cluster_t *ncluster;
1016 	const hammer2_inode_data_t *ipdata;
1017 	hammer2_inode_data_t *wipdata;
1018 	hammer2_inode_t *nip;
1019 	size_t name_len;
1020 	hammer2_key_t lhc;
1021 	struct vattr vat;
1022 	uuid_t opfs_clid;
1023 	int error;
1024 
1025 	kprintf("snapshot %s\n", pfs->name);
1026 
1027 	name_len = strlen(pfs->name);
1028 	lhc = hammer2_dirhash(pfs->name, name_len);
1029 
1030 	ipdata = &hammer2_cluster_data(ocluster)->ipdata;
1031 	opfs_clid = ipdata->pfs_clid;
1032 	hmp = ocluster->focus->hmp;
1033 
1034 	/*
1035 	 * Create the snapshot directory under the super-root
1036 	 *
1037 	 * Set PFS type, generate a unique filesystem id, and generate
1038 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1039 	 * which theoretically allows the snapshot to be used as part of
1040 	 * the same cluster (perhaps as a cache).
1041 	 *
1042 	 * Copy the (flushed) blockref array.  Theoretically we could use
1043 	 * chain_duplicate() but it becomes difficult to disentangle
1044 	 * the shared core so for now just brute-force it.
1045 	 */
1046 	VATTR_NULL(&vat);
1047 	vat.va_type = VDIR;
1048 	vat.va_mode = 0755;
1049 	ncluster = NULL;
1050 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1051 				   proc0.p_ucred, pfs->name, name_len,
1052 				   &ncluster, &error);
1053 
1054 	if (nip) {
1055 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1056 		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1057 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1058 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSROOT)
1059 			wipdata->pfs_clid = opfs_clid;
1060 		else
1061 			kern_uuidgen(&wipdata->pfs_clid, 1);
1062 		hammer2_cluster_set_chainflags(ncluster, HAMMER2_CHAIN_PFSROOT);
1063 
1064 		/* XXX hack blockset copy */
1065 		/* XXX doesn't work with real cluster */
1066 		KKASSERT(ocluster->nchains == 1);
1067 		wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
1068 		hammer2_cluster_modsync(ncluster);
1069 		hammer2_inode_unlock_ex(nip, ncluster);
1070 	}
1071 	return (error);
1072 }
1073 
1074 /************************************************************************
1075  *			    NODE FAILURES 				*
1076  ************************************************************************
1077  *
1078  * A node failure can occur for numerous reasons.
1079  *
1080  *	- A read I/O may fail
1081  *	- A write I/O may fail
1082  *	- An unexpected chain might be found (or be missing)
1083  *	- A node might disconnect temporarily and reconnect later
1084  *	  (for example, a USB stick could get pulled, or a node might
1085  *	  be programmatically disconnected).
1086  *	- A node might run out of space during a modifying operation.
1087  *
1088  * When a read failure or an unexpected chain state is found, the chain and
1089  * parent chain at the failure point for the nodes involved (the nodes
1090  * which we determine to be in error) are flagged as failed and removed
1091  * from the cluster.  The node itself is allowed to remain active.  The
1092  * highest common point (usually a parent chain) is queued to the
1093  * resynchronization thread for action.
1094  *
1095  * When a write I/O fails or a node runs out of space, we first adjust
1096  * as if a read failure occurs but we further disable flushes on the
1097  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1098  * but any new modifying transactions will automatically remove the node
1099  * from consideration in all related cluster structures and not generate
1100  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1101  * to the resynchronization thread for action.
1102  *
1103  * A temporary disconnect is handled as if a write failure occurred.
1104  *
1105  * Any of these failures might or might not stall related high level VNOPS,
1106  * depending on what has failed, what nodes remain, the type of cluster,
1107  * and the operating state of the cluster.
1108  *
1109  *			    FLUSH ON WRITE-DISABLED NODES
1110  *
1111  * A flush on a write-disabled node is not allowed to write anything because
1112  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1113  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1114  * Dirty meta-data related to the failed node is thrown away.
1115  *
1116  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1117  * retired... that is, if the filesystem still has enough nodes to complete
1118  * the operation.
1119  */
1120 
1121 /************************************************************************
1122  *			SYNCHRONIZATION THREAD				*
1123  ************************************************************************
1124  *
1125  * This thread is responsible for [re]synchronizing the cluster representing
1126  * a PFS.  Any out-of-sync or failed node starts this thread on a
1127  * node-by-node basis when the failure is detected.
1128  *
1129  * Clusters needing resynchronization are queued at the highest point
1130  * where the parent on the failed node is still valid, or a special
1131  * incremental scan from the ROOT is queued if no parent exists.  This
1132  * thread is also responsible for waiting for reconnections of the failed
1133  * node if the cause was due to a disconnect, and waiting for space to be
1134  * freed up if the cause was due to running out of space.
1135  *
1136  * If the cause is due to a node running out of space, this thread will also
1137  * remove older (unlocked) snapshots to make new space, recover space, and
1138  * then start resynchronization.
1139  *
1140  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1141  * and synchronizes using that snapshot against the target node.  This
1142  * ensures a consistent chain topology and also avoid interference between
1143  * the resynchronization thread and frontend operations.
1144  *
1145  * Since these are per-node threads it is possible to resynchronize several
1146  * nodes at once.
1147  */
1148