xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision 1aa0974c)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *		hammer2_cluster_lock()
59  *		hammer2_cluster_parent()
60  *
61  *	- Most complex functions, quorum management on transaction ids.
62  *
63  *	- Locking and data accesses must be internally asynchronous.
64  *
65  *	- Validate and manage cache coherency primitives (cache state
66  *	  is stored in chain topologies but must be validated by these
67  *	  functions).
68  *
69  * (2) Lookups and Scans
70  *		hammer2_cluster_lookup()
71  *		hammer2_cluster_next()
72  *
73  *	- Depend on locking & data retrieval functions, but still complex.
74  *
75  *	- Must do quorum management on transaction ids.
76  *
77  *	- Lookup and Iteration ops Must be internally asynchronous.
78  *
79  * (3) Modifying Operations
80  *		hammer2_cluster_create()
81  *		hammer2_cluster_rename()
82  *		hammer2_cluster_delete()
83  *		hammer2_cluster_modify()
84  *		hammer2_cluster_modsync()
85  *
86  *	- Can usually punt on failures, operation continues unless quorum
87  *	  is lost.  If quorum is lost, must wait for resynchronization
88  *	  (depending on the management mode).
89  *
90  *	- Must disconnect node on failures (also not flush), remount, and
91  *	  resynchronize.
92  *
93  *	- Network links (via kdmsg) are relatively easy to issue as the
94  *	  complex underworkings of hammer2_chain.c don't have to messed
95  *	  with (the protocol is at a higher level than block-level).
96  *
97  *	- Multiple local disk nodes (i.e. block devices) are another matter.
98  *	  Chain operations have to be dispatched to per-node threads (xN)
99  *	  because we can't asynchronize potentially very complex chain
100  *	  operations in hammer2_chain.c (it would be a huge mess).
101  *
102  *	  (these threads are also used to terminate incoming kdmsg ops from
103  *	  other machines).
104  *
105  *	- Single-node filesystems do not use threads and will simply call
106  *	  hammer2_chain.c functions directly.  This short-cut is handled
107  *	  at the base of each cluster function.
108  */
109 #include <sys/cdefs.h>
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/types.h>
113 #include <sys/lock.h>
114 #include <sys/uuid.h>
115 
116 #include "hammer2.h"
117 
118 /*
119  * Returns TRUE if any chain in the cluster needs to be resized.
120  */
121 int
122 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
123 {
124 	hammer2_chain_t *chain;
125 	int i;
126 
127 	for (i = 0; i < cluster->nchains; ++i) {
128 		chain = cluster->array[i].chain;
129 		if (chain && chain->bytes != bytes)
130 			return 1;
131 	}
132 	return 0;
133 }
134 
/*
 * Return the blockref type of the cluster, taken from its focus chain.
 * Assumes the focus is valid (cluster locked) — TODO confirm callers
 * guarantee this.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
	return(cluster->focus->bref.type);
}
140 
141 int
142 hammer2_cluster_modified(hammer2_cluster_t *cluster)
143 {
144 	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
145 }
146 
/*
 * Return a bref representative of the cluster.  Any data offset is removed
 * (since it would only be applicable to a particular chain in the cluster).
 *
 * However, the radix portion of data_off is used for many purposes and will
 * be retained.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
	/* copy the focus bref, then strip the offset, keeping the radix */
	*bref = cluster->focus->bref;
	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
}
160 
161 /*
162  * Return non-zero if the chain representing an inode has been flagged
163  * as having been unlinked.  Allows the vnode reclaim to avoid loading
164  * the inode data from disk e.g. when unmount or recycling old, clean
165  * vnodes.
166  */
167 int
168 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
169 {
170 	hammer2_chain_t *chain;
171 	int flags;
172 	int i;
173 
174 	flags = 0;
175 	for (i = 0; i < cluster->nchains; ++i) {
176 		chain = cluster->array[i].chain;
177 		if (chain)
178 			flags |= chain->flags;
179 	}
180 	return (flags & HAMMER2_CHAIN_UNLINKED);
181 }
182 
183 void
184 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
185 {
186 	hammer2_chain_t *chain;
187 	int i;
188 
189 	for (i = 0; i < cluster->nchains; ++i) {
190 		chain = cluster->array[i].chain;
191 		if (chain)
192 			atomic_set_int(&chain->flags, flags);
193 	}
194 }
195 
196 void
197 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
198 {
199 	hammer2_chain_t *chain;
200 	int i;
201 
202 	for (i = 0; i < cluster->nchains; ++i) {
203 		chain = cluster->array[i].chain;
204 		if (chain)
205 			atomic_clear_int(&chain->flags, flags);
206 	}
207 }
208 
209 void
210 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
211 {
212 	hammer2_chain_t *chain;
213 	int i;
214 
215 	for (i = 0; i < cluster->nchains; ++i) {
216 		chain = cluster->array[i].chain;
217 		if (chain)
218 			hammer2_chain_setflush(trans, chain);
219 	}
220 }
221 
/*
 * Set the check (integrity) algorithm in the bref methods field of every
 * live chain in the cluster.  Each chain must already be MODIFIED.
 */
void
hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
				hammer2_cluster_t *cluster,
				int check_algo)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
			/* clear the old check-method bits, then set new */
			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
		}
	}
}
239 
/*
 * Create a cluster with one ref from the specified chain.  The chain
 * is not further referenced.  The caller typically supplies a locked
 * chain and transfers ownership to the cluster.
 *
 * The returned cluster will be focused on the chain (strictly speaking,
 * the focus should be NULL if the chain is not locked but we do not check
 * for this condition).
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	/* single-chain cluster; caller's chain ref is absorbed */
	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->array[0].chain = chain;
	cluster->nchains = 1;
	cluster->focus = chain;
	cluster->pmp = chain->pmp;
	cluster->refs = 1;

	return cluster;
}
263 
/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref and will be focused on the first chain.
 *
 * XXX focus on first chain.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfs_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
	hammer2_chain_t *rchain;
#if 0
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.  Only leaf/node
	 * bref types are legal here; volume/freemap roots and unknown
	 * types panic (the chain = NULL assignments merely silence
	 * uninitialized-variable warnings on the panic paths).
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	/*
	 * Allocate one chain per node of the PFS root cluster, each on
	 * the corresponding node's hmp.
	 */
	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		rchain = rcluster->array[i].chain;
		chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref);
#if 0
		chain->hmp = rchain->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

		/*
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		cluster->array[i].chain = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0].chain;

	return (cluster);
}
339 
340 /*
341  * Add a reference to a cluster.
342  *
343  * We must also ref the underlying chains in order to allow ref/unlock
344  * sequences to later re-lock.
345  */
346 void
347 hammer2_cluster_ref(hammer2_cluster_t *cluster)
348 {
349 	hammer2_chain_t *chain;
350 	int i;
351 
352 	atomic_add_int(&cluster->refs, 1);
353 	for (i = 0; i < cluster->nchains; ++i) {
354 		chain = cluster->array[i].chain;
355 		if (chain)
356 			hammer2_chain_ref(chain);
357 	}
358 }
359 
/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * Last cluster ref: NULL the slot before the
			 * final decrement below frees the structure.
			 * NOTE(review): assumes no concurrent ref can
			 * appear between this test and the decrement.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;
		}
	}
	/* fetchadd returns the pre-decrement value; 1 means last ref */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;		/* safety */
		kfree(cluster, M_HAMMER2);
		/* cluster is invalid */
	}
}
389 
/*
 * Brief polling wait: sleep one tick on the cluster's focus chain.
 * Callers retry their operation after the sleep.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}
395 
396 /*
397  * Lock and ref a cluster.  This adds a ref to the cluster and its chains
398  * and then locks them.
399  *
400  * The act of locking a cluster sets its focus if not already set.
401  */
402 int
403 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
404 {
405 	hammer2_chain_t *chain;
406 	hammer2_chain_t *tmp;
407 	int i;
408 	int error;
409 
410 	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
411 		atomic_add_int(&cluster->refs, 1);
412 
413 	error = 0;
414 
415 	for (i = 0; i < cluster->nchains; ++i) {
416 		chain = cluster->array[i].chain;
417 		if (chain) {
418 			error = hammer2_chain_lock(chain, how);
419 			if (error) {
420 				while (--i >= 0) {
421 					tmp = cluster->array[i].chain;
422 					hammer2_chain_unlock(tmp);
423 				}
424 				atomic_add_int(&cluster->refs, -1);
425 				break;
426 			}
427 			if (cluster->focus == NULL)
428 				cluster->focus = chain;
429 		}
430 	}
431 	return error;
432 }
433 
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Slot-by-slot: ref the source chain, unlock any chain currently
	 * occupying the destination slot, then install the source chain.
	 * Focus is reassigned to the first installed chain.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i].chain;
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains &&
			    (tmp = dst->array[i].chain) != NULL) {
				hammer2_chain_unlock(tmp);
			}
			dst->array[i].chain = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst had more chains than src, unlock and clear the excess
	 * trailing slots.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i].chain = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
475 
/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Unlike hammer2_cluster_replace(), acquire an additional LOCK
	 * (not just a ref) on each source chain before installing it,
	 * representing the destination's locked state.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i].chain;
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains &&
			    (tmp = dst->array[i].chain) != NULL) {
				hammer2_chain_unlock(tmp);
			}
			dst->array[i].chain = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/* unlock and clear any excess trailing destination slots */
	while (i < dst->nchains) {
		chain = dst->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i].chain = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}
517 
518 /*
519  * Copy a cluster, returned a ref'd cluster.  All underlying chains
520  * are also ref'd, but not locked.  The cluster focus is not set because
521  * the cluster is not yet locked (and the originating cluster does not
522  * have to be locked either).
523  */
524 hammer2_cluster_t *
525 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
526 {
527 	hammer2_pfs_t *pmp = ocluster->pmp;
528 	hammer2_cluster_t *ncluster;
529 	hammer2_chain_t *chain;
530 	int i;
531 
532 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
533 	ncluster->pmp = pmp;
534 	ncluster->nchains = ocluster->nchains;
535 	ncluster->refs = 1;
536 
537 	for (i = 0; i < ocluster->nchains; ++i) {
538 		chain = ocluster->array[i].chain;
539 		ncluster->array[i].chain = chain;
540 		if (chain)
541 			hammer2_chain_ref(chain);
542 	}
543 	return (ncluster);
544 }
545 
/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			/* last ref: clear slot before the structure dies */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;	/* safety */
		}
	}
	/* fetchadd returns the pre-decrement value; 1 means last ref */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}
571 
/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 *
 * cparent must track cluster slot-for-slot (same nchains); each chain is
 * resized against its corresponding parent chain.  The focus is reset to
 * the first live chain.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int nradix, int flags)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
	KKASSERT(cparent->nchains == cluster->nchains);

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			/* a live chain requires a live parent slot */
			KKASSERT(cparent->array[i].chain);
			hammer2_chain_resize(trans, ip,
					     cparent->array[i].chain, chain,
					     nradix, flags);
			if (cluster->focus == NULL)
				cluster->focus = chain;
		}
	}
}
600 
/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in chain is a localized copy of the chain previously acquired
 * when the inode was locked (and possilby replaced in the mean time), and
 * must also be updated.  In fact, we update it first and then synchronize
 * the inode's cluster cache.
 *
 * Returns a pointer to the writable inode data of the cluster's focus.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	/* flag the inode dirty, then make the cluster chains writable */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/* repoint the inode's cached cluster at the (possibly new) chains */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}
622 
623 /*
624  * Adjust the cluster's chains to allow modification and adjust the
625  * focus.  Data will be accessible on return.
626  */
627 void
628 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
629 		       int flags)
630 {
631 	hammer2_chain_t *chain;
632 	int i;
633 
634 	cluster->focus = NULL;
635 	for (i = 0; i < cluster->nchains; ++i) {
636 		chain = cluster->array[i].chain;
637 		if (chain) {
638 			hammer2_chain_modify(trans, chain, flags);
639 			if (cluster->focus == NULL)
640 				cluster->focus = chain;
641 		}
642 	}
643 }
644 
/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		/* every target must be modifiable and structurally equal */
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			/*
			 * Without DIRECTDATA the inode's u-area holds a
			 * blocktable; copy only the meta-data portion
			 * before it.  With DIRECTDATA the whole inode is
			 * plain data and falls through to the full copy.
			 */
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through to full copy */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}
702 
/*
 * Lookup initialization/completion API
 *
 * Allocate a new cluster mirroring cparent's chains and independently
 * lock it.  The lock gives the new cluster its one ref; release it with
 * hammer2_cluster_lookup_done().
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;
	int i;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = cparent->pmp;			/* can be NULL */
	/* cluster->focus = NULL; already null */

	/* mirror parent's chain array; focus on first slot (may be NULL) */
	for (i = 0; i < cparent->nchains; ++i) {
		cluster->array[i].chain = cparent->array[i].chain;
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i].chain;
	}
	cluster->nchains = cparent->nchains;

	/*
	 * Independently lock (this will also give cluster 1 ref)
	 */
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
					      HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	return (cluster);
}
734 
735 void
736 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
737 {
738 	if (cparent)
739 		hammer2_cluster_unlock(cparent);
740 }
741 
/*
 * Locate first match or overlap under parent, return a new cluster
 *
 * Runs hammer2_chain_lookup() against each live parent slot, collecting
 * the results into a freshly allocated cluster.  All non-NULL results
 * must agree on bref type/key/keybits, size and ddflag (asserted).
 * *key_nextp accumulates the minimum next-key over all slots.  Returns
 * NULL (cluster dropped) when every slot came up empty.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags, int *ddflagp)
{
	hammer2_pfs_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		/* empty parent slot cannot produce a result */
		if (cparent->array[i].chain == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i].chain,
					     &key_next,
					     key_beg, key_end,
					     &cparent->array[i].cache_index,
					     flags, &ddflag);
		/* parent chain may have been replaced; refocus parent */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit establishes the expected identity;
			 * later hits must match it exactly.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* track the minimum next-key over all slots */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	/* nothing found anywhere: discard the result cluster */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}
822 
/*
 * Locate next match or overlap under parent, replace cluster
 *
 * Iterates each slot of (cluster) via hammer2_chain_next() against the
 * matching slot of (cparent), replacing the cluster's chains in place.
 * Returns NULL (cluster dropped) when the iteration is exhausted on
 * every slot.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		/* slot already exhausted on a previous pass */
		if (chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
			++null_count;
			continue;
		}
		/*
		 * Parent slot has gone away; release our hold on the
		 * current chain (NOLOCK iterations hold only a ref).
		 */
		if (cparent->array[i].chain == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		/* track the minimum next-key over all slots */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/* all slots exhausted: terminate the iteration */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}
881 
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 *
 * NOTE: Currently compiled out (#if 0) pending the API rework noted above.
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		/* parent slot gone: release our hold on the chain */
		if (cparent->array[i].chain == NULL) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i].chain, chain,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL)
			++null_count;
	}

	/* every slot exhausted: terminate the scan */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#endif
936 
/*
 * Create a new cluster using the specified key
 *
 * If *clusterp is NULL a new cluster is allocated; otherwise the caller's
 * cluster is extended in place and its pre-existing NULL slots are skipped.
 * One chain is created under each live parent slot.  Always returns 0
 * (chain creation failure asserts).
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		     hammer2_cluster_t **clusterp,
		     hammer2_key_t key, int keybits,
		     int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfs_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i].chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i].chain,
					     &cluster->array[i].chain, pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		/* refocus both clusters on their first live chains */
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i].chain;
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i].chain;
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}
988 
/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
 *	    are used from a passed-in non-NULL bref pointer.  All other fields
 *	    are extracted from the original chain for each chain in the
 *	    iteration.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			if (bref) {
				/*
				 * Per-chain bref: start from the chain's
				 * own bref and substitute only the caller's
				 * key/keybits.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_rename(trans, &xbref,
						     &cparent->array[i].chain,
						     chain, flags);
			} else {
				hammer2_chain_rename(trans, NULL,
						     &cparent->array[i].chain,
						     chain, flags);
			}
			cluster->array[i].chain = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
		} else {
			/* empty slot: still allow the parent to refocus */
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i].chain;
		}
	}
}
1035 
/*
 * Mark a cluster deleted
 *
 * Each live chain is deleted against its corresponding parent slot.  A
 * chain whose recorded parent does not match the expected parent slot is
 * skipped with a diagnostic rather than deleted.
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		/* cparent may have fewer slots than cluster */
		parent = (i < cparent->nchains) ?
			 cparent->array[i].chain : NULL;
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		if (chain->parent != parent) {
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}
1067 
1068 /*
1069  * Create a snapshot of the specified {parent, ochain} with the specified
1070  * label.  The originating hammer2_inode must be exclusively locked for
1071  * safety.
1072  *
1073  * The ioctl code has already synced the filesystem.
1074  */
1075 int
1076 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
1077 		       hammer2_ioc_pfs_t *pfs)
1078 {
1079 	hammer2_dev_t *hmp;
1080 	hammer2_cluster_t *ncluster;
1081 	const hammer2_inode_data_t *ripdata;
1082 	hammer2_inode_data_t *wipdata;
1083 	hammer2_chain_t *nchain;
1084 	hammer2_inode_t *nip;
1085 	size_t name_len;
1086 	hammer2_key_t lhc;
1087 	struct vattr vat;
1088 #if 0
1089 	uuid_t opfs_clid;
1090 #endif
1091 	int error;
1092 	int i;
1093 
1094 	kprintf("snapshot %s\n", pfs->name);
1095 
1096 	name_len = strlen(pfs->name);
1097 	lhc = hammer2_dirhash(pfs->name, name_len);
1098 
1099 	/*
1100 	 * Get the clid
1101 	 */
1102 	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
1103 #if 0
1104 	opfs_clid = ripdata->pfs_clid;
1105 #endif
1106 	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */
1107 
1108 	/*
1109 	 * Create the snapshot directory under the super-root
1110 	 *
1111 	 * Set PFS type, generate a unique filesystem id, and generate
1112 	 * a cluster id.  Use the same clid when snapshotting a PFS root,
1113 	 * which theoretically allows the snapshot to be used as part of
1114 	 * the same cluster (perhaps as a cache).
1115 	 *
1116 	 * Copy the (flushed) blockref array.  Theoretically we could use
1117 	 * chain_duplicate() but it becomes difficult to disentangle
1118 	 * the shared core so for now just brute-force it.
1119 	 */
1120 	VATTR_NULL(&vat);
1121 	vat.va_type = VDIR;
1122 	vat.va_mode = 0755;
1123 	ncluster = NULL;
1124 	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
1125 				   proc0.p_ucred, pfs->name, name_len,
1126 				   &ncluster,
1127 				   HAMMER2_INSERT_PFSROOT, &error);
1128 
1129 	if (nip) {
1130 		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
1131 		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
1132 		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
1133 		kern_uuidgen(&wipdata->pfs_fsid, 1);
1134 
1135 		/*
1136 		 * Give the snapshot its own private cluster.  As a snapshot
1137 		 * no further synchronization with the original cluster will
1138 		 * be done.
1139 		 */
1140 #if 0
1141 		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1142 			wipdata->pfs_clid = opfs_clid;
1143 		else
1144 			kern_uuidgen(&wipdata->pfs_clid, 1);
1145 #endif
1146 		kern_uuidgen(&wipdata->pfs_clid, 1);
1147 
1148 		for (i = 0; i < ncluster->nchains; ++i) {
1149 			nchain = ncluster->array[i].chain;
1150 			if (nchain)
1151 				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
1152 		}
1153 #if 0
1154 		/* XXX can't set this unless we do an explicit flush, which
1155 		   we also need a pmp assigned to do, else the flush code
1156 		   won't flush ncluster because it thinks it is crossing a
1157 		   flush boundary */
1158 		hammer2_cluster_set_chainflags(ncluster,
1159 					       HAMMER2_CHAIN_PFSBOUNDARY);
1160 #endif
1161 
1162 		/* XXX hack blockset copy */
1163 		/* XXX doesn't work with real cluster */
1164 		KKASSERT(ocluster->nchains == 1);
1165 		wipdata->u.blockset = ripdata->u.blockset;
1166 		hammer2_cluster_modsync(ncluster);
1167 		for (i = 0; i < ncluster->nchains; ++i) {
1168 			nchain = ncluster->array[i].chain;
1169 			if (nchain)
1170 				hammer2_flush(trans, nchain);
1171 		}
1172 		hammer2_inode_unlock_ex(nip, ncluster);
1173 	}
1174 	return (error);
1175 }
1176 
1177 /*
1178  * Return locked parent cluster given a locked child.  The child remains
1179  * locked on return.  The new parent's focus follows the child's focus
1180  * and the parent is always resolved.
1181  */
1182 hammer2_cluster_t *
1183 hammer2_cluster_parent(hammer2_cluster_t *cluster)
1184 {
1185 	hammer2_cluster_t *cparent;
1186 	int i;
1187 
1188 	cparent = hammer2_cluster_copy(cluster);
1189 	for (i = 0; i < cparent->nchains; ++i) {
1190 		hammer2_chain_t *chain;
1191 		hammer2_chain_t *rchain;
1192 
1193 		/*
1194 		 * Calculate parent for each element.  Old chain has an extra
1195 		 * ref for cparent but the lock remains with cluster.
1196 		 */
1197 		chain = cparent->array[i].chain;
1198 		if (chain == NULL)
1199 			continue;
1200 		while ((rchain = chain->parent) != NULL) {
1201 			hammer2_chain_ref(rchain);
1202 			hammer2_chain_unlock(chain);
1203 			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
1204 			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1205 			hammer2_chain_drop(rchain);
1206 			if (chain->parent == rchain)
1207 				break;
1208 			hammer2_chain_unlock(rchain);
1209 		}
1210 		if (cluster->focus == chain)
1211 			cparent->focus = rchain;
1212 		cparent->array[i].chain = rchain;
1213 		hammer2_chain_drop(chain);
1214 	}
1215 	return cparent;
1216 }
1217 
1218 /************************************************************************
1219  *			        CLUSTER I/O 				*
1220  ************************************************************************
1221  *
1222  *
1223  * WARNING! blockref[] array data is not universal.  These functions should
1224  *	    only be used to access universal data.
1225  *
1226  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1227  *	    complete if necessary.  The I/O's should have already been
1228  *	    initiated by the cluster_lock/chain_lock operation.
1229  *
1230  *	    The cluster must already be in a modified state before wdata
1231  *	    is called.  The data will already be available for this case.
1232  */
1233 const hammer2_media_data_t *
1234 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1235 {
1236 	return(cluster->focus->data);
1237 }
1238 
1239 hammer2_media_data_t *
1240 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1241 {
1242 	KKASSERT(hammer2_cluster_modified(cluster));
1243 	return(cluster->focus->data);
1244 }
1245 
1246 /*
1247  * Load async into independent buffer - used to load logical buffers from
1248  * underlying device data.  The callback is made for the first validated
1249  * data found, or NULL if no valid data is available.
1250  *
1251  * NOTE! The cluster structure is either unique or serialized (e.g. embedded
1252  *	 in the inode with an exclusive lock held), the chain structure may be
1253  *	 shared.
1254  */
1255 void
1256 hammer2_cluster_load_async(hammer2_cluster_t *cluster,
1257 			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
1258 {
1259 	hammer2_chain_t *chain;
1260 	hammer2_iocb_t *iocb;
1261 	hammer2_dev_t *hmp;
1262 	hammer2_blockref_t *bref;
1263 	int i;
1264 
1265 	/*
1266 	 * Try to find a chain whos data is already resolved.  If none can
1267 	 * be found, start with the first chain.
1268 	 */
1269 	chain = NULL;
1270 	for (i = 0; i < cluster->nchains; ++i) {
1271 		chain = cluster->array[i].chain;
1272 		if (chain && chain->data)
1273 			break;
1274 	}
1275 	if (i == cluster->nchains) {
1276 		chain = cluster->array[0].chain;
1277 		i = 0;
1278 	}
1279 
1280 	iocb = &cluster->iocb;
1281 	iocb->callback = callback;
1282 	iocb->dio = NULL;		/* for already-validated case */
1283 	iocb->cluster = cluster;
1284 	iocb->chain = chain;
1285 	iocb->ptr = ptr;
1286 	iocb->lbase = (off_t)i;
1287 	iocb->flags = 0;
1288 	iocb->error = 0;
1289 
1290 	/*
1291 	 * Data already validated
1292 	 */
1293 	if (chain->data) {
1294 		callback(iocb);
1295 		return;
1296 	}
1297 
1298 	/*
1299 	 * We must resolve to a device buffer, either by issuing I/O or
1300 	 * by creating a zero-fill element.  We do not mark the buffer
1301 	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
1302 	 * API must still be used to do that).
1303 	 *
1304 	 * The device buffer is variable-sized in powers of 2 down
1305 	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
1306 	 * chunk always contains buffers of the same size. (XXX)
1307 	 *
1308 	 * The minimum physical IO size may be larger than the variable
1309 	 * block size.
1310 	 */
1311 	bref = &chain->bref;
1312 	hmp = chain->hmp;
1313 
1314 #if 0
1315 	/* handled by callback? <- TODO XXX even needed for loads? */
1316 	/*
1317 	 * The getblk() optimization for a 100% overwrite can only be used
1318 	 * if the physical block size matches the request.
1319 	 */
1320 	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
1321 	    chain->bytes == hammer2_devblksize(chain->bytes)) {
1322 		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
1323 		KKASSERT(error == 0);
1324 		iocb->dio = dio;
1325 		callback(iocb);
1326 		return;
1327 	}
1328 #endif
1329 
1330 	/*
1331 	 * Otherwise issue a read
1332 	 */
1333 	hammer2_adjreadcounter(&chain->bref, chain->bytes);
1334 	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
1335 }
1336 
1337 /************************************************************************
1338  *			    NODE FAILURES 				*
1339  ************************************************************************
1340  *
1341  * A node failure can occur for numerous reasons.
1342  *
1343  *	- A read I/O may fail
1344  *	- A write I/O may fail
1345  *	- An unexpected chain might be found (or be missing)
1346  *	- A node might disconnect temporarily and reconnect later
1347  *	  (for example, a USB stick could get pulled, or a node might
1348  *	  be programmatically disconnected).
1349  *	- A node might run out of space during a modifying operation.
1350  *
1351  * When a read failure or an unexpected chain state is found, the chain and
1352  * parent chain at the failure point for the nodes involved (the nodes
1353  * which we determine to be in error) are flagged as failed and removed
1354  * from the cluster.  The node itself is allowed to remain active.  The
1355  * highest common point (usually a parent chain) is queued to the
1356  * resynchronization thread for action.
1357  *
1358  * When a write I/O fails or a node runs out of space, we first adjust
1359  * as if a read failure occurs but we further disable flushes on the
1360  * ENTIRE node.  Concurrent modifying transactions are allowed to complete
1361  * but any new modifying transactions will automatically remove the node
1362  * from consideration in all related cluster structures and not generate
1363  * any new modified chains.  The ROOT chain for the failed node(s) is queued
1364  * to the resynchronization thread for action.
1365  *
1366  * A temporary disconnect is handled as if a write failure occurred.
1367  *
1368  * Any of these failures might or might not stall related high level VNOPS,
1369  * depending on what has failed, what nodes remain, the type of cluster,
1370  * and the operating state of the cluster.
1371  *
1372  *			    FLUSH ON WRITE-DISABLED NODES
1373  *
1374  * A flush on a write-disabled node is not allowed to write anything because
1375  * we cannot safely update the mirror_tid anywhere on the failed node.  The
1376  * synchronization thread uses mirror_tid to calculate incremental resyncs.
1377  * Dirty meta-data related to the failed node is thrown away.
1378  *
1379  * Dirty buffer cache buffers and inodes are only thrown away if they can be
1380  * retired... that is, if the filesystem still has enough nodes to complete
1381  * the operation.
1382  */
1383 
1384 /************************************************************************
1385  *			SYNCHRONIZATION THREAD				*
1386  ************************************************************************
1387  *
1388  * This thread is responsible for [re]synchronizing the cluster representing
1389  * a PFS.  Any out-of-sync or failed node starts this thread on a
1390  * node-by-node basis when the failure is detected.
1391  *
1392  * Clusters needing resynchronization are queued at the highest point
1393  * where the parent on the failed node is still valid, or a special
1394  * incremental scan from the ROOT is queued if no parent exists.  This
1395  * thread is also responsible for waiting for reconnections of the failed
1396  * node if the cause was due to a disconnect, and waiting for space to be
1397  * freed up if the cause was due to running out of space.
1398  *
1399  * If the cause is due to a node running out of space, this thread will also
1400  * remove older (unlocked) snapshots to make new space, recover space, and
1401  * then start resynchronization.
1402  *
1403  * Each resynchronization pass virtually snapshots the PFS on the good nodes
1404  * and synchronizes using that snapshot against the target node.  This
1405  * ensures a consistent chain topology and also avoids interference between
1406  * the resynchronization thread and frontend operations.
1407  *
1408  * Since these are per-node threads it is possible to resynchronize several
1409  * nodes at once.
1410  */
1411