xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision 6ab64ab6)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *	- Most complex functions, quorum management on transaction ids.
60  *
61  *	- Locking and data accesses must be internally asynchronous.
62  *
63  *	- Validate and manage cache coherency primitives (cache state
64  *	  is stored in chain topologies but must be validated by these
65  *	  functions).
66  *
67  * (2) Lookups and Scans
68  *		hammer2_cluster_lookup()
69  *		hammer2_cluster_next()
70  *
71  *	- Depend on locking & data retrieval functions, but still complex.
72  *
73  *	- Must do quorum management on transaction ids.
74  *
75  *	- Lookup and iteration ops must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *		hammer2_cluster_create()
79  *
80  *	- Can usually punt on failures, operation continues unless quorum
81  *	  is lost.  If quorum is lost, must wait for resynchronization
82  *	  (depending on the management mode).
83  *
84  *	- Must disconnect a node on failure (and also avoid flushing it),
85  *	  then remount and resynchronize.
86  *
87  *	- Network links (via kdmsg) are relatively easy to issue as the
88  *	  complex underworkings of hammer2_chain.c don't have to be messed
89  *	  with (the protocol is at a higher level than block-level).
90  *
91  *	- Multiple local disk nodes (i.e. block devices) are another matter.
92  *	  Chain operations have to be dispatched to per-node threads (xN)
93  *	  because we can't asynchronize potentially very complex chain
94  *	  operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *	  (these threads are also used to terminate incoming kdmsg ops from
97  *	  other machines).
98  *
99  *	- Single-node filesystems do not use threads and will simply call
100  *	  hammer2_chain.c functions directly.  This short-cut is handled
101  *	  at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109 
110 #include "hammer2.h"
111 
112 /*
113  * Returns the bref type of the cluster's focus.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121 	if (cluster->error == 0) {
122 		KKASSERT(cluster->focus != NULL);
123 		return(cluster->focus->bref.type);
124 	}
125 	return 0;
126 }
127 
128 /*
129  * Returns non-zero if the cluster's focus is flagged as being modified.
130  *
131  * If the cluster is errored, returns 0.
132  */
133 static
134 int
135 hammer2_cluster_modified(hammer2_cluster_t *cluster)
136 {
137 	if (cluster->error == 0) {
138 		KKASSERT(cluster->focus != NULL);
139 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
140 	}
141 	return 0;
142 }
143 
144 /*
145  * Returns the bref of the cluster's focus, sans any data-offset information
146  * (since offset information is per-node and wouldn't be useful).
147  *
148  * Callers use this function to access modify_tid, mirror_tid, type,
149  * key, and keybits.
150  *
151  * If the cluster is errored, returns an empty bref.
152  * The cluster must be locked.
153  */
154 void
155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
156 {
157 	if (cluster->error == 0) {
158 		KKASSERT(cluster->focus != NULL);
159 		*bref = cluster->focus->bref;
160 		bref->data_off = 0;
161 	} else {
162 		bzero(bref, sizeof(*bref));
163 	}
164 }
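
/*
 * Usage sketch (hypothetical, not part of the original source): reading
 * the focus type and bref of a locked cluster.  The helper name below is
 * illustrative only.
 */
#if 0
static void
example_cluster_bref_usage(hammer2_cluster_t *cluster)
{
	hammer2_blockref_t bref;

	/* bref is zeroed if the cluster is errored; data_off is always 0 */
	hammer2_cluster_bref(cluster, &bref);

	if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
		/* bref.key, bref.keybits, and bref.modify_tid are usable */
	}
}
#endif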
165 
166 /*
167  * Create a degenerate cluster with one ref from a single locked chain.
168  * The returned cluster will be focused on the chain and inherit its
169  * error state.
170  *
171  * The chain's lock and reference are transferred to the new cluster, so
172  * the caller should not try to unlock the chain separately.
173  *
174  * We fake the flags.
175  */
176 hammer2_cluster_t *
177 hammer2_cluster_from_chain(hammer2_chain_t *chain)
178 {
179 	hammer2_cluster_t *cluster;
180 
181 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
182 	cluster->array[0].chain = chain;
183 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
184 	cluster->nchains = 1;
185 	cluster->focus = chain;
186 	cluster->focus_index = 0;
187 	cluster->pmp = chain->pmp;
188 	cluster->refs = 1;
189 	cluster->error = chain->error;
190 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
191 			 HAMMER2_CLUSTER_WRHARD |
192 			 HAMMER2_CLUSTER_RDHARD |
193 			 HAMMER2_CLUSTER_MSYNCED |
194 			 HAMMER2_CLUSTER_SSYNCED;
195 
196 	return cluster;
197 }
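
/*
 * Usage sketch (hypothetical, not part of the original source): wrapping a
 * single referenced and locked chain in a degenerate cluster.  Because the
 * chain's lock and ref are transferred, the caller finishes by unlocking
 * and dropping the cluster instead of the chain.
 */
#if 0
static void
example_cluster_from_chain_usage(hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	/* chain must already be referenced and locked */
	cluster = hammer2_cluster_from_chain(chain);

	/* ... operate on the single-node cluster ... */

	hammer2_cluster_unlock(cluster);	/* unlocks the chain */
	hammer2_cluster_drop(cluster);		/* drops the chain, frees cluster */
}
#endif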
198 
199 /*
200  * Add a reference to a cluster and its underlying chains.
201  *
202  * We must also ref the underlying chains in order to allow ref/unlock
203  * sequences to later re-lock.
204  */
205 void
206 hammer2_cluster_ref(hammer2_cluster_t *cluster)
207 {
208 	atomic_add_int(&cluster->refs, 1);
209 }
210 
211 /*
212  * Drop the caller's reference to the cluster.  When the ref count drops to
213  * zero this function frees the cluster and drops all underlying chains.
214  *
215  * In-progress read I/Os are typically detached from the cluster once the
216  * first one returns (the remaining stay attached to the DIOs but are then
217  * ignored and drop naturally).
218  */
219 void
220 hammer2_cluster_drop(hammer2_cluster_t *cluster)
221 {
222 	hammer2_chain_t *chain;
223 	int i;
224 
225 	KKASSERT(cluster->refs > 0);
226 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
227 		cluster->focus = NULL;		/* safety XXX chg to assert */
228 		cluster->focus_index = 0;
229 
230 		for (i = 0; i < cluster->nchains; ++i) {
231 			chain = cluster->array[i].chain;
232 			if (chain) {
233 				hammer2_chain_drop(chain);
234 				cluster->array[i].chain = NULL; /* safety */
235 			}
236 		}
237 		cluster->nchains = 0;				/* safety */
238 
239 		kfree(cluster, M_HAMMER2);
240 		/* cluster is invalid */
241 	}
242 }
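
/*
 * Usage sketch (hypothetical, not part of the original source): the
 * ref/unlock ... relock/drop sequence mentioned in the comments above.
 * The temporary ref keeps the cluster and its chains alive while they are
 * unlocked.  Assumes the caller entered with the cluster referenced and
 * locked and expects it back in that state.
 */
#if 0
static void
example_cluster_ref_cycle(hammer2_cluster_t *cluster)
{
	hammer2_cluster_ref(cluster);		/* temporary ref */
	hammer2_cluster_unlock(cluster);

	/* ... potentially blocking work while unlocked ... */

	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_drop(cluster);		/* drop the temporary ref only */
}
#endif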
243 
244 /*
245  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
246  *
247  * WARNING! This function expects the caller to handle resolution of the
248  *	    cluster.  We never re-resolve the cluster in this function,
249  *	    because it might be used to temporarily unlock/relock a cparent
250  *	    in an iteration or recursion, and the cparent's elements do not
251  *	    necessarily match.
252  */
253 void
254 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
255 {
256 	hammer2_chain_t *chain;
257 	int i;
258 
259 	/* cannot be on inode-embedded cluster template, must be on copy */
260 	KKASSERT(cluster->refs > 0);
261 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
262 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
263 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
264 			cluster);
265 	}
266 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
267 
268 	/*
269 	 * Lock chains and resolve state.
270 	 */
271 	for (i = 0; i < cluster->nchains; ++i) {
272 		chain = cluster->array[i].chain;
273 		if (chain == NULL)
274 			continue;
275 		hammer2_chain_lock(chain, how);
276 	}
277 }
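
/*
 * Usage sketch (hypothetical, not part of the original source): locking a
 * freshly instantiated (non-iteration) cluster and then resolving its focus
 * and quorum state, which per the WARNING above is the caller's job.
 * HAMMER2_RESOLVE_ALWAYS is one of the chain resolve flags accepted via the
 * 'how' argument.
 */
#if 0
static void
example_cluster_lock_resolve(hammer2_cluster_t *cluster)
{
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_resolve(cluster);

	/* cluster->focus, cluster->error and cluster->flags are now valid */

	hammer2_cluster_unlock(cluster);
}
#endif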
278 
279 /*
280  * Calculate the clustering state for the cluster and set its focus.
281  * This routine must be called with care.  For example, it should not
282  * normally be called after relocking a non-leaf cluster because parent
283  * clusters help iterations and each element might be at a slightly different
284  * indirect node (each node's topology is independently indexed).
285  *
286  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
287  * operations.  Typically this is only set on a quorum of MASTERs or
288  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
289  * is present, this bit is *not* set on a quorum of MASTERs.  The
290  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
291  * that create/modify/delete elements use it.
292  *
293  * The chains making up the cluster may be narrowed down based on quorum
294  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
295  * narrowed down to a single chain as long as the entire subtopology is known
296  * to be intact.  So, for example, we can narrow a read-only op to a single
297  * fast SLAVE but if we focus a CACHE chain we must still retain at least
298  * a SLAVE to ensure that the subtopology can be accessed.
299  *
300  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
301  * to be maintained once the topology is validated as-of the top level of
302  * the operation.
303  *
304  * If a failure occurs the operation must be aborted by higher-level code and
305  * retried. XXX
306  */
307 void
308 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
309 {
310 	hammer2_chain_t *chain;
311 	hammer2_chain_t *focus;
312 	hammer2_pfs_t *pmp;
313 	hammer2_tid_t quorum_tid;
314 	hammer2_tid_t last_best_quorum_tid;
315 	int focus_pfs_type;
316 	uint32_t nflags;
317 	int ttlmasters;
318 	int ttlslaves;
319 	int nmasters;
320 	int nslaves;
321 	int nquorum;
322 	int smpresent;
323 	int i;
324 
325 	cluster->error = 0;
326 	cluster->focus = NULL;
327 
328 	focus_pfs_type = 0;
329 	nflags = 0;
330 	ttlmasters = 0;
331 	ttlslaves = 0;
332 	nmasters = 0;
333 	nslaves = 0;
334 
335 	/*
336 	 * Calculate quorum
337 	 */
338 	pmp = cluster->pmp;
339 	KKASSERT(pmp != NULL || cluster->nchains == 0);
340 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
341 	smpresent = 0;
342 
343 	/*
344 	 * Pass 1
345 	 *
346 	 * NOTE: A NULL chain is not necessarily an error, it could be
347 	 *	 e.g. a lookup failure or the end of an iteration.
348 	 *	 Process normally.
349 	 */
350 	for (i = 0; i < cluster->nchains; ++i) {
351 		chain = cluster->array[i].chain;
352 		if (chain && chain->error) {
353 			if (cluster->focus == NULL || cluster->focus == chain) {
354 				/* error will be overridden by valid focus */
355 				cluster->error = chain->error;
356 			}
357 
358 			/*
359 			 * Must count total masters and slaves whether the
360 			 * chain is errored or not.
361 			 */
362 			switch (cluster->pmp->pfs_types[i]) {
363 			case HAMMER2_PFSTYPE_SUPROOT:
364 			case HAMMER2_PFSTYPE_MASTER:
365 				++ttlmasters;
366 				break;
367 			case HAMMER2_PFSTYPE_SLAVE:
368 				++ttlslaves;
369 				break;
370 			}
371 			continue;
372 		}
373 		switch (cluster->pmp->pfs_types[i]) {
374 		case HAMMER2_PFSTYPE_MASTER:
375 			++ttlmasters;
376 			break;
377 		case HAMMER2_PFSTYPE_SLAVE:
378 			++ttlslaves;
379 			break;
380 		case HAMMER2_PFSTYPE_SOFT_MASTER:
381 			nflags |= HAMMER2_CLUSTER_WRSOFT;
382 			nflags |= HAMMER2_CLUSTER_RDSOFT;
383 			smpresent = 1;
384 			break;
385 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
386 			nflags |= HAMMER2_CLUSTER_RDSOFT;
387 			break;
388 		case HAMMER2_PFSTYPE_SUPROOT:
389 			/*
390 			 * Degenerate cluster representing the super-root
391 			 * topology on a single device.  Fake stuff so
392 			 * cluster ops work as expected.
393 			 */
394 			nflags |= HAMMER2_CLUSTER_WRHARD;
395 			nflags |= HAMMER2_CLUSTER_RDHARD;
396 			cluster->focus_index = i;
397 			cluster->focus = chain;
398 			cluster->error = chain ? chain->error : 0;
399 			++ttlmasters;
400 			break;
401 		default:
402 			break;
403 		}
404 	}
405 
406 	/*
407 	 * Pass 2
408 	 *
409 	 * Resolve masters.  Calculate nmasters for the highest matching
410 	 * TID, if a quorum cannot be attained try the next lower matching
411 	 * TID until we exhaust TIDs.
412 	 *
413 	 * NOTE: A NULL chain is not necessarily an error, it could be
414 	 *	 e.g. a lookup failure or the end of an iteration.
415 	 *	 Process normally.
416 	 */
417 	last_best_quorum_tid = HAMMER2_TID_MAX;
418 	quorum_tid = 0;		/* fix gcc warning */
419 
420 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
421 		nmasters = 0;
422 		quorum_tid = 0;
423 
424 		for (i = 0; i < cluster->nchains; ++i) {
425 			switch (cluster->pmp->pfs_types[i]) {
426 			case HAMMER2_PFSTYPE_SUPROOT:
427 			case HAMMER2_PFSTYPE_MASTER:
428 				break;
429 			default:
430 				continue;
431 			}
432 			chain = cluster->array[i].chain;
433 
434 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
435 				/*
436 				 * Invalid as in unsynchronized, cannot be
437 				 * used to calculate the quorum.
438 				 */
439 			} else if (chain == NULL && quorum_tid == 0) {
440 				/*
441 				 * NULL chain on master matches NULL chains
442 				 * on other masters.
443 				 */
444 				++nmasters;
445 			} else if (quorum_tid < last_best_quorum_tid &&
446 				   chain != NULL &&
447 				   (quorum_tid < chain->bref.modify_tid ||
448 				    nmasters == 0)) {
449 				/*
450 				 * Better TID located, reset nmasters count.
451 				 */
452 				nmasters = 1;
453 				quorum_tid = chain->bref.modify_tid;
454 			} else if (chain &&
455 				   quorum_tid == chain->bref.modify_tid) {
456 				/*
457 				 * TID matches current collection.
458 				 */
459 				++nmasters;
460 			}
461 		}
462 		if (nmasters >= nquorum)
463 			break;
464 		last_best_quorum_tid = quorum_tid;
465 	}
466 
467 	/*
468 	 * Pass 3
469 	 *
470 	 * NOTE: A NULL chain is not necessarily an error, it could be
471 	 *	 e.g. a lookup failure or the end of an iteration.
472 	 *	 Process normally.
473 	 */
474 	for (i = 0; i < cluster->nchains; ++i) {
475 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
476 		chain = cluster->array[i].chain;
477 		if (chain && chain->error) {
478 			if (cluster->focus == NULL || cluster->focus == chain) {
479 				/* error will be overridden by valid focus */
480 				cluster->error = chain->error;
481 			}
482 			continue;
483 		}
484 
485 		switch (cluster->pmp->pfs_types[i]) {
486 		case HAMMER2_PFSTYPE_MASTER:
487 			/*
488 			 * We must have enough up-to-date masters to reach
489 			 * a quorum and the master modify_tid must match
490 			 * the quorum's modify_tid.
491 			 *
492 			 * Do not select an errored or out-of-sync master.
493 			 */
494 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
495 				nflags |= HAMMER2_CLUSTER_UNHARD;
496 			} else if (nmasters >= nquorum &&
497 				   (chain == NULL || chain->error == 0) &&
498 				   ((chain == NULL && quorum_tid == 0) ||
499 				    (chain != NULL && quorum_tid ==
500 						  chain->bref.modify_tid))) {
501 				nflags |= HAMMER2_CLUSTER_WRHARD;
502 				nflags |= HAMMER2_CLUSTER_RDHARD;
503 				if (!smpresent) {
504 					cluster->array[i].flags |=
505 							HAMMER2_CITEM_FEMOD;
506 				}
507 				if (cluster->focus == NULL ||
508 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
509 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
510 					cluster->focus_index = i;
511 					cluster->focus = chain; /* NULL ok */
512 					cluster->error = chain ? chain->error :
513 								 0;
514 				}
515 			} else if (chain == NULL || chain->error == 0) {
516 				nflags |= HAMMER2_CLUSTER_UNHARD;
517 			}
518 			break;
519 		case HAMMER2_PFSTYPE_SLAVE:
520 			/*
521 			 * We must have enough up-to-date masters to reach
522 			 * a quorum and the slave modify_tid must match the
523 			 * quorum's modify_tid.
524 			 *
525 			 * Do not select an errored slave.
526 			 */
527 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
528 				nflags |= HAMMER2_CLUSTER_UNHARD;
529 			} else if (nmasters >= nquorum &&
530 				   (chain == NULL || chain->error == 0) &&
531 				   ((chain == NULL && quorum_tid == 0) ||
532 				    (chain && quorum_tid ==
533 					      chain->bref.modify_tid))) {
534 				++nslaves;
535 				nflags |= HAMMER2_CLUSTER_RDHARD;
536 #if 0
537 				/* XXX optimize for RESOLVE_RDONLY */
538 				if (cluster->focus == NULL) {
539 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
540 					cluster->focus_index = i;
541 					cluster->focus = chain; /* NULL ok */
542 					cluster->error = chain ? chain->error :
543 								 0;
544 				}
545 #endif
546 			} else if (chain == NULL || chain->error == 0) {
547 				nflags |= HAMMER2_CLUSTER_UNSOFT;
548 			}
549 			break;
550 		case HAMMER2_PFSTYPE_SOFT_MASTER:
551 			/*
552 			 * Directly mounted soft master always wins.  There
553 			 * should be only one.
554 			 */
555 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
556 			cluster->focus_index = i;
557 			cluster->focus = chain;
558 			cluster->error = chain ? chain->error : 0;
559 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
560 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
561 			break;
562 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
563 			/*
564 			 * Directly mounted soft slave always wins.  There
565 			 * should be only one.
566 			 */
567 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
568 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
569 				cluster->focus_index = i;
570 				cluster->focus = chain;
571 				cluster->error = chain ? chain->error : 0;
572 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
573 			}
574 			break;
575 		case HAMMER2_PFSTYPE_SUPROOT:
576 			/*
577 			 * spmp (degenerate case)
578 			 */
579 			KKASSERT(i == 0);
580 			cluster->focus_index = i;
581 			cluster->focus = chain;
582 			cluster->error = chain ? chain->error : 0;
583 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
584 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
585 			break;
586 		default:
587 			break;
588 		}
589 	}
590 
591 	/*
592 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
593 	 * is bad or if we are at the PFS root (the bref won't match at
594 	 * the PFS root, obviously).
595 	 */
596 	focus = cluster->focus;
597 	if (focus) {
598 		cluster->ddflag =
599 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
600 	} else {
601 		cluster->ddflag = 0;
602 		goto skip4;
603 	}
604 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
605 		goto skip4;
606 
607 	/*
608 	 * Pass 4
609 	 *
610 	 * Validate the elements that were not marked invalid.  They should
611 	 * match.
612 	 */
613 	for (i = 0; i < cluster->nchains; ++i) {
614 		int ddflag;
615 
616 		chain = cluster->array[i].chain;
617 
618 		if (chain == NULL)
619 			continue;
620 		if (chain == focus)
621 			continue;
622 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
623 			continue;
624 
625 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
626 		if (chain->bref.type != focus->bref.type ||
627 		    chain->bref.key != focus->bref.key ||
628 		    chain->bref.keybits != focus->bref.keybits ||
629 		    chain->bref.modify_tid != focus->bref.modify_tid ||
630 		    chain->bytes != focus->bytes ||
631 		    ddflag != cluster->ddflag) {
632 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
633 			if (hammer2_debug & 1)
634 			kprintf("cluster_resolve: matching modify_tid failed "
635 				"bref test: idx=%d type=%02x/%02x "
636 				"key=%016jx/%d-%016jx/%d "
637 				"mod=%016jx/%016jx bytes=%u/%u\n",
638 				i,
639 				chain->bref.type, focus->bref.type,
640 				chain->bref.key, chain->bref.keybits,
641 				focus->bref.key, focus->bref.keybits,
642 				chain->bref.modify_tid, focus->bref.modify_tid,
643 				chain->bytes, focus->bytes);
644 			if (hammer2_debug & 0x4000)
645 				panic("cluster_resolve");
646 			/* flag issue and force resync? */
647 		}
648 	}
649 skip4:
650 
651 	if (ttlslaves == 0)
652 		nflags |= HAMMER2_CLUSTER_NOSOFT;
653 	if (ttlmasters == 0)
654 		nflags |= HAMMER2_CLUSTER_NOHARD;
655 
656 	/*
657 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
658 	 * all available nodes (even if 0 are available) are fully
659 	 * synchronized.  This is used by the synchronization thread to
660 	 * determine if there is work it could potentially accomplish.
661 	 */
662 	if (nslaves == ttlslaves)
663 		nflags |= HAMMER2_CLUSTER_SSYNCED;
664 	if (nmasters == ttlmasters)
665 		nflags |= HAMMER2_CLUSTER_MSYNCED;
666 
667 	/*
668 	 * Determine if the cluster was successfully locked for the
669 	 * requested operation and generate an error code.  The cluster
670 	 * will not be locked (or ref'd) if an error is returned.
671 	 */
672 	atomic_set_int(&cluster->flags, nflags);
673 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
674 }
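
/*
 * Simplified sketch (not part of the original source) of the intent of
 * Pass 2 above: find the highest modify_tid shared by at least nquorum
 * masters, stepping down to lower tids when a quorum cannot be formed.
 * This ignores NULL chains, errored chains, and CITEM_INVALID handling.
 */
#if 0
static hammer2_tid_t
example_quorum_scan(const hammer2_tid_t *tids, int count, int nquorum)
{
	hammer2_tid_t quorum_tid = 0;
	hammer2_tid_t last_best_quorum_tid = HAMMER2_TID_MAX;
	int nmasters = 0;
	int i;

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		nmasters = 0;
		quorum_tid = 0;

		for (i = 0; i < count; ++i) {
			if (tids[i] >= last_best_quorum_tid)
				continue;	/* already tried, or higher */
			if (tids[i] > quorum_tid) {
				/* better (higher) tid, restart the count */
				quorum_tid = tids[i];
				nmasters = 1;
			} else if (tids[i] == quorum_tid) {
				/* tid matches the current collection */
				++nmasters;
			}
		}
		if (nmasters >= nquorum)
			return quorum_tid;	/* quorum agreement */
		last_best_quorum_tid = quorum_tid;
	}
	return 0;				/* no quorum at any tid */
}
#endif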
675 
676 /*
677  * This is used by the XOPS subsystem to calculate the state of
678  * the collection and tell hammer2_xop_collect() what to do with it.
679  * The collection can be in various states of desynchronization; the
680  * caller specifically wants to resolve the passed-in key.
681  *
682  * Return values:
683  *	0		- Quorum agreement, key is valid
684  *
685  *	ENOENT		- Quorum agreement, end of scan
686  *
687  *	ESRCH		- Quorum agreement, key is INVALID (caller should
688  *			  skip key).
689  *
690  *	EIO		- Quorum agreement but all elements had errors.
691  *
692  *	EDEADLK		- No quorum agreement possible for key, a repair
693  *			  may be needed.  Caller has to decide what to do,
694  *			  possibly iterating the key or generating an EIO.
695  *
696  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
697  *			  possible if caller waits for more responses.  Caller
698  *			  should not iterate key.
699  *
700  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
701  *
702  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
703  */
704 int
705 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
706 {
707 	hammer2_chain_t *chain;
708 	hammer2_chain_t *focus;
709 	hammer2_pfs_t *pmp;
710 	hammer2_tid_t quorum_tid;
711 	hammer2_tid_t last_best_quorum_tid;
712 	uint32_t nflags;
713 	int ttlmasters;
714 	int ttlslaves;
715 	int nmasters;
716 	int nmasters_keymatch;
717 	int nslaves;
718 	int nquorum;
719 	int umasters;	/* unknown masters (still in progress) */
720 	int smpresent;
721 	int error;
722 	int i;
723 
724 	cluster->error = 0;
725 	cluster->focus = NULL;
726 
727 	pmp = cluster->pmp;
728 	KKASSERT(pmp != NULL || cluster->nchains == 0);
729 
730 	/*
731 	 * Calculate quorum
732 	 */
733 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
734 	smpresent = 0;
735 	nflags = 0;
736 	ttlmasters = 0;
737 	ttlslaves = 0;
738 	nmasters = 0;
739 	nmasters_keymatch = 0;
740 	umasters = 0;
741 	nslaves = 0;
742 
743 
744 	/*
745 	 * Pass 1
746 	 *
747 	 * NOTE: A NULL chain is not necessarily an error, it could be
748 	 *	 e.g. a lookup failure or the end of an iteration.
749 	 *	 Process normally.
750 	 */
751 	for (i = 0; i < cluster->nchains; ++i) {
752 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
753 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
754 
755 		chain = cluster->array[i].chain;
756 		error = cluster->array[i].error;
757 		if (chain && error) {
758 			if (cluster->focus == NULL || cluster->focus == chain) {
759 				/* error will be overridden by valid focus */
760 				/* XXX */
761 			}
762 
763 			/*
764 			 * Must count total masters and slaves whether the
765 			 * chain is errored or not.
766 			 */
767 			switch (cluster->pmp->pfs_types[i]) {
768 			case HAMMER2_PFSTYPE_SUPROOT:
769 			case HAMMER2_PFSTYPE_MASTER:
770 				++ttlmasters;
771 				break;
772 			case HAMMER2_PFSTYPE_SLAVE:
773 				++ttlslaves;
774 				break;
775 			}
776 			continue;
777 		}
778 		switch (cluster->pmp->pfs_types[i]) {
779 		case HAMMER2_PFSTYPE_MASTER:
780 			++ttlmasters;
781 			break;
782 		case HAMMER2_PFSTYPE_SLAVE:
783 			++ttlslaves;
784 			break;
785 		case HAMMER2_PFSTYPE_SOFT_MASTER:
786 			nflags |= HAMMER2_CLUSTER_WRSOFT;
787 			nflags |= HAMMER2_CLUSTER_RDSOFT;
788 			smpresent = 1;
789 			break;
790 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
791 			nflags |= HAMMER2_CLUSTER_RDSOFT;
792 			break;
793 		case HAMMER2_PFSTYPE_SUPROOT:
794 			/*
795 			 * Degenerate cluster representing the super-root
796 			 * topology on a single device.  Fake stuff so
797 			 * cluster ops work as expected.
798 			 */
799 			++ttlmasters;
800 			nflags |= HAMMER2_CLUSTER_WRHARD;
801 			nflags |= HAMMER2_CLUSTER_RDHARD;
802 			cluster->focus_index = i;
803 			cluster->focus = chain;
804 			cluster->error = error;
805 			break;
806 		default:
807 			break;
808 		}
809 	}
810 
811 	/*
812 	 * Pass 2
813 	 *
814 	 * Resolve nmasters		- master nodes fully match
815 	 *
816 	 * Resolve umasters		- master nodes operation still
817 	 *				  in progress
818 	 *
819 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
820 	 *				  key and may or may not match
821 	 *				  the quorum-agreed tid.
822 	 *
823 	 * The quorum-agreed TID is the highest matching TID.
824 	 */
825 	last_best_quorum_tid = HAMMER2_TID_MAX;
826 	quorum_tid = 0;		/* fix gcc warning */
827 
828 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
829 		nmasters = 0;
830 		quorum_tid = 0;
831 
832 		for (i = 0; i < cluster->nchains; ++i) {
833 			/* XXX SOFT smpresent handling */
834 			switch(cluster->pmp->pfs_types[i]) {
835 			case HAMMER2_PFSTYPE_MASTER:
836 			case HAMMER2_PFSTYPE_SUPROOT:
837 				break;
838 			default:
839 				continue;
840 			}
841 
842 			chain = cluster->array[i].chain;
843 			error = cluster->array[i].error;
844 
845 			/*
846 			 * Skip elements still in progress.  umasters keeps
847 			 * track of masters that might still be in-progress.
848 			 */
849 			if (chain == NULL && (cluster->array[i].flags &
850 					      HAMMER2_CITEM_NULL) == 0) {
851 				++umasters;
852 				continue;
853 			}
854 
855 			/*
856 			 * Key match?
857 			 */
858 			if (flags & HAMMER2_CHECK_NULL) {
859 				if (chain == NULL) {
860 					++nmasters;
861 					++nmasters_keymatch;
862 					if (cluster->error == 0)
863 						cluster->error = error;
864 				}
865 			} else if (chain &&
866 				   (key == (hammer2_key_t)-1 ||
867 				    chain->bref.key == key)) {
868 				++nmasters_keymatch;
869 				if (quorum_tid < last_best_quorum_tid &&
870 				    (quorum_tid < chain->bref.modify_tid ||
871 				     nmasters == 0)) {
872 					/*
873 					 * Better TID located, reset
874 					 * nmasters count.
875 					 */
876 					nmasters = 0;
877 					quorum_tid = chain->bref.modify_tid;
878 				}
879 				if (quorum_tid == chain->bref.modify_tid) {
880 					/*
881 					 * TID matches current collection.
882 					 *
883 					 * (error handled in next pass)
884 					 */
885 					++nmasters;
886 					if (chain->error == 0) {
887 						cluster->focus = chain;
888 						cluster->focus_index = i;
889 					}
890 				}
891 			}
892 		}
893 		if (nmasters >= nquorum)
894 			break;
895 		last_best_quorum_tid = quorum_tid;
896 	}
897 
898 	/*
899 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
900 		nmasters, nquorum, nmasters_keymatch, umasters);
901 	*/
902 
903 	/*
904 	 * Early return if we do not have enough masters.
905 	 */
906 	if (nmasters < nquorum) {
907 		if (nmasters + umasters >= nquorum)
908 			return EINPROGRESS;
909 		if (nmasters_keymatch < nquorum)
910 			return ESRCH;
911 		return EDEADLK;
912 	}
913 
914 	/*
915 	 * Validated end of scan.
916 	 */
917 	if (flags & HAMMER2_CHECK_NULL) {
918 		if (cluster->error == 0)
919 			cluster->error = ENOENT;
920 		return cluster->error;
921 	}
922 
923 	/*
924 	 * If we have a NULL focus at this point the agreeing quorum all
925 	 * had chain errors.
926 	 */
927 	if (cluster->focus == NULL)
928 		return EIO;
929 
930 	/*
931 	 * Pass 3
932 	 *
933 	 * We have quorum agreement, validate elements, not end of scan.
934 	 */
935 	cluster->error = 0;
936 	for (i = 0; i < cluster->nchains; ++i) {
937 		chain = cluster->array[i].chain;
938 		error = cluster->array[i].error;
939 		if (chain == NULL ||
940 		    chain->bref.key != key ||
941 		    chain->bref.modify_tid != quorum_tid) {
942 			continue;
943 		}
944 
945 		/*
946 		 * Quorum Match
947 		 *
948 		 * XXX for now, cumulative error.
949 		 */
950 		if (cluster->error == 0)
951 			cluster->error = error;
952 
953 		switch (cluster->pmp->pfs_types[i]) {
954 		case HAMMER2_PFSTYPE_MASTER:
955 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
956 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
957 			nflags |= HAMMER2_CLUSTER_WRHARD;
958 			nflags |= HAMMER2_CLUSTER_RDHARD;
959 			break;
960 		case HAMMER2_PFSTYPE_SLAVE:
961 			/*
962 			 * We must have enough up-to-date masters to reach
963 			 * a quorum and the slave modify_tid must match the
964 			 * quorum's modify_tid.
965 			 *
966 			 * Do not select an errored slave.
967 			 */
968 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
969 			nflags |= HAMMER2_CLUSTER_RDHARD;
970 			++nslaves;
971 			break;
972 		case HAMMER2_PFSTYPE_SOFT_MASTER:
973 			/*
974 			 * Directly mounted soft master always wins.  There
975 			 * should be only one.
976 			 */
977 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
978 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
979 			break;
980 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
981 			/*
982 			 * Directly mounted soft slave always wins.  There
983 			 * should be only one.
984 			 *
985 			 * XXX
986 			 */
987 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
988 			break;
989 		case HAMMER2_PFSTYPE_SUPROOT:
990 			/*
991 			 * spmp (degenerate case)
992 			 */
993 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
994 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
995 			nflags |= HAMMER2_CLUSTER_WRHARD;
996 			nflags |= HAMMER2_CLUSTER_RDHARD;
997 			break;
998 		default:
999 			break;
1000 		}
1001 	}
1002 
1003 	/*
1004 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
1005 	 * is bad or if we are at the PFS root (the bref won't match at
1006 	 * the PFS root, obviously).
1007 	 */
1008 	focus = cluster->focus;
1009 	if (focus) {
1010 		cluster->ddflag =
1011 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1012 	} else {
1013 		cluster->ddflag = 0;
1014 		goto skip4;
1015 	}
1016 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1017 		goto skip4;
1018 
1019 	/*
1020 	 * Pass 4
1021 	 *
1022 	 * Validate the elements that were not marked invalid.  They should
1023 	 * match.
1024 	 */
1025 	for (i = 0; i < cluster->nchains; ++i) {
1026 		int ddflag;
1027 
1028 		chain = cluster->array[i].chain;
1029 
1030 		if (chain == NULL)
1031 			continue;
1032 		if (chain == focus)
1033 			continue;
1034 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1035 			continue;
1036 
1037 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1038 		if (chain->bref.type != focus->bref.type ||
1039 		    chain->bref.key != focus->bref.key ||
1040 		    chain->bref.keybits != focus->bref.keybits ||
1041 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1042 		    chain->bytes != focus->bytes ||
1043 		    ddflag != cluster->ddflag) {
1044 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1045 			if (hammer2_debug & 1)
1046 			kprintf("cluster_resolve: matching modify_tid failed "
1047 				"bref test: idx=%d type=%02x/%02x "
1048 				"key=%016jx/%d-%016jx/%d "
1049 				"mod=%016jx/%016jx bytes=%u/%u\n",
1050 				i,
1051 				chain->bref.type, focus->bref.type,
1052 				chain->bref.key, chain->bref.keybits,
1053 				focus->bref.key, focus->bref.keybits,
1054 				chain->bref.modify_tid, focus->bref.modify_tid,
1055 				chain->bytes, focus->bytes);
1056 			if (hammer2_debug & 0x4000)
1057 				panic("cluster_resolve");
1058 			/* flag issue and force resync? */
1059 		}
1060 	}
1061 skip4:
1062 
1063 	if (ttlslaves == 0)
1064 		nflags |= HAMMER2_CLUSTER_NOSOFT;
1065 	if (ttlmasters == 0)
1066 		nflags |= HAMMER2_CLUSTER_NOHARD;
1067 
1068 	/*
1069 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1070 	 * all available nodes (even if 0 are available) are fully
1071 	 * synchronized.  This is used by the synchronization thread to
1072 	 * determine if there is work it could potentially accomplish.
1073 	 */
1074 	if (nslaves == ttlslaves)
1075 		nflags |= HAMMER2_CLUSTER_SSYNCED;
1076 	if (nmasters == ttlmasters)
1077 		nflags |= HAMMER2_CLUSTER_MSYNCED;
1078 
1079 	/*
1080 	 * Determine if the cluster was successfully locked for the
1081 	 * requested operation and generate an error code.  The cluster
1082 	 * will not be locked (or ref'd) if an error is returned.
1083 	 */
1084 	atomic_set_int(&cluster->flags, nflags);
1085 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1086 
1087 	return cluster->error;
1088 }
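
/*
 * Usage sketch (hypothetical, not part of the original source): dispatching
 * on the return values documented above, roughly the way an XOP collect
 * consumer would.  The function and the *advancep convention are
 * illustrative only.
 */
#if 0
static int
example_check_dispatch(hammer2_cluster_t *cluster, hammer2_key_t key,
		       int *advancep)
{
	int error;

	error = hammer2_cluster_check(cluster, key, 0);
	switch (error) {
	case 0:			/* quorum agreement, key is valid */
		*advancep = 1;
		break;
	case ESRCH:		/* quorum agreement, key invalid: skip it */
		*advancep = 1;
		error = 0;
		break;
	case EINPROGRESS:	/* agreement still possible, wait for more */
		*advancep = 0;	/* responses, do not iterate the key */
		break;
	case ENOENT:		/* quorum agreement, end of scan */
	case EIO:		/* agreement, but all elements had errors */
	case EDEADLK:		/* no agreement possible, may need repair */
	default:
		*advancep = 0;
		break;
	}
	return error;
}
#endif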
1089 
1090 /*
1091  * This is used by the sync thread to force non-NULL elements of a copy
1092  * of the pmp->iroot cluster to be good, which is required to prime the
1093  * sync.
1094  */
1095 void
1096 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1097 {
1098 	int i;
1099 
1100 	for (i = 0; i < cluster->nchains; ++i) {
1101 		if (cluster->array[i].chain)
1102 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1103 	}
1104 }
1105 
1106 /*
1107  * Copy a cluster, returning a ref'd cluster.  All underlying chains
1108  * are also ref'd, but not locked.  Focus state is also copied.
1109  *
1110  * Original cluster does not have to be locked but usually is.
1111  * New cluster will not be flagged as locked.
1112  *
1113  * Callers using this function to initialize a new cluster from an inode
1114  * generally lock and resolve the resulting cluster.
1115  *
1116  * Callers which use this function to save/restore a cluster structure
1117  * generally retain the focus state and do not re-resolve it.  Caller should
1118  * not try to re-resolve internal (cparent) node state during an iteration
1119  * as the individual tracking elements of cparent in an iteration may not
1120  * match even though they are correct.
1121  */
1122 hammer2_cluster_t *
1123 hammer2_cluster_copy(hammer2_cluster_t *ocluster)
1124 {
1125 	hammer2_pfs_t *pmp = ocluster->pmp;
1126 	hammer2_cluster_t *ncluster;
1127 	hammer2_chain_t *chain;
1128 	int i;
1129 
1130 	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
1131 	ncluster->pmp = pmp;
1132 	ncluster->nchains = ocluster->nchains;
1133 	ncluster->refs = 1;
1134 
1135 	for (i = 0; i < ocluster->nchains; ++i) {
1136 		chain = ocluster->array[i].chain;
1137 		ncluster->array[i].chain = chain;
1138 		ncluster->array[i].flags = ocluster->array[i].flags;
1139 		if (chain)
1140 			hammer2_chain_ref(chain);
1141 	}
1142 	ncluster->focus_index = ocluster->focus_index;
1143 	ncluster->focus = ocluster->focus;
1144 	ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED |
1145 					      HAMMER2_CLUSTER_INODE);
1146 
1147 	return (ncluster);
1148 }
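
/*
 * Usage sketch (hypothetical, not part of the original source): copying a
 * cluster and then locking and resolving the copy, as described above for
 * callers initializing a new cluster from an inode.
 */
#if 0
static void
example_cluster_copy_usage(hammer2_cluster_t *ocluster)
{
	hammer2_cluster_t *ncluster;

	ncluster = hammer2_cluster_copy(ocluster);	/* ref'd, not locked */
	hammer2_cluster_lock(ncluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_resolve(ncluster);

	/* ... use ncluster ... */

	hammer2_cluster_unlock(ncluster);
	hammer2_cluster_drop(ncluster);
}
#endif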
1149 
1150 /*
1151  * Unlock a cluster.  Refcount and focus is maintained.
1152  */
1153 void
1154 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1155 {
1156 	hammer2_chain_t *chain;
1157 	int i;
1158 
1159 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1160 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1161 			cluster);
1162 	}
1163 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1164 	KKASSERT(cluster->refs > 0);
1165 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1166 
1167 	for (i = 0; i < cluster->nchains; ++i) {
1168 		chain = cluster->array[i].chain;
1169 		if (chain)
1170 			hammer2_chain_unlock(chain);
1171 	}
1172 }
1173 
1174 /************************************************************************
1175  *			        CLUSTER I/O 				*
1176  ************************************************************************
1177  *
1178  *
1179  * WARNING! blockref[] array data is not universal.  These functions should
1180  *	    only be used to access universal data.
1181  *
1182  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1183  *	    complete if necessary.  The I/O's should have already been
1184  *	    initiated by the cluster_lock/chain_lock operation.
1185  *
1186  *	    The cluster must already be in a modified state before wdata
1187  *	    is called.  The data will already be available for this case.
1188  */
1189 const hammer2_media_data_t *
1190 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1191 {
1192 	KKASSERT(cluster->focus != NULL);
1193 	return(cluster->focus->data);
1194 }
1195 
1196 hammer2_media_data_t *
1197 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1198 {
1199 	KKASSERT(cluster->focus != NULL);
1200 	KKASSERT(hammer2_cluster_modified(cluster));
1201 	return(cluster->focus->data);
1202 }
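
/*
 * Usage sketch (hypothetical, not part of the original source): reading
 * universal media data through the focus of a locked cluster.  The ipdata
 * union member is assumed from the hammer2_media_data_t definition.
 */
#if 0
static const hammer2_inode_data_t *
example_cluster_read_ipdata(hammer2_cluster_t *cluster)
{
	const hammer2_media_data_t *data;

	if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE)
		return NULL;
	data = hammer2_cluster_rdata(cluster);	/* may wait for chain I/O */
	return &data->ipdata;
}
#endif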
1203