xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision 8f2ce533)
1 /*
2  * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *	- Most complex functions, quorum management on transaction ids.
60  *
61  *	- Locking and data accesses must be internally asynchronous.
62  *
63  *	- Validate and manage cache coherency primitives (cache state
64  *	  is stored in chain topologies but must be validated by these
65  *	  functions).
66  *
67  * (2) Lookups and Scans
68  *		hammer2_cluster_lookup()
69  *		hammer2_cluster_next()
70  *
71  *	- Depend on locking & data retrieval functions, but still complex.
72  *
73  *	- Must do quorum management on transaction ids.
74  *
 *	- Lookup and Iteration ops must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *		hammer2_cluster_create()
79  *
80  *	- Can usually punt on failures, operation continues unless quorum
81  *	  is lost.  If quorum is lost, must wait for resynchronization
82  *	  (depending on the management mode).
83  *
84  *	- Must disconnect node on failures (also not flush), remount, and
85  *	  resynchronize.
86  *
87  *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
89  *	  with (the protocol is at a higher level than block-level).
90  *
91  *	- Multiple local disk nodes (i.e. block devices) are another matter.
92  *	  Chain operations have to be dispatched to per-node threads (xN)
93  *	  because we can't asynchronize potentially very complex chain
94  *	  operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *	  (these threads are also used to terminate incoming kdmsg ops from
97  *	  other machines).
98  *
99  *	- Single-node filesystems do not use threads and will simply call
100  *	  hammer2_chain.c functions directly.  This short-cut is handled
101  *	  at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 
108 #include "hammer2.h"
109 
110 /*
111  * Returns the bref type of the cluster's foucs.
112  *
113  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
114  * The cluster must be locked.
115  */
116 uint8_t
117 hammer2_cluster_type(hammer2_cluster_t *cluster)
118 {
119 	if (cluster->error == 0) {
120 		KKASSERT(cluster->focus != NULL);
121 		return(cluster->focus->bref.type);
122 	}
123 	return 0;
124 }
125 
126 /*
127  * Returns the bref of the cluster's focus, sans any data-offset information
128  * (since offset information is per-node and wouldn't be useful).
129  *
130  * Callers use this function to access modify_tid, mirror_tid, type,
131  * key, and keybits.
132  *
133  * If the cluster is errored, returns an empty bref.
134  * The cluster must be locked.
135  */
136 void
137 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
138 {
139 	if (cluster->error == 0) {
140 		KKASSERT(cluster->focus != NULL);
141 		*bref = cluster->focus->bref;
142 		bref->data_off = 0;
143 	} else {
144 		bzero(bref, sizeof(*bref));
145 	}
146 }
147 
148 /*
149  * Create a degenerate cluster with one ref from a single locked chain.
150  * The returned cluster will be focused on the chain and inherit its
151  * error state.
152  *
153  * The chain's lock and reference are transfered to the new cluster, so
154  * the caller should not try to unlock the chain separately.
155  *
156  * We fake the flags.
157  */
158 void
159 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
160 {
161 	hammer2_cluster_t *cluster;
162 
163 	bzero(xop, sizeof(*xop));
164 
165 	cluster = &xop->cluster;
166 	cluster->array[0].chain = chain;
167 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
168 	cluster->nchains = 1;
169 	cluster->focus = chain;
170 	cluster->focus_index = 0;
171 	cluster->pmp = chain->pmp;
172 	cluster->refs = 1;
173 	cluster->error = chain->error;
174 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
175 			 HAMMER2_CLUSTER_WRHARD |
176 			 HAMMER2_CLUSTER_RDHARD |
177 			 HAMMER2_CLUSTER_MSYNCED |
178 			 HAMMER2_CLUSTER_SSYNCED;
179 }
180 
181 /*
182  * Add a reference to a cluster and its underlying chains.
183  *
184  * We must also ref the underlying chains in order to allow ref/unlock
185  * sequences to later re-lock.
186  */
187 void
188 hammer2_cluster_ref(hammer2_cluster_t *cluster)
189 {
190 	atomic_add_int(&cluster->refs, 1);
191 }
192 
193 /*
194  * Drop the caller's reference to the cluster.  When the ref count drops to
195  * zero this function frees the cluster and drops all underlying chains.
196  *
197  * In-progress read I/Os are typically detached from the cluster once the
198  * first one returns (the remaining stay attached to the DIOs but are then
199  * ignored and drop naturally).
200  */
201 void
202 hammer2_cluster_drop(hammer2_cluster_t *cluster)
203 {
204 	hammer2_chain_t *chain;
205 	int i;
206 
207 	KKASSERT(cluster->refs > 0);
208 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
209 		cluster->focus = NULL;		/* safety XXX chg to assert */
210 		cluster->focus_index = 0;
211 
212 		for (i = 0; i < cluster->nchains; ++i) {
213 			chain = cluster->array[i].chain;
214 			if (chain) {
215 				hammer2_chain_drop(chain);
216 				cluster->array[i].chain = NULL; /* safety */
217 			}
218 		}
219 		cluster->nchains = 0;				/* safety */
220 
221 		kfree(cluster, M_HAMMER2);
222 		/* cluster is invalid */
223 	}
224 }
225 
226 /*
227  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
228  *
229  * WARNING! This function expects the caller to handle resolution of the
230  *	    cluster.  We never re-resolve the cluster in this function,
231  *	    because it might be used to temporarily unlock/relock a cparent
232  *	    in an iteration or recursrion, and the cparents elements do not
233  *	    necessarily match.
234  */
235 void
236 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
237 {
238 	hammer2_chain_t *chain;
239 	int i;
240 
241 	/* cannot be on inode-embedded cluster template, must be on copy */
242 	KKASSERT(cluster->refs > 0);
243 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
244 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
245 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
246 			cluster);
247 	}
248 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
249 
250 	/*
251 	 * Lock chains and resolve state.
252 	 */
253 	for (i = 0; i < cluster->nchains; ++i) {
254 		chain = cluster->array[i].chain;
255 		if (chain == NULL)
256 			continue;
257 		hammer2_chain_lock(chain, how);
258 	}
259 }
260 
261 void
262 hammer2_cluster_unhold(hammer2_cluster_t *cluster)
263 {
264 	hammer2_chain_t *chain;
265 	int i;
266 
267 	for (i = 0; i < cluster->nchains; ++i) {
268 		chain = cluster->array[i].chain;
269 		if (chain == NULL)
270 			continue;
271 		hammer2_chain_unhold(chain);
272 	}
273 }
274 
275 void
276 hammer2_cluster_rehold(hammer2_cluster_t *cluster)
277 {
278 	hammer2_chain_t *chain;
279 	int i;
280 
281 	for (i = 0; i < cluster->nchains; ++i) {
282 		chain = cluster->array[i].chain;
283 		if (chain == NULL)
284 			continue;
285 		hammer2_chain_rehold(chain);
286 	}
287 }
288 
289 /*
290  * This is used by the XOPS subsystem to calculate the state of
291  * the collection and tell hammer2_xop_collect() what to do with it.
292  * The collection can be in various states of desynchronization, the
293  * caller specifically wants to resolve the passed-in key.
294  *
295  * Return values (HAMMER2_ERROR_*):
296  *
297  *	0		- Quorum agreement, key is valid
298  *
299  *	ENOENT		- Quorum agreement, end of scan
300  *
301  *	ESRCH		- Quorum agreement, key is INVALID (caller should
302  *			  skip key).
303  *
304  *	EIO		- Quorum agreement but all elements had errors.
305  *
306  *	EDEADLK		- No quorum agreement possible for key, a repair
307  *			  may be needed.  Caller has to decide what to do,
308  *			  possibly iterating the key or generating an EIO.
309  *
310  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
311  *			  possible if caller waits for more responses.  Caller
312  *			  should not iterate key.
313  *
314  *	CHECK		- CRC check error
315  *
316  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
317  *
318  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
319  */
320 int
321 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
322 {
323 	hammer2_chain_t *chain;
324 	hammer2_chain_t *focus;
325 	hammer2_pfs_t *pmp;
326 	hammer2_tid_t quorum_tid;
327 	hammer2_tid_t last_best_quorum_tid;
328 	uint32_t nflags;
329 	int ttlmasters;
330 	int ttlslaves;
331 	int nmasters;
332 	int nmasters_keymatch;
333 	int nslaves;
334 	int nquorum;
335 	int umasters;	/* unknown masters (still in progress) */
336 	int smpresent;
337 	int error;
338 	int i;
339 
340 	cluster->error = 0;
341 	cluster->focus = NULL;
342 
343 	pmp = cluster->pmp;
344 	KKASSERT(pmp != NULL || cluster->nchains == 0);
345 
346 	/*
347 	 * Calculate quorum
348 	 */
349 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
350 	smpresent = 0;
351 	nflags = 0;
352 	ttlmasters = 0;
353 	ttlslaves = 0;
354 
355 	/*
356 	 * Pass 1
357 	 *
358 	 * NOTE: A NULL chain is not necessarily an error, it could be
359 	 *	 e.g. a lookup failure or the end of an iteration.
360 	 *	 Process normally.
361 	 */
362 	for (i = 0; i < cluster->nchains; ++i) {
363 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
364 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
365 
366 		chain = cluster->array[i].chain;
367 		error = cluster->array[i].error;
368 		if (chain && error) {
369 			if (cluster->focus == NULL || cluster->focus == chain) {
370 				/* error will be overridden by valid focus */
371 				/* XXX */
372 			}
373 
374 			/*
375 			 * Must count total masters and slaves whether the
376 			 * chain is errored or not.
377 			 */
378 			switch (cluster->pmp->pfs_types[i]) {
379 			case HAMMER2_PFSTYPE_SUPROOT:
380 			case HAMMER2_PFSTYPE_MASTER:
381 				++ttlmasters;
382 				break;
383 			case HAMMER2_PFSTYPE_SLAVE:
384 				++ttlslaves;
385 				break;
386 			}
387 			continue;
388 		}
389 		switch (cluster->pmp->pfs_types[i]) {
390 		case HAMMER2_PFSTYPE_MASTER:
391 			++ttlmasters;
392 			break;
393 		case HAMMER2_PFSTYPE_SLAVE:
394 			++ttlslaves;
395 			break;
396 		case HAMMER2_PFSTYPE_SOFT_MASTER:
397 			nflags |= HAMMER2_CLUSTER_WRSOFT;
398 			nflags |= HAMMER2_CLUSTER_RDSOFT;
399 			smpresent = 1;
400 			break;
401 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
402 			nflags |= HAMMER2_CLUSTER_RDSOFT;
403 			break;
404 		case HAMMER2_PFSTYPE_SUPROOT:
405 			/*
406 			 * Degenerate cluster representing the super-root
407 			 * topology on a single device.  Fake stuff so
408 			 * cluster ops work as expected.
409 			 */
410 			++ttlmasters;
411 			nflags |= HAMMER2_CLUSTER_WRHARD;
412 			nflags |= HAMMER2_CLUSTER_RDHARD;
413 			cluster->focus_index = i;
414 			cluster->focus = chain;
415 			cluster->error = error;
416 			break;
417 		default:
418 			break;
419 		}
420 	}
421 
422 	/*
423 	 * Pass 2
424 	 *
425 	 * Resolve nmasters		- master nodes fully match
426 	 *
427 	 * Resolve umasters		- master nodes operation still
428 	 *				  in progress
429 	 *
430 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
431 	 *				  key and may or may not match
432 	 *				  the quorum-agreed tid.
433 	 *
434 	 * The quorum-agreed TID is the highest matching TID.
435 	 */
436 	last_best_quorum_tid = HAMMER2_TID_MAX;
437 	umasters = 0;
438 	nmasters = 0;
439 	nmasters_keymatch = 0;
440 	quorum_tid = 0;		/* fix gcc warning */
441 
442 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
443 		umasters = 0;
444 		nmasters = 0;
445 		nmasters_keymatch = 0;
446 		quorum_tid = 0;
447 
448 		for (i = 0; i < cluster->nchains; ++i) {
449 			/* XXX SOFT smpresent handling */
450 			switch(cluster->pmp->pfs_types[i]) {
451 			case HAMMER2_PFSTYPE_MASTER:
452 			case HAMMER2_PFSTYPE_SUPROOT:
453 				break;
454 			default:
455 				continue;
456 			}
457 
458 			chain = cluster->array[i].chain;
459 			error = cluster->array[i].error;
460 
461 			/*
462 			 * Skip elements still in progress.  umasters keeps
463 			 * track of masters that might still be in-progress.
464 			 */
465 			if (chain == NULL && (cluster->array[i].flags &
466 					      HAMMER2_CITEM_NULL) == 0) {
467 				++umasters;
468 				continue;
469 			}
470 
471 			/*
472 			 * Key match?
473 			 */
474 			if (flags & HAMMER2_CHECK_NULL) {
475 				if (chain == NULL) {
476 					++nmasters;
477 					++nmasters_keymatch;
478 					if (cluster->error == 0)
479 						cluster->error = error;
480 				}
481 			} else if (chain &&
482 				   (key == (hammer2_key_t)-1 ||
483 				    chain->bref.key == key)) {
484 				++nmasters_keymatch;
485 
486 				if (chain->bref.modify_tid <
487 				     last_best_quorum_tid &&
488 				    quorum_tid < chain->bref.modify_tid) {
489 					/*
490 					 * Select new TID as master if better
491 					 * than any found so far in this loop,
492 					 * as long as it does not reach the
493 					 * best tid found in the previous loop.
494 					 */
495 					nmasters = 0;
496 					quorum_tid = chain->bref.modify_tid;
497 				}
498 				if (quorum_tid == chain->bref.modify_tid) {
499 					/*
500 					 * TID matches current collection.
501 					 *
502 					 * (error handled in next pass)
503 					 */
504 					++nmasters;
505 					if (chain->error == 0) {
506 						cluster->focus = chain;
507 						cluster->focus_index = i;
508 					}
509 				}
510 			}
511 		}
512 		if (nmasters >= nquorum)
513 			break;
514 		last_best_quorum_tid = quorum_tid;
515 	}
516 
517 	/*
518 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
519 		nmasters, nquorum, nmasters_keymatch, umasters);
520 	*/
521 
522 	/*
523 	 * Early return if we do not have enough masters.
524 	 */
525 	if (nmasters < nquorum) {
526 		if (nmasters + umasters >= nquorum)
527 			return HAMMER2_ERROR_EINPROGRESS;
528 		if (nmasters_keymatch < nquorum)
529 			return HAMMER2_ERROR_ESRCH;
530 		return HAMMER2_ERROR_EDEADLK;
531 	}
532 
533 	/*
534 	 * Validated end of scan.
535 	 */
536 	if (flags & HAMMER2_CHECK_NULL) {
537 		if (cluster->error == 0)
538 			cluster->error = HAMMER2_ERROR_ENOENT;
539 		return cluster->error;
540 	}
541 
542 	/*
543 	 * If we have a NULL focus at this point the agreeing quorum all
544 	 * had chain errors.
545 	 */
546 	if (cluster->focus == NULL)
547 		return HAMMER2_ERROR_EIO;
548 
549 	/*
550 	 * Pass 3
551 	 *
552 	 * We have quorum agreement, validate elements, not end of scan.
553 	 */
554 	nslaves = 0;
555 	cluster->error = 0;
556 
557 	for (i = 0; i < cluster->nchains; ++i) {
558 		chain = cluster->array[i].chain;
559 		error = cluster->array[i].error;
560 		if (chain == NULL ||
561 		    chain->bref.key != key ||
562 		    chain->bref.modify_tid != quorum_tid) {
563 			continue;
564 		}
565 
566 		/*
567 		 * Quorum Match
568 		 *
569 		 * XXX for now, cumulative error.
570 		 */
571 		if (cluster->error == 0)
572 			cluster->error = error;
573 
574 		switch (cluster->pmp->pfs_types[i]) {
575 		case HAMMER2_PFSTYPE_MASTER:
576 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
577 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
578 			nflags |= HAMMER2_CLUSTER_WRHARD;
579 			nflags |= HAMMER2_CLUSTER_RDHARD;
580 			break;
581 		case HAMMER2_PFSTYPE_SLAVE:
582 			/*
583 			 * We must have enough up-to-date masters to reach
584 			 * a quorum and the slave modify_tid must match the
585 			 * quorum's modify_tid.
586 			 *
587 			 * Do not select an errored slave.
588 			 */
589 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
590 			nflags |= HAMMER2_CLUSTER_RDHARD;
591 			++nslaves;
592 			break;
593 		case HAMMER2_PFSTYPE_SOFT_MASTER:
594 			/*
595 			 * Directly mounted soft master always wins.  There
596 			 * should be only one.
597 			 */
598 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
599 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
600 			break;
601 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
602 			/*
603 			 * Directly mounted soft slave always wins.  There
604 			 * should be only one.
605 			 *
606 			 * XXX
607 			 */
608 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
609 			break;
610 		case HAMMER2_PFSTYPE_SUPROOT:
611 			/*
612 			 * spmp (degenerate case)
613 			 */
614 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
615 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
616 			nflags |= HAMMER2_CLUSTER_WRHARD;
617 			nflags |= HAMMER2_CLUSTER_RDHARD;
618 			break;
619 		default:
620 			break;
621 		}
622 	}
623 
624 	/*
625 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
626 	 * is bad or if we are at the PFS root (the bref won't match at
627 	 * the PFS root, obviously).
628 	 *
629 	 * focus is probably not locked and it isn't safe to test its
630 	 * content (e.g. focus->data, focus->dio, other content).  We
631 	 * do not synchronize the dio to the cpu here.  In fact, in numerous
632 	 * situations the frontend doesn't even need to access its dio/data,
633 	 * so synchronizing it here would be wasteful.
634 	 */
635 	focus = cluster->focus;
636 	if (focus) {
637 		cluster->ddflag =
638 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
639 	} else {
640 		cluster->ddflag = 0;
641 		goto skip4;
642 	}
643 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
644 		goto skip4;
645 
646 	/*
647 	 * Pass 4
648 	 *
649 	 * Validate the elements that were not marked invalid.  They should
650 	 * match.
651 	 */
652 	for (i = 0; i < cluster->nchains; ++i) {
653 		int ddflag;
654 
655 		chain = cluster->array[i].chain;
656 
657 		if (chain == NULL)
658 			continue;
659 		if (chain == focus)
660 			continue;
661 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
662 			continue;
663 
664 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
665 		if (chain->bref.type != focus->bref.type ||
666 		    chain->bref.key != focus->bref.key ||
667 		    chain->bref.keybits != focus->bref.keybits ||
668 		    chain->bref.modify_tid != focus->bref.modify_tid ||
669 		    chain->bytes != focus->bytes ||
670 		    ddflag != cluster->ddflag) {
671 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
672 			if (hammer2_debug & 1)
673 			kprintf("cluster_check: matching modify_tid failed "
674 				"bref test: idx=%d type=%02x/%02x "
675 				"key=%016jx/%d-%016jx/%d "
676 				"mod=%016jx/%016jx bytes=%u/%u\n",
677 				i,
678 				chain->bref.type, focus->bref.type,
679 				chain->bref.key, chain->bref.keybits,
680 				focus->bref.key, focus->bref.keybits,
681 				chain->bref.modify_tid, focus->bref.modify_tid,
682 				chain->bytes, focus->bytes);
683 			if (hammer2_debug & 0x4000)
684 				panic("cluster_check");
685 			/* flag issue and force resync? */
686 		}
687 	}
688 skip4:
689 
690 	if (ttlslaves == 0)
691 		nflags |= HAMMER2_CLUSTER_NOSOFT;
692 	if (ttlmasters == 0)
693 		nflags |= HAMMER2_CLUSTER_NOHARD;
694 
695 	/*
696 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
697 	 * all available nodes (even if 0 are available) are fully
698 	 * synchronized.  This is used by the synchronization thread to
699 	 * determine if there is work it could potentially accomplish.
700 	 */
701 	if (nslaves == ttlslaves)
702 		nflags |= HAMMER2_CLUSTER_SSYNCED;
703 	if (nmasters == ttlmasters)
704 		nflags |= HAMMER2_CLUSTER_MSYNCED;
705 
706 	/*
707 	 * Determine if the cluster was successfully locked for the
708 	 * requested operation and generate an error code.  The cluster
709 	 * will not be locked (or ref'd) if an error is returned.
710 	 */
711 	atomic_set_int(&cluster->flags, nflags);
712 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
713 
714 	return cluster->error;
715 }
716 
717 /*
718  * This is used by the sync thread to force non-NULL elements of a copy
719  * of the pmp->iroot cluster to be good which is required to prime the
720  * sync.
721  */
722 void
723 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
724 {
725 	int i;
726 
727 	for (i = 0; i < cluster->nchains; ++i) {
728 		if (cluster->array[i].chain)
729 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
730 	}
731 }
732 
733 /*
734  * Unlock a cluster.  Refcount and focus is maintained.
735  */
736 void
737 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
738 {
739 	hammer2_chain_t *chain;
740 	int i;
741 
742 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
743 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
744 			cluster);
745 	}
746 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
747 	KKASSERT(cluster->refs > 0);
748 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
749 
750 	for (i = 0; i < cluster->nchains; ++i) {
751 		chain = cluster->array[i].chain;
752 		if (chain)
753 			hammer2_chain_unlock(chain);
754 	}
755 }
756