1 /*
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
5  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
6  *
7  * This code is derived from software contributed to The DragonFly Project
8  * by Matthew Dillon <dillon@dragonflybsd.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in
18  *    the documentation and/or other materials provided with the
19  *    distribution.
20  * 3. Neither the name of The DragonFly Project nor the names of its
21  *    contributors may be used to endorse or promote products derived
22  *    from this software without specific, prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 /*
38  * The cluster module collects multiple chains representing the same
39  * information from different nodes into a single entity.  It allows direct
40  * access to media data as long as it is not blockref array data (which
41  * will obviously have to be different at each node).
42  *
43  * This module also handles I/O dispatch, status rollup, and various
44  * mastership arrangements including quorum operations.  It effectively
45  * presents one topology to the vnops layer.
46  *
47  * Many of the API calls mimic chain API calls but operate on clusters
48  * instead of chains.  Please see hammer2_chain.c for more complete code
49  * documentation of the API functions.
50  *
51  * WARNING! This module is *extremely* complex.  It must issue asynchronous
52  *	    locks and I/O, do quorum and/or master-slave processing, and
53  *	    it must operate properly even if some nodes are broken (which
54  *	    can also mean indefinite locks).
55  *
56  *				CLUSTER OPERATIONS
57  *
58  * Cluster operations can be broken down into three pieces:
59  *
60  * (1) Chain locking and data retrieval.
61  *
62  *	- Most complex functions, quorum management on transaction ids.
63  *
64  *	- Locking and data accesses must be internally asynchronous.
65  *
66  *	- Validate and manage cache coherency primitives (cache state
67  *	  is stored in chain topologies but must be validated by these
68  *	  functions).
69  *
70  * (2) Lookups and Scans
71  *		hammer2_cluster_lookup()
72  *		hammer2_cluster_next()
73  *
74  *	- Depend on locking & data retrieval functions, but still complex.
75  *
76  *	- Must do quorum management on transaction ids.
77  *
78  *	- Lookup and Iteration ops Must be internally asynchronous.
79  *
80  * (3) Modifying Operations
81  *		hammer2_cluster_create()
82  *
83  *	- Can usually punt on failures, operation continues unless quorum
84  *	  is lost.  If quorum is lost, must wait for resynchronization
85  *	  (depending on the management mode).
86  *
87  *	- Must disconnect node on failures (also not flush), remount, and
88  *	  resynchronize.
89  *
90  *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
92  *	  with (the protocol is at a higher level than block-level).
93  *
94  *	- Multiple local disk nodes (i.e. block devices) are another matter.
95  *	  Chain operations have to be dispatched to per-node threads (xN)
96  *	  because we can't asynchronize potentially very complex chain
97  *	  operations in hammer2_chain.c (it would be a huge mess).
98  *
99  *	  (these threads are also used to terminate incoming kdmsg ops from
100  *	  other machines).
101  *
102  *	- Single-node filesystems do not use threads and will simply call
103  *	  hammer2_chain.c functions directly.  This short-cut is handled
104  *	  at the base of each cluster function.
105  */
106 /*
107 #include <sys/cdefs.h>
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/types.h>
111 */
112 
113 #include "hammer2.h"
114 
115 /*
 * Returns the bref type of the cluster's focus.
117  *
118  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
119  * The cluster must be locked.
120  */
121 uint8_t
122 hammer2_cluster_type(hammer2_cluster_t *cluster)
123 {
124 	if (cluster->error == 0) {
125 		KKASSERT(cluster->focus != NULL);
126 		return(cluster->focus->bref.type);
127 	}
128 	return 0;
129 }
130 
131 /*
132  * Returns the bref of the cluster's focus, sans any data-offset information
133  * (since offset information is per-node and wouldn't be useful).
134  *
135  * Callers use this function to access modify_tid, mirror_tid, type,
136  * key, and keybits.
137  *
138  * If the cluster is errored, returns an empty bref.
139  * The cluster must be locked.
140  */
141 void
142 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
143 {
144 	if (cluster->error == 0) {
145 		KKASSERT(cluster->focus != NULL);
146 		*bref = cluster->focus->bref;
147 		bref->data_off = 0;
148 	} else {
149 		bzero(bref, sizeof(*bref));
150 	}
151 }
152 
153 /*
154  * Create a degenerate cluster with one ref from a single locked chain.
155  * The returned cluster will be focused on the chain and inherit its
156  * error state.
157  *
 * The chain's lock and reference are transferred to the new cluster, so
159  * the caller should not try to unlock the chain separately.
160  *
161  * We fake the flags.
162  */
163 void
164 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
165 {
166 	hammer2_cluster_t *cluster;
167 
168 	bzero(xop, sizeof(*xop));
169 
170 	cluster = &xop->cluster;
171 	cluster->array[0].chain = chain;
172 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
173 	cluster->nchains = 1;
174 	cluster->focus = chain;
175 	cluster->focus_index = 0;
176 	cluster->pmp = chain->pmp;
177 	cluster->refs = 1;
178 	cluster->error = chain->error;
179 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
180 			 HAMMER2_CLUSTER_WRHARD |
181 			 HAMMER2_CLUSTER_RDHARD |
182 			 HAMMER2_CLUSTER_MSYNCED |
183 			 HAMMER2_CLUSTER_SSYNCED;
184 }
185 
186 /*
187  * Add a reference to a cluster and its underlying chains.
188  *
189  * We must also ref the underlying chains in order to allow ref/unlock
190  * sequences to later re-lock.
191  */
192 void
193 hammer2_cluster_ref(hammer2_cluster_t *cluster)
194 {
195 	atomic_add_int(&cluster->refs, 1);
196 }
197 
198 /*
199  * Drop the caller's reference to the cluster.  When the ref count drops to
200  * zero this function frees the cluster and drops all underlying chains.
201  *
202  * In-progress read I/Os are typically detached from the cluster once the
203  * first one returns (the remaining stay attached to the DIOs but are then
204  * ignored and drop naturally).
205  */
206 void
207 hammer2_cluster_drop(hammer2_cluster_t *cluster)
208 {
209 	hammer2_chain_t *chain;
210 	int i;
211 
212 	KKASSERT(cluster->refs > 0);
213 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
214 		cluster->focus = NULL;		/* safety XXX chg to assert */
215 		cluster->focus_index = 0;
216 
217 		for (i = 0; i < cluster->nchains; ++i) {
218 			chain = cluster->array[i].chain;
219 			if (chain) {
220 				hammer2_chain_drop(chain);
221 				cluster->array[i].chain = NULL; /* safety */
222 			}
223 		}
224 		cluster->nchains = 0;				/* safety */
225 
226 		kfree(cluster, M_HAMMER2);
227 		/* cluster is invalid */
228 	}
229 }
230 
231 /*
232  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
233  *
234  * WARNING! This function expects the caller to handle resolution of the
235  *	    cluster.  We never re-resolve the cluster in this function,
236  *	    because it might be used to temporarily unlock/relock a cparent
237  *	    in an iteration or recursrion, and the cparents elements do not
238  *	    necessarily match.
239  */
240 void
241 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
242 {
243 	hammer2_chain_t *chain;
244 	int i;
245 
246 	/* cannot be on inode-embedded cluster template, must be on copy */
247 	KKASSERT(cluster->refs > 0);
248 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
249 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
250 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
251 			cluster);
252 	}
253 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
254 
255 	/*
256 	 * Lock chains and resolve state.
257 	 */
258 	for (i = 0; i < cluster->nchains; ++i) {
259 		chain = cluster->array[i].chain;
260 		if (chain == NULL)
261 			continue;
262 		hammer2_chain_lock(chain, how);
263 	}
264 }
265 
266 void
267 hammer2_cluster_unhold(hammer2_cluster_t *cluster)
268 {
269 	hammer2_chain_t *chain;
270 	int i;
271 
272 	for (i = 0; i < cluster->nchains; ++i) {
273 		chain = cluster->array[i].chain;
274 		if (chain == NULL)
275 			continue;
276 		hammer2_chain_unhold(chain);
277 	}
278 }
279 
280 void
281 hammer2_cluster_rehold(hammer2_cluster_t *cluster)
282 {
283 	hammer2_chain_t *chain;
284 	int i;
285 
286 	for (i = 0; i < cluster->nchains; ++i) {
287 		chain = cluster->array[i].chain;
288 		if (chain == NULL)
289 			continue;
290 		hammer2_chain_rehold(chain);
291 	}
292 }
293 
294 /*
295  * This is used by the XOPS subsystem to calculate the state of
296  * the collection and tell hammer2_xop_collect() what to do with it.
297  * The collection can be in various states of desynchronization, the
298  * caller specifically wants to resolve the passed-in key.
299  *
300  * Return values (HAMMER2_ERROR_*):
301  *
302  *	0		- Quorum agreement, key is valid
303  *
304  *	ENOENT		- Quorum agreement, end of scan
305  *
306  *	ESRCH		- Quorum agreement, key is INVALID (caller should
307  *			  skip key).
308  *
309  *	EIO		- Quorum agreement but all elements had errors.
310  *
311  *	EDEADLK		- No quorum agreement possible for key, a repair
312  *			  may be needed.  Caller has to decide what to do,
313  *			  possibly iterating the key or generating an EIO.
314  *
315  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
316  *			  possible if caller waits for more responses.  Caller
317  *			  should not iterate key.
318  *
319  *	CHECK		- CRC check error
320  *
321  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
322  *
323  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
324  */
int
hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nmasters_keymatch;
	int nslaves;
	int nquorum;
	int umasters;	/* unknown masters (still in progress) */
	int smpresent;
	int error;
	int i;

	/* reset rollup state; recomputed from scratch below */
	cluster->error = 0;
	cluster->focus = NULL;

	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);

	/*
	 * Calculate quorum (simple majority of configured masters)
	 */
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;

	/*
	 * Pass 1
	 *
	 * Count total masters/slaves and mark every element invalid;
	 * later passes re-validate elements that agree with the quorum.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;

		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain && error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				/* XXX */
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			++ttlmasters;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve nmasters		- master nodes fully match
	 *
	 * Resolve umasters		- master nodes operation still
	 *				  in progress
	 *
	 * Resolve nmasters_keymatch	- master nodes match the passed-in
	 *				  key and may or may not match
	 *				  the quorum-agreed tid.
	 *
	 * The quorum-agreed TID is the highest matching TID.  The outer
	 * while loop steps last_best_quorum_tid downward each iteration,
	 * retrying with progressively older TIDs until a quorum is found
	 * or the candidate TID space is exhausted.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	umasters = 0;
	nmasters = 0;
	nmasters_keymatch = 0;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		umasters = 0;
		nmasters = 0;
		nmasters_keymatch = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* XXX SOFT smpresent handling */
			switch(cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
			case HAMMER2_PFSTYPE_SUPROOT:
				break;
			default:
				continue;
			}

			chain = cluster->array[i].chain;
			error = cluster->array[i].error;

			/*
			 * Skip elements still in progress.  umasters keeps
			 * track of masters that might still be in-progress.
			 */
			if (chain == NULL && (cluster->array[i].flags &
					      HAMMER2_CITEM_NULL) == 0) {
				++umasters;
				continue;
			}

			/*
			 * Key match?
			 */
			if (flags & HAMMER2_CHECK_NULL) {
				if (chain == NULL) {
					++nmasters;
					++nmasters_keymatch;
					if (cluster->error == 0)
						cluster->error = error;
				}
			} else if (chain &&
				   (key == (hammer2_key_t)-1 ||
				    chain->bref.key == key)) {
				++nmasters_keymatch;

				if (chain->bref.modify_tid <
				     last_best_quorum_tid &&
				    quorum_tid < chain->bref.modify_tid) {
					/*
					 * Select new TID as master if better
					 * than any found so far in this loop,
					 * as long as it does not reach the
					 * best tid found in the previous loop.
					 */
					nmasters = 0;
					quorum_tid = chain->bref.modify_tid;
				}
				if (quorum_tid == chain->bref.modify_tid) {
					/*
					 * TID matches current collection.
					 *
					 * (error handled in next pass)
					 */
					++nmasters;
					if (chain->error == 0) {
						cluster->focus = chain;
						cluster->focus_index = i;
					}
				}
			}
		}
		if (nmasters >= nquorum)
			break;
		last_best_quorum_tid = quorum_tid;
	}

	/*
	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
		nmasters, nquorum, nmasters_keymatch, umasters);
	*/

	/*
	 * Early return if we do not have enough masters.
	 */
	if (nmasters < nquorum) {
		/* agreement may still arrive if responses are pending */
		if (nmasters + umasters >= nquorum)
			return HAMMER2_ERROR_EINPROGRESS;
		if (nmasters_keymatch < nquorum)
			return HAMMER2_ERROR_ESRCH;
		return HAMMER2_ERROR_EDEADLK;
	}

	/*
	 * Validated end of scan.
	 */
	if (flags & HAMMER2_CHECK_NULL) {
		if (cluster->error == 0)
			cluster->error = HAMMER2_ERROR_ENOENT;
		return cluster->error;
	}

	/*
	 * If we have a NULL focus at this point the agreeing quorum all
	 * had chain errors.
	 */
	if (cluster->focus == NULL)
		return HAMMER2_ERROR_EIO;

	/*
	 * Pass 3
	 *
	 * We have quorum agreement, validate elements, not end of scan.
	 * Elements whose key and modify_tid match the quorum are marked
	 * valid; masters additionally become FEMOD (front-end modify)
	 * candidates.
	 */
	nslaves = 0;
	cluster->error = 0;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain == NULL ||
		    chain->bref.key != key ||
		    chain->bref.modify_tid != quorum_tid) {
			continue;
		}

		/*
		 * Quorum Match
		 *
		 * XXX for now, cumulative error.
		 */
		if (cluster->error == 0)
			cluster->error = error;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			++nslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 *
			 * XXX
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 *
	 * focus is probably not locked and it isn't safe to test its
	 * content (e.g. focus->data, focus->dio, other content).  We
	 * do not synchronize the dio to the cpu here.  In fact, in numerous
	 * situations the frontend doesn't even need to access its dio/data,
	 * so synchronizing it here would be wasteful.
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
			kprintf("cluster_check: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_check");
			/* flag issue and force resync? */
		}
	}
skip4:

	/* NOSOFT/NOHARD reflect the absence of any such node at all */
	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);

	return cluster->error;
}
721 
722 /*
723  * This is used by the sync thread to force non-NULL elements of a copy
724  * of the pmp->iroot cluster to be good which is required to prime the
725  * sync.
726  */
727 void
728 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
729 {
730 	int i;
731 
732 	for (i = 0; i < cluster->nchains; ++i) {
733 		if (cluster->array[i].chain)
734 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
735 	}
736 }
737 
738 /*
739  * Unlock a cluster.  Refcount and focus is maintained.
740  */
741 void
742 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
743 {
744 	hammer2_chain_t *chain;
745 	int i;
746 
747 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
748 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
749 			cluster);
750 	}
751 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
752 	KKASSERT(cluster->refs > 0);
753 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
754 
755 	for (i = 0; i < cluster->nchains; ++i) {
756 		chain = cluster->array[i].chain;
757 		if (chain)
758 			hammer2_chain_unlock(chain);
759 	}
760 }
761