xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision 8af44722)
1 /*
2  * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *	- Most complex functions, quorum management on transaction ids.
60  *
61  *	- Locking and data accesses must be internally asynchronous.
62  *
63  *	- Validate and manage cache coherency primitives (cache state
64  *	  is stored in chain topologies but must be validated by these
65  *	  functions).
66  *
67  * (2) Lookups and Scans
68  *		hammer2_cluster_lookup()
69  *		hammer2_cluster_next()
70  *
71  *	- Depend on locking & data retrieval functions, but still complex.
72  *
73  *	- Must do quorum management on transaction ids.
74  *
75  *	- Lookup and Iteration ops Must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *		hammer2_cluster_create()
79  *
80  *	- Can usually punt on failures, operation continues unless quorum
81  *	  is lost.  If quorum is lost, must wait for resynchronization
82  *	  (depending on the management mode).
83  *
84  *	- Must disconnect node on failures (also not flush), remount, and
85  *	  resynchronize.
86  *
87  *	- Network links (via kdmsg) are relatively easy to issue as the
88  *	  complex underworkings of hammer2_chain.c don't have to messed
89  *	  with (the protocol is at a higher level than block-level).
90  *
91  *	- Multiple local disk nodes (i.e. block devices) are another matter.
92  *	  Chain operations have to be dispatched to per-node threads (xN)
93  *	  because we can't asynchronize potentially very complex chain
94  *	  operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *	  (these threads are also used to terminate incoming kdmsg ops from
97  *	  other machines).
98  *
99  *	- Single-node filesystems do not use threads and will simply call
100  *	  hammer2_chain.c functions directly.  This short-cut is handled
101  *	  at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109 
110 #include "hammer2.h"
111 
112 /*
113  * Returns the bref type of the cluster's foucs.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121 	if (cluster->error == 0) {
122 		KKASSERT(cluster->focus != NULL);
123 		return(cluster->focus->bref.type);
124 	}
125 	return 0;
126 }
127 
128 /*
129  * Returns the bref of the cluster's focus, sans any data-offset information
130  * (since offset information is per-node and wouldn't be useful).
131  *
132  * Callers use this function to access modify_tid, mirror_tid, type,
133  * key, and keybits.
134  *
135  * If the cluster is errored, returns an empty bref.
136  * The cluster must be locked.
137  */
138 void
139 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
140 {
141 	if (cluster->error == 0) {
142 		KKASSERT(cluster->focus != NULL);
143 		*bref = cluster->focus->bref;
144 		bref->data_off = 0;
145 	} else {
146 		bzero(bref, sizeof(*bref));
147 	}
148 }
149 
150 /*
151  * Create a degenerate cluster with one ref from a single locked chain.
152  * The returned cluster will be focused on the chain and inherit its
153  * error state.
154  *
155  * The chain's lock and reference are transfered to the new cluster, so
156  * the caller should not try to unlock the chain separately.
157  *
158  * We fake the flags.
159  */
160 void
161 hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
162 {
163 	hammer2_cluster_t *cluster;
164 
165 	bzero(xop, sizeof(*xop));
166 
167 	cluster = &xop->cluster;
168 	cluster->array[0].chain = chain;
169 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
170 	cluster->nchains = 1;
171 	cluster->focus = chain;
172 	cluster->focus_index = 0;
173 	cluster->pmp = chain->pmp;
174 	cluster->refs = 1;
175 	cluster->error = chain->error;
176 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
177 			 HAMMER2_CLUSTER_WRHARD |
178 			 HAMMER2_CLUSTER_RDHARD |
179 			 HAMMER2_CLUSTER_MSYNCED |
180 			 HAMMER2_CLUSTER_SSYNCED;
181 }
182 
183 /*
184  * Add a reference to a cluster and its underlying chains.
185  *
186  * We must also ref the underlying chains in order to allow ref/unlock
187  * sequences to later re-lock.
188  */
189 void
190 hammer2_cluster_ref(hammer2_cluster_t *cluster)
191 {
192 	atomic_add_int(&cluster->refs, 1);
193 }
194 
195 /*
196  * Drop the caller's reference to the cluster.  When the ref count drops to
197  * zero this function frees the cluster and drops all underlying chains.
198  *
199  * In-progress read I/Os are typically detached from the cluster once the
200  * first one returns (the remaining stay attached to the DIOs but are then
201  * ignored and drop naturally).
202  */
203 void
204 hammer2_cluster_drop(hammer2_cluster_t *cluster)
205 {
206 	hammer2_chain_t *chain;
207 	int i;
208 
209 	KKASSERT(cluster->refs > 0);
210 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
211 		cluster->focus = NULL;		/* safety XXX chg to assert */
212 		cluster->focus_index = 0;
213 
214 		for (i = 0; i < cluster->nchains; ++i) {
215 			chain = cluster->array[i].chain;
216 			if (chain) {
217 				hammer2_chain_drop(chain);
218 				cluster->array[i].chain = NULL; /* safety */
219 			}
220 		}
221 		cluster->nchains = 0;				/* safety */
222 
223 		kfree(cluster, M_HAMMER2);
224 		/* cluster is invalid */
225 	}
226 }
227 
228 /*
229  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
230  *
231  * WARNING! This function expects the caller to handle resolution of the
232  *	    cluster.  We never re-resolve the cluster in this function,
233  *	    because it might be used to temporarily unlock/relock a cparent
234  *	    in an iteration or recursrion, and the cparents elements do not
235  *	    necessarily match.
236  */
237 void
238 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
239 {
240 	hammer2_chain_t *chain;
241 	int i;
242 
243 	/* cannot be on inode-embedded cluster template, must be on copy */
244 	KKASSERT(cluster->refs > 0);
245 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
246 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
247 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
248 			cluster);
249 	}
250 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
251 
252 	/*
253 	 * Lock chains and resolve state.
254 	 */
255 	for (i = 0; i < cluster->nchains; ++i) {
256 		chain = cluster->array[i].chain;
257 		if (chain == NULL)
258 			continue;
259 		hammer2_chain_lock(chain, how);
260 	}
261 }
262 
263 void
264 hammer2_cluster_unhold(hammer2_cluster_t *cluster)
265 {
266 	hammer2_chain_t *chain;
267 	int i;
268 
269 	for (i = 0; i < cluster->nchains; ++i) {
270 		chain = cluster->array[i].chain;
271 		if (chain == NULL)
272 			continue;
273 		hammer2_chain_unhold(chain);
274 	}
275 }
276 
277 void
278 hammer2_cluster_rehold(hammer2_cluster_t *cluster)
279 {
280 	hammer2_chain_t *chain;
281 	int i;
282 
283 	for (i = 0; i < cluster->nchains; ++i) {
284 		chain = cluster->array[i].chain;
285 		if (chain == NULL)
286 			continue;
287 		hammer2_chain_rehold(chain);
288 	}
289 }
290 
291 /*
292  * Calculate the clustering state for the cluster and set its focus.
293  * This routine must be called with care.  For example, it should not
294  * normally be called after relocking a non-leaf cluster because parent
295  * clusters help iterations and each element might be at a slightly different
296  * indirect node (each node's topology is independently indexed).
297  *
298  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
299  * operations.  Typically this is only set on a quorum of MASTERs or
300  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
301  * is present, this bit is *not* set on a quorum of MASTERs.  The
302  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
303  * that create/modify/delete elements use it.
304  *
305  * The chains making up the cluster may be narrowed down based on quorum
306  * acceptability.
307  *
308  * If a failure occurs the operation must be aborted by higher-level code and
309  * retried. XXX
310  */
311 void
312 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
313 {
314 	hammer2_chain_t *chain;
315 	hammer2_chain_t *focus;
316 	hammer2_pfs_t *pmp;
317 	hammer2_tid_t quorum_tid;
318 	hammer2_tid_t last_best_quorum_tid;
319 	int focus_pfs_type;
320 	uint32_t nflags;
321 	int ttlmasters;
322 	int ttlslaves;
323 	int nmasters;
324 	int nslaves;
325 	int nquorum;
326 	int smpresent;
327 	int i;
328 
329 	cluster->error = 0;
330 	cluster->focus = NULL;
331 
332 	focus_pfs_type = 0;
333 	nflags = 0;
334 	ttlmasters = 0;
335 	ttlslaves = 0;
336 	nmasters = 0;
337 	nslaves = 0;
338 
339 	/*
340 	 * Calculate quorum
341 	 */
342 	pmp = cluster->pmp;
343 	KKASSERT(pmp != NULL || cluster->nchains == 0);
344 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
345 	smpresent = 0;
346 
347 	/*
348 	 * Pass 1
349 	 *
350 	 * NOTE: A NULL chain is not necessarily an error, it could be
351 	 *	 e.g. a lookup failure or the end of an iteration.
352 	 *	 Process normally.
353 	 */
354 	for (i = 0; i < cluster->nchains; ++i) {
355 		chain = cluster->array[i].chain;
356 		if (chain && chain->error) {
357 			if (cluster->focus == NULL || cluster->focus == chain) {
358 				/* error will be overridden by valid focus */
359 				cluster->error = chain->error;
360 			}
361 
362 			/*
363 			 * Must count total masters and slaves whether the
364 			 * chain is errored or not.
365 			 */
366 			switch (cluster->pmp->pfs_types[i]) {
367 			case HAMMER2_PFSTYPE_SUPROOT:
368 			case HAMMER2_PFSTYPE_MASTER:
369 				++ttlmasters;
370 				break;
371 			case HAMMER2_PFSTYPE_SLAVE:
372 				++ttlslaves;
373 				break;
374 			}
375 			continue;
376 		}
377 		switch (cluster->pmp->pfs_types[i]) {
378 		case HAMMER2_PFSTYPE_MASTER:
379 			++ttlmasters;
380 			break;
381 		case HAMMER2_PFSTYPE_SLAVE:
382 			++ttlslaves;
383 			break;
384 		case HAMMER2_PFSTYPE_SOFT_MASTER:
385 			nflags |= HAMMER2_CLUSTER_WRSOFT;
386 			nflags |= HAMMER2_CLUSTER_RDSOFT;
387 			smpresent = 1;
388 			break;
389 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
390 			nflags |= HAMMER2_CLUSTER_RDSOFT;
391 			break;
392 		case HAMMER2_PFSTYPE_SUPROOT:
393 			/*
394 			 * Degenerate cluster representing the super-root
395 			 * topology on a single device.  Fake stuff so
396 			 * cluster ops work as expected.
397 			 */
398 			nflags |= HAMMER2_CLUSTER_WRHARD;
399 			nflags |= HAMMER2_CLUSTER_RDHARD;
400 			cluster->focus_index = i;
401 			cluster->focus = chain;
402 			cluster->error = chain ? chain->error : 0;
403 			++ttlmasters;
404 			break;
405 		default:
406 			break;
407 		}
408 	}
409 
410 	/*
411 	 * Pass 2
412 	 *
413 	 * Resolve masters.  Calculate nmasters for the highest matching
414 	 * TID, if a quorum cannot be attained try the next lower matching
415 	 * TID until we exhaust TIDs.
416 	 *
417 	 * NOTE: A NULL chain is not necessarily an error, it could be
418 	 *	 e.g. a lookup failure or the end of an iteration.
419 	 *	 Process normally.
420 	 */
421 	last_best_quorum_tid = HAMMER2_TID_MAX;
422 	quorum_tid = 0;		/* fix gcc warning */
423 
424 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
425 		nmasters = 0;
426 		quorum_tid = 0;
427 
428 		for (i = 0; i < cluster->nchains; ++i) {
429 			switch (cluster->pmp->pfs_types[i]) {
430 			case HAMMER2_PFSTYPE_SUPROOT:
431 			case HAMMER2_PFSTYPE_MASTER:
432 				break;
433 			default:
434 				continue;
435 			}
436 			chain = cluster->array[i].chain;
437 
438 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
439 				/*
440 				 * Invalid as in unsynchronized, cannot be
441 				 * used to calculate the quorum.
442 				 */
443 			} else if (chain == NULL && quorum_tid == 0) {
444 				/*
445 				 * NULL chain on master matches NULL chains
446 				 * on other masters.
447 				 */
448 				++nmasters;
449 			} else if (quorum_tid < last_best_quorum_tid &&
450 				   chain != NULL &&
451 				   (quorum_tid < chain->bref.modify_tid ||
452 				    nmasters == 0)) {
453 				/*
454 				 * Better TID located, reset nmasters count.
455 				 */
456 				nmasters = 1;
457 				quorum_tid = chain->bref.modify_tid;
458 			} else if (chain &&
459 				   quorum_tid == chain->bref.modify_tid) {
460 				/*
461 				 * TID matches current collection.
462 				 */
463 				++nmasters;
464 			}
465 		}
466 		if (nmasters >= nquorum)
467 			break;
468 		last_best_quorum_tid = quorum_tid;
469 	}
470 
471 	/*
472 	 * Pass 3
473 	 *
474 	 * NOTE: A NULL chain is not necessarily an error, it could be
475 	 *	 e.g. a lookup failure or the end of an iteration.
476 	 *	 Process normally.
477 	 */
478 	for (i = 0; i < cluster->nchains; ++i) {
479 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
480 		chain = cluster->array[i].chain;
481 		if (chain && chain->error) {
482 			if (cluster->focus == NULL || cluster->focus == chain) {
483 				/* error will be overridden by valid focus */
484 				cluster->error = chain->error;
485 			}
486 			continue;
487 		}
488 
489 		switch (cluster->pmp->pfs_types[i]) {
490 		case HAMMER2_PFSTYPE_MASTER:
491 			/*
492 			 * We must have enough up-to-date masters to reach
493 			 * a quorum and the master modify_tid must match
494 			 * the quorum's modify_tid.
495 			 *
496 			 * Do not select an errored or out-of-sync master.
497 			 */
498 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
499 				nflags |= HAMMER2_CLUSTER_UNHARD;
500 			} else if (nmasters >= nquorum &&
501 				   (chain == NULL || chain->error == 0) &&
502 				   ((chain == NULL && quorum_tid == 0) ||
503 				    (chain != NULL && quorum_tid ==
504 						  chain->bref.modify_tid))) {
505 				nflags |= HAMMER2_CLUSTER_WRHARD;
506 				nflags |= HAMMER2_CLUSTER_RDHARD;
507 				if (!smpresent) {
508 					cluster->array[i].flags |=
509 							HAMMER2_CITEM_FEMOD;
510 				}
511 				if (cluster->focus == NULL ||
512 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
513 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
514 					cluster->focus_index = i;
515 					cluster->focus = chain; /* NULL ok */
516 					cluster->error = chain ? chain->error :
517 								 0;
518 				}
519 			} else if (chain == NULL || chain->error == 0) {
520 				nflags |= HAMMER2_CLUSTER_UNHARD;
521 			}
522 			break;
523 		case HAMMER2_PFSTYPE_SLAVE:
524 			/*
525 			 * We must have enough up-to-date masters to reach
526 			 * a quorum and the slave modify_tid must match the
527 			 * quorum's modify_tid.
528 			 *
529 			 * Do not select an errored slave.
530 			 */
531 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
532 				nflags |= HAMMER2_CLUSTER_UNHARD;
533 			} else if (nmasters >= nquorum &&
534 				   (chain == NULL || chain->error == 0) &&
535 				   ((chain == NULL && quorum_tid == 0) ||
536 				    (chain && quorum_tid ==
537 					      chain->bref.modify_tid))) {
538 				++nslaves;
539 				nflags |= HAMMER2_CLUSTER_RDHARD;
540 			} else if (chain == NULL || chain->error == 0) {
541 				nflags |= HAMMER2_CLUSTER_UNSOFT;
542 			}
543 			break;
544 		case HAMMER2_PFSTYPE_SOFT_MASTER:
545 			/*
546 			 * Directly mounted soft master always wins.  There
547 			 * should be only one.
548 			 */
549 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
550 			cluster->focus_index = i;
551 			cluster->focus = chain;
552 			cluster->error = chain ? chain->error : 0;
553 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
554 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
555 			break;
556 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
557 			/*
558 			 * Directly mounted soft slave always wins.  There
559 			 * should be only one.
560 			 */
561 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
562 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
563 				cluster->focus_index = i;
564 				cluster->focus = chain;
565 				cluster->error = chain ? chain->error : 0;
566 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
567 			}
568 			break;
569 		case HAMMER2_PFSTYPE_SUPROOT:
570 			/*
571 			 * spmp (degenerate case)
572 			 */
573 			KKASSERT(i == 0);
574 			cluster->focus_index = i;
575 			cluster->focus = chain;
576 			cluster->error = chain ? chain->error : 0;
577 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
578 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
579 			break;
580 		default:
581 			break;
582 		}
583 	}
584 
585 	/*
586 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
587 	 * is bad or if we are at the PFS root (the bref won't match at
588 	 * the PFS root, obviously).
589 	 */
590 	focus = cluster->focus;
591 	if (focus) {
592 		cluster->ddflag =
593 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
594 	} else {
595 		cluster->ddflag = 0;
596 		goto skip4;
597 	}
598 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
599 		goto skip4;
600 
601 	/*
602 	 * Pass 4
603 	 *
604 	 * Validate the elements that were not marked invalid.  They should
605 	 * match.
606 	 */
607 	for (i = 0; i < cluster->nchains; ++i) {
608 		int ddflag;
609 
610 		chain = cluster->array[i].chain;
611 
612 		if (chain == NULL)
613 			continue;
614 		if (chain == focus)
615 			continue;
616 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
617 			continue;
618 
619 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
620 		if (chain->bref.type != focus->bref.type ||
621 		    chain->bref.key != focus->bref.key ||
622 		    chain->bref.keybits != focus->bref.keybits ||
623 		    chain->bref.modify_tid != focus->bref.modify_tid ||
624 		    chain->bytes != focus->bytes ||
625 		    ddflag != cluster->ddflag) {
626 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
627 			if (hammer2_debug & 1)
628 			kprintf("cluster_resolve: matching modify_tid failed "
629 				"bref test: idx=%d type=%02x/%02x "
630 				"key=%016jx/%d-%016jx/%d "
631 				"mod=%016jx/%016jx bytes=%u/%u\n",
632 				i,
633 				chain->bref.type, focus->bref.type,
634 				chain->bref.key, chain->bref.keybits,
635 				focus->bref.key, focus->bref.keybits,
636 				chain->bref.modify_tid, focus->bref.modify_tid,
637 				chain->bytes, focus->bytes);
638 			if (hammer2_debug & 0x4000)
639 				panic("cluster_resolve");
640 			/* flag issue and force resync? */
641 		}
642 	}
643 skip4:
644 
645 	if (ttlslaves == 0)
646 		nflags |= HAMMER2_CLUSTER_NOSOFT;
647 	if (ttlmasters == 0)
648 		nflags |= HAMMER2_CLUSTER_NOHARD;
649 
650 	/*
651 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
652 	 * all available nodes (even if 0 are available) are fully
653 	 * synchronized.  This is used by the synchronization thread to
654 	 * determine if there is work it could potentially accomplish.
655 	 */
656 	if (nslaves == ttlslaves)
657 		nflags |= HAMMER2_CLUSTER_SSYNCED;
658 	if (nmasters == ttlmasters)
659 		nflags |= HAMMER2_CLUSTER_MSYNCED;
660 
661 	/*
662 	 * Determine if the cluster was successfully locked for the
663 	 * requested operation and generate an error code.  The cluster
664 	 * will not be locked (or ref'd) if an error is returned.
665 	 */
666 	atomic_set_int(&cluster->flags, nflags);
667 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
668 }
669 
670 /*
671  * This is used by the XOPS subsystem to calculate the state of
672  * the collection and tell hammer2_xop_collect() what to do with it.
673  * The collection can be in various states of desynchronization, the
674  * caller specifically wants to resolve the passed-in key.
675  *
676  * Return values:
677  *	0		- Quorum agreement, key is valid
678  *
679  *	ENOENT		- Quorum agreement, end of scan
680  *
681  *	ESRCH		- Quorum agreement, key is INVALID (caller should
682  *			  skip key).
683  *
684  *	EIO		- Quorum agreement but all elements had errors.
685  *
686  *	EDEADLK		- No quorum agreement possible for key, a repair
687  *			  may be needed.  Caller has to decide what to do,
688  *			  possibly iterating the key or generating an EIO.
689  *
690  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
691  *			  possible if caller waits for more responses.  Caller
692  *			  should not iterate key.
693  *
694  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
695  *
696  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
697  */
698 int
699 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
700 {
701 	hammer2_chain_t *chain;
702 	hammer2_chain_t *focus;
703 	hammer2_pfs_t *pmp;
704 	hammer2_tid_t quorum_tid;
705 	hammer2_tid_t last_best_quorum_tid;
706 	uint32_t nflags;
707 	int ttlmasters;
708 	int ttlslaves;
709 	int nmasters;
710 	int nmasters_keymatch;
711 	int nslaves;
712 	int nquorum;
713 	int umasters;	/* unknown masters (still in progress) */
714 	int smpresent;
715 	int error;
716 	int i;
717 
718 	cluster->error = 0;
719 	cluster->focus = NULL;
720 
721 	pmp = cluster->pmp;
722 	KKASSERT(pmp != NULL || cluster->nchains == 0);
723 
724 	/*
725 	 * Calculate quorum
726 	 */
727 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
728 	smpresent = 0;
729 	nflags = 0;
730 	ttlmasters = 0;
731 	ttlslaves = 0;
732 
733 	/*
734 	 * Pass 1
735 	 *
736 	 * NOTE: A NULL chain is not necessarily an error, it could be
737 	 *	 e.g. a lookup failure or the end of an iteration.
738 	 *	 Process normally.
739 	 */
740 	for (i = 0; i < cluster->nchains; ++i) {
741 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
742 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
743 
744 		chain = cluster->array[i].chain;
745 		error = cluster->array[i].error;
746 		if (chain && error) {
747 			if (cluster->focus == NULL || cluster->focus == chain) {
748 				/* error will be overridden by valid focus */
749 				/* XXX */
750 			}
751 
752 			/*
753 			 * Must count total masters and slaves whether the
754 			 * chain is errored or not.
755 			 */
756 			switch (cluster->pmp->pfs_types[i]) {
757 			case HAMMER2_PFSTYPE_SUPROOT:
758 			case HAMMER2_PFSTYPE_MASTER:
759 				++ttlmasters;
760 				break;
761 			case HAMMER2_PFSTYPE_SLAVE:
762 				++ttlslaves;
763 				break;
764 			}
765 			continue;
766 		}
767 		switch (cluster->pmp->pfs_types[i]) {
768 		case HAMMER2_PFSTYPE_MASTER:
769 			++ttlmasters;
770 			break;
771 		case HAMMER2_PFSTYPE_SLAVE:
772 			++ttlslaves;
773 			break;
774 		case HAMMER2_PFSTYPE_SOFT_MASTER:
775 			nflags |= HAMMER2_CLUSTER_WRSOFT;
776 			nflags |= HAMMER2_CLUSTER_RDSOFT;
777 			smpresent = 1;
778 			break;
779 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
780 			nflags |= HAMMER2_CLUSTER_RDSOFT;
781 			break;
782 		case HAMMER2_PFSTYPE_SUPROOT:
783 			/*
784 			 * Degenerate cluster representing the super-root
785 			 * topology on a single device.  Fake stuff so
786 			 * cluster ops work as expected.
787 			 */
788 			++ttlmasters;
789 			nflags |= HAMMER2_CLUSTER_WRHARD;
790 			nflags |= HAMMER2_CLUSTER_RDHARD;
791 			cluster->focus_index = i;
792 			cluster->focus = chain;
793 			cluster->error = error;
794 			break;
795 		default:
796 			break;
797 		}
798 	}
799 
800 	/*
801 	 * Pass 2
802 	 *
803 	 * Resolve nmasters		- master nodes fully match
804 	 *
805 	 * Resolve umasters		- master nodes operation still
806 	 *				  in progress
807 	 *
808 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
809 	 *				  key and may or may not match
810 	 *				  the quorum-agreed tid.
811 	 *
812 	 * The quorum-agreed TID is the highest matching TID.
813 	 */
814 	last_best_quorum_tid = HAMMER2_TID_MAX;
815 	umasters = 0;
816 	nmasters = 0;
817 	nmasters_keymatch = 0;
818 	quorum_tid = 0;		/* fix gcc warning */
819 
820 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
821 		umasters = 0;
822 		nmasters = 0;
823 		nmasters_keymatch = 0;
824 		quorum_tid = 0;
825 
826 		for (i = 0; i < cluster->nchains; ++i) {
827 			/* XXX SOFT smpresent handling */
828 			switch(cluster->pmp->pfs_types[i]) {
829 			case HAMMER2_PFSTYPE_MASTER:
830 			case HAMMER2_PFSTYPE_SUPROOT:
831 				break;
832 			default:
833 				continue;
834 			}
835 
836 			chain = cluster->array[i].chain;
837 			error = cluster->array[i].error;
838 
839 			/*
840 			 * Skip elements still in progress.  umasters keeps
841 			 * track of masters that might still be in-progress.
842 			 */
843 			if (chain == NULL && (cluster->array[i].flags &
844 					      HAMMER2_CITEM_NULL) == 0) {
845 				++umasters;
846 				continue;
847 			}
848 
849 			/*
850 			 * Key match?
851 			 */
852 			if (flags & HAMMER2_CHECK_NULL) {
853 				if (chain == NULL) {
854 					++nmasters;
855 					++nmasters_keymatch;
856 					if (cluster->error == 0)
857 						cluster->error = error;
858 				}
859 			} else if (chain &&
860 				   (key == (hammer2_key_t)-1 ||
861 				    chain->bref.key == key)) {
862 				++nmasters_keymatch;
863 
864 				if (chain->bref.modify_tid <
865 				     last_best_quorum_tid &&
866 				    quorum_tid < chain->bref.modify_tid) {
867 					/*
868 					 * Select new TID as master if better
869 					 * than any found so far in this loop,
870 					 * as long as it does not reach the
871 					 * best tid found in the previous loop.
872 					 */
873 					nmasters = 0;
874 					quorum_tid = chain->bref.modify_tid;
875 				}
876 				if (quorum_tid == chain->bref.modify_tid) {
877 					/*
878 					 * TID matches current collection.
879 					 *
880 					 * (error handled in next pass)
881 					 */
882 					++nmasters;
883 					if (chain->error == 0) {
884 						cluster->focus = chain;
885 						cluster->focus_index = i;
886 					}
887 				}
888 			}
889 		}
890 		if (nmasters >= nquorum)
891 			break;
892 		last_best_quorum_tid = quorum_tid;
893 	}
894 
895 	/*
896 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
897 		nmasters, nquorum, nmasters_keymatch, umasters);
898 	*/
899 
900 	/*
901 	 * Early return if we do not have enough masters.
902 	 */
903 	if (nmasters < nquorum) {
904 		if (nmasters + umasters >= nquorum)
905 			return HAMMER2_ERROR_EINPROGRESS;
906 		if (nmasters_keymatch < nquorum)
907 			return HAMMER2_ERROR_ESRCH;
908 		return HAMMER2_ERROR_EDEADLK;
909 	}
910 
911 	/*
912 	 * Validated end of scan.
913 	 */
914 	if (flags & HAMMER2_CHECK_NULL) {
915 		if (cluster->error == 0)
916 			cluster->error = HAMMER2_ERROR_ENOENT;
917 		return cluster->error;
918 	}
919 
920 	/*
921 	 * If we have a NULL focus at this point the agreeing quorum all
922 	 * had chain errors.
923 	 */
924 	if (cluster->focus == NULL)
925 		return HAMMER2_ERROR_EIO;
926 
927 	/*
928 	 * Pass 3
929 	 *
930 	 * We have quorum agreement, validate elements, not end of scan.
931 	 */
932 	nslaves = 0;
933 	cluster->error = 0;
934 
935 	for (i = 0; i < cluster->nchains; ++i) {
936 		chain = cluster->array[i].chain;
937 		error = cluster->array[i].error;
938 		if (chain == NULL ||
939 		    chain->bref.key != key ||
940 		    chain->bref.modify_tid != quorum_tid) {
941 			continue;
942 		}
943 
944 		/*
945 		 * Quorum Match
946 		 *
947 		 * XXX for now, cumulative error.
948 		 */
949 		if (cluster->error == 0)
950 			cluster->error = error;
951 
952 		switch (cluster->pmp->pfs_types[i]) {
953 		case HAMMER2_PFSTYPE_MASTER:
954 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
955 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
956 			nflags |= HAMMER2_CLUSTER_WRHARD;
957 			nflags |= HAMMER2_CLUSTER_RDHARD;
958 			break;
959 		case HAMMER2_PFSTYPE_SLAVE:
960 			/*
961 			 * We must have enough up-to-date masters to reach
962 			 * a quorum and the slave modify_tid must match the
963 			 * quorum's modify_tid.
964 			 *
965 			 * Do not select an errored slave.
966 			 */
967 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
968 			nflags |= HAMMER2_CLUSTER_RDHARD;
969 			++nslaves;
970 			break;
971 		case HAMMER2_PFSTYPE_SOFT_MASTER:
972 			/*
973 			 * Directly mounted soft master always wins.  There
974 			 * should be only one.
975 			 */
976 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
977 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
978 			break;
979 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
980 			/*
981 			 * Directly mounted soft slave always wins.  There
982 			 * should be only one.
983 			 *
984 			 * XXX
985 			 */
986 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
987 			break;
988 		case HAMMER2_PFSTYPE_SUPROOT:
989 			/*
990 			 * spmp (degenerate case)
991 			 */
992 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
993 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
994 			nflags |= HAMMER2_CLUSTER_WRHARD;
995 			nflags |= HAMMER2_CLUSTER_RDHARD;
996 			break;
997 		default:
998 			break;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
1004 	 * is bad or if we are at the PFS root (the bref won't match at
1005 	 * the PFS root, obviously).
1006 	 *
1007 	 * focus is probably not locked and it isn't safe to test its
1008 	 * content (e.g. focus->data, focus->dio, other content).  We
1009 	 * do not synchronize the dio to the cpu here.  In fact, in numerous
1010 	 * situations the frontend doesn't even need to access its dio/data,
1011 	 * so synchronizing it here would be wasteful.
1012 	 */
1013 	focus = cluster->focus;
1014 	if (focus) {
1015 		cluster->ddflag =
1016 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1017 	} else {
1018 		cluster->ddflag = 0;
1019 		goto skip4;
1020 	}
1021 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1022 		goto skip4;
1023 
1024 	/*
1025 	 * Pass 4
1026 	 *
1027 	 * Validate the elements that were not marked invalid.  They should
1028 	 * match.
1029 	 */
1030 	for (i = 0; i < cluster->nchains; ++i) {
1031 		int ddflag;
1032 
1033 		chain = cluster->array[i].chain;
1034 
1035 		if (chain == NULL)
1036 			continue;
1037 		if (chain == focus)
1038 			continue;
1039 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1040 			continue;
1041 
1042 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1043 		if (chain->bref.type != focus->bref.type ||
1044 		    chain->bref.key != focus->bref.key ||
1045 		    chain->bref.keybits != focus->bref.keybits ||
1046 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1047 		    chain->bytes != focus->bytes ||
1048 		    ddflag != cluster->ddflag) {
1049 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1050 			if (hammer2_debug & 1)
1051 			kprintf("cluster_resolve: matching modify_tid failed "
1052 				"bref test: idx=%d type=%02x/%02x "
1053 				"key=%016jx/%d-%016jx/%d "
1054 				"mod=%016jx/%016jx bytes=%u/%u\n",
1055 				i,
1056 				chain->bref.type, focus->bref.type,
1057 				chain->bref.key, chain->bref.keybits,
1058 				focus->bref.key, focus->bref.keybits,
1059 				chain->bref.modify_tid, focus->bref.modify_tid,
1060 				chain->bytes, focus->bytes);
1061 			if (hammer2_debug & 0x4000)
1062 				panic("cluster_resolve");
1063 			/* flag issue and force resync? */
1064 		}
1065 	}
1066 skip4:
1067 
1068 	if (ttlslaves == 0)
1069 		nflags |= HAMMER2_CLUSTER_NOSOFT;
1070 	if (ttlmasters == 0)
1071 		nflags |= HAMMER2_CLUSTER_NOHARD;
1072 
1073 	/*
1074 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1075 	 * all available nodes (even if 0 are available) are fully
1076 	 * synchronized.  This is used by the synchronization thread to
1077 	 * determine if there is work it could potentially accomplish.
1078 	 */
1079 	if (nslaves == ttlslaves)
1080 		nflags |= HAMMER2_CLUSTER_SSYNCED;
1081 	if (nmasters == ttlmasters)
1082 		nflags |= HAMMER2_CLUSTER_MSYNCED;
1083 
1084 	/*
1085 	 * Determine if the cluster was successfully locked for the
1086 	 * requested operation and generate an error code.  The cluster
1087 	 * will not be locked (or ref'd) if an error is returned.
1088 	 */
1089 	atomic_set_int(&cluster->flags, nflags);
1090 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1091 
1092 	return cluster->error;
1093 }
1094 
1095 /*
1096  * This is used by the sync thread to force non-NULL elements of a copy
1097  * of the pmp->iroot cluster to be good which is required to prime the
1098  * sync.
1099  */
1100 void
1101 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1102 {
1103 	int i;
1104 
1105 	for (i = 0; i < cluster->nchains; ++i) {
1106 		if (cluster->array[i].chain)
1107 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1108 	}
1109 }
1110 
1111 /*
1112  * Unlock a cluster.  Refcount and focus is maintained.
1113  */
1114 void
1115 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1116 {
1117 	hammer2_chain_t *chain;
1118 	int i;
1119 
1120 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1121 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1122 			cluster);
1123 	}
1124 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1125 	KKASSERT(cluster->refs > 0);
1126 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1127 
1128 	for (i = 0; i < cluster->nchains; ++i) {
1129 		chain = cluster->array[i].chain;
1130 		if (chain)
1131 			hammer2_chain_unlock(chain);
1132 	}
1133 }
1134