xref: /dragonfly/sys/vfs/hammer2/hammer2_cluster.c (revision e62ef63c)
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * The cluster module collects multiple chains representing the same
36  * information from different nodes into a single entity.  It allows direct
37  * access to media data as long as it is not blockref array data (which
38  * will obviously have to be different at each node).
39  *
40  * This module also handles I/O dispatch, status rollup, and various
41  * mastership arrangements including quorum operations.  It effectively
42  * presents one topology to the vnops layer.
43  *
44  * Many of the API calls mimic chain API calls but operate on clusters
45  * instead of chains.  Please see hammer2_chain.c for more complete code
46  * documentation of the API functions.
47  *
48  * WARNING! This module is *extremely* complex.  It must issue asynchronous
49  *	    locks and I/O, do quorum and/or master-slave processing, and
50  *	    it must operate properly even if some nodes are broken (which
51  *	    can also mean indefinite locks).
52  *
53  *				CLUSTER OPERATIONS
54  *
55  * Cluster operations can be broken down into three pieces:
56  *
57  * (1) Chain locking and data retrieval.
58  *
59  *	- The most complex functions; quorum management on transaction ids.
60  *
61  *	- Locking and data accesses must be internally asynchronous.
62  *
63  *	- Validate and manage cache coherency primitives (cache state
64  *	  is stored in chain topologies but must be validated by these
65  *	  functions).
66  *
67  * (2) Lookups and Scans
68  *		hammer2_cluster_lookup()
69  *		hammer2_cluster_next()
70  *
71  *	- Depend on locking & data retrieval functions, but still complex.
72  *
73  *	- Must do quorum management on transaction ids.
74  *
75  *	- Lookup and iteration ops must be internally asynchronous.
76  *
77  * (3) Modifying Operations
78  *		hammer2_cluster_create()
79  *
80  *	- Can usually punt on failures, operation continues unless quorum
81  *	  is lost.  If quorum is lost, must wait for resynchronization
82  *	  (depending on the management mode).
83  *
84  *	- Must disconnect a failed node (without flushing it), then remount
85  *	  and resynchronize it.
86  *
87  *	- Network links (via kdmsg) are relatively easy to issue as the
88  *	  complex underworkings of hammer2_chain.c don't have to be messed
89  *	  with (the protocol is at a higher level than block-level).
90  *
91  *	- Multiple local disk nodes (i.e. block devices) are another matter.
92  *	  Chain operations have to be dispatched to per-node threads (xN)
93  *	  because we can't asynchronize potentially very complex chain
94  *	  operations in hammer2_chain.c (it would be a huge mess).
95  *
96  *	  (these threads are also used to terminate incoming kdmsg ops from
97  *	  other machines).
98  *
99  *	- Single-node filesystems do not use threads and will simply call
100  *	  hammer2_chain.c functions directly.  This short-cut is handled
101  *	  at the base of each cluster function.
102  */
103 #include <sys/cdefs.h>
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/types.h>
107 #include <sys/lock.h>
108 #include <sys/uuid.h>
109 
110 #include "hammer2.h"
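
/*
 * Illustrative sketch (kept under #if 0, not part of the build): one
 * plausible ref/lock/use/unlock/drop sequence for a cluster, tying
 * together the pieces described in the CLUSTER OPERATIONS comment above.
 * The function name is hypothetical and HAMMER2_RESOLVE_ALWAYS is assumed
 * to be an appropriate lock mode; see hammer2_chain.c for the
 * authoritative lock-mode documentation.
 */
#if 0
static void
example_cluster_use(hammer2_cluster_t *cluster)
{
	hammer2_blockref_t bref;

	/* Hold our own ref across the lock/unlock sequence */
	hammer2_cluster_ref(cluster);

	/* Lock the underlying chains; resolution is left to the caller */
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	hammer2_cluster_resolve(cluster);

	if (cluster->error == 0) {
		/* Focus is valid, universal bref fields can be inspected */
		hammer2_cluster_bref(cluster, &bref);
		kprintf("focus type=%02x key=%016jx\n",
			bref.type, (uintmax_t)bref.key);
	}

	hammer2_cluster_unlock(cluster);
	hammer2_cluster_drop(cluster);
}
#endif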
111 
112 /*
113  * Returns the bref type of the cluster's focus.
114  *
115  * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
116  * The cluster must be locked.
117  */
118 uint8_t
119 hammer2_cluster_type(hammer2_cluster_t *cluster)
120 {
121 	if (cluster->error == 0) {
122 		KKASSERT(cluster->focus != NULL);
123 		return(cluster->focus->bref.type);
124 	}
125 	return 0;
126 }
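
/*
 * Illustrative sketch (not compiled): dispatching on the focus bref type
 * returned by hammer2_cluster_type().  The function name is hypothetical;
 * HAMMER2_BREF_TYPE_EMPTY (0) is the errored-cluster case noted above.
 */
#if 0
static void
example_dispatch_on_type(hammer2_cluster_t *cluster)
{
	switch (hammer2_cluster_type(cluster)) {
	case HAMMER2_BREF_TYPE_INODE:
		/* inode-specific handling */
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/* file data handling */
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
		/* cluster errored, propagate cluster->error */
		break;
	default:
		break;
	}
}
#endif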
127 
128 #ifdef INVARIANTS
129 /*
130  * Returns non-zero if the cluster's focus is flagged as being modified.
131  *
132  * If the cluster is errored, returns 0.
133  */
134 static
135 int
136 hammer2_cluster_modified(hammer2_cluster_t *cluster)
137 {
138 	if (cluster->error == 0) {
139 		KKASSERT(cluster->focus != NULL);
140 		return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
141 	}
142 	return 0;
143 }
144 #endif
145 
146 /*
147  * Returns the bref of the cluster's focus, sans any data-offset information
148  * (since offset information is per-node and wouldn't be useful).
149  *
150  * Callers use this function to access modify_tid, mirror_tid, type,
151  * key, and keybits.
152  *
153  * If the cluster is errored, returns an empty bref.
154  * The cluster must be locked.
155  */
156 void
157 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
158 {
159 	if (cluster->error == 0) {
160 		KKASSERT(cluster->focus != NULL);
161 		*bref = cluster->focus->bref;
162 		bref->data_off = 0;
163 	} else {
164 		bzero(bref, sizeof(*bref));
165 	}
166 }
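
/*
 * Illustrative sketch (not compiled): snapshotting the focus bref to
 * compare transaction ids, per the comment above.  The function name is
 * hypothetical; only the universal fields (data_off is zeroed) are used.
 */
#if 0
static int
example_bref_is_newer(hammer2_cluster_t *cluster, hammer2_tid_t since_tid)
{
	hammer2_blockref_t bref;

	hammer2_cluster_bref(cluster, &bref);
	return (bref.modify_tid > since_tid);
}
#endif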
167 
168 /*
169  * Create a degenerate cluster with one ref from a single locked chain.
170  * The returned cluster will be focused on the chain and inherit its
171  * error state.
172  *
173  * The chain's lock and reference are transferred to the new cluster, so
174  * the caller should not try to unlock the chain separately.
175  *
176  * We fake the flags.
177  */
178 hammer2_cluster_t *
179 hammer2_cluster_from_chain(hammer2_chain_t *chain)
180 {
181 	hammer2_cluster_t *cluster;
182 
183 	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
184 	cluster->array[0].chain = chain;
185 	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
186 	cluster->nchains = 1;
187 	cluster->focus = chain;
188 	cluster->focus_index = 0;
189 	cluster->pmp = chain->pmp;
190 	cluster->refs = 1;
191 	cluster->error = chain->error;
192 	cluster->flags = HAMMER2_CLUSTER_LOCKED |
193 			 HAMMER2_CLUSTER_WRHARD |
194 			 HAMMER2_CLUSTER_RDHARD |
195 			 HAMMER2_CLUSTER_MSYNCED |
196 			 HAMMER2_CLUSTER_SSYNCED;
197 
198 	return cluster;
199 }
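
/*
 * Illustrative sketch (not compiled): wrapping a single locked chain in a
 * degenerate cluster.  Because the chain's lock and reference are
 * transferred, teardown goes through cluster calls only.  The function
 * name is hypothetical.
 */
#if 0
static void
example_wrap_chain(hammer2_chain_t *chain)	/* chain is locked + ref'd */
{
	hammer2_cluster_t *cluster;

	cluster = hammer2_cluster_from_chain(chain);
	KKASSERT(cluster->focus == chain);

	/* ... operate on the degenerate cluster ... */

	/* Do NOT unlock/drop the chain separately */
	hammer2_cluster_unlock(cluster);
	hammer2_cluster_drop(cluster);
}
#endif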
200 
201 /*
202  * Add a reference to a cluster.
203  *
204  * The cluster itself holds the refs on its underlying chains (released in
205  * hammer2_cluster_drop()), so ref/unlock sequences can later safely re-lock.
206  */
207 void
208 hammer2_cluster_ref(hammer2_cluster_t *cluster)
209 {
210 	atomic_add_int(&cluster->refs, 1);
211 }
212 
213 /*
214  * Drop the caller's reference to the cluster.  When the ref count drops to
215  * zero this function frees the cluster and drops all underlying chains.
216  *
217  * In-progress read I/Os are typically detached from the cluster once the
218  * first one returns (the remaining stay attached to the DIOs but are then
219  * ignored and drop naturally).
220  */
221 void
222 hammer2_cluster_drop(hammer2_cluster_t *cluster)
223 {
224 	hammer2_chain_t *chain;
225 	int i;
226 
227 	KKASSERT(cluster->refs > 0);
228 	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
229 		cluster->focus = NULL;		/* safety XXX chg to assert */
230 		cluster->focus_index = 0;
231 
232 		for (i = 0; i < cluster->nchains; ++i) {
233 			chain = cluster->array[i].chain;
234 			if (chain) {
235 				hammer2_chain_drop(chain);
236 				cluster->array[i].chain = NULL; /* safety */
237 			}
238 		}
239 		cluster->nchains = 0;				/* safety */
240 
241 		kfree(cluster, M_HAMMER2);
242 		/* cluster is invalid */
243 	}
244 }
245 
246 /*
247  * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
248  *
249  * WARNING! This function expects the caller to handle resolution of the
250  *	    cluster.  We never re-resolve the cluster in this function,
251  *	    because it might be used to temporarily unlock/relock a cparent
252  *	    in an iteration or recursion, and the cparent's elements do not
253  *	    necessarily match.
254  */
255 void
256 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
257 {
258 	hammer2_chain_t *chain;
259 	int i;
260 
261 	/* cannot be on inode-embedded cluster template, must be on copy */
262 	KKASSERT(cluster->refs > 0);
263 	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
264 	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
265 		panic("hammer2_cluster_lock: cluster %p already locked!\n",
266 			cluster);
267 	}
268 	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
269 
270 	/*
271 	 * Lock chains and resolve state.
272 	 */
273 	for (i = 0; i < cluster->nchains; ++i) {
274 		chain = cluster->array[i].chain;
275 		if (chain == NULL)
276 			continue;
277 		hammer2_chain_lock(chain, how);
278 	}
279 }
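
/*
 * Illustrative sketch (not compiled): the temporary unlock/relock pattern
 * the WARNING above is aimed at.  The cparent stays referenced across the
 * gap and is deliberately not re-resolved.  The function name is
 * hypothetical and HAMMER2_RESOLVE_ALWAYS is assumed to be the caller's
 * original lock mode.
 */
#if 0
static void
example_temp_unlock_relock(hammer2_cluster_t *cparent)
{
	/* cparent arrives referenced and locked */
	hammer2_cluster_unlock(cparent);

	/* ... do work that cannot be done while holding the lock ... */

	hammer2_cluster_lock(cparent, HAMMER2_RESOLVE_ALWAYS);
	/*
	 * Intentionally no hammer2_cluster_resolve() here, the elements
	 * of a cparent in an iteration do not necessarily match.
	 */
}
#endif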
280 
281 /*
282  * Calculate the clustering state for the cluster and set its focus.
283  * This routine must be called with care.  For example, it should not
284  * normally be called after relocking a non-leaf cluster because parent
285  * clusters help iterations and each element might be at a slightly different
286  * indirect node (each node's topology is independently indexed).
287  *
288  * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
289  * operations.  Typically this is only set on a quorum of MASTERs or
290  * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
291  * is present, this bit is *not* set on a quorum of MASTERs.  The
292  * synchronization code ignores this bit, but all hammer2_cluster_*() calls
293  * that create/modify/delete elements use it.
294  *
295  * The chains making up the cluster may be narrowed down based on quorum
296  * acceptability, and if RESOLVE_RDONLY is specified the chains can be
297  * narrowed down to a single chain as long as the entire subtopology is known
298  * to be intact.  So, for example, we can narrow a read-only op to a single
299  * fast SLAVE but if we focus a CACHE chain we must still retain at least
300  * a SLAVE to ensure that the subtopology can be accessed.
301  *
302  * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
303  * to be maintained once the topology is validated as-of the top level of
304  * the operation.
305  *
306  * If a failure occurs the operation must be aborted by higher-level code and
307  * retried. XXX
308  */
309 void
310 hammer2_cluster_resolve(hammer2_cluster_t *cluster)
311 {
312 	hammer2_chain_t *chain;
313 	hammer2_chain_t *focus;
314 	hammer2_pfs_t *pmp;
315 	hammer2_tid_t quorum_tid;
316 	hammer2_tid_t last_best_quorum_tid;
317 	int focus_pfs_type;
318 	uint32_t nflags;
319 	int ttlmasters;
320 	int ttlslaves;
321 	int nmasters;
322 	int nslaves;
323 	int nquorum;
324 	int smpresent;
325 	int i;
326 
327 	cluster->error = 0;
328 	cluster->focus = NULL;
329 
330 	focus_pfs_type = 0;
331 	nflags = 0;
332 	ttlmasters = 0;
333 	ttlslaves = 0;
334 	nmasters = 0;
335 	nslaves = 0;
336 
337 	/*
338 	 * Calculate quorum
339 	 */
340 	pmp = cluster->pmp;
341 	KKASSERT(pmp != NULL || cluster->nchains == 0);
342 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
343 	smpresent = 0;
344 
345 	/*
346 	 * Pass 1
347 	 *
348 	 * NOTE: A NULL chain is not necessarily an error, it could be
349 	 *	 e.g. a lookup failure or the end of an iteration.
350 	 *	 Process normally.
351 	 */
352 	for (i = 0; i < cluster->nchains; ++i) {
353 		chain = cluster->array[i].chain;
354 		if (chain && chain->error) {
355 			if (cluster->focus == NULL || cluster->focus == chain) {
356 				/* error will be overridden by valid focus */
357 				cluster->error = chain->error;
358 			}
359 
360 			/*
361 			 * Must count total masters and slaves whether the
362 			 * chain is errored or not.
363 			 */
364 			switch (cluster->pmp->pfs_types[i]) {
365 			case HAMMER2_PFSTYPE_SUPROOT:
366 			case HAMMER2_PFSTYPE_MASTER:
367 				++ttlmasters;
368 				break;
369 			case HAMMER2_PFSTYPE_SLAVE:
370 				++ttlslaves;
371 				break;
372 			}
373 			continue;
374 		}
375 		switch (cluster->pmp->pfs_types[i]) {
376 		case HAMMER2_PFSTYPE_MASTER:
377 			++ttlmasters;
378 			break;
379 		case HAMMER2_PFSTYPE_SLAVE:
380 			++ttlslaves;
381 			break;
382 		case HAMMER2_PFSTYPE_SOFT_MASTER:
383 			nflags |= HAMMER2_CLUSTER_WRSOFT;
384 			nflags |= HAMMER2_CLUSTER_RDSOFT;
385 			smpresent = 1;
386 			break;
387 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
388 			nflags |= HAMMER2_CLUSTER_RDSOFT;
389 			break;
390 		case HAMMER2_PFSTYPE_SUPROOT:
391 			/*
392 			 * Degenerate cluster representing the super-root
393 			 * topology on a single device.  Fake stuff so
394 			 * cluster ops work as expected.
395 			 */
396 			nflags |= HAMMER2_CLUSTER_WRHARD;
397 			nflags |= HAMMER2_CLUSTER_RDHARD;
398 			cluster->focus_index = i;
399 			cluster->focus = chain;
400 			cluster->error = chain ? chain->error : 0;
401 			++ttlmasters;
402 			break;
403 		default:
404 			break;
405 		}
406 	}
407 
408 	/*
409 	 * Pass 2
410 	 *
411 	 * Resolve masters.  Calculate nmasters for the highest matching
412 	 * TID, if a quorum cannot be attained try the next lower matching
413 	 * TID until we exhaust TIDs.
414 	 *
415 	 * NOTE: A NULL chain is not necessarily an error, it could be
416 	 *	 e.g. a lookup failure or the end of an iteration.
417 	 *	 Process normally.
418 	 */
419 	last_best_quorum_tid = HAMMER2_TID_MAX;
420 	quorum_tid = 0;		/* fix gcc warning */
421 
422 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
423 		nmasters = 0;
424 		quorum_tid = 0;
425 
426 		for (i = 0; i < cluster->nchains; ++i) {
427 			switch (cluster->pmp->pfs_types[i]) {
428 			case HAMMER2_PFSTYPE_SUPROOT:
429 			case HAMMER2_PFSTYPE_MASTER:
430 				break;
431 			default:
432 				continue;
433 			}
434 			chain = cluster->array[i].chain;
435 
436 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
437 				/*
438 				 * Invalid as in unsynchronized, cannot be
439 				 * used to calculate the quorum.
440 				 */
441 			} else if (chain == NULL && quorum_tid == 0) {
442 				/*
443 				 * NULL chain on master matches NULL chains
444 				 * on other masters.
445 				 */
446 				++nmasters;
447 			} else if (quorum_tid < last_best_quorum_tid &&
448 				   chain != NULL &&
449 				   (quorum_tid < chain->bref.modify_tid ||
450 				    nmasters == 0)) {
451 				/*
452 				 * Better TID located, reset nmasters count.
453 				 */
454 				nmasters = 1;
455 				quorum_tid = chain->bref.modify_tid;
456 			} else if (chain &&
457 				   quorum_tid == chain->bref.modify_tid) {
458 				/*
459 				 * TID matches current collection.
460 				 */
461 				++nmasters;
462 			}
463 		}
464 		if (nmasters >= nquorum)
465 			break;
466 		last_best_quorum_tid = quorum_tid;
467 	}
468 
469 	/*
470 	 * Pass 3
471 	 *
472 	 * NOTE: A NULL chain is not necessarily an error, it could be
473 	 *	 e.g. a lookup failure or the end of an iteration.
474 	 *	 Process normally.
475 	 */
476 	for (i = 0; i < cluster->nchains; ++i) {
477 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
478 		chain = cluster->array[i].chain;
479 		if (chain && chain->error) {
480 			if (cluster->focus == NULL || cluster->focus == chain) {
481 				/* error will be overridden by valid focus */
482 				cluster->error = chain->error;
483 			}
484 			continue;
485 		}
486 
487 		switch (cluster->pmp->pfs_types[i]) {
488 		case HAMMER2_PFSTYPE_MASTER:
489 			/*
490 			 * We must have enough up-to-date masters to reach
491 			 * a quorum and the master modify_tid must match
492 			 * the quorum's modify_tid.
493 			 *
494 			 * Do not select an errored or out-of-sync master.
495 			 */
496 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
497 				nflags |= HAMMER2_CLUSTER_UNHARD;
498 			} else if (nmasters >= nquorum &&
499 				   (chain == NULL || chain->error == 0) &&
500 				   ((chain == NULL && quorum_tid == 0) ||
501 				    (chain != NULL && quorum_tid ==
502 						  chain->bref.modify_tid))) {
503 				nflags |= HAMMER2_CLUSTER_WRHARD;
504 				nflags |= HAMMER2_CLUSTER_RDHARD;
505 				if (!smpresent) {
506 					cluster->array[i].flags |=
507 							HAMMER2_CITEM_FEMOD;
508 				}
509 				if (cluster->focus == NULL ||
510 				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
511 					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
512 					cluster->focus_index = i;
513 					cluster->focus = chain; /* NULL ok */
514 					cluster->error = chain ? chain->error :
515 								 0;
516 				}
517 			} else if (chain == NULL || chain->error == 0) {
518 				nflags |= HAMMER2_CLUSTER_UNHARD;
519 			}
520 			break;
521 		case HAMMER2_PFSTYPE_SLAVE:
522 			/*
523 			 * We must have enough up-to-date masters to reach
524 			 * a quorum and the slave modify_tid must match the
525 			 * quorum's modify_tid.
526 			 *
527 			 * Do not select an errored slave.
528 			 */
529 			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
530 				nflags |= HAMMER2_CLUSTER_UNHARD;
531 			} else if (nmasters >= nquorum &&
532 				   (chain == NULL || chain->error == 0) &&
533 				   ((chain == NULL && quorum_tid == 0) ||
534 				    (chain && quorum_tid ==
535 					      chain->bref.modify_tid))) {
536 				++nslaves;
537 				nflags |= HAMMER2_CLUSTER_RDHARD;
538 #if 0
539 				/* XXX optimize for RESOLVE_RDONLY */
540 				if (cluster->focus == NULL) {
541 					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
542 					cluster->focus_index = i;
543 					cluster->focus = chain; /* NULL ok */
544 					cluster->error = chain ? chain->error :
545 								 0;
546 				}
547 #endif
548 			} else if (chain == NULL || chain->error == 0) {
549 				nflags |= HAMMER2_CLUSTER_UNSOFT;
550 			}
551 			break;
552 		case HAMMER2_PFSTYPE_SOFT_MASTER:
553 			/*
554 			 * Directly mounted soft master always wins.  There
555 			 * should be only one.
556 			 */
557 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
558 			cluster->focus_index = i;
559 			cluster->focus = chain;
560 			cluster->error = chain ? chain->error : 0;
561 			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
562 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
563 			break;
564 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
565 			/*
566 			 * Directly mounted soft slave always wins.  There
567 			 * should be only one.
568 			 */
569 			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
570 			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
571 				cluster->focus_index = i;
572 				cluster->focus = chain;
573 				cluster->error = chain ? chain->error : 0;
574 				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
575 			}
576 			break;
577 		case HAMMER2_PFSTYPE_SUPROOT:
578 			/*
579 			 * spmp (degenerate case)
580 			 */
581 			KKASSERT(i == 0);
582 			cluster->focus_index = i;
583 			cluster->focus = chain;
584 			cluster->error = chain ? chain->error : 0;
585 			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
586 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
587 			break;
588 		default:
589 			break;
590 		}
591 	}
592 
593 	/*
594 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
595 	 * is bad or if we are at the PFS root (the bref won't match at
596 	 * the PFS root, obviously).
597 	 */
598 	focus = cluster->focus;
599 	if (focus) {
600 		cluster->ddflag =
601 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
602 	} else {
603 		cluster->ddflag = 0;
604 		goto skip4;
605 	}
606 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
607 		goto skip4;
608 
609 	/*
610 	 * Pass 4
611 	 *
612 	 * Validate the elements that were not marked invalid.  They should
613 	 * match.
614 	 */
615 	for (i = 0; i < cluster->nchains; ++i) {
616 		int ddflag;
617 
618 		chain = cluster->array[i].chain;
619 
620 		if (chain == NULL)
621 			continue;
622 		if (chain == focus)
623 			continue;
624 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
625 			continue;
626 
627 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
628 		if (chain->bref.type != focus->bref.type ||
629 		    chain->bref.key != focus->bref.key ||
630 		    chain->bref.keybits != focus->bref.keybits ||
631 		    chain->bref.modify_tid != focus->bref.modify_tid ||
632 		    chain->bytes != focus->bytes ||
633 		    ddflag != cluster->ddflag) {
634 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
635 			if (hammer2_debug & 1)
636 			kprintf("cluster_resolve: matching modify_tid failed "
637 				"bref test: idx=%d type=%02x/%02x "
638 				"key=%016jx/%d-%016jx/%d "
639 				"mod=%016jx/%016jx bytes=%u/%u\n",
640 				i,
641 				chain->bref.type, focus->bref.type,
642 				chain->bref.key, chain->bref.keybits,
643 				focus->bref.key, focus->bref.keybits,
644 				chain->bref.modify_tid, focus->bref.modify_tid,
645 				chain->bytes, focus->bytes);
646 			if (hammer2_debug & 0x4000)
647 				panic("cluster_resolve");
648 			/* flag issue and force resync? */
649 		}
650 	}
651 skip4:
652 
653 	if (ttlslaves == 0)
654 		nflags |= HAMMER2_CLUSTER_NOSOFT;
655 	if (ttlmasters == 0)
656 		nflags |= HAMMER2_CLUSTER_NOHARD;
657 
658 	/*
659 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
660 	 * all available nodes (even if 0 are available) are fully
661 	 * synchronized.  This is used by the synchronization thread to
662 	 * determine if there is work it could potentially accomplish.
663 	 */
664 	if (nslaves == ttlslaves)
665 		nflags |= HAMMER2_CLUSTER_SSYNCED;
666 	if (nmasters == ttlmasters)
667 		nflags |= HAMMER2_CLUSTER_MSYNCED;
668 
669 	/*
670 	 * Determine if the cluster was successfully locked for the
671 	 * requested operation and generate an error code.  The cluster
672 	 * will not be locked (or ref'd) if an error is returned.
673 	 */
674 	atomic_set_int(&cluster->flags, nflags);
675 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
676 }
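
/*
 * Worked example of the quorum arithmetic used above (a sketch, not a
 * specification): with pfs_nmasters == 3, nquorum is 3 / 2 + 1 == 2, so at
 * least two masters must agree on the highest modify_tid before WRHARD and
 * RDHARD are set.  With a single master, or the SUPROOT degenerate case,
 * nquorum is 1 and any valid master suffices.
 */
#if 0
static int
example_nquorum(int pfs_nmasters)
{
	/* same formula as hammer2_cluster_resolve()/_check() */
	return (pfs_nmasters / 2 + 1);
}
#endif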
677 
678 /*
679  * This is used by the XOPS subsystem to calculate the state of
680  * the collection and tell hammer2_xop_collect() what to do with it.
681  * The collection can be in various states of desynchronization; the
682  * caller specifically wants to resolve the passed-in key.
683  *
684  * Return values:
685  *	0		- Quorum agreement, key is valid
686  *
687  *	ENOENT		- Quorum agreement, end of scan
688  *
689  *	ESRCH		- Quorum agreement, key is INVALID (caller should
690  *			  skip key).
691  *
692  *	EIO		- Quorum agreement but all elements had errors.
693  *
694  *	EDEADLK		- No quorum agreement possible for key, a repair
695  *			  may be needed.  Caller has to decide what to do,
696  *			  possibly iterating the key or generating an EIO.
697  *
698  *	EINPROGRESS	- No quorum agreement yet, but agreement is still
699  *			  possible if caller waits for more responses.  Caller
700  *			  should not iterate key.
701  *
702  * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
703  *
704  * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
705  */
706 int
707 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
708 {
709 	hammer2_chain_t *chain;
710 	hammer2_chain_t *focus;
711 	hammer2_pfs_t *pmp;
712 	hammer2_tid_t quorum_tid;
713 	hammer2_tid_t last_best_quorum_tid;
714 	uint32_t nflags;
715 	int ttlmasters;
716 	int ttlslaves;
717 	int nmasters;
718 	int nmasters_keymatch;
719 	int nslaves;
720 	int nquorum;
721 	int umasters;	/* unknown masters (still in progress) */
722 	int smpresent;
723 	int error;
724 	int i;
725 
726 	cluster->error = 0;
727 	cluster->focus = NULL;
728 
729 	pmp = cluster->pmp;
730 	KKASSERT(pmp != NULL || cluster->nchains == 0);
731 
732 	/*
733 	 * Calculate quorum
734 	 */
735 	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
736 	smpresent = 0;
737 	nflags = 0;
738 	ttlmasters = 0;
739 	ttlslaves = 0;
740 
741 	/*
742 	 * Pass 1
743 	 *
744 	 * NOTE: A NULL chain is not necessarily an error, it could be
745 	 *	 e.g. a lookup failure or the end of an iteration.
746 	 *	 Process normally.
747 	 */
748 	for (i = 0; i < cluster->nchains; ++i) {
749 		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
750 		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
751 
752 		chain = cluster->array[i].chain;
753 		error = cluster->array[i].error;
754 		if (chain && error) {
755 			if (cluster->focus == NULL || cluster->focus == chain) {
756 				/* error will be overridden by valid focus */
757 				/* XXX */
758 			}
759 
760 			/*
761 			 * Must count total masters and slaves whether the
762 			 * chain is errored or not.
763 			 */
764 			switch (cluster->pmp->pfs_types[i]) {
765 			case HAMMER2_PFSTYPE_SUPROOT:
766 			case HAMMER2_PFSTYPE_MASTER:
767 				++ttlmasters;
768 				break;
769 			case HAMMER2_PFSTYPE_SLAVE:
770 				++ttlslaves;
771 				break;
772 			}
773 			continue;
774 		}
775 		switch (cluster->pmp->pfs_types[i]) {
776 		case HAMMER2_PFSTYPE_MASTER:
777 			++ttlmasters;
778 			break;
779 		case HAMMER2_PFSTYPE_SLAVE:
780 			++ttlslaves;
781 			break;
782 		case HAMMER2_PFSTYPE_SOFT_MASTER:
783 			nflags |= HAMMER2_CLUSTER_WRSOFT;
784 			nflags |= HAMMER2_CLUSTER_RDSOFT;
785 			smpresent = 1;
786 			break;
787 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
788 			nflags |= HAMMER2_CLUSTER_RDSOFT;
789 			break;
790 		case HAMMER2_PFSTYPE_SUPROOT:
791 			/*
792 			 * Degenerate cluster representing the super-root
793 			 * topology on a single device.  Fake stuff so
794 			 * cluster ops work as expected.
795 			 */
796 			++ttlmasters;
797 			nflags |= HAMMER2_CLUSTER_WRHARD;
798 			nflags |= HAMMER2_CLUSTER_RDHARD;
799 			cluster->focus_index = i;
800 			cluster->focus = chain;
801 			cluster->error = error;
802 			break;
803 		default:
804 			break;
805 		}
806 	}
807 
808 	/*
809 	 * Pass 2
810 	 *
811 	 * Resolve nmasters		- master nodes fully match
812 	 *
813 	 * Resolve umasters		- master nodes operation still
814 	 *				  in progress
815 	 *
816 	 * Resolve nmasters_keymatch	- master nodes match the passed-in
817 	 *				  key and may or may not match
818 	 *				  the quorum-agreed tid.
819 	 *
820 	 * The quorum-agreed TID is the highest matching TID.
821 	 */
822 	last_best_quorum_tid = HAMMER2_TID_MAX;
823 	umasters = 0;
824 	nmasters = 0;
825 	nmasters_keymatch = 0;
826 	quorum_tid = 0;		/* fix gcc warning */
827 
828 	while (nmasters < nquorum && last_best_quorum_tid != 0) {
829 		umasters = 0;
830 		nmasters = 0;
831 		nmasters_keymatch = 0;
832 		quorum_tid = 0;
833 
834 		for (i = 0; i < cluster->nchains; ++i) {
835 			/* XXX SOFT smpresent handling */
836 			switch(cluster->pmp->pfs_types[i]) {
837 			case HAMMER2_PFSTYPE_MASTER:
838 			case HAMMER2_PFSTYPE_SUPROOT:
839 				break;
840 			default:
841 				continue;
842 			}
843 
844 			chain = cluster->array[i].chain;
845 			error = cluster->array[i].error;
846 
847 			/*
848 			 * Skip elements still in progress.  umasters keeps
849 			 * track of masters that might still be in-progress.
850 			 */
851 			if (chain == NULL && (cluster->array[i].flags &
852 					      HAMMER2_CITEM_NULL) == 0) {
853 				++umasters;
854 				continue;
855 			}
856 
857 			/*
858 			 * Key match?
859 			 */
860 			if (flags & HAMMER2_CHECK_NULL) {
861 				if (chain == NULL) {
862 					++nmasters;
863 					++nmasters_keymatch;
864 					if (cluster->error == 0)
865 						cluster->error = error;
866 				}
867 			} else if (chain &&
868 				   (key == (hammer2_key_t)-1 ||
869 				    chain->bref.key == key)) {
870 				++nmasters_keymatch;
871 
872 				if (chain->bref.modify_tid <
873 				     last_best_quorum_tid &&
874 				    quorum_tid < chain->bref.modify_tid) {
875 					/*
876 					 * Select new TID as master if better
877 					 * than any found so far in this loop,
878 					 * as long as it does not reach the
879 					 * best tid found in the previous loop.
880 					 */
881 					nmasters = 0;
882 					quorum_tid = chain->bref.modify_tid;
883 				}
884 				if (quorum_tid == chain->bref.modify_tid) {
885 					/*
886 					 * TID matches current collection.
887 					 *
888 					 * (error handled in next pass)
889 					 */
890 					++nmasters;
891 					if (chain->error == 0) {
892 						cluster->focus = chain;
893 						cluster->focus_index = i;
894 					}
895 				}
896 			}
897 		}
898 		if (nmasters >= nquorum)
899 			break;
900 		last_best_quorum_tid = quorum_tid;
901 	}
902 
903 	/*
904 	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
905 		nmasters, nquorum, nmasters_keymatch, umasters);
906 	*/
907 
908 	/*
909 	 * Early return if we do not have enough masters.
910 	 */
911 	if (nmasters < nquorum) {
912 		if (nmasters + umasters >= nquorum)
913 			return HAMMER2_ERROR_EINPROGRESS;
914 		if (nmasters_keymatch < nquorum)
915 			return HAMMER2_ERROR_ESRCH;
916 		return HAMMER2_ERROR_EDEADLK;
917 	}
918 
919 	/*
920 	 * Validated end of scan.
921 	 */
922 	if (flags & HAMMER2_CHECK_NULL) {
923 		if (cluster->error == 0)
924 			cluster->error = HAMMER2_ERROR_ENOENT;
925 		return cluster->error;
926 	}
927 
928 	/*
929 	 * If we have a NULL focus at this point the agreeing quorum all
930 	 * had chain errors.
931 	 */
932 	if (cluster->focus == NULL)
933 		return HAMMER2_ERROR_EIO;
934 
935 	/*
936 	 * Pass 3
937 	 *
938 	 * We have quorum agreement, validate elements, not end of scan.
939 	 */
940 	nslaves = 0;
941 	cluster->error = 0;
942 
943 	for (i = 0; i < cluster->nchains; ++i) {
944 		chain = cluster->array[i].chain;
945 		error = cluster->array[i].error;
946 		if (chain == NULL ||
947 		    chain->bref.key != key ||
948 		    chain->bref.modify_tid != quorum_tid) {
949 			continue;
950 		}
951 
952 		/*
953 		 * Quorum Match
954 		 *
955 		 * XXX for now, cumulative error.
956 		 */
957 		if (cluster->error == 0)
958 			cluster->error = error;
959 
960 		switch (cluster->pmp->pfs_types[i]) {
961 		case HAMMER2_PFSTYPE_MASTER:
962 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
963 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
964 			nflags |= HAMMER2_CLUSTER_WRHARD;
965 			nflags |= HAMMER2_CLUSTER_RDHARD;
966 			break;
967 		case HAMMER2_PFSTYPE_SLAVE:
968 			/*
969 			 * We must have enough up-to-date masters to reach
970 			 * a quorum and the slave modify_tid must match the
971 			 * quorum's modify_tid.
972 			 *
973 			 * Do not select an errored slave.
974 			 */
975 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
976 			nflags |= HAMMER2_CLUSTER_RDHARD;
977 			++nslaves;
978 			break;
979 		case HAMMER2_PFSTYPE_SOFT_MASTER:
980 			/*
981 			 * Directly mounted soft master always wins.  There
982 			 * should be only one.
983 			 */
984 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
985 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
986 			break;
987 		case HAMMER2_PFSTYPE_SOFT_SLAVE:
988 			/*
989 			 * Directly mounted soft slave always wins.  There
990 			 * should be only one.
991 			 *
992 			 * XXX
993 			 */
994 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
995 			break;
996 		case HAMMER2_PFSTYPE_SUPROOT:
997 			/*
998 			 * spmp (degenerate case)
999 			 */
1000 			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
1001 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1002 			nflags |= HAMMER2_CLUSTER_WRHARD;
1003 			nflags |= HAMMER2_CLUSTER_RDHARD;
1004 			break;
1005 		default:
1006 			break;
1007 		}
1008 	}
1009 
1010 	/*
1011 	 * Focus now set, adjust ddflag.  Skip this pass if the focus
1012 	 * is bad or if we are at the PFS root (the bref won't match at
1013 	 * the PFS root, obviously).
1014 	 *
1015 	 * Also make sure the chain's data is synchronized to the cpu.
1016 	 */
1017 	focus = cluster->focus;
1018 	if (focus) {
1019 		if (focus->data) {
1020 			if (focus->dio)
1021 				hammer2_io_bkvasync(focus->dio);
1022 		}
1023 		cluster->ddflag =
1024 			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
1025 	} else {
1026 		cluster->ddflag = 0;
1027 		goto skip4;
1028 	}
1029 	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1030 		goto skip4;
1031 
1032 	/*
1033 	 * Pass 4
1034 	 *
1035 	 * Validate the elements that were not marked invalid.  They should
1036 	 * match.
1037 	 */
1038 	for (i = 0; i < cluster->nchains; ++i) {
1039 		int ddflag;
1040 
1041 		chain = cluster->array[i].chain;
1042 
1043 		if (chain == NULL)
1044 			continue;
1045 		if (chain == focus)
1046 			continue;
1047 		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1048 			continue;
1049 
1050 		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1051 		if (chain->bref.type != focus->bref.type ||
1052 		    chain->bref.key != focus->bref.key ||
1053 		    chain->bref.keybits != focus->bref.keybits ||
1054 		    chain->bref.modify_tid != focus->bref.modify_tid ||
1055 		    chain->bytes != focus->bytes ||
1056 		    ddflag != cluster->ddflag) {
1057 			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
1058 			if (hammer2_debug & 1)
1059 			kprintf("cluster_check: matching modify_tid failed "
1060 				"bref test: idx=%d type=%02x/%02x "
1061 				"key=%016jx/%d-%016jx/%d "
1062 				"mod=%016jx/%016jx bytes=%u/%u\n",
1063 				i,
1064 				chain->bref.type, focus->bref.type,
1065 				chain->bref.key, chain->bref.keybits,
1066 				focus->bref.key, focus->bref.keybits,
1067 				chain->bref.modify_tid, focus->bref.modify_tid,
1068 				chain->bytes, focus->bytes);
1069 			if (hammer2_debug & 0x4000)
1070 				panic("cluster_check");
1071 			/* flag issue and force resync? */
1072 		}
1073 	}
1074 skip4:
1075 
1076 	if (ttlslaves == 0)
1077 		nflags |= HAMMER2_CLUSTER_NOSOFT;
1078 	if (ttlmasters == 0)
1079 		nflags |= HAMMER2_CLUSTER_NOHARD;
1080 
1081 	/*
1082 	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
1083 	 * all available nodes (even if 0 are available) are fully
1084 	 * synchronized.  This is used by the synchronization thread to
1085 	 * determine if there is work it could potentially accomplish.
1086 	 */
1087 	if (nslaves == ttlslaves)
1088 		nflags |= HAMMER2_CLUSTER_SSYNCED;
1089 	if (nmasters == ttlmasters)
1090 		nflags |= HAMMER2_CLUSTER_MSYNCED;
1091 
1092 	/*
1093 	 * Determine if the cluster was successfully locked for the
1094 	 * requested operation and generate an error code.  The cluster
1095 	 * will not be locked (or ref'd) if an error is returned.
1096 	 */
1097 	atomic_set_int(&cluster->flags, nflags);
1098 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
1099 
1100 	return cluster->error;
1101 }
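
/*
 * Illustrative sketch (not compiled): how a collection loop might act on
 * the return values documented above.  The function name is hypothetical;
 * hammer2_xop_collect() is the real consumer of this logic.
 */
#if 0
static void
example_collect_decision(hammer2_cluster_t *cluster, hammer2_key_t key)
{
	switch (hammer2_cluster_check(cluster, key, 0)) {
	case 0:
		/* quorum agreement, cluster->focus holds the valid element */
		break;
	case HAMMER2_ERROR_ENOENT:
		/* quorum agreement, end of scan */
		break;
	case HAMMER2_ERROR_ESRCH:
		/* quorum agreement, key is invalid, skip it */
		break;
	case HAMMER2_ERROR_EIO:
		/* quorum agreement but every element had errors */
		break;
	case HAMMER2_ERROR_EDEADLK:
		/* no agreement possible for this key, repair or fail */
		break;
	case HAMMER2_ERROR_EINPROGRESS:
		/* agreement still possible, wait for more responses */
		break;
	default:
		break;
	}
}
#endif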
1102 
1103 /*
1104  * This is used by the sync thread to force the non-NULL elements of a copy
1105  * of the pmp->iroot cluster to be treated as good (clearing CITEM_INVALID),
1106  * which is required to prime the sync.
1107  */
1108 void
1109 hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
1110 {
1111 	int i;
1112 
1113 	for (i = 0; i < cluster->nchains; ++i) {
1114 		if (cluster->array[i].chain)
1115 			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
1116 	}
1117 }
1118 
1119 /*
1120  * Unlock a cluster.  Refcount and focus is maintained.
1121  */
1122 void
1123 hammer2_cluster_unlock(hammer2_cluster_t *cluster)
1124 {
1125 	hammer2_chain_t *chain;
1126 	int i;
1127 
1128 	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
1129 		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
1130 			cluster);
1131 	}
1132 	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
1133 	KKASSERT(cluster->refs > 0);
1134 	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
1135 
1136 	for (i = 0; i < cluster->nchains; ++i) {
1137 		chain = cluster->array[i].chain;
1138 		if (chain)
1139 			hammer2_chain_unlock(chain);
1140 	}
1141 }
1142 
1143 /************************************************************************
1144  *			        CLUSTER I/O 				*
1145  ************************************************************************
1146  *
1147  *
1148  * WARNING! blockref[] array data is not universal.  These functions should
1149  *	    only be used to access universal data.
1150  *
1151  * NOTE!    The rdata call will wait for at least one of the chain I/Os to
1152  *	    complete if necessary.  The I/Os should have already been
1153  *	    initiated by the cluster_lock/chain_lock operation.
1154  *
1155  *	    The cluster must already be in a modified state before wdata
1156  *	    is called.  The data will already be available for this case.
1157  */
1158 const hammer2_media_data_t *
1159 hammer2_cluster_rdata(hammer2_cluster_t *cluster)
1160 {
1161 	hammer2_chain_t *chain;
1162 
1163 	chain = cluster->focus;
1164 	KKASSERT(chain != NULL && chain->lockcnt);
1165 	if (chain->dio)
1166 		hammer2_io_bkvasync(chain->dio);
1167 	return (chain->data);
1168 }
1169 
1170 hammer2_media_data_t *
1171 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
1172 {
1173 	hammer2_chain_t *chain;
1174 
1175 	chain = cluster->focus;
1176 	KKASSERT(chain != NULL && chain->lockcnt &&
1177 		 hammer2_cluster_modified(cluster));
1178 	if (chain->dio)
1179 		hammer2_io_bkvasync(chain->dio);
1180 	return(chain->data);
1181 }
1182
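
/*
 * Illustrative sketch (not compiled): read-only vs. modifying access to the
 * focus media data, per the CLUSTER I/O notes above.  The function name is
 * hypothetical; it assumes the cluster is locked and, for wdata, has already
 * been placed in a modified state by higher-level code.
 */
#if 0
static void
example_cluster_io(hammer2_cluster_t *cluster)
{
	const hammer2_media_data_t *rdata;
	hammer2_media_data_t *wdata;

	/* May wait for at least one chain I/O to complete */
	rdata = hammer2_cluster_rdata(cluster);
	/* ... inspect universal (non-blockref) fields via rdata ... */

	/* Cluster must already be in a modified state */
	wdata = hammer2_cluster_wdata(cluster);
	/* ... update fields via wdata ... */
}
#endif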