xref: /illumos-gate/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 2a910fbb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * FMD Case Subsystem
29  *
30  * Diagnosis engines are expected to group telemetry events related to the
31  * diagnosis of a particular problem on the system into a set of cases.  The
32  * diagnosis engine may have any number of cases open at a given point in time.
33  * Some cases may eventually be *solved* by associating a suspect list of one
34  * or more problems with the case, at which point fmd publishes a list.suspect
35  * event for the case and it becomes visible to administrators and agents.
36  *
37  * Every case is named using a UUID, and is globally visible in the case hash.
38  * Cases are reference-counted, except for the reference from the case hash
39  * itself.  Consumers of case references include modules, which store active
40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
41  *
42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
44  * or transport) and the case is referenced by the mod_cases list.  Once the
45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
47  *
48  *			+------------+
49  *	     +----------|  UNSOLVED  |
50  *	     |		+------------+
51  *	     |		      1 |
52  *	     |			|
53  *	     |		+-------v----+
54  *	   2 |		|    SOLVED  |
55  *	     |		+------------+
56  *	     |		    3 |  5 |
57  *	     +------------+   |    |
58  *			  |   |    |
59  *			+-v---v----v-+
60  *			| CLOSE_WAIT |
61  *			+------------+
62  *			  |   |    |
63  *	      +-----------+   |    +------------+
64  *	      |		    4 |			|
65  *	      v		+-----v------+		|
66  *	   discard      |   CLOSED   |	      6	|
67  *			+------------+		|
68  *			      |			|
69  *			      |	   +------------+
70  *			    7 |	   |
71  *			+-----v----v-+
72  *			|  REPAIRED  |
73  *			+------------+
74  *			      |
75  *			    8 |
76  *			+-----v------+
77  *			|  RESOLVED  |
78  *			+------------+
79  *			      |
80  *			      v
81  *			   discard
82  *
83  * The state machine changes are triggered by calls to fmd_case_transition()
84  * from various locations inside of fmd, as described below:
85  *
86  * [1] Called by: fmd_case_solve()
87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
88  *                conviction policy is applied to suspect list
89  *                suspects convicted are marked faulty (F) in R$
90  *                list.suspect event logged and dispatched
91  *
92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
94  *                case discarded upon exit from CLOSE_WAIT
95  *
96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
98  *                suspects convicted (F) are marked unusable (U) in R$
99  *                diagnosis engine fmdo_close() entry point scheduled
100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
101  *
102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
103  *       Actions: list.isolated event dispatched
104  *                case deleted from module's list of open cases
105  *
106  * [5] Called by: fmd_case_repair(), fmd_case_update()
107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
108  *                diagnosis engine fmdo_close() entry point scheduled
109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
110  *
111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
112  *       Actions: suspects convicted are marked non faulty (!F) in R$
113  *                list.repaired or list.updated event dispatched
114  *
115  * [7] Called by: fmd_case_repair(), fmd_case_update()
116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
117  *                suspects convicted are marked non faulty (!F) in R$
118  *                list.repaired or list.updated event dispatched
119  *
120  * [8] Called by: fmd_case_uuresolve()
121  *       Actions: list.resolved event dispatched
122  *		  case is discarded
123  */
124 
125 #include <sys/fm/protocol.h>
126 #include <uuid/uuid.h>
127 #include <alloca.h>
128 
129 #include <fmd_alloc.h>
130 #include <fmd_module.h>
131 #include <fmd_error.h>
132 #include <fmd_conf.h>
133 #include <fmd_case.h>
134 #include <fmd_string.h>
135 #include <fmd_subr.h>
136 #include <fmd_protocol.h>
137 #include <fmd_event.h>
138 #include <fmd_eventq.h>
139 #include <fmd_dispq.h>
140 #include <fmd_buf.h>
141 #include <fmd_log.h>
142 #include <fmd_asru.h>
143 #include <fmd_fmri.h>
144 #include <fmd_xprt.h>
145 
146 #include <fmd.h>
147 
148 static const char *const _fmd_case_snames[] = {
149 	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
150 	"SOLVED",	/* FMD_CASE_SOLVED */
151 	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
152 	"CLOSED",	/* FMD_CASE_CLOSED */
153 	"REPAIRED",	/* FMD_CASE_REPAIRED */
154 	"RESOLVED"	/* FMD_CASE_RESOLVED */
155 };
156 
157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
158 
159 fmd_case_hash_t *
160 fmd_case_hash_create(void)
161 {
162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
163 
164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
165 	chp->ch_hashlen = fmd.d_str_buckets;
166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
168 	    FMD_SLEEP);
169 	chp->ch_count = 0;
170 
171 	return (chp);
172 }
173 
174 /*
175  * Destroy the case hash.  Unlike most of our hash tables, no active references
176  * are kept by the case hash itself; all references come from other subsystems.
177  * The hash must be destroyed after all modules are unloaded; if anything was
178  * present in the hash it would be by definition a reference count leak.
179  */
180 void
181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
182 {
183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
185 	fmd_free(chp, sizeof (fmd_case_hash_t));
186 }
187 
188 /*
189  * Take a snapshot of the case hash by placing an additional hold on each
190  * member in an auxiliary array, and then call 'func' for each case.
191  */
192 void
193 fmd_case_hash_apply(fmd_case_hash_t *chp,
194     void (*func)(fmd_case_t *, void *), void *arg)
195 {
196 	fmd_case_impl_t *cp, **cps, **cpp;
197 	uint_t cpc, i;
198 
199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
200 
201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
202 	cpc = chp->ch_count;
203 
204 	for (i = 0; i < chp->ch_hashlen; i++) {
205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
206 			*cpp++ = fmd_case_tryhold(cp);
207 	}
208 
209 	ASSERT(cpp == cps + cpc);
210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
211 
212 	for (i = 0; i < cpc; i++) {
213 		if (cps[i] != NULL) {
214 			func((fmd_case_t *)cps[i], arg);
215 			fmd_case_rele((fmd_case_t *)cps[i]);
216 		}
217 	}
218 
219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
220 }
221 
222 static void
223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
224 {
225 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
226 
227 	cip->ci_code_next = chp->ch_code_hash[h];
228 	chp->ch_code_hash[h] = cip;
229 }
230 
231 static void
232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
233 {
234 	fmd_case_impl_t **pp, *cp;
235 
236 	if (cip->ci_code) {
237 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
238 
239 		pp = &chp->ch_code_hash[h];
240 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
241 			if (cp != cip)
242 				pp = &cp->ci_code_next;
243 			else
244 				break;
245 		}
246 		if (cp != NULL) {
247 			*pp = cp->ci_code_next;
248 			cp->ci_code_next = NULL;
249 		}
250 	}
251 }
252 
253 /*
254  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
255  * were defined for this case or if the lookup fails, the event dictionary or
256  * module code is broken, and we set the event code to a precomputed default.
257  */
258 static const char *
259 fmd_case_mkcode(fmd_case_t *cp)
260 {
261 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
262 	fmd_case_susp_t *cis;
263 	fmd_case_hash_t *chp = fmd.d_cases;
264 
265 	char **keys, **keyp;
266 	const char *s;
267 
268 	ASSERT(MUTEX_HELD(&cip->ci_lock));
269 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
270 
271 	/*
272 	 * delete any existing entry from code hash if it is on it
273 	 */
274 	fmd_case_code_hash_delete(chp, cip);
275 
276 	fmd_free(cip->ci_code, cip->ci_codelen);
277 	cip->ci_codelen = cip->ci_mod->mod_codelen;
278 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
279 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
280 
281 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
282 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
283 			keyp++;
284 	}
285 
286 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
287 
288 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
289 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
290 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
291 		fmd_free(cip->ci_code, cip->ci_codelen);
292 		cip->ci_codelen = strlen(s) + 1;
293 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
294 		(void) strcpy(cip->ci_code, s);
295 	}
296 
297 	/*
298 	 * add into hash of solved cases
299 	 */
300 	fmd_case_code_hash_insert(chp, cip);
301 
302 	return (cip->ci_code);
303 }
304 
305 typedef struct {
306 	int	*fcl_countp;
307 	int	fcl_maxcount;
308 	uint8_t *fcl_ba;
309 	nvlist_t **fcl_nva;
310 	int	*fcl_msgp;
311 } fmd_case_lst_t;
312 
313 static void
314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
315 {
316 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
317 	boolean_t b;
318 	int state;
319 
320 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
321 		return;
322 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
323 	    &b) == 0 && b == B_FALSE)
324 		*entryp->fcl_msgp = B_FALSE;
325 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
326 	state = fmd_asru_al_getstate(alp);
327 	if (state & FMD_ASRU_DEGRADED)
328 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
329 	if (state & FMD_ASRU_UNUSABLE)
330 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
331 	if (state & FMD_ASRU_FAULTY)
332 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
333 	if (!(state & FMD_ASRU_PRESENT))
334 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
335 	if (alp->al_reason == FMD_ASRU_REPAIRED)
336 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
337 	else if (alp->al_reason == FMD_ASRU_REPLACED)
338 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
339 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
340 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
341 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
342 	(*entryp->fcl_countp)++;
343 }
344 
345 static void
346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
347 {
348 	int *faultyp = (int *)arg;
349 
350 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
351 }
352 
353 static void
354 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
355 {
356 	int *usablep = (int *)arg;
357 
358 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
359 }
360 
361 static void
362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
363 {
364 	int *not_faultyp = (int *)arg;
365 
366 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
367 }
368 
369 /*
370  * Have we got any suspects with an asru that are still unusable and present?
371  */
372 static void
373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
374 {
375 	int *rvalp = (int *)arg;
376 	int state;
377 	nvlist_t *asru;
378 
379 	/*
380 	 * if this a proxy case and this suspect doesn't have an local asru
381 	 * then state is unknown so we must assume it may still be unusable.
382 	 */
383 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
384 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
385 		*rvalp |= B_TRUE;
386 		return;
387 	}
388 
389 	state = fmd_asru_al_getstate(alp);
390 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
391 		return;
392 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
393 }
394 
395 nvlist_t *
396 fmd_case_mkevent(fmd_case_t *cp, const char *class)
397 {
398 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
399 	nvlist_t **nva, *nvl;
400 	uint8_t *ba;
401 	int msg = B_TRUE;
402 	const char *code;
403 	fmd_case_lst_t fcl;
404 	int count = 0;
405 
406 	(void) pthread_mutex_lock(&cip->ci_lock);
407 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
408 
409 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
410 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
411 
412 	/*
413 	 * For each suspect associated with the case, store its fault event
414 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
415 	 * have asked not to be messaged.  If any of them have made such a
416 	 * request, propagate that attribute to the composite list.* event.
417 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
418 	 */
419 	fcl.fcl_countp = &count;
420 	fcl.fcl_maxcount = cip->ci_nsuspects;
421 	fcl.fcl_msgp = &msg;
422 	fcl.fcl_ba = ba;
423 	fcl.fcl_nva = nva;
424 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
425 
426 	if (cip->ci_code == NULL)
427 		(void) fmd_case_mkcode(cp);
428 	/*
429 	 * For repair and updated event, we lookup diagcode from dict using key
430 	 * "list.repaired" or "list.updated" or "list.resolved".
431 	 */
432 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
433 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
434 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
435 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
436 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
437 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
438 	else
439 		code = cip->ci_code;
440 
441 	if (msg == B_FALSE)
442 		cip->ci_flags |= FMD_CF_INVISIBLE;
443 
444 	/*
445 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
446 	 * Otherwise use the authority for the current module.
447 	 */
448 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
449 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
450 	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
451 
452 	(void) pthread_mutex_unlock(&cip->ci_lock);
453 	return (nvl);
454 }
455 
/*
 * Tunables consulted by fmd_case_check_for_dups() when comparing a newly
 * solved case against existing ones.
 */

/* treat an overlap of faulty suspects as a match */
static int fmd_case_match_on_faulty_overlap = 1;

/* treat an overlap of acquitted suspects as a match */
static int fmd_case_match_on_acquit_overlap = 1;

/* allow isolated suspects in the old case to be acquitted automatically */
static int fmd_case_auto_acquit_isolated = 1;

/* allow the new case to be adjusted when the old one is repaired/resolved */
static int fmd_case_auto_acquit_non_acquitted = 1;

/* window, in seconds, within which auto-acquit is suppressed */
static int fmd_case_too_recent = 10;
461 
462 static boolean_t
463 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
464 {
465 	nvlist_t *new_rsrc;
466 	nvlist_t *rsrc;
467 	char *new_name = NULL;
468 	char *name = NULL;
469 	ssize_t new_namelen;
470 	ssize_t namelen;
471 	int fmri_present = 1;
472 	int new_fmri_present = 1;
473 	int match = B_FALSE;
474 	fmd_topo_t *ftp = fmd_topo_hold();
475 
476 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
477 		fmri_present = 0;
478 	else {
479 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
480 			goto done;
481 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
482 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
483 			goto done;
484 	}
485 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
486 		new_fmri_present = 0;
487 	else {
488 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
489 			goto done;
490 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
491 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
492 			goto done;
493 	}
494 	match = (fmri_present == new_fmri_present &&
495 	    (fmri_present == 0 ||
496 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
497 done:
498 	if (name != NULL)
499 		fmd_free(name, namelen + 1);
500 	if (new_name != NULL)
501 		fmd_free(new_name, new_namelen + 1);
502 	fmd_topo_rele(ftp);
503 	return (match);
504 }
505 
506 static int
507 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
508 {
509 	char *class, *new_class;
510 
511 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
512 		return (0);
513 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
514 		return (0);
515 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
516 		return (0);
517 	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
518 	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
519 	return (strcmp(class, new_class) == 0);
520 }
521 
522 typedef struct {
523 	int	*fcms_countp;
524 	int	fcms_maxcount;
525 	fmd_case_impl_t *fcms_cip;
526 	uint8_t *fcms_new_susp_state;
527 	uint8_t *fcms_old_susp_state;
528 	uint8_t *fcms_old_match_state;
529 } fcms_t;
530 #define	SUSPECT_STATE_FAULTY				0x1
531 #define	SUSPECT_STATE_ISOLATED				0x2
532 #define	SUSPECT_STATE_REMOVED				0x4
533 #define	SUSPECT_STATE_ACQUITED				0x8
534 #define	SUSPECT_STATE_REPAIRED				0x10
535 #define	SUSPECT_STATE_REPLACED				0x20
536 #define	SUSPECT_STATE_NO_MATCH				0x1
537 
538 /*
539  * This is called for each suspect in the old case. Compare it against each
540  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
541  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
542  * found in the old case.
543  */
544 static void
545 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
546 {
547 	fcms_t *fcmsp = (fcms_t *)arg;
548 	fmd_case_impl_t *cip = fcmsp->fcms_cip;
549 	fmd_case_susp_t *cis;
550 	int i = 0;
551 	int state = fmd_asru_al_getstate(alp);
552 
553 	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
554 		return;
555 
556 	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
557 	    alp->al_reason == FMD_ASRU_REMOVED))
558 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
559 		    SUSPECT_STATE_REMOVED;
560 	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
561 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
562 		    SUSPECT_STATE_ISOLATED;
563 	else if (state & FMD_ASRU_FAULTY)
564 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
565 		    SUSPECT_STATE_FAULTY;
566 	else if (alp->al_reason == FMD_ASRU_REPLACED)
567 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
568 		    SUSPECT_STATE_REPLACED;
569 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
570 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
571 		    SUSPECT_STATE_ACQUITED;
572 	else
573 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
574 		    SUSPECT_STATE_REPAIRED;
575 
576 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
577 		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
578 			break;
579 	if (cis != NULL)
580 		fcmsp->fcms_new_susp_state[i] =
581 		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
582 	else
583 		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
584 		    SUSPECT_STATE_NO_MATCH;
585 	(*fcmsp->fcms_countp)++;
586 }
587 
588 typedef struct {
589 	int	*fca_do_update;
590 	fmd_case_impl_t *fca_cip;
591 } fca_t;
592 
593 /*
594  * Re-fault all acquitted suspects that are still present in the new list.
595  */
596 static void
597 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
598 {
599 	fca_t *fcap = (fca_t *)arg;
600 	fmd_case_impl_t *cip = fcap->fca_cip;
601 	fmd_case_susp_t *cis;
602 	int state = fmd_asru_al_getstate(alp);
603 
604 	if (!(state & FMD_ASRU_FAULTY) &&
605 	    alp->al_reason == FMD_ASRU_ACQUITTED) {
606 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
607 			if (fmd_case_match_suspect(cis->cis_nvl,
608 			    alp->al_event) == 1)
609 				break;
610 		if (cis != NULL) {
611 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
612 			*fcap->fca_do_update = 1;
613 		}
614 	}
615 }
616 
617 /*
618  * Re-fault all suspects that are still present in the new list.
619  */
620 static void
621 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
622 {
623 	fca_t *fcap = (fca_t *)arg;
624 	fmd_case_impl_t *cip = fcap->fca_cip;
625 	fmd_case_susp_t *cis;
626 	int state = fmd_asru_al_getstate(alp);
627 
628 	if (!(state & FMD_ASRU_FAULTY)) {
629 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
630 			if (fmd_case_match_suspect(cis->cis_nvl,
631 			    alp->al_event) == 1)
632 				break;
633 		if (cis != NULL) {
634 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
635 			*fcap->fca_do_update = 1;
636 		}
637 	}
638 }
639 
640 /*
641  * Acquit all suspects that are no longer present in the new list.
642  */
643 static void
644 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
645 {
646 	fca_t *fcap = (fca_t *)arg;
647 	fmd_case_impl_t *cip = fcap->fca_cip;
648 	fmd_case_susp_t *cis;
649 	int state = fmd_asru_al_getstate(alp);
650 
651 	if (state & FMD_ASRU_FAULTY) {
652 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
653 			if (fmd_case_match_suspect(cis->cis_nvl,
654 			    alp->al_event) == 1)
655 				break;
656 		if (cis == NULL) {
657 			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
658 			    FMD_ASRU_ACQUITTED);
659 			*fcap->fca_do_update = 1;
660 		}
661 	}
662 }
663 
664 /*
665  * Acquit all isolated suspects.
666  */
667 static void
668 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
669 {
670 	int *do_update = (int *)arg;
671 	int state = fmd_asru_al_getstate(alp);
672 
673 	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
674 	    (state & FMD_ASRU_FAULTY)) {
675 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
676 		    FMD_ASRU_ACQUITTED);
677 		*do_update = 1;
678 	}
679 }
680 
681 /*
682  * Acquit suspect which matches specified nvlist
683  */
684 static void
685 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
686 {
687 	nvlist_t *nvl = (nvlist_t *)arg;
688 	int state = fmd_asru_al_getstate(alp);
689 
690 	if ((state & FMD_ASRU_FAULTY) &&
691 	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
692 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
693 		    FMD_ASRU_ACQUITTED);
694 }
695 
696 typedef struct {
697 	fmd_case_impl_t *fccd_cip;
698 	uint8_t *fccd_new_susp_state;
699 	uint8_t *fccd_new_match_state;
700 	int *fccd_discard_new;
701 	int *fccd_adjust_new;
702 } fccd_t;
703 
704 /*
705  * see if a matching suspect list already exists in the cache
706  */
707 static void
708 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
709 {
710 	fccd_t *fccdp = (fccd_t *)arg;
711 	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
712 	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
713 	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
714 	int got_faulty_overlap = 0;
715 	int got_acquit_overlap = 0;
716 	boolean_t too_recent;
717 	uint64_t most_recent = 0;
718 	fcms_t fcms;
719 	fca_t fca;
720 	uint8_t *new_susp_state;
721 	uint8_t *old_susp_state;
722 	uint8_t *old_match_state;
723 
724 	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
725 	for (i = 0; i < new_cip->ci_nsuspects; i++)
726 		new_susp_state[i] = 0;
727 	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
728 	for (i = 0; i < old_cip->ci_nsuspects; i++)
729 		old_susp_state[i] = 0;
730 	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
731 	for (i = 0; i < old_cip->ci_nsuspects; i++)
732 		old_match_state[i] = 0;
733 
734 	/*
735 	 * Compare with each suspect in the existing case.
736 	 */
737 	fcms.fcms_countp = &count;
738 	fcms.fcms_maxcount = old_cip->ci_nsuspects;
739 	fcms.fcms_cip = new_cip;
740 	fcms.fcms_new_susp_state = new_susp_state;
741 	fcms.fcms_old_susp_state = old_susp_state;
742 	fcms.fcms_old_match_state = old_match_state;
743 	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
744 	    fmd_case_match_suspects, &fcms);
745 
746 	/*
747 	 * If we have some faulty, non-isolated suspects that overlap, then most
748 	 * likely it is the suspects that overlap in the suspect lists that are
749 	 * to blame. So we can consider this to be a match.
750 	 */
751 	for (i = 0; i < new_cip->ci_nsuspects; i++)
752 		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
753 			got_faulty_overlap = 1;
754 	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
755 		goto got_match;
756 
757 	/*
758 	 * If we have no faulty, non-isolated suspects in the old case, but we
759 	 * do have some acquitted suspects that overlap, then most likely it is
760 	 * the acquitted suspects that overlap in the suspect lists that are
761 	 * to blame. So we can consider this to be a match.
762 	 */
763 	for (i = 0; i < new_cip->ci_nsuspects; i++)
764 		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
765 			got_acquit_overlap = 1;
766 	for (i = 0; i < old_cip->ci_nsuspects; i++)
767 		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
768 			got_acquit_overlap = 0;
769 	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
770 		goto got_match;
771 
772 	/*
773 	 * Check that all suspects in the new list are present in the old list.
774 	 * Return if we find one that isn't.
775 	 */
776 	for (i = 0; i < new_cip->ci_nsuspects; i++)
777 		if (new_susp_state[i] == 0)
778 			return;
779 
780 	/*
781 	 * Check that all suspects in the old list are present in the new list
782 	 * *or* they are isolated or removed/replaced (which would explain why
783 	 * they are not present in the new list). Return if we find one that is
784 	 * faulty and unisolated or repaired or acquitted, and that is not
785 	 * present in the new case.
786 	 */
787 	for (i = 0; i < old_cip->ci_nsuspects; i++)
788 		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
789 		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
790 		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
791 		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
792 			return;
793 
794 got_match:
795 	/*
796 	 * If the old case is already in repaired/resolved state, we can't
797 	 * do anything more with it, so keep the new case, but acquit some
798 	 * of the suspects if appropriate.
799 	 */
800 	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
801 		if (fmd_case_auto_acquit_non_acquitted) {
802 			*fccdp->fccd_adjust_new = 1;
803 			for (i = 0; i < new_cip->ci_nsuspects; i++) {
804 				fccdp->fccd_new_susp_state[i] |=
805 				    new_susp_state[i];
806 				if (new_susp_state[i] == 0)
807 					fccdp->fccd_new_susp_state[i] =
808 					    SUSPECT_STATE_NO_MATCH;
809 			}
810 		}
811 		return;
812 	}
813 
814 	/*
815 	 * Otherwise discard the new case and keep the old, again updating the
816 	 * state of the suspects as appropriate
817 	 */
818 	*fccdp->fccd_discard_new = 1;
819 	fca.fca_cip = new_cip;
820 	fca.fca_do_update = &do_update;
821 
822 	/*
823 	 * See if new case occurred within fmd_case_too_recent seconds of the
824 	 * most recent modification to the old case and if so don't do
825 	 * auto-acquit. This avoids problems if a flood of ereports come in and
826 	 * they don't all get diagnosed before the first case causes some of
827 	 * the devices to be isolated making it appear that an isolated device
828 	 * was in the suspect list.
829 	 */
830 	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
831 	    fmd_asru_most_recent, &most_recent);
832 	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
833 	    fmd_case_too_recent);
834 
835 	if (got_faulty_overlap) {
836 		/*
837 		 * Acquit any suspects not present in the new list, plus
838 		 * any that are are present but are isolated.
839 		 */
840 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
841 		    fmd_case_acquit_no_match, &fca);
842 		if (fmd_case_auto_acquit_isolated && !too_recent)
843 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
844 			    fmd_case_acquit_isolated, &do_update);
845 	} else if (got_acquit_overlap) {
846 		/*
847 		 * Re-fault the acquitted matching suspects and acquit all
848 		 * isolated suspects.
849 		 */
850 		if (fmd_case_auto_acquit_isolated && !too_recent) {
851 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
852 			    fmd_case_fault_acquitted_matching, &fca);
853 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
854 			    fmd_case_acquit_isolated, &do_update);
855 		}
856 	} else if (fmd_case_auto_acquit_isolated) {
857 		/*
858 		 * To get here, there must be no faulty or acquitted suspects,
859 		 * but there must be at least one isolated suspect. Just acquit
860 		 * non-matching isolated suspects. If there are no matching
861 		 * isolated suspects, then re-fault all matching suspects.
862 		 */
863 		for (i = 0; i < new_cip->ci_nsuspects; i++)
864 			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
865 				got_isolated_overlap = 1;
866 		if (!got_isolated_overlap)
867 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
868 			    fmd_case_fault_all_matching, &fca);
869 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
870 		    fmd_case_acquit_no_match, &fca);
871 	}
872 
873 	/*
874 	 * If we've updated anything in the old case, call fmd_case_update()
875 	 */
876 	if (do_update)
877 		fmd_case_update(old_cp);
878 }
879 
880 /*
881  * Convict suspects in a case by applying a conviction policy and updating the
882  * resource cache prior to emitting the list.suspect event for the given case.
883  * At present, our policy is very simple: convict every suspect in the case.
884  * In the future, this policy can be extended and made configurable to permit:
885  *
886  * - convicting the suspect with the highest FIT rate
887  * - convicting the suspect with the cheapest FRU
888  * - convicting the suspect with the FRU that is in a depot's inventory
889  * - convicting the suspect with the longest lifetime
890  *
891  * and so forth.  A word to the wise: this problem is significantly harder than
892  * it seems at first glance.  Future work should heed the following advice:
893  *
894  * Hacking the policy into C code here is a very bad idea.  The policy needs to
895  * be decided upon very carefully and fundamentally encodes knowledge of what
896  * suspect list combinations can be emitted by what diagnosis engines.  As such
897  * fmd's code is the wrong location, because that would require fmd itself to
898  * be updated for every diagnosis engine change, defeating the entire design.
899  * The FMA Event Registry knows the suspect list combinations: policy inputs
900  * can be derived from it and used to produce per-module policy configuration.
901  *
902  * If the policy needs to be dynamic and not statically fixed at either fmd
903  * startup or module load time, any implementation of dynamic policy retrieval
904  * must employ some kind of caching mechanism or be part of a built-in module.
905  * The fmd_case_convict() function is called with locks held inside of fmd and
906  * is not a place where unbounded blocking on some inter-process or inter-
907  * system communication to another service (e.g. another daemon) can occur.
908  */
909 static int
910 fmd_case_convict(fmd_case_t *cp)
911 {
912 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
913 	fmd_asru_hash_t *ahp = fmd.d_asrus;
914 	int discard_new = 0, i;
915 	fmd_case_susp_t *cis;
916 	fmd_asru_link_t *alp;
917 	uint8_t *new_susp_state;
918 	uint8_t *new_match_state;
919 	int adjust_new = 0;
920 	fccd_t fccd;
921 	fmd_case_impl_t *ncp, **cps, **cpp;
922 	uint_t cpc;
923 	fmd_case_hash_t *chp;
924 
925 	/*
926 	 * First we must see if any matching cases already exist.
927 	 */
928 	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
929 	for (i = 0; i < cip->ci_nsuspects; i++)
930 		new_susp_state[i] = 0;
931 	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
932 	for (i = 0; i < cip->ci_nsuspects; i++)
933 		new_match_state[i] = 0;
934 	fccd.fccd_cip = cip;
935 	fccd.fccd_adjust_new = &adjust_new;
936 	fccd.fccd_new_susp_state = new_susp_state;
937 	fccd.fccd_new_match_state = new_match_state;
938 	fccd.fccd_discard_new = &discard_new;
939 
940 	/*
941 	 * Hold all cases
942 	 */
943 	chp = fmd.d_cases;
944 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
945 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
946 	cpc = chp->ch_count;
947 	for (i = 0; i < chp->ch_hashlen; i++)
948 		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
949 			*cpp++ = fmd_case_tryhold(ncp);
950 	ASSERT(cpp == cps + cpc);
951 	(void) pthread_rwlock_unlock(&chp->ch_lock);
952 
953 	/*
954 	 * Run fmd_case_check_for_dups() on all cases except the current one.
955 	 */
956 	for (i = 0; i < cpc; i++) {
957 		if (cps[i] != NULL) {
958 			if (cps[i] != (fmd_case_impl_t *)cp)
959 				fmd_case_check_for_dups((fmd_case_t *)cps[i],
960 				    &fccd);
961 			fmd_case_rele((fmd_case_t *)cps[i]);
962 		}
963 	}
964 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
965 
966 	(void) pthread_mutex_lock(&cip->ci_lock);
967 	if (cip->ci_code == NULL)
968 		(void) fmd_case_mkcode(cp);
969 	else if (cip->ci_precanned)
970 		fmd_case_code_hash_insert(fmd.d_cases, cip);
971 
972 	if (discard_new) {
973 		/*
974 		 * We've found an existing case that is a match and it is not
975 		 * already in repaired or resolved state. So we can close this
976 		 * one as a duplicate.
977 		 */
978 		(void) pthread_mutex_unlock(&cip->ci_lock);
979 		return (1);
980 	}
981 
982 	/*
983 	 * Allocate new cache entries
984 	 */
985 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
986 		if ((alp = fmd_asru_hash_create_entry(ahp,
987 		    cp, cis->cis_nvl)) == NULL) {
988 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
989 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
990 			continue;
991 		}
992 		alp->al_flags |= FMD_ASRU_PRESENT;
993 		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
994 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
995 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
996 	}
997 
998 	if (adjust_new) {
999 		int some_suspect = 0, some_not_suspect = 0;
1000 
1001 		/*
1002 		 * There is one or more matching case but they are already in
1003 		 * repaired or resolved state. So we need to keep the new
1004 		 * case, but we can adjust it. Repaired/removed/replaced
1005 		 * suspects are unlikely to be to blame (unless there are
1006 		 * actually two separate faults). So if we have a combination of
1007 		 * repaired/replaced/removed suspects and acquitted suspects in
1008 		 * the old lists, then we should acquit in the new list those
1009 		 * that were repaired/replaced/removed in the old.
1010 		 */
1011 		for (i = 0; i < cip->ci_nsuspects; i++) {
1012 			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
1013 			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
1014 			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
1015 			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
1016 				some_not_suspect = 1;
1017 			else
1018 				some_suspect = 1;
1019 		}
1020 		if (some_suspect && some_not_suspect) {
1021 			for (cis = cip->ci_suspects, i = 0; cis != NULL;
1022 			    cis = cis->cis_next, i++)
1023 				if ((new_susp_state[i] &
1024 				    SUSPECT_STATE_REPLACED) ||
1025 				    (new_susp_state[i] &
1026 				    SUSPECT_STATE_REPAIRED) ||
1027 				    (new_susp_state[i] &
1028 				    SUSPECT_STATE_REMOVED) ||
1029 				    (new_match_state[i] &
1030 				    SUSPECT_STATE_NO_MATCH))
1031 					fmd_asru_hash_apply_by_case(fmd.d_asrus,
1032 					    cp, fmd_case_acquit_suspect,
1033 					    cis->cis_nvl);
1034 		}
1035 	}
1036 
1037 	(void) pthread_mutex_unlock(&cip->ci_lock);
1038 	return (0);
1039 }
1040 
1041 void
1042 fmd_case_publish(fmd_case_t *cp, uint_t state)
1043 {
1044 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1045 	fmd_event_t *e;
1046 	nvlist_t *nvl;
1047 	char *class;
1048 
1049 	if (state == FMD_CASE_CURRENT)
1050 		state = cip->ci_state; /* use current state */
1051 
1052 	switch (state) {
1053 	case FMD_CASE_SOLVED:
1054 		(void) pthread_mutex_lock(&cip->ci_lock);
1055 
1056 		/*
1057 		 * If we already have a code, then case is already solved.
1058 		 */
1059 		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
1060 		    cip->ci_code != NULL) {
1061 			(void) pthread_mutex_unlock(&cip->ci_lock);
1062 			break;
1063 		}
1064 
1065 		if (cip->ci_tv_valid == 0) {
1066 			fmd_time_gettimeofday(&cip->ci_tv);
1067 			cip->ci_tv_valid = 1;
1068 		}
1069 		(void) pthread_mutex_unlock(&cip->ci_lock);
1070 
1071 		if (fmd_case_convict(cp) == 1) { /* dupclose */
1072 			cip->ci_flags &= ~FMD_CF_SOLVED;
1073 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
1074 			break;
1075 		}
1076 		if (cip->ci_xprt != NULL) {
1077 			/*
1078 			 * For proxy, save some information about the transport
1079 			 * in the resource cache.
1080 			 */
1081 			int count = 0;
1082 			fmd_asru_set_on_proxy_t fasp;
1083 			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
1084 
1085 			fasp.fasp_countp = &count;
1086 			fasp.fasp_maxcount = cip->ci_nsuspects;
1087 			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
1088 			fasp.fasp_proxy_external = xip->xi_flags &
1089 			    FMD_XPRT_EXTERNAL;
1090 			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
1091 			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
1092 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1093 			    fmd_asru_set_on_proxy, &fasp);
1094 		}
1095 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
1096 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1097 
1098 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1099 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1100 		fmd_log_append(fmd.d_fltlog, e, cp);
1101 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1102 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1103 
1104 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1105 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
1106 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1107 
1108 		break;
1109 
1110 	case FMD_CASE_CLOSE_WAIT:
1111 		fmd_case_hold(cp);
1112 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
1113 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1114 
1115 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1116 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
1117 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1118 
1119 		break;
1120 
1121 	case FMD_CASE_CLOSED:
1122 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
1123 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1124 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1125 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1126 		break;
1127 
1128 	case FMD_CASE_REPAIRED:
1129 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1130 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1131 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1132 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1133 		fmd_log_append(fmd.d_fltlog, e, cp);
1134 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1135 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1136 		break;
1137 
1138 	case FMD_CASE_RESOLVED:
1139 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
1140 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1141 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1142 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1143 		fmd_log_append(fmd.d_fltlog, e, cp);
1144 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1145 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1146 		break;
1147 	}
1148 }
1149 
1150 fmd_case_t *
1151 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1152 {
1153 	fmd_case_impl_t *cip;
1154 	uint_t h;
1155 
1156 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
1157 	h = fmd_strhash(uuid) % chp->ch_hashlen;
1158 
1159 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1160 		if (strcmp(cip->ci_uuid, uuid) == 0)
1161 			break;
1162 	}
1163 
1164 	/*
1165 	 * If deleting bit is set, treat the case as if it doesn't exist.
1166 	 */
1167 	if (cip != NULL)
1168 		cip = fmd_case_tryhold(cip);
1169 
1170 	if (cip == NULL)
1171 		(void) fmd_set_errno(EFMD_CASE_INVAL);
1172 
1173 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1174 	return ((fmd_case_t *)cip);
1175 }
1176 
1177 static fmd_case_impl_t *
1178 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1179 {
1180 	fmd_case_impl_t *eip;
1181 	uint_t h;
1182 
1183 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1184 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1185 
1186 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1187 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1188 		    fmd_case_tryhold(eip) != NULL) {
1189 			(void) pthread_rwlock_unlock(&chp->ch_lock);
1190 			return (eip); /* uuid already present */
1191 		}
1192 	}
1193 
1194 	cip->ci_next = chp->ch_hash[h];
1195 	chp->ch_hash[h] = cip;
1196 
1197 	chp->ch_count++;
1198 	ASSERT(chp->ch_count != 0);
1199 
1200 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1201 	return (cip);
1202 }
1203 
1204 static void
1205 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1206 {
1207 	fmd_case_impl_t *cp, **pp;
1208 	uint_t h;
1209 
1210 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1211 
1212 	cip->ci_flags |= FMD_CF_DELETING;
1213 	(void) pthread_mutex_unlock(&cip->ci_lock);
1214 
1215 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1216 
1217 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1218 	pp = &chp->ch_hash[h];
1219 
1220 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1221 		if (cp != cip)
1222 			pp = &cp->ci_next;
1223 		else
1224 			break;
1225 	}
1226 
1227 	if (cp == NULL) {
1228 		fmd_panic("case %p (%s) not found on hash chain %u\n",
1229 		    (void *)cip, cip->ci_uuid, h);
1230 	}
1231 
1232 	*pp = cp->ci_next;
1233 	cp->ci_next = NULL;
1234 
1235 	/*
1236 	 * delete from code hash if it is on it
1237 	 */
1238 	fmd_case_code_hash_delete(chp, cip);
1239 
1240 	ASSERT(chp->ch_count != 0);
1241 	chp->ch_count--;
1242 
1243 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1244 
1245 	(void) pthread_mutex_lock(&cip->ci_lock);
1246 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
1247 }
1248 
1249 fmd_case_t *
1250 fmd_case_create(fmd_module_t *mp, void *data)
1251 {
1252 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1253 	fmd_case_impl_t *eip = NULL;
1254 	uuid_t uuid;
1255 
1256 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1257 	fmd_buf_hash_create(&cip->ci_bufs);
1258 
1259 	fmd_module_hold(mp);
1260 	cip->ci_mod = mp;
1261 	cip->ci_refs = 1;
1262 	cip->ci_state = FMD_CASE_UNSOLVED;
1263 	cip->ci_flags = FMD_CF_DIRTY;
1264 	cip->ci_data = data;
1265 
1266 	/*
1267 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
1268 	 * define any constant for the length of an unparse string, and do not
1269 	 * permit the caller to specify a buffer length for safety.  The spec
1270 	 * says it will be 36 bytes, but we make it tunable just in case.
1271 	 */
1272 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1273 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1274 
1275 	/*
1276 	 * We expect this loop to execute only once, but code it defensively
1277 	 * against the possibility of libuuid bugs.  Keep generating uuids and
1278 	 * attempting to do a hash insert until we get a unique one.
1279 	 */
1280 	do {
1281 		if (eip != NULL)
1282 			fmd_case_rele((fmd_case_t *)eip);
1283 		uuid_generate(uuid);
1284 		uuid_unparse(uuid, cip->ci_uuid);
1285 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1286 
1287 	ASSERT(fmd_module_locked(mp));
1288 	fmd_list_append(&mp->mod_cases, cip);
1289 	fmd_module_setcdirty(mp);
1290 
1291 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1292 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1293 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1294 
1295 	return ((fmd_case_t *)cip);
1296 }
1297 
1298 static void
1299 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1300 {
1301 	fmd_case_susp_t *cis, *ncis;
1302 
1303 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1304 
1305 	if (cip->ci_proxy_asru)
1306 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1307 		    cip->ci_nsuspects);
1308 	if (cip->ci_diag_de)
1309 		nvlist_free(cip->ci_diag_de);
1310 	if (cip->ci_diag_asru)
1311 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1312 		    cip->ci_nsuspects);
1313 
1314 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1315 		ncis = cis->cis_next;
1316 		nvlist_free(cis->cis_nvl);
1317 		fmd_free(cis, sizeof (fmd_case_susp_t));
1318 	}
1319 
1320 	cip->ci_suspects = NULL;
1321 	cip->ci_nsuspects = 0;
1322 }
1323 
/*
 * Recreate a case with a caller-supplied uuid, state and code on behalf of
 * module 'mp', optionally associated with transport 'xp'.
 *
 * If the uuid is not yet in the case hash, a new case is built, entered in
 * the hash (and in the code hash if it has a code) and returned.  If the uuid
 * is already present, the freshly built case is thrown away and one of three
 * things happens to the existing case:
 *
 * - mp is the root module (fmd.d_rmod): the existing case is returned, with
 *   at most its code and state adjusted;
 * - the existing case is not owned by the root module, or xp is non-NULL:
 *   this is a uuid conflict and NULL is returned;
 * - otherwise the existing (orphaned) case is moved from the root module to
 *   'mp' and returned.
 *
 * Whenever the case ends up being added to mp's case list, 'mp' is expected
 * to be locked by the caller (asserted below).
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	/*
	 * NOTE(review): 'code' may be NULL (it is tested against NULL further
	 * down); this presumes fmd_strdup() passes a NULL through, which the
	 * codelen computation allows for -- confirm against fmd_strdup().
	 */
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	/* any state beyond CLOSE_WAIT is recorded as having been solved */
	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The new case never made it into the hash, so drop its only
		 * reference and destroy it without a hash removal (B_FALSE).
		 * fmd_case_destroy() requires ci_lock held and refs == 0.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		/*
		 * 'eip' came back from the hash insert with a hold on it;
		 * every path below drops that hold with fmd_case_rele().
		 */
		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			/*
			 * In case the case has already been created from
			 * a checkpoint file we need to set up code now.
			 */
			if (cip->ci_state < FMD_CASE_CLOSED) {
				if (code != NULL && cip->ci_code == NULL) {
					cip->ci_code = fmd_strdup(code,
					    FMD_SLEEP);
					cip->ci_codelen = cip->ci_code ?
					    strlen(cip->ci_code) + 1 : 0;
					fmd_case_code_hash_insert(fmd.d_cases,
					    cip);
				}
			}

			/*
			 * When recreating an orphan case, state passed in may
			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
			 * any suspects are still CLOSED (faulty) then the
			 * overall state needs to be CLOSED.
			 */
			if ((cip->ci_state == FMD_CASE_REPAIRED ||
			    cip->ci_state == FMD_CASE_RESOLVED) &&
			    state == FMD_CASE_CLOSED)
				cip->ci_state = FMD_CASE_CLOSED;
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* swap the case's module hold from the old owner to 'mp' */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		/*
		 * It's possible that fmd crashed or was restarted during a
		 * previous solve operation between the asru cache being created
		 * and the ckpt file being updated to SOLVED. Thus when the DE
		 * recreates the case here from the checkpoint file, the state
		 * will be UNSOLVED and yet we are having to reclaim because
		 * the case was in the asru cache. If this happens, revert the
		 * case back to the UNSOLVED state and let the DE solve it again
		 */
		if (state == FMD_CASE_UNSOLVED) {
			fmd_asru_hash_delete_case(fmd.d_asrus,
			    (fmd_case_t *)cip);
			fmd_case_destroy_suspects(cip);
			fmd_case_code_hash_delete(fmd.d_cases, cip);
			fmd_free(cip->ci_code, cip->ci_codelen);
			cip->ci_code = NULL;
			cip->ci_codelen = 0;
			cip->ci_tv_valid = 0;
		}

		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	/*
	 * Reached for a brand new case and for a reclaimed orphan: link the
	 * case onto the new owner's list and count it as open.
	 */
	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
1459 
1460 void
1461 fmd_case_destroy(fmd_case_t *cp, int visible)
1462 {
1463 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1464 	fmd_case_item_t *cit, *ncit;
1465 
1466 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1467 	ASSERT(cip->ci_refs == 0);
1468 
1469 	if (visible) {
1470 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1471 		fmd_case_hash_delete(fmd.d_cases, cip);
1472 	}
1473 
1474 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1475 		ncit = cit->cit_next;
1476 		fmd_event_rele(cit->cit_event);
1477 		fmd_free(cit, sizeof (fmd_case_item_t));
1478 	}
1479 
1480 	fmd_case_destroy_suspects(cip);
1481 
1482 	if (cip->ci_principal != NULL)
1483 		fmd_event_rele(cip->ci_principal);
1484 
1485 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1486 	fmd_free(cip->ci_code, cip->ci_codelen);
1487 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1488 
1489 	fmd_module_rele(cip->ci_mod);
1490 	fmd_free(cip, sizeof (fmd_case_impl_t));
1491 }
1492 
1493 void
1494 fmd_case_hold(fmd_case_t *cp)
1495 {
1496 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1497 
1498 	(void) pthread_mutex_lock(&cip->ci_lock);
1499 	fmd_case_hold_locked(cp);
1500 	(void) pthread_mutex_unlock(&cip->ci_lock);
1501 }
1502 
1503 void
1504 fmd_case_hold_locked(fmd_case_t *cp)
1505 {
1506 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1507 
1508 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1509 	if (cip->ci_flags & FMD_CF_DELETING)
1510 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1511 		    (void *)cip, cip->ci_uuid);
1512 	cip->ci_refs++;
1513 	ASSERT(cip->ci_refs != 0);
1514 }
1515 
1516 static fmd_case_impl_t *
1517 fmd_case_tryhold(fmd_case_impl_t *cip)
1518 {
1519 	/*
1520 	 * If the case's "deleting" bit is unset, hold and return case,
1521 	 * otherwise, return NULL.
1522 	 */
1523 	(void) pthread_mutex_lock(&cip->ci_lock);
1524 	if (cip->ci_flags & FMD_CF_DELETING) {
1525 		(void) pthread_mutex_unlock(&cip->ci_lock);
1526 		cip = NULL;
1527 	} else {
1528 		fmd_case_hold_locked((fmd_case_t *)cip);
1529 		(void) pthread_mutex_unlock(&cip->ci_lock);
1530 	}
1531 	return (cip);
1532 }
1533 
1534 void
1535 fmd_case_rele(fmd_case_t *cp)
1536 {
1537 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1538 
1539 	(void) pthread_mutex_lock(&cip->ci_lock);
1540 	ASSERT(cip->ci_refs != 0);
1541 
1542 	if (--cip->ci_refs == 0)
1543 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1544 	else
1545 		(void) pthread_mutex_unlock(&cip->ci_lock);
1546 }
1547 
1548 void
1549 fmd_case_rele_locked(fmd_case_t *cp)
1550 {
1551 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1552 
1553 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1554 	--cip->ci_refs;
1555 	ASSERT(cip->ci_refs != 0);
1556 }
1557 
1558 int
1559 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1560 {
1561 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1562 	fmd_case_item_t *cit;
1563 	fmd_event_t *oep;
1564 	uint_t state;
1565 	int new;
1566 
1567 	fmd_event_hold(ep);
1568 	(void) pthread_mutex_lock(&cip->ci_lock);
1569 
1570 	if (cip->ci_flags & FMD_CF_SOLVED)
1571 		state = FMD_EVS_DIAGNOSED;
1572 	else
1573 		state = FMD_EVS_ACCEPTED;
1574 
1575 	oep = cip->ci_principal;
1576 	cip->ci_principal = ep;
1577 
1578 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1579 		if (cit->cit_event == ep)
1580 			break;
1581 	}
1582 
1583 	cip->ci_flags |= FMD_CF_DIRTY;
1584 	new = cit == NULL && ep != oep;
1585 
1586 	(void) pthread_mutex_unlock(&cip->ci_lock);
1587 
1588 	fmd_module_setcdirty(cip->ci_mod);
1589 	fmd_event_transition(ep, state);
1590 
1591 	if (oep != NULL)
1592 		fmd_event_rele(oep);
1593 
1594 	return (new);
1595 }
1596 
1597 int
1598 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1599 {
1600 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1601 	fmd_case_item_t *cit;
1602 	uint_t state;
1603 	int new;
1604 	boolean_t injected;
1605 
1606 	(void) pthread_mutex_lock(&cip->ci_lock);
1607 
1608 	if (cip->ci_flags & FMD_CF_SOLVED)
1609 		state = FMD_EVS_DIAGNOSED;
1610 	else
1611 		state = FMD_EVS_ACCEPTED;
1612 
1613 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1614 		if (cit->cit_event == ep)
1615 			break;
1616 	}
1617 
1618 	new = cit == NULL && ep != cip->ci_principal;
1619 
1620 	/*
1621 	 * If the event is already in the case or the case is already solved,
1622 	 * there is no reason to save it: just transition it appropriately.
1623 	 */
1624 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1625 		(void) pthread_mutex_unlock(&cip->ci_lock);
1626 		fmd_event_transition(ep, state);
1627 		return (new);
1628 	}
1629 
1630 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1631 	fmd_event_hold(ep);
1632 
1633 	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1634 	    "__injected", &injected) == 0 && injected)
1635 		fmd_case_set_injected(cp);
1636 
1637 	cit->cit_next = cip->ci_items;
1638 	cit->cit_event = ep;
1639 
1640 	cip->ci_items = cit;
1641 	cip->ci_nitems++;
1642 
1643 	cip->ci_flags |= FMD_CF_DIRTY;
1644 	(void) pthread_mutex_unlock(&cip->ci_lock);
1645 
1646 	fmd_module_setcdirty(cip->ci_mod);
1647 	fmd_event_transition(ep, state);
1648 
1649 	return (new);
1650 }
1651 
1652 void
1653 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1654 {
1655 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1656 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1657 
1658 	(void) pthread_mutex_lock(&cip->ci_lock);
1659 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1660 	cip->ci_flags |= FMD_CF_DIRTY;
1661 
1662 	cis->cis_next = cip->ci_suspects;
1663 	cis->cis_nvl = nvl;
1664 
1665 	cip->ci_suspects = cis;
1666 	cip->ci_nsuspects++;
1667 
1668 	(void) pthread_mutex_unlock(&cip->ci_lock);
1669 	if (cip->ci_xprt == NULL)
1670 		fmd_module_setcdirty(cip->ci_mod);
1671 }
1672 
1673 void
1674 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1675 {
1676 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1677 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1678 	boolean_t b;
1679 
1680 	(void) pthread_mutex_lock(&cip->ci_lock);
1681 
1682 	cis->cis_next = cip->ci_suspects;
1683 	cis->cis_nvl = nvl;
1684 
1685 	if (nvlist_lookup_boolean_value(nvl,
1686 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1687 		cip->ci_flags |= FMD_CF_INVISIBLE;
1688 
1689 	cip->ci_suspects = cis;
1690 	cip->ci_nsuspects++;
1691 
1692 	(void) pthread_mutex_unlock(&cip->ci_lock);
1693 }
1694 
1695 void
1696 fmd_case_reset_suspects(fmd_case_t *cp)
1697 {
1698 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1699 
1700 	(void) pthread_mutex_lock(&cip->ci_lock);
1701 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1702 
1703 	fmd_case_destroy_suspects(cip);
1704 	cip->ci_flags |= FMD_CF_DIRTY;
1705 
1706 	(void) pthread_mutex_unlock(&cip->ci_lock);
1707 	fmd_module_setcdirty(cip->ci_mod);
1708 }
1709 
/*
 * Callback for fmd_asru_hash_apply_by_case(): set FMD_ASRU_UNUSABLE on the
 * given resource cache link.  'arg' is not used.
 */
/*ARGSUSED*/
static void
fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
{
	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
}
1716 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'flags' is OR-ed into ci_flags even when no state change takes place.  A
 * request for a state that is not beyond the current one returns after that
 * without doing anything else: the state only ever moves forward here.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/*
	 * ISOLATED, REPAIRED and RESOLVED are only recorded for a case that
	 * is, or is now being marked, SOLVED; otherwise they are dropped.
	 */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* proxied and root-module cases do not dirty the module */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* every event in the case, and the principal, is diagnosed */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			if (cip->ci_xprt != NULL)
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;
			}
		}

		/*
		 * Going straight on to RESOLVED.  'state' is still REPAIRED
		 * for the publish call just below, which is made with ci_lock
		 * dropped; it is then advanced to RESOLVED so that the common
		 * code after the switch handles the RESOLVED state.
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			/*
			 * Nothing is published on this path; note that
			 * ci_state has already been set to RESOLVED above.
			 */
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		if (cip->ci_xprt != NULL) {
			/*
			 * If we transitioned to RESOLVED, adjust the reference
			 * count to reflect our removal from the owning
			 * module's mod_cases list above.  If the caller has
			 * not placed an additional hold on the case, it will
			 * now be freed.
			 */
			(void) pthread_mutex_lock(&cip->ci_lock);
			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele(cp);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_asru_log_resolved, NULL);
			(void) pthread_mutex_lock(&cip->ci_lock);
			/* mark as "ready to be discarded" */
			cip->ci_flags |= FMD_CF_RES_CMPL;
			(void) pthread_mutex_unlock(&cip->ci_lock);
		}
	}
}
1894 
1895 /*
1896  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1897  * is set if all suspects have passed the rsrc.aged time).
1898  */
1899 void
1900 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1901 {
1902 	int check_if_aged = *(int *)arg;
1903 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1904 
1905 	/*
1906 	 * First check if case has completed transition to resolved.
1907 	 */
1908 	(void) pthread_mutex_lock(&cip->ci_lock);
1909 	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1910 		(void) pthread_mutex_unlock(&cip->ci_lock);
1911 		return;
1912 	}
1913 
1914 	/*
1915 	 * Now if check_is_aged is set, see if all suspects have aged.
1916 	 */
1917 	if (check_if_aged) {
1918 		int aged = 1;
1919 
1920 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1921 		    fmd_asru_check_if_aged, &aged);
1922 		if (!aged) {
1923 			(void) pthread_mutex_unlock(&cip->ci_lock);
1924 			return;
1925 		}
1926 	}
1927 
1928 	/*
1929 	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1930 	 * do it twice.
1931 	 */
1932 	fmd_module_lock(cip->ci_mod);
1933 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1934 	fmd_module_unlock(cip->ci_mod);
1935 	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1936 	cip->ci_flags &= ~FMD_CF_RES_CMPL;
1937 	(void) pthread_mutex_unlock(&cip->ci_lock);
1938 	fmd_case_rele(cp);
1939 }
1940 
1941 /*
1942  * Transition the specified case to *at least* the specified state by first
1943  * re-validating the suspect list using the resource cache.  This function is
1944  * employed by the checkpoint code when restoring a saved, solved case to see
1945  * if the state of the case has effectively changed while fmd was not running
1946  * or the module was not loaded.
1947  */
1948 void
1949 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1950 {
1951 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1952 
1953 	int usable = 0;		/* are any suspects usable? */
1954 
1955 	ASSERT(state >= FMD_CASE_SOLVED);
1956 	(void) pthread_mutex_lock(&cip->ci_lock);
1957 
1958 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1959 
1960 	(void) pthread_mutex_unlock(&cip->ci_lock);
1961 
1962 	if (!usable) {
1963 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1964 		flags |= FMD_CF_ISOLATED;
1965 	}
1966 
1967 	fmd_case_transition(cp, state, flags);
1968 }
1969 
1970 void
1971 fmd_case_setdirty(fmd_case_t *cp)
1972 {
1973 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1974 
1975 	(void) pthread_mutex_lock(&cip->ci_lock);
1976 	cip->ci_flags |= FMD_CF_DIRTY;
1977 	(void) pthread_mutex_unlock(&cip->ci_lock);
1978 
1979 	fmd_module_setcdirty(cip->ci_mod);
1980 }
1981 
1982 void
1983 fmd_case_clrdirty(fmd_case_t *cp)
1984 {
1985 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1986 
1987 	(void) pthread_mutex_lock(&cip->ci_lock);
1988 	cip->ci_flags &= ~FMD_CF_DIRTY;
1989 	(void) pthread_mutex_unlock(&cip->ci_lock);
1990 }
1991 
1992 void
1993 fmd_case_commit(fmd_case_t *cp)
1994 {
1995 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1996 	fmd_case_item_t *cit;
1997 
1998 	(void) pthread_mutex_lock(&cip->ci_lock);
1999 
2000 	if (cip->ci_flags & FMD_CF_DIRTY) {
2001 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2002 			fmd_event_commit(cit->cit_event);
2003 
2004 		if (cip->ci_principal != NULL)
2005 			fmd_event_commit(cip->ci_principal);
2006 
2007 		fmd_buf_hash_commit(&cip->ci_bufs);
2008 		cip->ci_flags &= ~FMD_CF_DIRTY;
2009 	}
2010 
2011 	(void) pthread_mutex_unlock(&cip->ci_lock);
2012 }
2013 
2014 /*
2015  * On proxy side, send back repair/acquit/etc request to diagnosing side
2016  */
2017 void
2018 fmd_case_xprt_updated(fmd_case_t *cp)
2019 {
2020 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2021 	nvlist_t **nva;
2022 	uint8_t *ba;
2023 	int msg = B_TRUE;
2024 	int count = 0;
2025 	fmd_case_lst_t fcl;
2026 
2027 	ASSERT(cip->ci_xprt != NULL);
2028 	(void) pthread_mutex_lock(&cip->ci_lock);
2029 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2030 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2031 	fcl.fcl_countp = &count;
2032 	fcl.fcl_maxcount = cip->ci_nsuspects;
2033 	fcl.fcl_msgp = &msg;
2034 	fcl.fcl_ba = ba;
2035 	fcl.fcl_nva = nva;
2036 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2037 	(void) pthread_mutex_unlock(&cip->ci_lock);
2038 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2039 	    count);
2040 }
2041 
2042 /*
2043  * fmd_case_update_status() can be called on either the proxy side when a
2044  * list.suspect is received, or on the diagnosing side when an update request
2045  * is received from the proxy. It updates the status in the resource cache.
2046  */
2047 void
2048 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2049     uint8_t *diag_asrup)
2050 {
2051 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2052 	int count = 0;
2053 	fmd_asru_update_status_t faus;
2054 
2055 	/*
2056 	 * update status of resource cache entries
2057 	 */
2058 	faus.faus_countp = &count;
2059 	faus.faus_maxcount = cip->ci_nsuspects;
2060 	faus.faus_ba = statusp;
2061 	faus.faus_proxy_asru = proxy_asrup;
2062 	faus.faus_diag_asru = diag_asrup;
2063 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
2064 	(void) pthread_mutex_lock(&cip->ci_lock);
2065 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2066 	    &faus);
2067 	(void) pthread_mutex_unlock(&cip->ci_lock);
2068 }
2069 
2070 /*
2071  * Called on either the proxy side or the diag side when a repair has taken
2072  * place on the other side but this side may know the asru "contains"
2073  * relationships.
2074  */
2075 void
2076 fmd_case_update_containees(fmd_case_t *cp)
2077 {
2078 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2079 
2080 	(void) pthread_mutex_lock(&cip->ci_lock);
2081 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2082 	    fmd_asru_update_containees, NULL);
2083 	(void) pthread_mutex_unlock(&cip->ci_lock);
2084 }
2085 
2086 /*
2087  * fmd_case_close_status() is called on diagnosing side when proxy side
2088  * has had a uuclose. It updates the status in the resource cache.
2089  */
2090 void
2091 fmd_case_close_status(fmd_case_t *cp)
2092 {
2093 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2094 	int count = 0;
2095 	fmd_asru_close_status_t facs;
2096 
2097 	/*
2098 	 * update status of resource cache entries
2099 	 */
2100 	facs.facs_countp = &count;
2101 	facs.facs_maxcount = cip->ci_nsuspects;
2102 	(void) pthread_mutex_lock(&cip->ci_lock);
2103 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2104 	    &facs);
2105 	(void) pthread_mutex_unlock(&cip->ci_lock);
2106 }
2107 
2108 /*
2109  * Indicate that the case may need to change state because one or more of the
2110  * ASRUs named as a suspect has changed state.  We examine all the suspects
2111  * and if none are still faulty, we initiate a case close transition.
2112  */
2113 void
2114 fmd_case_update(fmd_case_t *cp)
2115 {
2116 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2117 	uint_t cstate;
2118 	int faulty = 0;
2119 
2120 	(void) pthread_mutex_lock(&cip->ci_lock);
2121 	cstate = cip->ci_state;
2122 
2123 	if (cip->ci_state < FMD_CASE_SOLVED) {
2124 		(void) pthread_mutex_unlock(&cip->ci_lock);
2125 		return; /* update is not appropriate */
2126 	}
2127 
2128 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2129 		(void) pthread_mutex_unlock(&cip->ci_lock);
2130 		return; /* already repaired */
2131 	}
2132 
2133 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2134 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2135 	(void) pthread_mutex_unlock(&cip->ci_lock);
2136 
2137 	if (faulty) {
2138 		nvlist_t *nvl;
2139 		fmd_event_t *e;
2140 		char *class;
2141 
2142 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2143 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2144 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2145 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2146 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2147 		fmd_log_append(fmd.d_fltlog, e, cp);
2148 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
2149 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2150 		return; /* one or more suspects are still marked faulty */
2151 	}
2152 
2153 	if (cstate == FMD_CASE_CLOSED)
2154 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2155 	else
2156 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2157 }
2158 
2159 /*
2160  * Delete a closed case from the module's case list once the fmdo_close() entry
2161  * point has run to completion.  If the case is owned by a transport module,
2162  * tell the transport to proxy a case close on the other end of the transport.
2163  * Transition to the appropriate next state based on ci_flags.  This
2164  * function represents the end of CLOSE_WAIT and transitions the case to either
2165  * CLOSED or REPAIRED or discards it entirely because it was never solved;
2166  * refer to the topmost block comment explaining the state machine for details.
2167  */
2168 void
2169 fmd_case_delete(fmd_case_t *cp)
2170 {
2171 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2172 	fmd_modstat_t *msp;
2173 	size_t buftotal;
2174 
2175 	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2176 	ASSERT(fmd_module_locked(cip->ci_mod));
2177 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2178 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2179 
2180 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2181 	msp = cip->ci_mod->mod_stats;
2182 
2183 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2184 	msp->ms_caseopen.fmds_value.ui64--;
2185 
2186 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2187 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2188 
2189 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2190 
2191 	if (cip->ci_xprt == NULL)
2192 		fmd_module_setcdirty(cip->ci_mod);
2193 
2194 	fmd_module_rele(cip->ci_mod);
2195 	cip->ci_mod = fmd.d_rmod;
2196 	fmd_module_hold(cip->ci_mod);
2197 
2198 	/*
2199 	 * If the case has been solved, then retain it
2200 	 * on the root module's case list at least until we're transitioned.
2201 	 * Otherwise free the case with our final fmd_case_rele() below.
2202 	 */
2203 	if (cip->ci_flags & FMD_CF_SOLVED) {
2204 		fmd_module_lock(cip->ci_mod);
2205 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
2206 		fmd_module_unlock(cip->ci_mod);
2207 		fmd_case_hold(cp);
2208 	}
2209 
2210 	/*
2211 	 * Transition onwards to REPAIRED or CLOSED as originally requested.
2212 	 * Note that for proxy case if we're transitioning to CLOSED it means
2213 	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2214 	 * the diagnosing side. No need to notify the diagnosing side if we are
2215 	 * transitioning to REPAIRED as we only do this when requested to do
2216 	 * so by the diagnosing side anyway.
2217 	 */
2218 	if (cip->ci_flags & FMD_CF_REPAIRED)
2219 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2220 	else if (cip->ci_flags & FMD_CF_ISOLATED) {
2221 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2222 		if (cip->ci_xprt != NULL)
2223 			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2224 	}
2225 
2226 	fmd_case_rele(cp);
2227 }
2228 
2229 void
2230 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2231 {
2232 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2233 
2234 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2235 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2236 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2237 
2238 	ASSERT(fmd_module_locked(cip->ci_mod));
2239 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2240 	if (delete_from_asru_cache) {
2241 		(void) pthread_mutex_lock(&cip->ci_lock);
2242 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2243 		(void) pthread_mutex_unlock(&cip->ci_lock);
2244 	}
2245 	fmd_case_rele(cp);
2246 }
2247 
2248 /*
2249  * Indicate that the problem corresponding to a case has been repaired by
2250  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2251  * already been closed, this function initiates the transition to CLOSE_WAIT.
2252  * The caller must have the case held from fmd_case_hash_lookup(), so we can
2253  * grab and drop ci_lock without the case being able to be freed in between.
2254  */
2255 int
2256 fmd_case_repair(fmd_case_t *cp)
2257 {
2258 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2259 	uint_t cstate;
2260 	fmd_asru_rep_arg_t fara;
2261 
2262 	(void) pthread_mutex_lock(&cip->ci_lock);
2263 	cstate = cip->ci_state;
2264 
2265 	if (cstate < FMD_CASE_SOLVED) {
2266 		(void) pthread_mutex_unlock(&cip->ci_lock);
2267 		return (fmd_set_errno(EFMD_CASE_STATE));
2268 	}
2269 
2270 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2271 		(void) pthread_mutex_unlock(&cip->ci_lock);
2272 		return (0); /* already repaired */
2273 	}
2274 
2275 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2276 	fara.fara_reason = FMD_ASRU_REPAIRED;
2277 	fara.fara_bywhat = FARA_BY_CASE;
2278 	fara.fara_rval = NULL;
2279 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2280 	(void) pthread_mutex_unlock(&cip->ci_lock);
2281 
2282 	/*
2283 	 * if this is a proxied case, send the repair across the transport.
2284 	 * The remote side will then do the repair and send a list.repaired back
2285 	 * again such that we can finally repair the case on this side.
2286 	 */
2287 	if (cip->ci_xprt != NULL) {
2288 		fmd_case_xprt_updated(cp);
2289 		return (0);
2290 	}
2291 
2292 	if (cstate == FMD_CASE_CLOSED)
2293 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2294 	else
2295 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2296 
2297 	return (0);
2298 }
2299 
2300 int
2301 fmd_case_acquit(fmd_case_t *cp)
2302 {
2303 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2304 	uint_t cstate;
2305 	fmd_asru_rep_arg_t fara;
2306 
2307 	(void) pthread_mutex_lock(&cip->ci_lock);
2308 	cstate = cip->ci_state;
2309 
2310 	if (cstate < FMD_CASE_SOLVED) {
2311 		(void) pthread_mutex_unlock(&cip->ci_lock);
2312 		return (fmd_set_errno(EFMD_CASE_STATE));
2313 	}
2314 
2315 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2316 		(void) pthread_mutex_unlock(&cip->ci_lock);
2317 		return (0); /* already repaired */
2318 	}
2319 
2320 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2321 	fara.fara_reason = FMD_ASRU_ACQUITTED;
2322 	fara.fara_bywhat = FARA_BY_CASE;
2323 	fara.fara_rval = NULL;
2324 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2325 	(void) pthread_mutex_unlock(&cip->ci_lock);
2326 
2327 	/*
2328 	 * if this is a proxied case, send the repair across the transport.
2329 	 * The remote side will then do the repair and send a list.repaired back
2330 	 * again such that we can finally repair the case on this side.
2331 	 */
2332 	if (cip->ci_xprt != NULL) {
2333 		fmd_case_xprt_updated(cp);
2334 		return (0);
2335 	}
2336 
2337 	if (cstate == FMD_CASE_CLOSED)
2338 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2339 	else
2340 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2341 
2342 	return (0);
2343 }
2344 
2345 int
2346 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2347 {
2348 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2349 	fmd_case_item_t *cit;
2350 	uint_t state;
2351 	int rv = 0;
2352 
2353 	(void) pthread_mutex_lock(&cip->ci_lock);
2354 
2355 	if (cip->ci_state >= FMD_CASE_SOLVED)
2356 		state = FMD_EVS_DIAGNOSED;
2357 	else
2358 		state = FMD_EVS_ACCEPTED;
2359 
2360 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2361 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2362 			break;
2363 	}
2364 
2365 	if (rv == 0 && cip->ci_principal != NULL)
2366 		rv = fmd_event_equal(ep, cip->ci_principal);
2367 
2368 	(void) pthread_mutex_unlock(&cip->ci_lock);
2369 
2370 	if (rv != 0)
2371 		fmd_event_transition(ep, state);
2372 
2373 	return (rv);
2374 }
2375 
2376 int
2377 fmd_case_orphaned(fmd_case_t *cp)
2378 {
2379 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2380 }
2381 
2382 void
2383 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2384 {
2385 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2386 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2387 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2388 }
2389 
2390 void
2391 fmd_case_set_injected(fmd_case_t *cp)
2392 {
2393 	((fmd_case_impl_t *)cp)->ci_injected = 1;
2394 }
2395 
2396 void
2397 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2398 {
2399 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2400 
2401 	if (cip->ci_diag_de)
2402 		nvlist_free(cip->ci_diag_de);
2403 	cip->ci_diag_de = nvl;
2404 }
2405 
2406 void
2407 fmd_case_setcode(fmd_case_t *cp, char *code)
2408 {
2409 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2410 
2411 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2412 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2413 }
2414 
2415 /*ARGSUSED*/
2416 static void
2417 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2418 {
2419 	int not_faulty = 0;
2420 	int faulty = 0;
2421 	nvlist_t *nvl;
2422 	fmd_event_t *e;
2423 	char *class;
2424 	int any_unusable_and_present = 0;
2425 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2426 
2427 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2428 		return;
2429 
2430 	if (cip->ci_state == FMD_CASE_RESOLVED) {
2431 		cip->ci_flags |= FMD_CF_RES_CMPL;
2432 		return;
2433 	}
2434 
2435 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2436 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2437 	    &not_faulty);
2438 
2439 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2440 		/*
2441 		 * If none of the suspects is faulty, replay the list.repaired.
2442 		 * If all suspects are already either usable or not present then
2443 		 * also transition straight to RESOLVED state.
2444 		 */
2445 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2446 		    fmd_case_unusable_and_present, &any_unusable_and_present);
2447 		if (!any_unusable_and_present) {
2448 			cip->ci_state = FMD_CASE_RESOLVED;
2449 
2450 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2451 			    cip->ci_uuid));
2452 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2453 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2454 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2455 			    class);
2456 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2457 
2458 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2459 			    cip->ci_uuid));
2460 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2461 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2462 			    fmd_asru_log_resolved, NULL);
2463 			cip->ci_flags |= FMD_CF_RES_CMPL;
2464 		} else {
2465 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2466 			    cip->ci_uuid));
2467 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2468 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2469 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2470 			    class);
2471 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2472 		}
2473 	} else if (faulty && not_faulty) {
2474 		/*
2475 		 * if some but not all of the suspects are not faulty, replay
2476 		 * the list.updated.
2477 		 */
2478 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2479 		    cip->ci_uuid));
2480 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2481 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2482 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2483 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2484 	}
2485 }
2486 
2487 void
2488 fmd_case_repair_replay()
2489 {
2490 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2491 }
2492