xref: /freebsd/sys/contrib/openzfs/module/zfs/fm.c (revision c66ec88f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Fault Management Architecture (FMA) Resource and Protocol Support
27  *
28  * The routines contained herein provide services to support kernel subsystems
29  * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
30  *
31  * Name-Value Pair Lists
32  *
33  * The embodiment of an FMA protocol element (event, fmri or authority) is a
34  * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
35  * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
36  * to create an nvpair list using custom allocators.  Callers may choose to
37  * allocate either from the kernel memory allocator, or from a preallocated
38  * buffer, useful in constrained contexts like high-level interrupt routines.
39  *
40  * Protocol Event and FMRI Construction
41  *
42  * Convenience routines are provided to construct nvlist events according to
43  * the FMA Event Protocol and Naming Schema specification for ereports and
44  * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
45  *
46  * ENA Manipulation
47  *
48  * Routines to generate ENA formats 0, 1 and 2 are available as well as
49  * routines to increment formats 1 and 2.  Individual fields within the
50  * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
51  * fm_ena_format_get() and fm_ena_gen_get().
52  */
53 
54 #include <sys/types.h>
55 #include <sys/time.h>
56 #include <sys/list.h>
57 #include <sys/nvpair.h>
58 #include <sys/cmn_err.h>
59 #include <sys/sysmacros.h>
60 #include <sys/sunddi.h>
61 #include <sys/systeminfo.h>
62 #include <sys/fm/util.h>
63 #include <sys/fm/protocol.h>
64 #include <sys/kstat.h>
65 #include <sys/zfs_context.h>
66 #ifdef _KERNEL
67 #include <sys/atomic.h>
68 #include <sys/condvar.h>
69 #include <sys/console.h>
70 #include <sys/time.h>
71 #include <sys/zfs_ioctl.h>
72 
73 int zfs_zevent_len_max = 0;
74 int zfs_zevent_cols = 80;
75 int zfs_zevent_console = 0;
76 
77 static int zevent_len_cur = 0;
78 static int zevent_waiters = 0;
79 static int zevent_flags = 0;
80 
81 /* Num events rate limited since the last time zfs_zevent_next() was called */
82 static uint64_t ratelimit_dropped = 0;
83 
84 /*
85  * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
86  * posted.  The posted EIDs are monotonically increasing but not persistent.
87  * They will be reset to the initial value (1) each time the kernel module is
88  * loaded.
89  */
90 static uint64_t zevent_eid = 0;
91 
92 static kmutex_t zevent_lock;
93 static list_t zevent_list;
94 static kcondvar_t zevent_cv;
95 #endif /* _KERNEL */
96 
97 
98 /*
99  * Common fault management kstats to record event generation failures
100  */
101 
102 struct erpt_kstat {
103 	kstat_named_t	erpt_dropped;		/* num erpts dropped on post */
104 	kstat_named_t	erpt_set_failed;	/* num erpt set failures */
105 	kstat_named_t	fmri_set_failed;	/* num fmri set failures */
106 	kstat_named_t	payload_set_failed;	/* num payload set failures */
107 	kstat_named_t	erpt_duplicates;	/* num duplicate erpts */
108 };
109 
110 static struct erpt_kstat erpt_kstat_data = {
111 	{ "erpt-dropped", KSTAT_DATA_UINT64 },
112 	{ "erpt-set-failed", KSTAT_DATA_UINT64 },
113 	{ "fmri-set-failed", KSTAT_DATA_UINT64 },
114 	{ "payload-set-failed", KSTAT_DATA_UINT64 },
115 	{ "erpt-duplicates", KSTAT_DATA_UINT64 }
116 };
117 
118 kstat_t *fm_ksp;
119 
120 #ifdef _KERNEL
121 
122 /*
123  * Formatting utility function for fm_nvprintr.  We attempt to wrap chunks of
124  * output so they aren't split across console lines, and return the end column.
125  */
126 /*PRINTFLIKE4*/
127 static int
128 fm_printf(int depth, int c, int cols, const char *format, ...)
129 {
130 	va_list ap;
131 	int width;
132 	char c1;
133 
134 	va_start(ap, format);
135 	width = vsnprintf(&c1, sizeof (c1), format, ap);
136 	va_end(ap);
137 
138 	if (c + width >= cols) {
139 		console_printf("\n");
140 		c = 0;
141 		if (format[0] != ' ' && depth > 0) {
142 			console_printf(" ");
143 			c++;
144 		}
145 	}
146 
147 	va_start(ap, format);
148 	console_vprintf(format, ap);
149 	va_end(ap);
150 
151 	return ((c + width) % cols);
152 }
153 
154 /*
155  * Recursively print an nvlist in the specified column width and return the
156  * column we end up in.  This function is called recursively by fm_nvprint(),
157  * below.  We generically format the entire nvpair using hexadecimal
158  * integers and strings, and elide any integer arrays.  Arrays are basically
159  * used for cache dumps right now, so we suppress them so as not to overwhelm
160  * the amount of console output we produce at panic time.  This can be further
161  * enhanced as FMA technology grows based upon the needs of consumers.  All
162  * FMA telemetry is logged using the dump device transport, so the console
163  * output serves only as a fallback in case this procedure is unsuccessful.
164  */
165 static int
166 fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
167 {
168 	nvpair_t *nvp;
169 
170 	for (nvp = nvlist_next_nvpair(nvl, NULL);
171 	    nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
172 
173 		data_type_t type = nvpair_type(nvp);
174 		const char *name = nvpair_name(nvp);
175 
176 		boolean_t b;
177 		uint8_t i8;
178 		uint16_t i16;
179 		uint32_t i32;
180 		uint64_t i64;
181 		char *str;
182 		nvlist_t *cnv;
183 
184 		if (strcmp(name, FM_CLASS) == 0)
185 			continue; /* already printed by caller */
186 
187 		c = fm_printf(d, c, cols, " %s=", name);
188 
189 		switch (type) {
190 		case DATA_TYPE_BOOLEAN:
191 			c = fm_printf(d + 1, c, cols, " 1");
192 			break;
193 
194 		case DATA_TYPE_BOOLEAN_VALUE:
195 			(void) nvpair_value_boolean_value(nvp, &b);
196 			c = fm_printf(d + 1, c, cols, b ? "1" : "0");
197 			break;
198 
199 		case DATA_TYPE_BYTE:
200 			(void) nvpair_value_byte(nvp, &i8);
201 			c = fm_printf(d + 1, c, cols, "0x%x", i8);
202 			break;
203 
204 		case DATA_TYPE_INT8:
205 			(void) nvpair_value_int8(nvp, (void *)&i8);
206 			c = fm_printf(d + 1, c, cols, "0x%x", i8);
207 			break;
208 
209 		case DATA_TYPE_UINT8:
210 			(void) nvpair_value_uint8(nvp, &i8);
211 			c = fm_printf(d + 1, c, cols, "0x%x", i8);
212 			break;
213 
214 		case DATA_TYPE_INT16:
215 			(void) nvpair_value_int16(nvp, (void *)&i16);
216 			c = fm_printf(d + 1, c, cols, "0x%x", i16);
217 			break;
218 
219 		case DATA_TYPE_UINT16:
220 			(void) nvpair_value_uint16(nvp, &i16);
221 			c = fm_printf(d + 1, c, cols, "0x%x", i16);
222 			break;
223 
224 		case DATA_TYPE_INT32:
225 			(void) nvpair_value_int32(nvp, (void *)&i32);
226 			c = fm_printf(d + 1, c, cols, "0x%x", i32);
227 			break;
228 
229 		case DATA_TYPE_UINT32:
230 			(void) nvpair_value_uint32(nvp, &i32);
231 			c = fm_printf(d + 1, c, cols, "0x%x", i32);
232 			break;
233 
234 		case DATA_TYPE_INT64:
235 			(void) nvpair_value_int64(nvp, (void *)&i64);
236 			c = fm_printf(d + 1, c, cols, "0x%llx",
237 			    (u_longlong_t)i64);
238 			break;
239 
240 		case DATA_TYPE_UINT64:
241 			(void) nvpair_value_uint64(nvp, &i64);
242 			c = fm_printf(d + 1, c, cols, "0x%llx",
243 			    (u_longlong_t)i64);
244 			break;
245 
246 		case DATA_TYPE_HRTIME:
247 			(void) nvpair_value_hrtime(nvp, (void *)&i64);
248 			c = fm_printf(d + 1, c, cols, "0x%llx",
249 			    (u_longlong_t)i64);
250 			break;
251 
252 		case DATA_TYPE_STRING:
253 			(void) nvpair_value_string(nvp, &str);
254 			c = fm_printf(d + 1, c, cols, "\"%s\"",
255 			    str ? str : "<NULL>");
256 			break;
257 
258 		case DATA_TYPE_NVLIST:
259 			c = fm_printf(d + 1, c, cols, "[");
260 			(void) nvpair_value_nvlist(nvp, &cnv);
261 			c = fm_nvprintr(cnv, d + 1, c, cols);
262 			c = fm_printf(d + 1, c, cols, " ]");
263 			break;
264 
265 		case DATA_TYPE_NVLIST_ARRAY: {
266 			nvlist_t **val;
267 			uint_t i, nelem;
268 
269 			c = fm_printf(d + 1, c, cols, "[");
270 			(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
271 			for (i = 0; i < nelem; i++) {
272 				c = fm_nvprintr(val[i], d + 1, c, cols);
273 			}
274 			c = fm_printf(d + 1, c, cols, " ]");
275 			}
276 			break;
277 
278 		case DATA_TYPE_INT8_ARRAY: {
279 			int8_t *val;
280 			uint_t i, nelem;
281 
282 			c = fm_printf(d + 1, c, cols, "[ ");
283 			(void) nvpair_value_int8_array(nvp, &val, &nelem);
284 			for (i = 0; i < nelem; i++)
285 				c = fm_printf(d + 1, c, cols, "0x%llx ",
286 				    (u_longlong_t)val[i]);
287 
288 			c = fm_printf(d + 1, c, cols, "]");
289 			break;
290 			}
291 
292 		case DATA_TYPE_UINT8_ARRAY: {
293 			uint8_t *val;
294 			uint_t i, nelem;
295 
296 			c = fm_printf(d + 1, c, cols, "[ ");
297 			(void) nvpair_value_uint8_array(nvp, &val, &nelem);
298 			for (i = 0; i < nelem; i++)
299 				c = fm_printf(d + 1, c, cols, "0x%llx ",
300 				    (u_longlong_t)val[i]);
301 
302 			c = fm_printf(d + 1, c, cols, "]");
303 			break;
304 			}
305 
306 		case DATA_TYPE_INT16_ARRAY: {
307 			int16_t *val;
308 			uint_t i, nelem;
309 
310 			c = fm_printf(d + 1, c, cols, "[ ");
311 			(void) nvpair_value_int16_array(nvp, &val, &nelem);
312 			for (i = 0; i < nelem; i++)
313 				c = fm_printf(d + 1, c, cols, "0x%llx ",
314 				    (u_longlong_t)val[i]);
315 
316 			c = fm_printf(d + 1, c, cols, "]");
317 			break;
318 			}
319 
320 		case DATA_TYPE_UINT16_ARRAY: {
321 			uint16_t *val;
322 			uint_t i, nelem;
323 
324 			c = fm_printf(d + 1, c, cols, "[ ");
325 			(void) nvpair_value_uint16_array(nvp, &val, &nelem);
326 			for (i = 0; i < nelem; i++)
327 				c = fm_printf(d + 1, c, cols, "0x%llx ",
328 				    (u_longlong_t)val[i]);
329 
330 			c = fm_printf(d + 1, c, cols, "]");
331 			break;
332 			}
333 
334 		case DATA_TYPE_INT32_ARRAY: {
335 			int32_t *val;
336 			uint_t i, nelem;
337 
338 			c = fm_printf(d + 1, c, cols, "[ ");
339 			(void) nvpair_value_int32_array(nvp, &val, &nelem);
340 			for (i = 0; i < nelem; i++)
341 			c = fm_printf(d + 1, c, cols, "0x%llx ",
342 			    (u_longlong_t)val[i]);
343 
344 			c = fm_printf(d + 1, c, cols, "]");
345 			break;
346 			}
347 
348 		case DATA_TYPE_UINT32_ARRAY: {
349 			uint32_t *val;
350 			uint_t i, nelem;
351 
352 			c = fm_printf(d + 1, c, cols, "[ ");
353 			(void) nvpair_value_uint32_array(nvp, &val, &nelem);
354 			for (i = 0; i < nelem; i++)
355 				c = fm_printf(d + 1, c, cols, "0x%llx ",
356 				    (u_longlong_t)val[i]);
357 
358 			c = fm_printf(d + 1, c, cols, "]");
359 			break;
360 			}
361 
362 		case DATA_TYPE_INT64_ARRAY: {
363 			int64_t *val;
364 			uint_t i, nelem;
365 
366 			c = fm_printf(d + 1, c, cols, "[ ");
367 			(void) nvpair_value_int64_array(nvp, &val, &nelem);
368 			for (i = 0; i < nelem; i++)
369 				c = fm_printf(d + 1, c, cols, "0x%llx ",
370 				    (u_longlong_t)val[i]);
371 
372 			c = fm_printf(d + 1, c, cols, "]");
373 			break;
374 			}
375 
376 		case DATA_TYPE_UINT64_ARRAY: {
377 			uint64_t *val;
378 			uint_t i, nelem;
379 
380 			c = fm_printf(d + 1, c, cols, "[ ");
381 			(void) nvpair_value_uint64_array(nvp, &val, &nelem);
382 			for (i = 0; i < nelem; i++)
383 				c = fm_printf(d + 1, c, cols, "0x%llx ",
384 				    (u_longlong_t)val[i]);
385 
386 			c = fm_printf(d + 1, c, cols, "]");
387 			break;
388 			}
389 
390 		case DATA_TYPE_STRING_ARRAY:
391 		case DATA_TYPE_BOOLEAN_ARRAY:
392 		case DATA_TYPE_BYTE_ARRAY:
393 			c = fm_printf(d + 1, c, cols, "[...]");
394 			break;
395 
396 		case DATA_TYPE_UNKNOWN:
397 		case DATA_TYPE_DONTCARE:
398 			c = fm_printf(d + 1, c, cols, "<unknown>");
399 			break;
400 		}
401 	}
402 
403 	return (c);
404 }
405 
406 void
407 fm_nvprint(nvlist_t *nvl)
408 {
409 	char *class;
410 	int c = 0;
411 
412 	console_printf("\n");
413 
414 	if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
415 		c = fm_printf(0, c, zfs_zevent_cols, "%s", class);
416 
417 	if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0)
418 		console_printf("\n");
419 
420 	console_printf("\n");
421 }
422 
423 static zevent_t *
424 zfs_zevent_alloc(void)
425 {
426 	zevent_t *ev;
427 
428 	ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);
429 
430 	list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
431 	    offsetof(zfs_zevent_t, ze_node));
432 	list_link_init(&ev->ev_node);
433 
434 	return (ev);
435 }
436 
437 static void
438 zfs_zevent_free(zevent_t *ev)
439 {
440 	/* Run provided cleanup callback */
441 	ev->ev_cb(ev->ev_nvl, ev->ev_detector);
442 
443 	list_destroy(&ev->ev_ze_list);
444 	kmem_free(ev, sizeof (zevent_t));
445 }
446 
447 static void
448 zfs_zevent_drain(zevent_t *ev)
449 {
450 	zfs_zevent_t *ze;
451 
452 	ASSERT(MUTEX_HELD(&zevent_lock));
453 	list_remove(&zevent_list, ev);
454 
455 	/* Remove references to this event in all private file data */
456 	while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
457 		list_remove(&ev->ev_ze_list, ze);
458 		ze->ze_zevent = NULL;
459 		ze->ze_dropped++;
460 	}
461 
462 	zfs_zevent_free(ev);
463 }
464 
465 void
466 zfs_zevent_drain_all(int *count)
467 {
468 	zevent_t *ev;
469 
470 	mutex_enter(&zevent_lock);
471 	while ((ev = list_head(&zevent_list)) != NULL)
472 		zfs_zevent_drain(ev);
473 
474 	*count = zevent_len_cur;
475 	zevent_len_cur = 0;
476 	mutex_exit(&zevent_lock);
477 }
478 
479 /*
480  * New zevents are inserted at the head.  If the maximum queue
481  * length is exceeded a zevent will be drained from the tail.
482  * As part of this any user space processes which currently have
483  * a reference to this zevent_t in their private data will have
484  * this reference set to NULL.
485  */
486 static void
487 zfs_zevent_insert(zevent_t *ev)
488 {
489 	ASSERT(MUTEX_HELD(&zevent_lock));
490 	list_insert_head(&zevent_list, ev);
491 
492 	if (zevent_len_cur >= zfs_zevent_len_max)
493 		zfs_zevent_drain(list_tail(&zevent_list));
494 	else
495 		zevent_len_cur++;
496 }
497 
498 /*
499  * Post a zevent. The cb will be called when nvl and detector are no longer
500  * needed, i.e.:
501  * - An error happened and a zevent can't be posted. In this case, cb is called
502  *   before zfs_zevent_post() returns.
503  * - The event is being drained and freed.
504  */
505 int
506 zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
507 {
508 	inode_timespec_t tv;
509 	int64_t tv_array[2];
510 	uint64_t eid;
511 	size_t nvl_size = 0;
512 	zevent_t *ev;
513 	int error;
514 
515 	ASSERT(cb != NULL);
516 
517 	gethrestime(&tv);
518 	tv_array[0] = tv.tv_sec;
519 	tv_array[1] = tv.tv_nsec;
520 
521 	error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2);
522 	if (error) {
523 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
524 		goto out;
525 	}
526 
527 	eid = atomic_inc_64_nv(&zevent_eid);
528 	error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid);
529 	if (error) {
530 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
531 		goto out;
532 	}
533 
534 	error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE);
535 	if (error) {
536 		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
537 		goto out;
538 	}
539 
540 	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
541 		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
542 		error = EOVERFLOW;
543 		goto out;
544 	}
545 
546 	if (zfs_zevent_console)
547 		fm_nvprint(nvl);
548 
549 	ev = zfs_zevent_alloc();
550 	if (ev == NULL) {
551 		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
552 		error = ENOMEM;
553 		goto out;
554 	}
555 
556 	ev->ev_nvl = nvl;
557 	ev->ev_detector = detector;
558 	ev->ev_cb = cb;
559 	ev->ev_eid = eid;
560 
561 	mutex_enter(&zevent_lock);
562 	zfs_zevent_insert(ev);
563 	cv_broadcast(&zevent_cv);
564 	mutex_exit(&zevent_lock);
565 
566 out:
567 	if (error)
568 		cb(nvl, detector);
569 
570 	return (error);
571 }
572 
573 void
574 zfs_zevent_track_duplicate(void)
575 {
576 	atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
577 }
578 
579 static int
580 zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
581 {
582 	*ze = zfsdev_get_state(minor, ZST_ZEVENT);
583 	if (*ze == NULL)
584 		return (SET_ERROR(EBADF));
585 
586 	return (0);
587 }
588 
589 int
590 zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
591 {
592 	int error;
593 
594 	error = zfsdev_getminor(fd, minorp);
595 	if (error == 0)
596 		error = zfs_zevent_minor_to_state(*minorp, ze);
597 
598 	if (error)
599 		zfs_zevent_fd_rele(fd);
600 
601 	return (error);
602 }
603 
/*
 * Release a file descriptor with zfs_zevent_fd_hold() as its counterpart;
 * this is a thin wrapper around zfs_file_put().
 */
void
zfs_zevent_fd_rele(int fd)
{
	zfs_file_put(fd);
}
609 
610 /*
611  * Get the next zevent in the stream and place a copy in 'event'.  This
612  * may fail with ENOMEM if the encoded nvlist size exceeds the passed
613  * 'event_size'.  In this case the stream pointer is not advanced and
614  * and 'event_size' is set to the minimum required buffer size.
615  */
616 int
617 zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
618     uint64_t *dropped)
619 {
620 	zevent_t *ev;
621 	size_t size;
622 	int error = 0;
623 
624 	mutex_enter(&zevent_lock);
625 	if (ze->ze_zevent == NULL) {
626 		/* New stream start at the beginning/tail */
627 		ev = list_tail(&zevent_list);
628 		if (ev == NULL) {
629 			error = ENOENT;
630 			goto out;
631 		}
632 	} else {
633 		/*
634 		 * Existing stream continue with the next element and remove
635 		 * ourselves from the wait queue for the previous element
636 		 */
637 		ev = list_prev(&zevent_list, ze->ze_zevent);
638 		if (ev == NULL) {
639 			error = ENOENT;
640 			goto out;
641 		}
642 	}
643 
644 	VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0);
645 	if (size > *event_size) {
646 		*event_size = size;
647 		error = ENOMEM;
648 		goto out;
649 	}
650 
651 	if (ze->ze_zevent)
652 		list_remove(&ze->ze_zevent->ev_ze_list, ze);
653 
654 	ze->ze_zevent = ev;
655 	list_insert_head(&ev->ev_ze_list, ze);
656 	(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
657 	*dropped = ze->ze_dropped;
658 
659 #ifdef _KERNEL
660 	/* Include events dropped due to rate limiting */
661 	*dropped += ratelimit_dropped;
662 	ratelimit_dropped = 0;
663 #endif
664 	ze->ze_dropped = 0;
665 out:
666 	mutex_exit(&zevent_lock);
667 
668 	return (error);
669 }
670 
671 /*
672  * Wait in an interruptible state for any new events.
673  */
674 int
675 zfs_zevent_wait(zfs_zevent_t *ze)
676 {
677 	int error = EAGAIN;
678 
679 	mutex_enter(&zevent_lock);
680 	zevent_waiters++;
681 
682 	while (error == EAGAIN) {
683 		if (zevent_flags & ZEVENT_SHUTDOWN) {
684 			error = SET_ERROR(ESHUTDOWN);
685 			break;
686 		}
687 
688 		error = cv_wait_sig(&zevent_cv, &zevent_lock);
689 		if (signal_pending(current)) {
690 			error = SET_ERROR(EINTR);
691 			break;
692 		} else if (!list_is_empty(&zevent_list)) {
693 			error = 0;
694 			continue;
695 		} else {
696 			error = EAGAIN;
697 		}
698 	}
699 
700 	zevent_waiters--;
701 	mutex_exit(&zevent_lock);
702 
703 	return (error);
704 }
705 
706 /*
707  * The caller may seek to a specific EID by passing that EID.  If the EID
708  * is still available in the posted list of events the cursor is positioned
709  * there.  Otherwise ENOENT is returned and the cursor is not moved.
710  *
711  * There are two reserved EIDs which may be passed and will never fail.
712  * ZEVENT_SEEK_START positions the cursor at the start of the list, and
713  * ZEVENT_SEEK_END positions the cursor at the end of the list.
714  */
715 int
716 zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
717 {
718 	zevent_t *ev;
719 	int error = 0;
720 
721 	mutex_enter(&zevent_lock);
722 
723 	if (eid == ZEVENT_SEEK_START) {
724 		if (ze->ze_zevent)
725 			list_remove(&ze->ze_zevent->ev_ze_list, ze);
726 
727 		ze->ze_zevent = NULL;
728 		goto out;
729 	}
730 
731 	if (eid == ZEVENT_SEEK_END) {
732 		if (ze->ze_zevent)
733 			list_remove(&ze->ze_zevent->ev_ze_list, ze);
734 
735 		ev = list_head(&zevent_list);
736 		if (ev) {
737 			ze->ze_zevent = ev;
738 			list_insert_head(&ev->ev_ze_list, ze);
739 		} else {
740 			ze->ze_zevent = NULL;
741 		}
742 
743 		goto out;
744 	}
745 
746 	for (ev = list_tail(&zevent_list); ev != NULL;
747 	    ev = list_prev(&zevent_list, ev)) {
748 		if (ev->ev_eid == eid) {
749 			if (ze->ze_zevent)
750 				list_remove(&ze->ze_zevent->ev_ze_list, ze);
751 
752 			ze->ze_zevent = ev;
753 			list_insert_head(&ev->ev_ze_list, ze);
754 			break;
755 		}
756 	}
757 
758 	if (ev == NULL)
759 		error = ENOENT;
760 
761 out:
762 	mutex_exit(&zevent_lock);
763 
764 	return (error);
765 }
766 
767 void
768 zfs_zevent_init(zfs_zevent_t **zep)
769 {
770 	zfs_zevent_t *ze;
771 
772 	ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
773 	list_link_init(&ze->ze_node);
774 }
775 
776 void
777 zfs_zevent_destroy(zfs_zevent_t *ze)
778 {
779 	mutex_enter(&zevent_lock);
780 	if (ze->ze_zevent)
781 		list_remove(&ze->ze_zevent->ev_ze_list, ze);
782 	mutex_exit(&zevent_lock);
783 
784 	kmem_free(ze, sizeof (zfs_zevent_t));
785 }
786 #endif /* _KERNEL */
787 
788 /*
789  * Wrappers for FM nvlist allocators
790  */
791 /* ARGSUSED */
792 static void *
793 i_fm_alloc(nv_alloc_t *nva, size_t size)
794 {
795 	return (kmem_zalloc(size, KM_SLEEP));
796 }
797 
798 /* ARGSUSED */
799 static void
800 i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
801 {
802 	kmem_free(buf, size);
803 }
804 
805 const nv_alloc_ops_t fm_mem_alloc_ops = {
806 	.nv_ao_init = NULL,
807 	.nv_ao_fini = NULL,
808 	.nv_ao_alloc = i_fm_alloc,
809 	.nv_ao_free = i_fm_free,
810 	.nv_ao_reset = NULL
811 };
812 
813 /*
814  * Create and initialize a new nv_alloc_t for a fixed buffer, buf.  A pointer
815  * to the newly allocated nv_alloc_t structure is returned upon success or NULL
816  * is returned to indicate that the nv_alloc structure could not be created.
817  */
818 nv_alloc_t *
819 fm_nva_xcreate(char *buf, size_t bufsz)
820 {
821 	nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
822 
823 	if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
824 		kmem_free(nvhdl, sizeof (nv_alloc_t));
825 		return (NULL);
826 	}
827 
828 	return (nvhdl);
829 }
830 
831 /*
832  * Destroy a previously allocated nv_alloc structure.  The fixed buffer
833  * associated with nva must be freed by the caller.
834  */
835 void
836 fm_nva_xdestroy(nv_alloc_t *nva)
837 {
838 	nv_alloc_fini(nva);
839 	kmem_free(nva, sizeof (nv_alloc_t));
840 }
841 
842 /*
843  * Create a new nv list.  A pointer to a new nv list structure is returned
844  * upon success or NULL is returned to indicate that the structure could
845  * not be created.  The newly created nv list is created and managed by the
846  * operations installed in nva.   If nva is NULL, the default FMA nva
847  * operations are installed and used.
848  *
849  * When called from the kernel and nva == NULL, this function must be called
850  * from passive kernel context with no locks held that can prevent a
851  * sleeping memory allocation from occurring.  Otherwise, this function may
852  * be called from other kernel contexts as long a valid nva created via
853  * fm_nva_create() is supplied.
854  */
855 nvlist_t *
856 fm_nvlist_create(nv_alloc_t *nva)
857 {
858 	int hdl_alloced = 0;
859 	nvlist_t *nvl;
860 	nv_alloc_t *nvhdl;
861 
862 	if (nva == NULL) {
863 		nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
864 
865 		if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
866 			kmem_free(nvhdl, sizeof (nv_alloc_t));
867 			return (NULL);
868 		}
869 		hdl_alloced = 1;
870 	} else {
871 		nvhdl = nva;
872 	}
873 
874 	if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
875 		if (hdl_alloced) {
876 			nv_alloc_fini(nvhdl);
877 			kmem_free(nvhdl, sizeof (nv_alloc_t));
878 		}
879 		return (NULL);
880 	}
881 
882 	return (nvl);
883 }
884 
885 /*
886  * Destroy a previously allocated nvlist structure.  flag indicates whether
887  * or not the associated nva structure should be freed (FM_NVA_FREE) or
888  * retained (FM_NVA_RETAIN).  Retaining the nv alloc structure allows
889  * it to be re-used for future nvlist creation operations.
890  */
891 void
892 fm_nvlist_destroy(nvlist_t *nvl, int flag)
893 {
894 	nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
895 
896 	nvlist_free(nvl);
897 
898 	if (nva != NULL) {
899 		if (flag == FM_NVA_FREE)
900 			fm_nva_xdestroy(nva);
901 	}
902 }
903 
904 int
905 i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
906 {
907 	int nelem, ret = 0;
908 	data_type_t type;
909 
910 	while (ret == 0 && name != NULL) {
911 		type = va_arg(ap, data_type_t);
912 		switch (type) {
913 		case DATA_TYPE_BYTE:
914 			ret = nvlist_add_byte(payload, name,
915 			    va_arg(ap, uint_t));
916 			break;
917 		case DATA_TYPE_BYTE_ARRAY:
918 			nelem = va_arg(ap, int);
919 			ret = nvlist_add_byte_array(payload, name,
920 			    va_arg(ap, uchar_t *), nelem);
921 			break;
922 		case DATA_TYPE_BOOLEAN_VALUE:
923 			ret = nvlist_add_boolean_value(payload, name,
924 			    va_arg(ap, boolean_t));
925 			break;
926 		case DATA_TYPE_BOOLEAN_ARRAY:
927 			nelem = va_arg(ap, int);
928 			ret = nvlist_add_boolean_array(payload, name,
929 			    va_arg(ap, boolean_t *), nelem);
930 			break;
931 		case DATA_TYPE_INT8:
932 			ret = nvlist_add_int8(payload, name,
933 			    va_arg(ap, int));
934 			break;
935 		case DATA_TYPE_INT8_ARRAY:
936 			nelem = va_arg(ap, int);
937 			ret = nvlist_add_int8_array(payload, name,
938 			    va_arg(ap, int8_t *), nelem);
939 			break;
940 		case DATA_TYPE_UINT8:
941 			ret = nvlist_add_uint8(payload, name,
942 			    va_arg(ap, uint_t));
943 			break;
944 		case DATA_TYPE_UINT8_ARRAY:
945 			nelem = va_arg(ap, int);
946 			ret = nvlist_add_uint8_array(payload, name,
947 			    va_arg(ap, uint8_t *), nelem);
948 			break;
949 		case DATA_TYPE_INT16:
950 			ret = nvlist_add_int16(payload, name,
951 			    va_arg(ap, int));
952 			break;
953 		case DATA_TYPE_INT16_ARRAY:
954 			nelem = va_arg(ap, int);
955 			ret = nvlist_add_int16_array(payload, name,
956 			    va_arg(ap, int16_t *), nelem);
957 			break;
958 		case DATA_TYPE_UINT16:
959 			ret = nvlist_add_uint16(payload, name,
960 			    va_arg(ap, uint_t));
961 			break;
962 		case DATA_TYPE_UINT16_ARRAY:
963 			nelem = va_arg(ap, int);
964 			ret = nvlist_add_uint16_array(payload, name,
965 			    va_arg(ap, uint16_t *), nelem);
966 			break;
967 		case DATA_TYPE_INT32:
968 			ret = nvlist_add_int32(payload, name,
969 			    va_arg(ap, int32_t));
970 			break;
971 		case DATA_TYPE_INT32_ARRAY:
972 			nelem = va_arg(ap, int);
973 			ret = nvlist_add_int32_array(payload, name,
974 			    va_arg(ap, int32_t *), nelem);
975 			break;
976 		case DATA_TYPE_UINT32:
977 			ret = nvlist_add_uint32(payload, name,
978 			    va_arg(ap, uint32_t));
979 			break;
980 		case DATA_TYPE_UINT32_ARRAY:
981 			nelem = va_arg(ap, int);
982 			ret = nvlist_add_uint32_array(payload, name,
983 			    va_arg(ap, uint32_t *), nelem);
984 			break;
985 		case DATA_TYPE_INT64:
986 			ret = nvlist_add_int64(payload, name,
987 			    va_arg(ap, int64_t));
988 			break;
989 		case DATA_TYPE_INT64_ARRAY:
990 			nelem = va_arg(ap, int);
991 			ret = nvlist_add_int64_array(payload, name,
992 			    va_arg(ap, int64_t *), nelem);
993 			break;
994 		case DATA_TYPE_UINT64:
995 			ret = nvlist_add_uint64(payload, name,
996 			    va_arg(ap, uint64_t));
997 			break;
998 		case DATA_TYPE_UINT64_ARRAY:
999 			nelem = va_arg(ap, int);
1000 			ret = nvlist_add_uint64_array(payload, name,
1001 			    va_arg(ap, uint64_t *), nelem);
1002 			break;
1003 		case DATA_TYPE_STRING:
1004 			ret = nvlist_add_string(payload, name,
1005 			    va_arg(ap, char *));
1006 			break;
1007 		case DATA_TYPE_STRING_ARRAY:
1008 			nelem = va_arg(ap, int);
1009 			ret = nvlist_add_string_array(payload, name,
1010 			    va_arg(ap, char **), nelem);
1011 			break;
1012 		case DATA_TYPE_NVLIST:
1013 			ret = nvlist_add_nvlist(payload, name,
1014 			    va_arg(ap, nvlist_t *));
1015 			break;
1016 		case DATA_TYPE_NVLIST_ARRAY:
1017 			nelem = va_arg(ap, int);
1018 			ret = nvlist_add_nvlist_array(payload, name,
1019 			    va_arg(ap, nvlist_t **), nelem);
1020 			break;
1021 		default:
1022 			ret = EINVAL;
1023 		}
1024 
1025 		name = va_arg(ap, char *);
1026 	}
1027 	return (ret);
1028 }
1029 
1030 void
1031 fm_payload_set(nvlist_t *payload, ...)
1032 {
1033 	int ret;
1034 	const char *name;
1035 	va_list ap;
1036 
1037 	va_start(ap, payload);
1038 	name = va_arg(ap, char *);
1039 	ret = i_fm_payload_set(payload, name, ap);
1040 	va_end(ap);
1041 
1042 	if (ret)
1043 		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
1044 }
1045 
1046 /*
1047  * Set-up and validate the members of an ereport event according to:
1048  *
1049  *	Member name		Type		Value
1050  *	====================================================
1051  *	class			string		ereport
1052  *	version			uint8_t		0
1053  *	ena			uint64_t	<ena>
1054  *	detector		nvlist_t	<detector>
1055  *	ereport-payload		nvlist_t	<var args>
1056  *
1057  * We don't actually add a 'version' member to the payload.  Really,
1058  * the version quoted to us by our caller is that of the category 1
1059  * "ereport" event class (and we require FM_EREPORT_VERS0) but
1060  * the payload version of the actual leaf class event under construction
1061  * may be something else.  Callers should supply a version in the varargs,
1062  * or (better) we could take two version arguments - one for the
1063  * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
1064  * for the leaf class.
1065  */
1066 void
1067 fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
1068     uint64_t ena, const nvlist_t *detector, ...)
1069 {
1070 	char ereport_class[FM_MAX_CLASS];
1071 	const char *name;
1072 	va_list ap;
1073 	int ret;
1074 
1075 	if (version != FM_EREPORT_VERS0) {
1076 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
1077 		return;
1078 	}
1079 
1080 	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
1081 	    FM_EREPORT_CLASS, erpt_class);
1082 	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
1083 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
1084 		return;
1085 	}
1086 
1087 	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
1088 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
1089 	}
1090 
1091 	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
1092 	    (nvlist_t *)detector) != 0) {
1093 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
1094 	}
1095 
1096 	va_start(ap, detector);
1097 	name = va_arg(ap, const char *);
1098 	ret = i_fm_payload_set(ereport, name, ap);
1099 	va_end(ap);
1100 
1101 	if (ret)
1102 		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
1103 }
1104 
1105 /*
1106  * Set-up and validate the members of an hc fmri according to:
1107  *
1108  *	Member name		Type		Value
1109  *	===================================================
1110  *	version			uint8_t		0
1111  *	auth			nvlist_t	<auth>
1112  *	hc-name			string		<name>
1113  *	hc-id			string		<id>
1114  *
1115  * Note that auth and hc-id are optional members.
1116  */
1117 
1118 #define	HC_MAXPAIRS	20
1119 #define	HC_MAXNAMELEN	50
1120 
1121 static int
1122 fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
1123 {
1124 	if (version != FM_HC_SCHEME_VERSION) {
1125 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1126 		return (0);
1127 	}
1128 
1129 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
1130 	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
1131 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1132 		return (0);
1133 	}
1134 
1135 	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
1136 	    (nvlist_t *)auth) != 0) {
1137 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1138 		return (0);
1139 	}
1140 
1141 	return (1);
1142 }
1143 
1144 void
1145 fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
1146     nvlist_t *snvl, int npairs, ...)
1147 {
1148 	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
1149 	nvlist_t *pairs[HC_MAXPAIRS];
1150 	va_list ap;
1151 	int i;
1152 
1153 	if (!fm_fmri_hc_set_common(fmri, version, auth))
1154 		return;
1155 
1156 	npairs = MIN(npairs, HC_MAXPAIRS);
1157 
1158 	va_start(ap, npairs);
1159 	for (i = 0; i < npairs; i++) {
1160 		const char *name = va_arg(ap, const char *);
1161 		uint32_t id = va_arg(ap, uint32_t);
1162 		char idstr[11];
1163 
1164 		(void) snprintf(idstr, sizeof (idstr), "%u", id);
1165 
1166 		pairs[i] = fm_nvlist_create(nva);
1167 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
1168 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
1169 			atomic_inc_64(
1170 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1171 		}
1172 	}
1173 	va_end(ap);
1174 
1175 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
1176 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1177 
1178 	for (i = 0; i < npairs; i++)
1179 		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
1180 
1181 	if (snvl != NULL) {
1182 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
1183 			atomic_inc_64(
1184 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1185 		}
1186 	}
1187 }
1188 
1189 void
1190 fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
1191     nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
1192 {
1193 	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
1194 	nvlist_t *pairs[HC_MAXPAIRS];
1195 	nvlist_t **hcl;
1196 	uint_t n;
1197 	int i, j;
1198 	va_list ap;
1199 	char *hcname, *hcid;
1200 
1201 	if (!fm_fmri_hc_set_common(fmri, version, auth))
1202 		return;
1203 
1204 	/*
1205 	 * copy the bboard nvpairs to the pairs array
1206 	 */
1207 	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
1208 	    != 0) {
1209 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1210 		return;
1211 	}
1212 
1213 	for (i = 0; i < n; i++) {
1214 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
1215 		    &hcname) != 0) {
1216 			atomic_inc_64(
1217 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1218 			return;
1219 		}
1220 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
1221 			atomic_inc_64(
1222 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1223 			return;
1224 		}
1225 
1226 		pairs[i] = fm_nvlist_create(nva);
1227 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
1228 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
1229 			for (j = 0; j <= i; j++) {
1230 				if (pairs[j] != NULL)
1231 					fm_nvlist_destroy(pairs[j],
1232 					    FM_NVA_RETAIN);
1233 			}
1234 			atomic_inc_64(
1235 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1236 			return;
1237 		}
1238 	}
1239 
1240 	/*
1241 	 * create the pairs from passed in pairs
1242 	 */
1243 	npairs = MIN(npairs, HC_MAXPAIRS);
1244 
1245 	va_start(ap, npairs);
1246 	for (i = n; i < npairs + n; i++) {
1247 		const char *name = va_arg(ap, const char *);
1248 		uint32_t id = va_arg(ap, uint32_t);
1249 		char idstr[11];
1250 		(void) snprintf(idstr, sizeof (idstr), "%u", id);
1251 		pairs[i] = fm_nvlist_create(nva);
1252 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
1253 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
1254 			for (j = 0; j <= i; j++) {
1255 				if (pairs[j] != NULL)
1256 					fm_nvlist_destroy(pairs[j],
1257 					    FM_NVA_RETAIN);
1258 			}
1259 			atomic_inc_64(
1260 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1261 			return;
1262 		}
1263 	}
1264 	va_end(ap);
1265 
1266 	/*
1267 	 * Create the fmri hc list
1268 	 */
1269 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
1270 	    npairs + n) != 0) {
1271 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1272 		return;
1273 	}
1274 
1275 	for (i = 0; i < npairs + n; i++) {
1276 			fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
1277 	}
1278 
1279 	if (snvl != NULL) {
1280 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
1281 			atomic_inc_64(
1282 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1283 			return;
1284 		}
1285 	}
1286 }
1287 
1288 /*
1289  * Set-up and validate the members of an dev fmri according to:
1290  *
1291  *	Member name		Type		Value
1292  *	====================================================
1293  *	version			uint8_t		0
1294  *	auth			nvlist_t	<auth>
1295  *	devpath			string		<devpath>
1296  *	[devid]			string		<devid>
1297  *	[target-port-l0id]	string		<target-port-lun0-id>
1298  *
1299  * Note that auth and devid are optional members.
1300  */
1301 void
1302 fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
1303     const char *devpath, const char *devid, const char *tpl0)
1304 {
1305 	int err = 0;
1306 
1307 	if (version != DEV_SCHEME_VERSION0) {
1308 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1309 		return;
1310 	}
1311 
1312 	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
1313 	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
1314 
1315 	if (auth != NULL) {
1316 		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
1317 		    (nvlist_t *)auth);
1318 	}
1319 
1320 	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
1321 
1322 	if (devid != NULL)
1323 		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
1324 
1325 	if (tpl0 != NULL)
1326 		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
1327 
1328 	if (err)
1329 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1330 
1331 }
1332 
1333 /*
1334  * Set-up and validate the members of an cpu fmri according to:
1335  *
1336  *	Member name		Type		Value
1337  *	====================================================
1338  *	version			uint8_t		0
1339  *	auth			nvlist_t	<auth>
1340  *	cpuid			uint32_t	<cpu_id>
1341  *	cpumask			uint8_t		<cpu_mask>
1342  *	serial			uint64_t	<serial_id>
1343  *
1344  * Note that auth, cpumask, serial are optional members.
1345  *
1346  */
1347 void
1348 fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
1349     uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
1350 {
1351 	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
1352 
1353 	if (version < CPU_SCHEME_VERSION1) {
1354 		atomic_inc_64(failedp);
1355 		return;
1356 	}
1357 
1358 	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
1359 		atomic_inc_64(failedp);
1360 		return;
1361 	}
1362 
1363 	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
1364 	    FM_FMRI_SCHEME_CPU) != 0) {
1365 		atomic_inc_64(failedp);
1366 		return;
1367 	}
1368 
1369 	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
1370 	    (nvlist_t *)auth) != 0)
1371 		atomic_inc_64(failedp);
1372 
1373 	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
1374 		atomic_inc_64(failedp);
1375 
1376 	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
1377 	    *cpu_maskp) != 0)
1378 		atomic_inc_64(failedp);
1379 
1380 	if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
1381 	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
1382 			atomic_inc_64(failedp);
1383 }
1384 
1385 /*
1386  * Set-up and validate the members of a mem according to:
1387  *
1388  *	Member name		Type		Value
1389  *	====================================================
1390  *	version			uint8_t		0
1391  *	auth			nvlist_t	<auth>		[optional]
1392  *	unum			string		<unum>
1393  *	serial			string		<serial>	[optional*]
1394  *	offset			uint64_t	<offset>	[optional]
1395  *
1396  *	* serial is required if offset is present
1397  */
1398 void
1399 fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
1400     const char *unum, const char *serial, uint64_t offset)
1401 {
1402 	if (version != MEM_SCHEME_VERSION0) {
1403 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1404 		return;
1405 	}
1406 
1407 	if (!serial && (offset != (uint64_t)-1)) {
1408 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1409 		return;
1410 	}
1411 
1412 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
1413 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1414 		return;
1415 	}
1416 
1417 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
1418 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1419 		return;
1420 	}
1421 
1422 	if (auth != NULL) {
1423 		if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
1424 		    (nvlist_t *)auth) != 0) {
1425 			atomic_inc_64(
1426 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1427 		}
1428 	}
1429 
1430 	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
1431 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1432 	}
1433 
1434 	if (serial != NULL) {
1435 		if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
1436 		    (char **)&serial, 1) != 0) {
1437 			atomic_inc_64(
1438 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1439 		}
1440 		if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
1441 		    FM_FMRI_MEM_OFFSET, offset) != 0) {
1442 			atomic_inc_64(
1443 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1444 		}
1445 	}
1446 }
1447 
1448 void
1449 fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
1450     uint64_t vdev_guid)
1451 {
1452 	if (version != ZFS_SCHEME_VERSION0) {
1453 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1454 		return;
1455 	}
1456 
1457 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
1458 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1459 		return;
1460 	}
1461 
1462 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
1463 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1464 		return;
1465 	}
1466 
1467 	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
1468 		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
1469 	}
1470 
1471 	if (vdev_guid != 0) {
1472 		if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
1473 			atomic_inc_64(
1474 			    &erpt_kstat_data.fmri_set_failed.value.ui64);
1475 		}
1476 	}
1477 }
1478 
1479 uint64_t
1480 fm_ena_increment(uint64_t ena)
1481 {
1482 	uint64_t new_ena;
1483 
1484 	switch (ENA_FORMAT(ena)) {
1485 	case FM_ENA_FMT1:
1486 		new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
1487 		break;
1488 	case FM_ENA_FMT2:
1489 		new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
1490 		break;
1491 	default:
1492 		new_ena = 0;
1493 	}
1494 
1495 	return (new_ena);
1496 }
1497 
1498 uint64_t
1499 fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
1500 {
1501 	uint64_t ena = 0;
1502 
1503 	switch (format) {
1504 	case FM_ENA_FMT1:
1505 		if (timestamp) {
1506 			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
1507 			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
1508 			    ENA_FMT1_CPUID_MASK) |
1509 			    ((timestamp << ENA_FMT1_TIME_SHFT) &
1510 			    ENA_FMT1_TIME_MASK));
1511 		} else {
1512 			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
1513 			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
1514 			    ENA_FMT1_CPUID_MASK) |
1515 			    ((gethrtime() << ENA_FMT1_TIME_SHFT) &
1516 			    ENA_FMT1_TIME_MASK));
1517 		}
1518 		break;
1519 	case FM_ENA_FMT2:
1520 		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
1521 		    ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
1522 		break;
1523 	default:
1524 		break;
1525 	}
1526 
1527 	return (ena);
1528 }
1529 
1530 uint64_t
1531 fm_ena_generate(uint64_t timestamp, uchar_t format)
1532 {
1533 	uint64_t ena;
1534 
1535 	kpreempt_disable();
1536 	ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
1537 	kpreempt_enable();
1538 
1539 	return (ena);
1540 }
1541 
1542 uint64_t
1543 fm_ena_generation_get(uint64_t ena)
1544 {
1545 	uint64_t gen;
1546 
1547 	switch (ENA_FORMAT(ena)) {
1548 	case FM_ENA_FMT1:
1549 		gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
1550 		break;
1551 	case FM_ENA_FMT2:
1552 		gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
1553 		break;
1554 	default:
1555 		gen = 0;
1556 		break;
1557 	}
1558 
1559 	return (gen);
1560 }
1561 
1562 uchar_t
1563 fm_ena_format_get(uint64_t ena)
1564 {
1565 
1566 	return (ENA_FORMAT(ena));
1567 }
1568 
1569 uint64_t
1570 fm_ena_id_get(uint64_t ena)
1571 {
1572 	uint64_t id;
1573 
1574 	switch (ENA_FORMAT(ena)) {
1575 	case FM_ENA_FMT1:
1576 		id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
1577 		break;
1578 	case FM_ENA_FMT2:
1579 		id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
1580 		break;
1581 	default:
1582 		id = 0;
1583 	}
1584 
1585 	return (id);
1586 }
1587 
1588 uint64_t
1589 fm_ena_time_get(uint64_t ena)
1590 {
1591 	uint64_t time;
1592 
1593 	switch (ENA_FORMAT(ena)) {
1594 	case FM_ENA_FMT1:
1595 		time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
1596 		break;
1597 	case FM_ENA_FMT2:
1598 		time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
1599 		break;
1600 	default:
1601 		time = 0;
1602 	}
1603 
1604 	return (time);
1605 }
1606 
1607 #ifdef _KERNEL
1608 /*
1609  * Helper function to increment ereport dropped count.  Used by the event
1610  * rate limiting code to give feedback to the user about how many events were
1611  * rate limited by including them in the 'dropped' count.
1612  */
1613 void
1614 fm_erpt_dropped_increment(void)
1615 {
1616 	atomic_inc_64(&ratelimit_dropped);
1617 }
1618 
1619 void
1620 fm_init(void)
1621 {
1622 	zevent_len_cur = 0;
1623 	zevent_flags = 0;
1624 
1625 	if (zfs_zevent_len_max == 0)
1626 		zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
1627 
1628 	/* Initialize zevent allocation and generation kstats */
1629 	fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED,
1630 	    sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
1631 	    KSTAT_FLAG_VIRTUAL);
1632 
1633 	if (fm_ksp != NULL) {
1634 		fm_ksp->ks_data = &erpt_kstat_data;
1635 		kstat_install(fm_ksp);
1636 	} else {
1637 		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
1638 	}
1639 
1640 	mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL);
1641 	list_create(&zevent_list, sizeof (zevent_t),
1642 	    offsetof(zevent_t, ev_node));
1643 	cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
1644 
1645 	zfs_ereport_init();
1646 }
1647 
1648 void
1649 fm_fini(void)
1650 {
1651 	int count;
1652 
1653 	zfs_ereport_fini();
1654 
1655 	zfs_zevent_drain_all(&count);
1656 
1657 	mutex_enter(&zevent_lock);
1658 	cv_broadcast(&zevent_cv);
1659 
1660 	zevent_flags |= ZEVENT_SHUTDOWN;
1661 	while (zevent_waiters > 0) {
1662 		mutex_exit(&zevent_lock);
1663 		schedule();
1664 		mutex_enter(&zevent_lock);
1665 	}
1666 	mutex_exit(&zevent_lock);
1667 
1668 	cv_destroy(&zevent_cv);
1669 	list_destroy(&zevent_list);
1670 	mutex_destroy(&zevent_lock);
1671 
1672 	if (fm_ksp != NULL) {
1673 		kstat_delete(fm_ksp);
1674 		fm_ksp = NULL;
1675 	}
1676 }
1677 #endif /* _KERNEL */
1678 
/*
 * Tunables declared through ZFS_MODULE_PARAM, each as a read-write
 * (ZMOD_RW) INT with the "zfs_zevent_" prefix.
 */

/* len_max: fm_init() replaces a value of 0 with a computed default. */
ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
	"Max event queue length");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW,
	"Max event column width");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW,
	"Log events to the console");
1687