1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <fm/fmd_api.h>
27 #include <fm/libtopo.h>
28 #include <fm/topo_hc.h>
29 #include <fm/topo_mod.h>
30 #include <fm/topo_method.h>
31 
32 #include <sys/fm/protocol.h>
33 #include <sys/systeminfo.h>
34 
35 #include <string.h>
36 
/* Event class (FM_CLASS) of every ereport posted by this module. */
#define	ST_EREPORT_CLASS	"ereport.sensor.failure"
38 
39 typedef struct sensor_fault {
40 	struct sensor_fault	*sf_next;
41 	char			*sf_fru;
42 	uint32_t		sf_num_fails;
43 	boolean_t		sf_last_faulted;
44 	boolean_t		sf_faulted;
45 	boolean_t		sf_unknown;
46 } sensor_fault_t;
47 
48 typedef struct sensor_transport {
49 	fmd_hdl_t	*st_hdl;
50 	fmd_xprt_t	*st_xprt;
51 	hrtime_t	st_interval;
52 	id_t		st_timer;
53 	sensor_fault_t	*st_faults;
54 	boolean_t	st_first;
55 	/*
56 	 * The number of consecutive sensor readings indicating failure that
57 	 * we'll tolerate before sending an ereport.
58 	 */
59 	uint32_t	st_tolerance;
60 } sensor_transport_t;
61 
62 typedef struct st_stats {
63 	fmd_stat_t st_bad_fmri;
64 	fmd_stat_t st_topo_errs;
65 	fmd_stat_t st_repairs;
66 } st_stats_t;
67 
68 st_stats_t st_stats = {
69 	{ "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" },
70 	{ "topo_errors", FMD_TYPE_UINT64, "errors walking topology" },
71 	{ "repairs", FMD_TYPE_UINT64, "auto repairs" }
72 };
73 
74 static int
75 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg)
76 {
77 	sensor_transport_t *stp = arg;
78 	fmd_hdl_t *hdl = stp->st_hdl;
79 	const char *name = topo_node_name(node);
80 	nvlist_t *nvl, *props, *rsrc, *fru;
81 	char *fmri;
82 	int err, ret;
83 	int32_t last_source, source = -1;
84 	boolean_t nonrecov, faulted, predictive, source_diff;
85 	nvpair_t *nvp;
86 	uint64_t ena;
87 	nvlist_t *event;
88 	sensor_fault_t *sfp, **current;
89 
90 	if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0)
91 		return (0);
92 
93 	if (topo_node_resource(node, &rsrc, NULL) != 0) {
94 		st_stats.st_bad_fmri.fmds_value.ui64++;
95 		return (0);
96 	}
97 
98 	/*
99 	 * If the resource isn't present, don't bother invoking the sensor
100 	 * failure method.  It may be that the sensors aren't part of the same
101 	 * physical FRU and will report failure if the FRU is no longer there.
102 	 */
103 	if ((ret = topo_fmri_present(thp, rsrc, &err)) < 0) {
104 		fmd_hdl_debug(hdl, "topo_fmri_present() failed for %s=%d",
105 		    name, topo_node_instance(node));
106 		nvlist_free(rsrc);
107 		return (0);
108 	}
109 
110 	if (!ret) {
111 		fmd_hdl_debug(hdl, "%s=%d is not present, ignoring",
112 		    name, topo_node_instance(node));
113 		nvlist_free(rsrc);
114 		return (0);
115 	}
116 
117 	if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE,
118 	    TOPO_METH_SENSOR_FAILURE_VERSION, NULL, &nvl, &err) != 0) {
119 		if (err == ETOPO_METHOD_NOTSUP) {
120 			fmd_hdl_debug(hdl, "Method %s not supported on %s=%d",
121 			    TOPO_METH_SENSOR_FAILURE, name,
122 			    topo_node_instance(node));
123 			nvlist_free(rsrc);
124 			return (0);
125 		}
126 		nvl = NULL;
127 	}
128 
129 	if (topo_node_fru(node, &fru, NULL, NULL) != 0) {
130 		st_stats.st_bad_fmri.fmds_value.ui64++;
131 		nvlist_free(nvl);
132 		nvlist_free(rsrc);
133 		return (0);
134 	}
135 
136 	if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
137 		st_stats.st_bad_fmri.fmds_value.ui64++;
138 		nvlist_free(nvl);
139 		nvlist_free(fru);
140 		nvlist_free(rsrc);
141 		return (0);
142 	}
143 
144 	nvlist_free(fru);
145 
146 	faulted = nonrecov = source_diff = B_FALSE;
147 	predictive = B_TRUE;
148 	if (nvl != NULL)  {
149 		nvp = NULL;
150 		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
151 			if (nvpair_value_nvlist(nvp, &props) != 0)
152 				continue;
153 
154 			faulted = B_TRUE;
155 
156 			/*
157 			 * We need some simple rules to handle the case where
158 			 * there are multiple facility nodes that indicate
159 			 * a problem with this FRU, but disagree on the values
160 			 * of nonrecov, predictive or source:
161 			 *
162 			 * 1) nonrecov will be set to true if one or more
163 			 *   facility nodes indicates true.  Otherwise it will
164 			 *   default to false
165 			 *
166 			 * 2) predictive will default to false and remain false
167 			 *    if one or more facility nodes indicate false.
168 			 *
169 			 * 3) source will be set to unknown unless all facility
170 			 *    nodes agree on the source
171 			 */
172 			if (nonrecov == B_FALSE)
173 				if (nvlist_lookup_boolean_value(props,
174 				    "nonrecov", &nonrecov) != 0)
175 					nonrecov = B_FALSE;
176 			if (predictive == B_TRUE)
177 				if (nvlist_lookup_boolean_value(props,
178 				    "predictive", &predictive) != 0)
179 					predictive = B_FALSE;
180 
181 			last_source = source;
182 			if (nvlist_lookup_uint32(props, "source",
183 			    (uint32_t *)&source) != 0)
184 				source = TOPO_SENSOR_ERRSRC_UNKNOWN;
185 			if (last_source != -1 && last_source != source)
186 				source_diff = B_TRUE;
187 		}
188 		if (source_diff)
189 			source = TOPO_SENSOR_ERRSRC_UNKNOWN;
190 	}
191 
192 	/*
193 	 * See if we know about this fru.
194 	 */
195 	for (current = &stp->st_faults; *current != NULL;
196 	    current = &(*current)->sf_next) {
197 		if (topo_fmri_strcmp(thp, fmri,
198 		    (*current)->sf_fru))
199 			break;
200 	}
201 
202 	sfp = *current;
203 	if (sfp == NULL) {
204 		/*
205 		 * We add this FRU to our list under two circumstances:
206 		 *
207 		 * 	1. This FRU is faulted and needs to be remembered to
208 		 *	   avoid duplicate ereports.
209 		 *
210 		 * 	2. This is the initial pass, and we want to repair the
211 		 *	   FRU in case it was repaired while we were offline.
212 		 */
213 		if (stp->st_first || faulted) {
214 			sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t),
215 			    FMD_SLEEP);
216 			sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP);
217 			sfp->sf_next = stp->st_faults;
218 			stp->st_faults = sfp;
219 		} else {
220 			goto out;
221 		}
222 	}
223 
224 	if (faulted)
225 		sfp->sf_num_fails++;
226 
227 	if (nvl == NULL)
228 		sfp->sf_unknown = B_TRUE;
229 
230 	if (faulted) {
231 		/*
232 		 * Construct and post the ereport.
233 		 *
234 		 * XXFM we only post one ereport per fru.  It should be possible
235 		 * to uniquely identify faulty resources instead and post one
236 		 * per resource, even if they share the same FRU.
237 		 */
238 		if (!sfp->sf_last_faulted &&
239 		    (sfp->sf_num_fails > stp->st_tolerance)) {
240 			ena = fmd_event_ena_create(hdl);
241 			event = fmd_nvl_alloc(hdl, FMD_SLEEP);
242 
243 			(void) nvlist_add_string(event, "type", name);
244 			(void) nvlist_add_boolean_value(event, "nonrecov",
245 			    nonrecov);
246 			(void) nvlist_add_boolean_value(event, "predictive",
247 			    predictive);
248 			(void) nvlist_add_uint32(event, "source",
249 			    (uint32_t)source);
250 			(void) nvlist_add_nvlist(event, "details", nvl);
251 			(void) nvlist_add_string(event, FM_CLASS,
252 			    ST_EREPORT_CLASS);
253 			(void) nvlist_add_uint8(event, FM_VERSION,
254 			    FM_EREPORT_VERSION);
255 			(void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena);
256 			(void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR,
257 			    rsrc);
258 
259 			fmd_xprt_post(hdl, stp->st_xprt, event, 0);
260 			fmd_hdl_debug(hdl, "posted ereport: %s",
261 			    ST_EREPORT_CLASS);
262 		}
263 
264 		sfp->sf_faulted = B_TRUE;
265 	}
266 
267 out:
268 	topo_hdl_strfree(thp, fmri);
269 	nvlist_free(rsrc);
270 	nvlist_free(nvl);
271 	return (0);
272 }
273 
274 /*ARGSUSED*/
275 static void
276 st_timeout(fmd_hdl_t *hdl, id_t id, void *data)
277 {
278 	sensor_transport_t *stp;
279 	sensor_fault_t *sfp, **current;
280 	topo_hdl_t *thp;
281 	topo_walk_t *twp;
282 	int err;
283 
284 	fmd_hdl_debug(hdl, "timeout: checking topology");
285 
286 	stp = fmd_hdl_getspecific(hdl);
287 	thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
288 
289 	if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component,
290 	    stp, &err)) == NULL) {
291 		fmd_hdl_topo_rele(hdl, thp);
292 		fmd_hdl_error(hdl, "failed to walk topology: %s\n",
293 		    topo_strerror(err));
294 		st_stats.st_topo_errs.fmds_value.ui64++;
295 		return;
296 	}
297 
298 	/*
299 	 * Initialize values in our internal FRU list for this iteration of
300 	 * sensor reads.  Keep track of whether the FRU was faulted in the
301 	 * previous pass so we don't send multiple ereports for the same
302 	 * problem.
303 	 */
304 	for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) {
305 		sfp->sf_unknown = B_FALSE;
306 		if (sfp->sf_num_fails > stp->st_tolerance)
307 			sfp->sf_last_faulted = sfp->sf_faulted;
308 		sfp->sf_faulted = B_FALSE;
309 	}
310 
311 	if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
312 		topo_walk_fini(twp);
313 		fmd_hdl_topo_rele(hdl, thp);
314 		fmd_hdl_error(hdl, "failed to walk topology\n");
315 		st_stats.st_topo_errs.fmds_value.ui64++;
316 		return;
317 	}
318 
319 	/*
320 	 * Remove any faults that weren't seen in the last pass.
321 	 */
322 	for (current = &stp->st_faults; *current != NULL; ) {
323 		sfp = *current;
324 		if (!sfp->sf_faulted && !sfp->sf_unknown) {
325 			fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru);
326 			fmd_repair_fru(hdl, sfp->sf_fru);
327 			st_stats.st_repairs.fmds_value.ui64++;
328 			*current = sfp->sf_next;
329 			fmd_hdl_strfree(hdl, sfp->sf_fru);
330 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
331 		} else {
332 			current = &sfp->sf_next;
333 		}
334 	}
335 
336 	stp->st_first = B_FALSE;
337 	topo_walk_fini(twp);
338 	fmd_hdl_topo_rele(hdl, thp);
339 
340 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval);
341 }
342 
343 static const fmd_prop_t fmd_props[] = {
344 	{ "interval", FMD_TYPE_TIME, "1min" },
345 	{ "tolerance", FMD_TYPE_UINT32, "1" },
346 	{ NULL, 0, NULL }
347 };
348 
349 static const fmd_hdl_ops_t fmd_ops = {
350 	NULL,			/* fmdo_recv */
351 	st_timeout,		/* fmdo_timeout */
352 	NULL, 			/* fmdo_close */
353 	NULL,			/* fmdo_stats */
354 	NULL,			/* fmdo_gc */
355 	NULL,			/* fmdo_send */
356 	NULL			/* fmdo_topo */
357 };
358 
359 static const fmd_hdl_info_t fmd_info = {
360 	"Sensor Transport Agent", "1.1", &fmd_ops, fmd_props
361 };
362 
363 void
364 _fmd_init(fmd_hdl_t *hdl)
365 {
366 	sensor_transport_t *stp;
367 	char buf[SYS_NMLN];
368 
369 	/*
370 	 * The sensor-transport module is currently only supported on x86
371 	 * platforms.  So to avoid unnecessarily wasting cpu cycles on sparc
372 	 * walking the hc scheme tree every 60 seconds, we'll bail out before
373 	 * registering the handle.
374 	 */
375 	if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) ||
376 	    (strcmp(buf, "i386") != 0))
377 		return;
378 
379 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
380 		return;
381 
382 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
383 	    sizeof (st_stats) / sizeof (fmd_stat_t),
384 	    (fmd_stat_t *)&st_stats);
385 
386 	stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP);
387 	stp->st_interval = fmd_prop_get_int64(hdl, "interval");
388 	stp->st_tolerance = fmd_prop_get_int32(hdl, "tolerance");
389 
390 	fmd_hdl_setspecific(hdl, stp);
391 
392 	stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL);
393 	stp->st_hdl = hdl;
394 	stp->st_first = B_TRUE;
395 
396 	/* kick off the first asynchronous discovery */
397 	stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0);
398 }
399 
400 void
401 _fmd_fini(fmd_hdl_t *hdl)
402 {
403 	sensor_transport_t *stp;
404 	sensor_fault_t *sfp;
405 
406 	stp = fmd_hdl_getspecific(hdl);
407 	if (stp != NULL) {
408 		fmd_xprt_close(hdl, stp->st_xprt);
409 
410 		while ((sfp = stp->st_faults) != NULL) {
411 			stp->st_faults = sfp->sf_next;
412 
413 			fmd_hdl_strfree(hdl, sfp->sf_fru);
414 			fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t));
415 		}
416 
417 		fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t));
418 	}
419 }
420