1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <fm/fmd_api.h> 27 #include <fm/libtopo.h> 28 #include <fm/topo_hc.h> 29 #include <fm/topo_mod.h> 30 #include <fm/topo_method.h> 31 32 #include <sys/fm/protocol.h> 33 #include <sys/systeminfo.h> 34 35 #include <string.h> 36 37 #define ST_EREPORT_CLASS "ereport.sensor.failure" 38 39 typedef struct sensor_fault { 40 struct sensor_fault *sf_next; 41 char *sf_fru; 42 uint32_t sf_num_fails; 43 boolean_t sf_last_faulted; 44 boolean_t sf_faulted; 45 boolean_t sf_unknown; 46 } sensor_fault_t; 47 48 typedef struct sensor_transport { 49 fmd_hdl_t *st_hdl; 50 fmd_xprt_t *st_xprt; 51 hrtime_t st_interval; 52 id_t st_timer; 53 sensor_fault_t *st_faults; 54 boolean_t st_first; 55 /* 56 * The number of consecutive sensor readings indicating failure that 57 * we'll tolerate before sending an ereport. 58 */ 59 uint32_t st_tolerance; 60 } sensor_transport_t; 61 62 typedef struct st_stats { 63 fmd_stat_t st_bad_fmri; 64 fmd_stat_t st_topo_errs; 65 fmd_stat_t st_repairs; 66 } st_stats_t; 67 68 st_stats_t st_stats = { 69 { "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" }, 70 { "topo_errors", FMD_TYPE_UINT64, "errors walking topology" }, 71 { "repairs", FMD_TYPE_UINT64, "auto repairs" } 72 }; 73 74 static int 75 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg) 76 { 77 sensor_transport_t *stp = arg; 78 fmd_hdl_t *hdl = stp->st_hdl; 79 const char *name = topo_node_name(node); 80 nvlist_t *nvl, *props, *rsrc, *fru; 81 char *fmri; 82 int err, ret; 83 int32_t last_source, source = -1; 84 boolean_t nonrecov, faulted, predictive, source_diff; 85 nvpair_t *nvp; 86 uint64_t ena; 87 nvlist_t *event; 88 sensor_fault_t *sfp, **current; 89 90 if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0) 91 return (0); 92 93 if (topo_node_resource(node, &rsrc, NULL) != 0) { 94 st_stats.st_bad_fmri.fmds_value.ui64++; 95 return (0); 96 } 97 98 /* 99 * If the resource isn't present, don't bother invoking the sensor 100 * failure method. It may be that the sensors aren't part of the same 101 * physical FRU and will report failure if the FRU is no longer there. 102 */ 103 if ((ret = topo_fmri_present(thp, rsrc, &err)) < 0) { 104 fmd_hdl_debug(hdl, "topo_fmri_present() failed for %s=%d", 105 name, topo_node_instance(node)); 106 nvlist_free(rsrc); 107 return (0); 108 } 109 110 if (!ret) { 111 fmd_hdl_debug(hdl, "%s=%d is not present, ignoring", 112 name, topo_node_instance(node)); 113 nvlist_free(rsrc); 114 return (0); 115 } 116 117 if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE, 118 TOPO_METH_SENSOR_FAILURE_VERSION, NULL, &nvl, &err) != 0) { 119 if (err == ETOPO_METHOD_NOTSUP) { 120 fmd_hdl_debug(hdl, "Method %s not supported on %s=%d", 121 TOPO_METH_SENSOR_FAILURE, name, 122 topo_node_instance(node)); 123 nvlist_free(rsrc); 124 return (0); 125 } 126 nvl = NULL; 127 } 128 129 if (topo_node_fru(node, &fru, NULL, NULL) != 0) { 130 st_stats.st_bad_fmri.fmds_value.ui64++; 131 nvlist_free(nvl); 132 nvlist_free(rsrc); 133 return (0); 134 } 135 136 if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { 137 st_stats.st_bad_fmri.fmds_value.ui64++; 138 nvlist_free(nvl); 139 nvlist_free(fru); 140 nvlist_free(rsrc); 141 return (0); 142 } 143 144 nvlist_free(fru); 145 146 faulted = nonrecov = source_diff = B_FALSE; 147 predictive = B_TRUE; 148 if (nvl != NULL) { 149 nvp = NULL; 150 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 151 if (nvpair_value_nvlist(nvp, &props) != 0) 152 continue; 153 154 faulted = B_TRUE; 155 156 /* 157 * We need some simple rules to handle the case where 158 * there are multiple facility nodes that indicate 159 * a problem with this FRU, but disagree on the values 160 * of nonrecov, predictive or source: 161 * 162 * 1) nonrecov will be set to true if one or more 163 * facility nodes indicates true. Otherwise it will 164 * default to false 165 * 166 * 2) predictive will default to false and remain false 167 * if one or more facility nodes indicate false. 168 * 169 * 3) source will be set to unknown unless all facility 170 * nodes agree on the source 171 */ 172 if (nonrecov == B_FALSE) 173 if (nvlist_lookup_boolean_value(props, 174 "nonrecov", &nonrecov) != 0) 175 nonrecov = B_FALSE; 176 if (predictive == B_TRUE) 177 if (nvlist_lookup_boolean_value(props, 178 "predictive", &predictive) != 0) 179 predictive = B_FALSE; 180 181 last_source = source; 182 if (nvlist_lookup_uint32(props, "source", 183 (uint32_t *)&source) != 0) 184 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 185 if (last_source != -1 && last_source != source) 186 source_diff = B_TRUE; 187 } 188 if (source_diff) 189 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 190 } 191 192 /* 193 * See if we know about this fru. 194 */ 195 for (current = &stp->st_faults; *current != NULL; 196 current = &(*current)->sf_next) { 197 if (topo_fmri_strcmp(thp, fmri, 198 (*current)->sf_fru)) 199 break; 200 } 201 202 sfp = *current; 203 if (sfp == NULL) { 204 /* 205 * We add this FRU to our list under two circumstances: 206 * 207 * 1. This FRU is faulted and needs to be remembered to 208 * avoid duplicate ereports. 209 * 210 * 2. This is the initial pass, and we want to repair the 211 * FRU in case it was repaired while we were offline. 212 */ 213 if (stp->st_first || faulted) { 214 sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t), 215 FMD_SLEEP); 216 sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP); 217 sfp->sf_next = stp->st_faults; 218 stp->st_faults = sfp; 219 } else { 220 goto out; 221 } 222 } 223 224 if (faulted) 225 sfp->sf_num_fails++; 226 227 if (nvl == NULL) 228 sfp->sf_unknown = B_TRUE; 229 230 if (faulted) { 231 /* 232 * Construct and post the ereport. 233 * 234 * XXFM we only post one ereport per fru. It should be possible 235 * to uniquely identify faulty resources instead and post one 236 * per resource, even if they share the same FRU. 237 */ 238 if (!sfp->sf_last_faulted && 239 (sfp->sf_num_fails > stp->st_tolerance)) { 240 ena = fmd_event_ena_create(hdl); 241 event = fmd_nvl_alloc(hdl, FMD_SLEEP); 242 243 (void) nvlist_add_string(event, "type", name); 244 (void) nvlist_add_boolean_value(event, "nonrecov", 245 nonrecov); 246 (void) nvlist_add_boolean_value(event, "predictive", 247 predictive); 248 (void) nvlist_add_uint32(event, "source", 249 (uint32_t)source); 250 (void) nvlist_add_nvlist(event, "details", nvl); 251 (void) nvlist_add_string(event, FM_CLASS, 252 ST_EREPORT_CLASS); 253 (void) nvlist_add_uint8(event, FM_VERSION, 254 FM_EREPORT_VERSION); 255 (void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena); 256 (void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR, 257 rsrc); 258 259 fmd_xprt_post(hdl, stp->st_xprt, event, 0); 260 fmd_hdl_debug(hdl, "posted ereport: %s", 261 ST_EREPORT_CLASS); 262 } 263 264 sfp->sf_faulted = B_TRUE; 265 } 266 267 out: 268 topo_hdl_strfree(thp, fmri); 269 nvlist_free(rsrc); 270 nvlist_free(nvl); 271 return (0); 272 } 273 274 /*ARGSUSED*/ 275 static void 276 st_timeout(fmd_hdl_t *hdl, id_t id, void *data) 277 { 278 sensor_transport_t *stp; 279 sensor_fault_t *sfp, **current; 280 topo_hdl_t *thp; 281 topo_walk_t *twp; 282 int err; 283 284 fmd_hdl_debug(hdl, "timeout: checking topology"); 285 286 stp = fmd_hdl_getspecific(hdl); 287 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 288 289 if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component, 290 stp, &err)) == NULL) { 291 fmd_hdl_topo_rele(hdl, thp); 292 fmd_hdl_error(hdl, "failed to walk topology: %s\n", 293 topo_strerror(err)); 294 st_stats.st_topo_errs.fmds_value.ui64++; 295 return; 296 } 297 298 /* 299 * Initialize values in our internal FRU list for this iteration of 300 * sensor reads. Keep track of whether the FRU was faulted in the 301 * previous pass so we don't send multiple ereports for the same 302 * problem. 303 */ 304 for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) { 305 sfp->sf_unknown = B_FALSE; 306 if (sfp->sf_num_fails > stp->st_tolerance) 307 sfp->sf_last_faulted = sfp->sf_faulted; 308 sfp->sf_faulted = B_FALSE; 309 } 310 311 if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { 312 topo_walk_fini(twp); 313 fmd_hdl_topo_rele(hdl, thp); 314 fmd_hdl_error(hdl, "failed to walk topology\n"); 315 st_stats.st_topo_errs.fmds_value.ui64++; 316 return; 317 } 318 319 /* 320 * Remove any faults that weren't seen in the last pass. 321 */ 322 for (current = &stp->st_faults; *current != NULL; ) { 323 sfp = *current; 324 if (!sfp->sf_faulted && !sfp->sf_unknown) { 325 fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru); 326 fmd_repair_fru(hdl, sfp->sf_fru); 327 st_stats.st_repairs.fmds_value.ui64++; 328 *current = sfp->sf_next; 329 fmd_hdl_strfree(hdl, sfp->sf_fru); 330 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 331 } else { 332 current = &sfp->sf_next; 333 } 334 } 335 336 stp->st_first = B_FALSE; 337 topo_walk_fini(twp); 338 fmd_hdl_topo_rele(hdl, thp); 339 340 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval); 341 } 342 343 static const fmd_prop_t fmd_props[] = { 344 { "interval", FMD_TYPE_TIME, "1min" }, 345 { "tolerance", FMD_TYPE_UINT32, "1" }, 346 { NULL, 0, NULL } 347 }; 348 349 static const fmd_hdl_ops_t fmd_ops = { 350 NULL, /* fmdo_recv */ 351 st_timeout, /* fmdo_timeout */ 352 NULL, /* fmdo_close */ 353 NULL, /* fmdo_stats */ 354 NULL, /* fmdo_gc */ 355 NULL, /* fmdo_send */ 356 NULL /* fmdo_topo */ 357 }; 358 359 static const fmd_hdl_info_t fmd_info = { 360 "Sensor Transport Agent", "1.1", &fmd_ops, fmd_props 361 }; 362 363 void 364 _fmd_init(fmd_hdl_t *hdl) 365 { 366 sensor_transport_t *stp; 367 char buf[SYS_NMLN]; 368 369 /* 370 * The sensor-transport module is currently only supported on x86 371 * platforms. So to avoid unnecessarily wasting cpu cycles on sparc 372 * walking the hc scheme tree every 60 seconds, we'll bail out before 373 * registering the handle. 374 */ 375 if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) || 376 (strcmp(buf, "i386") != 0)) 377 return; 378 379 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 380 return; 381 382 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, 383 sizeof (st_stats) / sizeof (fmd_stat_t), 384 (fmd_stat_t *)&st_stats); 385 386 stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP); 387 stp->st_interval = fmd_prop_get_int64(hdl, "interval"); 388 stp->st_tolerance = fmd_prop_get_int32(hdl, "tolerance"); 389 390 fmd_hdl_setspecific(hdl, stp); 391 392 stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL); 393 stp->st_hdl = hdl; 394 stp->st_first = B_TRUE; 395 396 /* kick off the first asynchronous discovery */ 397 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0); 398 } 399 400 void 401 _fmd_fini(fmd_hdl_t *hdl) 402 { 403 sensor_transport_t *stp; 404 sensor_fault_t *sfp; 405 406 stp = fmd_hdl_getspecific(hdl); 407 if (stp != NULL) { 408 fmd_xprt_close(hdl, stp->st_xprt); 409 410 while ((sfp = stp->st_faults) != NULL) { 411 stp->st_faults = sfp->sf_next; 412 413 fmd_hdl_strfree(hdl, sfp->sf_fru); 414 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 415 } 416 417 fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t)); 418 } 419 } 420