1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2018, Joyent, Inc. 24 */ 25 26 #include <fm/fmd_api.h> 27 #include <fm/libtopo.h> 28 #include <fm/topo_hc.h> 29 #include <fm/topo_mod.h> 30 #include <fm/topo_method.h> 31 32 #include <sys/fm/protocol.h> 33 #include <sys/systeminfo.h> 34 35 #include <string.h> 36 37 #define ST_EREPORT_CLASS "ereport.sensor.failure" 38 39 typedef struct sensor_fault { 40 struct sensor_fault *sf_next; 41 char *sf_fru; 42 uint32_t sf_num_fails; 43 boolean_t sf_last_faulted; 44 boolean_t sf_faulted; 45 boolean_t sf_unknown; 46 } sensor_fault_t; 47 48 typedef struct sensor_transport { 49 fmd_hdl_t *st_hdl; 50 fmd_xprt_t *st_xprt; 51 hrtime_t st_interval; 52 id_t st_timer; 53 sensor_fault_t *st_faults; 54 boolean_t st_first; 55 /* 56 * The number of consecutive sensor readings indicating failure that 57 * we'll tolerate before sending an ereport. 58 */ 59 uint32_t st_tolerance; 60 nvlist_t *st_spoofs; 61 } sensor_transport_t; 62 63 typedef struct st_stats { 64 fmd_stat_t st_bad_fmri; 65 fmd_stat_t st_topo_errs; 66 fmd_stat_t st_repairs; 67 } st_stats_t; 68 69 st_stats_t st_stats = { 70 { "bad_fmri", FMD_TYPE_UINT64, "bad or missing resource/FRU FMRI" }, 71 { "topo_errors", FMD_TYPE_UINT64, "errors walking topology" }, 72 { "repairs", FMD_TYPE_UINT64, "auto repairs" } 73 }; 74 75 static int st_check_component_complaints; 76 static int have_complained; 77 static char *spoof_prop = NULL; 78 79 static int 80 st_check_component(topo_hdl_t *thp, tnode_t *node, void *arg) 81 { 82 sensor_transport_t *stp = arg; 83 fmd_hdl_t *hdl = stp->st_hdl; 84 const char *name = topo_node_name(node); 85 nvlist_t *nvl, *props, *rsrc, *fru; 86 char *fmri; 87 int err, ret; 88 int32_t last_source, source = -1; 89 boolean_t nonrecov, faulted, predictive, source_diff, injected; 90 nvpair_t *nvp; 91 uint64_t ena; 92 nvlist_t *event; 93 sensor_fault_t *sfp, **current; 94 95 if (strcmp(name, FAN) != 0 && strcmp(name, PSU) != 0) 96 return (0); 97 98 if (topo_node_resource(node, &rsrc, NULL) != 0) { 99 st_stats.st_bad_fmri.fmds_value.ui64++; 100 return (0); 101 } 102 103 /* 104 * If the resource isn't present, don't bother invoking the sensor 105 * failure method. It may be that the sensors aren't part of the same 106 * physical FRU and will report failure if the FRU is no longer there. 107 */ 108 if ((ret = topo_fmri_present(thp, rsrc, &err)) < 0) { 109 fmd_hdl_debug(hdl, "topo_fmri_present() failed for %s=%d", 110 name, topo_node_instance(node)); 111 nvlist_free(rsrc); 112 return (0); 113 } 114 115 if (!ret) { 116 fmd_hdl_debug(hdl, "%s=%d is not present, ignoring", 117 name, topo_node_instance(node)); 118 nvlist_free(rsrc); 119 return (0); 120 } 121 122 if (topo_method_invoke(node, TOPO_METH_SENSOR_FAILURE, 123 TOPO_METH_SENSOR_FAILURE_VERSION, stp->st_spoofs, &nvl, &err) != 124 0) { 125 if (err == ETOPO_METHOD_NOTSUP) { 126 st_check_component_complaints++; 127 if (!have_complained) { 128 fmd_hdl_debug(hdl, "Method %s not supported " 129 "on %s=%d", TOPO_METH_SENSOR_FAILURE, name, 130 topo_node_instance(node)); 131 } 132 nvlist_free(rsrc); 133 return (0); 134 } 135 nvl = NULL; 136 } 137 138 if (topo_node_fru(node, &fru, NULL, &err) != 0) { 139 st_stats.st_bad_fmri.fmds_value.ui64++; 140 nvlist_free(nvl); 141 nvlist_free(rsrc); 142 return (0); 143 } 144 145 if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { 146 st_stats.st_bad_fmri.fmds_value.ui64++; 147 nvlist_free(nvl); 148 nvlist_free(fru); 149 nvlist_free(rsrc); 150 return (0); 151 } 152 153 nvlist_free(fru); 154 155 faulted = nonrecov = source_diff = injected = B_FALSE; 156 predictive = B_TRUE; 157 if (nvl != NULL) { 158 nvp = NULL; 159 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 160 if (nvpair_value_nvlist(nvp, &props) != 0) 161 continue; 162 163 faulted = B_TRUE; 164 165 /* 166 * We need some simple rules to handle the case where 167 * there are multiple facility nodes that indicate 168 * a problem with this FRU, but disagree on the values 169 * of nonrecov, predictive or source: 170 * 171 * 1) nonrecov will be set to true if one or more 172 * facility nodes indicates true. Otherwise it will 173 * default to false 174 * 175 * 2) predictive will default to false and remain false 176 * if one or more facility nodes indicate false. 177 * 178 * 3) source will be set to unknown unless all facility 179 * nodes agree on the source 180 * 181 * 4) injected defaults to false, but will be set to 182 * true if any of the sensor states were injected. 183 */ 184 if (nonrecov == B_FALSE) 185 if (nvlist_lookup_boolean_value(props, 186 "nonrecov", &nonrecov) != 0) 187 nonrecov = B_FALSE; 188 if (predictive == B_TRUE) 189 if (nvlist_lookup_boolean_value(props, 190 "predictive", &predictive) != 0) 191 predictive = B_FALSE; 192 (void) nvlist_lookup_boolean_value(props, 193 "injected", &injected); 194 195 last_source = source; 196 if (nvlist_lookup_uint32(props, "source", 197 (uint32_t *)&source) != 0) 198 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 199 if (last_source != -1 && last_source != source) 200 source_diff = B_TRUE; 201 } 202 if (source_diff) 203 source = TOPO_SENSOR_ERRSRC_UNKNOWN; 204 } 205 206 /* 207 * See if we know about this fru. 208 */ 209 for (current = &stp->st_faults; *current != NULL; 210 current = &(*current)->sf_next) { 211 if (topo_fmri_strcmp(thp, fmri, 212 (*current)->sf_fru)) 213 break; 214 } 215 216 sfp = *current; 217 if (sfp == NULL) { 218 /* 219 * We add this FRU to our list under two circumstances: 220 * 221 * 1. This FRU is faulted and needs to be remembered to 222 * avoid duplicate ereports. 223 * 224 * 2. This is the initial pass, and we want to repair the 225 * FRU in case it was repaired while we were offline. 226 */ 227 if (stp->st_first || faulted) { 228 sfp = fmd_hdl_zalloc(hdl, sizeof (sensor_fault_t), 229 FMD_SLEEP); 230 sfp->sf_fru = fmd_hdl_strdup(hdl, fmri, FMD_SLEEP); 231 sfp->sf_next = stp->st_faults; 232 stp->st_faults = sfp; 233 } else { 234 goto out; 235 } 236 } 237 238 if (faulted) 239 sfp->sf_num_fails++; 240 241 if (nvl == NULL) 242 sfp->sf_unknown = B_TRUE; 243 244 if (faulted) { 245 /* 246 * Construct and post the ereport. 247 * 248 * XXFM we only post one ereport per fru. It should be possible 249 * to uniquely identify faulty resources instead and post one 250 * per resource, even if they share the same FRU. 251 */ 252 if (!sfp->sf_last_faulted && 253 (sfp->sf_num_fails > stp->st_tolerance)) { 254 ena = fmd_event_ena_create(hdl); 255 event = fmd_nvl_alloc(hdl, FMD_SLEEP); 256 257 (void) nvlist_add_string(event, "type", name); 258 (void) nvlist_add_boolean_value(event, "nonrecov", 259 nonrecov); 260 (void) nvlist_add_boolean_value(event, "predictive", 261 predictive); 262 (void) nvlist_add_uint32(event, "source", 263 (uint32_t)source); 264 (void) nvlist_add_nvlist(event, "details", nvl); 265 (void) nvlist_add_string(event, FM_CLASS, 266 ST_EREPORT_CLASS); 267 (void) nvlist_add_uint8(event, FM_VERSION, 268 FM_EREPORT_VERSION); 269 (void) nvlist_add_uint64(event, FM_EREPORT_ENA, ena); 270 (void) nvlist_add_nvlist(event, FM_EREPORT_DETECTOR, 271 rsrc); 272 (void) nvlist_add_boolean_value(event, "__injected", 273 injected); 274 fmd_xprt_post(hdl, stp->st_xprt, event, 0); 275 fmd_hdl_debug(hdl, "posted ereport: %s", 276 ST_EREPORT_CLASS); 277 } 278 279 sfp->sf_faulted = B_TRUE; 280 } 281 282 out: 283 topo_hdl_strfree(thp, fmri); 284 nvlist_free(rsrc); 285 nvlist_free(nvl); 286 return (0); 287 } 288 289 int st_timeout_verbose = 0; 290 291 /*ARGSUSED*/ 292 static void 293 st_timeout(fmd_hdl_t *hdl, id_t id, void *data) 294 { 295 sensor_transport_t *stp; 296 sensor_fault_t *sfp, **current; 297 topo_hdl_t *thp; 298 topo_walk_t *twp; 299 int err; 300 301 if (st_timeout_verbose) 302 fmd_hdl_debug(hdl, "timeout: checking topology"); 303 304 stp = fmd_hdl_getspecific(hdl); 305 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 306 307 if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, st_check_component, 308 stp, &err)) == NULL) { 309 fmd_hdl_topo_rele(hdl, thp); 310 fmd_hdl_error(hdl, "failed to walk topology: %s\n", 311 topo_strerror(err)); 312 st_stats.st_topo_errs.fmds_value.ui64++; 313 return; 314 } 315 316 if (st_check_component_complaints) 317 have_complained++; 318 319 /* 320 * Initialize values in our internal FRU list for this iteration of 321 * sensor reads. Keep track of whether the FRU was faulted in the 322 * previous pass so we don't send multiple ereports for the same 323 * problem. 324 */ 325 for (sfp = stp->st_faults; sfp != NULL; sfp = sfp->sf_next) { 326 sfp->sf_unknown = B_FALSE; 327 if (sfp->sf_num_fails > stp->st_tolerance) 328 sfp->sf_last_faulted = sfp->sf_faulted; 329 sfp->sf_faulted = B_FALSE; 330 } 331 332 if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { 333 topo_walk_fini(twp); 334 fmd_hdl_topo_rele(hdl, thp); 335 fmd_hdl_error(hdl, "failed to walk topology\n"); 336 st_stats.st_topo_errs.fmds_value.ui64++; 337 return; 338 } 339 340 /* 341 * Remove any faults that weren't seen in the last pass. 342 */ 343 for (current = &stp->st_faults; *current != NULL; ) { 344 sfp = *current; 345 if (!sfp->sf_faulted && !sfp->sf_unknown) { 346 fmd_hdl_debug(hdl, "repairing %s", sfp->sf_fru); 347 fmd_repair_fru(hdl, sfp->sf_fru); 348 st_stats.st_repairs.fmds_value.ui64++; 349 *current = sfp->sf_next; 350 fmd_hdl_strfree(hdl, sfp->sf_fru); 351 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 352 } else { 353 current = &sfp->sf_next; 354 } 355 } 356 357 stp->st_first = B_FALSE; 358 topo_walk_fini(twp); 359 fmd_hdl_topo_rele(hdl, thp); 360 361 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, stp->st_interval); 362 } 363 364 /* 365 * Parse the value of the spoof-sensor-state module property and store the 366 * result in an nvlist of nvlists. The format of the value is 3-tuple, 367 * delimited by colons, as follows: 368 * 369 * FMRIPATTERN:SENSORNAME:SENSORSTATE;... 370 * 371 * where FMRIPATTERN can be a string with wildcards that matches the FMRI 372 * of a node associated with the target sensor facility. 373 * 374 * where SENSORNAME is the node name of the target sensor facility 375 * 376 * where SENSORSTATE is the desired sensor state value to spoof. 377 * 378 * Multiple tuples can be specifed, delimited by semicolons. 379 * 380 * If any errors are encountered while parsing the value, all parsing is 381 * ceased and an ereport will be generated indicating a failure to parse 382 * the value. 383 */ 384 /*ARGSUSED*/ 385 static int 386 parse_spoof_param(fmd_hdl_t *hdl, char *param, sensor_transport_t *stp) 387 { 388 char *sensor, *last_sensor, *field, *last_field; 389 nvlist_t *spoof; 390 391 if (nvlist_alloc(&stp->st_spoofs, NV_UNIQUE_NAME, 0) != 0) { 392 return (-1); 393 } 394 395 sensor = strtok_r(param, ";", &last_sensor); 396 while (sensor != NULL) { 397 if (nvlist_alloc(&spoof, NV_UNIQUE_NAME, 0) != 0) 398 goto err; 399 400 if ((field = strtok_r(sensor, ":", &last_field)) == NULL || 401 nvlist_add_string(spoof, ST_SPOOF_FMRI, field) != 0) 402 goto err; 403 404 if ((field = strtok_r(NULL, ":", &last_field)) == NULL || 405 nvlist_add_string(spoof, ST_SPOOF_SENSOR, field) != 0) 406 goto err; 407 408 if ((field = strtok_r(NULL, ":", &last_field)) == NULL || 409 nvlist_add_uint32(spoof, ST_SPOOF_STATE, 410 strtol(field, NULL, 0)) != 0) 411 goto err; 412 413 if (nvlist_add_nvlist(stp->st_spoofs, sensor, spoof) != 0) 414 goto err; 415 416 spoof = NULL; 417 sensor = strtok_r(NULL, ";", &last_sensor); 418 } 419 420 return (0); 421 err: 422 nvlist_free(spoof); 423 nvlist_free(stp->st_spoofs); 424 stp->st_spoofs = NULL; 425 return (-1); 426 } 427 428 static const fmd_prop_t fmd_props[] = { 429 { "interval", FMD_TYPE_TIME, "1min" }, 430 { "tolerance", FMD_TYPE_UINT32, "1" }, 431 { "spoof_sensor_state", FMD_TYPE_STRING, NULL }, 432 { NULL, 0, NULL } 433 }; 434 435 static const fmd_hdl_ops_t fmd_ops = { 436 NULL, /* fmdo_recv */ 437 st_timeout, /* fmdo_timeout */ 438 NULL, /* fmdo_close */ 439 NULL, /* fmdo_stats */ 440 NULL, /* fmdo_gc */ 441 NULL, /* fmdo_send */ 442 NULL /* fmdo_topo */ 443 }; 444 445 static const fmd_hdl_info_t fmd_info = { 446 "Sensor Transport Agent", "1.1", &fmd_ops, fmd_props 447 }; 448 449 void 450 _fmd_init(fmd_hdl_t *hdl) 451 { 452 sensor_transport_t *stp; 453 char buf[SYS_NMLN]; 454 455 /* 456 * The sensor-transport module is currently only supported on x86 457 * platforms. So to avoid unnecessarily wasting cpu cycles on sparc 458 * walking the hc scheme tree every 60 seconds, we'll bail out before 459 * registering the handle. 460 */ 461 if ((sysinfo(SI_ARCHITECTURE, buf, sizeof (buf)) == -1) || 462 (strcmp(buf, "i386") != 0)) 463 return; 464 465 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 466 return; 467 468 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, 469 sizeof (st_stats) / sizeof (fmd_stat_t), 470 (fmd_stat_t *)&st_stats); 471 472 stp = fmd_hdl_zalloc(hdl, sizeof (sensor_transport_t), FMD_SLEEP); 473 stp->st_interval = fmd_prop_get_int64(hdl, "interval"); 474 stp->st_tolerance = fmd_prop_get_int32(hdl, "tolerance"); 475 spoof_prop = fmd_prop_get_string(hdl, "spoof_sensor_state"); 476 477 if (spoof_prop != NULL && parse_spoof_param(hdl, spoof_prop, stp) != 0) 478 fmd_hdl_error(hdl, "Error parsing config file"); 479 480 fmd_hdl_setspecific(hdl, stp); 481 482 stp->st_xprt = fmd_xprt_open(hdl, FMD_XPRT_RDONLY, NULL, NULL); 483 stp->st_hdl = hdl; 484 stp->st_first = B_TRUE; 485 486 /* kick off the first asynchronous discovery */ 487 stp->st_timer = fmd_timer_install(hdl, NULL, NULL, 0); 488 } 489 490 void 491 _fmd_fini(fmd_hdl_t *hdl) 492 { 493 sensor_transport_t *stp; 494 sensor_fault_t *sfp; 495 496 stp = fmd_hdl_getspecific(hdl); 497 if (stp != NULL) { 498 fmd_xprt_close(hdl, stp->st_xprt); 499 500 while ((sfp = stp->st_faults) != NULL) { 501 stp->st_faults = sfp->sf_next; 502 503 fmd_hdl_strfree(hdl, sfp->sf_fru); 504 fmd_hdl_free(hdl, sfp, sizeof (sensor_fault_t)); 505 } 506 nvlist_free(stp->st_spoofs); 507 fmd_hdl_free(hdl, stp, sizeof (sensor_transport_t)); 508 } 509 fmd_prop_free_string(hdl, spoof_prop); 510 } 511