1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * SMF software-response subsidiary 28 */ 29 30 #include <strings.h> 31 #include <fm/libtopo.h> 32 #include <libscf.h> 33 #include <sys/fm/protocol.h> 34 #include <fm/fmd_fmri.h> 35 36 #include "../../common/sw.h" 37 #include "smf.h" 38 39 static struct { 40 fmd_stat_t swrp_smf_repairs; 41 fmd_stat_t swrp_smf_clears; 42 fmd_stat_t swrp_smf_closed; 43 fmd_stat_t swrp_smf_wrongclass; 44 fmd_stat_t swrp_smf_badlist; 45 fmd_stat_t swrp_smf_badresource; 46 fmd_stat_t swrp_smf_badclrevent; 47 fmd_stat_t swrp_smf_noloop; 48 fmd_stat_t swrp_smf_suppressed; 49 fmd_stat_t swrp_smf_cachefull; 50 } swrp_smf_stats = { 51 { "swrp_smf_repairs", FMD_TYPE_UINT64, 52 "repair events received for propogation to SMF" }, 53 { "swrp_smf_clears", FMD_TYPE_UINT64, 54 "notifications from SMF of exiting maint state" }, 55 { "swrp_smf_closed", FMD_TYPE_UINT64, 56 "cases closed" }, 57 { "swrp_smf_wrongclass", FMD_TYPE_UINT64, 58 "unexpected event class received" }, 59 { "swrp_smf_badlist", FMD_TYPE_UINT64, 60 "list event with invalid structure" }, 61 { "swrp_smf_badresource", FMD_TYPE_UINT64, 62 "list.repaired with smf fault but bad svc fmri" }, 63 { "swrp_smf_badclrevent", FMD_TYPE_UINT64, 64 "maint clear event from SMF malformed" }, 65 { "swrp_smf_noloop", FMD_TYPE_UINT64, 66 "avoidance of smf->fmd->smf repairs propogations" }, 67 { "swrp_smf_suppressed", FMD_TYPE_UINT64, 68 "not propogated to smf because no longer in maint" }, 69 { "swrp_smf_cachefull", FMD_TYPE_UINT64, 70 "uuid cache full" }, 71 }; 72 73 #define BUMPSTAT(stat) swrp_smf_stats.stat.fmds_value.ui64++ 74 75 #define CACHE_NENT_INC 16 76 #define CACHE_NENT_MAX 128 77 78 struct smf_uuid_cache_ent { 79 char uuid[37]; 80 char fmristr[90]; 81 uint8_t mark; 82 }; 83 84 #define CACHE_VERSION 1 85 86 struct smf_uuid_cache { 87 uint32_t version; /* Version */ 88 uint32_t nentries; /* Real size of array below */ 89 struct smf_uuid_cache_ent entry[1]; /* Cache entries */ 90 }; 91 92 static struct smf_uuid_cache *uuid_cache; 93 94 #define UUID_CACHE_BUFNAME "uuid_cache" 95 96 static void 97 uuid_cache_grow(fmd_hdl_t *hdl) 98 { 99 struct smf_uuid_cache *newcache; 100 size_t newsz; 101 uint32_t n; 102 103 n = (uuid_cache == NULL ? 0 : uuid_cache->nentries) + CACHE_NENT_INC; 104 newsz = sizeof (struct smf_uuid_cache) + (n - 1) * 105 sizeof (struct smf_uuid_cache_ent); 106 107 newcache = fmd_hdl_zalloc(hdl, newsz, FMD_SLEEP); 108 newcache->version = CACHE_VERSION; 109 newcache->nentries = n; 110 111 if (uuid_cache != NULL) { 112 uint32_t oldn = uuid_cache->nentries; 113 size_t oldsz = sizeof (struct smf_uuid_cache) + 114 (oldn - 1) * sizeof (struct smf_uuid_cache_ent); 115 116 bcopy(&uuid_cache->entry[0], &newcache->entry[0], oldsz); 117 fmd_hdl_free(hdl, uuid_cache, oldsz); 118 fmd_buf_destroy(hdl, NULL, UUID_CACHE_BUFNAME); 119 } 120 121 uuid_cache = newcache; 122 fmd_buf_create(hdl, NULL, UUID_CACHE_BUFNAME, newsz); 123 } 124 125 static void 126 uuid_cache_persist(fmd_hdl_t *hdl) 127 { 128 size_t sz = sizeof (struct smf_uuid_cache) + 129 (uuid_cache->nentries - 1) * sizeof (struct smf_uuid_cache_ent); 130 131 fmd_buf_write(hdl, NULL, UUID_CACHE_BUFNAME, uuid_cache, sz); 132 } 133 134 /* 135 * Garbage-collect the uuid cache. Any cases that are already resolved 136 * we do not need an entry for. If a case is not resolved but the 137 * service involved in that case is no longer in maintenance state 138 * then we've lost sync somehow, so repair the asru (which will 139 * also resolve the case). 140 */ 141 static void 142 uuid_cache_gc(fmd_hdl_t *hdl) 143 { 144 struct smf_uuid_cache_ent *entp; 145 topo_hdl_t *thp = NULL; 146 nvlist_t *svcfmri; 147 char *svcname; 148 int err, i; 149 150 for (i = 0; i < uuid_cache->nentries; i++) { 151 entp = &uuid_cache->entry[i]; 152 153 if (entp->uuid[0] == '\0') 154 continue; 155 156 if (fmd_case_uuisresolved(hdl, entp->uuid)) { 157 bzero(entp->uuid, sizeof (entp->uuid)); 158 bzero(entp->fmristr, sizeof (entp->fmristr)); 159 entp->mark = 0; 160 } else { 161 if (thp == NULL) 162 thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 163 164 if (topo_fmri_str2nvl(thp, entp->fmristr, &svcfmri, 165 &err) != 0) { 166 fmd_hdl_error(hdl, "str2nvl failed for %s\n", 167 entp->fmristr); 168 continue; 169 } 170 171 if (fmd_nvl_fmri_service_state(hdl, svcfmri) != 172 FMD_SERVICE_STATE_UNUSABLE) { 173 svcname = sw_smf_svcfmri2shortstr(hdl, svcfmri); 174 (void) fmd_repair_asru(hdl, entp->fmristr); 175 fmd_hdl_strfree(hdl, svcname); 176 } 177 178 nvlist_free(svcfmri); 179 } 180 } 181 182 if (thp) 183 fmd_hdl_topo_rele(hdl, thp); 184 185 uuid_cache_persist(hdl); 186 } 187 188 static void 189 uuid_cache_restore(fmd_hdl_t *hdl) 190 { 191 size_t sz = fmd_buf_size(hdl, NULL, UUID_CACHE_BUFNAME); 192 193 if (sz == 0) 194 return; 195 196 uuid_cache = fmd_hdl_alloc(hdl, sz, FMD_SLEEP); 197 fmd_buf_read(hdl, NULL, UUID_CACHE_BUFNAME, uuid_cache, sz); 198 199 /* 200 * Garbage collect now, not just for tidiness but also to help 201 * fmd and smf state stay in sync at module startup. 202 */ 203 uuid_cache_gc(hdl); 204 } 205 206 /* 207 * Add the UUID of an SMF maintenance defect case to our cache and 208 * record the associated full svc FMRI string for the case. 209 */ 210 static void 211 swrp_smf_cache_add(fmd_hdl_t *hdl, char *uuid, char *fmristr) 212 { 213 struct smf_uuid_cache_ent *entp = NULL; 214 int gced = 0; 215 int i; 216 217 if (uuid_cache == NULL) 218 uuid_cache_grow(hdl); 219 220 /* 221 * If we somehow already have an entry for this uuid then 222 * return leaving it undisturbed. 223 */ 224 for (i = 0; i < uuid_cache->nentries; i++) { 225 if (strcmp(uuid, uuid_cache->entry[i].uuid) == 0) 226 return; 227 } 228 229 scan: 230 for (i = 0; i < uuid_cache->nentries; i++) { 231 if (uuid_cache->entry[i].uuid[0] == '\0') { 232 entp = &uuid_cache->entry[i]; 233 break; 234 } 235 } 236 237 if (entp == NULL) { 238 uint32_t oldn = uuid_cache->nentries; 239 240 /* 241 * Before growing the cache we try again after first 242 * garbage-collecting the existing cache for any cases 243 * that are confirmed as resolved. 244 */ 245 if (!gced) { 246 uuid_cache_gc(hdl); 247 gced = 1; 248 goto scan; 249 } 250 251 if (oldn < CACHE_NENT_MAX) { 252 uuid_cache_grow(hdl); 253 entp = &uuid_cache->entry[oldn]; 254 } else { 255 BUMPSTAT(swrp_smf_cachefull); 256 return; 257 } 258 } 259 260 (void) strncpy(entp->uuid, uuid, sizeof (entp->uuid)); 261 (void) strncpy(entp->fmristr, fmristr, sizeof (entp->fmristr)); 262 uuid_cache_persist(hdl); 263 } 264 265 /* 266 * Mark cache entry/entries as resolved - if they match in either uuid 267 * (if not NULL) or fmristr (if not NULL) mark as resolved. Return 1 iff 268 * an entry that matched on uuid was already marked, otherwise (entry 269 * matched on either, matched on uuid but not marked, not found). 270 */ 271 static int 272 swrp_smf_cache_mark(fmd_hdl_t *hdl, char *uuid, char *fmristr) 273 { 274 int dirty = 0; 275 int rv = 0; 276 int i; 277 278 if (uuid_cache == NULL) 279 return (0); 280 281 for (i = 0; i < uuid_cache->nentries; i++) { 282 struct smf_uuid_cache_ent *entp = &uuid_cache->entry[i]; 283 284 if (entp->uuid[0] == '\0') 285 continue; 286 287 if (uuid && strcmp(uuid, entp->uuid) == 0) { 288 if (entp->mark) 289 rv = 1; 290 entp->mark = 1; 291 dirty++; 292 } else if (fmristr && strcmp(fmristr, entp->fmristr) == 0) { 293 entp->mark = 1; 294 dirty++; 295 } 296 } 297 298 if (dirty) 299 uuid_cache_persist(hdl); 300 301 return (rv); 302 } 303 304 /* 305 * We will receive list events for cases we are not interested in. Test 306 * that this list has exactly one suspect and that it matches the maintenance 307 * defect. Return the defect to the caller in the second argument, 308 * and the defect resource element in the third arg. 309 */ 310 static int 311 suspect_is_maint_defect(fmd_hdl_t *hdl, nvlist_t *nvl, 312 nvlist_t **defectnvl, nvlist_t **rsrcnvl) 313 { 314 nvlist_t **faults; 315 uint_t nfaults; 316 317 if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 318 &faults, &nfaults) != 0) { 319 BUMPSTAT(swrp_smf_badlist); 320 return (0); 321 } 322 323 if (nfaults != 1 || 324 !fmd_nvl_class_match(hdl, faults[0], SW_SMF_MAINT_DEFECT)) 325 return (0); 326 327 if (nvlist_lookup_nvlist(faults[0], FM_FAULT_RESOURCE, rsrcnvl) != 0) { 328 BUMPSTAT(swrp_smf_badlist); 329 return (0); 330 } 331 332 *defectnvl = faults[0]; 333 334 return (1); 335 } 336 337 /* 338 * Received newly-diagnosed list.suspect events that are for the 339 * maintenane defect we diagnose. Close the case (the resource was already 340 * isolated by SMF) after cachng the case UUID. 341 */ 342 /*ARGSUSED*/ 343 static void 344 swrp_smf_cacheuuid(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 345 const char *class, void *arg) 346 { 347 nvlist_t *defect, *rsrc; 348 char *fmristr, *uuid; 349 350 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0) { 351 BUMPSTAT(swrp_smf_badlist); 352 return; 353 } 354 355 if (!suspect_is_maint_defect(hdl, nvl, &defect, &rsrc)) 356 return; 357 358 if ((fmristr = sw_smf_svcfmri2str(hdl, rsrc)) == NULL) { 359 BUMPSTAT(swrp_smf_badlist); 360 return; 361 } 362 363 swrp_smf_cache_add(hdl, uuid, fmristr); 364 fmd_hdl_strfree(hdl, fmristr); 365 366 if (!fmd_case_uuclosed(hdl, uuid)) { 367 fmd_case_uuclose(hdl, uuid); 368 BUMPSTAT(swrp_smf_closed); 369 } 370 } 371 372 /*ARGSUSED*/ 373 static void 374 swrp_smf2fmd(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 375 const char *class, void *arg) 376 { 377 nvlist_t *attr, *fmri; 378 char *fromstate; 379 char *fmristr; 380 381 if (!fmd_nvl_class_match(hdl, nvl, TRANCLASS("*"))) { 382 BUMPSTAT(swrp_smf_wrongclass); 383 return; 384 } 385 386 if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0 || 387 nvlist_lookup_string(attr, "from-state", &fromstate) != 0) { 388 BUMPSTAT(swrp_smf_badclrevent); 389 return; 390 } 391 392 /* 393 * Filter those not describing a transition out of maintenance. 394 */ 395 if (strcmp(fromstate, "maintenance") != 0) 396 return; 397 398 if (nvlist_lookup_nvlist(attr, "svc", &fmri) != 0) { 399 BUMPSTAT(swrp_smf_badclrevent); 400 return; 401 } 402 403 if ((fmristr = sw_smf_svcfmri2str(hdl, fmri)) == NULL) { 404 BUMPSTAT(swrp_smf_badclrevent); 405 return; 406 } 407 408 /* 409 * Mark any UUID for a case against this service as resolved 410 * in our cache. When we fmd_repair_asru below fmd will emit 411 * a list.repaired as a result, and our handling of that event 412 * must not propogate the repair towards SMF (since the repair 413 * was initiated via SMF itself and not via fmadm). 414 */ 415 (void) swrp_smf_cache_mark(hdl, NULL, fmristr); 416 417 (void) fmd_repair_asru(hdl, fmristr); 418 fmd_hdl_strfree(hdl, fmristr); 419 BUMPSTAT(swrp_smf_clears); 420 } 421 422 /*ARGSUSED*/ 423 static void 424 swrp_fmd2smf(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 425 const char *class, void *arg) 426 { 427 char *fmristr, *shrtfmristr; 428 nvlist_t *defect, *rsrc; 429 char *uuid; 430 int already; 431 432 if (strcmp(class, FM_LIST_REPAIRED_CLASS) != 0) { 433 BUMPSTAT(swrp_smf_wrongclass); 434 return; 435 } 436 437 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0) { 438 BUMPSTAT(swrp_smf_badlist); 439 return; 440 } 441 442 if (!suspect_is_maint_defect(hdl, nvl, &defect, &rsrc)) 443 return; 444 445 if ((fmristr = sw_smf_svcfmri2str(hdl, rsrc)) == NULL) { 446 BUMPSTAT(swrp_smf_badresource); 447 return; 448 } 449 450 already = swrp_smf_cache_mark(hdl, uuid, fmristr); 451 fmd_hdl_strfree(hdl, fmristr); 452 453 /* 454 * If the cache already had a marked entry for this UUID then 455 * this is a list.repaired arising from a SMF-initiated maintenance 456 * clear (propogated with fmd_repair_asru above which then results 457 * in a list.repaired) and so we should not propogate the repair 458 * back towards SMF. But do still force the case to RESOLVED state in 459 * case fmd is unable to confirm the service no longer in maintenance 460 * state (it may have failed again) so that a new case can be opened. 461 */ 462 fmd_case_uuresolved(hdl, uuid); 463 if (already) { 464 BUMPSTAT(swrp_smf_noloop); 465 return; 466 } 467 468 /* 469 * Only propogate to SMF if we can see that service still 470 * in maintenance state. We're not synchronized with SMF 471 * and this state could change at any time, but if we can 472 * see it's not in maintenance state then things are obviously 473 * moving (e.g., external svcadm active) so we don't poke 474 * at SMF otherwise we confuse things or duplicate operations. 475 */ 476 477 if (fmd_nvl_fmri_service_state(hdl, rsrc) == 478 FMD_SERVICE_STATE_UNUSABLE) { 479 shrtfmristr = sw_smf_svcfmri2shortstr(hdl, rsrc); 480 481 if (shrtfmristr != NULL) { 482 (void) smf_restore_instance(shrtfmristr); 483 fmd_hdl_strfree(hdl, shrtfmristr); 484 BUMPSTAT(swrp_smf_repairs); 485 } else { 486 BUMPSTAT(swrp_smf_badresource); 487 } 488 } else { 489 BUMPSTAT(swrp_smf_suppressed); 490 } 491 } 492 493 const struct sw_disp swrp_smf_disp[] = { 494 { TRANCLASS("*"), swrp_smf2fmd, NULL }, 495 { FM_LIST_SUSPECT_CLASS, swrp_smf_cacheuuid, NULL }, 496 { FM_LIST_REPAIRED_CLASS, swrp_fmd2smf, NULL }, 497 { NULL, NULL, NULL } 498 }; 499 500 /*ARGSUSED*/ 501 int 502 swrp_smf_init(fmd_hdl_t *hdl, id_t id, const struct sw_disp **dpp, int *nelemp) 503 { 504 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (swrp_smf_stats) / 505 sizeof (fmd_stat_t), (fmd_stat_t *)&swrp_smf_stats); 506 507 uuid_cache_restore(hdl); 508 509 /* 510 * We need to subscribe to all SMF transition class events because 511 * we need to look inside the payload to see which events indicate 512 * a transition out of maintenance state. 513 */ 514 fmd_hdl_subscribe(hdl, TRANCLASS("*")); 515 516 /* 517 * Subscribe to the defect class diagnosed for maintenance events. 518 * The module will then receive list.suspect events including 519 * these defects, and in our dispatch table above we list routing 520 * for list.suspect. 521 */ 522 fmd_hdl_subscribe(hdl, SW_SMF_MAINT_DEFECT); 523 524 *dpp = &swrp_smf_disp[0]; 525 *nelemp = sizeof (swrp_smf_disp) / sizeof (swrp_smf_disp[0]); 526 return (SW_SUB_INIT_SUCCESS); 527 } 528 529 /*ARGSUSED*/ 530 void 531 swrp_smf_fini(fmd_hdl_t *hdl) 532 { 533 } 534 535 const struct sw_subinfo smf_response_info = { 536 "smf repair", /* swsub_name */ 537 SW_CASE_NONE, /* swsub_casetype */ 538 swrp_smf_init, /* swsub_init */ 539 swrp_smf_fini, /* swsub_fini */ 540 NULL, /* swsub_timeout */ 541 NULL, /* swsub_case_close */ 542 NULL, /* swsub_case_vrfy */ 543 }; 544