1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * SMF software-diagnosis subsidiary 28 * 29 * We model service instances in maintenance state as a defect diagnosis 30 * in FMA. When an instance transitions to maintenance state the SMF 31 * graph engine publishes an event which we subscribe to here, and diagnose 32 * a corresponding defect. 33 * 34 * We always solve a case immediately after opening it. But we leave the 35 * case close action to the response agent which needs to cache case UUIDs. 36 * So in the normal case, where software-response is loaded and operational, 37 * our cases will transition to CLOSED state moments after we solve them. 38 * But if fmd restarts in the interim or if software-response is not loaded 39 * then our cases may hang around in SOLVED state for a while, which means 40 * we could iterate over them on receipt of new events. But we don't - 41 * we blindly solve a new case for every new maintenance event received, 42 * and leave it to the fmd duplicate detection and history-based diagnosis 43 * logic to do the right thing. 44 * 45 * Our sibling SMF response subsidiary propogates fmadm-initiated repairs 46 * into SMF, and svcadm-initiated clears back into FMA. In both cases 47 * the case is moved on to the RESOLVED state, even if fmd is unable to 48 * verify that the service is out of maintenance state (i.e., no longer 49 * isolated). If the service immediately re-enters maintenance state then 50 * we diagnose a fresh case. The history-based diagnosis changes in fmd 51 * "do the right thing" and avoid throwing away new cases as duplicates 52 * of old ones hanging around in the "resolved but not all usable again" 53 * state. 54 */ 55 56 #include <strings.h> 57 #include <fm/libtopo.h> 58 #include <fm/fmd_fmri.h> 59 60 #include "../../common/sw.h" 61 #include "smf.h" 62 63 static id_t myid; 64 65 static struct { 66 fmd_stat_t swde_smf_diagnosed; 67 fmd_stat_t swde_smf_bad_class; 68 fmd_stat_t swde_smf_no_attr; 69 fmd_stat_t swde_smf_bad_attr; 70 fmd_stat_t swde_smf_bad_fmri; 71 fmd_stat_t swde_smf_no_uuid; 72 fmd_stat_t swde_smf_no_reason_short; 73 fmd_stat_t swde_smf_no_reason_long; 74 fmd_stat_t swde_smf_no_svcname; 75 fmd_stat_t swde_smf_admin_maint_drop; 76 fmd_stat_t swde_smf_bad_nvlist_pack; 77 fmd_stat_t swde_smf_dupuuid; 78 } swde_smf_stats = { 79 { "swde_smf_diagnosed", FMD_TYPE_UINT64, 80 "maintenance state defects published" }, 81 { "swde_smf_bad_class", FMD_TYPE_UINT64, 82 "incorrect event class received" }, 83 { "swde_smf_no_attr", FMD_TYPE_UINT64, 84 "malformed event - missing attr nvlist" }, 85 { "swde_smf_bad_attr", FMD_TYPE_UINT64, 86 "malformed event - invalid attr list" }, 87 { "swde_smf_bad_fmri", FMD_TYPE_UINT64, 88 "malformed event - fmri2str fails" }, 89 { "swde_smf_no_uuid", FMD_TYPE_UINT64, 90 "malformed event - missing uuid" }, 91 { "swde_smf_no_reason_short", FMD_TYPE_UINT64, 92 "SMF transition event had no reason-short" }, 93 { "swde_smf_no_reason_long", FMD_TYPE_UINT64, 94 "SMF transition event had no reason-long" }, 95 { "swde_smf_no_svcname", FMD_TYPE_UINT64, 96 "SMF transition event had no svc-string" }, 97 { "swde_smf_admin_maint_drop", FMD_TYPE_UINT64, 98 "maintenance transitions requested by admin - no diagnosis" }, 99 { "swde_smf_bad_nvlist_pack", FMD_TYPE_UINT64, 100 "failed nvlist_size or nvlist_pack" }, 101 { "swde_smf_dupuuid", FMD_TYPE_UINT64, 102 "duplicate events received" }, 103 }; 104 105 #define SWDE_SMF_CASEDATA_VERS 1 106 107 typedef struct swde_smf_casedata { 108 uint32_t scd_vers; /* must be first member */ 109 size_t scd_nvlbufsz; /* size of following buffer */ 110 /* packed fmri nvlist follows */ 111 } swde_smf_casedata_t; 112 113 #define BUMPSTAT(stat) swde_smf_stats.stat.fmds_value.ui64++ 114 115 /*ARGSUSED*/ 116 void 117 swde_smf_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 118 const char *class, void *arg) 119 { 120 char *rsn = NULL, *rsnl = NULL, *svcname = NULL; 121 nvlist_t *attr, *svcfmri, *defect; 122 swde_smf_casedata_t *cdp; 123 fmd_case_t *cp; 124 char *fmribuf; 125 char *uuid; 126 size_t sz; 127 128 if (!fmd_nvl_class_match(hdl, nvl, TRANCLASS("maintenance"))) { 129 BUMPSTAT(swde_smf_bad_class); 130 return; 131 } 132 133 if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) { 134 BUMPSTAT(swde_smf_no_attr); 135 return; 136 } 137 138 if (nvlist_lookup_string(nvl, FM_IREPORT_UUID, &uuid) != 0) { 139 BUMPSTAT(swde_smf_no_uuid); 140 return; 141 } 142 143 if (nvlist_lookup_nvlist(attr, "svc", &svcfmri) != 0) { 144 BUMPSTAT(swde_smf_bad_attr); 145 return; 146 } 147 148 if (nvlist_lookup_string(attr, "reason-short", &rsn) != 0) { 149 BUMPSTAT(swde_smf_no_reason_short); 150 return; 151 } 152 153 if (nvlist_lookup_string(attr, "reason-long", &rsnl) != 0) { 154 BUMPSTAT(swde_smf_no_reason_long); 155 return; 156 } 157 158 if (nvlist_lookup_string(attr, "svc-string", &svcname) != 0) { 159 BUMPSTAT(swde_smf_no_svcname); 160 return; 161 } 162 163 if (strcmp(rsn, "administrative_request") == 0) { 164 BUMPSTAT(swde_smf_admin_maint_drop); 165 return; 166 } 167 168 /* 169 * Our case checkpoint data, version 1. 170 */ 171 if (nvlist_size(svcfmri, &sz, NV_ENCODE_NATIVE) != 0) { 172 BUMPSTAT(swde_smf_bad_nvlist_pack); 173 return; 174 } 175 cdp = fmd_hdl_zalloc(hdl, sizeof (*cdp) + sz, FMD_SLEEP); 176 cdp->scd_vers = SWDE_SMF_CASEDATA_VERS; 177 fmribuf = (char *)cdp + sizeof (*cdp); 178 cdp->scd_nvlbufsz = sz; 179 (void) nvlist_pack(svcfmri, &fmribuf, &sz, NV_ENCODE_NATIVE, 0); 180 181 /* 182 * Open a case with UUID matching the originating event, and no 183 * associated serialization data. Create a defect and add it to 184 * the case, and link the originating event to the case. This 185 * call will return NULL if a case with the requested UUID already 186 * exists, which would mean we are processing an event twice so 187 * we can discard. 188 */ 189 if ((cp = swde_case_open(hdl, myid, uuid, SWDE_SMF_CASEDATA_VERS, 190 (void *)cdp, sizeof (*cdp) + sz)) == NULL) { 191 BUMPSTAT(swde_smf_dupuuid); 192 fmd_hdl_free(hdl, cdp, sizeof (*cdp) + sz); 193 return; 194 } 195 196 defect = fmd_nvl_create_defect(hdl, SW_SMF_MAINT_DEFECT, 197 100, svcfmri, NULL, svcfmri); 198 if (rsn != NULL) 199 (void) nvlist_add_string(defect, "reason-short", rsn); 200 if (rsnl != NULL) 201 (void) nvlist_add_string(defect, "reason-long", rsnl); 202 if (svcname != NULL) 203 (void) nvlist_add_string(defect, "svc-string", svcname); 204 fmd_case_add_suspect(hdl, cp, defect); 205 fmd_case_add_ereport(hdl, cp, ep); 206 207 /* 208 * Now solve the case, and immediately close it. Although the 209 * resource is already isolated (SMF put it in maintenance state) 210 * we do not immediately close the case here - our sibling response 211 * logic will do that after caching the case UUID. 212 */ 213 fmd_case_solve(hdl, cp); 214 BUMPSTAT(swde_smf_diagnosed); 215 } 216 217 /* 218 * In the normal course of events we keep in sync with SMF through the 219 * maintenance enter/clear events it raises. Even if a maintenance 220 * state is cleared using svcadm while fmd is not running, the event 221 * will pend and be consumed when fmd does start and we'll close the 222 * case (in the response agent). 223 * 224 * But is is possible for discontinuities to produce some confusion: 225 * 226 * - if an instance is in maintenance state (and so shown in svcs -x 227 * and fmadm faulty output) at the time we clone a new boot 228 * environment then when we boot the new BE we can be out of 229 * sync if the instance is cleared when we boot there 230 * 231 * - meddling with /var/fm state - eg manual clear of files there, 232 * or restore of old state 233 * 234 * So as an extra guard we have a case verify function which is called 235 * at fmd restart (module load for software-diagnosis). We must 236 * return 0 to close the case, non-zero to retain it. 237 */ 238 int 239 swde_smf_vrfy(fmd_hdl_t *hdl, fmd_case_t *cp) 240 { 241 swde_smf_casedata_t *cdp; 242 nvlist_t *svcfmri; 243 uint32_t v; 244 int rv; 245 246 cdp = swde_case_data(hdl, cp, &v); 247 248 if (cdp == NULL || v != 1) 249 return (0); /* bad or damaged - just close */ 250 251 if (nvlist_unpack((char *)cdp + sizeof (*cdp), 252 cdp->scd_nvlbufsz, &svcfmri, 0) != 0) 253 return (0); /* ditto */ 254 255 switch (fmd_nvl_fmri_service_state(hdl, svcfmri)) { 256 case FMD_SERVICE_STATE_UNUSABLE: 257 /* 258 * Keep case iff in maintenance state 259 */ 260 rv = 1; 261 break; 262 263 default: 264 /* 265 * Discard the case for all other states - cleared, 266 * service no longer exists, ... whatever. 267 */ 268 rv = 0; 269 break; 270 } 271 272 nvlist_free(svcfmri); 273 return (rv); 274 } 275 276 const struct sw_disp swde_smf_disp[] = { 277 { TRANCLASS("maintenance"), swde_smf_recv, NULL }, 278 { NULL, NULL, NULL } 279 }; 280 281 /*ARGSUSED*/ 282 int 283 swde_smf_init(fmd_hdl_t *hdl, id_t id, const struct sw_disp **dpp, int *nelemp) 284 { 285 myid = id; 286 287 (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (swde_smf_stats) / 288 sizeof (fmd_stat_t), (fmd_stat_t *)&swde_smf_stats); 289 290 fmd_hdl_subscribe(hdl, TRANCLASS("maintenance")); 291 292 *dpp = &swde_smf_disp[0]; 293 *nelemp = sizeof (swde_smf_disp) / sizeof (swde_smf_disp[0]); 294 return (SW_SUB_INIT_SUCCESS); 295 } 296 297 const struct sw_subinfo smf_diag_info = { 298 "smf diagnosis", /* swsub_name */ 299 SW_CASE_SMF, /* swsub_casetype */ 300 swde_smf_init, /* swsub_init */ 301 NULL, /* swsub_fini */ 302 NULL, /* swsub_timeout */ 303 NULL, /* swsub_case_close */ 304 swde_smf_vrfy, /* swsub_case_vrfy */ 305 }; 306