1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Disk Monitor 31 */ 32 #include <sys/types.h> 33 #include <sys/stat.h> 34 #include <fcntl.h> 35 #include <time.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <strings.h> 39 #include <stdarg.h> 40 #include <errno.h> 41 #include <signal.h> 42 #include <unistd.h> 43 #include <pthread.h> 44 #include <libnvpair.h> 45 #include <fm/fmd_api.h> 46 #include <fm/fmd_fmri.h> 47 #include <sys/fm/protocol.h> 48 #include <sys/fm/io/disk.h> 49 #include <fm/libtopo.h> 50 51 #include "disk_monitor.h" 52 #include "hotplug_mgr.h" 53 #include "schg_mgr.h" 54 #include "topo_gather.h" 55 #include "dm_platform.h" 56 57 #define THIS_FMD_MODULE_NAME "disk-monitor" 58 59 static enum disk_init_state { 60 INIT_STATE_NONE = 0, 61 STATE_CHANGE_MGR_INITTED = 2, 62 HOTPLUG_MGR_INITTED = 4 63 } g_init_state = INIT_STATE_NONE; 64 65 typedef enum { 66 LT_SUSPECT, 67 LT_REPAIRED 68 } fm_list_type_t; 69 70 /* 71 * Global verbosity flag -- controls chattiness of debug messages and 72 * warnings. Its value is determined by the fmd property "log-level" 73 * settable in the DE's .conf file. 74 */ 75 log_class_t g_verbose = 0; 76 cfgdata_t *config_data = NULL; 77 fmd_hdl_t *g_fm_hdl = NULL; 78 79 static const fmd_prop_t fmd_props[]; 80 81 static void 82 diskmon_teardown_all(void) 83 { 84 cleanup_hotplug_manager(); 85 cleanup_state_change_manager(config_data); 86 config_fini(); 87 } 88 89 static int 90 count_disks(diskmon_t *disklistp) 91 { 92 int i = 0; 93 94 while (disklistp != NULL) { 95 i++; 96 disklistp = disklistp->next; 97 } 98 99 return (i); 100 } 101 102 static int 103 diskmon_init(void) 104 { 105 /* 106 * Block the generation of state change events (generated by the 107 * hotplug manager thread) here; they will be unblocked after the 108 * state change manager thread is ready to accept state changes 109 * (shortly after it starts). 110 */ 111 block_state_change_events(); 112 113 if (dm_platform_init() != 0) 114 goto cleanup; 115 116 if (init_hotplug_manager() != 0) 117 goto cleanup; 118 else 119 g_init_state |= HOTPLUG_MGR_INITTED; 120 121 if (init_state_change_manager(config_data) != 0) 122 goto cleanup; 123 else 124 g_init_state |= STATE_CHANGE_MGR_INITTED; 125 126 return (E_SUCCESS); 127 128 cleanup: 129 130 unblock_state_change_events(); 131 132 /* 133 * The cleanup order here does matter, due to dependencies between the 134 * managers. 135 */ 136 if (g_init_state & HOTPLUG_MGR_INITTED) 137 cleanup_hotplug_manager(); 138 if (g_init_state & STATE_CHANGE_MGR_INITTED) 139 cleanup_state_change_manager(config_data); 140 dm_platform_fini(); 141 142 return (E_ERROR); 143 } 144 145 static void 146 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl) 147 { 148 const char *action_prop = NULL; 149 const char *action_string; 150 151 /* 152 * The predictive failure action is the activation of the fault 153 * indicator. 154 */ 155 if (fmd_nvl_class_match(hdl, nvl, 156 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP)) 157 action_prop = DISK_PROP_OTEMPACTION; 158 159 if (fmd_nvl_class_match(hdl, nvl, 160 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL)) 161 action_prop = DISK_PROP_STFAILACTION; 162 163 dm_fault_indicator_set(diskp, INDICATOR_ON); 164 165 if (action_prop != NULL && 166 (action_string = dm_prop_lookup(diskp->props, action_prop)) 167 != NULL) { 168 169 if (dm_platform_indicator_execute(action_string) != 0) { 170 log_warn("Fault action `%s' did not successfully " 171 "complete.\n", action_string); 172 } 173 } 174 } 175 176 static void 177 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl) 178 { 179 char *uuid = NULL; 180 nvlist_t **nva; 181 uint_t nvc; 182 diskmon_t *diskp; 183 nvlist_t *fmri; 184 nvlist_t *fltnvl; 185 int err = 0; 186 187 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 188 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 189 &nva, &nvc); 190 if (err != 0) 191 return; 192 193 while (nvc-- != 0) { 194 195 fltnvl = *nva++; 196 197 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) 198 != 0) 199 continue; 200 201 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 202 continue; 203 204 log_msg(MM_MAIN, "Disk %s repaired!\n", 205 diskp->location); 206 207 dm_fault_indicator_set(diskp, INDICATOR_OFF); 208 209 dm_state_change(diskp, HPS_REPAIRED); 210 } 211 212 } 213 214 static void 215 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl) 216 { 217 char *uuid = NULL; 218 nvlist_t **nva; 219 uint_t nvc; 220 diskmon_t *diskp; 221 nvlist_t *fmri; 222 nvlist_t *fltnvl; 223 int err = 0; 224 225 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 226 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 227 &nva, &nvc); 228 if (err != 0) 229 return; 230 231 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 232 233 fltnvl = *nva++; 234 235 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0) 236 continue; 237 238 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 239 continue; 240 241 /* Execute the actions associated with this fault */ 242 dm_fault_execute_actions(hdl, diskp, fltnvl); 243 244 /* 245 * Send a state change event to the state change manager 246 */ 247 dm_state_change(diskp, HPS_FAULTED); 248 } 249 250 if (!fmd_case_uuclosed(hdl, uuid)) { 251 /* Case is closed */ 252 fmd_case_uuclose(hdl, uuid); 253 } 254 } 255 256 /*ARGSUSED*/ 257 static void 258 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 259 { 260 diskmon_t *diskp; 261 nvlist_t *fmri; 262 263 if (g_verbose & MM_MAIN) 264 nvlist_print(stderr, nvl); 265 266 /* 267 * Act on the fault suspect list or repaired list (embedded agent 268 * action). 269 */ 270 if (fmd_nvl_class_match(hdl, nvl, "list.repaired")) { 271 272 diskmon_agent_repair(hdl, nvl); 273 return; 274 275 } else if (fmd_nvl_class_match(hdl, nvl, "list.suspect")) { 276 277 diskmon_agent_suspect(hdl, nvl); 278 return; 279 } 280 281 /* 282 * If we get any replayed faults, set the diskmon's faulted 283 * flag for the appropriate fault, then change the diskmon's state 284 * to faulted. 285 */ 286 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) { 287 288 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, 289 &fmri) != 0) 290 return; 291 292 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 293 return; 294 295 /* Execute the actions associated with this fault */ 296 dm_fault_execute_actions(hdl, diskp, nvl); 297 298 /* 299 * If the fault wasn't generated by this module, send a 300 * state change event to the state change manager 301 */ 302 dm_state_change(diskp, HPS_FAULTED); 303 return; 304 } 305 } 306 307 static const fmd_hdl_ops_t fmd_ops = { 308 diskmon_recv, /* fmdo_recv */ 309 NULL, /* fmdo_timeout */ 310 NULL, /* fmdo_close */ 311 NULL, /* fmdo_stats */ 312 NULL, /* fmdo_gc */ 313 }; 314 315 static const fmd_prop_t fmd_props[] = { 316 { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" }, 317 { NULL, 0, NULL } 318 }; 319 320 static const fmd_hdl_info_t fmd_info = { 321 "Disk Monitor", 322 DISK_MONITOR_MODULE_VERSION, 323 &fmd_ops, 324 fmd_props 325 }; 326 327 void 328 _fmd_init(fmd_hdl_t *hdl) 329 { 330 fmd_case_t *cp; 331 int disk_count; 332 333 g_fm_hdl = hdl; 334 335 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 336 return; 337 } 338 339 if (config_init()) { 340 log_err("Could not initialize configuration!\n"); 341 fmd_hdl_unregister(hdl); 342 return; 343 } 344 345 if (config_get(hdl, fmd_props)) { 346 config_fini(); 347 log_err("Could not retrieve configuration from libtopo!\n"); 348 fmd_hdl_unregister(hdl); 349 return; 350 } 351 352 /* 353 * If there are no disks to monitor, bail out 354 */ 355 if ((disk_count = count_disks(config_data->disk_list)) == 0) { 356 config_fini(); 357 fmd_hdl_unregister(hdl); 358 return; 359 } 360 361 if (diskmon_init() == E_ERROR) { 362 config_fini(); 363 fmd_hdl_unregister(hdl); 364 return; 365 } 366 367 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count); 368 369 /* 370 * Iterate over all active cases. 371 * Since we automatically solve all cases, these cases must have 372 * had the fault added, but the DE must have been interrupted 373 * before they were solved. 374 */ 375 for (cp = fmd_case_next(hdl, NULL); 376 cp != NULL; cp = fmd_case_next(hdl, cp)) { 377 378 if (!fmd_case_solved(hdl, cp)) 379 fmd_case_solve(hdl, cp); 380 } 381 } 382 383 /*ARGSUSED*/ 384 void 385 _fmd_fini(fmd_hdl_t *hdl) 386 { 387 diskmon_teardown_all(); 388 g_fm_hdl = NULL; 389 } 390