1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Disk Monitor 31 */ 32 #include <sys/types.h> 33 #include <sys/stat.h> 34 #include <fcntl.h> 35 #include <time.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <strings.h> 39 #include <stdarg.h> 40 #include <errno.h> 41 #include <signal.h> 42 #include <unistd.h> 43 #include <pthread.h> 44 #include <libnvpair.h> 45 #include <fm/fmd_api.h> 46 #include <fm/fmd_fmri.h> 47 #include <sys/fm/protocol.h> 48 #include <sys/fm/io/disk.h> 49 #include <fm/libtopo.h> 50 51 #include "disk_monitor.h" 52 #include "hotplug_mgr.h" 53 #include "schg_mgr.h" 54 #include "topo_gather.h" 55 #include "dm_platform.h" 56 57 #define THIS_FMD_MODULE_NAME "disk-monitor" 58 59 static enum disk_init_state { 60 INIT_STATE_NONE = 0, 61 STATE_CHANGE_MGR_INITTED = 2, 62 HOTPLUG_MGR_INITTED = 4 63 } g_init_state = INIT_STATE_NONE; 64 65 typedef enum { 66 LT_SUSPECT, 67 LT_REPAIRED 68 } fm_list_type_t; 69 70 /* 71 * Global verbosity flag -- controls chattiness of debug messages and 72 * warnings. Its value is determined by the fmd property "log-level" 73 * settable in the DE's .conf file. 74 */ 75 log_class_t g_verbose = 0; 76 cfgdata_t *config_data = NULL; 77 fmd_hdl_t *g_fm_hdl = NULL; 78 79 static const fmd_prop_t fmd_props[]; 80 81 static void 82 diskmon_teardown_all(void) 83 { 84 cleanup_hotplug_manager(); 85 cleanup_state_change_manager(config_data); 86 config_fini(); 87 } 88 89 static int 90 count_disks(diskmon_t *disklistp) 91 { 92 int i = 0; 93 94 while (disklistp != NULL) { 95 i++; 96 disklistp = disklistp->next; 97 } 98 99 return (i); 100 } 101 102 static int 103 diskmon_init(void) 104 { 105 /* 106 * Block the generation of state change events (generated by the 107 * hotplug manager thread) here; they will be unblocked after the 108 * state change manager thread is ready to accept state changes 109 * (shortly after it starts). 110 */ 111 block_state_change_events(); 112 113 if (dm_platform_init() != 0) 114 goto cleanup; 115 116 if (init_hotplug_manager() != 0) 117 goto cleanup; 118 else 119 g_init_state |= HOTPLUG_MGR_INITTED; 120 121 if (init_state_change_manager(config_data) != 0) 122 goto cleanup; 123 else 124 g_init_state |= STATE_CHANGE_MGR_INITTED; 125 126 return (E_SUCCESS); 127 128 cleanup: 129 130 unblock_state_change_events(); 131 132 /* 133 * The cleanup order here does matter, due to dependencies between the 134 * managers. 135 */ 136 if (g_init_state & HOTPLUG_MGR_INITTED) 137 cleanup_hotplug_manager(); 138 if (g_init_state & STATE_CHANGE_MGR_INITTED) 139 cleanup_state_change_manager(config_data); 140 dm_platform_fini(); 141 142 return (E_ERROR); 143 } 144 145 static void 146 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl) 147 { 148 const char *action_prop = NULL; 149 const char *action_string; 150 151 /* 152 * The predictive failure action is the activation of the fault 153 * indicator. 154 */ 155 if (fmd_nvl_class_match(hdl, nvl, 156 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP)) 157 action_prop = DISK_PROP_OTEMPACTION; 158 159 if (fmd_nvl_class_match(hdl, nvl, 160 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL)) 161 action_prop = DISK_PROP_STFAILACTION; 162 163 dm_fault_indicator_set(diskp, INDICATOR_ON); 164 165 if (action_prop != NULL && 166 (action_string = dm_prop_lookup(diskp->props, action_prop)) 167 != NULL) { 168 169 if (dm_platform_indicator_execute(action_string) != 0) { 170 log_warn("Fault action `%s' did not successfully " 171 "complete.\n", action_string); 172 } 173 } 174 } 175 176 static void 177 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair) 178 { 179 char *uuid = NULL; 180 nvlist_t **nva; 181 uint_t nvc; 182 diskmon_t *diskp; 183 nvlist_t *fmri; 184 nvlist_t *fltnvl; 185 int err = 0; 186 187 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 188 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 189 &nva, &nvc); 190 if (err != 0) 191 return; 192 193 while (nvc-- != 0) { 194 195 fltnvl = *nva++; 196 197 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) 198 != 0) 199 continue; 200 201 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 202 continue; 203 204 log_msg(MM_MAIN, "Disk %s repaired!\n", 205 diskp->location); 206 207 dm_fault_indicator_set(diskp, INDICATOR_OFF); 208 209 dm_state_change(diskp, HPS_REPAIRED); 210 } 211 212 if (repair) 213 fmd_case_uuresolved(hdl, uuid); 214 215 } 216 217 static void 218 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl) 219 { 220 char *uuid = NULL; 221 nvlist_t **nva; 222 uint_t nvc; 223 diskmon_t *diskp; 224 nvlist_t *fmri; 225 nvlist_t *fltnvl; 226 int err = 0; 227 228 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 229 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 230 &nva, &nvc); 231 if (err != 0) 232 return; 233 234 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 235 236 fltnvl = *nva++; 237 238 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0) 239 continue; 240 241 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 242 continue; 243 244 /* Execute the actions associated with this fault */ 245 dm_fault_execute_actions(hdl, diskp, fltnvl); 246 247 /* 248 * Send a state change event to the state change manager 249 */ 250 dm_state_change(diskp, HPS_FAULTED); 251 } 252 253 if (!fmd_case_uuclosed(hdl, uuid)) { 254 /* Case is closed */ 255 fmd_case_uuclose(hdl, uuid); 256 } 257 } 258 259 /*ARGSUSED*/ 260 static void 261 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 262 { 263 diskmon_t *diskp; 264 nvlist_t *fmri; 265 266 if (g_verbose & MM_MAIN) 267 nvlist_print(stderr, nvl); 268 269 /* 270 * Act on the fault suspect list or repaired list (embedded agent 271 * action). 272 */ 273 if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) { 274 275 diskmon_agent_repair(hdl, nvl, 1); 276 return; 277 278 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) { 279 280 diskmon_agent_repair(hdl, nvl, 0); 281 return; 282 283 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) { 284 285 diskmon_agent_suspect(hdl, nvl); 286 return; 287 } 288 289 /* 290 * If we get any replayed faults, set the diskmon's faulted 291 * flag for the appropriate fault, then change the diskmon's state 292 * to faulted. 293 */ 294 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) { 295 296 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, 297 &fmri) != 0) 298 return; 299 300 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 301 return; 302 303 /* Execute the actions associated with this fault */ 304 dm_fault_execute_actions(hdl, diskp, nvl); 305 306 /* 307 * If the fault wasn't generated by this module, send a 308 * state change event to the state change manager 309 */ 310 dm_state_change(diskp, HPS_FAULTED); 311 return; 312 } 313 } 314 315 static const fmd_hdl_ops_t fmd_ops = { 316 diskmon_recv, /* fmdo_recv */ 317 NULL, /* fmdo_timeout */ 318 NULL, /* fmdo_close */ 319 NULL, /* fmdo_stats */ 320 NULL, /* fmdo_gc */ 321 }; 322 323 static const fmd_prop_t fmd_props[] = { 324 { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" }, 325 { NULL, 0, NULL } 326 }; 327 328 static const fmd_hdl_info_t fmd_info = { 329 "Disk Monitor", 330 DISK_MONITOR_MODULE_VERSION, 331 &fmd_ops, 332 fmd_props 333 }; 334 335 void 336 _fmd_init(fmd_hdl_t *hdl) 337 { 338 fmd_case_t *cp; 339 int disk_count; 340 341 g_fm_hdl = hdl; 342 343 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 344 return; 345 } 346 347 if (config_init()) { 348 log_err("Could not initialize configuration!\n"); 349 fmd_hdl_unregister(hdl); 350 return; 351 } 352 353 if (config_get(hdl, fmd_props)) { 354 config_fini(); 355 log_err("Could not retrieve configuration from libtopo!\n"); 356 fmd_hdl_unregister(hdl); 357 return; 358 } 359 360 /* 361 * If there are no disks to monitor, bail out 362 */ 363 if ((disk_count = count_disks(config_data->disk_list)) == 0) { 364 config_fini(); 365 fmd_hdl_unregister(hdl); 366 return; 367 } 368 369 if (diskmon_init() == E_ERROR) { 370 config_fini(); 371 fmd_hdl_unregister(hdl); 372 return; 373 } 374 375 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count); 376 377 /* 378 * Iterate over all active cases. 379 * Since we automatically solve all cases, these cases must have 380 * had the fault added, but the DE must have been interrupted 381 * before they were solved. 382 */ 383 for (cp = fmd_case_next(hdl, NULL); 384 cp != NULL; cp = fmd_case_next(hdl, cp)) { 385 386 if (!fmd_case_solved(hdl, cp)) 387 fmd_case_solve(hdl, cp); 388 } 389 } 390 391 /*ARGSUSED*/ 392 void 393 _fmd_fini(fmd_hdl_t *hdl) 394 { 395 diskmon_teardown_all(); 396 g_fm_hdl = NULL; 397 } 398