1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Disk Monitor
31  */
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <fcntl.h>
35 #include <time.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <strings.h>
39 #include <stdarg.h>
40 #include <errno.h>
41 #include <signal.h>
42 #include <unistd.h>
43 #include <pthread.h>
44 #include <libnvpair.h>
45 #include <fm/fmd_api.h>
46 #include <fm/fmd_fmri.h>
47 #include <sys/fm/protocol.h>
48 #include <sys/fm/io/disk.h>
49 #include <fm/libtopo.h>
50 
51 #include "disk_monitor.h"
52 #include "hotplug_mgr.h"
53 #include "schg_mgr.h"
54 #include "topo_gather.h"
55 #include "dm_platform.h"
56 
57 #define	THIS_FMD_MODULE_NAME "disk-monitor"
58 
/*
 * Bit flags recording which managers diskmon_init() has brought up, so
 * that its failure path only cleans up the ones that were started.
 */
enum disk_init_state {
	INIT_STATE_NONE			= 0x0,
	STATE_CHANGE_MGR_INITTED	= 0x2,
	HOTPLUG_MGR_INITTED		= 0x4
};

static enum disk_init_state g_init_state = INIT_STATE_NONE;
64 
/*
 * Distinguishes the two list event kinds.  The names suggest
 * "list.suspect" versus "list.repaired"; the type is not referenced
 * elsewhere in this file.
 */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
69 
70 /*
71  * Global verbosity flag -- controls chattiness of debug messages and
72  * warnings.  Its value is determined by the fmd property "log-level"
73  * settable in the DE's .conf file.
74  */
75 log_class_t			g_verbose = 0;
76 cfgdata_t			*config_data = NULL;
77 fmd_hdl_t			*g_fm_hdl = NULL;
78 
79 static const fmd_prop_t		fmd_props[];
80 
81 static void
82 diskmon_teardown_all(void)
83 {
84 	cleanup_hotplug_manager();
85 	cleanup_state_change_manager(config_data);
86 	config_fini();
87 }
88 
89 static int
90 count_disks(diskmon_t *disklistp)
91 {
92 	int i = 0;
93 
94 	while (disklistp != NULL) {
95 		i++;
96 		disklistp = disklistp->next;
97 	}
98 
99 	return (i);
100 }
101 
102 static int
103 diskmon_init(void)
104 {
105 	/*
106 	 * Block the generation of state change events (generated by the
107 	 * hotplug manager thread) here; they will be unblocked after the
108 	 * state change manager thread is ready to accept state changes
109 	 * (shortly after it starts).
110 	 */
111 	block_state_change_events();
112 
113 	if (dm_platform_init() != 0)
114 		goto cleanup;
115 
116 	if (init_hotplug_manager() != 0)
117 		goto cleanup;
118 	else
119 		g_init_state |= HOTPLUG_MGR_INITTED;
120 
121 	if (init_state_change_manager(config_data) != 0)
122 		goto cleanup;
123 	else
124 		g_init_state |= STATE_CHANGE_MGR_INITTED;
125 
126 	return (E_SUCCESS);
127 
128 cleanup:
129 
130 	unblock_state_change_events();
131 
132 	/*
133 	 * The cleanup order here does matter, due to dependencies between the
134 	 * managers.
135 	 */
136 	if (g_init_state & HOTPLUG_MGR_INITTED)
137 		cleanup_hotplug_manager();
138 	if (g_init_state & STATE_CHANGE_MGR_INITTED)
139 		cleanup_state_change_manager(config_data);
140 	dm_platform_fini();
141 
142 	return (E_ERROR);
143 }
144 
145 static void
146 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
147 {
148 	const char		*action_prop = NULL;
149 	const char		*action_string;
150 
151 	/*
152 	 * The predictive failure action is the activation of the fault
153 	 * indicator.
154 	 */
155 	if (fmd_nvl_class_match(hdl, nvl,
156 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
157 		action_prop = DISK_PROP_OTEMPACTION;
158 
159 	if (fmd_nvl_class_match(hdl, nvl,
160 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
161 		action_prop = DISK_PROP_STFAILACTION;
162 
163 	dm_fault_indicator_set(diskp, INDICATOR_ON);
164 
165 	if (action_prop != NULL &&
166 	    (action_string = dm_prop_lookup(diskp->props, action_prop))
167 	    != NULL) {
168 
169 		if (dm_platform_indicator_execute(action_string) != 0) {
170 			log_warn("Fault action `%s' did not successfully "
171 			    "complete.\n", action_string);
172 		}
173 	}
174 }
175 
176 static void
177 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
178 {
179 	char		*uuid = NULL;
180 	nvlist_t	**nva;
181 	uint_t		nvc;
182 	diskmon_t	*diskp;
183 	nvlist_t	*fmri;
184 	nvlist_t	*fltnvl;
185 	int		err = 0;
186 
187 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
188 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
189 	    &nva, &nvc);
190 	if (err != 0)
191 		return;
192 
193 	while (nvc-- != 0) {
194 
195 		fltnvl = *nva++;
196 
197 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
198 		    != 0)
199 			continue;
200 
201 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
202 			continue;
203 
204 		log_msg(MM_MAIN, "Disk %s repaired!\n",
205 		    diskp->location);
206 
207 		dm_fault_indicator_set(diskp, INDICATOR_OFF);
208 
209 		dm_state_change(diskp, HPS_REPAIRED);
210 	}
211 
212 }
213 
214 static void
215 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
216 {
217 	char		*uuid = NULL;
218 	nvlist_t	**nva;
219 	uint_t		nvc;
220 	diskmon_t	*diskp;
221 	nvlist_t	*fmri;
222 	nvlist_t	*fltnvl;
223 	int		err = 0;
224 
225 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
226 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
227 	    &nva, &nvc);
228 	if (err != 0)
229 		return;
230 
231 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
232 
233 		fltnvl = *nva++;
234 
235 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
236 			continue;
237 
238 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
239 			continue;
240 
241 		/* Execute the actions associated with this fault */
242 		dm_fault_execute_actions(hdl, diskp,  fltnvl);
243 
244 		/*
245 		 * Send a state change event to the state change manager
246 		 */
247 		dm_state_change(diskp, HPS_FAULTED);
248 	}
249 
250 	if (!fmd_case_uuclosed(hdl, uuid)) {
251 		/* Case is closed */
252 		fmd_case_uuclose(hdl, uuid);
253 	}
254 }
255 
256 /*ARGSUSED*/
257 static void
258 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
259 {
260 	diskmon_t	*diskp;
261 	nvlist_t	*fmri;
262 
263 	if (g_verbose & MM_MAIN)
264 		nvlist_print(stderr, nvl);
265 
266 	/*
267 	 * Act on the fault suspect list or repaired list (embedded agent
268 	 * action).
269 	 */
270 	if (fmd_nvl_class_match(hdl, nvl, "list.repaired")) {
271 
272 		diskmon_agent_repair(hdl, nvl);
273 		return;
274 
275 	} else if (fmd_nvl_class_match(hdl, nvl, "list.suspect")) {
276 
277 		diskmon_agent_suspect(hdl, nvl);
278 		return;
279 	}
280 
281 	/*
282 	 * If we get any replayed faults, set the diskmon's faulted
283 	 * flag for the appropriate fault, then change the diskmon's state
284 	 * to faulted.
285 	 */
286 	if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
287 
288 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
289 		    &fmri) != 0)
290 			return;
291 
292 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
293 			return;
294 
295 		/* Execute the actions associated with this fault */
296 		dm_fault_execute_actions(hdl, diskp, nvl);
297 
298 		/*
299 		 * If the fault wasn't generated by this module, send a
300 		 * state change event to the state change manager
301 		 */
302 		dm_state_change(diskp, HPS_FAULTED);
303 		return;
304 	}
305 }
306 
307 static const fmd_hdl_ops_t fmd_ops = {
308 	diskmon_recv,	/* fmdo_recv */
309 	NULL,		/* fmdo_timeout */
310 	NULL,		/* fmdo_close */
311 	NULL,		/* fmdo_stats */
312 	NULL,		/* fmdo_gc */
313 };
314 
315 static const fmd_prop_t fmd_props[] = {
316 	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
317 	{ NULL, 0, NULL }
318 };
319 
320 static const fmd_hdl_info_t fmd_info = {
321 	"Disk Monitor",
322 	DISK_MONITOR_MODULE_VERSION,
323 	&fmd_ops,
324 	fmd_props
325 };
326 
327 void
328 _fmd_init(fmd_hdl_t *hdl)
329 {
330 	fmd_case_t	*cp;
331 	int		disk_count;
332 
333 	g_fm_hdl = hdl;
334 
335 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
336 		return;
337 	}
338 
339 	if (config_init()) {
340 		log_err("Could not initialize configuration!\n");
341 		fmd_hdl_unregister(hdl);
342 		return;
343 	}
344 
345 	if (config_get(hdl, fmd_props)) {
346 		config_fini();
347 		log_err("Could not retrieve configuration from libtopo!\n");
348 		fmd_hdl_unregister(hdl);
349 		return;
350 	}
351 
352 	/*
353 	 * If there are no disks to monitor, bail out
354 	 */
355 	if ((disk_count = count_disks(config_data->disk_list)) == 0) {
356 		config_fini();
357 		fmd_hdl_unregister(hdl);
358 		return;
359 	}
360 
361 	if (diskmon_init() == E_ERROR) {
362 		config_fini();
363 		fmd_hdl_unregister(hdl);
364 		return;
365 	}
366 
367 	log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
368 
369 	/*
370 	 * Iterate over all active cases.
371 	 * Since we automatically solve all cases, these cases must have
372 	 * had the fault added, but the DE must have been interrupted
373 	 * before they were solved.
374 	 */
375 	for (cp = fmd_case_next(hdl, NULL);
376 	    cp != NULL; cp = fmd_case_next(hdl, cp)) {
377 
378 		if (!fmd_case_solved(hdl, cp))
379 			fmd_case_solve(hdl, cp);
380 	}
381 }
382 
383 /*ARGSUSED*/
384 void
385 _fmd_fini(fmd_hdl_t *hdl)
386 {
387 	diskmon_teardown_all();
388 	g_fm_hdl = NULL;
389 }
390