1 /*
2  * Copyright (c) 2010-2012 Red Hat, Inc.
3  *
4  * All rights reserved.
5  *
6  * Author: Angus Salkeld <asalkeld@redhat.com>
7  *
8  * This software licensed under BSD license, the text of which follows:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * - Redistributions of source code must retain the above copyright notice,
14  *   this list of conditions and the following disclaimer.
15  * - Redistributions in binary form must reproduce the above copyright notice,
16  *   this list of conditions and the following disclaimer in the documentation
17  *   and/or other materials provided with the distribution.
18  * - Neither the name of the MontaVista Software, Inc. nor the names of its
19  *   contributors may be used to endorse or promote products derived from this
20  *   software without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
32  * THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include <config.h>
36 
37 #include <unistd.h>
38 #include <fcntl.h>
39 #include <sys/ioctl.h>
40 #include <linux/types.h>
41 #include <linux/watchdog.h>
42 #include <sys/reboot.h>
43 
44 #include <corosync/corotypes.h>
45 #include <corosync/corodefs.h>
46 #include <corosync/coroapi.h>
47 #include <corosync/list.h>
48 #include <corosync/logsys.h>
49 #include <corosync/icmap.h>
50 #include "fsm.h"
51 
52 #include "service.h"
53 
/*
 * Coarse resource states.  The enumerator values are written out so the
 * numbering is explicit.
 */
typedef enum {
	WD_RESOURCE_GOOD = 0,
	WD_RESOURCE_FAILED = 1,
	WD_RESOURCE_STATE_UNKNOWN = 2,
	WD_RESOURCE_NOT_MONITORED = 3
} wd_resource_state_t;
60 
/*
 * Bookkeeping for one monitored resource.  Instances are allocated in
 * wd_resource_create() and released in wd_key_changed().
 */
struct resource {
	/* icmap key prefix of the resource; suffixes such as "state" are appended to it */
	char res_path[ICMAP_KEYNAME_MAXLEN];
	/* string read with icmap_get_string() from the "recovery" key; compared with
	 * "watchdog", "quit", "reboot" and "shutdown" when the resource fails */
	char *recovery;
	/* resource name; also installed as the name of the state machine */
	char name[CS_MAX_NAME_LENGTH];
	/* value of the "last_updated" key at creation time, 0 when the key was absent */
	time_t last_updated;
	/* per-resource state machine, driven by wd_fsm_table */
	struct cs_fsm fsm;

	/* handle of the wd_resource_check_fn timer; reset to 0 whenever the timer is deleted */
	corosync_timer_handle_t check_timer;
	/* poll period in milliseconds */
	uint64_t check_timeout;
	/* handle of the icmap tracker registered on res_path */
	icmap_track_t icmap_track;
};
72 
LOGSYS_DECLARE_SUBSYS("WD");

/*
 * Service Interfaces required by service_message_handler struct
 */
static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api);
static int wd_exec_exit_fn (void);
static void wd_resource_check_fn (void* resource_ref);

/* corosync API table stored by wd_exec_init_fn(); used here for the timer calls */
static struct corosync_api_v1 *api;
/* default watchdog timeout, in seconds and in milliseconds */
#define WD_DEFAULT_TIMEOUT_SEC 6
#define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC)
/* accepted range for a resource poll_period, in milliseconds */
#define WD_MIN_TIMEOUT_MS 500
#define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC)
/* watchdog timeout, in seconds */
static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC;
/* interval between keepalives, in milliseconds (half of the watchdog timeout) */
static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2);
/* file descriptor of the opened watchdog device, -1 while it is not open */
static int dog = -1;
/* handle of the timer that runs wd_tickle_fn */
static corosync_timer_handle_t wd_timer;
/* non-zero while the watchdog may be tickled; cleared by wd_resource_failed() */
static int watchdog_ok = 1;
/* watchdog device path; setup_watchdog() may replace it from "resources.watchdog_device" */
static char *watchdog_device = "/dev/watchdog";
93 
/*
 * Service engine descriptor of the watchdog service.  Only the exec init and
 * exit hooks are populated; every library, exec-message, confchg and dump
 * hook is left NULL.
 */
struct corosync_service_engine wd_service_engine = {
	.name			= "corosync watchdog service",
	.id			= WD_SERVICE,
	.priority		= 1,
	.private_data_size	= 0,
	.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
	.lib_init_fn		= NULL,
	.lib_exit_fn		= NULL,
	.lib_engine		= NULL,
	.lib_engine_count	= 0,
	.exec_engine		= NULL,
	.exec_engine_count	= 0,
	.confchg_fn		= NULL,
	.exec_init_fn		= wd_exec_init_fn,
	.exec_exit_fn		= wd_exec_exit_fn,
	.exec_dump_fn		= NULL
};
111 
/* NOTE(review): confchg_notify is not referenced anywhere else in this file. */
static DECLARE_LIST_INIT (confchg_notify);

/*
 * F S M
 *
 * Every resource owns one state machine (struct resource.fsm).
 */
/* FSM handlers, installed through wd_fsm_table */
static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data);
static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);

/* states of a resource state machine */
enum wd_resource_state {
	WD_S_RUNNING,
	WD_S_FAILED,
	WD_S_STOPPED
};

/* events fed into a resource state machine */
enum wd_resource_event {
	WD_E_FAILURE,
	WD_E_CONFIG_CHANGED
};

/*
 * Printable names returned by wd_res_state_to_str() and wd_res_event_to_str().
 * wd_failed_str and wd_stopped_str are also compared against the value of a
 * resource's "state" key.
 */
const char * wd_running_str		= "running";
const char * wd_failed_str		= "failed";
const char * wd_failure_str		= "failure";
const char * wd_stopped_str		= "stopped";
const char * wd_config_changed_str	= "config_changed";
136 
/*
 * Transition table shared by all resource state machines.
 * Columns: current state, event, handler (NULL: the event has no handler in
 * that state), and a -1 terminated list of states -- presumably the states
 * the handler is allowed to move to; confirm against fsm.h.
 *
 * The row order is significant: wd_resource_create() starts every state
 * machine with curr_entry = 0, i.e. on the first row.
 */
struct cs_fsm_entry wd_fsm_table[] = {
	{ WD_S_STOPPED,	WD_E_CONFIG_CHANGED,	wd_config_changed,	{WD_S_STOPPED, WD_S_RUNNING, -1} },
	{ WD_S_STOPPED,	WD_E_FAILURE,		NULL,			{-1} },
	{ WD_S_RUNNING,	WD_E_CONFIG_CHANGED,	wd_config_changed,	{WD_S_RUNNING, WD_S_STOPPED, -1} },
	{ WD_S_RUNNING,	WD_E_FAILURE,		wd_resource_failed,	{WD_S_FAILED, -1} },
	{ WD_S_FAILED,	WD_E_CONFIG_CHANGED,	wd_config_changed,	{WD_S_RUNNING, WD_S_STOPPED, -1} },
	{ WD_S_FAILED,	WD_E_FAILURE,		NULL,			{-1} },
};
145 
wd_get_service_engine_ver0(void)146 struct corosync_service_engine *wd_get_service_engine_ver0 (void)
147 {
148 	return (&wd_service_engine);
149 }
150 
wd_res_state_to_str(struct cs_fsm * fsm,int32_t state)151 static const char * wd_res_state_to_str(struct cs_fsm* fsm,
152 	int32_t state)
153 {
154 	switch (state) {
155 	case WD_S_STOPPED:
156 		return wd_stopped_str;
157 		break;
158 	case WD_S_RUNNING:
159 		return wd_running_str;
160 		break;
161 	case WD_S_FAILED:
162 		return wd_failed_str;
163 		break;
164 	}
165 	return NULL;
166 }
167 
wd_res_event_to_str(struct cs_fsm * fsm,int32_t event)168 static const char * wd_res_event_to_str(struct cs_fsm* fsm,
169 	int32_t event)
170 {
171 	switch (event) {
172 	case WD_E_CONFIG_CHANGED:
173 		return wd_config_changed_str;
174 		break;
175 	case WD_E_FAILURE:
176 		return wd_failure_str;
177 		break;
178 	}
179 	return NULL;
180 }
181 
wd_fsm_cb(struct cs_fsm * fsm,int cb_event,int32_t curr_state,int32_t next_state,int32_t fsm_event,void * data)182 static void wd_fsm_cb (struct cs_fsm *fsm, int cb_event, int32_t curr_state,
183 	int32_t next_state, int32_t fsm_event, void *data)
184 {
185 	switch (cb_event) {
186 	case CS_FSM_CB_EVENT_PROCESS_NF:
187 		log_printf (LOGSYS_LEVEL_ERROR, "Fsm:%s could not find event \"%s\" in state \"%s\"",
188 			fsm->name, fsm->event_to_str(fsm, fsm_event), fsm->state_to_str(fsm, curr_state));
189 		corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
190 		break;
191 	case CS_FSM_CB_EVENT_STATE_SET:
192 		log_printf (LOGSYS_LEVEL_INFO, "Fsm:%s event \"%s\", state \"%s\" --> \"%s\"",
193 			fsm->name,
194 			fsm->event_to_str(fsm, fsm_event),
195 			fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
196 			fsm->state_to_str(fsm, next_state));
197 		break;
198 	case CS_FSM_CB_EVENT_STATE_SET_NF:
199 		log_printf (LOGSYS_LEVEL_CRIT, "Fsm:%s Can't change state from \"%s\" to \"%s\" (event was \"%s\")",
200 			fsm->name,
201 			fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
202 			fsm->state_to_str(fsm, next_state),
203 			fsm->event_to_str(fsm, fsm_event));
204 	        corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
205 		break;
206 	default:
207 		log_printf (LOGSYS_LEVEL_CRIT, "Fsm: Unknown callback event!");
208 	        corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
209 		break;
210 	}
211 }
212 
213 /*
214  * returns (CS_TRUE == OK, CS_FALSE == failed)
215  */
wd_resource_state_is_ok(struct resource * ref)216 static int32_t wd_resource_state_is_ok (struct resource *ref)
217 {
218 	char* state = NULL;
219 	uint64_t last_updated;
220 	uint64_t my_time;
221 	uint64_t allowed_period;
222 	char key_name[ICMAP_KEYNAME_MAXLEN];
223 
224 	if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "last_updated") >= ICMAP_KEYNAME_MAXLEN) ||
225 		(icmap_get_uint64(key_name, &last_updated) != CS_OK)) {
226 		/* key does not exist.
227 		*/
228 		return CS_FALSE;
229 	}
230 
231 	if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state") >= ICMAP_KEYNAME_MAXLEN) ||
232 		(icmap_get_string(key_name, &state) != CS_OK || strcmp(state, "disabled") == 0)) {
233 		/* key does not exist.
234 		*/
235 		if (state != NULL)
236 			free(state);
237 
238 		return CS_FALSE;
239 	}
240 
241 	if (last_updated == 0) {
242 		/* initial value */
243 		free(state);
244 		return CS_TRUE;
245 	}
246 
247 	my_time = cs_timestamp_get();
248 
249 	/*
250 	 * Here we check that the monitor has written a timestamp within the poll_period
251 	 * plus a grace factor of (0.5 * poll_period).
252 	 */
253 	allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2;
254 	if ((last_updated + allowed_period) < my_time) {
255 		log_printf (LOGSYS_LEVEL_ERROR,
256 			"last_updated %"PRIu64" ms too late, period:%"PRIu64".",
257 			(uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((last_updated + allowed_period) / MILLI_2_NANO_SECONDS)),
258 			ref->check_timeout);
259 		free(state);
260 		return CS_FALSE;
261 	}
262 
263 	if (strcmp (state, wd_failed_str) == 0) {
264 		free(state);
265 		return CS_FALSE;
266 	}
267 
268 	free(state);
269 	return CS_TRUE;
270 }
271 
wd_config_changed(struct cs_fsm * fsm,int32_t event,void * data)272 static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
273 {
274 	char *state;
275 	uint64_t tmp_value;
276 	uint64_t next_timeout;
277 	struct resource *ref = (struct resource*)data;
278 	char key_name[ICMAP_KEYNAME_MAXLEN];
279 
280 	next_timeout = ref->check_timeout;
281 
282 	if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "poll_period") >= ICMAP_KEYNAME_MAXLEN) ||
283 		(icmap_get_uint64(ref->res_path, &tmp_value) == CS_OK)) {
284 		if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
285 			log_printf (LOGSYS_LEVEL_DEBUG,
286 				"poll_period changing from:%"PRIu64" to %"PRIu64".",
287 				ref->check_timeout, tmp_value);
288 			/*
289 			 * To easy in the transition between poll_period's we are going
290 			 * to make the first timeout the bigger of the new and old value.
291 			 * This is to give the monitoring system time to adjust.
292 			 */
293 			next_timeout = CS_MAX(tmp_value, ref->check_timeout);
294 			ref->check_timeout = tmp_value;
295 		} else {
296 			log_printf (LOGSYS_LEVEL_WARNING,
297 				"Could NOT use poll_period:%"PRIu64" ms for resource %s",
298 				tmp_value, ref->name);
299 		}
300 	}
301 
302 	if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "recovery") >= ICMAP_KEYNAME_MAXLEN) ||
303 		(icmap_get_string(key_name, &ref->recovery) != CS_OK)) {
304 		/* key does not exist.
305 		 */
306 		log_printf (LOGSYS_LEVEL_WARNING,
307 			"resource %s missing a recovery key.", ref->name);
308 		cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
309 		return;
310 	}
311 	if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state") >= ICMAP_KEYNAME_MAXLEN) ||
312 		(icmap_get_string(key_name, &state) != CS_OK)) {
313 		/* key does not exist.
314 		*/
315 		log_printf (LOGSYS_LEVEL_WARNING,
316 			"resource %s missing a state key.", ref->name);
317 		cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
318 		return;
319 	}
320 	if (ref->check_timer) {
321 		api->timer_delete(ref->check_timer);
322 		ref->check_timer = 0;
323 	}
324 
325 	if (strcmp(wd_stopped_str, state) == 0) {
326 		cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
327 	} else {
328 		api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS,
329 			ref, wd_resource_check_fn, &ref->check_timer);
330 		cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
331 	}
332 	free(state);
333 }
334 
wd_resource_failed(struct cs_fsm * fsm,int32_t event,void * data)335 static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
336 {
337 	struct resource* ref = (struct resource*)data;
338 
339 	if (ref->check_timer) {
340 		api->timer_delete(ref->check_timer);
341 		ref->check_timer = 0;
342 	}
343 
344 	log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
345 		ref->recovery, (char*)ref->name);
346 	if (strcmp (ref->recovery, "watchdog") == 0 ||
347 	    strcmp (ref->recovery, "quit") == 0) {
348 		watchdog_ok = 0;
349 	}
350 	else if (strcmp (ref->recovery, "reboot") == 0) {
351 		reboot(RB_AUTOBOOT);
352 	}
353 	else if (strcmp (ref->recovery, "shutdown") == 0) {
354 		reboot(RB_POWER_OFF);
355 	}
356 	cs_fsm_state_set(fsm, WD_S_FAILED, data, wd_fsm_cb);
357 }
358 
wd_key_changed(int32_t event,const char * key_name,struct icmap_notify_value new_val,struct icmap_notify_value old_val,void * user_data)359 static void wd_key_changed(
360 	int32_t event,
361 	const char *key_name,
362 	struct icmap_notify_value new_val,
363 	struct icmap_notify_value old_val,
364 	void *user_data)
365 {
366 	struct resource* ref = (struct resource*)user_data;
367 	char *last_key_part;
368 
369 	if (ref == NULL) {
370 		return ;
371 	}
372 
373 	last_key_part = strrchr(key_name, '.');
374 	if (last_key_part == NULL) {
375 		return ;
376 	}
377 	last_key_part++;
378 
379 	if (event == ICMAP_TRACK_ADD || event == ICMAP_TRACK_MODIFY) {
380 		if (strcmp(last_key_part, "last_updated") == 0 ||
381 			strcmp(last_key_part, "current") == 0) {
382 			return;
383 		}
384 
385 		cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref, wd_fsm_cb);
386 	}
387 
388 	if (event == ICMAP_TRACK_DELETE && ref != NULL) {
389 		if (strcmp(last_key_part, "state") != 0) {
390 			return ;
391 		}
392 
393 		log_printf (LOGSYS_LEVEL_WARNING,
394 			"resource \"%s\" deleted from cmap!",
395 			ref->name);
396 
397 		api->timer_delete(ref->check_timer);
398 		ref->check_timer = 0;
399 		icmap_track_delete(ref->icmap_track);
400 
401 		free(ref);
402 	}
403 }
404 
wd_resource_check_fn(void * resource_ref)405 static void wd_resource_check_fn (void* resource_ref)
406 {
407 	struct resource* ref = (struct resource*)resource_ref;
408 
409 	if (wd_resource_state_is_ok (ref) == CS_FALSE) {
410 		cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref, wd_fsm_cb);
411 		return;
412 	}
413 	api->timer_add_duration(ref->check_timeout*MILLI_2_NANO_SECONDS,
414 		ref, wd_resource_check_fn, &ref->check_timer);
415 }
416 
417 /*
418  * return 0   - fully configured
419  * return -1  - partially configured
420  */
wd_resource_create(char * res_path,char * res_name)421 static int32_t wd_resource_create (char *res_path, char *res_name)
422 {
423 	char *state;
424 	uint64_t tmp_value;
425 	struct resource *ref = calloc (1, sizeof (struct resource));
426 	char key_name[ICMAP_KEYNAME_MAXLEN];
427 
428 	strcpy(ref->res_path, res_path);
429 	ref->check_timeout = WD_DEFAULT_TIMEOUT_MS;
430 	ref->check_timer = 0;
431 
432 	strcpy(ref->name, res_name);
433 	ref->fsm.name = ref->name;
434 	ref->fsm.table = wd_fsm_table;
435 	ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
436 	ref->fsm.curr_entry = 0;
437 	ref->fsm.curr_state = WD_S_STOPPED;
438 	ref->fsm.state_to_str = wd_res_state_to_str;
439 	ref->fsm.event_to_str = wd_res_event_to_str;
440 
441 	snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "poll_period");
442 	if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
443 		icmap_set_uint64(key_name, ref->check_timeout);
444 	} else {
445 		if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
446 			ref->check_timeout = tmp_value;
447 		} else {
448 			log_printf (LOGSYS_LEVEL_WARNING,
449 				"Could NOT use poll_period:%"PRIu64" ms for resource %s",
450 				tmp_value, ref->name);
451 		}
452 	}
453 
454 	icmap_track_add(res_path,
455 			ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY | ICMAP_TRACK_DELETE | ICMAP_TRACK_PREFIX,
456 			wd_key_changed,
457 			ref, &ref->icmap_track);
458 
459 	snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "recovery");
460 	if (icmap_get_string(key_name, &ref->recovery) != CS_OK) {
461 		/* key does not exist.
462 		 */
463 		log_printf (LOGSYS_LEVEL_WARNING,
464 			"resource %s missing a recovery key.", ref->name);
465 		return -1;
466 	}
467 	snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "state");
468 	if (icmap_get_string(key_name, &state) != CS_OK) {
469 		/* key does not exist.
470 		*/
471 		log_printf (LOGSYS_LEVEL_WARNING,
472 			"resource %s missing a state key.", ref->name);
473 		return -1;
474 	}
475 
476 	snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "last_updated");
477 	if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
478 		/* key does not exist.
479 		 */
480 		ref->last_updated = 0;
481 	} else {
482 		ref->last_updated = tmp_value;
483 	}
484 
485 	/*
486 	 * delay the first check to give the monitor time to start working.
487 	 */
488 	tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS);
489 	api->timer_add_duration(tmp_value * MILLI_2_NANO_SECONDS,
490 		ref,
491 		wd_resource_check_fn, &ref->check_timer);
492 
493 	cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
494 	return 0;
495 }
496 
497 
wd_tickle_fn(void * arg)498 static void wd_tickle_fn (void* arg)
499 {
500 	ENTER();
501 
502 	if (watchdog_ok) {
503 		if (dog > 0) {
504 			ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
505 		}
506 		api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
507 			wd_tickle_fn, &wd_timer);
508 	}
509 	else {
510 		log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
511 	}
512 
513 }
514 
wd_resource_created_cb(int32_t event,const char * key_name,struct icmap_notify_value new_val,struct icmap_notify_value old_val,void * user_data)515 static void wd_resource_created_cb(
516 	int32_t event,
517 	const char *key_name,
518 	struct icmap_notify_value new_val,
519 	struct icmap_notify_value old_val,
520 	void *user_data)
521 {
522 	char res_name[ICMAP_KEYNAME_MAXLEN];
523 	char res_type[ICMAP_KEYNAME_MAXLEN];
524 	char tmp_key[ICMAP_KEYNAME_MAXLEN];
525 	int res;
526 
527 	if (event != ICMAP_TRACK_ADD) {
528 		return ;
529 	}
530 
531 	res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
532 	if (res != 3) {
533 		return ;
534 	}
535 
536 	if (strcmp(tmp_key, "state") != 0) {
537 		return ;
538 	}
539 
540 	snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
541 	wd_resource_create (tmp_key, res_name);
542 }
543 
wd_scan_resources(void)544 static void wd_scan_resources (void)
545 {
546 	int res_count = 0;
547 	icmap_track_t icmap_track = NULL;
548 	icmap_iter_t iter;
549 	const char *key_name;
550 	int res;
551 	char res_name[ICMAP_KEYNAME_MAXLEN];
552 	char res_type[ICMAP_KEYNAME_MAXLEN];
553 	char tmp_key[ICMAP_KEYNAME_MAXLEN];
554 
555 	ENTER();
556 
557 	iter = icmap_iter_init("resources.");
558 	while ((key_name = icmap_iter_next(iter, NULL, NULL)) != NULL) {
559 		res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
560 		if (res != 3) {
561 			continue ;
562 		}
563 
564 		if (strcmp(tmp_key, "state") != 0) {
565 			continue ;
566 		}
567 
568 		snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
569 		if (wd_resource_create (tmp_key, res_name) == 0) {
570 			res_count++;
571 		}
572 	}
573 	icmap_iter_finalize(iter);
574 
575 	icmap_track_add("resources.process.", ICMAP_TRACK_ADD | ICMAP_TRACK_PREFIX,
576 			wd_resource_created_cb, NULL, &icmap_track);
577 	icmap_track_add("resources.system.", ICMAP_TRACK_ADD | ICMAP_TRACK_PREFIX,
578 			wd_resource_created_cb, NULL, &icmap_track);
579 
580 	if (res_count == 0) {
581 		log_printf (LOGSYS_LEVEL_INFO, "no resources configured.");
582 	}
583 }
584 
585 
watchdog_timeout_apply(uint32_t new)586 static void watchdog_timeout_apply (uint32_t new)
587 {
588 	struct watchdog_info ident;
589 	uint32_t original_timeout = 0;
590 
591 	if (dog > 0) {
592 		ioctl(dog, WDIOC_GETTIMEOUT, &original_timeout);
593 	}
594 
595 	if (new == original_timeout) {
596 		return;
597 	}
598 
599 	watchdog_timeout = new;
600 
601 	if (dog > 0) {
602 		ioctl(dog, WDIOC_GETSUPPORT, &ident);
603 		if (ident.options & WDIOF_SETTIMEOUT) {
604 			/* yay! the dog is trained.
605 			 */
606 			ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout);
607 		}
608 		ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
609 	}
610 
611 	if (watchdog_timeout == new) {
612 		tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2;
613 
614 		/* reset the tickle timer in case it was reduced.
615 		 */
616 		api->timer_delete (wd_timer);
617 		api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
618 			wd_tickle_fn, &wd_timer);
619 
620 		log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds", watchdog_timeout);
621 		log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms", tickle_timeout);
622 	} else {
623 		log_printf (LOGSYS_LEVEL_WARNING,
624 			"Could not change the Watchdog timeout from %d to %d seconds",
625 			original_timeout, new);
626 	}
627 
628 }
629 
setup_watchdog(void)630 static int setup_watchdog(void)
631 {
632 	struct watchdog_info ident;
633 	char *str;
634 
635 	ENTER();
636 
637 	if (icmap_get_string("resources.watchdog_device", &str) == CS_OK) {
638 		if (strcmp (str, "off") == 0) {
639 			log_printf (LOGSYS_LEVEL_WARNING, "Watchdog disabled by configuration");
640 			free(str);
641 			dog = -1;
642 			return -1;
643 		} else {
644 			watchdog_device = str;
645 		}
646 	}
647 
648 	if (access (watchdog_device, W_OK) != 0) {
649 		log_printf (LOGSYS_LEVEL_WARNING, "No watchdog %s, try modprobe <a watchdog>", watchdog_device);
650 		dog = -1;
651 		return -1;
652 	}
653 
654 	/* here goes, lets hope they have "Magic Close"
655 	 */
656 	dog = open(watchdog_device, O_WRONLY);
657 
658 	if (dog == -1) {
659 		log_printf (LOGSYS_LEVEL_WARNING, "Watchdog %s exists but couldn't be opened.", watchdog_device);
660 		dog = -1;
661 		return -1;
662 	}
663 
664 	/* Right we have the dog.
665 	 * Lets see what breed it is.
666 	 */
667 
668 	ioctl(dog, WDIOC_GETSUPPORT, &ident);
669 	log_printf (LOGSYS_LEVEL_INFO, "Watchdog %s is now being tickled by corosync.", watchdog_device);
670 	log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
671 
672 	watchdog_timeout_apply (watchdog_timeout);
673 
674 	ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
675 
676 	return 0;
677 }
678 
wd_top_level_key_changed(int32_t event,const char * key_name,struct icmap_notify_value new_val,struct icmap_notify_value old_val,void * user_data)679 static void wd_top_level_key_changed(
680 	int32_t event,
681 	const char *key_name,
682 	struct icmap_notify_value new_val,
683 	struct icmap_notify_value old_val,
684 	void *user_data)
685 {
686 	uint32_t tmp_value_32;
687 
688 	ENTER();
689 
690 	if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) == CS_OK) {
691 		if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
692 			watchdog_timeout_apply (tmp_value_32);
693 			return;
694 		}
695 	}
696 
697 	log_printf (LOGSYS_LEVEL_WARNING,
698 		"Set watchdog_timeout is out of range (2..120).");
699 	icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
700 }
701 
watchdog_timeout_get_initial(void)702 static void watchdog_timeout_get_initial (void)
703 {
704 	uint32_t tmp_value_32;
705 	icmap_track_t icmap_track = NULL;
706 
707 	ENTER();
708 
709 	if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) != CS_OK) {
710 		watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
711 
712 		icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
713 	}
714 	else {
715 		if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
716 			watchdog_timeout_apply (tmp_value_32);
717 		}
718 		else {
719 			log_printf (LOGSYS_LEVEL_WARNING,
720 				"Set watchdog_timeout is out of range (2..120).");
721 			log_printf (LOGSYS_LEVEL_INFO,
722 				"use default value %d seconds.", WD_DEFAULT_TIMEOUT_SEC);
723 			watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
724 			icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
725 		}
726 	}
727 
728 	icmap_track_add("resources.watchdog_timeout", ICMAP_TRACK_MODIFY,
729 			wd_top_level_key_changed, NULL, &icmap_track);
730 
731 }
732 
wd_exec_init_fn(struct corosync_api_v1 * corosync_api)733 static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api)
734 {
735 
736 	ENTER();
737 
738 	api = corosync_api;
739 
740 	watchdog_timeout_get_initial();
741 
742 	setup_watchdog();
743 
744 	wd_scan_resources();
745 
746 	return NULL;
747 }
748 
wd_exec_exit_fn(void)749 static int wd_exec_exit_fn (void)
750 {
751 	char magic = 'V';
752 	ENTER();
753 
754 	if (dog > 0) {
755 		log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog.");
756 		if (write (dog, &magic, 1) == -1) {
757 		    log_printf (LOGSYS_LEVEL_ERROR, "failed to write %c to dog(%d).", magic, dog);
758 		}
759 	}
760 	return 0;
761 }
762 
763 
764