/*
   monitoring links to all other nodes to detect dead nodes


   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "replace.h"
#include "system/filesys.h"
#include "system/network.h"
#include "system/wait.h"

#include <talloc.h>
#include <tevent.h>

#include "lib/util/debug.h"
#include "lib/util/samba_util.h"
#include "lib/util/util_process.h"

#include "ctdb_private.h"

#include "common/system.h"
#include "common/common.h"
#include "common/logging.h"

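/*
  state for the monitoring loop: next_interval is the delay in seconds
  before the next "monitor" event and is doubled after each run, capped
  at the monitor_interval tunable; event_script_timeouts counts
  consecutive timeouts of the monitor event script
 */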
struct ctdb_monitor_state {
	TALLOC_CTX *monitor_context;
	uint32_t next_interval;
	uint32_t event_script_timeouts;
};

static void ctdb_check_health(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data);

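/*
  run the notification script synchronously - called in a forked child.
  Checks that the script exists and is executable, runs it via system()
  with the event name as argument and returns the script's exit status.
 */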
static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event)
{
	struct stat st;
	int ret;
	char *cmd;

	if (stat(ctdb->notification_script, &st) != 0) {
		DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script));
		return -1;
	}
	if (!(st.st_mode & S_IXUSR)) {
		DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script));
		return -1;
	}

	cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event);
	CTDB_NO_MEMORY(ctdb, cmd);

	ret = system(cmd);
	/* if the system() call was successful, translate ret into the
	   return code from the command
	*/
	if (ret != -1) {
		ret = WEXITSTATUS(ret);
	}
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, ret));
	}

	return ret;
}

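/*
  run the notification script for an event, if one is configured.
  The script is run in a forked child so a slow or hanging script
  cannot block the main daemon.
 */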
void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event)
{
	pid_t child;

	if (ctdb->notification_script == NULL) {
		return;
	}

	child = ctdb_fork(ctdb);
	if (child == (pid_t)-1) {
		DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n"));
		return;
	}
	if (child == 0) {
		int ret;

		prctl_set_comment("ctdb_notification");
		ret = ctdb_run_notification_script_child(ctdb, event);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Notification script failed\n"));
		}
		_exit(0);
	}

	return;
}

/*
  called when a health monitoring event script finishes
 */
static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
{
	struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
	TDB_DATA data;
	struct ctdb_node_flag_change c;
	uint32_t next_interval;
	int ret;
	TDB_DATA rddata;
	struct ctdb_srvid_message rd;
	const char *state_str = NULL;

	c.pnn = ctdb->pnn;
	c.old_flags = node->flags;

	ZERO_STRUCT(rd);
	rd.pnn   = ctdb->pnn;
	rd.srvid = 0;

	rddata.dptr = (uint8_t *)&rd;
	rddata.dsize = sizeof(rd);

	if (status == ECANCELED) {
		DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n"));
		goto after_change_status;
	}

	if (status == ETIMEDOUT) {
		ctdb->monitor->event_script_timeouts++;

		if (ctdb->monitor->event_script_timeouts >=
		    ctdb->tunable.monitor_timeout_count) {
			DEBUG(DEBUG_ERR,
			      ("Maximum monitor timeout count %u reached."
			       " Making node unhealthy\n",
			       ctdb->tunable.monitor_timeout_count));
		} else {
			/* We pretend this is OK. */
			goto after_change_status;
		}
	} else {
		ctdb->monitor->event_script_timeouts = 0;
	}

	if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
		DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
		node->flags |= NODE_FLAGS_UNHEALTHY;
		ctdb->monitor->next_interval = 5;

		ctdb_run_notification_script(ctdb, "unhealthy");
	} else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
		DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
		node->flags &= ~NODE_FLAGS_UNHEALTHY;
		ctdb->monitor->next_interval = 5;

		ctdb_run_notification_script(ctdb, "healthy");
	}

after_change_status:
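	/* Exponential backoff: reschedule the next "monitor" event after
	 * next_interval seconds and double the interval for the following
	 * run, capped at the monitor_interval tunable.  A health transition
	 * above resets the interval to 5 seconds.
	 */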
	next_interval = ctdb->monitor->next_interval;

	ctdb->monitor->next_interval *= 2;
	if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
		ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
	}

	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
			 timeval_current_ofs(next_interval, 0),
			 ctdb_check_health, ctdb);

	if (c.old_flags == node->flags) {
		return;
	}

	c.new_flags = node->flags;

	data.dptr = (uint8_t *)&c;
	data.dsize = sizeof(c);

	/* ask the recovery daemon to push these changes out to all nodes */
	ctdb_daemon_send_message(ctdb, ctdb->pnn,
				 CTDB_SRVID_PUSH_NODE_FLAGS, data);

	if (c.new_flags & NODE_FLAGS_UNHEALTHY) {
		state_str = "UNHEALTHY";
	} else {
		state_str = "HEALTHY";
	}

	/* ask the recmaster to reallocate all addresses */
	DEBUG(DEBUG_ERR,
	      ("Node became %s. Ask recovery master to reallocate IPs\n",
	       state_str));
	ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Failed to send IP takeover run request\n"));
	}
}


static void ctdb_run_startup(struct tevent_context *ev,
			     struct tevent_timer *te,
			     struct timeval t, void *private_data);
/*
  called when the startup event script finishes
 */
static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p)
{
	if (status != 0) {
		DEBUG(DEBUG_ERR,("startup event failed\n"));
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(5, 0),
				 ctdb_run_startup, ctdb);
		return;
	}

	DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
	ctdb->monitor->next_interval = 2;
	ctdb_run_notification_script(ctdb, "startup");

	/* tell all other nodes we've just started up */
	ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED,
				 0, CTDB_CONTROL_STARTUP, 0,
				 CTDB_CTRL_FLAG_NOREPLY,
				 tdb_null, NULL, NULL);

	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
			 timeval_current_ofs(ctdb->monitor->next_interval, 0),
			 ctdb_check_health, ctdb);
}

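/*
  run the "startup" event script once the daemon has reached the
  STARTUP runstate; if we are not there yet, or the script cannot be
  launched, retry via a timer
 */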
static void ctdb_run_startup(struct tevent_context *ev,
			     struct tevent_timer *te,
			     struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data,
						    struct ctdb_context);
	int ret;

	/* This is necessary to avoid the "startup" event colliding
	 * with the "ipreallocated" event from the takeover run
	 * following the first recovery.  We might as well serialise
	 * these things if we can.
	 */
	if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
		DEBUG(DEBUG_NOTICE,
		      ("Not yet in startup runstate. Wait one more second\n"));
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(1, 0),
				 ctdb_run_startup, ctdb);
		return;
	}

	DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
	ret = ctdb_event_script_callback(ctdb,
					 ctdb->monitor->monitor_context,
					 ctdb_startup_callback,
					 ctdb, CTDB_EVENT_STARTUP, "%s", "");

	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(5, 0),
				 ctdb_run_startup, ctdb);
	}
}

/*
  wait until we have finished initial recoveries before we start the
  monitoring events
 */
static void ctdb_wait_until_recovered(struct tevent_context *ev,
				      struct tevent_timer *te,
				      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int ret;
	static int count = 0;

	count++;

	if (count < 60 || count%600 == 0) {
		DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
		if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) {
			DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n"));
		}
	}

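	/* An invalid vnn map generation means the first recovery has not
	 * completed yet - check again in a second.
	 */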
	if (ctdb->vnn_map->generation == INVALID_GENERATION) {
		ctdb->db_persistent_startup_generation = INVALID_GENERATION;

		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(1, 0),
				 ctdb_wait_until_recovered, ctdb);
		return;
	}

	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		ctdb->db_persistent_startup_generation = INVALID_GENERATION;

		DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(1, 0),
				 ctdb_wait_until_recovered, ctdb);
		return;
	}


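	/* Give pending re-recoveries a chance to happen: wait until at
	 * least rerecovery_timeout + 3 seconds have passed since the last
	 * recovery finished (skipped when fast_start is set).
	 */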
	if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
		ctdb->db_persistent_startup_generation = INVALID_GENERATION;

		DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));

		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(1, 0),
				 ctdb_wait_until_recovered, ctdb);
		return;
	}

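	/* Only re-check the health of persistent databases once per
	 * recovery generation.
	 */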
	if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
		DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
				  "until the next recovery\n"));
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(1, 0),
				 ctdb_wait_until_recovered, ctdb);
		return;
	}

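	/* Remember the generation we checked and verify the health of the
	 * persistent databases.  Retry on failure; after too many
	 * consecutive failures, shut the daemon down.
	 */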
	ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
	ret = ctdb_recheck_persistent_health(ctdb);
	if (ret != 0) {
		ctdb->db_persistent_check_errors++;
		if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
			DEBUG(DEBUG_ERR,
			      (__location__ "ctdb_recheck_persistent_health() "
			      "failed (%llu of %llu times) - retry later\n",
			      (unsigned long long)ctdb->db_persistent_check_errors,
			      (unsigned long long)ctdb->max_persistent_check_errors));
			tevent_add_timer(ctdb->ev,
					 ctdb->monitor->monitor_context,
					 timeval_current_ofs(1, 0),
					 ctdb_wait_until_recovered, ctdb);
			return;
		}
		DEBUG(DEBUG_ALERT,(__location__
				  "ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
				  (unsigned long long)ctdb->db_persistent_check_errors));
		ctdb_shutdown_sequence(ctdb, 11);
		/* In case above returns due to duplicate shutdown */
		return;
	}
	ctdb->db_persistent_check_errors = 0;

	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
			 timeval_current(), ctdb_run_startup, ctdb);
}


/*
  see if the event scripts think we are healthy
 */
static void ctdb_check_health(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	bool skip_monitoring = false;
	int ret = 0;

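	/* Skip monitoring while in recovery, while this node is inactive,
	 * before the RUNNING runstate is reached, or while all databases
	 * are frozen - but keep rescheduling the timer so that monitoring
	 * resumes later.
	 */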
	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
	    ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE ||
	    ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
		skip_monitoring = true;
	} else {
		if (ctdb_db_all_frozen(ctdb)) {
			DEBUG(DEBUG_ERR,
			      ("Skip monitoring since databases are frozen\n"));
			skip_monitoring = true;
		}
	}

	if (skip_monitoring) {
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(ctdb->monitor->next_interval, 0),
				 ctdb_check_health, ctdb);
		return;
	}

	ret = ctdb_event_script_callback(ctdb,
					 ctdb->monitor->monitor_context,
					 ctdb_health_callback,
					 ctdb, CTDB_EVENT_MONITOR, "%s", "");
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
		ctdb->monitor->next_interval = 5;
		tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
				 timeval_current_ofs(5, 0),
				 ctdb_check_health, ctdb);
	}
}

/* stop any monitoring
   this should only be done when shutting down the daemon
*/
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
{
	talloc_free(ctdb->monitor->monitor_context);
	ctdb->monitor->monitor_context = NULL;

	ctdb->monitor->next_interval = 5;
	DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n"));
}

/*
  set up the monitor state and wait for the first recovery to complete
  before we start watching for nodes that might be dead
 */
void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
{
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);

	ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);

	ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);

	tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
			 timeval_current_ofs(1, 0),
			 ctdb_wait_until_recovered, ctdb);
}


/*
  modify flags on a node
 */
int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr;
	struct ctdb_node *node;
	uint32_t old_flags;

	if (c->pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes));
		return -1;
	}

	node         = ctdb->nodes[c->pnn];
	old_flags    = node->flags;
	if (c->pnn != ctdb->pnn) {
		c->old_flags  = node->flags;
	}
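	/* The DISCONNECTED flag is never taken from the update; keep
	 * whatever the old flags said, since connectivity is tracked
	 * locally.
	 */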
	node->flags   = c->new_flags & ~NODE_FLAGS_DISCONNECTED;
	node->flags  |= (c->old_flags & NODE_FLAGS_DISCONNECTED);

	/* we don't let other nodes modify our STOPPED status */
	if (c->pnn == ctdb->pnn) {
		node->flags &= ~NODE_FLAGS_STOPPED;
		if (old_flags & NODE_FLAGS_STOPPED) {
			node->flags |= NODE_FLAGS_STOPPED;
		}
	}

	/* we don't let other nodes modify our BANNED status */
	if (c->pnn == ctdb->pnn) {
		node->flags &= ~NODE_FLAGS_BANNED;
		if (old_flags & NODE_FLAGS_BANNED) {
			node->flags |= NODE_FLAGS_BANNED;
		}
	}

	if (node->flags == c->old_flags) {
		DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags));
		return 0;
	}

	DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags));

	if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) {
		DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n",
				  c->pnn));
		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
	}

	/* tell the recovery daemon something has changed */
	c->new_flags = node->flags;
	ctdb_daemon_send_message(ctdb, ctdb->pnn,
				 CTDB_SRVID_SET_NODE_FLAGS, indata);

	return 0;
}