1 /*****************************************************************************\
2  *  backup.c - backup slurm controller
3  *****************************************************************************
4  *  Copyright (C) 2002-2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7  *  Written by Morris Jette <jette@llnl.gov>, et. al.
8  *  CODE-OCEC-09-009. All rights reserved.
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 
40 #include "config.h"
41 
42 #include <errno.h>
43 #include <pthread.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <sys/resource.h>
49 #include <sys/stat.h>
50 
51 #include "slurm/slurm_errno.h"
52 
53 #include "src/common/daemonize.h"
54 #include "src/common/log.h"
55 #include "src/common/macros.h"
56 #include "src/common/node_select.h"
57 #include "src/common/slurm_auth.h"
58 #include "src/common/slurm_accounting_storage.h"
59 #include "src/common/switch.h"
60 #include "src/common/xsignal.h"
61 #include "src/common/xstring.h"
62 
63 #include "src/slurmctld/heartbeat.h"
64 #include "src/slurmctld/locks.h"
65 #include "src/slurmctld/proc_req.h"
66 #include "src/slurmctld/read_config.h"
67 #include "src/slurmctld/slurmctld.h"
68 #include "src/slurmctld/trigger_mgr.h"
69 
/* Debug switch; not referenced in the visible part of this file */
#define _DEBUG		0
/*
 * Seconds to sleep after the other controllers have been asked to relinquish
 * control; passed as wait_time to _shutdown_primary_controller() when a
 * REQUEST_TAKEOVER RPC is processed.
 */
#define SHUTDOWN_WAIT	2	/* Time to wait for primary server shutdown */
72 
/* Forward declarations of file-local functions that are used before they are defined */
static int          _background_process_msg(slurm_msg_t * msg);
static void *       _background_rpc_mgr(void *no_data);
static void *       _background_signal_hand(void *no_data);
static void         _backup_reconfig(void);
static int          _shutdown_primary_controller(int wait_time);
static void *       _trigger_slurmctld_event(void *arg);
inline static void  _update_cred_key(void);
80 
/*
 * Argument handed to each _ping_ctld_thread(). It is allocated by
 * ping_controllers() and released (including both strings) by the thread.
 */
typedef struct ping_struct {
	int backup_inx;		/* index of the controller being pinged; also
				 * the slot of ctld_ping[] that is updated */
	char *control_addr;	/* copy of slurmctld_conf.control_addr[inx] */
	char *control_machine;	/* copy of slurmctld_conf.control_machine[inx],
				 * only used in log messages */
	uint32_t slurmctld_port;	/* copy of slurmctld_conf.slurmctld_port */
} ping_struct_t;
87 
/* Result of pinging one controller, filled in by _ping_ctld_thread() */
typedef struct {
	time_t control_time;	/* control_time reported in the controller's
				 * RESPONSE_CONTROL_STATUS; ping_controllers()
				 * treats non-zero as "already primary" */
	bool responding;	/* true once a RESPONSE_CONTROL_STATUS message
				 * was received from the controller */
} ctld_ping_t;
92 
/* Local variables */
/* Per-controller ping results; allocated and freed by each ping_controllers() call */
static ctld_ping_t *	ctld_ping = NULL;
/* Set when SIGABRT is received; run_backup() then calls abort() instead of exit(0) */
static bool		dump_core = false;
/* Time of the last successful ping_controllers() or received REQUEST_CONTROL RPC */
static time_t		last_controller_response;
/* Held by the ping threads while they store their result in ctld_ping[] */
static pthread_mutex_t	ping_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Set by a REQUEST_TAKEOVER RPC; run_backup() then takes control on the first failed ping */
static volatile bool	takeover = false;
/* shutdown_cond/shutdown_mutex coordinate _shutdown_primary_controller() with its worker threads */
static pthread_cond_t	shutdown_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t	shutdown_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Set to SLURM_ERROR by any _shutdown_controller() thread that fails */
static int		shutdown_rc = SLURM_SUCCESS;
/* Number of _shutdown_controller() threads that have not finished yet */
static int		shutdown_thread_cnt = 0;
/* REQUEST_CONTROL RPC timeout in msec; computed on first use, 0 means not yet computed */
static int		shutdown_timeout = 0;
104 
/*
 * Static list of signals to block in this process
 * *Must be zero-terminated*
 *
 * run_backup() blocks these signals with xsignal_block() and
 * _background_signal_hand() waits for them with sigwait().
 */
static int backup_sigarray[] = {
	SIGINT,  SIGTERM, SIGCHLD, SIGUSR1,
	SIGUSR2, SIGTSTP, SIGXCPU, SIGQUIT,
	SIGPIPE, SIGALRM, SIGABRT, SIGHUP, 0
};
114 
115 /*
116  * run_backup - this is the backup controller, it should run in standby
117  *	mode, assuming control when the primary controller stops responding
118  */
run_backup(slurm_trigger_callbacks_t * callbacks)119 void run_backup(slurm_trigger_callbacks_t *callbacks)
120 {
121 	int i;
122 	time_t last_ping = 0;
123 	slurmctld_lock_t config_read_lock = {
124 		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
125 	slurmctld_lock_t config_write_lock = {
126 		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
127 
128 	info("slurmctld running in background mode");
129 	takeover = false;
130 	last_controller_response = time(NULL);
131 
132 	/* default: don't resume if shutdown */
133 	slurmctld_config.resume_backup = false;
134 
135 	/* It is now ok to tell the primary I am done (if I ever had control) */
136 	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
137 	slurm_cond_broadcast(&slurmctld_config.backup_finish_cond);
138 	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
139 
140 	if (xsignal_block(backup_sigarray) < 0)
141 		error("Unable to block signals");
142 
143 	/*
144 	 * create attached thread to process RPCs
145 	 */
146 	slurm_thread_create(&slurmctld_config.thread_id_rpc,
147 			    _background_rpc_mgr, NULL);
148 
149 	/*
150 	 * create attached thread for signal handling
151 	 */
152 	slurm_thread_create(&slurmctld_config.thread_id_sig,
153 			    _background_signal_hand, NULL);
154 
155 	slurm_thread_create_detached(NULL, _trigger_slurmctld_event, NULL);
156 
157 	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
158 		sleep(1);       /* Give the primary slurmctld set-up time */
159 	}
160 
161 	/* repeatedly ping ControlMachine */
162 	while (slurmctld_config.shutdown_time == 0) {
163 		sleep(1);
164 		/* Lock of slurmctld_conf below not important */
165 		if (slurmctld_conf.slurmctld_timeout &&
166 		    (takeover == false) &&
167 		    ((time(NULL) - last_ping) <
168 		     (slurmctld_conf.slurmctld_timeout / 3)))
169 			continue;
170 
171 		last_ping = time(NULL);
172 		if (ping_controllers(false) == SLURM_SUCCESS)
173 			last_controller_response = time(NULL);
174 		else if (takeover) {
175 			/*
176 			 * in takeover mode, take control as soon as
177 			 * primary no longer respond
178 			 */
179 			break;
180 		} else {
181 			time_t use_time, last_heartbeat;
182 			int server_inx = -1;
183 			last_heartbeat = get_last_heartbeat(&server_inx);
184 			debug("%s: last_heartbeat %ld from server %d",
185 			      __func__, last_heartbeat, server_inx);
186 
187 			use_time = last_controller_response;
188 			if (server_inx > backup_inx) {
189 				info("Lower priority slurmctld is currently primary (%d > %d)",
190 				     server_inx, backup_inx);
191 			} else if (last_heartbeat > last_controller_response) {
192 				/* Race condition for time stamps */
193 				debug("Last message to the controller was at %ld,"
194 				      " but the last heartbeat was written at %ld,"
195 				      " trusting the filesystem instead of the network"
196 				      " and not asserting control at this time.",
197 				      last_controller_response, last_heartbeat);
198 				use_time = last_heartbeat;
199 			}
200 
201 			if ((time(NULL) - use_time) >
202 			    slurmctld_conf.slurmctld_timeout)
203 				break;
204 		}
205 	}
206 
207 	if (slurmctld_config.shutdown_time != 0) {
208 		/*
209 		 * Since pidfile is created as user root (its owner is
210 		 *   changed to SlurmUser) SlurmUser may not be able to
211 		 *   remove it, so this is not necessarily an error.
212 		 * No longer need slurmctld_conf lock after above join.
213 		 */
214 		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
215 			verbose("Unable to remove pidfile '%s': %m",
216 				slurmctld_conf.slurmctld_pidfile);
217 
218 		info("BackupController terminating");
219 		pthread_join(slurmctld_config.thread_id_sig, NULL);
220 		log_fini();
221 		if (dump_core)
222 			abort();
223 		else
224 			exit(0);
225 	}
226 
227 	lock_slurmctld(config_read_lock);
228 	error("ControlMachine %s not responding, BackupController%d %s taking over",
229 	      slurmctld_conf.control_machine[0], backup_inx,
230 	      slurmctld_config.node_name_short);
231 	unlock_slurmctld(config_read_lock);
232 
233 	backup_slurmctld_restart();
234 	trigger_primary_ctld_fail();
235 	trigger_backup_ctld_as_ctrl();
236 
237 	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
238 	pthread_join(slurmctld_config.thread_id_sig, NULL);
239 	pthread_join(slurmctld_config.thread_id_rpc, NULL);
240 
241 	/*
242 	 * The job list needs to be freed before we run
243 	 * ctld_assoc_mgr_init, it should be empty here in the first place.
244 	 */
245 	lock_slurmctld(config_write_lock);
246 	job_fini();
247 	init_job_conf();
248 	unlock_slurmctld(config_write_lock);
249 
250 	ctld_assoc_mgr_init(callbacks);
251 
252 	/* clear old state and read new state */
253 	lock_slurmctld(config_write_lock);
254 	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
255 		error("failed to restore switch state");
256 		abort();
257 	}
258 	if (read_slurm_conf(2, false)) {	/* Recover all state */
259 		error("Unable to recover slurm state");
260 		abort();
261 	}
262 	slurmctld_config.shutdown_time = (time_t) 0;
263 	unlock_slurmctld(config_write_lock);
264 	select_g_select_nodeinfo_set_all();
265 
266 	return;
267 }
268 
269 /*
270  * _background_signal_hand - Process daemon-wide signals for the
271  *	backup controller
272  */
_background_signal_hand(void * no_data)273 static void *_background_signal_hand(void *no_data)
274 {
275 	int sig, rc;
276 	sigset_t set;
277 	/* Locks: Write configuration, job, node, and partition */
278 	slurmctld_lock_t config_write_lock = {
279 		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
280 
281 	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
282 	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
283 
284 	while (slurmctld_config.shutdown_time == 0) {
285 		xsignal_sigset_create(backup_sigarray, &set);
286 		rc = sigwait(&set, &sig);
287 		if (rc == EINTR)
288 			continue;
289 		switch (sig) {
290 		case SIGINT:	/* kill -2  or <CTRL-C> */
291 		case SIGTERM:	/* kill -15 */
292 			info("Terminate signal (SIGINT or SIGTERM) received");
293 			slurmctld_config.shutdown_time = time(NULL);
294 			slurmctld_shutdown();
295 			return NULL;	/* Normal termination */
296 			break;
297 		case SIGHUP:    /* kill -1 */
298 			info("Reconfigure signal (SIGHUP) received");
299 			/*
300 			 * XXX - need to shut down the scheduler
301 			 * plugin, re-read the configuration, and then
302 			 * restart the (possibly new) plugin.
303 			 */
304 			lock_slurmctld(config_write_lock);
305 			_backup_reconfig();
306 			/* Leave config lock set through this */
307 			_update_cred_key();
308 			unlock_slurmctld(config_write_lock);
309 			break;
310 		case SIGABRT:   /* abort */
311 			info("SIGABRT received");
312 			slurmctld_config.shutdown_time = time(NULL);
313 			slurmctld_shutdown();
314 			dump_core = true;
315 			return NULL;    /* Normal termination */
316 			break;
317 		case SIGUSR2:
318 			info("Logrotate signal (SIGUSR2) received");
319 			update_logging();
320 			break;
321 		default:
322 			error("Invalid signal (%d) received", sig);
323 		}
324 	}
325 	return NULL;
326 }
327 
328 /*
329  * Reset the job credential key based upon configuration parameters.
330  * slurmctld_conf is locked on entry.
331  */
_update_cred_key(void)332 static void _update_cred_key(void)
333 {
334 	slurm_cred_ctx_key_update(slurmctld_config.cred_ctx,
335 			slurmctld_conf.job_credential_private_key);
336 }
337 
/*
 * Deliberately empty signal handler. _background_rpc_mgr() installs it for
 * SIGUSR1 so that the signal interrupts its blocking accept call without
 * doing anything else.
 */
static void _sig_handler(int signal)
{
	(void) signal;	/* nothing to do, delivery alone is the point */
}
341 
342 /*
343  * _background_rpc_mgr - Read and process incoming RPCs to the background
344  *	controller (that's us)
345  */
_background_rpc_mgr(void * no_data)346 static void *_background_rpc_mgr(void *no_data)
347 {
348 	int newsockfd, sockfd;
349 	slurm_addr_t cli_addr;
350 	slurm_msg_t msg;
351 	int error_code;
352 
353 	/* Read configuration only */
354 	slurmctld_lock_t config_read_lock = {
355 		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
356 	int sigarray[] = {SIGUSR1, 0};
357 
358 	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
359 	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
360 	debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid());
361 
362 	/* initialize port for RPCs */
363 	lock_slurmctld(config_read_lock);
364 
365 	if ((sockfd = slurm_init_msg_engine_port(slurmctld_conf.slurmctld_port))
366 	    == SLURM_ERROR)
367 		fatal("slurm_init_msg_engine_port error %m");
368 	unlock_slurmctld(config_read_lock);
369 
370 	/*
371 	 * Prepare to catch SIGUSR1 to interrupt accept().  This signal is
372 	 * generated by the slurmctld signal handler thread upon receipt of
373 	 * SIGABRT, SIGINT, or SIGTERM. That thread does all processing of
374 	 * all signals.
375 	 */
376 	xsignal(SIGUSR1, _sig_handler);
377 	xsignal_unblock(sigarray);
378 
379 	/*
380 	 * Process incoming RPCs indefinitely
381 	 */
382 	while (slurmctld_config.shutdown_time == 0) {
383 		/*
384 		 * accept needed for stream implementation is a no-op in
385 		 * message implementation that just passes sockfd to newsockfd
386 		 */
387 		if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr))
388 		    == SLURM_ERROR) {
389 			if (errno != EINTR)
390 				error("slurm_accept_msg_conn: %m");
391 			continue;
392 		}
393 
394 		slurm_msg_t_init(&msg);
395 		if (slurm_receive_msg(newsockfd, &msg, 0) != 0)
396 			error("slurm_receive_msg: %m");
397 
398 		error_code = _background_process_msg(&msg);
399 		if ((error_code == SLURM_SUCCESS)			&&
400 		    (msg.msg_type == REQUEST_SHUTDOWN_IMMEDIATE)	&&
401 		    (slurmctld_config.shutdown_time == 0))
402 			slurmctld_config.shutdown_time = time(NULL);
403 
404 		slurm_free_msg_members(&msg);
405 
406 		close(newsockfd);	/* close new socket */
407 	}
408 
409 	debug3("_background_rpc_mgr shutting down");
410 	close(sockfd);	/* close the main socket */
411 	pthread_exit((void *) 0);
412 	return NULL;
413 }
414 
415 /*
416  * _background_process_msg - process an RPC to the backup_controller
417  */
_background_process_msg(slurm_msg_t * msg)418 static int _background_process_msg(slurm_msg_t *msg)
419 {
420 	int error_code = SLURM_SUCCESS;
421 	bool send_rc = true;
422 
423 	if (msg->msg_type != REQUEST_PING) {
424 		bool super_user = false;
425 		uid_t uid = g_slurm_auth_get_uid(msg->auth_cred);
426 
427 		if (validate_slurm_user(uid))
428 			super_user = true;
429 
430 		if (super_user &&
431 		    (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE)) {
432 			info("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE");
433 			send_rc = false;
434 		} else if (super_user &&
435 			   (msg->msg_type == REQUEST_SHUTDOWN)) {
436 			info("Performing RPC: REQUEST_SHUTDOWN");
437 			pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
438 		} else if (super_user &&
439 			   (msg->msg_type == REQUEST_TAKEOVER)) {
440 			info("Performing RPC: REQUEST_TAKEOVER");
441 			(void) _shutdown_primary_controller(SHUTDOWN_WAIT);
442 			takeover = true;
443 			error_code = SLURM_SUCCESS;
444 		} else if (super_user &&
445 			   (msg->msg_type == REQUEST_CONTROL)) {
446 			debug3("Ignoring RPC: REQUEST_CONTROL");
447 			error_code = ESLURM_DISABLED;
448 			last_controller_response = time(NULL);
449 		} else if (msg->msg_type == REQUEST_CONTROL_STATUS) {
450 			slurm_rpc_control_status(msg, 0);
451 			send_rc = false;
452 		} else {
453 			error("Invalid RPC received %d while in standby mode",
454 			      msg->msg_type);
455 			error_code = ESLURM_IN_STANDBY_MODE;
456 		}
457 	}
458 	if (send_rc)
459 		slurm_send_rc_msg(msg, error_code);
460 	return error_code;
461 }
462 
_ping_ctld_thread(void * arg)463 static void *_ping_ctld_thread(void *arg)
464 {
465 	ping_struct_t *ping = (ping_struct_t *) arg;
466 	slurm_msg_t req, resp;
467 	control_status_msg_t *control_msg;
468 	time_t control_time = (time_t) 0;
469 	bool responding = false;
470 
471 	slurm_msg_t_init(&req);
472 	slurm_set_addr(&req.address, ping->slurmctld_port, ping->control_addr);
473 	req.msg_type = REQUEST_CONTROL_STATUS;
474 	if (slurm_send_recv_node_msg(&req, &resp, 0) == SLURM_SUCCESS) {
475 		switch (resp.msg_type) {
476 		case RESPONSE_CONTROL_STATUS:
477 			control_msg = (control_status_msg_t *) resp.data;
478 			if (ping->backup_inx != control_msg->backup_inx) {
479 				error("%s: BackupController# index mismatch (%d != %u) from host %s",
480 				      __func__, ping->backup_inx,
481 				      control_msg->backup_inx,
482 				      ping->control_machine);
483 			}
484 			control_time  = control_msg->control_time;
485 			responding = true;
486 			break;
487 		default:
488 			error("%s:, Unknown response message %u from host %s",
489 			      __func__, resp.msg_type, ping->control_machine);
490 			break;
491 		}
492 		slurm_free_msg_data(resp.msg_type, resp.data);
493 		if (resp.auth_cred)
494 			g_slurm_auth_destroy(resp.auth_cred);
495 	}
496 
497 	slurm_mutex_lock(&ping_mutex);
498 	if (responding) {
499 		ctld_ping[ping->backup_inx].control_time = control_time;
500 		ctld_ping[ping->backup_inx].responding = true;
501 	}
502 	slurm_mutex_unlock(&ping_mutex);
503 
504 	xfree(ping->control_addr);
505 	xfree(ping->control_machine);
506 	xfree(ping);
507 
508 	return NULL;
509 }
510 
511 /*
512  * Ping all higher-priority control nodes.
513  * RET SLURM_SUCCESS if a currently active controller is found
514  */
ping_controllers(bool active_controller)515 extern int ping_controllers(bool active_controller)
516 {
517 	int i, ping_target_cnt;
518 	ping_struct_t *ping;
519 	pthread_t *ping_tids;
520 	/* Locks: Read configuration */
521 	slurmctld_lock_t config_read_lock = {
522 		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
523 	bool active_ctld = false, avail_ctld = false;
524 
525 	if (active_controller)
526 		ping_target_cnt = slurmctld_conf.control_cnt;
527 	else
528 		ping_target_cnt = backup_inx;
529 
530 	ctld_ping = xcalloc(ping_target_cnt, sizeof(ctld_ping_t));
531 	ping_tids = xcalloc(ping_target_cnt, sizeof(pthread_t));
532 
533 	for (i = 0; i < ping_target_cnt; i++) {
534 		ctld_ping[i].control_time  = (time_t) 0;
535 		ctld_ping[i].responding = false;
536 	}
537 
538 	lock_slurmctld(config_read_lock);
539 	for (i = 0; i < ping_target_cnt; i++) {
540 		if (i == backup_inx)	/* Avoid pinging ourselves */
541 			continue;
542 
543 		ping = xmalloc(sizeof(ping_struct_t));
544 		ping->backup_inx      = i;
545 		ping->control_addr    = xstrdup(slurmctld_conf.control_addr[i]);
546 		ping->control_machine = xstrdup(slurmctld_conf.control_machine[i]);
547 		ping->slurmctld_port  = slurmctld_conf.slurmctld_port;
548 		slurm_thread_create(&ping_tids[i], _ping_ctld_thread, ping);
549 	}
550 	unlock_slurmctld(config_read_lock);
551 
552 	for (i = 0; i < ping_target_cnt; i++) {
553 		if (i == backup_inx)	/* Avoid pinging ourselves */
554 			continue;
555 		pthread_join(ping_tids[i], NULL);
556 	}
557 	xfree(ping_tids);
558 
559 	for (i = 0; i < ping_target_cnt; i++) {
560 		if (i == backup_inx)	/* Avoid pinging ourselves */
561 			continue;
562 		if (ctld_ping[i].control_time) {
563 			/*
564 			 * Higher priority slurmctld is already in
565 			 * primary mode
566 			 */
567 			active_ctld = true;
568 		}
569 		if (ctld_ping[i].responding) {
570 			/*
571 			 * Higher priority slurmctld is available to
572 			 * enter primary mode
573 			 */
574 			avail_ctld = true;
575 		} else if (active_controller) {
576 			trigger_backup_ctld_fail(i);
577 		}
578 	}
579 
580 	xfree(ctld_ping);
581 	if (active_ctld || avail_ctld)
582 		return SLURM_SUCCESS;
583 	return SLURM_ERROR;
584 }
585 
586 /*
587  * Reload the slurm.conf parameters without any processing
588  * of the node, partition, or state information.
589  * Specifically, we don't want to purge batch scripts based
590  * upon old job state information.
591  * This is a stripped down version of read_slurm_conf(0).
592  */
_backup_reconfig(void)593 static void _backup_reconfig(void)
594 {
595 	slurm_conf_reinit(NULL);
596 	update_logging();
597 	slurmctld_conf.last_update = time(NULL);
598 	return;
599 }
600 
_shutdown_controller(void * arg)601 static void *_shutdown_controller(void *arg)
602 {
603 	int shutdown_inx, rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
604 	slurm_msg_t req;
605 
606 	shutdown_inx = *((int *) arg);
607 	xfree(arg);
608 
609 	slurm_msg_t_init(&req);
610 	slurm_set_addr(&req.address, slurmctld_conf.slurmctld_port,
611 		       slurmctld_conf.control_addr[shutdown_inx]);
612 	req.msg_type = REQUEST_CONTROL;
613 	if (slurm_send_recv_rc_msg_only_one(&req, &rc2, shutdown_timeout) < 0) {
614 		error("%s: send/recv(%s): %m", __func__,
615 		      slurmctld_conf.control_machine[shutdown_inx]);
616 		rc = SLURM_ERROR;
617 	} else if (rc2 == ESLURM_DISABLED) {
618 		debug("primary controller responding");
619 	} else if (rc2 == SLURM_SUCCESS) {
620 		debug("primary controller has relinquished control");
621 	} else {
622 		error("%s(%s): %s", __func__,
623 		      slurmctld_conf.control_machine[shutdown_inx],
624 		      slurm_strerror(rc2));
625 		rc = SLURM_ERROR;
626 	}
627 
628 	slurm_mutex_lock(&shutdown_mutex);
629 	if (rc != SLURM_SUCCESS)
630 		shutdown_rc = rc;
631 	shutdown_thread_cnt--;
632 	slurm_cond_signal(&shutdown_cond);
633 	slurm_mutex_unlock(&shutdown_mutex);
634 	return NULL;
635 }
636 
637 /*
638  * Tell the primary controller and all other possible controller daemons to
639  *	relinquish control, primary control_machine has to suspend operation
640  * Based on _shutdown_backup_controller from controller.c
641  * wait_time - How long to wait for primary controller to write state, seconds.
642  * RET 0 or an error code
643  * NOTE: READ lock_slurmctld config before entry (or be single-threaded)
644  */
_shutdown_primary_controller(int wait_time)645 static int _shutdown_primary_controller(int wait_time)
646 {
647 	int i, *arg;
648 
649 	if (shutdown_timeout == 0) {
650 		shutdown_timeout = slurm_get_msg_timeout() / 2;
651 		shutdown_timeout = MAX(shutdown_timeout, 2);	/* 2 sec min */
652 		shutdown_timeout = MIN(shutdown_timeout, CONTROL_TIMEOUT);
653 		shutdown_timeout *= 1000;	/* sec to msec */
654 	}
655 
656 	if ((slurmctld_conf.control_addr[0] == NULL) ||
657 	    (slurmctld_conf.control_addr[0][0] == '\0')) {
658 		error("%s: no primary controller to shutdown", __func__);
659 		return SLURM_ERROR;
660 	}
661 
662 	shutdown_rc = SLURM_SUCCESS;
663 	for (i = 0; i < slurmctld_conf.control_cnt; i++) {
664 		if (i == backup_inx)
665 			continue;	/* No message to self */
666 
667 		arg = xmalloc(sizeof(int));
668 		*arg = i;
669 		slurm_thread_create_detached(NULL, _shutdown_controller, arg);
670 		slurm_mutex_lock(&shutdown_mutex);
671 		shutdown_thread_cnt++;
672 		slurm_mutex_unlock(&shutdown_mutex);
673 	}
674 
675 	slurm_mutex_lock(&shutdown_mutex);
676 	while (shutdown_thread_cnt != 0) {
677 		slurm_cond_wait(&shutdown_cond, &shutdown_mutex);
678 	}
679 	slurm_mutex_unlock(&shutdown_mutex);
680 
681 	/*
682 	 * FIXME: Ideally the REQUEST_CONTROL RPC does not return until all
683 	 * other activity has ceased and the state has been saved. That is
684 	 * not presently the case (it returns when no other work is pending,
685 	 * so the state save should occur right away). We sleep for a while
686 	 * here and give the primary controller time to shutdown
687 	 */
688 	if (wait_time)
689 		sleep(wait_time);
690 
691 	return shutdown_rc;
692 }
693 
_trigger_slurmctld_event(void * arg)694 static void *_trigger_slurmctld_event(void *arg)
695 {
696 	trigger_info_t ti;
697 
698 	memset(&ti, 0, sizeof(ti));
699 	ti.res_id = "*";
700 	ti.res_type = TRIGGER_RES_TYPE_SLURMCTLD;
701 	ti.trig_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
702 	ti.control_inx = backup_inx;
703 	if (slurm_pull_trigger(&ti)) {
704 		error("%s: TRIGGER_TYPE_BU_CTLD_RES_OP send failure: %m",
705 		      __func__);
706 	} else {
707 		verbose("%s: TRIGGER_TYPE_BU_CTLD_RES_OP sent", __func__);
708 	}
709 	return NULL;
710 }
711