1 /*****************************************************************************\
2  *  capmc_resume.c - Power up identified nodes with (optional) features.
3  *  Once complete, modify the node's active features as needed.
4  *
5  *  Usage: "capmc_resume <hostlist> [features]"
6  *****************************************************************************
7  *  Copyright (C) 2016-2017 SchedMD LLC.
8  *
9  *  This file is part of Slurm, a resource management program.
10  *  For details, see <https://slurm.schedmd.com/>.
11  *  Please also read the included file: DISCLAIMER.
12  *
13  *  Slurm is free software; you can redistribute it and/or modify it under
14  *  the terms of the GNU General Public License as published by the Free
15  *  Software Foundation; either version 2 of the License, or (at your option)
16  *  any later version.
17  *
18  *  In addition, as a special exception, the copyright holders give permission
19  *  to link the code of portions of this program with the OpenSSL library under
20  *  certain conditions as described in each individual source file, and
21  *  distribute linked combinations including the two. You must obey the GNU
22  *  General Public License in all respects for all of the code used other than
23  *  OpenSSL. If you modify file(s) with this exception, you may extend this
24  *  exception to your version of the file(s), but you are not obligated to do
25  *  so. If you do not wish to do so, delete this exception statement from your
26  *  version.  If you delete this exception statement from all source files in
27  *  the program, then also delete it here.
28  *
29  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
32  *  details.
33  *
34  *  You should have received a copy of the GNU General Public License along
35  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
36  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
37 \*****************************************************************************/
38 
39 #if HAVE_CONFIG_H
40 #  include "config.h"
41 #endif
42 
43 #define _GNU_SOURCE	/* For POLLRDHUP */
44 
45 #if HAVE_JSON_C_INC
46 #  include <json-c/json.h>
47 #elif HAVE_JSON_INC
48 #  include <json/json.h>
49 #endif
50 
51 #include <poll.h>
52 #include <signal.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 
57 #include "slurm/slurm.h"
58 #include "slurm/slurm_errno.h"
59 #include "src/common/log.h"
60 #include "src/common/macros.h"
61 #include "src/common/parse_config.h"
62 #include "src/common/read_config.h"
63 #include "src/common/slurm_protocol_api.h"
64 #include "src/common/xassert.h"
65 #include "src/common/xmalloc.h"
66 #include "src/common/xstring.h"
67 
68 #define MAX_THREADS 256
69 
70 /* Maximum poll wait time for child processes, in milliseconds */
71 #define MAX_POLL_WAIT 500
72 
73 /* Default and minimum timeout parameters for the capmc command */
74 #define DEFAULT_CAPMC_RETRIES 4
75 #define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
76 #define MIN_CAPMC_TIMEOUT 1000		/* 1 second */
77 
78 /* Number of times to try performing "node_reinit" operation */
79 #define NODE_REINIT_RETRIES 10
80 
81 /* Number of times to try performing node state change operation */
82 #define NODE_STATE_RETRIES 10
83 
84 /* Static variables */
85 static char *capmc_path = NULL;
86 static uint32_t capmc_poll_freq = 45;
87 static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
88 static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
89 static char *log_file = NULL;
90 static bitstr_t *node_bitmap = NULL;
91 static char *prog_name = NULL;
92 static char *mcdram_mode = NULL, *numa_mode = NULL;
93 static char *syscfg_path = NULL;
94 
95 /* NOTE: Keep this table synchronized with the table in
96  * src/plugins/node_features/knl_cray/node_features_knl_cray.c */
97 static s_p_options_t knl_conf_file_options[] = {
98 	{"AllowMCDRAM", S_P_STRING},
99 	{"AllowNUMA", S_P_STRING},
100 	{"AllowUserBoot", S_P_STRING},
101 	{"BootTime", S_P_UINT32},
102 	{"CapmcPath", S_P_STRING},
103 	{"CapmcPollFreq", S_P_UINT32},
104 	{"CapmcRetries", S_P_UINT32},
105 	{"CapmcTimeout", S_P_UINT32},
106 	{"CnselectPath", S_P_STRING},
107 	{"DefaultMCDRAM", S_P_STRING},
108 	{"DefaultNUMA", S_P_STRING},
109 	{"LogFile", S_P_STRING},
110 	{"McPath", S_P_STRING},
111 	{"SyscfgPath", S_P_STRING},
112 	{"UmeCheckInterval", S_P_UINT32},
113 	{NULL}
114 };
115 
116 /* Static functions */
117 static s_p_hashtbl_t *_config_make_tbl(char *filename);
118 static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num);
119 static void _read_config(void);
120 static char *_run_script(char **script_argv, int *status);
121 static int _tot_wait(struct timeval *start_time);
122 
_config_make_tbl(char * filename)123 static s_p_hashtbl_t *_config_make_tbl(char *filename)
124 {
125 	s_p_hashtbl_t *tbl = NULL;
126 
127 	xassert(filename);
128 
129 	if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
130 		error("%s: s_p_hashtbl_create error: %s", prog_name,
131 		      slurm_strerror(slurm_get_errno()));
132 		return tbl;
133 	}
134 
135 	if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
136 		error("%s: s_p_parse_file error: %s", prog_name,
137 		      slurm_strerror(slurm_get_errno()));
138 		s_p_hashtbl_destroy(tbl);
139 		tbl = NULL;
140 	}
141 
142 	return tbl;
143 }
144 
_read_config(void)145 static void _read_config(void)
146 {
147 	char *knl_conf_file;
148 	s_p_hashtbl_t *tbl;
149 
150 	capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
151 	knl_conf_file = get_extra_conf_path("knl_cray.conf");
152 	if ((tbl = _config_make_tbl(knl_conf_file))) {
153 		(void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
154 		(void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
155 		(void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
156 		(void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
157 		(void) s_p_get_string(&log_file, "LogFile", tbl);
158 		(void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
159 	}
160 	xfree(knl_conf_file);
161 	s_p_hashtbl_destroy(tbl);
162 	if (!capmc_path)
163 		capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc");
164 	capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT);
165 	if (!log_file)
166 		log_file = slurm_get_job_slurmctld_logfile();
167 }
168 
/*
 * Compute the wall-clock time elapsed since "start_time".
 *
 * start_time IN - reference time, as previously filled in by gettimeofday()
 * RET elapsed time in milliseconds; the microsecond difference is rounded
 *     by adding 500 before dividing by 1000
 */
static int _tot_wait(struct timeval *start_time)
{
	struct timeval now;
	long whole_msec, frac_msec;

	gettimeofday(&now, NULL);
	whole_msec = (now.tv_sec - start_time->tv_sec) * 1000;
	frac_msec  = (now.tv_usec - start_time->tv_usec + 500) / 1000;

	return (int) (whole_msec + frac_msec);
}
182 
183 /* Run a script and return its stdout plus exit status */
_run_script(char ** script_argv,int * status)184 static char *_run_script(char **script_argv, int *status)
185 {
186 	int cc, i, new_wait, resp_size = 0, resp_offset = 0;
187 	pid_t cpid;
188 	char *resp = NULL;
189 	int pfd[2] = { -1, -1 };
190 
191 	if (access(capmc_path, R_OK | X_OK) < 0) {
192 		error("%s: Can not execute: %s", prog_name, capmc_path);
193 		*status = 127;
194 		resp = xstrdup("Slurm node_features/knl_cray configuration error");
195 		return resp;
196 	}
197 	if (pipe(pfd) != 0) {
198 		error("%s: pipe(): %s", prog_name,
199 		      slurm_strerror(slurm_get_errno()));
200 		*status = 127;
201 		resp = xstrdup("System error");
202 		return resp;
203 	}
204 
205 	if ((cpid = fork()) == 0) {
206 		cc = sysconf(_SC_OPEN_MAX);
207 		dup2(pfd[1], STDERR_FILENO);
208 		dup2(pfd[1], STDOUT_FILENO);
209 		for (i = 0; i < cc; i++) {
210 			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
211 				close(i);
212 		}
213 		setpgid(0, 0);
214 		execv(capmc_path, script_argv);
215 		error("%s: execv(): %s", prog_name,
216 		      slurm_strerror(slurm_get_errno()));
217 		exit(127);
218 	} else if (cpid < 0) {
219 		close(pfd[0]);
220 		close(pfd[1]);
221 		error("%s: fork(): %s", prog_name,
222 		      slurm_strerror(slurm_get_errno()));
223 		*status = 127;
224 		resp = xstrdup("System error");
225 		return resp;
226 	} else {
227 		struct pollfd fds;
228 		struct timeval tstart;
229 		resp_size = 1024;
230 		resp = xmalloc(resp_size);
231 		close(pfd[1]);
232 		gettimeofday(&tstart, NULL);
233 		while (1) {
234 			fds.fd = pfd[0];
235 			fds.events = POLLIN | POLLHUP | POLLRDHUP;
236 			fds.revents = 0;
237 			new_wait = capmc_timeout - _tot_wait(&tstart);
238 			if (new_wait <= 0) {
239 				error("%s: poll() timeout @ %d msec", prog_name,
240 				      capmc_timeout);
241 				break;
242 			}
243 			new_wait = MIN(new_wait, MAX_POLL_WAIT);
244 			i = poll(&fds, 1, new_wait);
245 			if (i == 0) {
246 				continue;
247 			} else if (i < 0) {
248 				error("%s: poll(): %s", prog_name,
249 				      slurm_strerror(slurm_get_errno()));
250 				break;
251 			}
252 			if ((fds.revents & POLLIN) == 0)
253 				break;
254 			i = read(pfd[0], resp + resp_offset,
255 				 resp_size - resp_offset);
256 			if (i == 0) {
257 				break;
258 			} else if (i < 0) {
259 				if (errno == EAGAIN)
260 					continue;
261 				error("%s: read(): %s", prog_name,
262 				      slurm_strerror(slurm_get_errno()));
263 				break;
264 			} else {
265 				resp_offset += i;
266 				if (resp_offset + 1024 >= resp_size) {
267 					resp_size *= 2;
268 					resp = xrealloc(resp, resp_size);
269 				}
270 			}
271 		}
272 		killpg(cpid, SIGTERM);
273 		usleep(10000);
274 		killpg(cpid, SIGKILL);
275 		waitpid(cpid, status, 0);
276 		close(pfd[0]);
277 	}
278 	return resp;
279 }
280 
281 /* Convert node name string to equivalent nid string */
_node_names_2_nid_list(char * node_names)282 static char *_node_names_2_nid_list(char *node_names)
283 {
284 	char *nid_list = NULL;
285 	int i, last_nid_index = -1;
286 	bool is_dash = false;
287 
288 	for (i = 0; node_names[i]; i++) {
289 		int nid_index = 0;
290 		/* skip "nid[" */
291 		if ((node_names[i] < '0') || (node_names[i] > '9'))
292 			continue;
293 		/* skip leading zeros */
294 		while (node_names[i] == '0')
295 			i++;
296 		if (node_names[i] == '[')
297 			i++;
298 		while ((node_names[i] >= '0') && (node_names[i] <= '9')) {
299 			nid_index *= 10;
300 			nid_index += (node_names[i++] - '0');
301 		}
302 		if (is_dash && (nid_index >= last_nid_index)) {
303 			bit_nset(node_bitmap, last_nid_index, nid_index);
304 		} else {
305 			bit_set(node_bitmap, nid_index);
306 		}
307 		if ((is_dash = (node_names[i] == '-')))
308 			last_nid_index = nid_index;
309 		else if (node_names[i] == '\0')
310 			break;
311 	}
312 
313 	i = strlen(node_names) + 1;
314 	nid_list = xmalloc(i);
315 	bit_fmt(nid_list, i, node_bitmap);
316 
317 	return nid_list;
318 }
319 
320 /* Attempt to modify modes and reboot nodes in a single capmc call.
321  * RET 0 on success, -1 on failure */
_update_all_nodes(char * host_list)322 static int _update_all_nodes(char *host_list)
323 {
324 	char *argv[10], *nid_list, *resp_msg;
325 	int rc = 0, retry, status = 0;
326 
327 	nid_list = _node_names_2_nid_list(host_list);
328 
329 	if (mcdram_mode) {
330 		/* Update MCDRAM mode.
331 		* Example: "capmc set_mcdram_cfg –n 43 –m cache" */
332 		argv[0] = "capmc";
333 		argv[1] = "set_mcdram_cfg";
334 		argv[2] = "-m";
335 		argv[3] = mcdram_mode;
336 		argv[4] = "-n";
337 		argv[5] = nid_list;
338 		argv[6] = NULL;
339 		for (retry = 0; ; retry++) {
340 			resp_msg = _run_script(argv, &status);
341 			if ((status == 0) ||
342 			    (resp_msg && strcasestr(resp_msg, "Success"))) {
343 				debug("%s: set_mcdram_cfg sent to %s",
344 				      prog_name, argv[5]);
345 				xfree(resp_msg);
346 				break;
347 			}
348 			error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
349 			      prog_name, argv[1], argv[2], argv[3],
350 			      argv[4], argv[5], status, resp_msg);
351 			if (resp_msg && strstr(resp_msg, "Could not lookup") &&
352 			    (retry <= capmc_retries)) {
353 				/* State Manager is down. Sleep and retry */
354 				sleep(1);
355 				xfree(resp_msg);
356 			} else {
357 				/* Non-recoverable error */
358 				rc = -1;
359 				xfree(resp_msg);
360 				break;
361 			}
362 		}
363 	}
364 
365 	if (numa_mode && (rc == 0)) {
366 		/* Update NUMA mode.
367 		 * Example: "capmc set_numa_cfg –m a2a –n 43" */
368 		argv[0] = "capmc";
369 		argv[1] = "set_numa_cfg";
370 		argv[2] = "-m";
371 		argv[3] = numa_mode;
372 		argv[4] = "-n";
373 		argv[5] = nid_list;
374 		argv[6] = NULL;
375 		for (retry = 0; ; retry++) {
376 			resp_msg = _run_script(argv, &status);
377 			if ((status == 0) ||
378 			    (resp_msg && strcasestr(resp_msg, "Success"))) {
379 				debug("%s: set_numa_cfg sent to %s",
380 				      prog_name, argv[5]);
381 				xfree(resp_msg);
382 				break;
383 			}
384 			error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
385 			      prog_name, argv[1], argv[2], argv[3],
386 			      argv[4], argv[5], status, resp_msg);
387 			if (resp_msg && strstr(resp_msg, "Could not lookup") &&
388 			    (retry <= capmc_retries)) {
389 				/* State Manager is down. Sleep and retry */
390 				sleep(1);
391 				xfree(resp_msg);
392 			} else {
393 				/* Non-recoverable error */
394 				rc = -1;
395 				xfree(resp_msg);
396 				break;
397 			}
398 		}
399 	}
400 
401 	/* Request node restart.
402 	 * Example: "capmc node_reinit –n 43" */
403 	if (rc == 0) {
404 		argv[0] = "capmc";
405 		argv[1] = "node_reinit";
406 		argv[2] = "-n";
407 		argv[3] = nid_list;
408 		argv[4] = NULL;
409 //		argv[4] = "-r";	/* Future option: Reason */
410 //		argv[5] = "Change KNL mode";
411 		for (retry = 0; ; retry++) {
412 			resp_msg = _run_script(argv, &status);
413 			if ((status == 0) ||
414 			    (resp_msg && strcasestr(resp_msg, "Success"))) {
415 				debug("%s: node_reinit sent to %s",
416 				      prog_name, argv[3]);
417 				xfree(resp_msg);
418 				break;
419 			}
420 			error("%s: capmc(%s,%s,%s): %d %s", prog_name,
421 			      argv[1], argv[2], argv[3], status, resp_msg);
422 			if (resp_msg &&
423 			    (strstr(resp_msg, "Could not lookup") ||
424 			     strstr(resp_msg, "Internal server error")) &&
425 			    (retry <= capmc_retries)) {
426 				/* State Manager is down. Sleep and retry */
427 				sleep(1);
428 				xfree(resp_msg);
429 			} else {
430 				/* Non-recoverable error */
431 				rc = -1;
432 				xfree(resp_msg);
433 				break;
434 			}
435 		}
436 	}
437 
438 	xfree(nid_list);
439 
440 	return rc;
441 }
442 
/*
 * Extract the array of node IDs stored under "key" in a parsed JSON object.
 *
 * jobj IN - parsed JSON object
 * key IN - name of the member holding the nid array
 * num OUT - number of valid entries in the returned array
 * RET array of nids (the caller must xfree() it), or NULL if "key" is absent
 *     or its value is not an array
 *
 * Parsing stops at the first element that is not an integer or does not fit
 * in a uint32_t. Entries before that element are still returned and counted
 * in "num".
 */
static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num)
{
	json_object *j_array = NULL;
	json_object *j_value = NULL;
	uint32_t *ents;
	int64_t nid;
	int i, cnt;

	*num = 0;
	json_object_object_get_ex(jobj, key, &j_array);
	if (!j_array) {
		debug("%s: key=%s not found in nid specification",
		      prog_name, key);
		return NULL;
	}
	/* json_object_array_length() is only valid on an array */
	if (json_object_get_type(j_array) != json_type_array) {
		error("%s: key=%s is not an array in nid specification",
		      prog_name, key);
		return NULL;
	}

	cnt = json_object_array_length(j_array);
	ents = xmalloc(sizeof(uint32_t) * cnt);
	for (i = 0; i < cnt; i++) {
		j_value = json_object_array_get_idx(j_array, i);
		if (json_object_get_type(j_value) != json_type_int) {
			error("%s: Unable to parse nid specification",
			      prog_name);
			break;
		}
		nid = json_object_get_int64(j_value);
		if ((nid < 0) || (nid > UINT32_MAX)) {
			/* Would be silently truncated by the cast below */
			error("%s: Unable to parse nid specification",
			      prog_name);
			break;
		}
		ents[i] = (uint32_t) nid;
		*num = i + 1;
	}
	return ents;
}
475 
476 /* Wait for all identified computed nodes to enter "on" state */
_wait_all_nodes_on(void)477 static void _wait_all_nodes_on(void)
478 {
479 	char *argv[10], *resp_msg;
480 	int i, nid_cnt = 0, status = 0;
481 	json_object *j;
482 	uint32_t *nid_array;
483 	time_t start_time = time(NULL);
484 
485 	while ((difftime(time(NULL), start_time) < (30 * 60)) &&
486 	       (bit_set_count(node_bitmap) > 0)) {
487 		sleep(capmc_poll_freq);
488 		argv[0] = "capmc";
489 		argv[1] = "node_status";
490 		argv[2] = NULL;
491 		resp_msg = _run_script(argv, &status);
492 		if (status != 0) {
493 			error("%s: capmc(%s,%s,%s): %d %s", prog_name,
494 				argv[1], argv[2], argv[3], status, resp_msg);
495 			break;
496 		}
497 		j = json_tokener_parse(resp_msg);
498 		if (j == NULL) {
499 			error("%s: json parser failed on %s",
500 			      prog_name, resp_msg);
501 			xfree(resp_msg);
502 			break;
503 		}
504 		xfree(resp_msg);
505 		nid_cnt = 0;
506 		nid_array = _json_parse_nids(j, "on", &nid_cnt);
507 		json_object_put(j);	/* Frees json memory */
508 		for (i = 0; i < nid_cnt; i++) {
509 			bit_clear(node_bitmap, nid_array[i]);
510 		}
511 		xfree(nid_array);
512 	}
513 }
514 
main(int argc,char * argv[])515 int main(int argc, char *argv[])
516 {
517 	log_options_t log_opts = LOG_OPTS_INITIALIZER;
518 	char *features, *save_ptr = NULL, *tok;
519 	update_node_msg_t node_msg;
520 	int rc =  SLURM_SUCCESS;
521 
522 	xstrfmtcat(prog_name, "%s[%u]", argv[0], (uint32_t) getpid());
523 	_read_config();
524 	log_opts.stderr_level = LOG_LEVEL_QUIET;
525 	log_opts.syslog_level = LOG_LEVEL_QUIET;
526 	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
527 		log_opts.logfile_level = LOG_LEVEL_DEBUG;
528 	else
529 		log_opts.logfile_level = LOG_LEVEL_ERROR;
530 	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);
531 
532 	if ((argc < 2) || (argc > 3)) {
533 		fprintf(stderr, "Usage: hostlist [mode]\n");
534 		error("Usage: hostlist [mode]");
535 		exit(2);
536 	}
537 
538 	/* Parse the MCDRAM and NUMA boot options */
539 	if (argc == 3) {
540 		features = xstrdup(argv[2]);
541 		tok = strtok_r(features, ",", &save_ptr);
542 		while (tok) {
543 			printf("%s\n", tok);
544 			if (!strcasecmp(tok, "a2a")  ||
545 			    !strcasecmp(tok, "hemi") ||
546 			    !strcasecmp(tok, "quad") ||
547 			    !strcasecmp(tok, "snc2") ||
548 			    !strcasecmp(tok, "snc4")) {
549 				xfree(numa_mode);
550 				numa_mode = xstrdup(tok);
551 			} else if (!strcasecmp(tok, "cache") ||
552 				   !strcasecmp(tok, "split") ||
553 				   !strcasecmp(tok, "equal") ||
554 				   !strcasecmp(tok, "flat")) {
555 				xfree(mcdram_mode);
556 				mcdram_mode = xstrdup(tok);
557 			}
558 			tok = strtok_r(NULL, ",", &save_ptr);
559 		}
560 		xfree(features);
561 	}
562 
563 	/* Attempt to update modes and restart nodes in a single capmc call */
564 	node_bitmap = bit_alloc(100000);
565 	if (_update_all_nodes(argv[1]) != 0) {
566 		/* Could not reboot nodes.
567 		 * Requeue the job we were trying to start */
568 		uint32_t job_id = 0;
569 		char *job_id_str = getenv("SLURM_JOB_ID");
570 		if (job_id_str)
571 			job_id = strtol(job_id_str, NULL, 10);
572 		if (job_id)
573 			(void) slurm_requeue(job_id, JOB_RECONFIG_FAIL);
574 
575 		/* Return the nodes to service */
576 		slurm_init_update_node_msg(&node_msg);
577 		node_msg.node_names = argv[1];
578 		node_msg.node_state = NODE_STATE_POWER_SAVE |
579 				      NODE_STATE_POWER_UP;
580 		rc = slurm_update_node(&node_msg);
581 		if (rc != SLURM_SUCCESS) {
582 			error("%s: slurm_update_node(\'%s\', \'IDLE\'): %s\n",
583 			      prog_name, argv[1],
584 			      slurm_strerror(slurm_get_errno()));
585 		}
586 
587 		exit(1);
588 	}
589 	xfree(mcdram_mode);
590 	xfree(numa_mode);
591 
592 	if (argc == 3) {
593 		slurm_init_update_node_msg(&node_msg);
594 		node_msg.node_names = argv[1];
595 		node_msg.features_act = argv[2];
596 		rc = slurm_update_node(&node_msg);
597 		if (rc != SLURM_SUCCESS) {
598 			error("%s: slurm_update_node(\'%s\', \'%s\'): %s\n",
599 			      prog_name, argv[1], argv[2],
600 			      slurm_strerror(slurm_get_errno()));
601 		}
602 	}
603 
604 	/* Wait for all nodes to change state to "on" */
605 	_wait_all_nodes_on();
606 
607 	bit_free(node_bitmap);
608 	xfree(prog_name);
609 	if (rc == SLURM_SUCCESS)
610 		exit(0);
611 	exit(1);
612 }
613