1 /*****************************************************************************\
2 * capmc_resume.c - Power up identified nodes with (optional) features.
3 * Once complete, modify the node's active features as needed.
4 *
5 * Usage: "capmc_resume <hostlist> [features]"
6 *****************************************************************************
7 * Copyright (C) 2016-2017 SchedMD LLC.
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 \*****************************************************************************/
38
39 #if HAVE_CONFIG_H
40 # include "config.h"
41 #endif
42
43 #define _GNU_SOURCE /* For POLLRDHUP */
44
45 #if HAVE_JSON_C_INC
46 # include <json-c/json.h>
47 #elif HAVE_JSON_INC
48 # include <json/json.h>
49 #endif
50
51 #include <poll.h>
52 #include <signal.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56
57 #include "slurm/slurm.h"
58 #include "slurm/slurm_errno.h"
59 #include "src/common/log.h"
60 #include "src/common/macros.h"
61 #include "src/common/parse_config.h"
62 #include "src/common/read_config.h"
63 #include "src/common/slurm_protocol_api.h"
64 #include "src/common/xassert.h"
65 #include "src/common/xmalloc.h"
66 #include "src/common/xstring.h"
67
68 #define MAX_THREADS 256
69
70 /* Maximum poll wait time for child processes, in milliseconds */
71 #define MAX_POLL_WAIT 500
72
73 /* Default and minimum timeout parameters for the capmc command */
74 #define DEFAULT_CAPMC_RETRIES 4
75 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */
76 #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */
77
78 /* Number of times to try performing "node_reinit" operation */
79 #define NODE_REINIT_RETRIES 10
80
81 /* Number of times to try performing node state change operation */
82 #define NODE_STATE_RETRIES 10
83
84 /* Static variables */
85 static char *capmc_path = NULL;
86 static uint32_t capmc_poll_freq = 45;
87 static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
88 static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
89 static char *log_file = NULL;
90 static bitstr_t *node_bitmap = NULL;
91 static char *prog_name = NULL;
92 static char *mcdram_mode = NULL, *numa_mode = NULL;
93 static char *syscfg_path = NULL;
94
95 /* NOTE: Keep this table synchronized with the table in
96 * src/plugins/node_features/knl_cray/node_features_knl_cray.c */
97 static s_p_options_t knl_conf_file_options[] = {
98 {"AllowMCDRAM", S_P_STRING},
99 {"AllowNUMA", S_P_STRING},
100 {"AllowUserBoot", S_P_STRING},
101 {"BootTime", S_P_UINT32},
102 {"CapmcPath", S_P_STRING},
103 {"CapmcPollFreq", S_P_UINT32},
104 {"CapmcRetries", S_P_UINT32},
105 {"CapmcTimeout", S_P_UINT32},
106 {"CnselectPath", S_P_STRING},
107 {"DefaultMCDRAM", S_P_STRING},
108 {"DefaultNUMA", S_P_STRING},
109 {"LogFile", S_P_STRING},
110 {"McPath", S_P_STRING},
111 {"SyscfgPath", S_P_STRING},
112 {"UmeCheckInterval", S_P_UINT32},
113 {NULL}
114 };
115
116 /* Static functions */
117 static s_p_hashtbl_t *_config_make_tbl(char *filename);
118 static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num);
119 static void _read_config(void);
120 static char *_run_script(char **script_argv, int *status);
121 static int _tot_wait(struct timeval *start_time);
122
_config_make_tbl(char * filename)123 static s_p_hashtbl_t *_config_make_tbl(char *filename)
124 {
125 s_p_hashtbl_t *tbl = NULL;
126
127 xassert(filename);
128
129 if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
130 error("%s: s_p_hashtbl_create error: %s", prog_name,
131 slurm_strerror(slurm_get_errno()));
132 return tbl;
133 }
134
135 if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
136 error("%s: s_p_parse_file error: %s", prog_name,
137 slurm_strerror(slurm_get_errno()));
138 s_p_hashtbl_destroy(tbl);
139 tbl = NULL;
140 }
141
142 return tbl;
143 }
144
_read_config(void)145 static void _read_config(void)
146 {
147 char *knl_conf_file;
148 s_p_hashtbl_t *tbl;
149
150 capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
151 knl_conf_file = get_extra_conf_path("knl_cray.conf");
152 if ((tbl = _config_make_tbl(knl_conf_file))) {
153 (void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
154 (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
155 (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
156 (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
157 (void) s_p_get_string(&log_file, "LogFile", tbl);
158 (void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
159 }
160 xfree(knl_conf_file);
161 s_p_hashtbl_destroy(tbl);
162 if (!capmc_path)
163 capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc");
164 capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT);
165 if (!log_file)
166 log_file = slurm_get_job_slurmctld_logfile();
167 }
168
169 /*
170 * Return time in msec since "start time"
171 */
_tot_wait(struct timeval * start_time)172 static int _tot_wait(struct timeval *start_time)
173 {
174 struct timeval end_time;
175 int msec_delay;
176
177 gettimeofday(&end_time, NULL);
178 msec_delay = (end_time.tv_sec - start_time->tv_sec ) * 1000;
179 msec_delay += ((end_time.tv_usec - start_time->tv_usec + 500) / 1000);
180 return msec_delay;
181 }
182
183 /* Run a script and return its stdout plus exit status */
/*
 * Fork and exec the capmc command (capmc_path), capturing its combined
 * stdout/stderr through a pipe.
 * script_argv IN  - NULL-terminated argument vector for execv(); note the
 *                   program actually executed is capmc_path, not argv[0]
 * status      OUT - exit status from waitpid(), or 127 on setup failure
 * RET command output as an xmalloc'ed string; caller must xfree()
 */
static char *_run_script(char **script_argv, int *status)
{
	int cc, i, new_wait, resp_size = 0, resp_offset = 0;
	pid_t cpid;
	char *resp = NULL;
	int pfd[2] = { -1, -1 };

	/* Fail fast if capmc is missing or not executable */
	if (access(capmc_path, R_OK | X_OK) < 0) {
		error("%s: Can not execute: %s", prog_name, capmc_path);
		*status = 127;
		resp = xstrdup("Slurm node_features/knl_cray configuration error");
		return resp;
	}
	if (pipe(pfd) != 0) {
		error("%s: pipe(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		*status = 127;
		resp = xstrdup("System error");
		return resp;
	}

	if ((cpid = fork()) == 0) {
		/* Child: send both stdout and stderr into the pipe, then
		 * close every other inherited descriptor (including the
		 * pipe's read end and the original write end). */
		cc = sysconf(_SC_OPEN_MAX);
		dup2(pfd[1], STDERR_FILENO);
		dup2(pfd[1], STDOUT_FILENO);
		for (i = 0; i < cc; i++) {
			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
				close(i);
		}
		/* New process group so the parent can killpg() the whole
		 * capmc process tree on timeout */
		setpgid(0, 0);
		execv(capmc_path, script_argv);
		error("%s: execv(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		exit(127);
	} else if (cpid < 0) {
		close(pfd[0]);
		close(pfd[1]);
		error("%s: fork(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		*status = 127;
		resp = xstrdup("System error");
		return resp;
	} else {
		/* Parent: read the child's output until EOF, error, or
		 * capmc_timeout msec have elapsed, polling in slices of at
		 * most MAX_POLL_WAIT msec. */
		struct pollfd fds;
		struct timeval tstart;
		resp_size = 1024;
		resp = xmalloc(resp_size);
		close(pfd[1]);
		gettimeofday(&tstart, NULL);
		while (1) {
			fds.fd = pfd[0];
			fds.events = POLLIN | POLLHUP | POLLRDHUP;
			fds.revents = 0;
			new_wait = capmc_timeout - _tot_wait(&tstart);
			if (new_wait <= 0) {
				error("%s: poll() timeout @ %d msec", prog_name,
				      capmc_timeout);
				break;
			}
			new_wait = MIN(new_wait, MAX_POLL_WAIT);
			i = poll(&fds, 1, new_wait);
			if (i == 0) {
				continue;	/* poll slice expired; re-check timeout */
			} else if (i < 0) {
				error("%s: poll(): %s", prog_name,
				      slurm_strerror(slurm_get_errno()));
				break;
			}
			if ((fds.revents & POLLIN) == 0)
				break;	/* HUP/error with no data left */
			i = read(pfd[0], resp + resp_offset,
				 resp_size - resp_offset);
			if (i == 0) {
				break;	/* EOF */
			} else if (i < 0) {
				if (errno == EAGAIN)
					continue;
				error("%s: read(): %s", prog_name,
				      slurm_strerror(slurm_get_errno()));
				break;
			} else {
				resp_offset += i;
				/* Keep >= 1024 bytes of headroom; NOTE(review):
				 * NUL-termination of resp relies on xmalloc/
				 * xrealloc zero-filling new memory — confirm */
				if (resp_offset + 1024 >= resp_size) {
					resp_size *= 2;
					resp = xrealloc(resp, resp_size);
				}
			}
		}
		/* Terminate the whole capmc process group (it may still be
		 * running on timeout), then reap the child */
		killpg(cpid, SIGTERM);
		usleep(10000);
		killpg(cpid, SIGKILL);
		waitpid(cpid, status, 0);
		close(pfd[0]);
	}
	return resp;
}
280
281 /* Convert node name string to equivalent nid string */
/* Convert node name string to equivalent nid string.
 * Example: "nid[00012-00013,00020]" -> "12-13,20"
 * Works by scanning out every decimal number (with ranges joined by '-'),
 * setting the corresponding bits in the file-scope node_bitmap, then
 * formatting that bitmap back into a compact list.
 * RET xmalloc'ed string; caller must xfree().
 * NOTE(review): also populates node_bitmap as a side effect, which
 * _wait_all_nodes_on() later consumes. */
static char *_node_names_2_nid_list(char *node_names)
{
	char *nid_list = NULL;
	int i, last_nid_index = -1;
	bool is_dash = false;

	for (i = 0; node_names[i]; i++) {
		int nid_index = 0;
		/* skip "nid[" */
		if ((node_names[i] < '0') || (node_names[i] > '9'))
			continue;
		/* skip leading zeros */
		while (node_names[i] == '0')
			i++;
		/* a run of zeros may end at '[' (e.g. "nid0[...") */
		if (node_names[i] == '[')
			i++;
		/* accumulate the decimal nid value */
		while ((node_names[i] >= '0') && (node_names[i] <= '9')) {
			nid_index *= 10;
			nid_index += (node_names[i++] - '0');
		}
		/* second half of a "lo-hi" range sets the whole span */
		if (is_dash && (nid_index >= last_nid_index)) {
			bit_nset(node_bitmap, last_nid_index, nid_index);
		} else {
			bit_set(node_bitmap, nid_index);
		}
		/* remember a trailing '-' so the next number closes the
		 * range; the for-loop's i++ then steps past the separator */
		if ((is_dash = (node_names[i] == '-')))
			last_nid_index = nid_index;
		else if (node_names[i] == '\0')
			break;
	}

	/* nid list is never longer than the node name expression */
	i = strlen(node_names) + 1;
	nid_list = xmalloc(i);
	bit_fmt(nid_list, i, node_bitmap);

	return nid_list;
}
319
320 /* Attempt to modify modes and reboot nodes in a single capmc call.
321 * RET 0 on success, -1 on failure */
_update_all_nodes(char * host_list)322 static int _update_all_nodes(char *host_list)
323 {
324 char *argv[10], *nid_list, *resp_msg;
325 int rc = 0, retry, status = 0;
326
327 nid_list = _node_names_2_nid_list(host_list);
328
329 if (mcdram_mode) {
330 /* Update MCDRAM mode.
331 * Example: "capmc set_mcdram_cfg –n 43 –m cache" */
332 argv[0] = "capmc";
333 argv[1] = "set_mcdram_cfg";
334 argv[2] = "-m";
335 argv[3] = mcdram_mode;
336 argv[4] = "-n";
337 argv[5] = nid_list;
338 argv[6] = NULL;
339 for (retry = 0; ; retry++) {
340 resp_msg = _run_script(argv, &status);
341 if ((status == 0) ||
342 (resp_msg && strcasestr(resp_msg, "Success"))) {
343 debug("%s: set_mcdram_cfg sent to %s",
344 prog_name, argv[5]);
345 xfree(resp_msg);
346 break;
347 }
348 error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
349 prog_name, argv[1], argv[2], argv[3],
350 argv[4], argv[5], status, resp_msg);
351 if (resp_msg && strstr(resp_msg, "Could not lookup") &&
352 (retry <= capmc_retries)) {
353 /* State Manager is down. Sleep and retry */
354 sleep(1);
355 xfree(resp_msg);
356 } else {
357 /* Non-recoverable error */
358 rc = -1;
359 xfree(resp_msg);
360 break;
361 }
362 }
363 }
364
365 if (numa_mode && (rc == 0)) {
366 /* Update NUMA mode.
367 * Example: "capmc set_numa_cfg –m a2a –n 43" */
368 argv[0] = "capmc";
369 argv[1] = "set_numa_cfg";
370 argv[2] = "-m";
371 argv[3] = numa_mode;
372 argv[4] = "-n";
373 argv[5] = nid_list;
374 argv[6] = NULL;
375 for (retry = 0; ; retry++) {
376 resp_msg = _run_script(argv, &status);
377 if ((status == 0) ||
378 (resp_msg && strcasestr(resp_msg, "Success"))) {
379 debug("%s: set_numa_cfg sent to %s",
380 prog_name, argv[5]);
381 xfree(resp_msg);
382 break;
383 }
384 error("%s: capmc(%s,%s,%s,%s,%s): %d %s",
385 prog_name, argv[1], argv[2], argv[3],
386 argv[4], argv[5], status, resp_msg);
387 if (resp_msg && strstr(resp_msg, "Could not lookup") &&
388 (retry <= capmc_retries)) {
389 /* State Manager is down. Sleep and retry */
390 sleep(1);
391 xfree(resp_msg);
392 } else {
393 /* Non-recoverable error */
394 rc = -1;
395 xfree(resp_msg);
396 break;
397 }
398 }
399 }
400
401 /* Request node restart.
402 * Example: "capmc node_reinit –n 43" */
403 if (rc == 0) {
404 argv[0] = "capmc";
405 argv[1] = "node_reinit";
406 argv[2] = "-n";
407 argv[3] = nid_list;
408 argv[4] = NULL;
409 // argv[4] = "-r"; /* Future option: Reason */
410 // argv[5] = "Change KNL mode";
411 for (retry = 0; ; retry++) {
412 resp_msg = _run_script(argv, &status);
413 if ((status == 0) ||
414 (resp_msg && strcasestr(resp_msg, "Success"))) {
415 debug("%s: node_reinit sent to %s",
416 prog_name, argv[3]);
417 xfree(resp_msg);
418 break;
419 }
420 error("%s: capmc(%s,%s,%s): %d %s", prog_name,
421 argv[1], argv[2], argv[3], status, resp_msg);
422 if (resp_msg &&
423 (strstr(resp_msg, "Could not lookup") ||
424 strstr(resp_msg, "Internal server error")) &&
425 (retry <= capmc_retries)) {
426 /* State Manager is down. Sleep and retry */
427 sleep(1);
428 xfree(resp_msg);
429 } else {
430 /* Non-recoverable error */
431 rc = -1;
432 xfree(resp_msg);
433 break;
434 }
435 }
436 }
437
438 xfree(nid_list);
439
440 return rc;
441 }
442
_json_parse_nids(json_object * jobj,char * key,int * num)443 static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num)
444 {
445 json_object *j_array = NULL;
446 json_object *j_value = NULL;
447 enum json_type j_type;
448 uint32_t *ents;
449 int i, cnt;
450
451 *num = 0;
452 json_object_object_get_ex(jobj, key, &j_array);
453 if (!j_array) {
454 debug("%s: key=%s not found in nid specification",
455 prog_name, key);
456 return NULL;
457 }
458
459 cnt = json_object_array_length(j_array);
460 ents = xmalloc(sizeof(uint32_t) * cnt);
461 for (i = 0; i < cnt; i++) {
462 j_value = json_object_array_get_idx(j_array, i);
463 j_type = json_object_get_type(j_value);
464 if (j_type != json_type_int) {
465 error("%s: Unable to parse nid specification",
466 prog_name);
467 break;
468 } else {
469 ents[i] = (uint32_t) json_object_get_int64(j_value);
470 *num = i + 1;
471 }
472 }
473 return ents;
474 }
475
476 /* Wait for all identified computed nodes to enter "on" state */
_wait_all_nodes_on(void)477 static void _wait_all_nodes_on(void)
478 {
479 char *argv[10], *resp_msg;
480 int i, nid_cnt = 0, status = 0;
481 json_object *j;
482 uint32_t *nid_array;
483 time_t start_time = time(NULL);
484
485 while ((difftime(time(NULL), start_time) < (30 * 60)) &&
486 (bit_set_count(node_bitmap) > 0)) {
487 sleep(capmc_poll_freq);
488 argv[0] = "capmc";
489 argv[1] = "node_status";
490 argv[2] = NULL;
491 resp_msg = _run_script(argv, &status);
492 if (status != 0) {
493 error("%s: capmc(%s,%s,%s): %d %s", prog_name,
494 argv[1], argv[2], argv[3], status, resp_msg);
495 break;
496 }
497 j = json_tokener_parse(resp_msg);
498 if (j == NULL) {
499 error("%s: json parser failed on %s",
500 prog_name, resp_msg);
501 xfree(resp_msg);
502 break;
503 }
504 xfree(resp_msg);
505 nid_cnt = 0;
506 nid_array = _json_parse_nids(j, "on", &nid_cnt);
507 json_object_put(j); /* Frees json memory */
508 for (i = 0; i < nid_cnt; i++) {
509 bit_clear(node_bitmap, nid_array[i]);
510 }
511 xfree(nid_array);
512 }
513 }
514
main(int argc,char * argv[])515 int main(int argc, char *argv[])
516 {
517 log_options_t log_opts = LOG_OPTS_INITIALIZER;
518 char *features, *save_ptr = NULL, *tok;
519 update_node_msg_t node_msg;
520 int rc = SLURM_SUCCESS;
521
522 xstrfmtcat(prog_name, "%s[%u]", argv[0], (uint32_t) getpid());
523 _read_config();
524 log_opts.stderr_level = LOG_LEVEL_QUIET;
525 log_opts.syslog_level = LOG_LEVEL_QUIET;
526 if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
527 log_opts.logfile_level = LOG_LEVEL_DEBUG;
528 else
529 log_opts.logfile_level = LOG_LEVEL_ERROR;
530 (void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);
531
532 if ((argc < 2) || (argc > 3)) {
533 fprintf(stderr, "Usage: hostlist [mode]\n");
534 error("Usage: hostlist [mode]");
535 exit(2);
536 }
537
538 /* Parse the MCDRAM and NUMA boot options */
539 if (argc == 3) {
540 features = xstrdup(argv[2]);
541 tok = strtok_r(features, ",", &save_ptr);
542 while (tok) {
543 printf("%s\n", tok);
544 if (!strcasecmp(tok, "a2a") ||
545 !strcasecmp(tok, "hemi") ||
546 !strcasecmp(tok, "quad") ||
547 !strcasecmp(tok, "snc2") ||
548 !strcasecmp(tok, "snc4")) {
549 xfree(numa_mode);
550 numa_mode = xstrdup(tok);
551 } else if (!strcasecmp(tok, "cache") ||
552 !strcasecmp(tok, "split") ||
553 !strcasecmp(tok, "equal") ||
554 !strcasecmp(tok, "flat")) {
555 xfree(mcdram_mode);
556 mcdram_mode = xstrdup(tok);
557 }
558 tok = strtok_r(NULL, ",", &save_ptr);
559 }
560 xfree(features);
561 }
562
563 /* Attempt to update modes and restart nodes in a single capmc call */
564 node_bitmap = bit_alloc(100000);
565 if (_update_all_nodes(argv[1]) != 0) {
566 /* Could not reboot nodes.
567 * Requeue the job we were trying to start */
568 uint32_t job_id = 0;
569 char *job_id_str = getenv("SLURM_JOB_ID");
570 if (job_id_str)
571 job_id = strtol(job_id_str, NULL, 10);
572 if (job_id)
573 (void) slurm_requeue(job_id, JOB_RECONFIG_FAIL);
574
575 /* Return the nodes to service */
576 slurm_init_update_node_msg(&node_msg);
577 node_msg.node_names = argv[1];
578 node_msg.node_state = NODE_STATE_POWER_SAVE |
579 NODE_STATE_POWER_UP;
580 rc = slurm_update_node(&node_msg);
581 if (rc != SLURM_SUCCESS) {
582 error("%s: slurm_update_node(\'%s\', \'IDLE\'): %s\n",
583 prog_name, argv[1],
584 slurm_strerror(slurm_get_errno()));
585 }
586
587 exit(1);
588 }
589 xfree(mcdram_mode);
590 xfree(numa_mode);
591
592 if (argc == 3) {
593 slurm_init_update_node_msg(&node_msg);
594 node_msg.node_names = argv[1];
595 node_msg.features_act = argv[2];
596 rc = slurm_update_node(&node_msg);
597 if (rc != SLURM_SUCCESS) {
598 error("%s: slurm_update_node(\'%s\', \'%s\'): %s\n",
599 prog_name, argv[1], argv[2],
600 slurm_strerror(slurm_get_errno()));
601 }
602 }
603
604 /* Wait for all nodes to change state to "on" */
605 _wait_all_nodes_on();
606
607 bit_free(node_bitmap);
608 xfree(prog_name);
609 if (rc == SLURM_SUCCESS)
610 exit(0);
611 exit(1);
612 }
613