1 /*****************************************************************************\
2 * capmc_suspend.c - Power down identified nodes
3 *
4 * Usage: "capmc_suspend <hostlist>"
5 *****************************************************************************
6 * Copyright (C) 2016-2017 SchedMD LLC.
7 *
8 * This file is part of Slurm, a resource management program.
9 * For details, see <https://slurm.schedmd.com/>.
10 * Please also read the included file: DISCLAIMER.
11 *
12 * Slurm is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 * In addition, as a special exception, the copyright holders give permission
18 * to link the code of portions of this program with the OpenSSL library under
19 * certain conditions as described in each individual source file, and
20 * distribute linked combinations including the two. You must obey the GNU
21 * General Public License in all respects for all of the code used other than
22 * OpenSSL. If you modify file(s) with this exception, you may extend this
23 * exception to your version of the file(s), but you are not obligated to do
24 * so. If you do not wish to do so, delete this exception statement from your
25 * version. If you delete this exception statement from all source files in
26 * the program, then also delete it here.
27 *
28 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
29 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
31 * details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with Slurm; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 \*****************************************************************************/
37
38 #if HAVE_CONFIG_H
39 # include "config.h"
40 #endif
41
42 #define _GNU_SOURCE /* For POLLRDHUP */
43
44 #if HAVE_JSON_C_INC
45 # include <json-c/json.h>
46 #elif HAVE_JSON_INC
47 # include <json/json.h>
48 #endif
49
#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
55
56 #include "slurm/slurm.h"
57 #include "slurm/slurm_errno.h"
58 #include "src/common/hostlist.h"
59 #include "src/common/log.h"
60 #include "src/common/macros.h"
61 #include "src/common/parse_config.h"
62 #include "src/common/read_config.h"
63 #include "src/common/slurm_protocol_api.h"
64 #include "src/common/xassert.h"
65 #include "src/common/xmalloc.h"
66 #include "src/common/xstring.h"
67
68 #define MAX_THREADS 256
69
70 /* Maximum poll wait time for child processes, in milliseconds */
71 #define MAX_POLL_WAIT 500
72
73 /* Default and minimum timeout parameters for the capmc command */
74 #define DEFAULT_CAPMC_RETRIES 4
75 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */
76 #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */
77
78 /* Number of times to try performing "node_off" operation */
79 #define NODE_OFF_RETRIES 10
80
81 /* How long to wait for a node to enter "off" state, in seconds */
82 #define NODE_OFF_STATE_WAIT (30 * 60)
83
84 /* Static variables */
85 static char *capmc_path = NULL;
86 static uint32_t capmc_poll_freq = 45; /* capmc state polling frequency */
87 static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
88 static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
89 static char *log_file = NULL;
90 static char *prog_name = NULL;
91
92 /* NOTE: Keep this table synchronized with the table in
93 * src/plugins/node_features/knl_cray/node_features_knl_cray.c */
94 static s_p_options_t knl_conf_file_options[] = {
95 {"AllowMCDRAM", S_P_STRING},
96 {"AllowNUMA", S_P_STRING},
97 {"AllowUserBoot", S_P_STRING},
98 {"BootTime", S_P_UINT32},
99 {"CapmcPath", S_P_STRING},
100 {"CapmcPollFreq", S_P_UINT32},
101 {"CapmcRetries", S_P_UINT32},
102 {"CapmcTimeout", S_P_UINT32},
103 {"CnselectPath", S_P_STRING},
104 {"DefaultMCDRAM", S_P_STRING},
105 {"DefaultNUMA", S_P_STRING},
106 {"LogFile", S_P_STRING},
107 {"McPath", S_P_STRING},
108 {"SyscfgPath", S_P_STRING},
109 {"UmeCheckInterval", S_P_UINT32},
110 {NULL}
111 };
112
113 static s_p_hashtbl_t *_config_make_tbl(char *filename);
114 static void _read_config(void);
115 static char *_run_script(char **script_argv, int *status);
116 static int _tot_wait(struct timeval *start_time);
117 static int _update_all_nodes(char *node_names);
118
_config_make_tbl(char * filename)119 static s_p_hashtbl_t *_config_make_tbl(char *filename)
120 {
121 s_p_hashtbl_t *tbl = NULL;
122
123 xassert(filename);
124
125 if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
126 error("%s: s_p_hashtbl_create error: %s", prog_name,
127 slurm_strerror(slurm_get_errno()));
128 return tbl;
129 }
130
131 if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
132 error("%s: s_p_parse_file error: %s", prog_name,
133 slurm_strerror(slurm_get_errno()));
134 s_p_hashtbl_destroy(tbl);
135 tbl = NULL;
136 }
137
138 return tbl;
139 }
140
_read_config(void)141 static void _read_config(void)
142 {
143 char *knl_conf_file;
144 s_p_hashtbl_t *tbl;
145
146 capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
147 knl_conf_file = get_extra_conf_path("knl_cray.conf");
148 if ((tbl = _config_make_tbl(knl_conf_file))) {
149 (void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
150 (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
151 (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
152 (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
153 (void) s_p_get_string(&log_file, "LogFile", tbl);
154 }
155 xfree(knl_conf_file);
156 s_p_hashtbl_destroy(tbl);
157 if (!capmc_path)
158 capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc");
159 capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT);
160 if (!log_file)
161 log_file = slurm_get_job_slurmctld_logfile();
162 }
163
164 /*
165 * Return time in msec since "start time"
166 */
_tot_wait(struct timeval * start_time)167 static int _tot_wait(struct timeval *start_time)
168 {
169 struct timeval end_time;
170 int msec_delay;
171
172 gettimeofday(&end_time, NULL);
173 msec_delay = (end_time.tv_sec - start_time->tv_sec ) * 1000;
174 msec_delay += ((end_time.tv_usec - start_time->tv_usec + 500) / 1000);
175 return msec_delay;
176 }
177
178 /* Run a script and return its stdout plus exit status */
_run_script(char ** script_argv,int * status)179 static char *_run_script(char **script_argv, int *status)
180 {
181 int cc, i, new_wait, resp_size = 0, resp_offset = 0;
182 pid_t cpid;
183 char *resp = NULL;
184 int pfd[2] = { -1, -1 };
185
186 if (access(capmc_path, R_OK | X_OK) < 0) {
187 error("%s: Can not execute: %s", prog_name, capmc_path);
188 *status = 127;
189 resp = xstrdup("Slurm node_features/knl_cray configuration error");
190 return resp;
191 }
192 if (pipe(pfd) != 0) {
193 error("%s: pipe(): %s", prog_name,
194 slurm_strerror(slurm_get_errno()));
195 *status = 127;
196 resp = xstrdup("System error");
197 return resp;
198 }
199
200 if ((cpid = fork()) == 0) {
201 cc = sysconf(_SC_OPEN_MAX);
202 dup2(pfd[1], STDERR_FILENO);
203 dup2(pfd[1], STDOUT_FILENO);
204 for (i = 0; i < cc; i++) {
205 if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
206 close(i);
207 }
208 setpgid(0, 0);
209 execv(capmc_path, script_argv);
210 error("%s: execv(): %s", prog_name,
211 slurm_strerror(slurm_get_errno()));
212 exit(127);
213 } else if (cpid < 0) {
214 close(pfd[0]);
215 close(pfd[1]);
216 error("%s: fork(): %s", prog_name,
217 slurm_strerror(slurm_get_errno()));
218 *status = 127;
219 resp = xstrdup("System error");
220 return resp;
221 } else {
222 struct pollfd fds;
223 struct timeval tstart;
224 resp_size = 1024;
225 resp = xmalloc(resp_size);
226 close(pfd[1]);
227 gettimeofday(&tstart, NULL);
228 while (1) {
229 fds.fd = pfd[0];
230 fds.events = POLLIN | POLLHUP | POLLRDHUP;
231 fds.revents = 0;
232 new_wait = capmc_timeout - _tot_wait(&tstart);
233 if (new_wait <= 0) {
234 error("%s: poll() timeout @ %d msec", prog_name,
235 capmc_timeout);
236 break;
237 }
238 new_wait = MIN(new_wait, MAX_POLL_WAIT);
239 i = poll(&fds, 1, new_wait);
240 if (i == 0) {
241 continue;
242 } else if (i < 0) {
243 error("%s: poll(): %s", prog_name,
244 slurm_strerror(slurm_get_errno()));
245 break;
246 }
247 if ((fds.revents & POLLIN) == 0)
248 break;
249 i = read(pfd[0], resp + resp_offset,
250 resp_size - resp_offset);
251 if (i == 0) {
252 break;
253 } else if (i < 0) {
254 if (errno == EAGAIN)
255 continue;
256 error("%s: read(): %s", prog_name,
257 slurm_strerror(slurm_get_errno()));
258 break;
259 } else {
260 resp_offset += i;
261 if (resp_offset + 1024 >= resp_size) {
262 resp_size *= 2;
263 resp = xrealloc(resp, resp_size);
264 }
265 }
266 }
267 killpg(cpid, SIGTERM);
268 usleep(10000);
269 killpg(cpid, SIGKILL);
270 waitpid(cpid, status, 0);
271 close(pfd[0]);
272 }
273 return resp;
274 }
275
276 /* Convert node name string to equivalent nid string */
_node_names_2_nid_list(char * node_names)277 static char *_node_names_2_nid_list(char *node_names)
278 {
279 char *nid_list = NULL;
280 int i, last_nid_index = -1;
281 bool is_dash = false;
282 bitstr_t *node_bitmap;
283
284 node_bitmap = bit_alloc(100000);
285 for (i = 0; node_names[i]; i++) {
286 int nid_index = 0;
287 /* skip "nid[" */
288 if ((node_names[i] < '0') || (node_names[i] > '9'))
289 continue;
290 /* skip leading zeros */
291 while (node_names[i] == '0')
292 i++;
293 if (node_names[i] == '[')
294 i++;
295 while ((node_names[i] >= '0') && (node_names[i] <= '9')) {
296 nid_index *= 10;
297 nid_index += (node_names[i++] - '0');
298 }
299 if (is_dash && (nid_index >= last_nid_index)) {
300 bit_nset(node_bitmap, last_nid_index, nid_index);
301 } else {
302 bit_set(node_bitmap, nid_index);
303 }
304 if ((is_dash = (node_names[i] == '-')))
305 last_nid_index = nid_index;
306 else if (node_names[i] == '\0')
307 break;
308 }
309
310 i = strlen(node_names) + 1;
311 nid_list = xmalloc(i);
312 bit_fmt(nid_list, i, node_bitmap);
313 bit_free(node_bitmap);
314
315 return nid_list;
316 }
317
318 /* Attempt to shutdown all nodes in a single capmc call.
319 * RET 0 on success, -1 on failure */
_update_all_nodes(char * node_names)320 static int _update_all_nodes(char *node_names)
321 {
322 char *argv[10], *nid_list, *resp_msg;
323 int rc = 0, retry, status = 0;
324
325 nid_list = _node_names_2_nid_list(node_names);
326 if (nid_list == NULL)
327 return -1;
328
329 /* Request node power down.
330 * Example: "capmc node_off –n 43" */
331 argv[0] = "capmc";
332 argv[1] = "node_off";
333 argv[2] = "-n";
334 argv[3] = nid_list;
335 argv[4] = NULL;
336 for (retry = 0; ; retry++) {
337 resp_msg = _run_script(argv, &status);
338 if ((status == 0) ||
339 (resp_msg && strcasestr(resp_msg, "Success"))) {
340 debug("%s: node_off sent to %s", prog_name, argv[3]);
341 xfree(resp_msg);
342 break;
343 }
344 error("%s: capmc(%s,%s,%s): %d %s", prog_name,
345 argv[1], argv[2], argv[3], status, resp_msg);
346 if (resp_msg && strstr(resp_msg, "Could not lookup") &&
347 (retry <= capmc_retries)) {
348 /* State Manager is down. Sleep and retry */
349 error("Cray State Manager is down, retrying request");
350 sleep(1);
351 xfree(resp_msg);
352 } else {
353 /* Non-recoverable error */
354 error("Aborting capmc_suspend for %s", nid_list);
355 rc = -1;
356 xfree(resp_msg);
357 break;
358 }
359 }
360
361 xfree(resp_msg);
362 xfree(nid_list);
363 return rc;
364 }
365
main(int argc,char * argv[])366 int main(int argc, char *argv[])
367 {
368 log_options_t log_opts = LOG_OPTS_INITIALIZER;
369
370 xstrfmtcat(prog_name, "%s[%u]", argv[0], (uint32_t) getpid());
371 _read_config();
372 log_opts.stderr_level = LOG_LEVEL_QUIET;
373 log_opts.syslog_level = LOG_LEVEL_QUIET;
374 if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
375 log_opts.logfile_level = LOG_LEVEL_DEBUG;
376 else
377 log_opts.logfile_level = LOG_LEVEL_ERROR;
378 (void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);
379
380 /* Attempt to shutdown all nodes in a single capmc call. */
381 if (_update_all_nodes(argv[1]) != 0)
382 exit(1);
383
384 xfree(prog_name);
385 exit(0);
386 }
387