1 /*****************************************************************************\
2 * power_save.c - support node power saving mode. Nodes which have been
3 * idle for an extended period of time will be placed into a power saving
4 * mode by running an arbitrary script. This script can lower the voltage
5 * or frequency of the nodes or can completely power the nodes off.
6 * When the node is restored to normal operation, another script will be
7 * executed. Many parameters are available to control this mode of operation.
8 *****************************************************************************
9 * Copyright (C) 2007 The Regents of the University of California.
10 * Copyright (C) 2008-2009 Lawrence Livermore National Security.
11 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
12 * Written by Morris Jette <jette1@llnl.gov>
13 * CODE-OCEC-09-009. All rights reserved.
14 *
15 * This file is part of Slurm, a resource management program.
16 * For details, see <https://slurm.schedmd.com/>.
17 * Please also read the included file: DISCLAIMER.
18 *
19 * Slurm is free software; you can redistribute it and/or modify it under
20 * the terms of the GNU General Public License as published by the Free
21 * Software Foundation; either version 2 of the License, or (at your option)
22 * any later version.
23 *
24 * In addition, as a special exception, the copyright holders give permission
25 * to link the code of portions of this program with the OpenSSL library under
26 * certain conditions as described in each individual source file, and
27 * distribute linked combinations including the two. You must obey the GNU
28 * General Public License in all respects for all of the code used other than
29 * OpenSSL. If you modify file(s) with this exception, you may extend this
30 * exception to your version of the file(s), but you are not obligated to do
31 * so. If you do not wish to do so, delete this exception statement from your
32 * version. If you delete this exception statement from all source files in
33 * the program, then also delete it here.
34 *
35 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
36 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
37 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
38 * details.
39 *
40 * You should have received a copy of the GNU General Public License along
41 * with Slurm; if not, write to the Free Software Foundation, Inc.,
42 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
43 \*****************************************************************************/
44
45 #include "config.h"
46
47 #define _GNU_SOURCE
48
49 #include <limits.h> /* For LONG_MIN, LONG_MAX */
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <sys/stat.h>
54 #include <sys/types.h>
55 #include <sys/wait.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #include "src/common/bitstring.h"
60 #include "src/common/list.h"
61 #include "src/common/macros.h"
62 #include "src/common/node_features.h"
63 #include "src/common/read_config.h"
64 #include "src/common/slurm_accounting_storage.h"
65 #include "src/common/xstring.h"
66 #include "src/slurmctld/job_scheduler.h"
67 #include "src/slurmctld/locks.h"
68 #include "src/slurmctld/node_scheduler.h"
69 #include "src/slurmctld/power_save.h"
70 #include "src/slurmctld/slurmctld.h"
71 #include "src/slurmctld/trigger_mgr.h"
72
73 #define MAX_SHUTDOWN_DELAY 10 /* seconds to wait for child procs
74 * to exit after daemon shutdown
75 * request, then orphan or kill proc */
76
77 /* Records for tracking processes forked to suspend/resume nodes */
78 typedef struct proc_track_struct {
79 pid_t child_pid; /* pid of process */
80 time_t child_time; /* start time of process */
81 } proc_track_struct_t;
82 static List proc_track_list = NULL;
83
84 pthread_cond_t power_cond = PTHREAD_COND_INITIALIZER;
85 pthread_mutex_t power_mutex = PTHREAD_MUTEX_INITIALIZER;
86 bool power_save_config = false;
87 bool power_save_enabled = false;
88 bool power_save_started = false;
89 bool power_save_debug = false;
90
91 int idle_time, suspend_rate, resume_timeout, resume_rate, suspend_timeout;
92 char *suspend_prog = NULL, *resume_prog = NULL, *resume_fail_prog = NULL;
93 char *exc_nodes = NULL, *exc_parts = NULL;
94 time_t last_config = (time_t) 0;
95 time_t last_log = (time_t) 0, last_work_scan = (time_t) 0;
96 uint16_t slurmd_timeout;
97 static bool idle_on_node_suspend = false;
98
99 typedef struct exc_node_partital {
100 int exc_node_cnt;
101 bitstr_t *exc_node_cnt_bitmap;
102 } exc_node_partital_t;
103 List partial_node_list;
104
105 bitstr_t *exc_node_bitmap = NULL;
106
107 bitstr_t *resume_node_bitmap = NULL;
108 int suspend_cnt, resume_cnt;
109 float suspend_cnt_f, resume_cnt_f;
110
111 static void _clear_power_config(void);
112 static void _do_failed_nodes(char *hosts);
113 static void _do_power_work(time_t now);
114 static void _do_resume(char *host);
115 static void _do_suspend(char *host);
116 static int _init_power_config(void);
117 static void *_init_power_save(void *arg);
118 static int _kill_procs(void);
119 static void _reap_procs(void);
120 static void _re_wake(void);
121 static pid_t _run_prog(char *prog, char *arg1, char *arg2, uint32_t job_id);
122 static void _shutdown_power(void);
123 static bool _valid_prog(char *file_name);
124
_exc_node_part_free(void * x)125 static void _exc_node_part_free(void *x)
126 {
127 exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
128 FREE_NULL_BITMAP(ext_part_struct->exc_node_cnt_bitmap);
129 xfree(ext_part_struct);
130 }
131
_parse_exc_nodes(void)132 static int _parse_exc_nodes(void)
133 {
134 int rc = SLURM_SUCCESS;
135 char *end_ptr = NULL, *save_ptr = NULL, *sep, *tmp, *tok;
136
137 sep = strchr(exc_nodes, ':');
138 if (!sep)
139 return node_name2bitmap(exc_nodes, false, &exc_node_bitmap);
140
141 partial_node_list = list_create(_exc_node_part_free);
142 tmp = xstrdup(exc_nodes);
143 tok = strtok_r(tmp, ":", &save_ptr);
144 while (tok) {
145 bitstr_t *exc_node_cnt_bitmap = NULL;
146 long ext_node_cnt = 0;
147 exc_node_partital_t *ext_part_struct;
148
149 rc = node_name2bitmap(tok, false, &exc_node_cnt_bitmap);
150 if ((rc != SLURM_SUCCESS) || !exc_node_cnt_bitmap)
151 break;
152 tok = strtok_r(NULL, ",", &save_ptr);
153 if (tok) {
154 ext_node_cnt = strtol(tok, &end_ptr, 10);
155 if ((end_ptr[0] != '\0') || (ext_node_cnt < 1) ||
156 (ext_node_cnt >
157 bit_set_count(exc_node_cnt_bitmap))) {
158 FREE_NULL_BITMAP(exc_node_cnt_bitmap);
159 rc = SLURM_ERROR;
160 break;
161 }
162 } else {
163 ext_node_cnt = bit_set_count(exc_node_cnt_bitmap);
164 }
165 ext_part_struct = xmalloc(sizeof(exc_node_partital_t));
166 ext_part_struct->exc_node_cnt = (int) ext_node_cnt;
167 ext_part_struct->exc_node_cnt_bitmap = exc_node_cnt_bitmap;
168 list_append(partial_node_list, ext_part_struct);
169 tok = strtok_r(NULL, ":", &save_ptr);
170 }
171 xfree(tmp);
172 if (rc != SLURM_SUCCESS)
173 FREE_NULL_LIST(partial_node_list);
174
175 return rc;
176 }
177
178 /*
179 * Print elements of the excluded nodes with counts
180 */
_list_part_node_lists(void * x,void * arg)181 static int _list_part_node_lists(void *x, void *arg)
182 {
183 exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
184 char *tmp = bitmap2node_name(ext_part_struct->exc_node_cnt_bitmap);
185 info("power_save module, exclude %d nodes from %s",
186 ext_part_struct->exc_node_cnt, tmp);
187 xfree(tmp);
188 return 0;
189
190 }
191
192 /*
193 * Select the nodes specific nodes to be excluded from consideration for
194 * suspension based upon the node states and specified count. Nodes which
195 * can not be used (e.g. ALLOCATED, DOWN, DRAINED, etc.).
196 */
_pick_exc_nodes(void * x,void * arg)197 static int _pick_exc_nodes(void *x, void *arg)
198 {
199 bitstr_t **orig_exc_nodes = (bitstr_t **) arg;
200 exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
201 bitstr_t *exc_node_cnt_bitmap;
202 int i, i_first, i_last;
203 int avail_node_cnt, exc_node_cnt;
204 node_record_t *node_ptr;
205
206 avail_node_cnt = bit_set_count(ext_part_struct->exc_node_cnt_bitmap);
207 if (ext_part_struct->exc_node_cnt >= avail_node_cnt) {
208 /* Exclude all nodes in this set */
209 exc_node_cnt_bitmap =
210 bit_copy(ext_part_struct->exc_node_cnt_bitmap);
211 } else {
212 i = bit_size(ext_part_struct->exc_node_cnt_bitmap);
213 exc_node_cnt_bitmap = bit_alloc(i);
214 i_first = bit_ffs(ext_part_struct->exc_node_cnt_bitmap);
215 if (i_first >= 0)
216 i_last = bit_fls(ext_part_struct->exc_node_cnt_bitmap);
217 else
218 i_last = i_first - 1;
219 exc_node_cnt = ext_part_struct->exc_node_cnt;
220 for (i = i_first; i <= i_last; i++) {
221 if (!bit_test(ext_part_struct->exc_node_cnt_bitmap, i))
222 continue;
223 node_ptr = node_record_table_ptr + i;
224 if (!IS_NODE_IDLE(node_ptr) ||
225 IS_NODE_COMPLETING(node_ptr) ||
226 IS_NODE_DOWN(node_ptr) ||
227 IS_NODE_DRAIN(node_ptr) ||
228 IS_NODE_POWER_UP(node_ptr) ||
229 IS_NODE_POWER_SAVE(node_ptr) ||
230 (node_ptr->sus_job_cnt > 0))
231 continue;
232 bit_set(exc_node_cnt_bitmap, i);
233 if (--exc_node_cnt <= 0)
234 break;
235 }
236 }
237
238 if (*orig_exc_nodes == NULL) {
239 *orig_exc_nodes = exc_node_cnt_bitmap;
240 } else {
241 bit_or(*orig_exc_nodes, exc_node_cnt_bitmap);
242 FREE_NULL_BITMAP(exc_node_cnt_bitmap);
243 }
244
245 if (power_save_debug) {
246 char *tmp = bitmap2node_name(*orig_exc_nodes);
247 info("power_save module, excluded nodes %s", tmp);
248 xfree(tmp);
249 }
250
251 return 0;
252 }
253
254 /* Perform any power change work to nodes */
_do_power_work(time_t now)255 static void _do_power_work(time_t now)
256 {
257 int i, wake_cnt = 0, susp_total = 0;
258 time_t delta_t;
259 uint32_t susp_state;
260 bitstr_t *avoid_node_bitmap = NULL, *failed_node_bitmap = NULL;
261 bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
262 node_record_t *node_ptr;
263
264 if (last_work_scan == 0) {
265 if (exc_nodes && (_parse_exc_nodes() != SLURM_SUCCESS))
266 error("Invalid SuspendExcNodes %s ignored", exc_nodes);
267
268 if (exc_parts) {
269 char *tmp = NULL, *one_part = NULL, *part_list = NULL;
270 part_record_t *part_ptr = NULL;
271
272 part_list = xstrdup(exc_parts);
273 one_part = strtok_r(part_list, ",", &tmp);
274 while (one_part != NULL) {
275 part_ptr = find_part_record(one_part);
276 if (!part_ptr) {
277 error("Invalid SuspendExcPart %s ignored",
278 one_part);
279 } else if (exc_node_bitmap) {
280 bit_or(exc_node_bitmap,
281 part_ptr->node_bitmap);
282 } else {
283 exc_node_bitmap =
284 bit_copy(part_ptr->node_bitmap);
285 }
286 one_part = strtok_r(NULL, ",", &tmp);
287 }
288 xfree(part_list);
289 }
290
291 if (exc_node_bitmap && power_save_debug) {
292 char *tmp = bitmap2node_name(exc_node_bitmap);
293 info("power_save module, excluded nodes %s", tmp);
294 xfree(tmp);
295 }
296 if (partial_node_list && power_save_debug) {
297 (void) list_for_each(partial_node_list,
298 _list_part_node_lists, NULL);
299
300 }
301 }
302
303 /* Set limit on counts of nodes to have state changed */
304 delta_t = now - last_work_scan;
305 if (delta_t >= 60) {
306 suspend_cnt_f = 0.0;
307 resume_cnt_f = 0.0;
308 } else {
309 float rate = (60 - delta_t) / 60.0;
310 suspend_cnt_f *= rate;
311 resume_cnt_f *= rate;
312 }
313 suspend_cnt = (suspend_cnt_f + 0.5);
314 resume_cnt = (resume_cnt_f + 0.5);
315
316 last_work_scan = now;
317
318 /* Identify nodes to avoid considering for suspend */
319 if (partial_node_list) {
320 (void) list_for_each(partial_node_list, _pick_exc_nodes,
321 &avoid_node_bitmap);
322 }
323 if (exc_node_bitmap) {
324 if (avoid_node_bitmap)
325 bit_or(avoid_node_bitmap, exc_node_bitmap);
326 else
327 avoid_node_bitmap = bit_copy(exc_node_bitmap);
328 }
329
330 /* Build bitmaps identifying each node which should change state */
331 for (i = 0, node_ptr = node_record_table_ptr;
332 i < node_record_count; i++, node_ptr++) {
333 susp_state = IS_NODE_POWER_SAVE(node_ptr);
334
335 if (susp_state)
336 susp_total++;
337
338 /* Resume nodes as appropriate */
339 if (susp_state &&
340 ((resume_rate == 0) || (resume_cnt < resume_rate)) &&
341 !IS_NODE_POWERING_DOWN(node_ptr) &&
342 (IS_NODE_ALLOCATED(node_ptr) ||
343 (node_ptr->last_idle > (now - idle_time)))) {
344 if (wake_node_bitmap == NULL) {
345 wake_node_bitmap =
346 bit_alloc(node_record_count);
347 }
348 wake_cnt++;
349 resume_cnt++;
350 resume_cnt_f++;
351 node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
352 node_ptr->node_state |= NODE_STATE_POWER_UP;
353 node_ptr->node_state |= NODE_STATE_NO_RESPOND;
354 bit_clear(power_node_bitmap, i);
355 node_ptr->boot_req_time = now;
356 node_ptr->last_response = now + resume_timeout;
357 bit_set(booting_node_bitmap, i);
358 bit_set(resume_node_bitmap, i);
359 bit_set(wake_node_bitmap, i);
360 }
361
362 /* Suspend nodes as appropriate */
363 if ((susp_state == 0) &&
364 ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
365 (IS_NODE_IDLE(node_ptr) || IS_NODE_DOWN(node_ptr)) &&
366 (node_ptr->sus_job_cnt == 0) &&
367 (!IS_NODE_COMPLETING(node_ptr)) &&
368 (!IS_NODE_POWER_UP(node_ptr)) &&
369 (node_ptr->last_idle != 0) &&
370 (node_ptr->last_idle < (now - idle_time)) &&
371 ((avoid_node_bitmap == NULL) ||
372 (bit_test(avoid_node_bitmap, i) == 0))) {
373 if (sleep_node_bitmap == NULL) {
374 sleep_node_bitmap =
375 bit_alloc(node_record_count);
376 }
377 suspend_cnt++;
378 suspend_cnt_f++;
379 node_ptr->node_state |= NODE_STATE_POWER_SAVE;
380 node_ptr->node_state |= NODE_STATE_POWERING_DOWN;
381 node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
382 bit_set(power_node_bitmap, i);
383 bit_set(sleep_node_bitmap, i);
384
385 /* Don't allocate until after SuspendTimeout */
386 bit_clear(avail_node_bitmap, i);
387 node_ptr->last_response = now + suspend_timeout;
388
389 if (idle_on_node_suspend) {
390 if (IS_NODE_DOWN(node_ptr)) {
391 trigger_node_up(node_ptr);
392 clusteracct_storage_g_node_up(
393 acct_db_conn, node_ptr, now);
394 } else if (IS_NODE_IDLE(node_ptr) &&
395 (IS_NODE_DRAIN(node_ptr) ||
396 IS_NODE_FAIL(node_ptr))) {
397 clusteracct_storage_g_node_up(
398 acct_db_conn, node_ptr, now);
399 }
400
401 node_ptr->node_state =
402 NODE_STATE_IDLE |
403 (node_ptr->node_state & NODE_STATE_FLAGS);
404 node_ptr->node_state &= (~NODE_STATE_DRAIN);
405 node_ptr->node_state &= (~NODE_STATE_FAIL);
406 }
407 }
408
409 if (IS_NODE_POWERING_DOWN(node_ptr) &&
410 (node_ptr->last_response < now)) {
411
412 node_ptr->node_state &= (~NODE_STATE_POWERING_DOWN);
413
414 if (!IS_NODE_DOWN(node_ptr) &&
415 !IS_NODE_DRAIN(node_ptr) &&
416 !IS_NODE_FAIL(node_ptr))
417 make_node_avail(i);
418
419 node_ptr->last_idle = 0;
420 }
421
422 /*
423 * Down nodes as if not resumed by ResumeTimeout
424 */
425 if (bit_test(booting_node_bitmap, i) &&
426 (now > node_ptr->last_response) &&
427 IS_NODE_POWER_UP(node_ptr) &&
428 IS_NODE_NO_RESPOND(node_ptr)) {
429 info("node %s not resumed by ResumeTimeout(%d) - marking down and power_save",
430 node_ptr->name, resume_timeout);
431 /*
432 * set_node_down_ptr() will remove the node from the
433 * avail_node_bitmap.
434 */
435 set_node_down_ptr(node_ptr, "ResumeTimeout reached");
436 node_ptr->node_state &= (~NODE_STATE_POWER_UP);
437 node_ptr->node_state |= NODE_STATE_POWER_SAVE;
438 bit_set(power_node_bitmap, i);
439 bit_clear(booting_node_bitmap, i);
440 bit_clear(resume_node_bitmap, i);
441 node_ptr->last_idle = 0;
442
443 if (resume_fail_prog) {
444 if (!failed_node_bitmap) {
445 failed_node_bitmap =
446 bit_alloc(node_record_count);
447 }
448 bit_set(failed_node_bitmap, i);
449 }
450 }
451 }
452 FREE_NULL_BITMAP(avoid_node_bitmap);
453 if (power_save_debug && ((now - last_log) > 600) && (susp_total > 0)) {
454 info("Power save mode: %d nodes", susp_total);
455 last_log = now;
456 }
457
458 if (sleep_node_bitmap) {
459 char *nodes;
460 nodes = bitmap2node_name(sleep_node_bitmap);
461 if (nodes)
462 _do_suspend(nodes);
463 else
464 error("power_save: bitmap2nodename");
465 xfree(nodes);
466 FREE_NULL_BITMAP(sleep_node_bitmap);
467 /* last_node_update could be changed already by another thread!
468 last_node_update = now; */
469 }
470
471 if (wake_node_bitmap) {
472 char *nodes;
473 nodes = bitmap2node_name(wake_node_bitmap);
474 if (nodes)
475 _do_resume(nodes);
476 else
477 error("power_save: bitmap2nodename");
478 xfree(nodes);
479 FREE_NULL_BITMAP(wake_node_bitmap);
480 /* last_node_update could be changed already by another thread!
481 last_node_update = now; */
482 }
483
484 if (failed_node_bitmap) {
485 char *nodes;
486 nodes = bitmap2node_name(failed_node_bitmap);
487 if (nodes)
488 _do_failed_nodes(nodes);
489 else
490 error("power_save: bitmap2nodename");
491 xfree(nodes);
492 FREE_NULL_BITMAP(failed_node_bitmap);
493 }
494 }
495
496 /*
497 * power_job_reboot - Reboot compute nodes for a job from the head node.
498 * Also change the modes of KNL nodes for node_features/knl_cray plugin.
499 * IN job_ptr - pointer to job that will be initiated
500 * RET SLURM_SUCCESS(0) or error code
501 */
power_job_reboot(job_record_t * job_ptr)502 extern int power_job_reboot(job_record_t *job_ptr)
503 {
504 int rc = SLURM_SUCCESS;
505 int i, i_first, i_last;
506 node_record_t *node_ptr;
507 bitstr_t *boot_node_bitmap = NULL, *feature_node_bitmap = NULL;
508 time_t now = time(NULL);
509 char *nodes, *reboot_features = NULL;
510 pid_t pid;
511
512 /*
513 * NOTE: See reboot_job_reboot() in job_scheduler.c for similar logic
514 * used by node_features/knl_generic plugin.
515 */
516 if (job_ptr->reboot)
517 boot_node_bitmap = bit_copy(job_ptr->node_bitmap);
518 else
519 boot_node_bitmap = node_features_reboot(job_ptr);
520 if (boot_node_bitmap == NULL) {
521 /* At minimum, the powered down nodes require reboot */
522 if (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) ||
523 bit_overlap_any(booting_node_bitmap,
524 job_ptr->node_bitmap)) {
525 job_ptr->job_state |= JOB_CONFIGURING;
526 job_ptr->bit_flags |= NODE_REBOOT;
527 }
528 return SLURM_SUCCESS;
529 }
530
531 /* Modify state information for all nodes, KNL and others */
532 i_first = bit_ffs(boot_node_bitmap);
533 if (i_first >= 0)
534 i_last = bit_fls(boot_node_bitmap);
535 else
536 i_last = i_first - 1;
537 for (i = i_first; i <= i_last; i++) {
538 if (!bit_test(boot_node_bitmap, i))
539 continue;
540 node_ptr = node_record_table_ptr + i;
541 resume_cnt++;
542 resume_cnt_f++;
543 node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
544 node_ptr->node_state |= NODE_STATE_POWER_UP;
545 node_ptr->node_state |= NODE_STATE_NO_RESPOND;
546 bit_clear(power_node_bitmap, i);
547 bit_clear(avail_node_bitmap, i);
548 node_ptr->boot_req_time = now;
549 node_ptr->last_response = now + resume_timeout;
550 bit_set(booting_node_bitmap, i);
551 bit_set(resume_node_bitmap, i);
552 }
553
554 if (job_ptr->details && job_ptr->details->features &&
555 node_features_g_user_update(job_ptr->user_id)) {
556 reboot_features = node_features_g_job_xlate(
557 job_ptr->details->features);
558 if (reboot_features)
559 feature_node_bitmap = node_features_g_get_node_bitmap();
560 if (feature_node_bitmap)
561 bit_and(feature_node_bitmap, boot_node_bitmap);
562 if (!feature_node_bitmap ||
563 (bit_ffs(feature_node_bitmap) == -1)) {
564 /* No KNL nodes to reboot */
565 FREE_NULL_BITMAP(feature_node_bitmap);
566 } else {
567 bit_and_not(boot_node_bitmap, feature_node_bitmap);
568 if (bit_ffs(boot_node_bitmap) == -1) {
569 /* No non-KNL nodes to reboot */
570 FREE_NULL_BITMAP(boot_node_bitmap);
571 }
572 }
573 }
574
575 if (feature_node_bitmap) {
576 /* Reboot nodes to change KNL NUMA and/or MCDRAM mode */
577 nodes = bitmap2node_name(feature_node_bitmap);
578 if (nodes) {
579 job_ptr->job_state |= JOB_CONFIGURING;
580 job_ptr->wait_all_nodes = 1;
581 job_ptr->bit_flags |= NODE_REBOOT;
582 pid = _run_prog(resume_prog, nodes, reboot_features,
583 job_ptr->job_id);
584 if (power_save_debug)
585 info("%s: pid %d reboot nodes %s features %s",
586 __func__, (int) pid, nodes,
587 reboot_features);
588 } else {
589 error("%s: bitmap2nodename", __func__);
590 rc = SLURM_ERROR;
591 }
592 xfree(nodes);
593 FREE_NULL_BITMAP(feature_node_bitmap);
594 }
595 if (boot_node_bitmap) {
596 /* Reboot nodes with no feature changes */
597 nodes = bitmap2node_name(boot_node_bitmap);
598 if (nodes) {
599 job_ptr->job_state |= JOB_CONFIGURING;
600 job_ptr->wait_all_nodes = 1;
601 job_ptr->bit_flags |= NODE_REBOOT;
602 pid = _run_prog(resume_prog, nodes, NULL,
603 job_ptr->job_id);
604 if (power_save_debug)
605 info("%s: pid %d reboot nodes %s",
606 __func__, (int) pid, nodes);
607 } else {
608 error("%s: bitmap2nodename", __func__);
609 rc = SLURM_ERROR;
610 }
611 xfree(nodes);
612 }
613 FREE_NULL_BITMAP(boot_node_bitmap);
614 xfree(reboot_features);
615
616 last_node_update = now;
617
618 return rc;
619 }
620
621 /* If slurmctld crashes, the node state that it recovers could differ
622 * from the actual hardware state (e.g. ResumeProgram failed to complete).
623 * To address that, when a node that should be powered up for a running
624 * job is not responding, they try running ResumeProgram again. */
_re_wake(void)625 static void _re_wake(void)
626 {
627 node_record_t *node_ptr;
628 bitstr_t *wake_node_bitmap = NULL;
629 int i;
630
631 node_ptr = node_record_table_ptr;
632 for (i=0; i<node_record_count; i++, node_ptr++) {
633 if (IS_NODE_ALLOCATED(node_ptr) &&
634 IS_NODE_NO_RESPOND(node_ptr) &&
635 !IS_NODE_POWER_SAVE(node_ptr) &&
636 (bit_test(resume_node_bitmap, i) == 0)) {
637 if (wake_node_bitmap == NULL) {
638 wake_node_bitmap =
639 bit_alloc(node_record_count);
640 }
641 bit_set(wake_node_bitmap, i);
642 }
643 }
644
645 if (wake_node_bitmap) {
646 char *nodes;
647 nodes = bitmap2node_name(wake_node_bitmap);
648 if (nodes) {
649 pid_t pid = _run_prog(resume_prog, nodes, NULL, 0);
650 if (power_save_debug)
651 info("power_save: pid %d rewaking nodes %s",
652 (int) pid, nodes);
653 } else
654 error("power_save: bitmap2nodename");
655 xfree(nodes);
656 FREE_NULL_BITMAP(wake_node_bitmap);
657 }
658 }
659
_do_failed_nodes(char * hosts)660 static void _do_failed_nodes(char *hosts)
661 {
662 pid_t pid = _run_prog(resume_fail_prog, hosts, NULL, 0);
663 if (power_save_debug)
664 info("power_save: pid %d handle failed nodes %s",
665 (int)pid, hosts);
666 }
667
_do_resume(char * host)668 static void _do_resume(char *host)
669 {
670 pid_t pid = _run_prog(resume_prog, host, NULL, 0);
671 if (power_save_debug)
672 info("power_save: pid %d waking nodes %s",
673 (int) pid, host);
674 }
675
_do_suspend(char * host)676 static void _do_suspend(char *host)
677 {
678 pid_t pid = _run_prog(suspend_prog, host, NULL, 0);
679 if (power_save_debug)
680 info("power_save: pid %d suspending nodes %s",
681 (int) pid, host);
682 }
683
684 /* run a suspend or resume program
685 * prog IN - program to run
686 * arg1 IN - first program argument, the hostlist expression
687 * arg2 IN - second program argumentor NULL
688 * job_id IN - Passed as SLURM_JOB_ID environment variable
689 */
_run_prog(char * prog,char * arg1,char * arg2,uint32_t job_id)690 static pid_t _run_prog(char *prog, char *arg1, char *arg2, uint32_t job_id)
691 {
692 int i;
693 char *argv[4], job_id_str[32], *pname;
694 pid_t child;
695
696 if (prog == NULL) /* disabled, useful for testing */
697 return -1;
698
699 if (job_id)
700 snprintf(job_id_str, sizeof(job_id_str), "%u", job_id);
701 pname = strrchr(prog, '/');
702 if (pname == NULL)
703 argv[0] = prog;
704 else
705 argv[0] = pname + 1;
706 argv[1] = arg1;
707 argv[2] = arg2;
708 argv[3] = NULL;
709
710 child = fork();
711 if (child == 0) {
712 for (i = 0; i < 1024; i++)
713 (void) close(i);
714 setpgid(0, 0);
715 setenv("SLURM_CONF", slurmctld_conf.slurm_conf, 1);
716 if (job_id)
717 setenv("SLURM_JOB_ID", job_id_str, 1);
718 execv(prog, argv);
719 _exit(1);
720 } else if (child < 0) {
721 error("fork: %m");
722 } else {
723 /* save the pid */
724 proc_track_struct_t *proc_track;
725 proc_track = xmalloc(sizeof(proc_track_struct_t));
726 proc_track->child_pid = child;
727 proc_track->child_time = time(NULL);
728 list_append(proc_track_list, proc_track);
729 }
730 return child;
731 }
732
733 /* reap child processes previously forked to modify node state. */
_reap_procs(void)734 static void _reap_procs(void)
735 {
736 int delay, max_timeout, rc, status;
737 ListIterator iter;
738 proc_track_struct_t *proc_track;
739
740 max_timeout = MAX(suspend_timeout, resume_timeout);
741 iter = list_iterator_create(proc_track_list);
742 while ((proc_track = list_next(iter))) {
743 rc = waitpid(proc_track->child_pid, &status, WNOHANG);
744 if (rc == 0)
745 continue;
746
747 delay = difftime(time(NULL), proc_track->child_time);
748 if (power_save_debug && (delay > max_timeout)) {
749 info("power_save: program %d ran for %d sec",
750 (int) proc_track->child_pid, delay);
751 }
752
753 if (WIFEXITED(status)) {
754 rc = WEXITSTATUS(status);
755 if (rc != 0) {
756 error("power_save: program exit status of %d",
757 rc);
758 } else
759 ping_nodes_now = true;
760 } else if (WIFSIGNALED(status)) {
761 error("power_save: program signaled: %s",
762 strsignal(WTERMSIG(status)));
763 }
764
765 list_delete_item(iter);
766 }
767 list_iterator_destroy(iter);
768 }
769
770 /* kill (or orphan) child processes previously forked to modify node state.
771 * return the count of killed/orphaned processes */
_kill_procs(void)772 static int _kill_procs(void)
773 {
774 int killed = 0, rc, status;
775 ListIterator iter;
776 proc_track_struct_t *proc_track;
777
778 iter = list_iterator_create(proc_track_list);
779 while ((proc_track = list_next(iter))) {
780 rc = waitpid(proc_track->child_pid, &status, WNOHANG);
781 if (rc == 0) {
782 #ifdef POWER_SAVE_KILL_PROCS
783 error("power_save: killing process %d",
784 proc_track->child_pid);
785 kill((0 - proc_track->child_pid), SIGKILL);
786 #else
787 error("power_save: orphaning process %d",
788 proc_track->child_pid);
789 #endif
790 killed++;
791 } else {
792 /* process already completed */
793 }
794 list_delete_item(iter);
795 }
796 list_iterator_destroy(iter);
797
798 return killed;
799 }
800
801 /* shutdown power save daemons */
_shutdown_power(void)802 static void _shutdown_power(void)
803 {
804 int i, proc_cnt, max_timeout;
805
806 max_timeout = MAX(suspend_timeout, resume_timeout);
807 max_timeout = MIN(max_timeout, MAX_SHUTDOWN_DELAY);
808 /* Try to avoid orphan processes */
809 for (i = 0; ; i++) {
810 _reap_procs();
811 proc_cnt = list_count(proc_track_list);
812 if (proc_cnt == 0) /* all procs completed */
813 break;
814 if (i >= max_timeout) {
815 error("power_save: orphaning %d processes which are "
816 "not terminating so slurmctld can exit",
817 proc_cnt);
818 _kill_procs();
819 break;
820 } else if (i == 2) {
821 info("power_save: waiting for %d processes to complete",
822 proc_cnt);
823 } else if (i % 5 == 0) {
824 debug("power_save: waiting for %d processes to complete",
825 proc_cnt);
826 }
827 sleep(1);
828 }
829 }
830
831 /* Free all allocated memory */
_clear_power_config(void)832 static void _clear_power_config(void)
833 {
834 xfree(suspend_prog);
835 xfree(resume_prog);
836 xfree(exc_nodes);
837 xfree(exc_parts);
838 FREE_NULL_BITMAP(exc_node_bitmap);
839 FREE_NULL_LIST(partial_node_list);
840 }
841
842 /*
843 * Initialize power_save module parameters.
844 * Return 0 on valid configuration to run power saving,
845 * otherwise log the problem and return -1
846 */
_init_power_config(void)847 static int _init_power_config(void)
848 {
849 last_config = slurmctld_conf.last_update;
850 last_work_scan = 0;
851 last_log = 0;
852 idle_time = slurmctld_conf.suspend_time - 1;
853 suspend_rate = slurmctld_conf.suspend_rate;
854 resume_timeout = slurmctld_conf.resume_timeout;
855 resume_rate = slurmctld_conf.resume_rate;
856 slurmd_timeout = slurmctld_conf.slurmd_timeout;
857 suspend_timeout = slurmctld_conf.suspend_timeout;
858 _clear_power_config();
859 if (slurmctld_conf.suspend_program)
860 suspend_prog = xstrdup(slurmctld_conf.suspend_program);
861 if (slurmctld_conf.resume_fail_program)
862 resume_fail_prog = xstrdup(slurmctld_conf.resume_fail_program);
863 if (slurmctld_conf.resume_program)
864 resume_prog = xstrdup(slurmctld_conf.resume_program);
865 if (slurmctld_conf.suspend_exc_nodes)
866 exc_nodes = xstrdup(slurmctld_conf.suspend_exc_nodes);
867 if (slurmctld_conf.suspend_exc_parts)
868 exc_parts = xstrdup(slurmctld_conf.suspend_exc_parts);
869
870 idle_on_node_suspend = xstrcasestr(slurmctld_conf.slurmctld_params,
871 "idle_on_node_suspend");
872
873 if (idle_time < 0) { /* not an error */
874 debug("power_save module disabled, SuspendTime < 0");
875 return -1;
876 }
877 if (suspend_rate < 0) {
878 error("power_save module disabled, SuspendRate < 0");
879 test_config_rc = 1;
880 return -1;
881 }
882 if (resume_rate < 0) {
883 error("power_save module disabled, ResumeRate < 0");
884 test_config_rc = 1;
885 return -1;
886 }
887 if (suspend_prog == NULL) {
888 error("power_save module disabled, NULL SuspendProgram");
889 test_config_rc = 1;
890 return -1;
891 } else if (!_valid_prog(suspend_prog)) {
892 error("power_save module disabled, invalid SuspendProgram %s",
893 suspend_prog);
894 test_config_rc = 1;
895 return -1;
896 }
897 if (resume_prog == NULL) {
898 error("power_save module disabled, NULL ResumeProgram");
899 test_config_rc = 1;
900 return -1;
901 } else if (!_valid_prog(resume_prog)) {
902 error("power_save module disabled, invalid ResumeProgram %s",
903 resume_prog);
904 test_config_rc = 1;
905 return -1;
906 }
907
908 if (slurmctld_conf.debug_flags & DEBUG_FLAG_POWER_SAVE)
909 power_save_debug = true;
910 else
911 power_save_debug = false;
912
913 if (resume_fail_prog && !_valid_prog(resume_fail_prog)) {
914 /* error's already reported in _valid_prog() */
915 xfree(resume_fail_prog);
916 }
917
918 return 0;
919 }
920
/*
 * Test if a program is acceptable for use by this module: it must be given
 * as an absolute pathname and be an existing, regular, executable file
 * without group or world write permission. Problems are logged.
 * file_name IN - pathname of the program, must not be NULL
 * RET true if the program is valid
 */
static bool _valid_prog(char *file_name)
{
	struct stat buf;

	if (file_name[0] != '/') {
		error("power_save program %s not absolute pathname", file_name);
		return false;
	}

	/*
	 * stat() before access() so that a missing file is reported as
	 * "not found" rather than "not executable".
	 */
	if (stat(file_name, &buf)) {
		error("power_save program %s not found", file_name);
		return false;
	}

	/* A directory can pass the X_OK test, but can not be executed */
	if (!S_ISREG(buf.st_mode)) {
		error("power_save program %s is not a regular file", file_name);
		return false;
	}

	if (access(file_name, X_OK) != 0) {
		error("power_save program %s not executable", file_name);
		return false;
	}

	if (buf.st_mode & 022) {
		error("power_save program %s has group or "
		      "world write permission", file_name);
		return false;
	}

	return true;
}
947
948 /*
949 * config_power_mgr - Read power management configuration
950 */
config_power_mgr(void)951 extern void config_power_mgr(void)
952 {
953 slurm_mutex_lock(&power_mutex);
954 if (!power_save_config) {
955 if (_init_power_config() == 0)
956 power_save_enabled = true;
957 power_save_config = true;
958 }
959 slurm_cond_signal(&power_cond);
960 slurm_mutex_unlock(&power_mutex);
961 }
962
963 /*
964 * start_power_mgr - Start power management thread as needed. The thread
965 * terminates automatically at slurmctld shutdown time.
966 * IN thread_id - pointer to thread ID of the started pthread.
967 */
start_power_mgr(pthread_t * thread_id)968 extern void start_power_mgr(pthread_t *thread_id)
969 {
970 slurm_mutex_lock(&power_mutex);
971 if (power_save_started) { /* Already running */
972 slurm_mutex_unlock(&power_mutex);
973 return;
974 }
975 power_save_started = true;
976 proc_track_list = list_create(xfree_ptr);
977 slurm_mutex_unlock(&power_mutex);
978
979 slurm_thread_create(thread_id, _init_power_save, NULL);
980 }
981
982 /* Report if node power saving is enabled */
power_save_test(void)983 extern bool power_save_test(void)
984 {
985 bool rc;
986
987 slurm_mutex_lock(&power_mutex);
988 while (!power_save_config) {
989 slurm_cond_wait(&power_cond, &power_mutex);
990 }
991 rc = power_save_enabled;
992 slurm_mutex_unlock(&power_mutex);
993
994 return rc;
995 }
996
997 /* Free module's allocated memory */
power_save_fini(void)998 extern void power_save_fini(void)
999 {
1000 slurm_mutex_lock(&power_mutex);
1001 if (power_save_started) { /* Already running */
1002 power_save_started = false;
1003 FREE_NULL_LIST(proc_track_list);
1004 }
1005 slurm_mutex_unlock(&power_mutex);
1006 }
1007
1008 /*
1009 * init_power_save - Initialize the power save module. Started as a
1010 * pthread. Terminates automatically at slurmctld shutdown time.
1011 * Input and output are unused.
1012 */
_init_power_save(void * arg)1013 static void *_init_power_save(void *arg)
1014 {
1015 /* Locks: Read nodes */
1016 slurmctld_lock_t node_read_lock = {
1017 NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
1018 /* Locks: Write jobs and nodes */
1019 slurmctld_lock_t node_write_lock = {
1020 NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
1021 time_t now, boot_time = 0, last_power_scan = 0;
1022
1023 if (power_save_config && !power_save_enabled) {
1024 debug("power_save mode not enabled");
1025 goto fini;
1026 }
1027
1028 resume_node_bitmap = bit_alloc(node_record_count);
1029
1030 while (slurmctld_config.shutdown_time == 0) {
1031 sleep(1);
1032
1033 _reap_procs();
1034
1035 if ((last_config != slurmctld_conf.last_update) &&
1036 (_init_power_config())) {
1037 info("power_save mode has been disabled due to "
1038 "configuration changes");
1039 goto fini;
1040 }
1041
1042 now = time(NULL);
1043 if (boot_time == 0)
1044 boot_time = now;
1045
1046 /*
1047 * Only run every 10 seconds or after a node state change,
1048 * whichever happens first
1049 */
1050 if ((last_node_update >= last_power_scan) ||
1051 (now >= (last_power_scan + 10))) {
1052 lock_slurmctld(node_write_lock);
1053 _do_power_work(now);
1054 unlock_slurmctld(node_write_lock);
1055 last_power_scan = now;
1056 }
1057
1058 if (slurmd_timeout &&
1059 (now > (boot_time + (slurmd_timeout / 2)))) {
1060 lock_slurmctld(node_read_lock);
1061 _re_wake();
1062 unlock_slurmctld(node_read_lock);
1063 /* prevent additional executions */
1064 boot_time += (365 * 24 * 60 * 60);
1065 slurmd_timeout = 0;
1066 }
1067 }
1068
1069 fini: _clear_power_config();
1070 FREE_NULL_BITMAP(resume_node_bitmap);
1071 _shutdown_power();
1072 slurm_mutex_lock(&power_mutex);
1073 list_destroy(proc_track_list);
1074 proc_track_list = NULL;
1075 power_save_enabled = false;
1076 power_save_started = false;
1077 slurm_cond_signal(&power_cond);
1078 slurm_mutex_unlock(&power_mutex);
1079 pthread_exit(NULL);
1080 return NULL;
1081 }
1082