1 /*****************************************************************************\
2  *  power_save.c - support node power saving mode. Nodes which have been
3  *  idle for an extended period of time will be placed into a power saving
4  *  mode by running an arbitrary script. This script can lower the voltage
5  *  or frequency of the nodes or can completely power the nodes off.
6  *  When the node is restored to normal operation, another script will be
7  *  executed. Many parameters are available to control this mode of operation.
8  *****************************************************************************
9  *  Copyright (C) 2007 The Regents of the University of California.
10  *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
11  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
12  *  Written by Morris Jette <jette1@llnl.gov>
13  *  CODE-OCEC-09-009. All rights reserved.
14  *
15  *  This file is part of Slurm, a resource management program.
16  *  For details, see <https://slurm.schedmd.com/>.
17  *  Please also read the included file: DISCLAIMER.
18  *
19  *  Slurm is free software; you can redistribute it and/or modify it under
20  *  the terms of the GNU General Public License as published by the Free
21  *  Software Foundation; either version 2 of the License, or (at your option)
22  *  any later version.
23  *
24  *  In addition, as a special exception, the copyright holders give permission
25  *  to link the code of portions of this program with the OpenSSL library under
26  *  certain conditions as described in each individual source file, and
27  *  distribute linked combinations including the two. You must obey the GNU
28  *  General Public License in all respects for all of the code used other than
29  *  OpenSSL. If you modify file(s) with this exception, you may extend this
30  *  exception to your version of the file(s), but you are not obligated to do
31  *  so. If you do not wish to do so, delete this exception statement from your
32  *  version.  If you delete this exception statement from all source files in
33  *  the program, then also delete it here.
34  *
35  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
36  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
37  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
38  *  details.
39  *
40  *  You should have received a copy of the GNU General Public License along
41  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
42  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
43 \*****************************************************************************/
44 
45 #include "config.h"
46 
47 #define _GNU_SOURCE
48 
49 #include <limits.h>	/* For LONG_MIN, LONG_MAX */
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <sys/stat.h>
54 #include <sys/types.h>
55 #include <sys/wait.h>
56 #include <time.h>
57 #include <unistd.h>
58 
59 #include "src/common/bitstring.h"
60 #include "src/common/list.h"
61 #include "src/common/macros.h"
62 #include "src/common/node_features.h"
63 #include "src/common/read_config.h"
64 #include "src/common/slurm_accounting_storage.h"
65 #include "src/common/xstring.h"
66 #include "src/slurmctld/job_scheduler.h"
67 #include "src/slurmctld/locks.h"
68 #include "src/slurmctld/node_scheduler.h"
69 #include "src/slurmctld/power_save.h"
70 #include "src/slurmctld/slurmctld.h"
71 #include "src/slurmctld/trigger_mgr.h"
72 
73 #define MAX_SHUTDOWN_DELAY	10	/* seconds to wait for child procs
74 					 * to exit after daemon shutdown
75 					 * request, then orphan or kill proc */
76 
77 /* Records for tracking processes forked to suspend/resume nodes */
78 typedef struct proc_track_struct {
79 	pid_t  child_pid;	/* pid of process		*/
80 	time_t child_time;	/* start time of process	*/
81 } proc_track_struct_t;
82 static List proc_track_list = NULL;
83 
84 pthread_cond_t power_cond = PTHREAD_COND_INITIALIZER;
85 pthread_mutex_t power_mutex = PTHREAD_MUTEX_INITIALIZER;
86 bool power_save_config = false;
87 bool power_save_enabled = false;
88 bool power_save_started = false;
89 bool power_save_debug = false;
90 
91 int idle_time, suspend_rate, resume_timeout, resume_rate, suspend_timeout;
92 char *suspend_prog = NULL, *resume_prog = NULL, *resume_fail_prog = NULL;
93 char *exc_nodes = NULL, *exc_parts = NULL;
94 time_t last_config = (time_t) 0;
95 time_t last_log = (time_t) 0, last_work_scan = (time_t) 0;
96 uint16_t slurmd_timeout;
97 static bool idle_on_node_suspend = false;
98 
99 typedef struct exc_node_partital {
100 	int exc_node_cnt;
101 	bitstr_t *exc_node_cnt_bitmap;
102 } exc_node_partital_t;
103 List partial_node_list;
104 
105 bitstr_t *exc_node_bitmap = NULL;
106 
107 bitstr_t *resume_node_bitmap = NULL;
108 int   suspend_cnt,   resume_cnt;
109 float suspend_cnt_f, resume_cnt_f;
110 
/* Configuration handling */
static int _init_power_config(void);
static void _clear_power_config(void);
static bool _valid_prog(char *file_name);

/* Periodic node state work */
static void *_init_power_save(void *arg);
static void _do_power_work(time_t now);
static void _re_wake(void);

/* Launching the configured programs */
static pid_t _run_prog(char *prog, char *arg1, char *arg2, uint32_t job_id);
static void _do_suspend(char *host);
static void _do_resume(char *host);
static void _do_failed_nodes(char *hosts);

/* Tracking of forked child processes */
static void _reap_procs(void);
static int _kill_procs(void);
static void _shutdown_power(void);
124 
_exc_node_part_free(void * x)125 static void _exc_node_part_free(void *x)
126 {
127 	exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
128 	FREE_NULL_BITMAP(ext_part_struct->exc_node_cnt_bitmap);
129 	xfree(ext_part_struct);
130 }
131 
_parse_exc_nodes(void)132 static int _parse_exc_nodes(void)
133 {
134 	int rc = SLURM_SUCCESS;
135 	char *end_ptr = NULL, *save_ptr = NULL, *sep, *tmp, *tok;
136 
137 	sep = strchr(exc_nodes, ':');
138 	if (!sep)
139 		return node_name2bitmap(exc_nodes, false, &exc_node_bitmap);
140 
141 	partial_node_list = list_create(_exc_node_part_free);
142 	tmp = xstrdup(exc_nodes);
143 	tok = strtok_r(tmp, ":", &save_ptr);
144 	while (tok) {
145 		bitstr_t *exc_node_cnt_bitmap = NULL;
146 		long ext_node_cnt = 0;
147 		exc_node_partital_t *ext_part_struct;
148 
149 		rc = node_name2bitmap(tok, false, &exc_node_cnt_bitmap);
150 		if ((rc != SLURM_SUCCESS) || !exc_node_cnt_bitmap)
151 			break;
152 		tok = strtok_r(NULL, ",", &save_ptr);
153 		if (tok) {
154 			ext_node_cnt = strtol(tok, &end_ptr, 10);
155 			if ((end_ptr[0] != '\0') || (ext_node_cnt < 1) ||
156 			    (ext_node_cnt >
157 			     bit_set_count(exc_node_cnt_bitmap))) {
158 				FREE_NULL_BITMAP(exc_node_cnt_bitmap);
159 				rc = SLURM_ERROR;
160 				break;
161 			}
162 		} else {
163 			ext_node_cnt = bit_set_count(exc_node_cnt_bitmap);
164 		}
165 		ext_part_struct = xmalloc(sizeof(exc_node_partital_t));
166 		ext_part_struct->exc_node_cnt = (int) ext_node_cnt;
167 		ext_part_struct->exc_node_cnt_bitmap = exc_node_cnt_bitmap;
168 		list_append(partial_node_list, ext_part_struct);
169 		tok = strtok_r(NULL, ":", &save_ptr);
170 	}
171 	xfree(tmp);
172 	if (rc != SLURM_SUCCESS)
173 		FREE_NULL_LIST(partial_node_list);
174 
175 	return rc;
176 }
177 
178 /*
179  * Print elements of the excluded nodes with counts
180  */
_list_part_node_lists(void * x,void * arg)181 static int _list_part_node_lists(void *x, void *arg)
182 {
183 	exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
184 	char *tmp = bitmap2node_name(ext_part_struct->exc_node_cnt_bitmap);
185 	info("power_save module, exclude %d nodes from %s",
186 	     ext_part_struct->exc_node_cnt, tmp);
187 	xfree(tmp);
188 	return 0;
189 
190 }
191 
192 /*
193  * Select the nodes specific nodes to be excluded from consideration for
194  * suspension based upon the node states and specified count. Nodes which
195  * can not be used (e.g. ALLOCATED, DOWN, DRAINED, etc.).
196  */
_pick_exc_nodes(void * x,void * arg)197 static int _pick_exc_nodes(void *x, void *arg)
198 {
199 	bitstr_t **orig_exc_nodes = (bitstr_t **) arg;
200 	exc_node_partital_t *ext_part_struct = (exc_node_partital_t *) x;
201 	bitstr_t *exc_node_cnt_bitmap;
202 	int i, i_first, i_last;
203 	int avail_node_cnt, exc_node_cnt;
204 	node_record_t *node_ptr;
205 
206 	avail_node_cnt = bit_set_count(ext_part_struct->exc_node_cnt_bitmap);
207 	if (ext_part_struct->exc_node_cnt >= avail_node_cnt) {
208 		/* Exclude all nodes in this set */
209 		exc_node_cnt_bitmap =
210 			bit_copy(ext_part_struct->exc_node_cnt_bitmap);
211 	} else {
212 		i = bit_size(ext_part_struct->exc_node_cnt_bitmap);
213 		exc_node_cnt_bitmap = bit_alloc(i);
214 		i_first = bit_ffs(ext_part_struct->exc_node_cnt_bitmap);
215 		if (i_first >= 0)
216 			i_last = bit_fls(ext_part_struct->exc_node_cnt_bitmap);
217 		else
218 			i_last = i_first - 1;
219 		exc_node_cnt = ext_part_struct->exc_node_cnt;
220 		for (i = i_first; i <= i_last; i++) {
221 			if (!bit_test(ext_part_struct->exc_node_cnt_bitmap, i))
222 				continue;
223 			node_ptr = node_record_table_ptr + i;
224 			if (!IS_NODE_IDLE(node_ptr)			||
225 			    IS_NODE_COMPLETING(node_ptr)		||
226 			    IS_NODE_DOWN(node_ptr)			||
227 			    IS_NODE_DRAIN(node_ptr)			||
228 			    IS_NODE_POWER_UP(node_ptr)			||
229 			    IS_NODE_POWER_SAVE(node_ptr)		||
230 			    (node_ptr->sus_job_cnt > 0))
231 				continue;
232 			bit_set(exc_node_cnt_bitmap, i);
233 			if (--exc_node_cnt <= 0)
234 				break;
235 		}
236 	}
237 
238 	if (*orig_exc_nodes == NULL) {
239 		*orig_exc_nodes = exc_node_cnt_bitmap;
240 	} else {
241 		bit_or(*orig_exc_nodes, exc_node_cnt_bitmap);
242 		FREE_NULL_BITMAP(exc_node_cnt_bitmap);
243 	}
244 
245 	if (power_save_debug) {
246 		char *tmp = bitmap2node_name(*orig_exc_nodes);
247 		info("power_save module, excluded nodes %s", tmp);
248 		xfree(tmp);
249 	}
250 
251 	return 0;
252 }
253 
254 /* Perform any power change work to nodes */
_do_power_work(time_t now)255 static void _do_power_work(time_t now)
256 {
257 	int i, wake_cnt = 0, susp_total = 0;
258 	time_t delta_t;
259 	uint32_t susp_state;
260 	bitstr_t *avoid_node_bitmap = NULL, *failed_node_bitmap = NULL;
261 	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
262 	node_record_t *node_ptr;
263 
264 	if (last_work_scan == 0) {
265 		if (exc_nodes && (_parse_exc_nodes() != SLURM_SUCCESS))
266 			error("Invalid SuspendExcNodes %s ignored", exc_nodes);
267 
268 		if (exc_parts) {
269 			char *tmp = NULL, *one_part = NULL, *part_list = NULL;
270 			part_record_t *part_ptr = NULL;
271 
272 			part_list = xstrdup(exc_parts);
273 			one_part = strtok_r(part_list, ",", &tmp);
274 			while (one_part != NULL) {
275 				part_ptr = find_part_record(one_part);
276 				if (!part_ptr) {
277 					error("Invalid SuspendExcPart %s ignored",
278 					      one_part);
279 				} else if (exc_node_bitmap) {
280 					bit_or(exc_node_bitmap,
281 					       part_ptr->node_bitmap);
282 				} else {
283 					exc_node_bitmap =
284 						bit_copy(part_ptr->node_bitmap);
285 				}
286 				one_part = strtok_r(NULL, ",", &tmp);
287 			}
288 			xfree(part_list);
289 		}
290 
291 		if (exc_node_bitmap && power_save_debug) {
292 			char *tmp = bitmap2node_name(exc_node_bitmap);
293 			info("power_save module, excluded nodes %s", tmp);
294 			xfree(tmp);
295 		}
296 		if (partial_node_list && power_save_debug) {
297 			(void) list_for_each(partial_node_list,
298 					     _list_part_node_lists, NULL);
299 
300 		}
301 	}
302 
303 	/* Set limit on counts of nodes to have state changed */
304 	delta_t = now - last_work_scan;
305 	if (delta_t >= 60) {
306 		suspend_cnt_f = 0.0;
307 		resume_cnt_f  = 0.0;
308 	} else {
309 		float rate = (60 - delta_t) / 60.0;
310 		suspend_cnt_f *= rate;
311 		resume_cnt_f  *= rate;
312 	}
313 	suspend_cnt = (suspend_cnt_f + 0.5);
314 	resume_cnt  = (resume_cnt_f  + 0.5);
315 
316 	last_work_scan = now;
317 
318 	/* Identify nodes to avoid considering for suspend */
319 	if (partial_node_list) {
320 		(void) list_for_each(partial_node_list, _pick_exc_nodes,
321 				     &avoid_node_bitmap);
322 	}
323 	if (exc_node_bitmap) {
324 		if (avoid_node_bitmap)
325 			bit_or(avoid_node_bitmap, exc_node_bitmap);
326 		else
327 			avoid_node_bitmap = bit_copy(exc_node_bitmap);
328 	}
329 
330 	/* Build bitmaps identifying each node which should change state */
331 	for (i = 0, node_ptr = node_record_table_ptr;
332 	     i < node_record_count; i++, node_ptr++) {
333 		susp_state = IS_NODE_POWER_SAVE(node_ptr);
334 
335 		if (susp_state)
336 			susp_total++;
337 
338 		/* Resume nodes as appropriate */
339 		if (susp_state &&
340 		    ((resume_rate == 0) || (resume_cnt < resume_rate))	&&
341 		    !IS_NODE_POWERING_DOWN(node_ptr) &&
342 		    (IS_NODE_ALLOCATED(node_ptr) ||
343 		     (node_ptr->last_idle > (now - idle_time)))) {
344 			if (wake_node_bitmap == NULL) {
345 				wake_node_bitmap =
346 					bit_alloc(node_record_count);
347 			}
348 			wake_cnt++;
349 			resume_cnt++;
350 			resume_cnt_f++;
351 			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
352 			node_ptr->node_state |=   NODE_STATE_POWER_UP;
353 			node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
354 			bit_clear(power_node_bitmap, i);
355 			node_ptr->boot_req_time = now;
356 			node_ptr->last_response = now + resume_timeout;
357 			bit_set(booting_node_bitmap, i);
358 			bit_set(resume_node_bitmap,  i);
359 			bit_set(wake_node_bitmap,    i);
360 		}
361 
362 		/* Suspend nodes as appropriate */
363 		if ((susp_state == 0)					&&
364 		    ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
365 		    (IS_NODE_IDLE(node_ptr) || IS_NODE_DOWN(node_ptr))	&&
366 		    (node_ptr->sus_job_cnt == 0)			&&
367 		    (!IS_NODE_COMPLETING(node_ptr))			&&
368 		    (!IS_NODE_POWER_UP(node_ptr))			&&
369 		    (node_ptr->last_idle != 0)				&&
370 		    (node_ptr->last_idle < (now - idle_time))		&&
371 		    ((avoid_node_bitmap == NULL) ||
372 		     (bit_test(avoid_node_bitmap, i) == 0))) {
373 			if (sleep_node_bitmap == NULL) {
374 				sleep_node_bitmap =
375 					bit_alloc(node_record_count);
376 			}
377 			suspend_cnt++;
378 			suspend_cnt_f++;
379 			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
380 			node_ptr->node_state |= NODE_STATE_POWERING_DOWN;
381 			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
382 			bit_set(power_node_bitmap,   i);
383 			bit_set(sleep_node_bitmap,   i);
384 
385 			/* Don't allocate until after SuspendTimeout */
386 			bit_clear(avail_node_bitmap, i);
387 			node_ptr->last_response = now + suspend_timeout;
388 
389 			if (idle_on_node_suspend) {
390 				if (IS_NODE_DOWN(node_ptr)) {
391 					trigger_node_up(node_ptr);
392 					clusteracct_storage_g_node_up(
393 						acct_db_conn, node_ptr, now);
394 				} else if (IS_NODE_IDLE(node_ptr) &&
395 					   (IS_NODE_DRAIN(node_ptr) ||
396 					    IS_NODE_FAIL(node_ptr))) {
397 					clusteracct_storage_g_node_up(
398 						acct_db_conn, node_ptr, now);
399 				}
400 
401 				node_ptr->node_state =
402 					NODE_STATE_IDLE |
403 					(node_ptr->node_state & NODE_STATE_FLAGS);
404 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
405 				node_ptr->node_state &= (~NODE_STATE_FAIL);
406 			}
407 		}
408 
409 		if (IS_NODE_POWERING_DOWN(node_ptr) &&
410 		    (node_ptr->last_response < now)) {
411 
412 			node_ptr->node_state &= (~NODE_STATE_POWERING_DOWN);
413 
414 			if (!IS_NODE_DOWN(node_ptr) &&
415 			    !IS_NODE_DRAIN(node_ptr) &&
416 			    !IS_NODE_FAIL(node_ptr))
417 				make_node_avail(i);
418 
419 			node_ptr->last_idle = 0;
420 		}
421 
422 		/*
423 		 * Down nodes as if not resumed by ResumeTimeout
424 		 */
425 		if (bit_test(booting_node_bitmap, i) &&
426 		    (now > node_ptr->last_response)  &&
427 		    IS_NODE_POWER_UP(node_ptr) &&
428 		    IS_NODE_NO_RESPOND(node_ptr)) {
429 			info("node %s not resumed by ResumeTimeout(%d) - marking down and power_save",
430 			     node_ptr->name, resume_timeout);
431 			/*
432 			 * set_node_down_ptr() will remove the node from the
433 			 * avail_node_bitmap.
434 			 */
435 			set_node_down_ptr(node_ptr, "ResumeTimeout reached");
436 			node_ptr->node_state &= (~NODE_STATE_POWER_UP);
437 			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
438 			bit_set(power_node_bitmap, i);
439 			bit_clear(booting_node_bitmap, i);
440 			bit_clear(resume_node_bitmap, i);
441 			node_ptr->last_idle = 0;
442 
443 			if (resume_fail_prog) {
444 				if (!failed_node_bitmap) {
445 					failed_node_bitmap =
446 						bit_alloc(node_record_count);
447 				}
448 				bit_set(failed_node_bitmap, i);
449 			}
450 		}
451 	}
452 	FREE_NULL_BITMAP(avoid_node_bitmap);
453 	if (power_save_debug && ((now - last_log) > 600) && (susp_total > 0)) {
454 		info("Power save mode: %d nodes", susp_total);
455 		last_log = now;
456 	}
457 
458 	if (sleep_node_bitmap) {
459 		char *nodes;
460 		nodes = bitmap2node_name(sleep_node_bitmap);
461 		if (nodes)
462 			_do_suspend(nodes);
463 		else
464 			error("power_save: bitmap2nodename");
465 		xfree(nodes);
466 		FREE_NULL_BITMAP(sleep_node_bitmap);
467 		/* last_node_update could be changed already by another thread!
468 		last_node_update = now; */
469 	}
470 
471 	if (wake_node_bitmap) {
472 		char *nodes;
473 		nodes = bitmap2node_name(wake_node_bitmap);
474 		if (nodes)
475 			_do_resume(nodes);
476 		else
477 			error("power_save: bitmap2nodename");
478 		xfree(nodes);
479 		FREE_NULL_BITMAP(wake_node_bitmap);
480 		/* last_node_update could be changed already by another thread!
481 		last_node_update = now; */
482 	}
483 
484 	if (failed_node_bitmap) {
485 		char *nodes;
486 		nodes = bitmap2node_name(failed_node_bitmap);
487 		if (nodes)
488 			_do_failed_nodes(nodes);
489 		else
490 			error("power_save: bitmap2nodename");
491 		xfree(nodes);
492 		FREE_NULL_BITMAP(failed_node_bitmap);
493 	}
494 }
495 
496 /*
497  * power_job_reboot - Reboot compute nodes for a job from the head node.
498  * Also change the modes of KNL nodes for node_features/knl_cray plugin.
499  * IN job_ptr - pointer to job that will be initiated
500  * RET SLURM_SUCCESS(0) or error code
501  */
power_job_reboot(job_record_t * job_ptr)502 extern int power_job_reboot(job_record_t *job_ptr)
503 {
504 	int rc = SLURM_SUCCESS;
505 	int i, i_first, i_last;
506 	node_record_t *node_ptr;
507 	bitstr_t *boot_node_bitmap = NULL, *feature_node_bitmap = NULL;
508 	time_t now = time(NULL);
509 	char *nodes, *reboot_features = NULL;
510 	pid_t pid;
511 
512 /*
513  *	NOTE: See reboot_job_reboot() in job_scheduler.c for similar logic
514  *	used by node_features/knl_generic plugin.
515  */
516 	if (job_ptr->reboot)
517 		boot_node_bitmap = bit_copy(job_ptr->node_bitmap);
518 	else
519 		boot_node_bitmap = node_features_reboot(job_ptr);
520 	if (boot_node_bitmap == NULL) {
521 		/* At minimum, the powered down nodes require reboot */
522 		if (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) ||
523 		    bit_overlap_any(booting_node_bitmap,
524 				    job_ptr->node_bitmap)) {
525 			job_ptr->job_state |= JOB_CONFIGURING;
526 			job_ptr->bit_flags |= NODE_REBOOT;
527 		}
528 		return SLURM_SUCCESS;
529 	}
530 
531 	/* Modify state information for all nodes, KNL and others */
532 	i_first = bit_ffs(boot_node_bitmap);
533 	if (i_first >= 0)
534 		i_last = bit_fls(boot_node_bitmap);
535 	else
536 		i_last = i_first - 1;
537 	for (i = i_first; i <= i_last; i++) {
538 		if (!bit_test(boot_node_bitmap, i))
539 			continue;
540 		node_ptr = node_record_table_ptr + i;
541 		resume_cnt++;
542 		resume_cnt_f++;
543 		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
544 		node_ptr->node_state |=   NODE_STATE_POWER_UP;
545 		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
546 		bit_clear(power_node_bitmap, i);
547 		bit_clear(avail_node_bitmap, i);
548 		node_ptr->boot_req_time = now;
549 		node_ptr->last_response = now + resume_timeout;
550 		bit_set(booting_node_bitmap, i);
551 		bit_set(resume_node_bitmap,  i);
552 	}
553 
554 	if (job_ptr->details && job_ptr->details->features &&
555 	    node_features_g_user_update(job_ptr->user_id)) {
556 		reboot_features = node_features_g_job_xlate(
557 					job_ptr->details->features);
558 		if (reboot_features)
559 			feature_node_bitmap = node_features_g_get_node_bitmap();
560 		if (feature_node_bitmap)
561 			bit_and(feature_node_bitmap, boot_node_bitmap);
562 		if (!feature_node_bitmap ||
563 		    (bit_ffs(feature_node_bitmap) == -1)) {
564 			/* No KNL nodes to reboot */
565 			FREE_NULL_BITMAP(feature_node_bitmap);
566 		} else {
567 			bit_and_not(boot_node_bitmap, feature_node_bitmap);
568 			if (bit_ffs(boot_node_bitmap) == -1) {
569 				/* No non-KNL nodes to reboot */
570 				FREE_NULL_BITMAP(boot_node_bitmap);
571 			}
572 		}
573 	}
574 
575 	if (feature_node_bitmap) {
576 		/* Reboot nodes to change KNL NUMA and/or MCDRAM mode */
577 		nodes = bitmap2node_name(feature_node_bitmap);
578 		if (nodes) {
579 			job_ptr->job_state |= JOB_CONFIGURING;
580 			job_ptr->wait_all_nodes = 1;
581 			job_ptr->bit_flags |= NODE_REBOOT;
582 			pid = _run_prog(resume_prog, nodes, reboot_features,
583 					job_ptr->job_id);
584 			if (power_save_debug)
585 				info("%s: pid %d reboot nodes %s features %s",
586 				     __func__, (int) pid, nodes,
587 				     reboot_features);
588 		} else {
589 			error("%s: bitmap2nodename", __func__);
590 			rc = SLURM_ERROR;
591 		}
592 		xfree(nodes);
593 		FREE_NULL_BITMAP(feature_node_bitmap);
594 	}
595 	if (boot_node_bitmap) {
596 		/* Reboot nodes with no feature changes */
597 		nodes = bitmap2node_name(boot_node_bitmap);
598 		if (nodes) {
599 			job_ptr->job_state |= JOB_CONFIGURING;
600 			job_ptr->wait_all_nodes = 1;
601 			job_ptr->bit_flags |= NODE_REBOOT;
602 			pid = _run_prog(resume_prog, nodes, NULL,
603 					job_ptr->job_id);
604 			if (power_save_debug)
605 				info("%s: pid %d reboot nodes %s",
606 				     __func__, (int) pid, nodes);
607 		} else {
608 			error("%s: bitmap2nodename", __func__);
609 			rc = SLURM_ERROR;
610 		}
611 		xfree(nodes);
612 	}
613 	FREE_NULL_BITMAP(boot_node_bitmap);
614 	xfree(reboot_features);
615 
616 	last_node_update = now;
617 
618 	return rc;
619 }
620 
621 /* If slurmctld crashes, the node state that it recovers could differ
622  * from the actual hardware state (e.g. ResumeProgram failed to complete).
623  * To address that, when a node that should be powered up for a running
624  * job is not responding, they try running ResumeProgram again. */
_re_wake(void)625 static void _re_wake(void)
626 {
627 	node_record_t *node_ptr;
628 	bitstr_t *wake_node_bitmap = NULL;
629 	int i;
630 
631 	node_ptr = node_record_table_ptr;
632 	for (i=0; i<node_record_count; i++, node_ptr++) {
633 		if (IS_NODE_ALLOCATED(node_ptr)   &&
634 		    IS_NODE_NO_RESPOND(node_ptr)  &&
635 		    !IS_NODE_POWER_SAVE(node_ptr) &&
636 		    (bit_test(resume_node_bitmap,  i) == 0)) {
637 			if (wake_node_bitmap == NULL) {
638 				wake_node_bitmap =
639 					bit_alloc(node_record_count);
640 			}
641 			bit_set(wake_node_bitmap, i);
642 		}
643 	}
644 
645 	if (wake_node_bitmap) {
646 		char *nodes;
647 		nodes = bitmap2node_name(wake_node_bitmap);
648 		if (nodes) {
649 			pid_t pid = _run_prog(resume_prog, nodes, NULL, 0);
650 			if (power_save_debug)
651 				info("power_save: pid %d rewaking nodes %s",
652 				     (int) pid, nodes);
653 		} else
654 			error("power_save: bitmap2nodename");
655 		xfree(nodes);
656 		FREE_NULL_BITMAP(wake_node_bitmap);
657 	}
658 }
659 
_do_failed_nodes(char * hosts)660 static void _do_failed_nodes(char *hosts)
661 {
662 	pid_t pid = _run_prog(resume_fail_prog, hosts, NULL, 0);
663 	if (power_save_debug)
664 		info("power_save: pid %d handle failed nodes %s",
665 		     (int)pid, hosts);
666 }
667 
_do_resume(char * host)668 static void _do_resume(char *host)
669 {
670 	pid_t pid = _run_prog(resume_prog, host, NULL, 0);
671 	if (power_save_debug)
672 		info("power_save: pid %d waking nodes %s",
673 		     (int) pid, host);
674 }
675 
_do_suspend(char * host)676 static void _do_suspend(char *host)
677 {
678 	pid_t pid = _run_prog(suspend_prog, host, NULL, 0);
679 	if (power_save_debug)
680 		info("power_save: pid %d suspending nodes %s",
681 		     (int) pid, host);
682 }
683 
684 /* run a suspend or resume program
685  * prog IN	- program to run
686  * arg1 IN	- first program argument, the hostlist expression
687  * arg2 IN	- second program argumentor NULL
688  * job_id IN	- Passed as SLURM_JOB_ID environment variable
689  */
_run_prog(char * prog,char * arg1,char * arg2,uint32_t job_id)690 static pid_t _run_prog(char *prog, char *arg1, char *arg2, uint32_t job_id)
691 {
692 	int i;
693 	char *argv[4], job_id_str[32], *pname;
694 	pid_t child;
695 
696 	if (prog == NULL)	/* disabled, useful for testing */
697 		return -1;
698 
699 	if (job_id)
700 		snprintf(job_id_str, sizeof(job_id_str), "%u", job_id);
701 	pname = strrchr(prog, '/');
702 	if (pname == NULL)
703 		argv[0] = prog;
704 	else
705 		argv[0] = pname + 1;
706 	argv[1] = arg1;
707 	argv[2] = arg2;
708 	argv[3] = NULL;
709 
710 	child = fork();
711 	if (child == 0) {
712 		for (i = 0; i < 1024; i++)
713 			(void) close(i);
714 		setpgid(0, 0);
715 		setenv("SLURM_CONF", slurmctld_conf.slurm_conf, 1);
716 		if (job_id)
717 			setenv("SLURM_JOB_ID", job_id_str, 1);
718 		execv(prog, argv);
719 		_exit(1);
720 	} else if (child < 0) {
721 		error("fork: %m");
722 	} else {
723 		/* save the pid */
724 		proc_track_struct_t *proc_track;
725 		proc_track = xmalloc(sizeof(proc_track_struct_t));
726 		proc_track->child_pid = child;
727 		proc_track->child_time = time(NULL);
728 		list_append(proc_track_list, proc_track);
729 	}
730 	return child;
731 }
732 
733 /* reap child processes previously forked to modify node state. */
_reap_procs(void)734 static void _reap_procs(void)
735 {
736 	int delay, max_timeout, rc, status;
737 	ListIterator iter;
738 	proc_track_struct_t *proc_track;
739 
740 	max_timeout = MAX(suspend_timeout, resume_timeout);
741 	iter = list_iterator_create(proc_track_list);
742 	while ((proc_track = list_next(iter))) {
743 		rc = waitpid(proc_track->child_pid, &status, WNOHANG);
744 		if (rc == 0)
745 			continue;
746 
747 		delay = difftime(time(NULL), proc_track->child_time);
748 		if (power_save_debug && (delay > max_timeout)) {
749 			info("power_save: program %d ran for %d sec",
750 			     (int) proc_track->child_pid, delay);
751 		}
752 
753 		if (WIFEXITED(status)) {
754 			rc = WEXITSTATUS(status);
755 			if (rc != 0) {
756 				error("power_save: program exit status of %d",
757 				      rc);
758 			} else
759 				ping_nodes_now = true;
760 		} else if (WIFSIGNALED(status)) {
761 			error("power_save: program signaled: %s",
762 			      strsignal(WTERMSIG(status)));
763 		}
764 
765 		list_delete_item(iter);
766 	}
767 	list_iterator_destroy(iter);
768 }
769 
770 /* kill (or orphan) child processes previously forked to modify node state.
771  * return the count of killed/orphaned processes */
_kill_procs(void)772 static int  _kill_procs(void)
773 {
774 	int killed = 0, rc, status;
775 	ListIterator iter;
776 	proc_track_struct_t *proc_track;
777 
778 	iter = list_iterator_create(proc_track_list);
779 	while ((proc_track = list_next(iter))) {
780 		rc = waitpid(proc_track->child_pid, &status, WNOHANG);
781 		if (rc == 0) {
782 #ifdef  POWER_SAVE_KILL_PROCS
783 			error("power_save: killing process %d",
784 			      proc_track->child_pid);
785 			kill((0 - proc_track->child_pid), SIGKILL);
786 #else
787 			error("power_save: orphaning process %d",
788 			      proc_track->child_pid);
789 #endif
790 			killed++;
791 		} else {
792 			/* process already completed */
793 		}
794 		list_delete_item(iter);
795 	}
796 	list_iterator_destroy(iter);
797 
798 	return killed;
799 }
800 
801 /* shutdown power save daemons */
_shutdown_power(void)802 static void _shutdown_power(void)
803 {
804 	int i, proc_cnt, max_timeout;
805 
806 	max_timeout = MAX(suspend_timeout, resume_timeout);
807 	max_timeout = MIN(max_timeout, MAX_SHUTDOWN_DELAY);
808 	/* Try to avoid orphan processes */
809 	for (i = 0; ; i++) {
810 		_reap_procs();
811 		proc_cnt = list_count(proc_track_list);
812 		if (proc_cnt == 0)	/* all procs completed */
813 			break;
814 		if (i >= max_timeout) {
815 			error("power_save: orphaning %d processes which are "
816 			      "not terminating so slurmctld can exit",
817 			      proc_cnt);
818 			_kill_procs();
819 			break;
820 		} else if (i == 2) {
821 			info("power_save: waiting for %d processes to complete",
822 			     proc_cnt);
823 		} else if (i % 5 == 0) {
824 			debug("power_save: waiting for %d processes to complete",
825 			      proc_cnt);
826 		}
827 		sleep(1);
828 	}
829 }
830 
831 /* Free all allocated memory */
_clear_power_config(void)832 static void _clear_power_config(void)
833 {
834 	xfree(suspend_prog);
835 	xfree(resume_prog);
836 	xfree(exc_nodes);
837 	xfree(exc_parts);
838 	FREE_NULL_BITMAP(exc_node_bitmap);
839 	FREE_NULL_LIST(partial_node_list);
840 }
841 
842 /*
843  * Initialize power_save module parameters.
844  * Return 0 on valid configuration to run power saving,
845  * otherwise log the problem and return -1
846  */
_init_power_config(void)847 static int _init_power_config(void)
848 {
849 	last_config     = slurmctld_conf.last_update;
850 	last_work_scan  = 0;
851 	last_log	= 0;
852 	idle_time       = slurmctld_conf.suspend_time - 1;
853 	suspend_rate    = slurmctld_conf.suspend_rate;
854 	resume_timeout  = slurmctld_conf.resume_timeout;
855 	resume_rate     = slurmctld_conf.resume_rate;
856 	slurmd_timeout  = slurmctld_conf.slurmd_timeout;
857 	suspend_timeout = slurmctld_conf.suspend_timeout;
858 	_clear_power_config();
859 	if (slurmctld_conf.suspend_program)
860 		suspend_prog = xstrdup(slurmctld_conf.suspend_program);
861 	if (slurmctld_conf.resume_fail_program)
862 		resume_fail_prog = xstrdup(slurmctld_conf.resume_fail_program);
863 	if (slurmctld_conf.resume_program)
864 		resume_prog = xstrdup(slurmctld_conf.resume_program);
865 	if (slurmctld_conf.suspend_exc_nodes)
866 		exc_nodes = xstrdup(slurmctld_conf.suspend_exc_nodes);
867 	if (slurmctld_conf.suspend_exc_parts)
868 		exc_parts = xstrdup(slurmctld_conf.suspend_exc_parts);
869 
870 	idle_on_node_suspend = xstrcasestr(slurmctld_conf.slurmctld_params,
871 					   "idle_on_node_suspend");
872 
873 	if (idle_time < 0) {	/* not an error */
874 		debug("power_save module disabled, SuspendTime < 0");
875 		return -1;
876 	}
877 	if (suspend_rate < 0) {
878 		error("power_save module disabled, SuspendRate < 0");
879 		test_config_rc = 1;
880 		return -1;
881 	}
882 	if (resume_rate < 0) {
883 		error("power_save module disabled, ResumeRate < 0");
884 		test_config_rc = 1;
885 		return -1;
886 	}
887 	if (suspend_prog == NULL) {
888 		error("power_save module disabled, NULL SuspendProgram");
889 		test_config_rc = 1;
890 		return -1;
891 	} else if (!_valid_prog(suspend_prog)) {
892 		error("power_save module disabled, invalid SuspendProgram %s",
893 		      suspend_prog);
894 		test_config_rc = 1;
895 		return -1;
896 	}
897 	if (resume_prog == NULL) {
898 		error("power_save module disabled, NULL ResumeProgram");
899 		test_config_rc = 1;
900 		return -1;
901 	} else if (!_valid_prog(resume_prog)) {
902 		error("power_save module disabled, invalid ResumeProgram %s",
903 		      resume_prog);
904 		test_config_rc = 1;
905 		return -1;
906 	}
907 
908 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_POWER_SAVE)
909 		power_save_debug = true;
910 	else
911 		power_save_debug = false;
912 
913 	if (resume_fail_prog && !_valid_prog(resume_fail_prog)) {
914 		/* error's already reported in _valid_prog() */
915 		xfree(resume_fail_prog);
916 	}
917 
918 	return 0;
919 }
920 
/*
 * Test if a program is acceptable for use by the power_save module.
 * The path must be absolute, executable by this process, and the file
 * must not be writable by group or world. Each rejection is logged.
 * file_name IN	- fully qualified pathname of the program
 * RET true if the program may be used
 */
static bool _valid_prog(char *file_name)
{
	struct stat st;
	bool valid = false;

	if (file_name[0] != '/') {
		error("power_save program %s not absolute pathname", file_name);
	} else if (access(file_name, X_OK) != 0) {
		error("power_save program %s not executable", file_name);
	} else if (stat(file_name, &st) != 0) {
		error("power_save program %s not found", file_name);
	} else if (st.st_mode & (S_IWGRP | S_IWOTH)) {
		error("power_save program %s has group or "
		      "world write permission", file_name);
	} else {
		valid = true;
	}

	return valid;
}
947 
948 /*
949  * config_power_mgr - Read power management configuration
950  */
config_power_mgr(void)951 extern void config_power_mgr(void)
952 {
953 	slurm_mutex_lock(&power_mutex);
954 	if (!power_save_config) {
955 		if (_init_power_config() == 0)
956 			power_save_enabled = true;
957 		power_save_config = true;
958 	}
959 	slurm_cond_signal(&power_cond);
960 	slurm_mutex_unlock(&power_mutex);
961 }
962 
963 /*
964  * start_power_mgr - Start power management thread as needed. The thread
965  *	terminates automatically at slurmctld shutdown time.
966  * IN thread_id - pointer to thread ID of the started pthread.
967  */
start_power_mgr(pthread_t * thread_id)968 extern void start_power_mgr(pthread_t *thread_id)
969 {
970 	slurm_mutex_lock(&power_mutex);
971 	if (power_save_started) {     /* Already running */
972 		slurm_mutex_unlock(&power_mutex);
973 		return;
974 	}
975 	power_save_started = true;
976 	proc_track_list = list_create(xfree_ptr);
977 	slurm_mutex_unlock(&power_mutex);
978 
979 	slurm_thread_create(thread_id, _init_power_save, NULL);
980 }
981 
982 /* Report if node power saving is enabled */
power_save_test(void)983 extern bool power_save_test(void)
984 {
985 	bool rc;
986 
987 	slurm_mutex_lock(&power_mutex);
988 	while (!power_save_config) {
989 		slurm_cond_wait(&power_cond, &power_mutex);
990 	}
991 	rc = power_save_enabled;
992 	slurm_mutex_unlock(&power_mutex);
993 
994 	return rc;
995 }
996 
997 /* Free module's allocated memory */
power_save_fini(void)998 extern void power_save_fini(void)
999 {
1000 	slurm_mutex_lock(&power_mutex);
1001 	if (power_save_started) {     /* Already running */
1002 		power_save_started = false;
1003 		FREE_NULL_LIST(proc_track_list);
1004 	}
1005 	slurm_mutex_unlock(&power_mutex);
1006 }
1007 
1008 /*
1009  * init_power_save - Initialize the power save module. Started as a
1010  *	pthread. Terminates automatically at slurmctld shutdown time.
1011  *	Input and output are unused.
1012  */
_init_power_save(void * arg)1013 static void *_init_power_save(void *arg)
1014 {
1015         /* Locks: Read nodes */
1016         slurmctld_lock_t node_read_lock = {
1017                 NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
1018         /* Locks: Write jobs and nodes */
1019         slurmctld_lock_t node_write_lock = {
1020                 NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
1021 	time_t now, boot_time = 0, last_power_scan = 0;
1022 
1023 	if (power_save_config && !power_save_enabled) {
1024 		debug("power_save mode not enabled");
1025 		goto fini;
1026 	}
1027 
1028 	resume_node_bitmap  = bit_alloc(node_record_count);
1029 
1030 	while (slurmctld_config.shutdown_time == 0) {
1031 		sleep(1);
1032 
1033 		_reap_procs();
1034 
1035 		if ((last_config != slurmctld_conf.last_update) &&
1036 		    (_init_power_config())) {
1037 			info("power_save mode has been disabled due to "
1038 			     "configuration changes");
1039 			goto fini;
1040 		}
1041 
1042 		now = time(NULL);
1043 		if (boot_time == 0)
1044 			boot_time = now;
1045 
1046 		/*
1047 		 * Only run every 10 seconds or after a node state change,
1048 		 * whichever happens first
1049 		 */
1050 		if ((last_node_update >= last_power_scan) ||
1051 		    (now >= (last_power_scan + 10))) {
1052 			lock_slurmctld(node_write_lock);
1053 			_do_power_work(now);
1054 			unlock_slurmctld(node_write_lock);
1055 			last_power_scan = now;
1056 		}
1057 
1058 		if (slurmd_timeout &&
1059 		    (now > (boot_time + (slurmd_timeout / 2)))) {
1060 			lock_slurmctld(node_read_lock);
1061 			_re_wake();
1062 			unlock_slurmctld(node_read_lock);
1063 			/* prevent additional executions */
1064 			boot_time += (365 * 24 * 60 * 60);
1065 			slurmd_timeout = 0;
1066 		}
1067 	}
1068 
1069 fini:	_clear_power_config();
1070 	FREE_NULL_BITMAP(resume_node_bitmap);
1071 	_shutdown_power();
1072 	slurm_mutex_lock(&power_mutex);
1073 	list_destroy(proc_track_list);
1074 	proc_track_list = NULL;
1075 	power_save_enabled = false;
1076 	power_save_started = false;
1077 	slurm_cond_signal(&power_cond);
1078 	slurm_mutex_unlock(&power_mutex);
1079 	pthread_exit(NULL);
1080 	return NULL;
1081 }
1082