1 /*****************************************************************************\
2  *  powercapping.c - Definitions for power capping logic in the controller
3  *****************************************************************************
4  *  Copyright (C) 2013 CEA/DAM/DIF
5  *  Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
6  *
7  *  Copyright (C) 2014 Bull S.A.S.
8  *  Written by Yiannis Georgiou <yiannis.georgiou@bull.net>
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 #include <stdlib.h>
40 #include <string.h>
41 
42 #include "src/common/bitstring.h"
43 #include "src/common/layouts_mgr.h"
44 #include "src/common/macros.h"
45 #include "src/common/node_conf.h"
46 #include "src/common/power.h"
47 #include "src/common/slurm_protocol_api.h"
48 #include "src/common/xstring.h"
49 #include "src/slurmctld/powercapping.h"
50 #include "src/slurmctld/reservation.h"
51 #include "src/slurmctld/slurmctld.h"
52 
53 
/*
 * Names used in this file when querying the layouts manager.
 * L_NAME is passed as the layout argument and L_CLUSTER as the entity
 * argument for cluster-wide lookups; the remaining macros are key names.
 */
#define L_NAME		"power"
#define L_CLUSTER	"Cluster"
/* Cluster-level keys. NOTE(review): only L_SUM_MAX is referenced in the
 * code visible in this file; L_SUM_IDLE and L_SUM_CUR appear unused here. */
#define L_SUM_MAX	"MaxSumWatts"
#define L_SUM_IDLE	"IdleSumWatts"
#define L_SUM_CUR	"CurrentSumPower"
/* Keys looked up on per-node entities (entity name is node_ptr->name) */
#define L_NODE_MAX	"MaxWatts"
#define L_NODE_IDLE	"IdleWatts"
#define L_NODE_DOWN	"DownWatts"
#define L_NODE_SAVE	"PowerSaveWatts"
#define L_NODE_CUR	"CurrentPower"
/* Per-node key read as L_T_UINT16 by powercap_get_job_nodes_numfreq() */
#define L_NUM_FREQ	"NumFreqChoices"
/* NOTE(review): not referenced by name in the code visible in this file;
 * the "CurrentCorePower" key is spelled out literally where it is used. */
#define L_CUR_POWER	"CurrentCorePower"
66 
/*
 * Tell whether power capping is active: it is considered enabled whenever the
 * configured cluster cap is non-zero.
 */
static bool _powercap_enabled(void)
{
	return (powercap_get_cluster_current_cap() != 0);
}
73 
_which_power_layout(char * layout)74 int _which_power_layout(char *layout)
75 {
76 	uint32_t max_watts;
77 
78 	return layouts_entity_get_kv(layout, L_CLUSTER, L_SUM_MAX,
79 					 &max_watts, L_T_UINT32);
80 
81 }
82 
which_power_layout(void)83 int which_power_layout(void)
84 {
85 	layout_t* layout;
86 
87 	if (!_powercap_enabled())
88 		return 0;
89 
90 	layout = layouts_get_layout("power");
91 
92 	if (layout == NULL)
93 		return 0;
94 	else if (xstrcmp(layout->name,"default") == 0)
95 		return 1;
96 	else if (xstrcmp(layout->name,"cpufreq") == 0)
97 		return 2;
98 
99 	return 0;
100 }
101 
power_layout_ready(void)102 bool power_layout_ready(void)
103 {
104 	static time_t last_error_time = (time_t) 0;
105 	time_t now = time(NULL);
106 	node_record_t *node_ptr;
107 	uint32_t data[2];
108 	int i;
109 
110 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
111 	     i++, node_ptr++) {
112 		if (layouts_entity_get_mkv(L_NAME, node_ptr->name,
113 		    "MaxWatts,IdleWatts", data, (sizeof(uint32_t) * 2),
114 		    L_T_UINT32)) {
115 			/* Limit error message frequency, once per minute */
116 			if (difftime(now, last_error_time) < 60)
117 				return false;
118 			last_error_time = now;
119 			error("%s: node %s is not in the layouts.d/power.conf file",
120 			     __func__, node_ptr->name);
121 			return false;
122 		}
123 	}
124 	return true;
125 }
126 
127 
powercap_get_cluster_max_watts(void)128 uint32_t powercap_get_cluster_max_watts(void)
129 {
130 	uint32_t max_watts;
131 
132 	if (!_powercap_enabled())
133 		return 0;
134 
135 	if (!power_layout_ready())
136 		return 0;
137 
138 	layouts_entity_get_kv(L_NAME, L_CLUSTER, L_SUM_MAX, &max_watts,
139 				  L_T_UINT32);
140 
141 	return max_watts;
142 }
143 
powercap_get_cluster_min_watts(void)144 uint32_t powercap_get_cluster_min_watts(void)
145 {
146 	uint32_t min_watts = 0, tmp_watts, save_watts, down_watts;
147 	node_record_t *node_ptr;
148 	int i;
149 
150 	if (!_powercap_enabled())
151 		return 0;
152 
153 	if (!power_layout_ready())
154 		return 0;
155 
156 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
157 	     i++, node_ptr++) {
158 		layouts_entity_get_kv(L_NAME, node_ptr->name, L_NODE_IDLE,
159 					  &tmp_watts, L_T_UINT32);
160 		layouts_entity_get_kv(L_NAME, node_ptr->name, L_NODE_DOWN,
161 					  &down_watts, L_T_UINT32);
162 		tmp_watts = MIN(tmp_watts, down_watts);
163 		layouts_entity_get_kv(L_NAME, node_ptr->name, L_NODE_SAVE,
164 					  &save_watts, L_T_UINT32);
165 		tmp_watts = MIN(tmp_watts, save_watts);
166 		min_watts += tmp_watts;
167 	}
168 
169 	return min_watts;
170 }
171 
powercap_get_cluster_current_cap(void)172 uint32_t powercap_get_cluster_current_cap(void)
173 {
174 	char *end_ptr = NULL, *power_params, *tmp_ptr;
175 	uint32_t cap_watts = 0;
176 
177 	power_params = slurm_get_power_parameters();
178 	if (!power_params)
179 		return cap_watts;
180 
181 	if ((tmp_ptr = strstr(power_params, "cap_watts=INFINITE"))) {
182 		cap_watts = INFINITE;
183 	} else if ((tmp_ptr = strstr(power_params, "cap_watts=UNLIMITED"))) {
184 		cap_watts = INFINITE;
185 	} else if ((tmp_ptr = strstr(power_params, "cap_watts="))) {
186 		cap_watts = strtol(tmp_ptr + 10, &end_ptr, 10);
187 		if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K')) {
188 			cap_watts *= 1000;
189 		} else if ((end_ptr[0] == 'm') || (end_ptr[0] == 'M')) {
190 			cap_watts *= 1000000;
191 		}
192 	}
193 	xfree(power_params);
194 
195 	return cap_watts;
196 }
197 
/*
 * Remove the "cap_watts=..." option starting at tmp_ptr from its string.
 * Whatever follows the option's terminating comma is moved down on top of
 * it; if no comma follows, the string is simply truncated at tmp_ptr.
 */
static void _strip_cap_watts(char *tmp_ptr)
{
	char *next_opt = strchr(tmp_ptr, ',');

	if (next_opt == NULL) {
		*tmp_ptr = '\0';
		return;
	}

	/* Step past the comma; source and destination overlap, so memmove() */
	next_opt++;
	memmove(tmp_ptr, next_opt, strlen(next_opt) + 1);
}
218 
powercap_set_cluster_cap(uint32_t new_cap)219 int powercap_set_cluster_cap(uint32_t new_cap)
220 {
221 	char *power_params, *sep, *tmp_ptr;
222 
223 	power_params = slurm_get_power_parameters();
224 	if (power_params) {
225 		while ((tmp_ptr = strstr(power_params, "cap_watts="))) {
226 			_strip_cap_watts(tmp_ptr);
227 		}
228 	}
229 	if (power_params && power_params[0])
230 		sep = ",";
231 	else
232 		sep = "";
233 	if (new_cap == INFINITE)
234 		xstrfmtcat(power_params, "%scap_watts=INFINITE", sep);
235 	else
236 		xstrfmtcat(power_params, "%scap_watts=%u", sep, new_cap);
237 	slurm_set_power_parameters(power_params);
238 	power_g_reconfig();
239 	xfree(power_params);
240 
241 	return 0;
242 }
243 
powercap_get_cluster_adjusted_max_watts(void)244 uint32_t powercap_get_cluster_adjusted_max_watts(void)
245 {
246 	uint32_t adj_max_watts = 0,val;
247 	node_record_t *node_ptr;
248 	int i;
249 
250 	if (!_powercap_enabled())
251 		return 0;
252 	if (!power_layout_ready())
253 		return 0;
254 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
255 	     i++, node_ptr++) {
256 		if (bit_test(power_node_bitmap, i)) {
257 			layouts_entity_get_kv(L_NAME, node_ptr->name,
258 					L_NODE_SAVE, &val, L_T_UINT32);
259 		} else if (!bit_test(up_node_bitmap, i)) {
260 			layouts_entity_get_kv(L_NAME, node_ptr->name,
261 					L_NODE_DOWN, &val, L_T_UINT32);
262 		} else {
263 			layouts_entity_get_kv(L_NAME, node_ptr->name,
264 					L_NODE_MAX, &val, L_T_UINT32);
265 		}
266 		adj_max_watts += val;
267 	}
268 
269 	return adj_max_watts;
270 }
271 
/*
 * Compute the current maximum wattage of the cluster, delegating to the
 * plain per-node computation when which_power_layout() reports 1 and to the
 * DVFS-aware computation otherwise. Both are called with NULL bitmaps.
 * Returns 0 when power capping is disabled or the layout is not ready.
 */
uint32_t powercap_get_cluster_current_max_watts(void)
{
	if (!_powercap_enabled() || !power_layout_ready())
		return 0;

	if (which_power_layout() == 1)
		return powercap_get_node_bitmap_maxwatts(NULL);

	return powercap_get_node_bitmap_maxwatts_dvfs(NULL, NULL, NULL, 0, 0);
}
290 
/*
 * Sum one wattage per node, treating the nodes set in idle_bitmap as idle:
 *   - idle and set in power_node_bitmap  -> PowerSaveWatts
 *   - idle otherwise                     -> IdleWatts
 *   - not idle, not set in up_node_bitmap -> DownWatts
 *   - not idle otherwise                 -> MaxWatts
 *
 * idle_bitmap IN - nodes to consider idle; when NULL, a temporary copy of
 *                  idle_node_bitmap is used instead.
 * Returns 0 when power capping is disabled or the layout is not ready.
 */
uint32_t powercap_get_node_bitmap_maxwatts(bitstr_t *idle_bitmap)
{
	uint32_t max_watts = 0;
	node_record_t *node_ptr;
	int i;
	bitstr_t *tmp_bitmap = NULL;

	if (!_powercap_enabled())
		return 0;
	if (!power_layout_ready())
		return 0;

	/* if no input bitmap, consider the current idle nodes
	 * bitmap as the input bitmap tagging nodes to consider
	 * as idle while computing the max watts of the cluster */
	if (idle_bitmap == NULL) {
		tmp_bitmap = bit_copy(idle_node_bitmap);
		idle_bitmap = tmp_bitmap;
	}

	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		/* Reset per node so a failed lookup adds 0 rather than an
		 * uninitialized value or the previous node's wattage */
		uint32_t val = 0;
		char *key;

		if (bit_test(idle_bitmap, i)) {
			/* idle nodes, 2 cases : power save or not */
			if (bit_test(power_node_bitmap, i))
				key = L_NODE_SAVE;
			else
				key = L_NODE_IDLE;
		} else {
			/* non idle nodes, 2 cases : down or not */
			if (!bit_test(up_node_bitmap, i))
				key = L_NODE_DOWN;
			else
				key = L_NODE_MAX;
		}
		layouts_entity_get_kv(L_NAME, node_ptr->name, key, &val,
				      L_T_UINT32);
		max_watts += val;
	}

	if (tmp_bitmap)
		bit_free(tmp_bitmap);

	return max_watts;
}
345 
/*
 * Compute the power cap available to a job: the cluster cap (or the cluster
 * max watts when the cap is INFINITE) minus the watts returned by
 * job_test_watts_resv() for this job, time and reboot flag.
 * Returns 0 when the cluster cap resolves to 0 or when the reserved watts
 * meet or exceed it (the result never underflows).
 */
uint32_t powercap_get_job_cap(job_record_t *job_ptr, time_t when, bool reboot)
{
	uint32_t cluster_cap, reserved_watts;

	cluster_cap = powercap_get_cluster_current_cap();
	if (cluster_cap == INFINITE)
		cluster_cap = powercap_get_cluster_max_watts();
	if (!cluster_cap)
		return 0; /* should not happen */

	/* watts set aside by reservations, as reported for this job */
	reserved_watts = job_test_watts_resv(job_ptr, when, reboot);

	if (reserved_watts < cluster_cap)
		return (cluster_cap - reserved_watts);

	return 0;
}
365 
/*
 * Return the value of the "Cpufreq<k>" key of the first node set in
 * select_bitmap.
 *
 * select_bitmap IN - nodes of interest
 * k IN             - frequency index used to build the key name
 * Returns 0 when power capping is disabled, when no node is selected, or
 * when the key cannot be read.
 *
 * Previously the loop's break was outside the bit_test() check, so only node
 * index 0 was ever examined and any selection not containing it yielded 0.
 */
uint32_t powercap_get_cpufreq(bitstr_t *select_bitmap, int k)
{
	int i;
	node_record_t *node_ptr;
	char ename[128];
	uint32_t cpufreq = 0;

	if (!_powercap_enabled())
		return cpufreq;

	/* The key name does not depend on the node: build it once */
	snprintf(ename, sizeof(ename), "Cpufreq%d", k);

	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		if (!bit_test(select_bitmap, i))
			continue;
		/* Non-zero return: value not available */
		if (layouts_entity_get_kv(L_NAME, node_ptr->name,
					  ename, &cpufreq, L_T_UINT32))
			cpufreq = 0;
		/* Only the first selected node is consulted */
		break;
	}

	return cpufreq;
}
388 
powercap_get_job_optimal_cpufreq(uint32_t powercap,int * allowed_freqs)389 int powercap_get_job_optimal_cpufreq(uint32_t powercap, int *allowed_freqs)
390 {
391 	uint32_t cur_max_watts = 0, *tmp_max_watts_dvfs = NULL;
392 	int k = 1;
393 	bitstr_t *tmp_bitmap = NULL;
394 
395 	if (!_powercap_enabled())
396 		return 0;
397 
398 	tmp_max_watts_dvfs = xmalloc(sizeof(uint32_t) * (allowed_freqs[0]+1));
399 	tmp_bitmap = bit_copy(idle_node_bitmap);
400 	bit_not(tmp_bitmap);
401 
402 	cur_max_watts = powercap_get_node_bitmap_maxwatts_dvfs(tmp_bitmap,
403 				idle_node_bitmap, tmp_max_watts_dvfs,
404 				allowed_freqs, 0);
405 	FREE_NULL_BITMAP(tmp_bitmap);
406 
407 	if (cur_max_watts > powercap) {
408 		while (tmp_max_watts_dvfs[k] > powercap &&
409 		      k < allowed_freqs[0] + 1) {
410 			k++;
411 		}
412 		if (k == allowed_freqs[0] + 1)
413 			k--;
414 	} else {
415 		k = 1;
416 	}
417 	xfree(tmp_max_watts_dvfs);
418 
419 	return k;
420 }
421 
/*
 * Build the list of frequency indexes usable by a job, based on the first
 * node set in select_bitmap.
 *
 * select_bitmap IN - nodes of interest
 * cpu_freq_min IN  - lowest acceptable frequency, or a flag value
 * cpu_freq_max IN  - highest acceptable frequency, or a flag value
 *
 * Returns NULL when power capping is disabled. Otherwise returns an
 * xmalloc()ed array in which element [0] is the number of entries that
 * follow and elements [1..] are frequency indexes in decreasing order.
 * The count is 0 when both limits are NO_VAL or when no node is selected.
 * NOTE(review): the caller presumably owns and xfree()s the array - confirm.
 */
int *powercap_get_job_nodes_numfreq(bitstr_t *select_bitmap,
				    uint32_t cpu_freq_min,
				    uint32_t cpu_freq_max)
{
	uint16_t num_freq = 0;
	int i, p, *allowed_freqs = NULL, new_num_freq = 0;
	node_record_t *node_ptr;
	char ename[128];
	uint32_t cpufreq;
	bool whole_range;

	if (!_powercap_enabled())
		return NULL;
	if ((cpu_freq_min == NO_VAL) && (cpu_freq_max == NO_VAL)) {
		allowed_freqs = xmalloc(sizeof(int) * 2);
		/* allowed_freqs[0] = 0; Default value */
		return allowed_freqs;
	}

	/* In case a job is submitted with flags Low,High, etc on
	 * --cpu-freq parameter then we consider the whole range
	 * of available frequencies on nodes */
	whole_range = (cpu_freq_min & CPU_FREQ_RANGE_FLAG) ||
		      (cpu_freq_max & CPU_FREQ_RANGE_FLAG);

	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		if (!bit_test(select_bitmap, i))
			continue;

		/* Non-zero return: count not available, treat as none */
		if (layouts_entity_get_kv(L_NAME, node_ptr->name,
					  L_NUM_FREQ, &num_freq, L_T_UINT16))
			num_freq = 0;
		allowed_freqs = xmalloc(sizeof(int) * ((int) num_freq + 2));

		for (p = num_freq; p > 0; p--) {
			bool in_range = false;

			snprintf(ename, sizeof(ename), "Cpufreq%d", p);
			/* Range-test only a value that was actually read;
			 * a failed lookup leaves cpufreq meaningless */
			if (!layouts_entity_get_kv(L_NAME, node_ptr->name,
						   ename, &cpufreq,
						   L_T_UINT32)) {
				in_range = (cpu_freq_min <= cpufreq) &&
					   (cpufreq <= cpu_freq_max);
			}

			if (in_range || whole_range) {
				new_num_freq++;
				allowed_freqs[new_num_freq] = p;
			}
		}
		/* Only the first selected node is consulted */
		break;
	}

	if (allowed_freqs) {
		allowed_freqs[0] = new_num_freq;
	} else {
		allowed_freqs = xmalloc(sizeof(int) * 2);
		/* allowed_freqs[0] = 0; Default value */
	}
	return allowed_freqs;
}
476 
/*
 * Estimate the cluster wattage using per-core layout data, optionally also
 * producing one estimate per allowed frequency.
 *
 * idle_bitmap IN     - nodes to account for as idle
 * select_bitmap IN   - nodes (not set in idle_bitmap) to account for with the
 *                      per-core computation
 * max_watts_dvfs OUT - optional; when non-NULL, entries [1..allowed_freqs[0]]
 *                      are filled with per-frequency estimates
 * allowed_freqs IN   - [0] is the number of entries; [p] is the number used
 *                      to build the "Cpufreq<N>Watts" key. Dereferenced
 *                      whenever max_watts_dvfs is non-NULL.
 * num_cpus IN        - cores to account for on each selected node; 0 or a
 *                      value above the node's CoresCount means CoresCount
 *
 * When idle_bitmap and select_bitmap are both NULL, a copy of
 * idle_node_bitmap is used for both.
 * NOTE(review): if only one of the two is NULL it is passed to bit_test()
 * as-is - confirm callers never do that.
 *
 * Returns 0 when power capping is disabled; otherwise the sum of the
 * per-node values of idle/other nodes plus the per-core estimate of the
 * selected nodes.
 * NOTE(review): return codes of the layout lookups are ignored, so data[]
 * and core_data[] may be used without having been filled in.
 */
uint32_t powercap_get_node_bitmap_maxwatts_dvfs(bitstr_t *idle_bitmap,
			  bitstr_t *select_bitmap, uint32_t *max_watts_dvfs,
			  int* allowed_freqs, uint32_t num_cpus)
{
	uint32_t max_watts = 0, tmp_max_watts = 0, val = 0;
	uint32_t *tmp_max_watts_dvfs = NULL;
	node_record_t *node_ptr;
	int i, p;
	char ename[128], keyname[128];
	bitstr_t *tmp_bitmap = NULL;
	uint32_t data[5], core_data[4];

	if (!_powercap_enabled())
		return 0;

	/* Per-frequency accumulator, only needed when the caller wants the
	 * per-frequency output. The "+=" below presumably relies on xmalloc()
	 * returning zeroed memory - TODO confirm. */
	if (max_watts_dvfs != NULL) {
		tmp_max_watts_dvfs =
			  xmalloc(sizeof(uint32_t)*(allowed_freqs[0]+1));
	}

	/* if no input bitmap, consider the current idle nodes
	 * bitmap as the input bitmap tagging nodes to consider
	 * as idle while computing the max watts of the cluster */
	if (idle_bitmap == NULL && select_bitmap == NULL) {
		tmp_bitmap = bit_copy(idle_node_bitmap);
		idle_bitmap = tmp_bitmap;
		select_bitmap = tmp_bitmap;
	}

	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		if (bit_test(idle_bitmap, i)) {
			/* idle nodes, 2 cases : power save or not */
			if (bit_test(power_node_bitmap, i)) {
				layouts_entity_get_kv(L_NAME,
						  node_ptr->name, L_NODE_SAVE,
						  &val, L_T_UINT32);
			} else {
				layouts_entity_get_kv(L_NAME,
						  node_ptr->name, L_NODE_IDLE,
						  &val, L_T_UINT32);
			}

		} else if (bit_test(select_bitmap, i)) {
			/* Selected node: val stays 0 here; the node is
			 * accounted for in tmp_max_watts (and in
			 * tmp_max_watts_dvfs[] when requested).
			 * data[] order follows the key list:
			 * [0]=IdleWatts [1]=MaxWatts [2]=CoresCount
			 * [3]=LastCore [4]=CurrentPower */
			layouts_entity_get_mkv(L_NAME, node_ptr->name,
				"IdleWatts,MaxWatts,CoresCount,LastCore,CurrentPower",
				data, (sizeof(uint32_t) * 5), L_T_UINT32);

			/* tmp_max_watts = IdleWatts - cpus*IdleCoreWatts
			 * + cpus*MaxCoreWatts */
			/* Core values are read from the entity named after
			 * the node's LastCore value */
			sprintf(ename, "virtualcore%u", data[3]);
			/* NOTE(review): num_cpus is a parameter, so this
			 * clamp carries over to all following nodes */
			if (num_cpus == 0 || num_cpus > data[2])
				num_cpus = data[2];
			/* core_data[0]=IdleCoreWatts [1]=MaxCoreWatts */
			layouts_entity_get_mkv(L_NAME, ename,
					       "IdleCoreWatts,MaxCoreWatts",
					       core_data,
					       (sizeof(uint32_t) * 2),
					       L_T_UINT32);
			/* Base is IdleWatts when CurrentPower is 0, else
			 * CurrentPower.
			 * NOTE(review): data[4] is unsigned, so once the
			 * "== 0" case is excluded "> 0" is always true and
			 * the final MaxWatts branch can never run. */
			if (data[4] == 0) {
				tmp_max_watts += data[0] -
					  num_cpus*core_data[0] +
					  num_cpus*core_data[1];
			} else if (data[4] > 0) {
				tmp_max_watts += data[4] -
					  num_cpus*core_data[0] +
					  num_cpus*core_data[1];
			} else if (num_cpus == data[2])
				tmp_max_watts += data[1];

			if (!tmp_max_watts_dvfs)
				goto skip_dvfs;
			/* Same estimate once per allowed frequency, using
			 * that frequency's per-core watts, core_data[2] */
			for (p = 1; p < (allowed_freqs[0] + 1); p++) {
				sprintf(keyname,
					"IdleCoreWatts,MaxCoreWatts,"
					"Cpufreq%dWatts,CurrentCorePower",
					allowed_freqs[p]);
				layouts_entity_get_mkv(L_NAME, ename, keyname,
					  core_data, (sizeof(uint32_t) * 4),
					  L_T_UINT32);
				if (num_cpus == data[2]) {
					/* All cores used: cores times the
					 * per-core watts, no base term */
					tmp_max_watts_dvfs[p] +=
						  num_cpus*core_data[2];
				} else {
					if (data[4] == 0) {
						tmp_max_watts_dvfs[p] +=
						 	data[0] -
							num_cpus*core_data[0] +
							num_cpus*core_data[2];
					} else {
						tmp_max_watts_dvfs[p] +=
							data[4] -
							num_cpus*core_data[0] +
							num_cpus*core_data[2];
					}
				}
			}
  skip_dvfs:		;
		} else {
			/* non-idle nodes, 2 cases : down or not */
			if (!bit_test(up_node_bitmap, i)) {
				layouts_entity_get_kv(L_NAME,
						  node_ptr->name, L_NODE_DOWN,
						  &val, L_T_UINT32);
			} else {
				layouts_entity_get_kv(L_NAME,
						  node_ptr->name, L_NODE_CUR,
						  &val, L_T_UINT32);
			}
		}
		max_watts += val;
		/* Reset so the next node starts from 0 */
		val = 0;
	}
	/* Per-frequency output: watts of the non-selected nodes (max_watts
	 * does not include tmp_max_watts yet) plus that frequency's estimate
	 * for the selected nodes */
	if (max_watts_dvfs) {
		for (p = 1; p < allowed_freqs[0] + 1; p++) {
			max_watts_dvfs[p] = max_watts + tmp_max_watts_dvfs[p];
		}
		xfree(tmp_max_watts_dvfs);
	}
	max_watts += tmp_max_watts;

	if (tmp_bitmap)
		bit_free(tmp_bitmap);

	return max_watts;
}
602