1 /*****************************************************************************\
2  *  node_features_knl_generic.c - Plugin for managing Intel KNL state
3  *  information on a generic Linux cluster
4  *****************************************************************************
5  *  Copyright (C) 2016-2017 SchedMD LLC.
6  *  Written by Morris Jette <jette@schedmd.com>
7  *             Danny Auble <da@schedmd.com>
8  *
9  *  This file is part of Slurm, a resource management program.
10  *  For details, see <https://slurm.schedmd.com/>.
11  *  Please also read the included file: DISCLAIMER.
12  *
13  *  Slurm is free software; you can redistribute it and/or modify it under
14  *  the terms of the GNU General Public License as published by the Free
15  *  Software Foundation; either version 2 of the License, or (at your option)
16  *  any later version.
17  *
18  *  In addition, as a special exception, the copyright holders give permission
19  *  to link the code of portions of this program with the OpenSSL library under
20  *  certain conditions as described in each individual source file, and
21  *  distribute linked combinations including the two. You must obey the GNU
22  *  General Public License in all respects for all of the code used other than
23  *  OpenSSL. If you modify file(s) with this exception, you may extend this
24  *  exception to your version of the file(s), but you are not obligated to do
25  *  so. If you do not wish to do so, delete this exception statement from your
26  *  version.  If you delete this exception statement from all source files in
27  *  the program, then also delete it here.
28  *
29  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
32  *  details.
33  *
34  *  You should have received a copy of the GNU General Public License along
35  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
36  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
37 \*****************************************************************************/
38 
39 #include "config.h"
40 
41 #define _GNU_SOURCE	/* For POLLRDHUP */
42 #include <ctype.h>
43 #include <fcntl.h>
44 #ifdef HAVE_NUMA
45 #undef NUMA_VERSION1_COMPATIBILITY
46 #include <numa.h>
47 #endif
48 #include <poll.h>
49 #include <pthread.h>
50 #include <signal.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <sys/stat.h>
54 #include <sys/types.h>
55 #include <time.h>
56 #include <unistd.h>
57 
58 #if defined(__APPLE__) || defined(__DragonFly__) || defined(__NetBSD__)
59 #define POLLRDHUP POLLHUP
60 #endif
61 
62 #include "slurm/slurm.h"
63 
64 #include "src/common/assoc_mgr.h"
65 #include "src/common/bitstring.h"
66 #include "src/common/fd.h"
67 #include "src/common/gres.h"
68 #include "src/common/list.h"
69 #include "src/common/macros.h"
70 #include "src/common/pack.h"
71 #include "src/common/parse_config.h"
72 #include "src/common/read_config.h"
73 #include "src/common/slurm_protocol_api.h"
74 #include "src/common/slurm_resource_info.h"
75 #include "src/common/timers.h"
76 #include "src/common/uid.h"
77 #include "src/common/xmalloc.h"
78 #include "src/common/xstring.h"
79 #include "src/slurmctld/job_scheduler.h"
80 #include "src/slurmctld/locks.h"
81 #include "src/slurmctld/node_scheduler.h"
82 #include "src/slurmctld/reservation.h"
83 #include "src/slurmctld/slurmctld.h"
84 #include "src/slurmctld/state_save.h"
85 #include "src/slurmd/slurmd/req.h"
86 
/* Maximum poll wait time for child processes, in milliseconds */
#define MAX_POLL_WAIT   500
/* Value given to syscfg_timeout by init() before the configuration file is
 * read; _run_script() compares it against elapsed milliseconds */
#define DEFAULT_SYSCFG_TIMEOUT 1000

/*
 * Intel Knights Landing Configuration Modes
 *
 * NUMA modes occupy the low byte (mask KNL_NUMA_FLAG) and MCDRAM modes the
 * high byte (mask KNL_MCDRAM_FLAG) of a uint16_t, one bit per mode.
 * KNL_NUMA_CNT and KNL_MCDRAM_CNT size the per-mode arrays cpu_bind[] and
 * mcdram_pct[].
 */
#define KNL_NUMA_CNT	5
#define KNL_MCDRAM_CNT	5
#define KNL_NUMA_FLAG	0x00ff
#define KNL_ALL2ALL	0x0001
#define KNL_SNC2	0x0002
#define KNL_SNC4	0x0004
#define KNL_HEMI	0x0008
#define KNL_QUAD	0x0010
#define KNL_MCDRAM_FLAG	0xff00
#define KNL_CACHE	0x0100
#define KNL_EQUAL	0x0200
#define KNL_HYBRID	0x0400
#define KNL_FLAT	0x0800
#define KNL_AUTO	0x1000

/* MODPROBE_PATH may be overridden at build time */
#ifndef MODPROBE_PATH
#define MODPROBE_PATH	"/sbin/modprobe"
#endif
#define ZONE_SORT_PATH	"/sys/kernel/zone_sort_free_pages/nodeid"
//#define ZONE_SORT_PATH	"/tmp/nodeid"	/* For testing */

/* 16 * 2^30; may be overridden at build time.
 * NOTE(review): presumably a byte count (16 GiB) - confirm against users */
#ifndef DEFAULT_MCDRAM_SIZE
#define DEFAULT_MCDRAM_SIZE ((uint64_t) 16 * 1024 * 1024 * 1024)
#endif
116 
/* These are defined here so when we link with something other than
 * the slurmctld we will have these symbols defined.  They will get
 * overwritten when linking with the slurmctld.
 */
#if defined (__APPLE__)
extern slurmctld_config_t slurmctld_config __attribute__((weak_import));
#else
slurmctld_config_t slurmctld_config;
#endif

/*
 * Kind of KNL system, set from the SystemType configuration parameter.
 * KNL_SYSTEM_TYPE_NOT_SET is what _knl_system_type_token() returns for an
 * unrecognized name.
 */
typedef enum {
	KNL_SYSTEM_TYPE_NOT_SET,
	KNL_SYSTEM_TYPE_INTEL,
	KNL_SYSTEM_TYPE_DELL,
} knl_system_type_t;
132 
/*
 * These variables are required by the node_features plugin interface.  If
 * they are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *      <application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "node_features" for Slurm node_features) and <method> is a
 * description of how this plugin satisfies that application.  Slurm will only
 * load a node_features plugin if the plugin_type string has a prefix of
 * "node_features/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char plugin_name[]        = "node_features knl_generic plugin";
const char plugin_type[]        = "node_features/knl_generic";
const uint32_t plugin_version   = SLURM_VERSION_NUMBER;
161 
/* Configuration Parameters */
/* Bitmasks of permitted modes (AllowMCDRAM / AllowNUMA) */
static uint16_t allow_mcdram = KNL_MCDRAM_FLAG;
static uint16_t allow_numa = KNL_NUMA_FLAG;
/* UIDs parsed from AllowUserBoot by _make_uid_array(), and their count */
static uid_t *allowed_uid = NULL;
static int allowed_uid_cnt = 0;
static uint32_t boot_time = (5 * 60);	/* 5 minute estimated boot time */
/* Serializes access to the "reconfig" flag and the init() call it triggers */
static pthread_mutex_t config_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint32_t cpu_bind[KNL_NUMA_CNT];	/* Derived from numa_cpu_bind */
/* True when DEBUG_FLAG_NODE_FEATURES is set; enables extra logging */
static bool debug_flag = false;
/* Single-bit default modes (DefaultMCDRAM / DefaultNUMA) */
static uint16_t default_mcdram = KNL_CACHE;
static uint16_t default_numa = KNL_ALL2ALL;
/* McPath; init() falls back to "/sys/devices/system/edac/mc" */
static char *mc_path = NULL;
static uint32_t node_reboot_weight = (INFINITE - 1);
/* Raw NumaCpuBind string, parsed by _update_cpu_bind() */
static char *numa_cpu_bind = NULL;
/* Upper bound on a _run_script() call, in milliseconds */
static uint32_t syscfg_timeout = 0;
/* Set by node_features_p_reconfig(); node_features_p_get_node() then
 * re-runs init() and clears it */
static bool reconfig = false;
/* Non-zero once fini() has run; tells _ume_agent() to exit */
static time_t shutdown_time = 0;
/* -1 until init() runs, then 1 if syscfg_path is executable, else 0 */
static int syscfg_found = -1;
/* SyscfgPath; init() falls back to "/usr/bin/syscfg" */
static char *syscfg_path = NULL;
static knl_system_type_t knl_system_type = KNL_SYSTEM_TYPE_INTEL;
/* Pause between _ume_agent() scans, in microseconds; 0 means init() does
 * not start the agent */
static uint32_t ume_check_interval = 0;
/* Protects ume_thread */
static pthread_mutex_t ume_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ume_thread = 0;
/* "Force" parameter: when non-zero, node_features_p_node_state() proceeds
 * even if hw_is_knl is 0 */
static uint32_t force_load = 0;
/* -1 until init() runs, then 1 if /proc/cpuinfo mentions "Xeon Phi", else 0 */
static int hw_is_knl = -1;

/* Percentage of MCDRAM used for cache by type, updated from syscfg */
static int mcdram_pct[KNL_MCDRAM_CNT];
static uint64_t *mcdram_per_node = NULL;
static bitstr_t *knl_node_bitmap = NULL;	/* KNL nodes found by syscfg */
192 
/*
 * Keywords recognized in knl_generic.conf and the type of each value.
 * The table is handed to s_p_hashtbl_create() by _config_make_tbl().
 * NOTE(review): "LogFile" is accepted here but init() never reads it.
 */
static s_p_options_t knl_conf_file_options[] = {
	{"AllowMCDRAM", S_P_STRING},
	{"AllowNUMA", S_P_STRING},
	{"AllowUserBoot", S_P_STRING},
	{"BootTime", S_P_UINT32},
	{"DefaultMCDRAM", S_P_STRING},
	{"DefaultNUMA", S_P_STRING},
	{"Force", S_P_UINT32},
	{"LogFile", S_P_STRING},
	{"McPath", S_P_STRING},
	{"NodeRebootWeight", S_P_UINT32},
	{"NumaCpuBind", S_P_STRING},
	{"SyscfgPath", S_P_STRING},
	{"SyscfgTimeout", S_P_UINT32},
	{"SystemType", S_P_STRING},
	{"UmeCheckInterval", S_P_UINT32},
	{NULL}
};
211 
/* Forward declarations of file-local helpers */
static s_p_hashtbl_t *_config_make_tbl(char *filename);
static int  _knl_mcdram_bits_cnt(uint16_t mcdram_num);
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep);
static char *_knl_mcdram_str(uint16_t mcdram_num);
static uint16_t _knl_mcdram_token(char *token);
static int _knl_numa_bits_cnt(uint16_t numa_num);
static uint16_t _knl_numa_parse(char *numa_str, char *sep);
static char *_knl_numa_str(uint16_t numa_num);
static int _knl_numa_inx(char *token);
static uint16_t _knl_numa_token(char *token);
static void _log_script_argv(char **script_argv, char *resp_msg);
static char *_run_script(char *cmd_path, char **script_argv, int *status);
static int  _tot_wait (struct timeval *start_time);
static void _update_cpu_bind(void);
226 
_config_make_tbl(char * filename)227 static s_p_hashtbl_t *_config_make_tbl(char *filename)
228 {
229 	s_p_hashtbl_t *tbl = NULL;
230 
231 	xassert(filename);
232 
233 	if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
234 		error("knl.conf: %s: s_p_hashtbl_create error: %m", __func__);
235 		return tbl;
236 	}
237 
238 	if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
239 		error("knl.conf: %s: s_p_parse_file error: %m", __func__);
240 		s_p_hashtbl_destroy(tbl);
241 		tbl = NULL;
242 	}
243 
244 	return tbl;
245 }
246 
247 /*
248  * Return the count of MCDRAM bits set
249  */
_knl_mcdram_bits_cnt(uint16_t mcdram_num)250 static int _knl_mcdram_bits_cnt(uint16_t mcdram_num)
251 {
252 	int cnt = 0, i;
253 	uint16_t tmp = 1;
254 
255 	for (i = 0; i < 16; i++) {
256 		if ((mcdram_num & KNL_MCDRAM_FLAG) & tmp)
257 			cnt++;
258 		tmp = tmp << 1;
259 	}
260 	return cnt;
261 }
262 
/*
 * Convert a separated list of KNL MCDRAM mode names to a bitmask
 * mcdram_str IN - list of mode names, may be NULL
 * sep IN - set of separator characters, as understood by strtok_r()
 * RET OR of the bits for every recognized name; 0 for NULL input or when
 *     nothing is recognized
 */
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep)
{
	char *scratch, *cursor = NULL, *word;
	uint16_t modes = 0;

	if (mcdram_str == NULL)
		return 0;

	/* strtok_r() writes into its input, so tokenize a private copy */
	scratch = xstrdup(mcdram_str);
	for (word = strtok_r(scratch, sep, &cursor); word;
	     word = strtok_r(NULL, sep, &cursor))
		modes |= _knl_mcdram_token(word);
	xfree(scratch);

	return modes;
}
287 
288 /*
289  * Translate KNL MCDRAM number to equivalent string value
290  * Caller must free return value
291  */
_knl_mcdram_str(uint16_t mcdram_num)292 static char *_knl_mcdram_str(uint16_t mcdram_num)
293 {
294 	char *mcdram_str = NULL, *sep = "";
295 
296 	if (mcdram_num & KNL_CACHE) {
297 		xstrfmtcat(mcdram_str, "%scache", sep);
298 		sep = ",";
299 	}
300 	if (mcdram_num & KNL_HYBRID) {
301 		xstrfmtcat(mcdram_str, "%shybrid", sep);
302 		sep = ",";
303 	}
304 	if (mcdram_num & KNL_FLAT) {
305 		xstrfmtcat(mcdram_str, "%sflat", sep);
306 		sep = ",";
307 	}
308 	if (mcdram_num & KNL_EQUAL) {
309 		xstrfmtcat(mcdram_str, "%sequal", sep);
310 		sep = ",";
311 	}
312 	if (mcdram_num & KNL_AUTO) {
313 		xstrfmtcat(mcdram_str, "%sauto", sep);
314 //		sep = ",";	/* Remove to avoid CLANG error */
315 	}
316 
317 	return mcdram_str;
318 }
319 
320 /*
321  * Given a KNL MCDRAM token, return its equivalent numeric value
322  * token IN - String to scan
323  * RET MCDRAM numeric value
324  */
_knl_mcdram_token(char * token)325 static uint16_t _knl_mcdram_token(char *token)
326 {
327 	uint16_t mcdram_num = 0;
328 
329 	if (!xstrcasecmp(token, "cache"))
330 		mcdram_num = KNL_CACHE;
331 	else if (!xstrcasecmp(token, "hybrid"))
332 		mcdram_num = KNL_HYBRID;
333 	else if (!xstrcasecmp(token, "flat") ||
334 		 !xstrcasecmp(token, "memory"))
335 		mcdram_num = KNL_FLAT;
336 	else if (!xstrcasecmp(token, "equal"))
337 		mcdram_num = KNL_EQUAL;
338 	else if (!xstrcasecmp(token, "auto"))
339 		mcdram_num = KNL_AUTO;
340 
341 	return mcdram_num;
342 }
343 
344 /*
345  * Return the count of NUMA bits set
346  */
_knl_numa_bits_cnt(uint16_t numa_num)347 static int _knl_numa_bits_cnt(uint16_t numa_num)
348 {
349 	int cnt = 0, i;
350 	uint16_t tmp = 1;
351 
352 	for (i = 0; i < 16; i++) {
353 		if ((numa_num & KNL_NUMA_FLAG) & tmp)
354 			cnt++;
355 		tmp = tmp << 1;
356 	}
357 	return cnt;
358 }
359 
/*
 * Convert a separated list of KNL NUMA mode names to a bitmask
 * numa_str IN - list of mode names, may be NULL
 * sep IN - set of separator characters, as understood by strtok_r()
 * RET OR of the bits for every recognized name; 0 for NULL input or when
 *     nothing is recognized
 */
static uint16_t _knl_numa_parse(char *numa_str, char *sep)
{
	char *scratch, *cursor = NULL, *word;
	uint16_t modes = 0;

	if (numa_str == NULL)
		return 0;

	/* strtok_r() writes into its input, so tokenize a private copy */
	scratch = xstrdup(numa_str);
	for (word = strtok_r(scratch, sep, &cursor); word;
	     word = strtok_r(NULL, sep, &cursor))
		modes |= _knl_numa_token(word);
	xfree(scratch);

	return modes;
}
384 
385 /*
386  * Translate KNL NUMA number to equivalent string value
387  * Caller must free return value
388  */
_knl_numa_str(uint16_t numa_num)389 static char *_knl_numa_str(uint16_t numa_num)
390 {
391 	char *numa_str = NULL, *sep = "";
392 
393 	if (numa_num & KNL_ALL2ALL) {
394 		xstrfmtcat(numa_str, "%sa2a", sep);
395 		sep = ",";
396 	}
397 	if (numa_num & KNL_SNC2) {
398 		xstrfmtcat(numa_str, "%ssnc2", sep);
399 		sep = ",";
400 	}
401 	if (numa_num & KNL_SNC4) {
402 		xstrfmtcat(numa_str, "%ssnc4", sep);
403 		sep = ",";
404 	}
405 	if (numa_num & KNL_HEMI) {
406 		xstrfmtcat(numa_str, "%shemi", sep);
407 		sep = ",";
408 	}
409 	if (numa_num & KNL_QUAD) {
410 		xstrfmtcat(numa_str, "%squad", sep);
411 //		sep = ",";	/* Remove to avoid CLANG error */
412 	}
413 
414 	return numa_str;
415 
416 }
417 
418 /*
419  * Given a KNL NUMA token, return its equivalent numeric value
420  * token IN - String to scan
421  * RET NUMA numeric value
422  */
_knl_numa_token(char * token)423 static uint16_t _knl_numa_token(char *token)
424 {
425 	uint16_t numa_num = 0;
426 
427 	if (!xstrcasecmp(token, "a2a"))
428 		numa_num |= KNL_ALL2ALL;
429 	else if (!xstrcasecmp(token, "snc2"))
430 		numa_num |= KNL_SNC2;
431 	else if (!xstrcasecmp(token, "snc4"))
432 		numa_num |= KNL_SNC4;
433 	else if (!xstrcasecmp(token, "hemi"))
434 		numa_num |= KNL_HEMI;
435 	else if (!xstrcasecmp(token, "quad"))
436 		numa_num |= KNL_QUAD;
437 
438 	return numa_num;
439 }
440 
441 /*
442  * Given a KNL NUMA token, return its cpu_bind offset
443  * token IN - String to scan
444  * RET NUMA offset or -1 if not found
445  */
_knl_numa_inx(char * token)446 static int _knl_numa_inx(char *token)
447 {
448 	uint16_t numa_num;
449 	int i;
450 
451 	numa_num = _knl_numa_token(token);
452 	for (i = 0; i < KNL_NUMA_CNT; i++) {
453 		if ((0x01 << i) == numa_num)
454 			return i;
455 	}
456 	return -1;
457 }
458 
459 /*
460  * Translate KNL System enum to equivalent string value
461  */
_knl_system_type_str(knl_system_type_t system_type)462 static char *_knl_system_type_str(knl_system_type_t system_type)
463 {
464 	switch (system_type) {
465 	case KNL_SYSTEM_TYPE_INTEL:
466 		return "Intel";
467 	case KNL_SYSTEM_TYPE_DELL:
468 		return "Dell";
469 	case KNL_SYSTEM_TYPE_NOT_SET:
470 	default:
471 		return "Unknown";
472 	}
473 }
474 
475 /*
476  * Given a KNL System token, return its equivalent enum value
477  * token IN - String to scan
478  * RET System enum value
479  */
_knl_system_type_token(char * token)480 static knl_system_type_t _knl_system_type_token(char *token)
481 {
482 	knl_system_type_t system_type;
483 
484 	if (!xstrcasecmp("intel", token))
485 		system_type = KNL_SYSTEM_TYPE_INTEL;
486 	else if (!xstrcasecmp("dell", token))
487 		system_type = KNL_SYSTEM_TYPE_DELL;
488 	else
489 		system_type = KNL_SYSTEM_TYPE_NOT_SET;
490 
491 	return system_type;
492 }
493 
/*
 * Compute the wall-clock time elapsed since an earlier gettimeofday() sample
 * start_time IN - the earlier sample
 * RET elapsed time in milliseconds
 */
static int _tot_wait (struct timeval *start_time)
{
	struct timeval now;
	int sec_part, usec_part;

	gettimeofday(&now, NULL);

	/* Whole seconds and the microsecond remainder are converted apart;
	 * the +500 rounds the remainder instead of truncating it */
	sec_part  = (now.tv_sec - start_time->tv_sec) * 1000;
	usec_part = (now.tv_usec - start_time->tv_usec + 500) / 1000;

	return sec_part + usec_part;
}
507 
508 /*
509  * Update cpu_bind array from current numa_cpu_bind configuration parameter
510  */
_update_cpu_bind(void)511 static void _update_cpu_bind(void)
512 {
513 	char *save_ptr = NULL, *sep, *tok, *tmp;
514 	int rc = SLURM_SUCCESS;
515 	int i, numa_inx, numa_def;
516 	uint32_t cpu_bind_val = 0;
517 
518 	for (i = 0; i < KNL_NUMA_CNT; i++)
519 		cpu_bind[0] = 0;
520 
521 	if (!numa_cpu_bind)
522 		return;
523 
524 	tmp = xstrdup(numa_cpu_bind);
525 	tok = strtok_r(tmp, ";", &save_ptr);
526 	while (tok) {
527 		sep = strchr(tok, '=');
528 		if (!sep) {
529 			rc = SLURM_ERROR;
530 			break;
531 		}
532 		sep[0] = '\0';
533 		numa_def = _knl_numa_token(tok);
534 		if (numa_def == 0) {
535 			rc = SLURM_ERROR;
536 			break;
537 		}
538 		if (xlate_cpu_bind_str(sep + 1, &cpu_bind_val) !=
539 		    SLURM_SUCCESS) {
540 			rc = SLURM_ERROR;
541 			break;
542 		}
543 		numa_inx = -1;
544 		for (i = 0; i < KNL_NUMA_CNT; i++) {
545 			if ((0x1 << i) == numa_def) {
546 				numa_inx = i;
547 				break;
548 			}
549 		}
550 		if (numa_inx > -1)
551 			cpu_bind[numa_inx] = cpu_bind_val;
552 		tok = strtok_r(NULL, ";", &save_ptr);
553 	}
554 	xfree(tmp);
555 
556 	if (rc != SLURM_SUCCESS) {
557 		error("%s: Invalid NumaCpuBind (%s), ignored",
558 		      plugin_type, numa_cpu_bind);
559 	}
560 
561 	if (debug_flag) {
562 		for (i = 0; i < KNL_NUMA_CNT; i++) {
563 			char cpu_bind_str[128], *numa_str;
564 			if (cpu_bind[i] == 0)
565 				continue;
566 			numa_str = _knl_numa_str(0x1 << i);
567 			slurm_sprint_cpu_bind_type(cpu_bind_str, cpu_bind[i]);
568 			info("CpuBind[%s] = %s", numa_str, cpu_bind_str);
569 			xfree(numa_str);
570 		}
571 	}
572 }
573 
574 /* Log a command's arguments. */
_log_script_argv(char ** script_argv,char * resp_msg)575 static void _log_script_argv(char **script_argv, char *resp_msg)
576 {
577 	char *cmd_line = NULL;
578 	int i;
579 
580 	if (!debug_flag)
581 		return;
582 
583 	for (i = 0; script_argv[i]; i++) {
584 		if (i)
585 			xstrcat(cmd_line, " ");
586 		xstrcat(cmd_line, script_argv[i]);
587 	}
588 	info("%s", cmd_line);
589 	if (resp_msg && resp_msg[0])
590 		info("%s", resp_msg);
591 	xfree(cmd_line);
592 }
593 
594 /* Run a script and return its stdout plus exit status */
_run_script(char * cmd_path,char ** script_argv,int * status)595 static char *_run_script(char *cmd_path, char **script_argv, int *status)
596 {
597 	int cc, i, new_wait, resp_size = 0, resp_offset = 0;
598 	pid_t cpid;
599 	char *resp = NULL;
600 	int pfd[2] = { -1, -1 };
601 
602 	if (access(cmd_path, R_OK | X_OK) < 0) {
603 		error("%s: %s can not be executed: %m", __func__, cmd_path);
604 		*status = 127;
605 		resp = xstrdup("Slurm node_features/knl_generic configuration error");
606 		return resp;
607 	}
608 	if (pipe(pfd) != 0) {
609 		error("%s: pipe(): %m", __func__);
610 		*status = 127;
611 		resp = xstrdup("System error");
612 		return resp;
613 	}
614 
615 	if ((cpid = fork()) == 0) {
616 		cc = sysconf(_SC_OPEN_MAX);
617 		dup2(pfd[1], STDERR_FILENO);
618 		dup2(pfd[1], STDOUT_FILENO);
619 		for (i = 0; i < cc; i++) {
620 			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
621 				close(i);
622 		}
623 		setpgid(0, 0);
624 		execv(cmd_path, script_argv);
625 		error("%s: execv(%s): %m", __func__, cmd_path);
626 		_exit(127);
627 	} else if (cpid < 0) {
628 		close(pfd[0]);
629 		close(pfd[1]);
630 		error("%s: fork(): %m", __func__);
631 	} else {
632 		struct pollfd fds;
633 		struct timeval tstart;
634 		resp_size = 1024;
635 		resp = xmalloc(resp_size);
636 		close(pfd[1]);
637 		gettimeofday(&tstart, NULL);
638 		while (1) {
639 			if (slurmctld_config.shutdown_time) {
640 				error("%s: killing %s operation on shutdown",
641 				      __func__, script_argv[1]);
642 				break;
643 			}
644 			fds.fd = pfd[0];
645 			fds.events = POLLIN | POLLHUP | POLLRDHUP;
646 			fds.revents = 0;
647 			new_wait = syscfg_timeout - _tot_wait(&tstart);
648 			if (new_wait <= 0) {
649 				error("%s: %s poll timeout @ %d msec",
650 				      __func__, script_argv[1], syscfg_timeout);
651 				break;
652 			}
653 			new_wait = MIN(new_wait, MAX_POLL_WAIT);
654 			i = poll(&fds, 1, new_wait);
655 			if (i == 0) {
656 				continue;
657 			} else if (i < 0) {
658 				error("%s: %s poll:%m", __func__,
659 				      script_argv[1]);
660 				break;
661 			}
662 			if ((fds.revents & POLLIN) == 0)
663 				break;
664 			i = read(pfd[0], resp + resp_offset,
665 				 resp_size - resp_offset);
666 			if (i == 0) {
667 				break;
668 			} else if (i < 0) {
669 				if (errno == EAGAIN)
670 					continue;
671 				error("%s: read(%s): %m", __func__, syscfg_path);
672 				break;
673 			} else {
674 				resp_offset += i;
675 				if (resp_offset + 1024 >= resp_size) {
676 					resp_size *= 2;
677 					resp = xrealloc(resp, resp_size);
678 				}
679 			}
680 		}
681 		killpg(cpid, SIGTERM);
682 		usleep(10000);
683 		killpg(cpid, SIGKILL);
684 		waitpid(cpid, status, 0);
685 		close(pfd[0]);
686 	}
687 	return resp;
688 }
689 
_make_uid_array(char * uid_str)690 static void _make_uid_array(char *uid_str)
691 {
692 	char *save_ptr = NULL, *tmp_str, *tok;
693 	int i, uid_cnt = 0;
694 
695 	if (!uid_str)
696 		return;
697 
698 	/* Count the number of users */
699 	for (i = 0; uid_str[i]; i++) {
700 		if (uid_str[i] == ',')
701 			uid_cnt++;
702 	}
703 	uid_cnt++;
704 
705 	allowed_uid = xmalloc(sizeof(uid_t) * uid_cnt);
706 	allowed_uid_cnt = 0;
707 	tmp_str = xstrdup(uid_str);
708 	tok = strtok_r(tmp_str, ",", &save_ptr);
709 	while (tok) {
710 		if (uid_from_string(tok, &allowed_uid[allowed_uid_cnt++]) < 0)
711 			error("knl_generic.conf: Invalid AllowUserBoot: %s", tok);
712 		tok = strtok_r(NULL, ",", &save_ptr);
713 	}
714 	xfree(tmp_str);
715 }
716 
/*
 * Build a printable, comma separated "name(uid)" list from a UID array
 * uid_array IN - UIDs to describe
 * uid_cnt IN - number of entries in uid_array
 * RET "ALL" if the array is empty, else the formatted list.  Caller must
 *     xfree the return value.
 */
static char *_make_uid_str(uid_t *uid_array, int uid_cnt)
{
	char *sep = "", *tmp_str = NULL, *uid_str = NULL;
	int i;

	/* Decide from the arguments, not from the allowed_uid_cnt global,
	 * so the result always describes the array that was passed in */
	if (!uid_array || (uid_cnt <= 0)) {
		uid_str = xstrdup("ALL");
		return uid_str;
	}

	for (i = 0; i < uid_cnt; i++) {
		tmp_str = uid_to_string(uid_array[i]);
		/* Print as unsigned so large UIDs do not show up negative */
		xstrfmtcat(uid_str, "%s%s(%u)", sep, tmp_str,
			   (unsigned int) uid_array[i]);
		xfree(tmp_str);
		sep = ",";
	}

	return uid_str;
}
736 
737 /* Watch for Uncorrectable Memory Errors. Notify jobs if any detected */
_ume_agent(void * args)738 static void *_ume_agent(void *args)
739 {
740 	struct timespec req;
741 	int i, mc_num, csrow_num, ue_count, last_ue_count = -1;
742 	int *fd = NULL, fd_cnt = 0, fd_size = 0, ume_path_size;
743 	char buf[8], *ume_path;
744 	ssize_t rd_size;
745 
746 	/* Identify and open array of UME file descriptors */
747 	ume_path_size = strlen(mc_path) + 32;
748 	ume_path = xmalloc(ume_path_size);
749 	for (mc_num = 0; ; mc_num++) {
750 		for (csrow_num = 0; ; csrow_num++) {
751 			if (fd_cnt == fd_size) {
752 				fd_size += 64;
753 				fd = xrealloc(fd, sizeof(int) * fd_size);
754 			}
755 			snprintf(ume_path, ume_path_size,
756 				 "%s/mc%d/csrow%d/ue_count",
757 				 mc_path, mc_num, csrow_num);
758 			if ((fd[fd_cnt] = open(ume_path, 0)) >= 0)
759 				fd_cnt++;
760 			else
761 				break;
762 		}
763 		if (csrow_num == 0)
764 			break;
765 	}
766 	xfree(ume_path);
767 
768 	while (!shutdown_time) {
769 		/* Get current UME count */
770 		ue_count = 0;
771 		for (i = 0; i < fd_cnt; i++) {
772 			(void) lseek(fd[i], 0, SEEK_SET);
773 			rd_size = read(fd[i], buf, 7);
774 			if (rd_size <= 0)
775 				continue;
776 			buf[rd_size] = '\0';
777 			ue_count += atoi(buf);
778 		}
779 
780 		if (shutdown_time)
781 			break;
782 		/* If UME count changed, notify all steps */
783 		if ((last_ue_count < ue_count) && (last_ue_count != -1)) {
784 			i = ume_notify();
785 			error("UME error detected. Notified %d job steps", i);
786 		}
787 		last_ue_count = ue_count;
788 
789 		if (shutdown_time)
790 			break;
791 		/* Sleep before retry */
792 		req.tv_sec  =  ume_check_interval / USEC_IN_SEC;
793 		req.tv_nsec = (ume_check_interval % USEC_IN_SEC) *
794 			      NSEC_IN_USEC;
795 		(void) nanosleep(&req, NULL);
796 	}
797 
798 	for (i = 0; i < fd_cnt; i++)
799 		(void) close(fd[i]);
800 	xfree(fd);
801 
802 	return NULL;
803 }
804 
805 /* Load configuration */
init(void)806 extern int init(void)
807 {
808 	char *allow_mcdram_str, *allow_numa_str, *allow_user_str;
809 	char *default_mcdram_str, *default_numa_str;
810 	char *knl_conf_file, *tmp_str = NULL, *resume_program;
811 	s_p_hashtbl_t *tbl;
812 	struct stat stat_buf;
813 	int i, rc = SLURM_SUCCESS;
814 	char *cpuinfo_path = "/proc/cpuinfo";
815 	FILE *cpu_info_file;
816 	char buf[1024];
817 
818 	/* Set default values */
819 	allow_mcdram = KNL_MCDRAM_FLAG;
820 	allow_numa = KNL_NUMA_FLAG;
821 	xfree(allowed_uid);
822 	xfree(mc_path);
823 	xfree(syscfg_path);
824 	allowed_uid_cnt = 0;
825 	for (i = 0; i < KNL_NUMA_CNT; i++)
826 		cpu_bind[i] = 0;
827 	syscfg_timeout = DEFAULT_SYSCFG_TIMEOUT;
828 	debug_flag = false;
829 	default_mcdram = KNL_CACHE;
830 	default_numa = KNL_ALL2ALL;
831 //FIXME: Need better mechanism to get MCDRAM percentages
832 //	for (i = 0; i < KNL_MCDRAM_CNT; i++)
833 //		mcdram_pct[i] = -1;
834 	mcdram_pct[0] = 100;	// KNL_CACHE
835 	mcdram_pct[1] = 50;	// KNL_EQUAL
836 	mcdram_pct[2] = 50;	// KNL_HYBRID
837 	mcdram_pct[3] = 0;	// KNL_FLAT
838 	mcdram_pct[4] = 0;	// KNL_AUTO
839 	xfree(numa_cpu_bind);
840 
841 	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
842 		debug_flag = true;
843 
844 	knl_conf_file = get_extra_conf_path("knl_generic.conf");
845 	if ((stat(knl_conf_file, &stat_buf) == 0) &&
846 	    (tbl = _config_make_tbl(knl_conf_file))) {
847 		if (s_p_get_string(&tmp_str, "AllowMCDRAM", tbl)) {
848 			allow_mcdram = _knl_mcdram_parse(tmp_str, ",");
849 			if (_knl_mcdram_bits_cnt(allow_mcdram) < 1) {
850 				fatal("knl_generic.conf: Invalid AllowMCDRAM=%s",
851 				      tmp_str);
852 			}
853 			xfree(tmp_str);
854 		}
855 		if (s_p_get_string(&tmp_str, "AllowNUMA", tbl)) {
856 			allow_numa = _knl_numa_parse(tmp_str, ",");
857 			if (_knl_numa_bits_cnt(allow_numa) < 1) {
858 				fatal("knl_generic.conf: Invalid AllowNUMA=%s",
859 				      tmp_str);
860 			}
861 			xfree(tmp_str);
862 		}
863 		if (s_p_get_string(&tmp_str, "AllowUserBoot", tbl)) {
864 			_make_uid_array(tmp_str);
865 			xfree(tmp_str);
866 		}
867 		(void) s_p_get_uint32(&boot_time, "BootTime", tbl);
868 		if (s_p_get_string(&tmp_str, "DefaultMCDRAM", tbl)) {
869 			default_mcdram = _knl_mcdram_parse(tmp_str, ",");
870 			if (_knl_mcdram_bits_cnt(default_mcdram) != 1) {
871 				fatal("knl_generic.conf: Invalid DefaultMCDRAM=%s",
872 				      tmp_str);
873 			}
874 			xfree(tmp_str);
875 		}
876 		if (s_p_get_string(&tmp_str, "DefaultNUMA", tbl)) {
877 			default_numa = _knl_numa_parse(tmp_str, ",");
878 			if (_knl_numa_bits_cnt(default_numa) != 1) {
879 				fatal("knl_generic.conf: Invalid DefaultNUMA=%s",
880 				      tmp_str);
881 			}
882 			xfree(tmp_str);
883 		}
884 		(void) s_p_get_uint32(&force_load, "Force", tbl);
885 		(void) s_p_get_string(&mc_path, "McPath", tbl);
886 		(void) s_p_get_uint32(&node_reboot_weight, "NodeRebootWeight",
887 				      tbl);
888 		if (s_p_get_string(&numa_cpu_bind, "NumaCpuBind", tbl))
889 			_update_cpu_bind();
890 		(void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
891 		if (s_p_get_string(&tmp_str, "SystemType", tbl)) {
892 			if ((knl_system_type = _knl_system_type_token(tmp_str))
893 			    == KNL_SYSTEM_TYPE_NOT_SET)
894 				fatal("knl_generic.conf: Invalid SystemType=%s.",
895 				      tmp_str);
896 			xfree(tmp_str);
897 		}
898 		(void) s_p_get_uint32(&syscfg_timeout, "SyscfgTimeout", tbl);
899 		(void) s_p_get_uint32(&ume_check_interval, "UmeCheckInterval",
900 				      tbl);
901 
902 		s_p_hashtbl_destroy(tbl);
903 	} else if (errno != ENOENT) {
904 		error("Error opening/reading knl_generic.conf: %m");
905 		rc = SLURM_ERROR;
906 	}
907 	xfree(knl_conf_file);
908 	if (!mc_path)
909 		mc_path = xstrdup("/sys/devices/system/edac/mc");
910 	if (!syscfg_path)
911 		syscfg_path = xstrdup("/usr/bin/syscfg");
912 	if (access(syscfg_path, X_OK) == 0)
913 		syscfg_found = 1;
914 	else
915 		syscfg_found = 0;
916 
917 	hw_is_knl = 0;
918 	cpu_info_file = fopen(cpuinfo_path, "r");
919 	if (cpu_info_file == NULL) {
920 		error("Error opening/reading %s: %m", cpuinfo_path);
921 	} else {
922 		while (fgets(buf, sizeof(buf), cpu_info_file)) {
923 			if (strstr(buf, "Xeon Phi")) {
924 				hw_is_knl = 1;
925 				break;
926 			}
927 		}
928 		fclose(cpu_info_file);
929 	}
930 
931 	if ((resume_program = slurm_get_resume_program())) {
932 		error("Use of ResumeProgram with %s not currently supported",
933 		      plugin_name);
934 		xfree(resume_program);
935 		rc = SLURM_ERROR;
936 	}
937 
938 	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES) {
939 		allow_mcdram_str = _knl_mcdram_str(allow_mcdram);
940 		allow_numa_str = _knl_numa_str(allow_numa);
941 		allow_user_str = _make_uid_str(allowed_uid, allowed_uid_cnt);
942 		default_mcdram_str = _knl_mcdram_str(default_mcdram);
943 		default_numa_str = _knl_numa_str(default_numa);
944 		info("AllowMCDRAM=%s AllowNUMA=%s",
945 		     allow_mcdram_str, allow_numa_str);
946 		info("AllowUserBoot=%s", allow_user_str);
947 		info("BootTIme=%u", boot_time);
948 		info("DefaultMCDRAM=%s DefaultNUMA=%s",
949 		     default_mcdram_str, default_numa_str);
950 		info("Force=%u", force_load);
951 		info("McPath=%s", mc_path);
952 		info("NodeRebootWeight=%u", node_reboot_weight);
953 		info("NumaCpuBind=%s", numa_cpu_bind);
954 		info("SyscfgPath=%s (Found=%d)", syscfg_path, syscfg_found);
955 		info("SyscfgTimeout=%u msec", syscfg_timeout);
956 		info("SystemType=%s", _knl_system_type_str(knl_system_type));
957 		info("UmeCheckInterval=%u", ume_check_interval);
958 		xfree(allow_mcdram_str);
959 		xfree(allow_numa_str);
960 		xfree(allow_user_str);
961 		xfree(default_mcdram_str);
962 		xfree(default_numa_str);
963 	}
964 	gres_plugin_add("hbm");
965 
966 	if ((rc == SLURM_SUCCESS) &&
967 	    ume_check_interval && running_in_slurmd()) {
968 		slurm_mutex_lock(&ume_mutex);
969 		slurm_thread_create(&ume_thread, _ume_agent, NULL);
970 		slurm_mutex_unlock(&ume_mutex);
971 	}
972 
973 	return rc;
974 }
975 
976 /* Release allocated memory */
fini(void)977 extern int fini(void)
978 {
979 	shutdown_time = time(NULL);
980 	slurm_mutex_lock(&ume_mutex);
981 	if (ume_thread) {
982 		pthread_join(ume_thread, NULL);
983 		ume_thread = 0;
984 	}
985 	slurm_mutex_unlock(&ume_mutex);
986 	xfree(allowed_uid);
987 	allowed_uid_cnt = 0;
988 	debug_flag = false;
989 	xfree(mcdram_per_node);
990 	xfree(mc_path);
991 	xfree(numa_cpu_bind);
992 	xfree(syscfg_path);
993 	FREE_NULL_BITMAP(knl_node_bitmap);
994 
995 	return SLURM_SUCCESS;
996 }
997 
998 /* Reload configuration */
node_features_p_reconfig(void)999 extern int node_features_p_reconfig(void)
1000 {
1001 	slurm_mutex_lock(&config_mutex);
1002 	reconfig = true;
1003 	slurm_mutex_unlock(&config_mutex);
1004 	return SLURM_SUCCESS;
1005 }
1006 
1007 /* Update active and available features on specified nodes,
1008  * sets features on all nodes if node_list is NULL */
node_features_p_get_node(char * node_list)1009 extern int node_features_p_get_node(char *node_list)
1010 {
1011 	slurm_mutex_lock(&config_mutex);
1012 	if (reconfig) {
1013 		(void) init();
1014 		reconfig = false;
1015 	}
1016 	slurm_mutex_unlock(&config_mutex);
1017 	return SLURM_SUCCESS;
1018 }
1019 
1020 /* Get this node's current and available MCDRAM and NUMA settings from BIOS.
1021  * avail_modes IN/OUT - append available modes, must be xfreed
1022  * current_mode IN/OUT - append current modes, must be xfreed
1023  *
1024  * NOTE: Not applicable on Cray systems; can be used on other systems.
1025  *
1026  * NOTES about syscfg (from Intel):
1027  * To display the BIOS Parameters:
1028  * >> syscfg /d biossettings <"BIOS variable Name">
1029  *
1030  * To Set the BIOS Parameters:
1031  * >> syscfg /bcs <AdminPw> <"BIOS variable Name"> <Value>
1032  * Note: If AdminPw is not set use ""
1033  */
node_features_p_node_state(char ** avail_modes,char ** current_mode)1034 extern void node_features_p_node_state(char **avail_modes, char **current_mode)
1035 {
1036 	char *avail_states = NULL, *cur_state = NULL;
1037 	char *resp_msg, *argv[10], *avail_sep = "", *cur_sep = "", *tok;
1038 	int status = 0;
1039 	int len = 0;
1040 
1041 	if (!syscfg_path || !avail_modes || !current_mode)
1042 		return;
1043 	if ((syscfg_found == 0) || (!hw_is_knl && !force_load)) {
1044 		/* This node on cluster lacks syscfg; should not be KNL */
1045 		static bool log_event = true;
1046 		if (log_event) {
1047 			info("%s: syscfg program not found or node isn't KNL, can not get KNL modes",
1048 			     __func__);
1049 			log_event = false;
1050 		}
1051 		*avail_modes = NULL;
1052 		*current_mode = NULL;
1053 		return;
1054 	}
1055 
1056 	switch (knl_system_type) {
1057 	case KNL_SYSTEM_TYPE_INTEL:
1058 		argv[0] = "syscfg";
1059 		argv[1] = "/d";
1060 		argv[2] = "BIOSSETTINGS";
1061 		argv[3] = "Cluster Mode";
1062 		argv[4] = NULL;
1063 		break;
1064 	case KNL_SYSTEM_TYPE_DELL:
1065 		argv[0] = "syscfg";
1066 		argv[1] = "--SystemMemoryModel";
1067 		argv[2] = NULL;
1068 		break;
1069 	default:
1070 		/* This should never happen */
1071 		error("%s: Unknown SystemType. %d", __func__, knl_system_type);
1072 		*avail_modes = NULL;
1073 		*current_mode = NULL;
1074 		return;
1075 	}
1076 	resp_msg = _run_script(syscfg_path, argv, &status);
1077 	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
1078 		error("%s: syscfg (get cluster mode) status:%u response:%s",
1079 		      __func__, status, resp_msg);
1080 	}
1081 	if (resp_msg == NULL) {
1082 		info("%s: syscfg returned no information", __func__);
1083 	} else {
1084 		tok = NULL;
1085 		_log_script_argv(argv, resp_msg);
1086 		switch (knl_system_type) {
1087 		case KNL_SYSTEM_TYPE_INTEL:
1088 			tok = strstr(resp_msg, "Current Value : ");
1089 			len = 16;
1090 			break;
1091 		case KNL_SYSTEM_TYPE_DELL:
1092 			tok = strstr(resp_msg, "SystemMemoryModel=");
1093 			len = 18;
1094 			break;
1095 		default:
1096 			/* already handled above, should never get here */
1097 			break;
1098 		}
1099 		if (tok) {
1100 			tok += len;
1101 			if (!xstrncasecmp(tok, "All2All", 3)) {
1102 				cur_state = xstrdup("a2a");
1103 				cur_sep = ",";
1104 			} else if (!xstrncasecmp(tok, "Hemisphere", 3)) {
1105 				cur_state = xstrdup("hemi");
1106 				cur_sep = ",";
1107 			} else if (!xstrncasecmp(tok, "Quadrant", 3)) {
1108 				cur_state = xstrdup("quad");
1109 				cur_sep = ",";
1110 			} else if (!xstrncasecmp(tok, "SNC-2", 5)) {
1111 				cur_state = xstrdup("snc2");
1112 				cur_sep = ",";
1113 			} else if (!xstrncasecmp(tok, "SNC-4", 5)) {
1114 				cur_state = xstrdup("snc4");
1115 				cur_sep = ",";
1116 			}
1117 		}
1118 
1119 		switch (knl_system_type) {
1120 		case KNL_SYSTEM_TYPE_DELL:
1121 			argv[0] = "syscfg";
1122 			argv[1] = "-h";
1123 			argv[2] = "--SystemMemoryModel";
1124 			argv[3] = NULL;
1125 
1126 			xfree(resp_msg);
1127 			resp_msg = _run_script(syscfg_path, argv, &status);
1128 			if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
1129 				error("%s: syscfg (get cluster mode) status:%u response:%s",
1130 				      __func__, status, resp_msg);
1131 			}
1132 			if (resp_msg == NULL)
1133 				info("%s: syscfg -h --SystemMemoryModel returned no information", __func__);
1134 			break;
1135 		default:
1136 			break;
1137 		}
1138 
1139 		if (xstrcasestr(resp_msg, "All2All")) {
1140 			xstrfmtcat(avail_states, "%s%s", avail_sep, "a2a");
1141 			avail_sep = ",";
1142 		}
1143 		if (xstrcasestr(resp_msg, "Hemisphere")) {
1144 			xstrfmtcat(avail_states, "%s%s", avail_sep, "hemi");
1145 			avail_sep = ",";
1146 		}
1147 		if (xstrcasestr(resp_msg, "Quadrant")) {
1148 			xstrfmtcat(avail_states, "%s%s", avail_sep, "quad");
1149 			avail_sep = ",";
1150 		}
1151 		if (xstrcasestr(resp_msg, "SNC-2")) {
1152 			xstrfmtcat(avail_states, "%s%s", avail_sep, "snc2");
1153 			avail_sep = ",";
1154 		}
1155 		if (xstrcasestr(resp_msg, "SNC-4")) {
1156 			xstrfmtcat(avail_states, "%s%s", avail_sep, "snc4");
1157 			avail_sep = ",";
1158 		}
1159 		xfree(resp_msg);
1160 	}
1161 
1162 	switch (knl_system_type) {
1163 	case KNL_SYSTEM_TYPE_INTEL:
1164 		argv[0] = "syscfg";
1165 		argv[1] = "/d";
1166 		argv[2] = "BIOSSETTINGS";
1167 		argv[3] = "Memory Mode";
1168 		argv[4] = NULL;
1169 		break;
1170 	case KNL_SYSTEM_TYPE_DELL:
1171 		argv[0] = "syscfg";
1172 		argv[1] = "--ProcEmbMemMode";
1173 		argv[2] = NULL;
1174 		break;
1175 	default:
1176 		/* already handled above, should never get here */
1177 		break;
1178 	}
1179 	resp_msg = _run_script(syscfg_path, argv, &status);
1180 	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
1181 		error("%s: syscfg (get memory mode) status:%u response:%s",
1182 		      __func__, status, resp_msg);
1183 	}
1184 	if (resp_msg == NULL) {
1185 		info("%s: syscfg returned no information", __func__);
1186 	} else {
1187 		tok = NULL;
1188 		_log_script_argv(argv, resp_msg);
1189 		switch (knl_system_type) {
1190 		case KNL_SYSTEM_TYPE_INTEL:
1191 			tok = strstr(resp_msg, "Current Value : ");
1192 			len = 16;
1193 			break;
1194 		case KNL_SYSTEM_TYPE_DELL:
1195 			tok = strstr(resp_msg, "ProcEmbMemMode=");
1196 			len = 15;
1197 			break;
1198 		default:
1199 			/* already handled above, should never get here */
1200 			break;
1201 		}
1202 		if (tok) {
1203 			tok += len;
1204 			if (!xstrncasecmp(tok, "Cache", 3)) {
1205 				xstrfmtcat(cur_state, "%s%s", cur_sep, "cache");
1206 			} else if (!xstrncasecmp(tok, "Flat", 3) ||
1207 				   !xstrncasecmp(tok, "Memory", 3)) {
1208 				xstrfmtcat(cur_state, "%s%s", cur_sep, "flat");
1209 			} else if (!xstrncasecmp(tok, "Hybrid", 3)) {
1210 				xstrfmtcat(cur_state, "%s%s", cur_sep, "hybrid");
1211 			} else if (!xstrncasecmp(tok, "Equal", 3)) {
1212 				xstrfmtcat(cur_state, "%s%s", cur_sep, "equal");
1213 			} else if (!xstrncasecmp(tok, "Auto", 3)) {
1214 				xstrfmtcat(cur_state, "%s%s", cur_sep, "auto");
1215 			}
1216 		}
1217 
1218 		switch (knl_system_type) {
1219 		case KNL_SYSTEM_TYPE_DELL:
1220 			argv[0] = "syscfg";
1221 			argv[1] = "-h";
1222 			argv[2] = "--ProcEmbMemMode";
1223 			argv[3] = NULL;
1224 
1225 			xfree(resp_msg);
1226 			resp_msg = _run_script(syscfg_path, argv, &status);
1227 			if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
1228 				error("%s: syscfg (get memory mode) status help:%u response:%s",
1229 				      __func__, status, resp_msg);
1230 			}
1231 			if (resp_msg == NULL)
1232 				info("%s: syscfg -h returned no information", __func__);
1233 			break;
1234 		default:
1235 			break;
1236 		}
1237 
1238 		if (xstrcasestr(resp_msg, "Cache")) {
1239 			xstrfmtcat(avail_states, "%s%s", avail_sep, "cache");
1240 			avail_sep = ",";
1241 		}
1242 		if (xstrcasestr(resp_msg, "Flat") ||
1243 		    xstrcasestr(resp_msg, "Memory")) {
1244 			xstrfmtcat(avail_states, "%s%s", avail_sep, "flat");
1245 			avail_sep = ",";
1246 		}
1247 		if (xstrcasestr(resp_msg, "Hybrid")) {
1248 			xstrfmtcat(avail_states, "%s%s", avail_sep, "hybrid");
1249 			avail_sep = ",";
1250 		}
1251 		if (xstrcasestr(resp_msg, "Equal")) {
1252 			xstrfmtcat(avail_states, "%s%s", avail_sep, "equal");
1253 			avail_sep = ",";
1254 		}
1255 		if (xstrcasestr(resp_msg, "Auto")) {
1256 			xstrfmtcat(avail_states, "%s%s", avail_sep, "auto");
1257 			/* avail_sep = ",";	CLANG error: Dead assignment */
1258 		}
1259 		xfree(resp_msg);
1260 	}
1261 
1262 	if (*avail_modes) {	/* Append for multiple node_features plugins */
1263 		if (*avail_modes[0])
1264 			avail_sep = ",";
1265 		else
1266 			avail_sep = "";
1267 		xstrfmtcat(*avail_modes, "%s%s", avail_sep, avail_states);
1268 		xfree(avail_states);
1269 	} else {
1270 		*avail_modes = avail_states;
1271 	}
1272 
1273 	if (*current_mode) {	/* Append for multiple node_features plugins */
1274 		if (*current_mode[0])
1275 			cur_sep = ",";
1276 		else
1277 			cur_sep = "";
1278 		xstrfmtcat(*current_mode, "%s%s", cur_sep, cur_state);
1279 		xfree(cur_state);
1280 	} else {
1281 		*current_mode = cur_state;
1282 	}
1283 }
1284 
1285 /* Test if a job's feature specification is valid */
node_features_p_job_valid(char * job_features)1286 extern int node_features_p_job_valid(char *job_features)
1287 {
1288 	uint16_t job_mcdram, job_numa;
1289 	int mcdram_cnt, numa_cnt;
1290 	int last_mcdram_cnt = 0, last_numa_cnt = 0;
1291 	int rc = SLURM_SUCCESS;
1292 	char last_sep = '\0', *tmp, *tok, *save_ptr = NULL;
1293 
1294 	if ((job_features == NULL) || (job_features[0] == '\0'))
1295 		return SLURM_SUCCESS;
1296 
1297 	tmp = xstrdup(job_features);
1298 	tok = strtok_r(tmp, "[]()|", &save_ptr);
1299 	while (tok) {
1300 		last_sep = tok[strlen(tok) - 1];
1301 		job_mcdram = _knl_mcdram_parse(tok, "&,*");
1302 		mcdram_cnt = _knl_mcdram_bits_cnt(job_mcdram) + last_mcdram_cnt;
1303 		if (mcdram_cnt > 1) {	/* Multiple ANDed MCDRAM options */
1304 			rc = ESLURM_INVALID_KNL;
1305 			break;
1306 		}
1307 
1308 		job_numa = _knl_numa_parse(tok, "&,*");
1309 		numa_cnt = _knl_numa_bits_cnt(job_numa) + last_numa_cnt;
1310 		if (numa_cnt > 1) {	/* Multiple ANDed NUMA options */
1311 			rc = ESLURM_INVALID_KNL;
1312 			break;
1313 		}
1314 		tok = strtok_r(NULL, "[]()|", &save_ptr);
1315 		if (tok &&
1316 		    ((last_sep == '&') ||	/* e.g. "equal&(flat|cache)" */
1317 		     (tok[0] == '&'))) {	/* e.g. "(flat|cache)&equal" */
1318 			last_mcdram_cnt += mcdram_cnt;
1319 			last_numa_cnt += numa_cnt;
1320 		} else {
1321 			last_mcdram_cnt = 0;
1322 			last_numa_cnt = 0;
1323 		}
1324 	}
1325 	xfree(tmp);
1326 
1327 	return rc;
1328 }
1329 
/*
 * Translate a job's feature request to the node features needed at boot time.
 *	Only the first MCDRAM mode and the first NUMA mode found in the request
 *	are kept; every other entry is dropped. Any "*count" suffix on an entry
 *	is stripped before it is examined.
 * IN job_features - job's --constraint specification
 * RET features required on node reboot (comma separated), NULL if none.
 *     Must xfree to release memory
 */
extern char *node_features_p_job_xlate(char *job_features)
{
	char *boot_features = NULL;
	char *copy, *entry, *star, *last = NULL, *delim = "";
	bool have_mcdram = false, have_numa = false;

	if (!job_features || (job_features[0] == '\0'))
		return NULL;

	copy = xstrdup(job_features);
	for (entry = strtok_r(copy, "[]()|&", &last); entry;
	     entry = strtok_r(NULL, "[]()|&", &last)) {
		bool keep = false;

		/* Drop any node count suffix, e.g. "cache*2" */
		star = strchr(entry, '*');
		if (star)
			*star = '\0';

		if (_knl_mcdram_token(entry) && !have_mcdram) {
			have_mcdram = true;
			keep = true;
		}
		if (_knl_numa_token(entry) && !have_numa) {
			have_numa = true;
			keep = true;
		}
		if (!keep)
			continue;

		xstrfmtcat(boot_features, "%s%s", delim, entry);
		delim = ",";
	}
	xfree(copy);

	return boot_features;
}
1373 
/*
 * Find the numeric value associated with a key in a syscfg response.
 * The search for the key starts after the text "Possible Values" when that
 * text is present, otherwise at the start of the response. The value is the
 * first run of decimal digits found after the ':' that follows the key.
 * IN key - name to search for, may be NULL
 * IN resp_msg - syscfg output to search, may be NULL
 * RET copy of the digits (must be xfreed), NULL if key or value is not found
 */
static char *_find_key_val(char *key, char *resp_msg)
{
	char *sep = NULL, *tok, *val = NULL;
	int i;

	if ((key == NULL) || (resp_msg == NULL))
		return NULL;

	if ((tok = strstr(resp_msg, "Possible Values")))
		tok += 15;	/* strlen("Possible Values") */
	else
		tok = resp_msg;
	if ((tok = strstr(tok, key)))
		sep = strchr(tok, ':');
	if (sep) {
		sep++;
		/*
		 * The <ctype.h> functions require a value representable as
		 * unsigned char (or EOF); a plain char holding a byte >= 0x80
		 * may be negative, so cast before every call.
		 */
		while ((sep[0] != '\0') && !isdigit((unsigned char) sep[0]))
			sep++;
		if (isdigit((unsigned char) sep[0])) {
			/* Copy the remainder, then cut it after the digits */
			val = xstrdup(sep);
			for (i = 1; val[i]; i++) {
				if (!isdigit((unsigned char) val[i])) {
					val[i] = '\0';
					break;
				}
			}
		}
	}

	return val;
}
1405 
/* Sets the node's active features based upon job constraints.
 * NOTE: Executed by the slurmd daemon.
 *
 * For each of the cluster (NUMA) mode and the memory (MCDRAM) mode, syscfg is
 * first run to read the current BIOS setting, the requested mode is picked by
 * substring search in active_features, and syscfg is then run again to store
 * the new setting. On Intel systems the value written is the number found for
 * the mode in the syscfg response; on Dell systems it is the mode name itself.
 *
 * IN/OUT active_features - New active features; cleared (set to an empty
 *	string) before returning unless an early return is taken
 * RET error code: SLURM_SUCCESS if active_features is NULL/empty or every
 *	syscfg invocation exited with status zero, otherwise SLURM_ERROR */
extern int node_features_p_node_set(char *active_features)
{
	char *resp_msg, *argv[10], tmp[100];
	char *key;
	int error_code = SLURM_SUCCESS, status = 0;
	char *mcdram_mode = NULL, *numa_mode = NULL;

	if ((active_features == NULL) || (active_features[0] == '\0'))
		return SLURM_SUCCESS;

	if (!syscfg_path) {
		error("%s: SyscfgPath not configured", __func__);
		return SLURM_ERROR;
	}
	if ((syscfg_found == 0) || (!hw_is_knl && !force_load)) {
		/* This node on cluster lacks syscfg; should not be KNL */
		static bool log_event = true;	/* log the problem only once */
		if (log_event) {
			error("%s: syscfg program not found or node isn't KNL; can not set KNL modes",
			      __func__);
			log_event = false;
		}
		return SLURM_ERROR;
	}

	/* Identify available Cluster/NUMA modes */
	switch (knl_system_type) {
	case KNL_SYSTEM_TYPE_INTEL:
		argv[0] = "syscfg";
		argv[1] = "/d";
		argv[2] = "BIOSSETTINGS";
		argv[3] = "Cluster Mode";
		argv[4] = NULL;
		break;
	case KNL_SYSTEM_TYPE_DELL:
		argv[0] = "syscfg";
		argv[1] = "--SystemMemoryModel";
		argv[2] = NULL;
		break;
	default:
		/* This should never happen */
		error("%s: Unknown SystemType. %d", __func__, knl_system_type);
		return SLURM_ERROR;
	}
	resp_msg = _run_script(syscfg_path, argv, &status);
	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
		/* Failure is recorded, but processing continues */
		error("%s: syscfg (get cluster mode) status:%u response:%s",
		      __func__, status, resp_msg);
		error_code = SLURM_ERROR;
	}
	if (resp_msg == NULL) {
		info("%s: syscfg returned no information", __func__);
	} else {
		_log_script_argv(argv, resp_msg);
		/* Substring match; the first mode found in this order wins */
		if (strstr(active_features, "a2a"))
			key = "All2All";
		else if (strstr(active_features, "hemi"))
			key = "Hemisphere";
		else if (strstr(active_features, "quad"))
			key = "Quadrant";
		else if (strstr(active_features, "snc2"))
			key = "SNC-2";
		else if (strstr(active_features, "snc4"))
			key = "SNC-4";
		else
			key = NULL;
		switch (knl_system_type) {
		case KNL_SYSTEM_TYPE_INTEL:
			/* Intel: look up the number listed for this mode */
			numa_mode = _find_key_val(key, resp_msg);
			break;
		case KNL_SYSTEM_TYPE_DELL:
			/* Dell: the mode name itself is the value */
			numa_mode = xstrdup(key);
			/* fall through */
		default:
			break;
		}
		xfree(resp_msg);
	}

	/* Reset current Cluster/NUMA mode */
	if (numa_mode) {
		switch (knl_system_type) {
		case KNL_SYSTEM_TYPE_INTEL:
			argv[0] = "syscfg";
			argv[1] = "/bcs";
			argv[2] = "";	/* empty admin password argument */
			argv[3] = "BIOSSETTINGS";
			argv[4] = "Cluster Mode";
			argv[5] = numa_mode;
			argv[6] = NULL;
			break;
		case KNL_SYSTEM_TYPE_DELL:
			snprintf(tmp, sizeof(tmp),
				 "--SystemMemoryModel=%s", numa_mode);
			argv[0] = "syscfg";
			argv[1] = tmp;
			argv[2] = NULL;
			break;
		default:
			/* already handled above, should never get here */
			break;
		}
		resp_msg = _run_script(syscfg_path, argv, &status);
		if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
			error("%s: syscfg (set cluster mode) status:%u response:%s",
			      __func__, status, resp_msg);
			error_code = SLURM_ERROR;
		}  else {
			_log_script_argv(argv, resp_msg);
		}
		xfree(resp_msg);
		xfree(numa_mode);
	}

	/* Identify available Memory/MCDRAM modes */
	switch (knl_system_type) {
	case KNL_SYSTEM_TYPE_INTEL:
		argv[0] = "syscfg";
		argv[1] = "/d";
		argv[2] = "BIOSSETTINGS";
		argv[3] = "Memory Mode";
		argv[4] = NULL;
		break;
	case KNL_SYSTEM_TYPE_DELL:
		argv[0] = "syscfg";
		argv[1] = "--ProcEmbMemMode";
		argv[2] = NULL;
		break;
	default:
		/* already handled above, should never get here */
		break;
	}
	resp_msg = _run_script(syscfg_path, argv, &status);
	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
		error("%s: syscfg (get memory mode) status:%u response:%s",
		      __func__, status, resp_msg);
		error_code = SLURM_ERROR;
	}
	if (resp_msg == NULL) {
		info("%s: syscfg returned no information", __func__);
	} else {
		_log_script_argv(argv, resp_msg);
		/* Substring match; the first mode found in this order wins */
		if (strstr(active_features, "cache"))
			key = "Cache";
		else if (strstr(active_features, "flat"))
			/* "flat" is named differently by each system type */
			switch (knl_system_type) {
			case KNL_SYSTEM_TYPE_INTEL:
				key = "Flat";
				break;
			case KNL_SYSTEM_TYPE_DELL:
				key = "Memory";
				break;
			default:
				key = NULL;
				break;
			}
		else if (strstr(active_features, "hybrid"))
			key = "Hybrid";
		else if (strstr(active_features, "equal"))
			key = "Equal";
		else if (strstr(active_features, "auto"))
			key = "Auto";
		else
			key = NULL;

		switch (knl_system_type) {
		case KNL_SYSTEM_TYPE_INTEL:
			/* Intel: look up the number listed for this mode */
			mcdram_mode = _find_key_val(key, resp_msg);
			break;
		case KNL_SYSTEM_TYPE_DELL:
			/* Dell: the mode name itself is the value */
			mcdram_mode = xstrdup(key);
			/* fall through */
		default:
			break;
		}
		xfree(resp_msg);
	}

	/* Reset current Memory/MCDRAM mode */
	if (mcdram_mode) {
		switch (knl_system_type) {
		case KNL_SYSTEM_TYPE_INTEL:
			argv[0] = "syscfg";
			argv[1] = "/bcs";
			argv[2] = "";	/* empty admin password argument */
			argv[3] = "BIOSSETTINGS";
			argv[4] = "Memory Mode";
			argv[5] = mcdram_mode;
			argv[6] = NULL;
			break;
		case KNL_SYSTEM_TYPE_DELL:
			snprintf(tmp, sizeof(tmp),
				 "--ProcEmbMemMode=%s", mcdram_mode);
			argv[0] = "syscfg";
			argv[1] = tmp;
			argv[2] = NULL;
			break;
		default:
			/* already handled above, should never get here */
			break;
		}

		resp_msg = _run_script(syscfg_path, argv, &status);
		if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
			error("%s: syscfg (set memory mode) status:%u response:%s",
			      __func__, status, resp_msg);
			error_code = SLURM_ERROR;
		} else {
			_log_script_argv(argv, resp_msg);
		}
		xfree(resp_msg);
		xfree(mcdram_mode);
	}

	/* Clear features, do not pass as argument to reboot program
	 * (assuming we are calling /sbin/reboot). */
	active_features[0] = '\0';

	return error_code;
}
1628 
1629 /* Return bitmap of KNL nodes, NULL if none identified */
node_features_p_get_node_bitmap(void)1630 extern bitstr_t *node_features_p_get_node_bitmap(void)
1631 {
1632 	if (knl_node_bitmap)
1633 		return bit_copy(knl_node_bitmap);
1634 	return NULL;
1635 }
1636 
1637 /* Return count of overlaping bits in active_bitmap and knl_node_bitmap */
node_features_p_overlap(bitstr_t * active_bitmap)1638 extern int node_features_p_overlap(bitstr_t *active_bitmap)
1639 {
1640 	int cnt = 0;
1641 
1642 	if (!knl_node_bitmap || !active_bitmap ||
1643 	    !(cnt = bit_overlap(active_bitmap, knl_node_bitmap)))
1644 		return 0;
1645 
1646 	return cnt;
1647 }
/*
 * Return true if the plugin requires PowerSave mode for booting nodes.
 * This plugin always reports false.
 */
extern bool node_features_p_node_power(void)
{
	const bool needs_power_save = false;

	return needs_power_save;
}
1653 
1654 /*
1655  * Note the active features associated with a set of nodes have been updated.
1656  * Specifically update the node's "hbm" GRES and "CpuBind" values as needed.
1657  * IN active_features - New active features
1658  * IN node_bitmap - bitmap of nodes changed
1659  * RET error code
1660  */
node_features_p_node_update(char * active_features,bitstr_t * node_bitmap)1661 extern int node_features_p_node_update(char *active_features,
1662 				       bitstr_t *node_bitmap)
1663 {
1664 	int i, i_first, i_last;
1665 	int rc = SLURM_SUCCESS, numa_inx = -1;
1666 	int mcdram_inx = 0;
1667 	uint64_t mcdram_size;
1668 	node_record_t *node_ptr;
1669 	char *save_ptr = NULL, *tmp, *tok;
1670 
1671 	if (mcdram_per_node == NULL) {
1672 //FIXME: Additional logic is needed to determine the available MCDRAM space
1673 //FIXME: Additional logic will also be required to handle heterogeneous sizes
1674 		mcdram_per_node = xmalloc(sizeof(uint64_t) * node_record_count);
1675 		for (i = 0; i < node_record_count; i++)
1676 			mcdram_per_node[i] = DEFAULT_MCDRAM_SIZE;
1677 	}
1678 
1679 	if (active_features) {
1680 		tmp = xstrdup(active_features);
1681 		tok = strtok_r(tmp, ",", &save_ptr);
1682 		while (tok) {
1683 			if (numa_inx == -1)
1684 				numa_inx = _knl_numa_inx(tok);
1685 			mcdram_inx |= _knl_mcdram_token(tok);
1686 			tok = strtok_r(NULL, ",", &save_ptr);
1687 		}
1688 		xfree(tmp);
1689 	}
1690 
1691 	if (mcdram_inx >= 0) {
1692 		for (i = 0; i < KNL_MCDRAM_CNT; i++) {
1693 			if ((KNL_CACHE << i) == mcdram_inx)
1694 				break;
1695 		}
1696 		if ((i >= KNL_MCDRAM_CNT) || (mcdram_pct[i] == -1))
1697 			mcdram_inx = -1;
1698 		else
1699 			mcdram_inx = i;
1700 	} else {
1701 		mcdram_inx = -1;
1702 	}
1703 
1704 	xassert(node_bitmap);
1705 	i_first = bit_ffs(node_bitmap);
1706 	if (i_first >= 0)
1707 		i_last = bit_fls(node_bitmap);
1708 	else
1709 		i_last = i_first - 1;
1710 	for (i = i_first; i <= i_last; i++) {
1711 		if (!bit_test(node_bitmap, i))
1712 			continue;
1713 		if (i >= node_record_count) {
1714 			error("%s: Invalid node index (%d >= %d)",
1715 			      __func__, i, node_record_count);
1716 			rc = SLURM_ERROR;
1717 			break;
1718 		}
1719 		node_ptr = node_record_table_ptr + i;
1720 		if ((numa_inx >= 0) && cpu_bind[numa_inx])
1721 			node_ptr->cpu_bind = cpu_bind[numa_inx];
1722 		if (mcdram_per_node && (mcdram_inx >= 0)) {
1723 			mcdram_size = mcdram_per_node[i] *
1724 				      (100 - mcdram_pct[mcdram_inx]) / 100;
1725 			if (!node_ptr->gres)
1726 				node_ptr->gres =
1727 					xstrdup(node_ptr->config_ptr->gres);
1728 			gres_plugin_node_feature(node_ptr->name, "hbm",
1729 						 mcdram_size, &node_ptr->gres,
1730 						 &node_ptr->gres_list);
1731 		}
1732 	}
1733 
1734 	return rc;
1735 }
1736 
1737 /*
1738  * Return TRUE if the specified node update request is valid with respect
1739  * to features changes (i.e. don't permit a non-KNL node to set KNL features).
1740  *
1741  * arg IN - Pointer to node_record_t record
1742  * update_node_msg IN - Pointer to update request
1743  */
node_features_p_node_update_valid(void * arg,update_node_msg_t * update_node_msg)1744 extern bool node_features_p_node_update_valid(void *arg,
1745 					update_node_msg_t *update_node_msg)
1746 {
1747 	node_record_t *node_ptr = (node_record_t *) arg;
1748 	char *tmp, *save_ptr = NULL, *tok;
1749 	bool is_knl = false, invalid_feature = false;
1750 
1751 	/* No feature changes */
1752 	if (!update_node_msg->features && !update_node_msg->features_act)
1753 		return true;
1754 
1755 	/* Determine if this is KNL node based upon current features */
1756 	if (node_ptr->features && node_ptr->features[0]) {
1757 		tmp = xstrdup(node_ptr->features);
1758 		tok = strtok_r(tmp, ",", &save_ptr);
1759 		while (tok) {
1760 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
1761 				is_knl = true;
1762 				break;
1763 			}
1764 			tok = strtok_r(NULL, ",", &save_ptr);
1765 		}
1766 		xfree(tmp);
1767 	}
1768 	if (is_knl)
1769 		return true;
1770 
1771 	/* Validate that AvailableFeatures update request has no KNL modes */
1772 	if (update_node_msg->features) {
1773 		tmp = xstrdup(update_node_msg->features);
1774 		tok = strtok_r(tmp, ",", &save_ptr);
1775 		while (tok) {
1776 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
1777 				invalid_feature = true;
1778 				break;
1779 			}
1780 			tok = strtok_r(NULL, ",", &save_ptr);
1781 		}
1782 		xfree(tmp);
1783 		if (invalid_feature) {
1784 			info("Invalid AvailableFeatures update request (%s) for non-KNL node %s",
1785 			     update_node_msg->features, node_ptr->name);
1786 			return false;
1787 		}
1788 	}
1789 
1790 	/* Validate that ActiveFeatures update request has no KNL modes */
1791 	if (update_node_msg->features_act) {
1792 		tmp = xstrdup(update_node_msg->features_act);
1793 		tok = strtok_r(tmp, ",", &save_ptr);
1794 		while (tok) {
1795 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
1796 				invalid_feature = true;
1797 				break;
1798 			}
1799 			tok = strtok_r(NULL, ",", &save_ptr);
1800 		}
1801 		xfree(tmp);
1802 		if (invalid_feature) {
1803 			info("Invalid ActiveFeatures update request (%s) for non-KNL node %s",
1804 			     update_node_msg->features_act, node_ptr->name);
1805 			return false;
1806 		}
1807 	}
1808 
1809 	/*
1810 	 * For non-KNL node, active and available features must match
1811 	 */
1812 	if (!update_node_msg->features) {
1813 		update_node_msg->features =
1814 			xstrdup(update_node_msg->features_act);
1815 	} else if (!update_node_msg->features_act) {
1816 		update_node_msg->features_act =
1817 			xstrdup(update_node_msg->features);
1818 	} else if (xstrcmp(update_node_msg->features,
1819 			   update_node_msg->features_act)) {
1820 		info("Invalid ActiveFeatures != AvailableFeatures (%s != %s) for non-KNL node %s",
1821 		     update_node_msg->features, update_node_msg->features_act,
1822 		     node_ptr->name);
1823 		return false;
1824 	}
1825 
1826 	return true;
1827 }
1828 
/*
 * Return TRUE if this (one) feature name is under this plugin's control,
 * i.e. it is recognized as an MCDRAM mode or as a NUMA mode.
 */
extern bool node_features_p_changeable_feature(char *feature)
{
	bool is_mcdram_or_numa =
		(_knl_mcdram_token(feature) || _knl_numa_token(feature));

	return is_mcdram_or_numa;
}
1836 
1837 /*
1838  * Translate a node's feature specification by replacing any features associated
1839  *	with this plugin in the original value with the new values, preserving
1840  *	any features that are not associated with this plugin
1841  * IN new_features - newly active features
1842  * IN orig_features - original active features
1843  * IN avail_features - original available features
1844  * IN node_inx - index of node in node table
1845  * RET node's new merged features, must be xfreed
1846  */
node_features_p_node_xlate(char * new_features,char * orig_features,char * avail_features,int node_inx)1847 extern char *node_features_p_node_xlate(char *new_features, char *orig_features,
1848 					char *avail_features, int node_inx)
1849 {
1850 	char *node_features = NULL;
1851 	char *tmp, *save_ptr = NULL, *sep = "", *tok;
1852 	uint16_t new_mcdram = 0, new_numa = 0;
1853 	uint16_t tmp_mcdram, tmp_numa;
1854 	bool is_knl = false;
1855 
1856 	if (avail_features) {
1857 		tmp = xstrdup(avail_features);
1858 		tok = strtok_r(tmp, ",", &save_ptr);
1859 		while (tok) {
1860 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
1861 				is_knl = true;
1862 			} else {
1863 				xstrfmtcat(node_features, "%s%s", sep, tok);
1864 				sep = ",";
1865 			}
1866 			tok = strtok_r(NULL, ",", &save_ptr);
1867 		}
1868 		xfree(tmp);
1869 		if (!is_knl) {
1870 			xfree(node_features);
1871 			sep = "";
1872 		}
1873 	}
1874 
1875 	if (new_features) {
1876 		/* Copy non-KNL features */
1877 		if (!is_knl && new_features) {
1878 			tmp = xstrdup(new_features);
1879 			tok = strtok_r(tmp, ",", &save_ptr);
1880 			while (tok) {
1881 				if ((_knl_mcdram_token(tok) == 0) &&
1882 				    (_knl_numa_token(tok)   == 0)) {
1883 					xstrfmtcat(node_features, "%s%s", sep,
1884 						   tok);
1885 					sep = ",";
1886 				}
1887 				tok = strtok_r(NULL, ",", &save_ptr);
1888 			}
1889 			xfree(tmp);
1890 		}
1891 
1892 		/* Copy new KNL features in MCDRAM/NUMA order */
1893 		tmp = xstrdup(new_features);
1894 		tok = strtok_r(tmp, ",", &save_ptr);
1895 		while (tok) {
1896 			if ((tmp_mcdram = _knl_mcdram_token(tok)))
1897 				new_mcdram |= tmp_mcdram;
1898 			else if ((tmp_numa = _knl_numa_token(tok)))
1899 				new_numa |= tmp_numa;
1900 			tok = strtok_r(NULL, ",", &save_ptr);
1901 		}
1902 		xfree(tmp);
1903 
1904 		if (is_knl && ((new_mcdram == 0) || (new_numa == 0))) {
1905 			/*
1906 			 * New active features lacks current MCDRAM or NUMA,
1907 			 * copy values from original
1908 			 */
1909 			tmp = xstrdup(orig_features);
1910 			tok = strtok_r(tmp, ",", &save_ptr);
1911 			while (tok) {
1912 				if ((new_mcdram == 0) &&
1913 				    (tmp_mcdram = _knl_mcdram_token(tok)))
1914 					new_mcdram |= tmp_mcdram;
1915 				else if ((new_numa == 0) &&
1916 					 (tmp_numa = _knl_numa_token(tok)))
1917 					new_numa |= tmp_numa;
1918 				tok = strtok_r(NULL, ",", &save_ptr);
1919 			}
1920 			xfree(tmp);
1921 		}
1922 		if (new_mcdram) {
1923 			tmp = _knl_mcdram_str(new_mcdram);
1924 			xstrfmtcat(node_features, "%s%s", sep, tmp);
1925 			xfree(tmp);
1926 			sep = ",";
1927 		}
1928 		if (new_numa) {
1929 			tmp = _knl_numa_str(new_numa);
1930 			xstrfmtcat(node_features, "%s%s", sep, tmp);
1931 			xfree(tmp);
1932 		}
1933 	}
1934 
1935 	if (is_knl) {
1936 		if (!knl_node_bitmap)
1937 			knl_node_bitmap = bit_alloc(node_record_count);
1938 		bit_set(knl_node_bitmap, node_inx);
1939 	}
1940 
1941 	return node_features;
1942 }
1943 
/*
 * Translate a node's new feature specification into a "standard" ordering:
 * features not managed by this plugin first (in their original order), then
 * the MCDRAM modes, then the NUMA modes.
 * IN new_features - comma separated feature list
 * RET node's new merged features, must be xfreed. NULL if new_features is
 *     NULL or empty
 */
extern char *node_features_p_node_xlate2(char *new_features)
{
	char *merged = NULL;
	char *copy, *entry, *mode_str, *last = NULL, *delim = "";
	uint16_t mcdram_bits = 0, numa_bits = 0, bits;

	if (!new_features || (new_features[0] == '\0'))
		return NULL;

	/* Pass through other features, accumulate the KNL mode bits */
	copy = xstrdup(new_features);
	for (entry = strtok_r(copy, ",", &last); entry;
	     entry = strtok_r(NULL, ",", &last)) {
		if ((bits = _knl_mcdram_token(entry))) {
			mcdram_bits |= bits;
		} else if ((bits = _knl_numa_token(entry))) {
			numa_bits |= bits;
		} else {
			xstrfmtcat(merged, "%s%s", delim, entry);
			delim = ",";
		}
	}
	xfree(copy);

	if (mcdram_bits) {
		mode_str = _knl_mcdram_str(mcdram_bits);
		xstrfmtcat(merged, "%s%s", delim, mode_str);
		xfree(mode_str);
		delim = ",";
	}
	if (numa_bits) {
		mode_str = _knl_numa_str(numa_bits);
		xstrfmtcat(merged, "%s%s", delim, mode_str);
		xfree(mode_str);
	}

	return merged;
}
1983 
1984 /* Perform set up for step launch
1985  * mem_sort IN - Trigger sort of memory pages (KNL zonesort)
1986  * numa_bitmap IN - NUMA nodes allocated to this job */
node_features_p_step_config(bool mem_sort,bitstr_t * numa_bitmap)1987 extern void node_features_p_step_config(bool mem_sort, bitstr_t *numa_bitmap)
1988 {
1989 #ifdef HAVE_NUMA
1990 	if (mem_sort && (numa_available() != -1)) {
1991 		struct stat sb;
1992 		int buf_len, fd, i, len;
1993 		char buf[16];
1994 
1995 		if (stat(ZONE_SORT_PATH, &sb) == -1)
1996 			if (system(MODPROBE_PATH " zonesort_module")) {
1997 				/*
1998 				 * NOOP - compiling with optimizations throws
1999 				 * out a (void) cast and warns about ignoring
2000 				 * the return value
2001 				 */
2002 			}
2003 		if ((fd = open(ZONE_SORT_PATH, O_WRONLY | O_SYNC)) == -1) {
2004 			error("%s: Could not open file %s: %m",
2005 			      __func__, ZONE_SORT_PATH);
2006 		} else {
2007 			len = numa_max_node() + 1;
2008 			for (i = 0; i < len; i++) {
2009 				if (numa_bitmap && !bit_test(numa_bitmap, i))
2010 					continue;
2011 				snprintf(buf, sizeof(buf), "%d", i);
2012 				buf_len = strlen(buf) + 1;
2013 				// info("SORT NUMA %s", buf);
2014 				if (write(fd, buf, buf_len) != buf_len) {
2015 					error("%s: Could not write file %s: %m",
2016 					      __func__, ZONE_SORT_PATH);
2017 				}
2018 			}
2019 			(void) close(fd);
2020 		}
2021 	}
2022 #endif
2023 }
2024 
2025 /* Determine if the specified user can modify the currently available node
2026  * features */
node_features_p_user_update(uid_t uid)2027 extern bool node_features_p_user_update(uid_t uid)
2028 {
2029 	static int reboot_allowed = -1;
2030 	int i;
2031 
2032 	if (reboot_allowed == -1) {
2033 		char *reboot_program = slurm_get_reboot_program();
2034 		if (reboot_program && reboot_program[0])
2035 			reboot_allowed = 1;
2036 		else
2037 			reboot_allowed = 0;
2038 		xfree(reboot_program);
2039 	}
2040 
2041 	if (reboot_allowed != 1) {
2042 		info("Change in KNL mode not supported. No RebootProgram configured");
2043 		return false;
2044 	}
2045 
2046 	if (allowed_uid_cnt == 0)   /* Default is ALL users allowed to update */
2047 		return true;
2048 
2049 	for (i = 0; i < allowed_uid_cnt; i++) {
2050 		if (allowed_uid[i] == uid)
2051 			return true;
2052 	}
2053 
2054 	return false;
2055 }
2056 
2057 /* Return estimated reboot time, in seconds */
node_features_p_boot_time(void)2058 extern uint32_t node_features_p_boot_time(void)
2059 {
2060 	return boot_time;
2061 }
2062 
2063 /* Get node features plugin configuration */
node_features_p_get_config(config_plugin_params_t * p)2064 extern void node_features_p_get_config(config_plugin_params_t *p)
2065 {
2066 	config_key_pair_t *key_pair;
2067 	List data;
2068 
2069 	xassert(p);
2070 	xstrcat(p->name, plugin_type);
2071 	data = p->key_pairs;
2072 
2073 	key_pair = xmalloc(sizeof(config_key_pair_t));
2074 	key_pair->name = xstrdup("AllowMCDRAM");
2075 	key_pair->value = _knl_mcdram_str(allow_mcdram);
2076 	list_append(data, key_pair);
2077 
2078 	key_pair = xmalloc(sizeof(config_key_pair_t));
2079 	key_pair->name = xstrdup("AllowNUMA");
2080 	key_pair->value = _knl_numa_str(allow_numa);
2081 	list_append(data, key_pair);
2082 
2083 	key_pair = xmalloc(sizeof(config_key_pair_t));
2084 	key_pair->name = xstrdup("AllowUserBoot");
2085 	key_pair->value = _make_uid_str(allowed_uid, allowed_uid_cnt);
2086 	list_append(data, key_pair);
2087 
2088 	key_pair = xmalloc(sizeof(config_key_pair_t));
2089 	key_pair->name = xstrdup("BootTime");
2090 	key_pair->value = xstrdup_printf("%u", boot_time);
2091 	list_append(data, key_pair);
2092 
2093 	key_pair = xmalloc(sizeof(config_key_pair_t));
2094 	key_pair->name = xstrdup("DefaultMCDRAM");
2095 	key_pair->value = _knl_mcdram_str(default_mcdram);
2096 	list_append(data, key_pair);
2097 
2098 	key_pair = xmalloc(sizeof(config_key_pair_t));
2099 	key_pair->name = xstrdup("DefaultNUMA");
2100 	key_pair->value = _knl_numa_str(default_numa);
2101 	list_append(data, key_pair);
2102 
2103 	key_pair = xmalloc(sizeof(config_key_pair_t));
2104 	key_pair->name = xstrdup("Force");
2105 	key_pair->value = xstrdup_printf("%u", force_load);
2106 	list_append(data, key_pair);
2107 
2108 	key_pair = xmalloc(sizeof(config_key_pair_t));
2109 	key_pair->name = xstrdup("McPath");
2110 	key_pair->value = xstrdup(mc_path);
2111 	list_append(data, key_pair);
2112 
2113 	key_pair = xmalloc(sizeof(config_key_pair_t));
2114 	key_pair->name = xstrdup("NodeRebootWeight");
2115 	key_pair->value = xstrdup_printf("%u", node_reboot_weight);
2116 	list_append(data, key_pair);
2117 
2118 	key_pair = xmalloc(sizeof(config_key_pair_t));
2119 	key_pair->name = xstrdup("SyscfgPath");
2120 	key_pair->value = xstrdup(syscfg_path);
2121 	list_append(data, key_pair);
2122 
2123 	key_pair = xmalloc(sizeof(config_key_pair_t));
2124 	key_pair->name = xstrdup("SyscfgTimeout");
2125 	key_pair->value = xstrdup_printf("%u", syscfg_timeout);
2126 	list_append(data, key_pair);
2127 
2128 	key_pair = xmalloc(sizeof(config_key_pair_t));
2129 	key_pair->name = xstrdup("SystemType");
2130 	key_pair->value = xstrdup(_knl_system_type_str(knl_system_type));
2131 	list_append(data, key_pair);
2132 
2133 	key_pair = xmalloc(sizeof(config_key_pair_t));
2134 	key_pair->name = xstrdup("UmeCheckInterval");
2135 	key_pair->value = xstrdup_printf("%u", ume_check_interval);
2136 	list_append(data, key_pair);
2137 
2138 	list_sort(data, (ListCmpF) sort_key_pairs);
2139 
2140 	return;
2141 }
2142 
2143 /*
2144  * Return node "weight" field if reboot required to change mode
2145  */
node_features_p_reboot_weight(void)2146 extern uint32_t node_features_p_reboot_weight(void)
2147 {
2148 	return node_reboot_weight;
2149 }
2150