1 /*****************************************************************************\
2  *  node_features_knl_cray.c - Plugin for managing Cray KNL state information
3  *****************************************************************************
4  *  Copyright (C) 2016 SchedMD LLC.
5  *  Written by Morris Jette <jette@schedmd.com>
6  *
7  *  This file is part of Slurm, a resource management program.
8  *  For details, see <https://slurm.schedmd.com/>.
9  *  Please also read the included file: DISCLAIMER.
10  *
11  *  Slurm is free software; you can redistribute it and/or modify it under
12  *  the terms of the GNU General Public License as published by the Free
13  *  Software Foundation; either version 2 of the License, or (at your option)
14  *  any later version.
15  *
16  *  In addition, as a special exception, the copyright holders give permission
17  *  to link the code of portions of this program with the OpenSSL library under
18  *  certain conditions as described in each individual source file, and
19  *  distribute linked combinations including the two. You must obey the GNU
20  *  General Public License in all respects for all of the code used other than
21  *  OpenSSL. If you modify file(s) with this exception, you may extend this
22  *  exception to your version of the file(s), but you are not obligated to do
23  *  so. If you do not wish to do so, delete this exception statement from your
24  *  version.  If you delete this exception statement from all source files in
25  *  the program, then also delete it here.
26  *
27  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30  *  details.
31  *
32  *  You should have received a copy of the GNU General Public License along
33  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
34  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35 \*****************************************************************************/
36 
37 #if HAVE_CONFIG_H
38 #  include "config.h"
39 #endif
40 
41 #define _GNU_SOURCE	/* For POLLRDHUP */
42 #include <ctype.h>
43 #include <fcntl.h>
44 #ifdef HAVE_NUMA
45 #undef NUMA_VERSION1_COMPATIBILITY
46 #include <numa.h>
47 #endif
48 #include <poll.h>
49 #include <signal.h>
50 #include <stdlib.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
53 #include <time.h>
54 #include <unistd.h>
55 
56 #if HAVE_JSON_C_INC
57 #  include <json-c/json.h>
58 #elif HAVE_JSON_INC
59 #  include <json/json.h>
60 #endif
61 
62 #if defined(__APPLE__) || defined(__DragonFly__) || defined(__NetBSD__)
63 #define POLLRDHUP POLLHUP
64 #endif
65 
66 #include "slurm/slurm.h"
67 
68 #include "src/common/assoc_mgr.h"
69 #include "src/common/bitstring.h"
70 #include "src/common/fd.h"
71 #include "src/common/gres.h"
72 #include "src/common/list.h"
73 #include "src/common/macros.h"
74 #include "src/common/pack.h"
75 #include "src/common/parse_config.h"
76 #include "src/common/slurm_protocol_api.h"
77 #include "src/common/slurm_resource_info.h"
78 #include "src/common/timers.h"
79 #include "src/common/uid.h"
80 #include "src/common/xmalloc.h"
81 #include "src/common/xstring.h"
82 
83 #include "src/slurmctld/job_scheduler.h"
84 #include "src/slurmctld/locks.h"
85 #include "src/slurmctld/node_scheduler.h"
86 #include "src/slurmctld/read_config.h"
87 #include "src/slurmctld/reservation.h"
88 #include "src/slurmctld/slurmctld.h"
89 #include "src/slurmctld/state_save.h"
90 #include "src/slurmd/slurmd/req.h"
91 
/* Maximum poll wait time for child processes, in milliseconds */
#define MAX_POLL_WAIT 500

/* Default and minimum timeout parameters for the capmc command */
#define DEFAULT_CAPMC_RETRIES 4
#define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
#define MIN_CAPMC_TIMEOUT 1000		/* 1 second */

/*
 * Intel Knights Landing Configuration Modes
 *
 * NUMA and MCDRAM modes share one 16-bit value with one bit per mode:
 * NUMA modes live in the low byte (KNL_NUMA_FLAG) and MCDRAM modes in the
 * high byte (KNL_MCDRAM_FLAG).  KNL_NUMA_CNT and KNL_MCDRAM_CNT are the
 * number of mode bits defined in each group.
 */
#define KNL_NUMA_CNT	5
#define KNL_MCDRAM_CNT	4
#define KNL_NUMA_FLAG	0x00ff
#define KNL_ALL2ALL	0x0001
#define KNL_SNC2	0x0002
#define KNL_SNC4	0x0004
#define KNL_HEMI	0x0008
#define KNL_QUAD	0x0010
#define KNL_MCDRAM_FLAG	0xff00
#define KNL_CACHE	0x0100
#define KNL_EQUAL	0x0200
#define KNL_SPLIT	0x0400
#define KNL_FLAT	0x0800

/* Fall back to this modprobe location unless the build already defined one */
#ifndef MODPROBE_PATH
#define MODPROBE_PATH	"/sbin/modprobe"
#endif
#define ZONE_SORT_PATH	"/sys/kernel/zone_sort_free_pages/nodeid"
119 
120 /* These are defined here so when we link with something other than
121  * the slurmctld we will have these symbols defined.  They will get
122  * overwritten when linking with the slurmctld.
123  */
124 #if defined (__APPLE__)
125 extern slurmctld_config_t slurmctld_config __attribute__((weak_import));
126 extern bitstr_t *avail_node_bitmap __attribute__((weak_import));
127 extern active_feature_list __attribute__((weak_import));
128 #else
129 slurmctld_config_t slurmctld_config;
130 bitstr_t *avail_node_bitmap;
131 List active_feature_list;
132 #endif
133 
/*
 * These variables are required by the node_features plugin interface.  If
 * they are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *      <application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "node_features" for Slurm node_features) and <method> is a
 * description of how this plugin satisfies that application.  Slurm will only
 * load a node_features plugin if the plugin_type string has a prefix of
 * "node_features/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char plugin_name[]        = "node_features knl_cray plugin";
const char plugin_type[]        = "node_features/knl_cray";
const uint32_t plugin_version   = SLURM_VERSION_NUMBER;
162 
/* Configuration Parameters */
/* allow_mcdram / allow_numa start with every mode bit of their group set */
static uint16_t allow_mcdram = KNL_MCDRAM_FLAG;
static uint16_t allow_numa = KNL_NUMA_FLAG;
static uid_t *allowed_uid = NULL;
static int allowed_uid_cnt = 0;
static uint32_t boot_time = (45 * 60);	/* 45 minute estimated boot time */
static char *capmc_path = NULL;
static uint32_t capmc_poll_freq = 45;	/* capmc state polling frequency */
static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
static uint32_t capmc_timeout = 0;	/* capmc command timeout in msec */
/* Command path handed to _run_script() by _load_mcdram_type() and
 * _load_numa_type() */
static char *cnselect_path = NULL;
static uint32_t cpu_bind[KNL_NUMA_CNT];	/* Derived from numa_cpu_bind */
static bool debug_flag = false;
static uint16_t default_mcdram = KNL_CACHE;
static uint16_t default_numa = KNL_ALL2ALL;
static char *mc_path = NULL;
static uint32_t node_reboot_weight = (INFINITE - 1);
/* Raw "<numa_mode>=<cpu_bind>;..." string parsed by _update_cpu_bind() */
static char *numa_cpu_bind = NULL;
static char *syscfg_path = NULL;
static pthread_mutex_t config_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool reconfig = false;
static uint32_t ume_check_interval = 0;
static pthread_mutex_t ume_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ume_thread = 0;
static uint32_t validate_mode = 0;

static bitstr_t *knl_node_bitmap = NULL;	/* KNL nodes found by capmc */
static pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
static char *node_list_queue = NULL;
static time_t node_time_queue = (time_t) 0;
static time_t shutdown_time = (time_t) 0;
static pthread_t queue_thread = 0;

/* Percentage of MCDRAM used for cache by type, updated from capmc.
 * Slot N corresponds to mode bit (KNL_CACHE << N); _update_mcdram_pct() only
 * fills slots that still hold -1 and counts filled slots in mcdram_set. */
static int mcdram_pct[KNL_MCDRAM_CNT];
static int mcdram_set = 0;
static uint64_t *mcdram_per_node = NULL;
200 
/* NOTE: New knl_cray.conf parameters added below must also be added to the
 * contribs/cray/capmc_suspend.c and contribs/cray/capmc_resume.c files */
/* Keyword/type table handed to s_p_hashtbl_create() by _config_make_tbl();
 * the list ends with a {NULL} entry. */
static s_p_options_t knl_conf_file_options[] = {
	{"AllowMCDRAM", S_P_STRING},
	{"AllowNUMA", S_P_STRING},
	{"AllowUserBoot", S_P_STRING},
	{"BootTime", S_P_UINT32},
	{"CapmcPath", S_P_STRING},
	{"CapmcPollFreq", S_P_UINT32},
	{"CapmcRetries", S_P_UINT32},
	{"CapmcTimeout", S_P_UINT32},
	{"CnselectPath", S_P_STRING},
	{"DefaultMCDRAM", S_P_STRING},
	{"DefaultNUMA", S_P_STRING},
	{"LogFile", S_P_STRING},
	{"McPath", S_P_STRING},
	{"NumaCpuBind", S_P_STRING},
	{"SyscfgPath", S_P_STRING},
	{"NodeRebootWeight", S_P_UINT32},
	{"UmeCheckInterval", S_P_UINT32},
	{"ValidateMode", S_P_UINT32},
	{NULL}
};
224 
/* MCDRAM capability record filled in by _json_parse_mcdram_cap_object() */
typedef struct mcdram_cap {
	uint32_t nid;		/* value of the JSON "nid" key */
	char *mcdram_cfg;	/* comma separated non-numeric tokens taken
				 * from the JSON "mcdram_cfg" string */
} mcdram_cap_t;
229 
/* MCDRAM configuration record filled in by _json_parse_mcdram_cfg_object() */
typedef struct mcdram_cfg {
	uint64_t dram_size;	/* JSON "dram_size" run through _parse_size(),
				 * NO_VAL when the key is absent */
	uint32_t nid;		/* value of the JSON "nid" key */
	char *mcdram_cfg;	/* copy of the JSON "mcdram_cfg" string */
	uint64_t mcdram_size;	/* JSON "mcdram_size" run through
				 * _parse_size(), NO_VAL when absent */
	uint16_t mcdram_pct;	/* JSON "mcdram_pct" (integer or string form),
				 * NO_VAL16 when absent */
} mcdram_cfg_t;
237 
/* Per-MCDRAM-mode record built by _load_current_mcdram() */
typedef struct mcdram_cfg2 {
	int cache_pct;		/* copied from mcdram_pct[] for this mode */
	char *mcdram_cfg;	/* mode name produced by _knl_mcdram_str() */
	char *nid_str;		/* response returned by _load_mcdram_type(),
				 * may be NULL */
	bitstr_t *node_bitmap;	/* nid_str loaded with bit_unfmt(); only
				 * allocated when nid_str is non-empty */
} mcdram_cfg2_t;
244 
/* NUMA capability record filled in by _json_parse_numa_cap_object() */
typedef struct numa_cap {
	uint32_t nid;		/* value of the JSON "nid" key */
	char *numa_cfg;		/* copy of the JSON "numa_cfg" string */
} numa_cap_t;
249 
/* NUMA configuration record filled in by _json_parse_numa_cfg_object() */
typedef struct numa_cfg {
	uint32_t nid;		/* value of the JSON "nid" key */
	char *numa_cfg;		/* copy of the JSON "numa_cfg" string */
} numa_cfg_t;
254 
/* Per-NUMA-mode record built by _load_current_numa() */
typedef struct numa_cfg2 {
	char *nid_str;		/* response returned by _load_numa_type(),
				 * may be NULL */
	bitstr_t *node_bitmap;	/* nid_str loaded with bit_unfmt(); only
				 * allocated when nid_str is non-empty */
	char *numa_cfg;		/* mode name: a2a, snc2, snc4, hemi or quad */
} numa_cfg2_t;
260 
/* Forward declarations of the helper functions used within this file */
static void _check_node_disabled(void);
static void _check_node_status(void);
static s_p_hashtbl_t *_config_make_tbl(char *filename);
static void _free_script_argv(char **script_argv);
static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key,
						  int *num);
static mcdram_cfg_t *_json_parse_mcdram_cfg_array(json_object *jobj, char *key,
						  int *num);
static void _json_parse_mcdram_cap_object(json_object *jobj, mcdram_cap_t *ent);
static void _json_parse_mcdram_cfg_object(json_object *jobj, mcdram_cfg_t *ent);
static numa_cap_t *_json_parse_numa_cap_array(json_object *jobj, char *key,
					      int *num);
static void _json_parse_numa_cap_object(json_object *jobj, numa_cap_t *ent);
static numa_cfg_t *_json_parse_numa_cfg_array(json_object *jobj, char *key,
					      int *num);
static void _json_parse_numa_cfg_object(json_object *jobj, numa_cfg_t *ent);
static int  _knl_mcdram_bits_cnt(uint16_t mcdram_num);
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep);
static char *_knl_mcdram_str(uint16_t mcdram_num);
static uint16_t _knl_mcdram_token(char *token);
static int _knl_numa_bits_cnt(uint16_t numa_num);
static uint16_t _knl_numa_parse(char *numa_str, char *sep);
static char *_knl_numa_str(uint16_t numa_num);
static int _knl_numa_inx(char *token);
static uint16_t _knl_numa_token(char *token);
static mcdram_cfg2_t *_load_current_mcdram(int *num);
static numa_cfg2_t *_load_current_numa(int *num);
static char *_load_mcdram_type(int cache_pct);
static char *_load_numa_type(char *type);
static void _log_script_argv(char **script_argv, char *resp_msg);
static void _mcdram_cap_free(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt);
static void _mcdram_cap_log(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt);
static void _mcdram_cfg_free(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt);
static void _mcdram_cfg2_free(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt);
static void _mcdram_cfg_log(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt);
static void _merge_strings(char **node_features, char *node_cfg,
			   uint16_t allow_types);
static void _numa_cap_free(numa_cap_t *numa_cap, int numa_cap_cnt);
static void _numa_cap_log(numa_cap_t *numa_cap, int numa_cap_cnt);
static void _numa_cfg_free(numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _numa_cfg2_free(numa_cfg2_t *numa_cfg, int numa_cfg2_cnt);
static void _numa_cfg_log(numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _numa_cfg2_log(numa_cfg2_t *numa_cfg, int numa_cfg2_cnt);
static uint64_t _parse_size(char *size_str);
/* NOTE(review): declared extern while every other helper here is static;
 * confirm against the definition before changing the linkage. */
extern void *_queue_agent(void *args);
static int  _queue_node_update(char *node_list);
static char *_run_script(char *cmd_path, char **script_argv, int *status);
static void _strip_knl_opts(char **features);
static int  _tot_wait (struct timeval *start_time);
static void *_ume_agent(void *args);
static void _update_all_node_features(
				mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
				mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
				numa_cap_t *numa_cap, int numa_cap_cnt,
				numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _update_cpu_bind(void);
static void _update_mcdram_pct(char *tok, int mcdram_num);
static void _update_node_features(node_record_t *node_ptr,
				  mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
				  mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
				  numa_cap_t *numa_cap, int numa_cap_cnt,
				  numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static int _update_node_state(char *node_list, bool set_locks);
static void _validate_node_features(node_record_t *node_ptr);

/* Function used both internally and externally */
extern int node_features_p_node_update(char *active_features,
				       bitstr_t *node_bitmap);
329 
_config_make_tbl(char * filename)330 static s_p_hashtbl_t *_config_make_tbl(char *filename)
331 {
332 	s_p_hashtbl_t *tbl = NULL;
333 
334 	xassert(filename);
335 
336 	if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
337 		error("knl.conf: %s: s_p_hashtbl_create error: %m", __func__);
338 		return tbl;
339 	}
340 
341 	if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
342 		error("knl.conf: %s: s_p_parse_file error: %m", __func__);
343 		s_p_hashtbl_destroy(tbl);
344 		tbl = NULL;
345 	}
346 
347 	return tbl;
348 }
349 
350 /*
351  * Return the count of MCDRAM bits set
352  */
_knl_mcdram_bits_cnt(uint16_t mcdram_num)353 static int _knl_mcdram_bits_cnt(uint16_t mcdram_num)
354 {
355 	int cnt = 0, i;
356 	uint16_t tmp = 1;
357 
358 	for (i = 0; i < 16; i++) {
359 		if ((mcdram_num & KNL_MCDRAM_FLAG) & tmp)
360 			cnt++;
361 		tmp = tmp << 1;
362 	}
363 	return cnt;
364 }
365 
/*
 * Convert a list of KNL MCDRAM mode names into its bitmask form
 * mcdram_str IN - list of mode names, may be NULL
 * sep IN - set of characters separating the names
 * RET OR of the flags of every recognized name (0 if none or NULL input)
 */
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep)
{
	char *copy, *name, *state = NULL;
	uint16_t flags = 0;

	if (!mcdram_str)
		return flags;

	/* strtok_r() modifies its input, so tokenize a private copy */
	copy = xstrdup(mcdram_str);
	for (name = strtok_r(copy, sep, &state); name;
	     name = strtok_r(NULL, sep, &state))
		flags |= _knl_mcdram_token(name);
	xfree(copy);

	return flags;
}
390 
391 /*
392  * Translate KNL MCDRAM number to equivalent string value
393  * Caller must free return value
394  */
_knl_mcdram_str(uint16_t mcdram_num)395 static char *_knl_mcdram_str(uint16_t mcdram_num)
396 {
397 	char *mcdram_str = NULL, *sep = "";
398 
399 	if (mcdram_num & KNL_CACHE) {
400 		xstrfmtcat(mcdram_str, "%scache", sep);
401 		sep = ",";
402 	}
403 	if (mcdram_num & KNL_SPLIT) {
404 		xstrfmtcat(mcdram_str, "%ssplit", sep);
405 		sep = ",";
406 	}
407 	if (mcdram_num & KNL_FLAT) {
408 		xstrfmtcat(mcdram_str, "%sflat", sep);
409 		sep = ",";
410 	}
411 	if (mcdram_num & KNL_EQUAL) {
412 		xstrfmtcat(mcdram_str, "%sequal", sep);
413 //		sep = ",";	/* Remove to avoid CLANG error */
414 	}
415 
416 	return mcdram_str;
417 }
418 
419 /*
420  * Given a KNL MCDRAM token, return its equivalent numeric value
421  * token IN - String to scan
422  * RET MCDRAM numeric value
423  */
_knl_mcdram_token(char * token)424 static uint16_t _knl_mcdram_token(char *token)
425 {
426 	uint16_t mcdram_num = 0;
427 
428 	if (!xstrcasecmp(token, "cache"))
429 		mcdram_num = KNL_CACHE;
430 	else if (!xstrcasecmp(token, "split"))
431 		mcdram_num = KNL_SPLIT;
432 	else if (!xstrcasecmp(token, "flat"))
433 		mcdram_num = KNL_FLAT;
434 	else if (!xstrcasecmp(token, "equal"))
435 		mcdram_num = KNL_EQUAL;
436 
437 	return mcdram_num;
438 }
439 
440 /*
441  * Return the count of NUMA bits set
442  */
_knl_numa_bits_cnt(uint16_t numa_num)443 static int _knl_numa_bits_cnt(uint16_t numa_num)
444 {
445 	int cnt = 0, i;
446 	uint16_t tmp = 1;
447 
448 	for (i = 0; i < 16; i++) {
449 		if ((numa_num & KNL_NUMA_FLAG) & tmp)
450 			cnt++;
451 		tmp = tmp << 1;
452 	}
453 	return cnt;
454 }
455 
/*
 * Convert a list of KNL NUMA mode names into its bitmask form
 * numa_str IN - list of mode names, may be NULL
 * sep IN - set of characters separating the names
 * RET OR of the flags of every recognized name (0 if none or NULL input)
 */
static uint16_t _knl_numa_parse(char *numa_str, char *sep)
{
	char *copy, *name, *state = NULL;
	uint16_t flags = 0;

	if (!numa_str)
		return flags;

	/* strtok_r() modifies its input, so tokenize a private copy */
	copy = xstrdup(numa_str);
	for (name = strtok_r(copy, sep, &state); name;
	     name = strtok_r(NULL, sep, &state))
		flags |= _knl_numa_token(name);
	xfree(copy);

	return flags;
}
480 
481 /*
482  * Translate KNL NUMA number to equivalent string value
483  * Caller must free return value
484  */
_knl_numa_str(uint16_t numa_num)485 static char *_knl_numa_str(uint16_t numa_num)
486 {
487 	char *numa_str = NULL, *sep = "";
488 
489 	if (numa_num & KNL_ALL2ALL) {
490 		xstrfmtcat(numa_str, "%sa2a", sep);
491 		sep = ",";
492 	}
493 	if (numa_num & KNL_SNC2) {
494 		xstrfmtcat(numa_str, "%ssnc2", sep);
495 		sep = ",";
496 	}
497 	if (numa_num & KNL_SNC4) {
498 		xstrfmtcat(numa_str, "%ssnc4", sep);
499 		sep = ",";
500 	}
501 	if (numa_num & KNL_HEMI) {
502 		xstrfmtcat(numa_str, "%shemi", sep);
503 		sep = ",";
504 	}
505 	if (numa_num & KNL_QUAD) {
506 		xstrfmtcat(numa_str, "%squad", sep);
507 //		sep = ",";	/* Remove to avoid CLANG error */
508 	}
509 
510 	return numa_str;
511 
512 }
513 
514 /*
515  * Given a KNL NUMA token, return its equivalent numeric value
516  * token IN - String to scan
517  * RET NUMA numeric value
518  */
_knl_numa_token(char * token)519 static uint16_t _knl_numa_token(char *token)
520 {
521 	uint16_t numa_num = 0;
522 
523 	if (!xstrcasecmp(token, "a2a"))
524 		numa_num |= KNL_ALL2ALL;
525 	else if (!xstrcasecmp(token, "snc2"))
526 		numa_num |= KNL_SNC2;
527 	else if (!xstrcasecmp(token, "snc4"))
528 		numa_num |= KNL_SNC4;
529 	else if (!xstrcasecmp(token, "hemi"))
530 		numa_num |= KNL_HEMI;
531 	else if (!xstrcasecmp(token, "quad"))
532 		numa_num |= KNL_QUAD;
533 
534 	return numa_num;
535 }
536 
537 /*
538  * Given a KNL NUMA token, return its cpu_bind offset
539  * token IN - String to scan
540  * RET NUMA offset or -1 if not found
541  */
_knl_numa_inx(char * token)542 static int _knl_numa_inx(char *token)
543 {
544 	uint16_t numa_num;
545 	int i;
546 
547 	numa_num = _knl_numa_token(token);
548 	for (i = 0; i < KNL_NUMA_CNT; i++) {
549 		if ((0x01 << i) == numa_num)
550 			return i;
551 	}
552 	return -1;
553 }
554 
/*
 * Remove every KNL MCDRAM and NUMA mode name from a comma separated
 * feature list.
 * features IN/OUT - the list is replaced by a newly built one holding only
 *	the remaining names (NULL if nothing remains); the old string is
 *	xfree'd.  A NULL list is left untouched.
 */
static void _strip_knl_opts(char **features)
{
	char *copy, *kept = NULL, *name, *state = NULL;
	char *delim = "";

	if (!*features)
		return;

	copy = xstrdup(*features);
	for (name = strtok_r(copy, ",", &state); name;
	     name = strtok_r(NULL, ",", &state)) {
		/* Skip anything recognized as a KNL mode name */
		if (_knl_mcdram_token(name) || _knl_numa_token(name))
			continue;
		xstrfmtcat(kept, "%s%s", delim, name);
		delim = ",";
	}
	xfree(copy);
	xfree(*features);
	*features = kept;
}
577 
/*
 * Compute the elapsed time, in milliseconds, between "start_time" and now.
 * The microsecond part is rounded to the nearest millisecond.
 */
static int _tot_wait (struct timeval *start_time)
{
	struct timeval now;
	int elapsed_msec;

	gettimeofday(&now, NULL);

	/* Whole seconds first, then the rounded sub-second remainder */
	elapsed_msec = (now.tv_sec - start_time->tv_sec) * 1000;
	elapsed_msec += (now.tv_usec - start_time->tv_usec + 500) / 1000;

	return elapsed_msec;
}
591 
/*
 * Release a NULL terminated argument vector: every xmalloc'ed element is
 * xfree'd first, then the vector itself.
 */
static void _free_script_argv(char **script_argv)
{
	char **arg;

	for (arg = script_argv; *arg; arg++)
		xfree(*arg);
	xfree(script_argv);
}
601 
602 /*
603  * Update cpu_bind array from current numa_cpu_bind configuration parameter
604  */
_update_cpu_bind(void)605 static void _update_cpu_bind(void)
606 {
607 	char *save_ptr = NULL, *sep, *tok, *tmp;
608 	int rc = SLURM_SUCCESS;
609 	int i, numa_inx, numa_def;
610 	uint32_t cpu_bind_val = 0;
611 
612 	for (i = 0; i < KNL_NUMA_CNT; i++)
613 		cpu_bind[0] = 0;
614 
615 	if (!numa_cpu_bind)
616 		return;
617 
618 	tmp = xstrdup(numa_cpu_bind);
619 	tok = strtok_r(tmp, ";", &save_ptr);
620 	while (tok) {
621 		sep = strchr(tok, '=');
622 		if (!sep) {
623 			rc = SLURM_ERROR;
624 			break;
625 		}
626 		sep[0] = '\0';
627 		numa_def = _knl_numa_token(tok);
628 		if (numa_def == 0) {
629 			rc = SLURM_ERROR;
630 			break;
631 		}
632 		if (xlate_cpu_bind_str(sep + 1, &cpu_bind_val) !=
633 		    SLURM_SUCCESS) {
634 			rc = SLURM_ERROR;
635 			break;
636 		}
637 		numa_inx = -1;
638 		for (i = 0; i < KNL_NUMA_CNT; i++) {
639 			if ((0x1 << i) == numa_def) {
640 				numa_inx = i;
641 				break;
642 			}
643 		}
644 		if (numa_inx > -1)
645 			cpu_bind[numa_inx] = cpu_bind_val;
646 		tok = strtok_r(NULL, ";", &save_ptr);
647 	}
648 	xfree(tmp);
649 
650 	if (rc != SLURM_SUCCESS) {
651 		error("%s: Invalid NumaCpuBind (%s), ignored",
652 		      plugin_type, numa_cpu_bind);
653 	}
654 
655 	if (debug_flag) {
656 		for (i = 0; i < KNL_NUMA_CNT; i++) {
657 			char cpu_bind_str[128], *numa_str;
658 			if (cpu_bind[i] == 0)
659 				continue;
660 			numa_str = _knl_numa_str(0x1 << i);
661 			slurm_sprint_cpu_bind_type(cpu_bind_str, cpu_bind[i]);
662 			info("CpuBind[%s] = %s", numa_str, cpu_bind_str);
663 			xfree(numa_str);
664 		}
665 	}
666 }
667 
668 /*
669  * Update our mcdram_pct array with new data.
670  * tok IN - percentage of MCDRAM to be used as cache (string form)
671  * mcdram_num - MCDRAM value (bit from KNL_FLAT, etc.)
672  */
_update_mcdram_pct(char * tok,int mcdram_num)673 static void _update_mcdram_pct(char *tok, int mcdram_num)
674 {
675 	int inx;
676 
677 	if (mcdram_set == KNL_MCDRAM_CNT)
678 		return;
679 
680 	for (inx = 0; inx < KNL_MCDRAM_CNT; inx++) {
681 		if ((KNL_CACHE << inx) == mcdram_num)
682 			break;
683 	}
684 	if ((inx >= KNL_MCDRAM_CNT) || (mcdram_pct[inx] != -1))
685 		return;
686 	mcdram_pct[inx] = strtol(tok, NULL, 10);
687 	mcdram_set++;
688 }
689 
/*
 * Fill in a mcdram_cap_t record from the members of a JSON object.
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * An integer "nid" member sets ent->nid.  A string "mcdram_cfg" member is
 * split on ',': tokens starting with a digit are passed to
 * _update_mcdram_pct() together with the mode flag of the preceding
 * non-numeric token, while every non-numeric token is appended to
 * ent->mcdram_cfg.  All other members are ignored.
 */
static void _json_parse_mcdram_cap_object(json_object *jobj, mcdram_cap_t *ent)
{
	struct json_object_iter iter;
	enum json_type val_type;
	const char *str_val;
	int64_t int_val;
	char *copy, *tok, *state = NULL, *delim = "";
	int pending_mode = -1;	/* flag of the last mode name seen, if any */

	json_object_object_foreachC(jobj, iter) {
		val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			int_val = json_object_get_int64(iter.val);
			if (!xstrcmp(iter.key, "nid"))
				ent->nid = int_val;
			continue;
		}
		if (val_type != json_type_string)
			continue;

		str_val = json_object_get_string(iter.val);
		if (xstrcmp(iter.key, "mcdram_cfg"))
			continue;

		/* strtok_r() modifies its input, so tokenize a copy */
		copy = xstrdup(str_val);
		for (tok = strtok_r(copy, ",", &state); tok;
		     tok = strtok_r(NULL, ",", &state)) {
			if ((tok[0] >= '0') && (tok[0] <= '9')) {
				/* A percentage applies to the mode before it */
				_update_mcdram_pct(tok, pending_mode);
				pending_mode = -1;
				continue;
			}
			pending_mode = _knl_mcdram_token(tok);
			xstrfmtcat(ent->mcdram_cfg, "%s%s", delim, tok);
			delim = ",";
		}
		xfree(copy);
	}
}
735 
/*
 * Convert a size string such as "16G" into a number.
 * size_str IN - decimal number optionally followed by a unit
 * RET the number scaled by 1024 for a leading 'k'/'K' suffix character,
 *     1024^2 for 'm'/'M' and 1024^3 for 'g'/'G'.  Any other non-empty
 *     suffix is reported with info() and the unscaled number is returned.
 */
static uint64_t _parse_size(char *size_str)
{
	char *suffix = NULL;
	uint64_t value;

	value = (uint64_t) strtol(size_str, &suffix, 10);

	/* Only the first character after the digits selects the unit */
	switch (suffix[0]) {
	case 'k':
	case 'K':
		value *= 1024;
		break;
	case 'm':
	case 'M':
		value *= (1024 * 1024);
		break;
	case 'g':
	case 'G':
		value *= (1024 * 1024 * 1024);
		break;
	case '\0':
		break;
	default:
		info("Invalid MCDRAM size: %s", size_str);
		break;
	}

	return value;
}
753 
/*
 * Fill in a mcdram_cfg_t record from the members of a JSON object.
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * dram_size and mcdram_size start as NO_VAL and mcdram_pct as NO_VAL16.
 * Integer members "nid" and "mcdram_pct" are stored directly.  String
 * members "dram_size", "mcdram_pct" and "mcdram_size" are converted with
 * _parse_size() and "mcdram_cfg" is copied.  All other members are ignored.
 */
static void _json_parse_mcdram_cfg_object(json_object *jobj, mcdram_cfg_t *ent)
{
	struct json_object_iter iter;
	enum json_type val_type;
	const char *str_val;
	int64_t int_val;

	/* Mark the optional fields as "not reported" */
	ent->dram_size   = NO_VAL;
	ent->mcdram_pct  = NO_VAL16;
	ent->mcdram_size = NO_VAL;

	json_object_object_foreachC(jobj, iter) {
		val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			int_val = json_object_get_int64(iter.val);
			if (!xstrcmp(iter.key, "nid"))
				ent->nid = int_val;
			else if (!xstrcmp(iter.key, "mcdram_pct"))
				ent->mcdram_pct = int_val;
		} else if (val_type == json_type_string) {
			str_val = json_object_get_string(iter.val);
			if (!xstrcmp(iter.key, "dram_size"))
				ent->dram_size = _parse_size((char *) str_val);
			else if (!xstrcmp(iter.key, "mcdram_cfg"))
				ent->mcdram_cfg = xstrdup(str_val);
			else if (!xstrcmp(iter.key, "mcdram_pct"))
				ent->mcdram_pct = _parse_size((char *) str_val);
			else if (!xstrcmp(iter.key, "mcdram_size"))
				ent->mcdram_size =
					_parse_size((char *) str_val);
		}
	}
}
794 
/*
 * Fill in a numa_cap_t record from the members of a JSON object.
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * An integer "nid" member sets ent->nid and a string "numa_cfg" member is
 * copied into ent->numa_cfg.  All other members are ignored.
 */
static void _json_parse_numa_cap_object(json_object *jobj, numa_cap_t *ent)
{
	struct json_object_iter iter;
	enum json_type val_type;
	const char *str_val;
	int64_t int_val;

	json_object_object_foreachC(jobj, iter) {
		val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			int_val = json_object_get_int64(iter.val);
			if (!xstrcmp(iter.key, "nid"))
				ent->nid = int_val;
		} else if (val_type == json_type_string) {
			str_val = json_object_get_string(iter.val);
			if (!xstrcmp(iter.key, "numa_cfg"))
				ent->numa_cfg = xstrdup(str_val);
		}
	}
}
822 
/*
 * Fill in a numa_cfg_t record from the members of a JSON object.
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * An integer "nid" member sets ent->nid and a string "numa_cfg" member is
 * copied into ent->numa_cfg.  All other members are ignored.
 */
static void _json_parse_numa_cfg_object(json_object *jobj, numa_cfg_t *ent)
{
	struct json_object_iter iter;
	enum json_type val_type;
	const char *str_val;
	int64_t int_val;

	json_object_object_foreachC(jobj, iter) {
		val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			int_val = json_object_get_int64(iter.val);
			if (!xstrcmp(iter.key, "nid"))
				ent->nid = int_val;
		} else if (val_type == json_type_string) {
			str_val = json_object_get_string(iter.val);
			if (!xstrcmp(iter.key, "numa_cfg"))
				ent->numa_cfg = xstrdup(str_val);
		}
	}
}
850 
_json_parse_mcdram_cap_array(json_object * jobj,char * key,int * num)851 static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key,
852 						  int *num)
853 {
854 	json_object *jarray;
855 	json_object *jvalue;
856 	mcdram_cap_t *ents;
857 	int i;
858 
859 	jarray = jobj;
860 	json_object_object_get_ex(jobj, key, &jarray);
861 
862 	*num = json_object_array_length(jarray);
863 	ents = xmalloc(*num * sizeof(mcdram_cap_t));
864 
865 	for (i = 0; i < *num; i++) {
866 		jvalue = json_object_array_get_idx(jarray, i);
867 		_json_parse_mcdram_cap_object(jvalue, &ents[i]);
868 	}
869 
870 	return ents;
871 }
872 
873 /* Return NID string for all nodes with specified MCDRAM mode (HBM percentage).
874  * NOTE: Information not returned for nodes which are not up
875  * NOTE: xfree() the return value. */
_load_mcdram_type(int cache_pct)876 static char *_load_mcdram_type(int cache_pct)
877 {
878 	char **script_argv, *resp_msg;
879 	int i, status = 0;
880 	DEF_TIMERS;
881 
882 	if (cache_pct < 0)	/* Unsupported configuration on this system */
883 		return NULL;
884 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
885 	script_argv[0] = xstrdup("cnselect");
886 	script_argv[1] = xstrdup("-e");
887 	xstrfmtcat(script_argv[2], "hbmcachepct.eq.%d", cache_pct);
888 	START_TIMER;
889 	resp_msg = _run_script(cnselect_path, script_argv, &status);
890 	END_TIMER;
891 	if (debug_flag) {
892 		info("%s: %s %s %s ran for %s", __func__,
893 		     script_argv[0], script_argv[1], script_argv[2], TIME_STR);
894 	}
895 	if (resp_msg == NULL) {
896 		debug("%s: %s %s %s returned no information",
897 		      __func__, script_argv[0], script_argv[1], script_argv[2]);
898 	} else {
899 		i = strlen(resp_msg);
900 		if (resp_msg[i-1] == '\n')
901 			resp_msg[i-1] = '\0';
902 	}
903 	_log_script_argv(script_argv, resp_msg);
904 	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
905 		error("%s: %s %s %s status:%u response:%s", __func__,
906 		      script_argv[0], script_argv[1], script_argv[2],
907 		      status, resp_msg);
908 	}
909 	_free_script_argv(script_argv);
910 	return resp_msg;
911 }
912 
913 /* Return table of MCDRAM modes and NID string identifying nodes with that mode.
914  * Use _mcdram_cfg2_free() to release returned data structure */
_load_current_mcdram(int * num)915 static mcdram_cfg2_t *_load_current_mcdram(int *num)
916 {
917 	mcdram_cfg2_t *mcdram_cfg;
918 	int i;
919 
920 	mcdram_cfg = xmalloc(sizeof(mcdram_cfg2_t) * 4);
921 
922 	for (i = 0; i < 4; i++) {
923 		mcdram_cfg[i].cache_pct = mcdram_pct[i];
924 		mcdram_cfg[i].mcdram_cfg = _knl_mcdram_str(KNL_CACHE << i);
925 		mcdram_cfg[i].nid_str = _load_mcdram_type(mcdram_cfg[i].cache_pct);
926 		if (mcdram_cfg[i].nid_str && mcdram_cfg[i].nid_str[0]) {
927 			mcdram_cfg[i].node_bitmap = bit_alloc(100000);
928 			(void) bit_unfmt(mcdram_cfg[i].node_bitmap,
929 					 mcdram_cfg[i].nid_str);
930 		}
931 	}
932 	*num = 4;
933 	return mcdram_cfg;
934 }
935 
936 /* Return NID string for all nodes with specified NUMA mode.
937  * NOTE: Information not returned for nodes which are not up
938  * NOTE: xfree() the return value. */
_load_numa_type(char * type)939 static char *_load_numa_type(char *type)
940 {
941 	char **script_argv, *resp_msg;
942 	int i, status = 0;
943 	DEF_TIMERS;
944 
945 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
946 	script_argv[0] = xstrdup("cnselect");
947 	script_argv[1] = xstrdup("-e");
948 	xstrfmtcat(script_argv[2], "numa_cfg.eq.%s", type);
949 	START_TIMER;
950 	resp_msg = _run_script(cnselect_path, script_argv, &status);
951 	END_TIMER;
952 	if (debug_flag) {
953 		info("%s: %s %s %s ran for %s", __func__,
954 		     script_argv[0], script_argv[1], script_argv[2], TIME_STR);
955 	}
956 	if (resp_msg == NULL) {
957 		debug("%s: %s %s %s returned no information",
958 		      __func__, script_argv[0], script_argv[1], script_argv[2]);
959 	} else {
960 		i = strlen(resp_msg);
961 		if (resp_msg[i-1] == '\n')
962 			resp_msg[i-1] = '\0';
963 	}
964 	_log_script_argv(script_argv, resp_msg);
965 	if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
966 		error("%s: %s %s %s status:%u response:%s", __func__,
967 		      script_argv[0], script_argv[1], script_argv[2],
968 		      status, resp_msg);
969 	}
970 	_free_script_argv(script_argv);
971 	return resp_msg;
972 }
973 
974 /* Return table of NUMA modes and NID string identifying nodes with that mode.
975  * Use _numa_cfg2_free() to release returned data structure */
_load_current_numa(int * num)976 static numa_cfg2_t *_load_current_numa(int *num)
977 {
978 	numa_cfg2_t *numa_cfg2;
979 	int i;
980 
981 	numa_cfg2 = xmalloc(sizeof(numa_cfg2_t) * 5);
982 	numa_cfg2[0].numa_cfg = xstrdup("a2a");
983 	numa_cfg2[1].numa_cfg = xstrdup("snc2");
984 	numa_cfg2[2].numa_cfg = xstrdup("snc4");
985 	numa_cfg2[3].numa_cfg = xstrdup("hemi");
986 	numa_cfg2[4].numa_cfg = xstrdup("quad");
987 
988 	for (i = 0; i < 5; i++) {
989 		numa_cfg2[i].nid_str = _load_numa_type(numa_cfg2[i].numa_cfg);
990 		if (numa_cfg2[i].nid_str && numa_cfg2[i].nid_str[0]) {
991 			numa_cfg2[i].node_bitmap = bit_alloc(100000);
992 			(void) bit_unfmt(numa_cfg2[i].node_bitmap,
993 					 numa_cfg2[i].nid_str);
994 		}
995 	}
996 	*num = 5;
997 	return numa_cfg2;
998 }
999 
_json_parse_mcdram_cfg_array(json_object * jobj,char * key,int * num)1000 static mcdram_cfg_t *_json_parse_mcdram_cfg_array(json_object *jobj, char *key,
1001 						  int *num)
1002 {
1003 	json_object *jarray;
1004 	json_object *jvalue;
1005 	mcdram_cfg_t *ents;
1006 	int i;
1007 
1008 	jarray = jobj;
1009 	json_object_object_get_ex(jobj, key, &jarray);
1010 
1011 	*num = json_object_array_length(jarray);
1012 	ents = xmalloc(*num * sizeof(mcdram_cfg_t));
1013 
1014 	for (i = 0; i < *num; i++) {
1015 		jvalue = json_object_array_get_idx(jarray, i);
1016 		_json_parse_mcdram_cfg_object(jvalue, &ents[i]);
1017 	}
1018 
1019 	return ents;
1020 }
1021 
_json_parse_numa_cap_array(json_object * jobj,char * key,int * num)1022 static numa_cap_t *_json_parse_numa_cap_array(json_object *jobj, char *key,
1023 					      int *num)
1024 {
1025 	json_object *jarray;
1026 	json_object *jvalue;
1027 	numa_cap_t *ents;
1028 	int i;
1029 
1030 	jarray = jobj;
1031 	json_object_object_get_ex(jobj, key, &jarray);
1032 
1033 	*num = json_object_array_length(jarray);
1034 	ents = xmalloc(*num * sizeof(numa_cap_t));
1035 
1036 	for (i = 0; i < *num; i++) {
1037 		jvalue = json_object_array_get_idx(jarray, i);
1038 		_json_parse_numa_cap_object(jvalue, &ents[i]);
1039 	}
1040 
1041 	return ents;
1042 }
1043 
_json_parse_numa_cfg_array(json_object * jobj,char * key,int * num)1044 static numa_cfg_t *_json_parse_numa_cfg_array(json_object *jobj, char *key,
1045 					      int *num)
1046 {
1047 	json_object *jarray;
1048 	json_object *jvalue;
1049 	numa_cfg_t *ents;
1050 	int i;
1051 
1052 	jarray = jobj;
1053 	json_object_object_get_ex(jobj, key, &jarray);
1054 
1055 	*num = json_object_array_length(jarray);
1056 	ents = xmalloc(*num * sizeof(numa_cfg_t));
1057 
1058 	for (i = 0; i < *num; i++) {
1059 		jvalue = json_object_array_get_idx(jarray, i);
1060 		_json_parse_numa_cfg_object(jvalue, &ents[i]);
1061 	}
1062 
1063 	return ents;
1064 }
1065 
1066 /* Log a command's arguments. */
_log_script_argv(char ** script_argv,char * resp_msg)1067 static void _log_script_argv(char **script_argv, char *resp_msg)
1068 {
1069 	char *cmd_line = NULL;
1070 	int i;
1071 
1072 	if (!debug_flag)
1073 		return;
1074 
1075 	for (i = 0; script_argv[i]; i++) {
1076 		if (i)
1077 			xstrcat(cmd_line, " ");
1078 		xstrcat(cmd_line, script_argv[i]);
1079 	}
1080 	info("%s", cmd_line);
1081 	if (resp_msg && resp_msg[0])
1082 		info("%s", resp_msg);
1083 	xfree(cmd_line);
1084 }
1085 
/*
 * Release a table of MCDRAM capability records
 * IN mcdram_cap - table to release, may be NULL
 * IN mcdram_cap_cnt - number of records in the table
 */
static void _mcdram_cap_free(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt)
{
	int inx = 0;

	if (mcdram_cap == NULL)
		return;

	/* Each record owns its mcdram_cfg string */
	while (inx < mcdram_cap_cnt) {
		xfree(mcdram_cap[inx].mcdram_cfg);
		inx++;
	}
	xfree(mcdram_cap);
}
1097 
/*
 * Log every record of a table of MCDRAM capability records
 * IN mcdram_cap - table to log, may be NULL
 * IN mcdram_cap_cnt - number of records in the table
 */
static void _mcdram_cap_log(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt)
{
	mcdram_cap_t *rec;
	int inx;

	if (mcdram_cap == NULL)
		return;

	for (inx = 0; inx < mcdram_cap_cnt; inx++) {
		rec = &mcdram_cap[inx];
		info("MCDRAM_CAP[%d]: nid:%u mcdram_cfg:%s",
		     inx, rec->nid, rec->mcdram_cfg);
	}
}
1109 
/*
 * Release a table of MCDRAM configuration records
 * IN mcdram_cfg - table to release, may be NULL
 * IN mcdram_cfg_cnt - number of records in the table
 */
static void _mcdram_cfg_free(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt)
{
	mcdram_cfg_t *rec;
	int inx;

	if (mcdram_cfg == NULL)
		return;

	/* Each record owns its mcdram_cfg string */
	for (inx = 0; inx < mcdram_cfg_cnt; inx++) {
		rec = &mcdram_cfg[inx];
		xfree(rec->mcdram_cfg);
	}
	xfree(mcdram_cfg);
}
1121 
/*
 * Release a table of per-mode MCDRAM records
 * IN mcdram_cfg2 - table to release, may be NULL
 * IN mcdram_cfg2_cnt - number of records in the table
 */
static void _mcdram_cfg2_free(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt)
{
	mcdram_cfg2_t *rec;
	int inx;

	if (mcdram_cfg2 == NULL)
		return;

	/* Each record owns two strings and an optional node bitmap */
	for (inx = 0; inx < mcdram_cfg2_cnt; inx++) {
		rec = &mcdram_cfg2[inx];
		xfree(rec->mcdram_cfg);
		FREE_NULL_BITMAP(rec->node_bitmap);
		xfree(rec->nid_str);
	}
	xfree(mcdram_cfg2);
}
1135 
/*
 * Log every record of a table of MCDRAM configuration records
 * IN mcdram_cfg - table to log, may be NULL
 * IN mcdram_cfg_cnt - number of records in the table
 */
static void _mcdram_cfg_log(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt)
{
	mcdram_cfg_t *rec;
	int inx;

	if (mcdram_cfg == NULL)
		return;

	for (inx = 0; inx < mcdram_cfg_cnt; inx++) {
		rec = &mcdram_cfg[inx];
		info("MCDRAM_CFG[%d]: nid:%u dram_size:%"PRIu64" mcdram_cfg:%s mcdram_pct:%u mcdram_size:%"PRIu64,
		     inx, rec->nid, rec->dram_size, rec->mcdram_cfg,
		     rec->mcdram_pct, rec->mcdram_size);
	}
}
1149 
/*
 * Log every record of a table of per-mode MCDRAM records
 * IN mcdram_cfg2 - table to log, may be NULL
 * IN mcdram_cfg2_cnt - number of records in the table
 */
static void _mcdram_cfg2_log(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt)
{
	mcdram_cfg2_t *rec;
	int inx;

	if (mcdram_cfg2 == NULL)
		return;

	for (inx = 0; inx < mcdram_cfg2_cnt; inx++) {
		rec = &mcdram_cfg2[inx];
		info("MCDRAM_CFG[%d]: nid_str:%s mcdram_cfg:%s cache_pct:%d",
		     inx, rec->nid_str, rec->mcdram_cfg, rec->cache_pct);
	}
}
1162 
/*
 * Release a table of NUMA capability records
 * IN numa_cap - table to release, may be NULL
 * IN numa_cap_cnt - number of records in the table
 */
static void _numa_cap_free(numa_cap_t *numa_cap, int numa_cap_cnt)
{
	int inx = 0;

	if (numa_cap == NULL)
		return;

	/* Each record owns its numa_cfg string */
	while (inx < numa_cap_cnt) {
		xfree(numa_cap[inx].numa_cfg);
		inx++;
	}
	xfree(numa_cap);
}
1174 
/*
 * Log every record of a table of NUMA capability records
 * IN numa_cap - table to log, may be NULL
 * IN numa_cap_cnt - number of records in the table
 */
static void _numa_cap_log(numa_cap_t *numa_cap, int numa_cap_cnt)
{
	numa_cap_t *rec;
	int inx;

	if (numa_cap == NULL)
		return;

	for (inx = 0; inx < numa_cap_cnt; inx++) {
		rec = &numa_cap[inx];
		info("NUMA_CAP[%d]: nid:%u numa_cfg:%s",
		     inx, rec->nid, rec->numa_cfg);
	}
}
1186 
/*
 * Release a table of NUMA configuration records
 * IN numa_cfg - table to release, may be NULL
 * IN numa_cfg_cnt - number of records in the table
 */
static void _numa_cfg_free(numa_cfg_t *numa_cfg, int numa_cfg_cnt)
{
	numa_cfg_t *rec;
	int inx;

	if (numa_cfg == NULL)
		return;

	/* Each record owns its numa_cfg string */
	for (inx = 0; inx < numa_cfg_cnt; inx++) {
		rec = &numa_cfg[inx];
		xfree(rec->numa_cfg);
	}
	xfree(numa_cfg);
}
1198 
/*
 * Release a table of per-mode NUMA records
 * IN numa_cfg2 - table to release, may be NULL
 * IN numa_cfg2_cnt - number of records in the table
 */
static void _numa_cfg2_free(numa_cfg2_t *numa_cfg2, int numa_cfg2_cnt)
{
	numa_cfg2_t *rec;
	int inx;

	if (numa_cfg2 == NULL)
		return;

	/* Each record owns two strings and an optional node bitmap */
	for (inx = 0; inx < numa_cfg2_cnt; inx++) {
		rec = &numa_cfg2[inx];
		xfree(rec->nid_str);
		xfree(rec->numa_cfg);
		FREE_NULL_BITMAP(rec->node_bitmap);
	}
	xfree(numa_cfg2);
}
1212 
/*
 * Log every record of a table of NUMA configuration records
 * IN numa_cfg - table to log, may be NULL
 * IN numa_cfg_cnt - number of records in the table
 */
static void _numa_cfg_log(numa_cfg_t *numa_cfg, int numa_cfg_cnt)
{
	numa_cfg_t *rec;
	int inx;

	if (numa_cfg == NULL)
		return;

	for (inx = 0; inx < numa_cfg_cnt; inx++) {
		rec = &numa_cfg[inx];
		info("NUMA_CFG[%d]: nid:%u numa_cfg:%s",
		     inx, rec->nid, rec->numa_cfg);
	}
}
1224 
/*
 * Log every record of a table of per-mode NUMA records
 * IN numa_cfg2 - table to log, may be NULL
 * IN numa_cfg2_cnt - number of records in the table
 */
static void _numa_cfg2_log(numa_cfg2_t *numa_cfg2, int numa_cfg2_cnt)
{
	numa_cfg2_t *rec;
	int inx;

	if (numa_cfg2 == NULL)
		return;

	for (inx = 0; inx < numa_cfg2_cnt; inx++) {
		rec = &numa_cfg2[inx];
		info("NUMA_CFG[%d]: nid_str:%s numa_cfg:%s",
		     inx, rec->nid_str, rec->numa_cfg);
	}
}
1236 
1237 /* Run a script and return its stdout plus exit status */
_run_script(char * cmd_path,char ** script_argv,int * status)1238 static char *_run_script(char *cmd_path, char **script_argv, int *status)
1239 {
1240 	int cc, i, new_wait, resp_size = 0, resp_offset = 0;
1241 	pid_t cpid;
1242 	char *resp = NULL;
1243 	int pfd[2] = { -1, -1 };
1244 
1245 	if (access(cmd_path, R_OK | X_OK) < 0) {
1246 		error("%s: %s can not be executed: %m", __func__, cmd_path);
1247 		*status = 127;
1248 		resp = xstrdup("Slurm node_features/knl_cray configuration error");
1249 		return resp;
1250 	}
1251 	if (pipe(pfd) != 0) {
1252 		error("%s: pipe(): %m", __func__);
1253 		*status = 127;
1254 		resp = xstrdup("System error");
1255 		return resp;
1256 	}
1257 
1258 	if ((cpid = fork()) == 0) {
1259 		cc = sysconf(_SC_OPEN_MAX);
1260 		dup2(pfd[1], STDERR_FILENO);
1261 		dup2(pfd[1], STDOUT_FILENO);
1262 		for (i = 0; i < cc; i++) {
1263 			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
1264 				close(i);
1265 		}
1266 		setpgid(0, 0);
1267 		execv(cmd_path, script_argv);
1268 		error("%s: execv(%s): %m", __func__, cmd_path);
1269 		_exit(127);
1270 	} else if (cpid < 0) {
1271 		close(pfd[0]);
1272 		close(pfd[1]);
1273 		error("%s: fork(): %m", __func__);
1274 	} else {
1275 		struct pollfd fds;
1276 		struct timeval tstart;
1277 		resp_size = 1024;
1278 		resp = xmalloc(resp_size);
1279 		close(pfd[1]);
1280 		gettimeofday(&tstart, NULL);
1281 		while (1) {
1282 			if (slurmctld_config.shutdown_time) {
1283 				error("%s: killing %s operation on shutdown",
1284 				      __func__, script_argv[1]);
1285 				break;
1286 			}
1287 			fds.fd = pfd[0];
1288 			fds.events = POLLIN | POLLHUP | POLLRDHUP;
1289 			fds.revents = 0;
1290 			new_wait = capmc_timeout - _tot_wait(&tstart);
1291 			if (new_wait <= 0) {
1292 				error("%s: %s poll timeout @ %d msec",
1293 				      __func__, script_argv[1], capmc_timeout);
1294 				break;
1295 			}
1296 			new_wait = MIN(new_wait, MAX_POLL_WAIT);
1297 			i = poll(&fds, 1, new_wait);
1298 			if (i == 0) {
1299 				continue;
1300 			} else if (i < 0) {
1301 				error("%s: %s poll:%m", __func__,
1302 				      script_argv[1]);
1303 				break;
1304 			}
1305 			if ((fds.revents & POLLIN) == 0)
1306 				break;
1307 			i = read(pfd[0], resp + resp_offset,
1308 				 resp_size - resp_offset);
1309 			if (i == 0) {
1310 				break;
1311 			} else if (i < 0) {
1312 				if (errno == EAGAIN)
1313 					continue;
1314 				error("%s: read(%s): %m", __func__, cmd_path);
1315 				break;
1316 			} else {
1317 				resp_offset += i;
1318 				if (resp_offset + 1024 >= resp_size) {
1319 					resp_size *= 2;
1320 					resp = xrealloc(resp, resp_size);
1321 				}
1322 			}
1323 		}
1324 		killpg(cpid, SIGTERM);
1325 		usleep(10000);
1326 		killpg(cpid, SIGKILL);
1327 		waitpid(cpid, status, 0);
1328 		close(pfd[0]);
1329 	}
1330 	return resp;
1331 }
1332 
_merge_strings(char ** node_features,char * node_cfg,uint16_t allow_types)1333 static void _merge_strings(char **node_features, char *node_cfg,
1334 			   uint16_t allow_types)
1335 {
1336 	char *tmp_str1, *tok1, *save_ptr1 = NULL;
1337 	char *tmp_str2, *tok2, *save_ptr2 = NULL;
1338 	bool mcdram_filter = false, numa_filter = false;
1339 
1340 	if ((node_cfg == NULL) || (node_cfg[0] == '\0'))
1341 		return;
1342 	if (*node_features == NULL) {
1343 		*node_features = xstrdup(node_cfg);
1344 		return;
1345 	}
1346 
1347 	if ((allow_types &  KNL_MCDRAM_FLAG) &&
1348 	    (allow_types != KNL_MCDRAM_FLAG))
1349 		mcdram_filter = true;
1350 	if ((allow_types &  KNL_NUMA_FLAG) &&
1351 	    (allow_types != KNL_NUMA_FLAG))
1352 		numa_filter = true;
1353 
1354 	/* Merge strings and avoid duplicates */
1355 	tmp_str1 = xstrdup(node_cfg);
1356 	tok1 = strtok_r(tmp_str1, ",", &save_ptr1);
1357 	while (tok1) {
1358 		bool match = false;
1359 		if (mcdram_filter &&
1360 		    ((_knl_mcdram_token(tok1) & allow_types) == 0))
1361 			goto next_tok;
1362 		if (numa_filter &&
1363 		    ((_knl_numa_token(tok1) & allow_types) == 0))
1364 			goto next_tok;
1365 		tmp_str2 = xstrdup(*node_features);
1366 		tok2 = strtok_r(tmp_str2, ",", &save_ptr2);
1367 		while (tok2) {
1368 			if (!xstrcmp(tok1, tok2)) {
1369 				match = true;
1370 				break;
1371 			}
1372 			tok2 = strtok_r(NULL, ",", &save_ptr2);
1373 		}
1374 		xfree(tmp_str2);
1375 		if (!match)
1376 			xstrfmtcat(*node_features, ",%s", tok1);
1377 next_tok:	tok1 = strtok_r(NULL, ",", &save_ptr1);
1378 	}
1379 	xfree(tmp_str1);
1380 }
1381 
/*
 * Drain a node whose KNL modes are invalid, with reason "Invalid KNL modes"
 * IN node_ptr - node to drain
 */
static void _make_node_down(node_record_t *node_ptr)
{
	if (!avail_node_bitmap) {
		/*
		 * In process of initial slurmctld startup,
		 * node data structures not completely built yet
		 */
		node_ptr->node_state |= NODE_STATE_DRAIN;
		/* Release any earlier reason rather than leaking it */
		xfree(node_ptr->reason);
		node_ptr->reason = xstrdup("Invalid KNL modes");
		node_ptr->reason_time = time(NULL);
		node_ptr->reason_uid = getuid();
	} else {
		(void) drain_nodes(node_ptr->name, "Invalid KNL modes",
				   getuid());
	}
}
1398 
1399 /*
1400  * Determine that the actual KNL mode matches the available and current node
1401  * features, otherwise DRAIN the node
1402  */
_validate_node_features(node_record_t * node_ptr)1403 static void _validate_node_features(node_record_t *node_ptr)
1404 {
1405 	char *tmp_str, *tok, *save_ptr = NULL;
1406 	uint16_t actual_mcdram = 0, actual_numa = 0;
1407 	uint16_t config_mcdram = 0, config_numa = 0;
1408 	uint16_t count_mcdram = 0,  count_numa = 0;
1409 	uint16_t tmp_mcdram, tmp_numa;
1410 
1411 	if (!node_ptr->features || IS_NODE_DOWN(node_ptr))
1412 		return;
1413 
1414 	tmp_str = xstrdup(node_ptr->features);
1415 	tok = strtok_r(tmp_str, ",", &save_ptr);
1416 	while (tok) {
1417 		if ((tmp_mcdram = _knl_mcdram_token(tok))) {
1418 			config_mcdram |= tmp_mcdram;
1419 			count_mcdram++;
1420 		} else if ((tmp_numa = _knl_numa_token(tok))) {
1421 			config_numa |= tmp_numa;
1422 			count_numa++;
1423 		}
1424 		tok = strtok_r(NULL, ",", &save_ptr);
1425 	}
1426 	xfree(tmp_str);
1427 
1428 	tmp_str = xstrdup(node_ptr->features_act);
1429 	tok = strtok_r(tmp_str, ",", &save_ptr);
1430 	while (tok) {
1431 		if ((tmp_mcdram = _knl_mcdram_token(tok)))
1432 			actual_mcdram |= tmp_mcdram;
1433 		else if ((tmp_numa = _knl_numa_token(tok)))
1434 			actual_numa |= tmp_numa;
1435 		tok = strtok_r(NULL, ",", &save_ptr);
1436 	}
1437 	xfree(tmp_str);
1438 
1439 	if ((config_mcdram != actual_mcdram) || (count_mcdram != 1) ||
1440 	    (config_numa   != actual_numa)   || (count_numa != 1)) {
1441 		_make_node_down(node_ptr);
1442 		error("Invalid KNL modes on node %s", node_ptr->name);
1443 	}
1444 }
1445 
/*
 * Remove every KNL MCDRAM and NUMA mode name from a comma separated feature
 * string (used for nodes which are not KNL).
 * IN/OUT node_feature - feature string, replaced by the filtered list when
 *	at least one KNL name was found (NULL if nothing else remains),
 *	otherwise left untouched
 * RET count of KNL features found
 */
static int _strip_knl_features(char **node_feature)
{
	char *work, *tok, *state = NULL;
	char *kept = NULL, *delim = "";
	int removed = 0;

	xassert(node_feature);
	if (!*node_feature)
		return 0;

	work = xstrdup(*node_feature);
	for (tok = strtok_r(work, ",", &state); tok;
	     tok = strtok_r(NULL, ",", &state)) {
		if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
			removed++;
			continue;
		}
		/* Not a KNL name, carry it over to the new list */
		xstrfmtcat(kept, "%s%s", delim, tok);
		delim = ",";
	}
	xfree(work);

	if (removed) {
		/* Something was dropped, install the filtered list */
		xfree(*node_feature);
		*node_feature = kept;
	} else {
		/* Nothing dropped, the rebuilt list is not needed */
		xfree(kept);
	}
	return removed;
}
1479 
1480 /* Update features and features_act fields for ALL nodes based upon
1481  * its current configuration provided by capmc */
_update_all_node_features(mcdram_cap_t * mcdram_cap,int mcdram_cap_cnt,mcdram_cfg_t * mcdram_cfg,int mcdram_cfg_cnt,numa_cap_t * numa_cap,int numa_cap_cnt,numa_cfg_t * numa_cfg,int numa_cfg_cnt)1482 static void _update_all_node_features(
1483 				mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
1484 				mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
1485 				numa_cap_t *numa_cap, int numa_cap_cnt,
1486 				numa_cfg_t *numa_cfg, int numa_cfg_cnt)
1487 {
1488 	node_record_t *node_ptr;
1489 	char node_name[32], *prefix;
1490 	int i, node_inx, numa_inx, width = 5;
1491 	uint64_t mcdram_size;
1492 
1493 	if ((node_record_table_ptr == NULL) ||
1494 	    (node_record_table_ptr->name == NULL)) {
1495 		prefix = xstrdup("nid");
1496 	} else {
1497 		prefix = xstrdup(node_record_table_ptr->name);
1498 		for (i = 0; prefix[i]; i++) {
1499 			if ((prefix[i] >= '0') && (prefix[i] <= '9')) {
1500 				prefix[i] = '\0';
1501 				width = 1;
1502 				for (i++ ; prefix[i]; i++)
1503 					width++;
1504 				break;
1505 			}
1506 		}
1507 	}
1508 	if (mcdram_cap) {
1509 		if (!knl_node_bitmap)
1510 			knl_node_bitmap = bit_alloc(node_record_count);
1511 		for (i = 0; i < mcdram_cap_cnt; i++) {
1512 			snprintf(node_name, sizeof(node_name),
1513 				 "%s%.*d", prefix, width, mcdram_cap[i].nid);
1514 			node_ptr = find_node_record(node_name);
1515 			if (node_ptr) {
1516 				node_inx = node_ptr - node_record_table_ptr;
1517 				bit_set(knl_node_bitmap, node_inx);
1518 				if (validate_mode == 0) {
1519 					_merge_strings(&node_ptr->features,
1520 						       mcdram_cap[i].mcdram_cfg,
1521 						       allow_mcdram);
1522 				}
1523 			}
1524 		}
1525 	}
1526 	if (mcdram_cfg) {
1527 		for (i = 0; i < mcdram_cfg_cnt; i++) {
1528 			snprintf(node_name, sizeof(node_name),
1529 				 "%s%.*d", prefix, width, mcdram_cfg[i].nid);
1530 			if (!(node_ptr = find_node_record(node_name)))
1531 				continue;
1532 			mcdram_per_node[node_ptr - node_record_table_ptr] =
1533 				mcdram_cfg[i].mcdram_size;
1534 			_merge_strings(&node_ptr->features_act,
1535 				       mcdram_cfg[i].mcdram_cfg,
1536 				       allow_mcdram);
1537 			mcdram_size = mcdram_cfg[i].mcdram_size *
1538 				      (100 - mcdram_cfg[i].mcdram_pct) / 100;
1539 			if (!node_ptr->gres) {
1540 				node_ptr->gres =
1541 					xstrdup(node_ptr->config_ptr->gres);
1542 			}
1543 			gres_plugin_node_feature(node_ptr->name, "hbm",
1544 						 mcdram_size, &node_ptr->gres,
1545 						 &node_ptr->gres_list);
1546 		}
1547 	}
1548 	if (numa_cap && (validate_mode == 0)) {
1549 		for (i = 0; i < numa_cap_cnt; i++) {
1550 			snprintf(node_name, sizeof(node_name),
1551 				 "%s%.*d", prefix, width, numa_cap[i].nid);
1552 			node_ptr = find_node_record(node_name);
1553 			if (node_ptr) {
1554 				_merge_strings(&node_ptr->features,
1555 					       numa_cap[i].numa_cfg,
1556 					       allow_numa);
1557 			}
1558 		}
1559 	}
1560 	if (numa_cfg) {
1561 		for (i = 0; i < numa_cfg_cnt; i++) {
1562 			snprintf(node_name, sizeof(node_name),
1563 				 "%s%.*u", prefix, width, numa_cfg[i].nid);
1564 			node_ptr = find_node_record(node_name);
1565 			if (node_ptr) {
1566 				_merge_strings(&node_ptr->features_act,
1567 					       numa_cfg[i].numa_cfg,
1568 					       allow_numa);
1569 				numa_inx = _knl_numa_inx(numa_cfg[i].numa_cfg);
1570 				if ((numa_inx >= 0) && cpu_bind[numa_inx])
1571 					node_ptr->cpu_bind = cpu_bind[numa_inx];
1572 			}
1573 		}
1574 	}
1575 
1576 	/*
1577 	 * Make sure that only nodes reported by "capmc get_mcdram_capabilities"
1578 	 * contain KNL features
1579 	 */
1580 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
1581 	     i++, node_ptr++) {
1582 		if (knl_node_bitmap && bit_test(knl_node_bitmap, i)) {
1583 			if (validate_mode)
1584 				_validate_node_features(node_ptr);
1585 			continue;
1586 		}
1587 		node_inx = _strip_knl_features(&node_ptr->features) +
1588 			   _strip_knl_features(&node_ptr->features_act);
1589 		if (node_inx) {
1590 			error("Removed KNL features from non-KNL node %s",
1591 			      node_ptr->name);
1592 		}
1593 		if (!node_ptr->gres)
1594 			node_ptr->gres = xstrdup(node_ptr->config_ptr->gres);
1595 		gres_plugin_node_feature(node_ptr->name, "hbm", 0,
1596 					 &node_ptr->gres, &node_ptr->gres_list);
1597 	}
1598 
1599 	xfree(prefix);
1600 }
1601 
1602 /*
1603  * Update a specific node's features and features_act fields based upon
1604  * its current configuration provided by capmc
1605  */
_update_node_features(node_record_t * node_ptr,mcdram_cap_t * mcdram_cap,int mcdram_cap_cnt,mcdram_cfg_t * mcdram_cfg,int mcdram_cfg_cnt,numa_cap_t * numa_cap,int numa_cap_cnt,numa_cfg_t * numa_cfg,int numa_cfg_cnt)1606 static void _update_node_features(node_record_t *node_ptr,
1607 				  mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
1608 				  mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
1609 				  numa_cap_t *numa_cap, int numa_cap_cnt,
1610 				  numa_cfg_t *numa_cfg, int numa_cfg_cnt)
1611 {
1612 	int i, nid, node_inx, numa_inx;
1613 	char *end_ptr = "";
1614 	uint64_t mcdram_size;
1615 	bitstr_t *node_bitmap = NULL;
1616 	bool is_knl = false;
1617 
1618 	xassert(node_ptr);
1619 	nid = strtol(node_ptr->name + 3, &end_ptr, 10);
1620 	if (end_ptr[0] != '\0') {
1621 		error("%s: Invalid node name (%s)", __func__, node_ptr->name);
1622 		return;
1623 	}
1624 
1625 	_strip_knl_opts(&node_ptr->features);
1626 	if (node_ptr->features && !node_ptr->features_act)
1627 		node_ptr->features_act = xstrdup(node_ptr->features);
1628 	_strip_knl_opts(&node_ptr->features_act);
1629 
1630 	if (mcdram_cap && (validate_mode == 0)) {
1631 		for (i = 0; i < mcdram_cap_cnt; i++) {
1632 			if (nid == mcdram_cap[i].nid) {
1633 				_merge_strings(&node_ptr->features,
1634 					       mcdram_cap[i].mcdram_cfg,
1635 					       allow_mcdram);
1636 				is_knl = true;
1637 				break;
1638 			}
1639 		}
1640 	}
1641 
1642 	if (mcdram_cfg) {
1643 		for (i = 0; i < mcdram_cfg_cnt; i++) {
1644 			if (nid != mcdram_cfg[i].nid)
1645 				continue;
1646 			_merge_strings(&node_ptr->features_act,
1647 				       mcdram_cfg[i].mcdram_cfg, allow_mcdram);
1648 
1649 			mcdram_per_node[node_ptr - node_record_table_ptr] =
1650 				mcdram_cfg[i].mcdram_size;
1651 			mcdram_size = mcdram_cfg[i].mcdram_size *
1652 				      (100 - mcdram_cfg[i].mcdram_pct) / 100;
1653 			if (!node_ptr->gres) {
1654 				node_ptr->gres =
1655 					xstrdup(node_ptr->config_ptr->gres);
1656 			}
1657 			if (!node_ptr->gres) {
1658 				node_ptr->gres =
1659 					xstrdup(node_ptr->config_ptr->gres);
1660 			}
1661 			gres_plugin_node_feature(node_ptr->name, "hbm",
1662 						 mcdram_size, &node_ptr->gres,
1663 						 &node_ptr->gres_list);
1664 			break;
1665 		}
1666 	}
1667 	if (numa_cap && (validate_mode == 0)) {
1668 		for (i = 0; i < numa_cap_cnt; i++) {
1669 			if (nid == numa_cap[i].nid) {
1670 				_merge_strings(&node_ptr->features,
1671 					       numa_cap[i].numa_cfg,
1672 					       allow_numa);
1673 				break;
1674 			}
1675 		}
1676 	}
1677 	if (numa_cfg) {
1678 		for (i = 0; i < numa_cfg_cnt; i++) {
1679 			if (nid == numa_cfg[i].nid) {
1680 				_merge_strings(&node_ptr->features_act,
1681 					       numa_cfg[i].numa_cfg,
1682 					       allow_numa);
1683 				numa_inx = _knl_numa_inx(numa_cfg[i].numa_cfg);
1684 				if ((numa_inx >= 0) && cpu_bind[numa_inx])
1685 					node_ptr->cpu_bind = cpu_bind[numa_inx];
1686 				break;
1687 			}
1688 		}
1689 	}
1690 
1691 	/* Make sure that only nodes reported by "capmc get_mcdram_capabilities"
1692 	 * contain KNL features */
1693 	if (is_knl) {
1694 		if (validate_mode)
1695 			_validate_node_features(node_ptr);
1696 	} else {
1697 		node_inx = _strip_knl_features(&node_ptr->features) +
1698 			   _strip_knl_features(&node_ptr->features_act);
1699 		if (node_inx) {
1700 			error("Removed KNL features from non-KNL node %s",
1701 			      node_ptr->name);
1702 		}
1703 		if (!node_ptr->gres) {
1704 			node_ptr->gres =
1705 				xstrdup(node_ptr->config_ptr->gres);
1706 		}
1707 		gres_plugin_node_feature(node_ptr->name, "hbm", 0,
1708 					 &node_ptr->gres, &node_ptr->gres_list);
1709 	}
1710 
1711 	/* Update bitmaps and lists used by slurmctld for scheduling */
1712 	node_bitmap = bit_alloc(node_record_count);
1713 	bit_set(node_bitmap, (node_ptr - node_record_table_ptr));
1714 	update_feature_list(active_feature_list, node_ptr->features_act,
1715 			    node_bitmap);
1716 	(void) node_features_p_node_update(node_ptr->features_act, node_bitmap);
1717 	FREE_NULL_BITMAP(node_bitmap);
1718 }
1719 
_make_uid_array(char * uid_str)1720 static void _make_uid_array(char *uid_str)
1721 {
1722 	char *save_ptr = NULL, *tmp_str, *tok;
1723 	int i, uid_cnt = 0;
1724 
1725 	if (!uid_str)
1726 		return;
1727 
1728 	/* Count the number of users */
1729 	for (i = 0; uid_str[i]; i++) {
1730 		if (uid_str[i] == ',')
1731 			uid_cnt++;
1732 	}
1733 	uid_cnt++;
1734 
1735 	allowed_uid = xmalloc(sizeof(uid_t) * uid_cnt);
1736 	allowed_uid_cnt = 0;
1737 	tmp_str = xstrdup(uid_str);
1738 	tok = strtok_r(tmp_str, ",", &save_ptr);
1739 	while (tok) {
1740 		if (uid_from_string(tok, &allowed_uid[allowed_uid_cnt++]) < 0)
1741 			error("knl_cray.conf: Invalid AllowUserBoot: %s", tok);
1742 		tok = strtok_r(NULL, ",", &save_ptr);
1743 	}
1744 	xfree(tmp_str);
1745 }
1746 
/*
 * Build a printable, comma separated "name(uid)" list from a UID array
 * IN uid_array - UIDs to print
 * IN uid_cnt - number of entries in uid_array
 * RET "ALL" if the array is empty, otherwise the list.
 *	xfree() the return value.
 */
static char *_make_uid_str(uid_t *uid_array, int uid_cnt)
{
	char *sep = "", *tmp_str = NULL, *uid_str = NULL;
	int i;

	/*
	 * Decide on the arguments actually passed in rather than on the
	 * global allowed_uid_cnt, so the result always describes uid_array
	 */
	if (!uid_array || (uid_cnt <= 0)) {
		uid_str = xstrdup("ALL");
		return uid_str;
	}

	for (i = 0; i < uid_cnt; i++) {
		tmp_str = uid_to_string(uid_array[i]);
		/* Print the UID as unsigned so large values stay positive */
		xstrfmtcat(uid_str, "%s%s(%u)", sep, tmp_str,
			   (unsigned int) uid_array[i]);
		xfree(tmp_str);
		sep = ",";
	}

	return uid_str;
}
1766 
1767 /* Watch for Uncorrectable Memory Errors. Notify jobs if any detected */
_ume_agent(void * args)1768 static void *_ume_agent(void *args)
1769 {
1770 	struct timespec req;
1771 	int i, mc_num, csrow_num, ue_count, last_ue_count = -1;
1772 	int *fd = NULL, fd_cnt = 0, fd_size = 0, ume_path_size;
1773 	char buf[8], *ume_path;
1774 	ssize_t rd_size;
1775 
1776 	/* Identify and open array of UME file descriptors */
1777 	ume_path_size = strlen(mc_path) + 32;
1778 	ume_path = xmalloc(ume_path_size);
1779 	for (mc_num = 0; ; mc_num++) {
1780 		for (csrow_num = 0; ; csrow_num++) {
1781 			if (fd_cnt == fd_size) {
1782 				fd_size += 64;
1783 				fd = xrealloc(fd, sizeof(int) * fd_size);
1784 			}
1785 			snprintf(ume_path, ume_path_size,
1786 				 "%s/mc%d/csrow%d/ue_count",
1787 				 mc_path, mc_num, csrow_num);
1788 			if ((fd[fd_cnt] = open(ume_path, 0)) >= 0)
1789 				fd_cnt++;
1790 			else
1791 				break;
1792 		}
1793 		if (csrow_num == 0)
1794 			break;
1795 	}
1796 	xfree(ume_path);
1797 
1798 	while (!shutdown_time) {
1799 		/* Get current UME count */
1800 		ue_count = 0;
1801 		for (i = 0; i < fd_cnt; i++) {
1802 			(void) lseek(fd[i], 0, SEEK_SET);
1803 			rd_size = read(fd[i], buf, 7);
1804 			if (rd_size <= 0)
1805 				continue;
1806 			buf[rd_size] = '\0';
1807 			ue_count += atoi(buf);
1808 		}
1809 
1810 		if (shutdown_time)
1811 			break;
1812 		/* If UME count changed, notify all steps */
1813 		if ((last_ue_count < ue_count) && (last_ue_count != -1)) {
1814 			i = ume_notify();
1815 			error("UME error detected. Notified %d job steps", i);
1816 		}
1817 		last_ue_count = ue_count;
1818 
1819 		if (shutdown_time)
1820 			break;
1821 		/* Sleep before retry */
1822 		req.tv_sec  =  ume_check_interval / USEC_IN_SEC;
1823 		req.tv_nsec = (ume_check_interval % USEC_IN_SEC) *
1824 			      NSEC_IN_USEC;
1825 		(void) nanosleep(&req, NULL);
1826 	}
1827 
1828 	for (i = 0; i < fd_cnt; i++)
1829 		(void) close(fd[i]);
1830 	xfree(fd);
1831 
1832 	return NULL;
1833 }
1834 
1835 /* Load configuration */
init(void)1836 extern int init(void)
1837 {
1838 	char *allow_mcdram_str, *allow_numa_str, *allow_user_str;
1839 	char *default_mcdram_str, *default_numa_str;
1840 	char *knl_conf_file, *tmp_str = NULL;
1841 	s_p_hashtbl_t *tbl;
1842 	struct stat stat_buf;
1843 	int i;
1844 
1845 	/* Set default values */
1846 	allow_mcdram = KNL_MCDRAM_FLAG;
1847 	allow_numa = KNL_NUMA_FLAG;
1848 	xfree(allowed_uid);
1849 	allowed_uid_cnt = 0;
1850 	xfree(capmc_path);
1851 	capmc_poll_freq = 45;
1852 	capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
1853 	for (i = 0; i < KNL_NUMA_CNT; i++)
1854 		cpu_bind[i] = 0;
1855 	xfree(cnselect_path);
1856 	debug_flag = false;
1857 	default_mcdram = KNL_CACHE;
1858 	default_numa = KNL_ALL2ALL;
1859 	xfree(mc_path);
1860 	for (i = 0; i < KNL_MCDRAM_CNT; i++)
1861 		mcdram_pct[i] = -1;
1862 	mcdram_set = 0;
1863 	xfree(numa_cpu_bind);
1864 	xfree(syscfg_path);
1865 
1866 	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
1867 		debug_flag = true;
1868 
1869 	knl_conf_file = get_extra_conf_path("knl_cray.conf");
1870 	if ((stat(knl_conf_file, &stat_buf) == 0) &&
1871 	    (tbl = _config_make_tbl(knl_conf_file))) {
1872 		if (s_p_get_string(&tmp_str, "AllowMCDRAM", tbl)) {
1873 			allow_mcdram = _knl_mcdram_parse(tmp_str, ",");
1874 			if (_knl_mcdram_bits_cnt(allow_mcdram) < 1) {
1875 				fatal("knl_cray.conf: Invalid AllowMCDRAM=%s",
1876 				      tmp_str);
1877 			}
1878 			xfree(tmp_str);
1879 		}
1880 		if (s_p_get_string(&tmp_str, "AllowNUMA", tbl)) {
1881 			allow_numa = _knl_numa_parse(tmp_str, ",");
1882 			if (_knl_numa_bits_cnt(allow_numa) < 1) {
1883 				fatal("knl_cray.conf: Invalid AllowNUMA=%s",
1884 				      tmp_str);
1885 			}
1886 			xfree(tmp_str);
1887 		}
1888 		if (s_p_get_string(&tmp_str, "AllowUserBoot", tbl)) {
1889 			_make_uid_array(tmp_str);
1890 			xfree(tmp_str);
1891 		}
1892 		(void) s_p_get_uint32(&boot_time, "BootTime", tbl);
1893 		(void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
1894 		(void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
1895 		(void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
1896 		(void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
1897 		(void) s_p_get_string(&cnselect_path, "CnselectPath", tbl);
1898 		if (s_p_get_string(&tmp_str, "DefaultMCDRAM", tbl)) {
1899 			default_mcdram = _knl_mcdram_parse(tmp_str, ",");
1900 			if (_knl_mcdram_bits_cnt(default_mcdram) != 1) {
1901 				fatal("knl_cray.conf: Invalid DefaultMCDRAM=%s",
1902 				      tmp_str);
1903 			}
1904 			xfree(tmp_str);
1905 		}
1906 		if (s_p_get_string(&tmp_str, "DefaultNUMA", tbl)) {
1907 			default_numa = _knl_numa_parse(tmp_str, ",");
1908 			if (_knl_numa_bits_cnt(default_numa) != 1) {
1909 				fatal("knl_cray.conf: Invalid DefaultNUMA=%s",
1910 				      tmp_str);
1911 			}
1912 			xfree(tmp_str);
1913 		}
1914 		(void) s_p_get_string(&mc_path, "McPath", tbl);
1915 		(void) s_p_get_uint32(&node_reboot_weight, "NodeRebootWeight",
1916 				      tbl);
1917 		if (s_p_get_string(&numa_cpu_bind, "NumaCpuBind", tbl))
1918 			_update_cpu_bind();
1919 		(void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
1920 		(void) s_p_get_uint32(&ume_check_interval, "UmeCheckInterval",
1921 				      tbl);
1922 		(void) s_p_get_uint32(&validate_mode, "ValidateMode", tbl);
1923 		s_p_hashtbl_destroy(tbl);
1924 	} else {
1925 		error("something wrong with opening/reading knl_cray.conf");
1926 	}
1927 	xfree(knl_conf_file);
1928 	if (!capmc_path)
1929 		capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc");
1930 	capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT);
1931 	if (!cnselect_path)
1932 		cnselect_path = xstrdup("/opt/cray/sdb/default/bin/cnselect");
1933 	if (!mc_path)
1934 		mc_path = xstrdup("/sys/devices/system/edac/mc");
1935 	if (!syscfg_path)
1936 		verbose("SyscfgPath is not configured");
1937 
1938 	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES) {
1939 		allow_mcdram_str = _knl_mcdram_str(allow_mcdram);
1940 		allow_numa_str = _knl_numa_str(allow_numa);
1941 		allow_user_str = _make_uid_str(allowed_uid, allowed_uid_cnt);
1942 		default_mcdram_str = _knl_mcdram_str(default_mcdram);
1943 		default_numa_str = _knl_numa_str(default_numa);
1944 		info("AllowMCDRAM=%s AllowNUMA=%s",
1945 		     allow_mcdram_str, allow_numa_str);
1946 		info("AllowUserBoot=%s", allow_user_str);
1947 		info("BootTIme=%u", boot_time);
1948 		info("CapmcPath=%s", capmc_path);
1949 		info("CapmcPollFreq=%u sec", capmc_poll_freq);
1950 		info("CapmcRetries=%u", capmc_retries);
1951 		info("CapmcTimeout=%u msec", capmc_timeout);
1952 		info("CnselectPath=%s", cnselect_path);
1953 		info("DefaultMCDRAM=%s DefaultNUMA=%s",
1954 		     default_mcdram_str, default_numa_str);
1955 		info("McPath=%s", mc_path);
1956 		info("NodeRebootWeight=%u", node_reboot_weight);
1957 		info("NumaCpuBind=%s", numa_cpu_bind);
1958 		info("SyscfgPath=%s", syscfg_path);
1959 		info("UmeCheckInterval=%u", ume_check_interval);
1960 		info("ValidateMode=%u", validate_mode);
1961 		xfree(allow_mcdram_str);
1962 		xfree(allow_numa_str);
1963 		xfree(allow_user_str);
1964 		xfree(default_mcdram_str);
1965 		xfree(default_numa_str);
1966 	}
1967 	gres_plugin_add("hbm");
1968 
1969 	if (ume_check_interval && running_in_slurmd()) {
1970 		slurm_mutex_lock(&ume_mutex);
1971 		slurm_thread_create(&ume_thread, _ume_agent, NULL);
1972 		slurm_mutex_unlock(&ume_mutex);
1973 	}
1974 
1975 	slurm_mutex_lock(&queue_mutex);
1976 	if (queue_thread == 0) {
1977 		/* since we do a join on this later we don't make it detached */
1978 		slurm_thread_create(&queue_thread, _queue_agent, NULL);
1979 	}
1980 	slurm_mutex_unlock(&queue_mutex);
1981 
1982 	return SLURM_SUCCESS;
1983 }
1984 
1985 /* Release allocated memory */
fini(void)1986 extern int fini(void)
1987 {
1988 	shutdown_time = time(NULL);
1989 	slurm_mutex_lock(&ume_mutex);
1990 	if (ume_thread) {
1991 		pthread_join(ume_thread, NULL);
1992 		ume_thread = 0;
1993 	}
1994 	slurm_mutex_unlock(&ume_mutex);
1995 	pthread_join(queue_thread, NULL);
1996 	slurm_mutex_lock(&queue_mutex);
1997 	xfree(node_list_queue);	/* just drop requessts */
1998 	shutdown_time = (time_t) 0;
1999 	queue_thread = 0;
2000 	slurm_mutex_unlock(&queue_mutex);
2001 
2002 	xfree(allowed_uid);
2003 	allowed_uid_cnt = 0;
2004 	xfree(capmc_path);
2005 	xfree(cnselect_path);
2006 	capmc_timeout = 0;
2007 	debug_flag = false;
2008 	xfree(mc_path);
2009 	xfree(mcdram_per_node);
2010 	xfree(numa_cpu_bind);
2011 	xfree(syscfg_path);
2012 	FREE_NULL_BITMAP(knl_node_bitmap);
2013 
2014 	return SLURM_SUCCESS;
2015 }
2016 
2017 /* Reload configuration */
node_features_p_reconfig(void)2018 extern int node_features_p_reconfig(void)
2019 {
2020 	slurm_mutex_lock(&config_mutex);
2021 	reconfig = true;
2022 	slurm_mutex_unlock(&config_mutex);
2023 	return SLURM_SUCCESS;
2024 }
2025 
2026 /* Put any nodes NOT found by "capmc node_status" into DRAIN state */
_check_node_status(void)2027 static void _check_node_status(void)
2028 {
2029 	json_object *j_obj;
2030 	json_object_iter iter;
2031 	json_object *j_array = NULL;
2032 	json_object *j_value;
2033 	char *resp_msg, **script_argv;
2034 	int i, nid, num_ent, retry, status = 0;
2035 	node_record_t *node_ptr;
2036 	bitstr_t *capmc_node_bitmap = NULL;
2037 	DEF_TIMERS;
2038 
2039 	script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2040 	script_argv[0] = xstrdup("capmc");
2041 	script_argv[1] = xstrdup("node_status");
2042 	for (retry = 0; ; retry++) {
2043 		START_TIMER;
2044 		resp_msg = _run_script(capmc_path, script_argv, &status);
2045 		END_TIMER;
2046 		if (debug_flag)
2047 			info("%s: node_status ran for %s", __func__, TIME_STR);
2048 		_log_script_argv(script_argv, resp_msg);
2049 		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2050 			break;	/* Success */
2051 		error("%s: node_status status:%u response:%s",
2052 		      __func__, status, resp_msg);
2053 		if (resp_msg == NULL) {
2054 			info("%s: node_status returned no information",
2055 			     __func__);
2056 			_free_script_argv(script_argv);
2057 			return;
2058 		}
2059 		if (strstr(resp_msg, "Could not lookup") &&
2060 		    (retry <= capmc_retries)) {
2061 			/* State Manager is down. Sleep and retry */
2062 			sleep(1);
2063 			xfree(resp_msg);
2064 		} else {
2065 			xfree(resp_msg);
2066 			_free_script_argv(script_argv);
2067 			return;
2068 		}
2069 	}
2070 	_free_script_argv(script_argv);
2071 
2072 	j_obj = json_tokener_parse(resp_msg);
2073 	if (j_obj == NULL) {
2074 		error("%s: json parser failed on %s", __func__, resp_msg);
2075 		xfree(resp_msg);
2076 		return;
2077 	}
2078 	xfree(resp_msg);
2079 
2080 	capmc_node_bitmap = bit_alloc(100000);
2081 	json_object_object_foreachC(j_obj, iter) {
2082 		/* NOTE: The error number "e" and message "err_msg"
2083 		 * fields are currently ignored. */
2084 		if (!xstrcmp(iter.key, "e") ||
2085 		    !xstrcmp(iter.key, "err_msg"))
2086 			continue;
2087 		if (json_object_get_type(iter.val) != json_type_array)
2088 			continue;
2089 		json_object_object_get_ex(j_obj, iter.key, &j_array);
2090 		if (!j_array) {
2091 			error("%s: Unable to parse nid specification",
2092 			      __func__);
2093 			FREE_NULL_BITMAP(capmc_node_bitmap);
2094 			return;
2095 		}
2096 		num_ent = json_object_array_length(j_array);
2097 		for (i = 0; i < num_ent; i++) {
2098 			j_value = json_object_array_get_idx(j_array, i);
2099 			if (json_object_get_type(j_value) !=
2100 			    json_type_int) {
2101 				error("%s: Unable to parse nid specification",
2102 				      __func__);
2103 			} else {
2104 				nid = json_object_get_int64(j_value);
2105 				if ((nid >= 0) && (nid < 100000))
2106 					bit_set(capmc_node_bitmap, nid);
2107 			}
2108 		}
2109 	}
2110 	json_object_put(j_obj);	/* Frees json memory */
2111 
2112 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
2113 	     i++, node_ptr++) {
2114 		nid = atoi(node_ptr->name + 3);	/* Skip "nid" */
2115 		if ((nid < 0) || (nid >= 100000) ||
2116 		    bit_test(capmc_node_bitmap, nid))
2117 			continue;
2118 		info("Node %s not found by \'capmc node_status\', draining it",
2119 		     node_ptr->name);
2120 		if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
2121 			continue;
2122 		node_ptr->node_state |= NODE_STATE_DRAIN;
2123 		xfree(node_ptr->reason);
2124 		node_ptr->reason = xstrdup("Node not found by capmc");
2125 		node_ptr->reason_time = time(NULL);
2126 		node_ptr->reason_uid = slurm_get_slurm_user_id();
2127 		if (avail_node_bitmap)
2128 			bit_clear(avail_node_bitmap, i);
2129 	}
2130 	FREE_NULL_BITMAP(capmc_node_bitmap);
2131 }
2132 
/*
 * Put any disabled nodes into DRAIN state.
 *
 * NOTE: Not yet implemented; this function currently does nothing. The
 * comment in its body outlines the steps planned for the implementation.
 */
static void _check_node_disabled(void)
{
/* FIXME: To be added
 *
 * STEP 0 (for testing), disable/enable nodes:
 * > xtcli disable ${TARGET_NODE}
 * > xtcli enable ${TARGET_NODE}
 *
 * STEP 1: Identify disabled compute nodes
 * > xtshow --compute --disabled
 * L1s ...
 * L0s ...
 * Nodes ...
 * c0-0c0s7n0:    -|  disabled  [noflags|]
 * SeaStars ...
 * Links ...
 * c1-0c2s1s1l1:    -|  disabled  [noflags|]
 *
 * STEP 2: Map cname to nid name
 * > rtr -Im ${TARGET_BLADE}
 *
 * STEP 3: Drain the disabled compute nodes
 * See logic in _check_node_status() above.
 */
}
2159 
2160 /* Periodically update node information for specified nodes. We can't do this
2161  * work in real-time since capmc takes multiple seconds to execute. */
_queue_agent(void * args)2162 extern void *_queue_agent(void *args)
2163 {
2164 	char *node_list;
2165 
2166 	while (shutdown_time == 0) {
2167 		sleep(1);
2168 		if (shutdown_time)
2169 			break;
2170 
2171 		if (node_list_queue &&
2172 		    (difftime(time(NULL), node_time_queue) >= 30)) {
2173 			slurm_mutex_lock(&queue_mutex);
2174 			node_list = node_list_queue;
2175 			node_list_queue = NULL;
2176 			node_time_queue = (time_t) 0;
2177 			slurm_mutex_unlock(&queue_mutex);
2178 			(void) _update_node_state(node_list, true);
2179 			xfree(node_list);
2180 		}
2181 	}
2182 
2183 	return NULL;
2184 }
2185 
2186 /* Queue request to update node information */
_queue_node_update(char * node_list)2187 static int _queue_node_update(char *node_list)
2188 {
2189 	slurm_mutex_lock(&queue_mutex);
2190 	if (node_time_queue == 0)
2191 		node_time_queue = time(NULL);
2192 	if (node_list_queue)
2193 		xstrcat(node_list_queue, ",");
2194 	xstrcat(node_list_queue, node_list);
2195 	slurm_mutex_unlock(&queue_mutex);
2196 
2197 	return SLURM_SUCCESS;
2198 }
2199 
2200 /* Update active and available features on specified nodes.
2201  * If node_list is NULL then update ALL nodes now.
2202  * If node_list is not NULL, then queue a request to update select nodes later.
2203  */
node_features_p_get_node(char * node_list)2204 extern int node_features_p_get_node(char *node_list)
2205 {
2206 	if (node_list &&		/* Selected node to be update */
2207 	    mcdram_per_node &&		/* and needed global info is */
2208 	    (mcdram_pct[0] != -1))	/* already available */
2209 		return _queue_node_update(node_list);
2210 
2211 	return _update_node_state(node_list, false);
2212 }
2213 
_update_node_state(char * node_list,bool set_locks)2214 static int _update_node_state(char *node_list, bool set_locks)
2215 {
2216 	json_object *j;
2217 	json_object_iter iter;
2218 	int i, k, rc = SLURM_SUCCESS, retry, status = 0;
2219 	DEF_TIMERS;
2220 	char *resp_msg, **script_argv;
2221 	mcdram_cap_t *mcdram_cap = NULL;
2222 	mcdram_cfg_t *mcdram_cfg = NULL;
2223 	mcdram_cfg2_t *mcdram_cfg2 = NULL;
2224 	numa_cap_t *numa_cap = NULL;
2225 	numa_cfg_t *numa_cfg = NULL;
2226 	numa_cfg2_t *numa_cfg2 = NULL;
2227 	int mcdram_cap_cnt = 0, mcdram_cfg_cnt = 0, mcdram_cfg2_cnt = 0;
2228 	int numa_cap_cnt = 0, numa_cfg_cnt = 0, numa_cfg2_cnt = 0;
2229 	node_record_t *node_ptr;
2230 	hostlist_t host_list;
2231 	char *node_name;
2232 
2233 	slurm_mutex_lock(&config_mutex);
2234 	if (reconfig) {
2235 		(void) init();
2236 		reconfig = false;
2237 	}
2238 	slurm_mutex_unlock(&config_mutex);
2239 
2240 	_check_node_status();	/* Drain nodes not found by capmc */
2241 	_check_node_disabled();	/* Drain disabled nodes */
2242 
2243 	if (!mcdram_per_node)
2244 		mcdram_per_node = xmalloc(sizeof(uint64_t) * node_record_count);
2245 
2246 	/*
2247 	 * Load available MCDRAM capabilities
2248 	 */
2249 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
2250 	script_argv[0] = xstrdup("capmc");
2251 	script_argv[1] = xstrdup("get_mcdram_capabilities");
2252 	for (retry = 0; ; retry++) {
2253 		START_TIMER;
2254 		resp_msg = _run_script(capmc_path, script_argv, &status);
2255 		END_TIMER;
2256 		if (debug_flag) {
2257 			info("%s: get_mcdram_capabilities ran for %s",
2258 			     __func__, TIME_STR);
2259 		}
2260 		_log_script_argv(script_argv, resp_msg);
2261 		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2262 			break;	/* Success */
2263 		error("%s: get_mcdram_capabilities status:%u response:%s",
2264 		      __func__, status, resp_msg);
2265 		if (resp_msg == NULL) {
2266 			info("%s: get_mcdram_capabilities returned no information",
2267 			     __func__);
2268 			_free_script_argv(script_argv);
2269 			rc = SLURM_ERROR;
2270 			goto fini;
2271 		}
2272 		if (strstr(resp_msg, "Could not lookup") &&
2273 		    (retry <= capmc_retries)) {
2274 			/* State Manager is down. Sleep and retry */
2275 			sleep(1);
2276 			xfree(resp_msg);
2277 		} else {
2278 			xfree(resp_msg);
2279 			_free_script_argv(script_argv);
2280 			rc = SLURM_ERROR;
2281 			goto fini;
2282 		}
2283 	}
2284 	_free_script_argv(script_argv);
2285 
2286 	j = json_tokener_parse(resp_msg);
2287 	if (j == NULL) {
2288 		error("%s: json parser failed on %s", __func__, resp_msg);
2289 		xfree(resp_msg);
2290 		rc = SLURM_ERROR;
2291 		goto fini;
2292 	}
2293 	xfree(resp_msg);
2294 	json_object_object_foreachC(j, iter) {
2295 		if (xstrcmp(iter.key, "nids"))
2296 			continue;
2297 		mcdram_cap = _json_parse_mcdram_cap_array(j, iter.key,
2298 							  &mcdram_cap_cnt);
2299 		break;
2300 	}
2301 	json_object_put(j);	/* Frees json memory */
2302 
2303 	/*
2304 	 * Load current MCDRAM configuration
2305 	 */
2306 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
2307 	script_argv[0] = xstrdup("capmc");
2308 	script_argv[1] = xstrdup("get_mcdram_cfg");
2309 	for (retry = 0; ; retry++) {
2310 		START_TIMER;
2311 		resp_msg = _run_script(capmc_path, script_argv, &status);
2312 		END_TIMER;
2313 		if (debug_flag) {
2314 			info("%s: get_mcdram_cfg ran for %s",
2315 			     __func__, TIME_STR);
2316 		}
2317 		_log_script_argv(script_argv, resp_msg);
2318 		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2319 			break;	/* Success */
2320 		error("%s: get_mcdram_cfg status:%u response:%s",
2321 		      __func__, status, resp_msg);
2322 		if (resp_msg == NULL) {
2323 			info("%s: get_mcdram_cfg returned no information",
2324 			     __func__);
2325 			_free_script_argv(script_argv);
2326 			rc = SLURM_ERROR;
2327 			goto fini;
2328 		}
2329 		if (strstr(resp_msg, "Could not lookup") &&
2330 		    (retry <= capmc_retries)) {
2331 			/* State Manager is down. Sleep and retry */
2332 			sleep(1);
2333 			xfree(resp_msg);
2334 		} else {
2335 			xfree(resp_msg);
2336 			_free_script_argv(script_argv);
2337 			rc = SLURM_ERROR;
2338 			goto fini;
2339 		}
2340 	}
2341 	_free_script_argv(script_argv);
2342 
2343 	j = json_tokener_parse(resp_msg);
2344 	if (j == NULL) {
2345 		error("%s: json parser failed on %s", __func__, resp_msg);
2346 		xfree(resp_msg);
2347 		rc = SLURM_ERROR;
2348 		goto fini;
2349 	}
2350 	xfree(resp_msg);
2351 	json_object_object_foreachC(j, iter) {
2352 		if (xstrcmp(iter.key, "nids"))
2353 			continue;
2354 		mcdram_cfg = _json_parse_mcdram_cfg_array(j, iter.key,
2355 							  &mcdram_cfg_cnt);
2356 		break;
2357 	}
2358 	json_object_put(j);	/* Frees json memory */
2359 
2360 	mcdram_cfg2 = _load_current_mcdram(&mcdram_cfg2_cnt);
2361 
2362 	/*
2363 	 * Load available NUMA capabilities
2364 	 */
2365 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
2366 	script_argv[0] = xstrdup("capmc");
2367 	script_argv[1] = xstrdup("get_numa_capabilities");
2368 	for (retry = 0; ; retry++) {
2369 		START_TIMER;
2370 		resp_msg = _run_script(capmc_path, script_argv, &status);
2371 		END_TIMER;
2372 		if (debug_flag) {
2373 			info("%s: get_numa_capabilities ran for %s",
2374 			     __func__, TIME_STR);
2375 		}
2376 		_log_script_argv(script_argv, resp_msg);
2377 		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2378 			break;	/* Success */
2379 		error("%s: get_numa_capabilities status:%u response:%s",
2380 		      __func__, status, resp_msg);
2381 		if (resp_msg == NULL) {
2382 			info("%s: get_numa_capabilities returned no information",
2383 			     __func__);
2384 			_free_script_argv(script_argv);
2385 			rc = SLURM_ERROR;
2386 			goto fini;
2387 		}
2388 		if (strstr(resp_msg, "Could not lookup") &&
2389 		    (retry <= capmc_retries)) {
2390 			/* State Manager is down. Sleep and retry */
2391 			sleep(1);
2392 			xfree(resp_msg);
2393 		} else {
2394 			xfree(resp_msg);
2395 			_free_script_argv(script_argv);
2396 			rc = SLURM_ERROR;
2397 			goto fini;
2398 		}
2399 	}
2400 	_free_script_argv(script_argv);
2401 
2402 	j = json_tokener_parse(resp_msg);
2403 	if (j == NULL) {
2404 		error("%s: json parser failed on %s", __func__, resp_msg);
2405 		xfree(resp_msg);
2406 		rc = SLURM_ERROR;
2407 		goto fini;
2408 	}
2409 	xfree(resp_msg);
2410 	json_object_object_foreachC(j, iter) {
2411 		if (xstrcmp(iter.key, "nids"))
2412 			continue;
2413 		numa_cap = _json_parse_numa_cap_array(j, iter.key,
2414 						      &numa_cap_cnt);
2415 		break;
2416 	}
2417 	json_object_put(j);	/* Frees json memory */
2418 
2419 	/*
2420 	 * Load current NUMA configuration
2421 	 */
2422 	script_argv = xmalloc(sizeof(char *) * 4);	/* NULL terminated */
2423 	script_argv[0] = xstrdup("capmc");
2424 	script_argv[1] = xstrdup("get_numa_cfg");
2425 	for (retry = 0; ; retry++) {
2426 		START_TIMER;
2427 		resp_msg = _run_script(capmc_path, script_argv, &status);
2428 		END_TIMER;
2429 		if (debug_flag)
2430 			info("%s: get_numa_cfg ran for %s", __func__, TIME_STR);
2431 		_log_script_argv(script_argv, resp_msg);
2432 		if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2433 			break;	/* Success */
2434 		error("%s: get_numa_cfg status:%u response:%s",
2435 		      __func__, status, resp_msg);
2436 		if (resp_msg == NULL) {
2437 			info("%s: get_numa_cfg returned no information",
2438 			     __func__);
2439 			_free_script_argv(script_argv);
2440 			rc = SLURM_ERROR;
2441 			goto fini;
2442 		}
2443 		if (strstr(resp_msg, "Could not lookup") &&
2444 		    (retry <= capmc_retries)) {
2445 			/* State Manager is down. Sleep and retry */
2446 			sleep(1);
2447 			xfree(resp_msg);
2448 		} else {
2449 			xfree(resp_msg);
2450 			_free_script_argv(script_argv);
2451 			rc = SLURM_ERROR;
2452 			goto fini;
2453 		}
2454 	}
2455 	_free_script_argv(script_argv);
2456 
2457 	j = json_tokener_parse(resp_msg);
2458 	if (j == NULL) {
2459 		error("%s: json parser failed on %s", __func__, resp_msg);
2460 		xfree(resp_msg);
2461 		rc = SLURM_ERROR;
2462 		goto fini;
2463 	}
2464 	xfree(resp_msg);
2465 	json_object_object_foreachC(j, iter) {
2466 		if (xstrcmp(iter.key, "nids"))
2467 			continue;
2468 		numa_cfg = _json_parse_numa_cfg_array(j, iter.key,
2469 						      &numa_cfg_cnt);
2470 		break;
2471 	}
2472 	json_object_put(j);	/* Frees json memory */
2473 
2474 	numa_cfg2 = _load_current_numa(&numa_cfg2_cnt);
2475 
2476 	if (debug_flag) {
2477 		_mcdram_cap_log(mcdram_cap, mcdram_cap_cnt);
2478 		_mcdram_cfg_log(mcdram_cfg, mcdram_cfg_cnt);
2479 		_mcdram_cfg2_log(mcdram_cfg2, mcdram_cfg2_cnt);
2480 		_numa_cap_log(numa_cap, numa_cap_cnt);
2481 		_numa_cfg_log(numa_cfg, numa_cfg_cnt);
2482 		_numa_cfg2_log(numa_cfg2, numa_cfg2_cnt);
2483 	}
2484 	for (i = 0; i < mcdram_cfg_cnt; i++) {
2485 		for (k = 0; k < mcdram_cfg2_cnt; k++) {
2486 			if (!mcdram_cfg2[k].node_bitmap ||
2487 			    !bit_test(mcdram_cfg2[k].node_bitmap,
2488 				      mcdram_cfg[i].nid))
2489 				continue;
2490 			if (mcdram_cfg[i].mcdram_pct !=
2491 			    mcdram_cfg2[k].cache_pct) {
2492 				if (mcdram_cfg[i].mcdram_pct == NO_VAL16) {
2493 					info("%s: No mcdram_pct from capmc for nid %u",
2494 					     __func__, mcdram_cfg[i].nid);
2495 				} else {
2496 					info("%s: HBM mismatch between capmc "
2497 					     "and cnselect for nid %u (%u != %d)",
2498 					     __func__, mcdram_cfg[i].nid,
2499 					     mcdram_cfg[i].mcdram_pct,
2500 					     mcdram_cfg2[k].cache_pct);
2501 				}
2502 				mcdram_cfg[i].mcdram_pct =
2503 					mcdram_cfg2[k].cache_pct;
2504 				xfree(mcdram_cfg[i].mcdram_cfg);
2505 				mcdram_cfg[i].mcdram_cfg =
2506 					xstrdup(mcdram_cfg2[k].mcdram_cfg);
2507 			}
2508 			break;
2509 		}
2510 	}
2511 	for (i = 0; i < numa_cfg_cnt; i++) {
2512 		for (k = 0; k < numa_cfg2_cnt; k++) {
2513 			if (!numa_cfg2[k].node_bitmap ||
2514 			    !bit_test(numa_cfg2[k].node_bitmap,
2515 				      numa_cfg[i].nid))
2516 				continue;
2517 			if (xstrcmp(numa_cfg[i].numa_cfg,
2518 				    numa_cfg2[k].numa_cfg)) {
2519 				if (!numa_cfg[i].numa_cfg) {
2520 					info("%s: No numa_cfg from capmc for nid %u",
2521 					     __func__, numa_cfg[i].nid);
2522 				} else {
2523 					info("%s: NUMA mismatch between capmc "
2524 					     "and cnselect for nid %u (%s != %s)",
2525 					     __func__, numa_cfg[i].nid,
2526 					     numa_cfg[i].numa_cfg,
2527 					     numa_cfg2[k].numa_cfg);
2528 				}
2529 				xfree(numa_cfg[i].numa_cfg);
2530 				numa_cfg[i].numa_cfg =
2531 					xstrdup(numa_cfg2[k].numa_cfg);
2532 			}
2533 			break;
2534 		}
2535 	}
2536 
2537 	START_TIMER;
2538 	if (node_list) {
2539 		/* Write nodes */
2540 		slurmctld_lock_t write_nodes_lock = {
2541 			NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK};
2542 
2543 		if ((host_list = hostlist_create(node_list)) == NULL) {
2544 			error ("hostlist_create error on %s: %m", node_list);
2545 			goto fini;
2546 		}
2547 		hostlist_uniq(host_list);
2548 
2549 		if (set_locks)
2550 			lock_slurmctld(write_nodes_lock);
2551 		while ((node_name = hostlist_shift(host_list))) {
2552 			node_ptr = find_node_record(node_name);
2553 			if (node_ptr) {
2554 				_update_node_features(node_ptr,
2555 						      mcdram_cap,mcdram_cap_cnt,
2556 						      mcdram_cfg,mcdram_cfg_cnt,
2557 						      numa_cap, numa_cap_cnt,
2558 						      numa_cfg, numa_cfg_cnt);
2559 			}
2560 			free(node_name);
2561 		}
2562 		if (set_locks)
2563 			unlock_slurmctld(write_nodes_lock);
2564 		hostlist_destroy(host_list);
2565 	} else {
2566 		time_t now = time(NULL);
2567 		for (i = 0, node_ptr = node_record_table_ptr;
2568 		     i < node_record_count; i++, node_ptr++) {
2569 			if ((node_ptr->last_response > now) &&
2570 			    IS_NODE_NO_RESPOND(node_ptr)) {
2571 				/*
2572 				 * Reboot likely in progress.
2573 				 * Preserve active KNL features and merge
2574 				 * with configured non-KNL features
2575 				 */
2576 				_merge_strings(&node_ptr->features_act,
2577 					       node_ptr->features, 0);
2578 				continue;
2579 			}
2580 			if (validate_mode == 0) {
2581 				_strip_knl_opts(&node_ptr->features);
2582 				xfree(node_ptr->features_act);
2583 				if (node_ptr->features) {
2584 					node_ptr->features_act =
2585 						xstrdup(node_ptr->features);
2586 				}
2587 			} else {
2588 				if (node_ptr->features) {
2589 					node_ptr->features_act =
2590 						xstrdup(node_ptr->features);
2591 				}
2592 			}
2593 		}
2594 		_update_all_node_features(mcdram_cap, mcdram_cap_cnt,
2595 					  mcdram_cfg, mcdram_cfg_cnt,
2596 					  numa_cap, numa_cap_cnt,
2597 					  numa_cfg, numa_cfg_cnt);
2598 	}
2599 	END_TIMER;
2600 	if (debug_flag)
2601 		info("%s: update_node_features ran for %s", __func__, TIME_STR);
2602 
2603 	last_node_update = time(NULL);
2604 
2605 fini:	_mcdram_cap_free(mcdram_cap, mcdram_cap_cnt);
2606 	_mcdram_cfg_free(mcdram_cfg, mcdram_cfg_cnt);
2607 	_mcdram_cfg2_free(mcdram_cfg2, mcdram_cfg2_cnt);
2608 	_numa_cap_free(numa_cap, numa_cap_cnt);
2609 	_numa_cfg_free(numa_cfg, numa_cfg_cnt);
2610 	_numa_cfg2_free(numa_cfg2, numa_cfg2_cnt);
2611 
2612 	return rc;
2613 }
2614 
/*
 * Get this node's current and available MCDRAM and NUMA settings from BIOS.
 * avail_modes IN/OUT - append available modes, must be xfreed
 * current_mode IN/OUT - append current modes, must be xfreed
 *
 * NOTE: Not applicable on Cray systems (this implementation does nothing and
 * leaves both arguments untouched); can be used on other systems.
 *
 * NOTES about syscfg (from Intel):
 * To display the BIOS Parameters:
 * >> syscfg /d biossettings <"BIOS variable Name">
 *
 * To Set the BIOS Parameters:
 * >> syscfg /bcs <AdminPw> <"BIOS variable Name"> <Value>
 * Note: If AdminPw is not set use ""
 */
extern void node_features_p_node_state(char **avail_modes, char **current_mode)
{
	/* Not applicable on Cray systems: intentionally empty */
}
2633 
2634 /* Test if a job's feature specification is valid */
node_features_p_job_valid(char * job_features)2635 extern int node_features_p_job_valid(char *job_features)
2636 {
2637 	uint16_t job_mcdram, job_numa;
2638 	int mcdram_cnt, numa_cnt;
2639 	int last_mcdram_cnt = 0, last_numa_cnt = 0;
2640 	int rc = SLURM_SUCCESS;
2641 	char last_sep = '\0', *tmp, *tok, *save_ptr = NULL;
2642 
2643 	if ((job_features == NULL) || (job_features[0] == '\0'))
2644 		return SLURM_SUCCESS;
2645 
2646 	tmp = xstrdup(job_features);
2647 	tok = strtok_r(tmp, "[]()|", &save_ptr);
2648 	while (tok) {
2649 		last_sep = tok[strlen(tok) - 1];
2650 		job_mcdram = _knl_mcdram_parse(tok, "&,*");
2651 		mcdram_cnt = _knl_mcdram_bits_cnt(job_mcdram) + last_mcdram_cnt;
2652 		if (mcdram_cnt > 1) {	/* Multiple ANDed MCDRAM options */
2653 			rc = ESLURM_INVALID_KNL;
2654 			break;
2655 		}
2656 
2657 		job_numa = _knl_numa_parse(tok, "&,*");
2658 		numa_cnt = _knl_numa_bits_cnt(job_numa) + last_numa_cnt;
2659 		if (numa_cnt > 1) {	/* Multiple ANDed NUMA options */
2660 			rc = ESLURM_INVALID_KNL;
2661 			break;
2662 		}
2663 		tok = strtok_r(NULL, "[]()|", &save_ptr);
2664 		if (tok &&
2665 		    ((last_sep == '&') ||	/* e.g. "equal&(flat|cache)" */
2666 		     (tok[0] == '&'))) {	/* e.g. "(flat|cache)&equal" */
2667 			last_mcdram_cnt += mcdram_cnt;
2668 			last_numa_cnt += numa_cnt;
2669 		} else {
2670 			last_mcdram_cnt = 0;
2671 			last_numa_cnt = 0;
2672 		}
2673 	}
2674 	xfree(tmp);
2675 
2676 	return rc;
2677 }
2678 
/*
 * Translate a job's feature request to the node features needed at boot time.
 *	Only the first MCDRAM token and the first NUMA token found in the
 *	specification are kept; any "*count" suffix is removed from a token
 *	before it is examined.
 * IN job_features - job's --constraint specification
 * RET features required on node reboot (comma separated), NULL if none.
 *     Must xfree to release memory
 */
extern char *node_features_p_job_xlate(char *job_features)
{
	const char *delims = "[]()|&";
	const char *sep = "";
	char *node_features = NULL;
	char *copy, *save_ptr = NULL, *star, *tok;
	bool has_numa = false, has_mcdram = false;

	if (!job_features || !job_features[0])
		return node_features;

	copy = xstrdup(job_features);
	for (tok = strtok_r(copy, delims, &save_ptr); tok;
	     tok = strtok_r(NULL, delims, &save_ptr)) {
		bool knl_opt = false;

		/* Drop any node count suffix */
		star = strchr(tok, '*');
		if (star)
			*star = '\0';

		if (_knl_mcdram_token(tok) && !has_mcdram) {
			has_mcdram = true;
			knl_opt = true;
		}
		if (_knl_numa_token(tok) && !has_numa) {
			has_numa = true;
			knl_opt = true;
		}
		if (!knl_opt)
			continue;

		xstrfmtcat(node_features, "%s%s", sep, tok);
		sep = ",";
	}
	xfree(copy);

	return node_features;
}
2722 
2723 /* Return bitmap of KNL nodes, NULL if none identified */
node_features_p_get_node_bitmap(void)2724 extern bitstr_t *node_features_p_get_node_bitmap(void)
2725 {
2726 	if (knl_node_bitmap)
2727 		return bit_copy(knl_node_bitmap);
2728 	return NULL;
2729 }
2730 
2731 /* Return count of overlaping bits in active_bitmap and knl_node_bitmap */
node_features_p_overlap(bitstr_t * active_bitmap)2732 extern int node_features_p_overlap(bitstr_t *active_bitmap)
2733 {
2734 	int cnt = 0;
2735 
2736 	if (!knl_node_bitmap || !active_bitmap ||
2737 	    !(cnt = bit_overlap(active_bitmap, knl_node_bitmap)))
2738 		return 0;
2739 
2740 	return cnt;
2741 }
2742 
/*
 * Report whether the plugin requires PowerSave mode for booting nodes.
 * RET always true for this plugin
 */
extern bool node_features_p_node_power(void)
{
	const bool needs_power_save = true;

	return needs_power_save;
}
2748 
2749 /* Set's the node's active features based upon job constraints.
2750  * NOTE: Executed by the slurmd daemon.
2751  * NOTE: Not applicable for knl_cray plugin, reconfiguration done by slurmctld
2752  * IN active_features - New active features
2753  * RET error code */
node_features_p_node_set(char * active_features)2754 extern int node_features_p_node_set(char *active_features)
2755 {
2756 	return SLURM_SUCCESS;
2757 }
2758 
2759 /*
2760  * Note the active features associated with a set of nodes have been updated.
2761  * Specifically update the node's "hbm" GRES and "CpuBind" values as needed.
2762  * IN active_features - New active features
2763  * IN node_bitmap - bitmap of nodes changed
2764  * RET error code
2765  */
node_features_p_node_update(char * active_features,bitstr_t * node_bitmap)2766 extern int node_features_p_node_update(char *active_features,
2767 				       bitstr_t *node_bitmap)
2768 {
2769 	int i, i_first, i_last;
2770 	int rc = SLURM_SUCCESS, numa_inx = -1;
2771 	int mcdram_inx = 0;
2772 	uint64_t mcdram_size;
2773 	node_record_t *node_ptr;
2774 	char *save_ptr = NULL, *tmp, *tok;
2775 
2776 	if (mcdram_per_node == NULL)
2777 		error("%s: mcdram_per_node == NULL", __func__);
2778 
2779 	if (active_features) {
2780 		tmp = xstrdup(active_features);
2781 		tok = strtok_r(tmp, ",", &save_ptr);
2782 		while (tok) {
2783 			if (numa_inx == -1)
2784 				numa_inx = _knl_numa_inx(tok);
2785 			mcdram_inx |= _knl_mcdram_token(tok);
2786 			tok = strtok_r(NULL, ",", &save_ptr);
2787 		}
2788 		xfree(tmp);
2789 	}
2790 
2791 	if (mcdram_inx >= 0) {
2792 		for (i = 0; i < KNL_MCDRAM_CNT; i++) {
2793 			if ((KNL_CACHE << i) == mcdram_inx)
2794 				break;
2795 		}
2796 		if ((i >= KNL_MCDRAM_CNT) || (mcdram_pct[i] == -1))
2797 			mcdram_inx = -1;
2798 		else
2799 			mcdram_inx = i;
2800 	} else {
2801 		mcdram_inx = -1;
2802 	}
2803 
2804 	xassert(node_bitmap);
2805 	i_first = bit_ffs(node_bitmap);
2806 	if (i_first >= 0)
2807 		i_last = bit_fls(node_bitmap);
2808 	else
2809 		i_last = i_first - 1;
2810 	for (i = i_first; i <= i_last; i++) {
2811 		if (!bit_test(node_bitmap, i))
2812 			continue;
2813 		if (i >= node_record_count) {
2814 			error("%s: Invalid node index (%d >= %d)",
2815 			      __func__, i, node_record_count);
2816 			rc = SLURM_ERROR;
2817 			break;
2818 		}
2819 		node_ptr = node_record_table_ptr + i;
2820 		if ((numa_inx >= 0) && cpu_bind[numa_inx])
2821 			node_ptr->cpu_bind = cpu_bind[numa_inx];
2822 		if (mcdram_per_node && (mcdram_inx >= 0)) {
2823 			mcdram_size = mcdram_per_node[i] *
2824 				      (100 - mcdram_pct[mcdram_inx]) / 100;
2825 			gres_plugin_node_feature(node_ptr->name, "hbm",
2826 						 mcdram_size, &node_ptr->gres,
2827 						 &node_ptr->gres_list);
2828 		}
2829 	}
2830 
2831 	return rc;
2832 }
2833 
2834 /*
2835  * Return TRUE if the specified node update request is valid with respect
2836  * to features changes (i.e. don't permit a non-KNL node to set KNL features).
2837  *
2838  * arg IN - Pointer to node_record_t record
2839  * update_node_msg IN - Pointer to update request
2840  */
node_features_p_node_update_valid(void * arg,update_node_msg_t * update_node_msg)2841 extern bool node_features_p_node_update_valid(void *arg,
2842 					update_node_msg_t *update_node_msg)
2843 {
2844 	node_record_t *node_ptr = (node_record_t *) arg;
2845 	char *tmp, *save_ptr = NULL, *tok;
2846 	bool is_knl = false, invalid_feature = false;
2847 
2848 	/* No feature changes */
2849 	if (!update_node_msg->features && !update_node_msg->features_act)
2850 		return true;
2851 
2852 	/* Determine if this is KNL node based upon current features */
2853 	if (node_ptr->features && node_ptr->features[0]) {
2854 		tmp = xstrdup(node_ptr->features);
2855 		tok = strtok_r(tmp, ",", &save_ptr);
2856 		while (tok) {
2857 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2858 				is_knl = true;
2859 				break;
2860 			}
2861 			tok = strtok_r(NULL, ",", &save_ptr);
2862 		}
2863 		xfree(tmp);
2864 	}
2865 	if (is_knl)
2866 		return true;
2867 
2868 	/* Validate that AvailableFeatures update request has no KNL modes */
2869 	if (update_node_msg->features) {
2870 		tmp = xstrdup(update_node_msg->features);
2871 		tok = strtok_r(tmp, ",", &save_ptr);
2872 		while (tok) {
2873 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2874 				invalid_feature = true;
2875 				break;
2876 			}
2877 			tok = strtok_r(NULL, ",", &save_ptr);
2878 		}
2879 		xfree(tmp);
2880 		if (invalid_feature) {
2881 			info("Invalid AvailableFeatures update request (%s) for non-KNL node %s",
2882 			     update_node_msg->features, node_ptr->name);
2883 			return false;
2884 		}
2885 	}
2886 
2887 	/* Validate that ActiveFeatures update request has no KNL modes */
2888 	if (update_node_msg->features_act) {
2889 		tmp = xstrdup(update_node_msg->features_act);
2890 		tok = strtok_r(tmp, ",", &save_ptr);
2891 		while (tok) {
2892 			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2893 				invalid_feature = true;
2894 				break;
2895 			}
2896 			tok = strtok_r(NULL, ",", &save_ptr);
2897 		}
2898 		xfree(tmp);
2899 		if (invalid_feature) {
2900 			info("Invalid ActiveFeatures update request (%s) for non-KNL node %s",
2901 			     update_node_msg->features_act, node_ptr->name);
2902 			return false;
2903 		}
2904 	}
2905 
2906 	/*
2907 	 * For non-KNL node, active and available features must match
2908 	 */
2909 	if (!update_node_msg->features) {
2910 		update_node_msg->features =
2911 			xstrdup(update_node_msg->features_act);
2912 	} else if (!update_node_msg->features_act) {
2913 		update_node_msg->features_act =
2914 			xstrdup(update_node_msg->features);
2915 	} else if (xstrcmp(update_node_msg->features,
2916 			   update_node_msg->features_act)) {
2917 		info("Invalid ActiveFeatures != AvailableFeatures (%s != %s) for non-KNL node %s",
2918 		     update_node_msg->features, update_node_msg->features_act,
2919 		     node_ptr->name);
2920 		return false;
2921 	}
2922 
2923 	return true;
2924 }
2925 
2926 /* Return TRUE if this (one) feature name is under this plugin's control */
node_features_p_changeable_feature(char * feature)2927 extern bool node_features_p_changeable_feature(char *feature)
2928 {
2929 	if ((validate_mode == 0) &&
2930 	    (_knl_mcdram_token(feature) || _knl_numa_token(feature)))
2931 		return true;
2932 	return false;
2933 }
2934 
/*
 * Translate a node's feature specification by replacing any features associated
 *	with this plugin in the original value with the new values, preserving
 *	any features that are not associated with this plugin
 *
 * The node is treated as KNL if avail_features contains an MCDRAM or NUMA
 * mode. For a KNL node the result is the non-KNL available features followed
 * by the MCDRAM and then NUMA mode(s); a mode missing from new_features is
 * taken from orig_features when that is provided. For a non-KNL node the
 * result is the non-KNL entries of new_features followed by any modes it names.
 *
 * IN new_features - newly active features
 * IN orig_features - original active features, may be NULL
 * IN avail_features - original available features
 * IN node_inx - index of node in node table (not used by this function)
 * RET node's new merged features, must be xfreed
 */
extern char *node_features_p_node_xlate(char *new_features, char *orig_features,
					char *avail_features, int node_inx)
{
	char *node_features = NULL;
	char *tmp, *save_ptr = NULL, *sep = "", *tok;
	uint16_t new_mcdram = 0, new_numa = 0;
	uint16_t tmp_mcdram, tmp_numa;
	bool is_knl = false;

	if (avail_features) {
		tmp = xstrdup(avail_features);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
				is_knl = true;
			} else {
				xstrfmtcat(node_features, "%s%s", sep, tok);
				sep = ",";
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);
		if (!is_knl) {
			/* Non-KNL node: available features are not carried over */
			xfree(node_features);
			sep = "";
		}
	}

	if (!new_features)
		return node_features;

	/*
	 * Single pass over the new features: accumulate the KNL modes and,
	 * for a non-KNL node, copy every other feature in its original order.
	 * The modes themselves are appended afterwards in MCDRAM/NUMA order.
	 */
	tmp = xstrdup(new_features);
	tok = strtok_r(tmp, ",", &save_ptr);
	while (tok) {
		if ((tmp_mcdram = _knl_mcdram_token(tok))) {
			new_mcdram |= tmp_mcdram;
		} else if ((tmp_numa = _knl_numa_token(tok))) {
			new_numa |= tmp_numa;
		} else if (!is_knl) {
			xstrfmtcat(node_features, "%s%s", sep, tok);
			sep = ",";
		}
		tok = strtok_r(NULL, ",", &save_ptr);
	}
	xfree(tmp);

	/*
	 * New active features lacks current MCDRAM or NUMA, copy values from
	 * original. Skipped when there is no original string: handing
	 * strtok_r() a NULL buffer makes it resume from save_ptr, which at
	 * this point still refers to the buffer released just above.
	 */
	if (is_knl && orig_features &&
	    ((new_mcdram == 0) || (new_numa == 0))) {
		tmp = xstrdup(orig_features);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if ((new_mcdram == 0) &&
			    (tmp_mcdram = _knl_mcdram_token(tok)))
				new_mcdram |= tmp_mcdram;
			else if ((new_numa == 0) &&
				 (tmp_numa = _knl_numa_token(tok)))
				new_numa |= tmp_numa;
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);
	}

	if (new_mcdram) {
		tmp = _knl_mcdram_str(new_mcdram);
		xstrfmtcat(node_features, "%s%s", sep, tmp);
		xfree(tmp);
		sep = ",";
	}
	if (new_numa) {
		tmp = _knl_numa_str(new_numa);
		xstrfmtcat(node_features, "%s%s", sep, tmp);
		xfree(tmp);
	}

	return node_features;
}
3035 
/*
 * Translate a node's new feature specification into a "standard" ordering:
 * features that are not MCDRAM/NUMA modes come first, in their input order,
 * followed by the MCDRAM mode(s) and then the NUMA mode(s).
 * IN new_features - comma-separated feature list, may be NULL
 * RET node's new merged features, must be xfreed (NULL if no input)
 */
extern char *node_features_p_node_xlate2(char *new_features)
{
	char *merged = NULL, *work, *mode_str;
	char *state = NULL, *delim = "", *feature;
	uint16_t mcdram_bits = 0, numa_bits = 0;
	uint16_t bits;

	if (!new_features)
		return NULL;

	work = xstrdup(new_features);
	for (feature = strtok_r(work, ",", &state); feature;
	     feature = strtok_r(NULL, ",", &state)) {
		/* KNL modes are only accumulated here, emitted below */
		if ((bits = _knl_mcdram_token(feature))) {
			mcdram_bits |= bits;
			continue;
		}
		if ((bits = _knl_numa_token(feature))) {
			numa_bits |= bits;
			continue;
		}
		xstrfmtcat(merged, "%s%s", delim, feature);
		delim = ",";
	}
	xfree(work);

	if (mcdram_bits) {
		mode_str = _knl_mcdram_str(mcdram_bits);
		xstrfmtcat(merged, "%s%s", delim, mode_str);
		xfree(mode_str);
		delim = ",";
	}
	if (numa_bits) {
		mode_str = _knl_numa_str(numa_bits);
		xstrfmtcat(merged, "%s%s", delim, mode_str);
		xfree(mode_str);
	}

	return merged;
}
3075 
3076 /* Perform set up for step launch
3077  * mem_sort IN - Trigger sort of memory pages (KNL zonesort)
3078  * numa_bitmap IN - NUMA nodes allocated to this job */
node_features_p_step_config(bool mem_sort,bitstr_t * numa_bitmap)3079 extern void node_features_p_step_config(bool mem_sort, bitstr_t *numa_bitmap)
3080 {
3081 #ifdef HAVE_NUMA
3082 	if (mem_sort && (numa_available() != -1)) {
3083 		struct stat sb;
3084 		int buf_len, fd, i, len, rc;
3085 		char buf[12];
3086 
3087 		if (stat(ZONE_SORT_PATH, &sb) == -1) {
3088 			rc = system(MODPROBE_PATH " zonesort_module");
3089 			if (rc != -1)
3090 				rc = WEXITSTATUS(rc);
3091 			if (rc) {
3092 				verbose("%s: zonesort execution failure. Return code: %d",
3093 					__func__, rc);
3094 			}
3095 		}
3096 		if ((fd = open(ZONE_SORT_PATH, O_WRONLY | O_SYNC)) == -1) {
3097 			error("%s: Could not open file %s: %m",
3098 			      __func__, ZONE_SORT_PATH);
3099 		} else {
3100 			len = numa_max_node() + 1;
3101 			for (i = 0; i < len; i++) {
3102 				if (numa_bitmap && !bit_test(numa_bitmap, i))
3103 					continue;
3104 				snprintf(buf, sizeof(buf), "%d", i);
3105 				buf_len = strlen(buf) + 1;
3106 				if (write(fd, buf, buf_len) != buf_len) {
3107 					error("%s: Could not write file %s: %m",
3108 					      __func__, ZONE_SORT_PATH);
3109 				}
3110 			}
3111 			(void) close(fd);
3112 		}
3113 	}
3114 #endif
3115 }
3116 
3117 /* Determine if the specified user can modify the currently available node
3118  * features */
node_features_p_user_update(uid_t uid)3119 extern bool node_features_p_user_update(uid_t uid)
3120 {
3121 	int i;
3122 
3123 	if (allowed_uid_cnt == 0)   /* Default is ALL users allowed to update */
3124 		return true;
3125 
3126 	for (i = 0; i < allowed_uid_cnt; i++) {
3127 		if (allowed_uid[i] == uid)
3128 			return true;
3129 	}
3130 
3131 	return false;
3132 }
3133 
3134 /* Return estimated reboot time, in seconds */
node_features_p_boot_time(void)3135 extern uint32_t node_features_p_boot_time(void)
3136 {
3137 	return boot_time;
3138 }
3139 
3140 /* Get node features plugin configuration */
node_features_p_get_config(config_plugin_params_t * p)3141 extern void node_features_p_get_config(config_plugin_params_t *p)
3142 {
3143 	config_key_pair_t *key_pair;
3144 	List data;
3145 
3146 	xassert(p);
3147 	xstrcat(p->name, plugin_type);
3148 	data = p->key_pairs;
3149 
3150 	key_pair = xmalloc(sizeof(config_key_pair_t));
3151 	key_pair->name = xstrdup("AllowMCDRAM");
3152 	key_pair->value = _knl_mcdram_str(allow_mcdram);
3153 	list_append(data, key_pair);
3154 
3155 	key_pair = xmalloc(sizeof(config_key_pair_t));
3156 	key_pair->name = xstrdup("AllowNUMA");
3157 	key_pair->value = _knl_numa_str(allow_numa);
3158 	list_append(data, key_pair);
3159 
3160 	key_pair = xmalloc(sizeof(config_key_pair_t));
3161 	key_pair->name = xstrdup("AllowUserBoot");
3162 	key_pair->value = _make_uid_str(allowed_uid, allowed_uid_cnt);
3163 	list_append(data, key_pair);
3164 
3165 	key_pair = xmalloc(sizeof(config_key_pair_t));
3166 	key_pair->name = xstrdup("BootTime");
3167 	key_pair->value = xstrdup_printf("%u", boot_time);
3168 	list_append(data, key_pair);
3169 
3170 	key_pair = xmalloc(sizeof(config_key_pair_t));
3171 	key_pair->name = xstrdup("CapmcPath");
3172 	key_pair->value = xstrdup(capmc_path);
3173 	list_append(data, key_pair);
3174 
3175 	key_pair = xmalloc(sizeof(config_key_pair_t));
3176 	key_pair->name = xstrdup("CapmcPollFreq");
3177 	key_pair->value = xstrdup_printf("%u", capmc_poll_freq);
3178 	list_append(data, key_pair);
3179 
3180 	key_pair = xmalloc(sizeof(config_key_pair_t));
3181 	key_pair->name = xstrdup("CapmcRetries");
3182 	key_pair->value = xstrdup_printf("%u", capmc_retries);
3183 	list_append(data, key_pair);
3184 
3185 	key_pair = xmalloc(sizeof(config_key_pair_t));
3186 	key_pair->name = xstrdup("CapmcTimeout");
3187 	key_pair->value = xstrdup_printf("%u", capmc_timeout);
3188 	list_append(data, key_pair);
3189 
3190 	key_pair = xmalloc(sizeof(config_key_pair_t));
3191 	key_pair->name = xstrdup("CnselectPath");
3192 	key_pair->value = xstrdup(cnselect_path);
3193 	list_append(data, key_pair);
3194 
3195 	key_pair = xmalloc(sizeof(config_key_pair_t));
3196 	key_pair->name = xstrdup("DefaultMCDRAM");
3197 	key_pair->value = _knl_mcdram_str(default_mcdram);
3198 	list_append(data, key_pair);
3199 
3200 	key_pair = xmalloc(sizeof(config_key_pair_t));
3201 	key_pair->name = xstrdup("DefaultNUMA");
3202 	key_pair->value = _knl_numa_str(default_numa);
3203 	list_append(data, key_pair);
3204 
3205 	key_pair = xmalloc(sizeof(config_key_pair_t));
3206 	key_pair->name = xstrdup("McPath");
3207 	key_pair->value = xstrdup(mc_path);
3208 	list_append(data, key_pair);
3209 
3210 	key_pair = xmalloc(sizeof(config_key_pair_t));
3211 	key_pair->name = xstrdup("NodeRebootWeight");
3212 	key_pair->value = xstrdup_printf("%u", node_reboot_weight);
3213 	list_append(data, key_pair);
3214 
3215 	key_pair = xmalloc(sizeof(config_key_pair_t));
3216 	key_pair->name = xstrdup("SyscfgPath");
3217 	key_pair->value = xstrdup(syscfg_path);
3218 	list_append(data, key_pair);
3219 
3220 	key_pair = xmalloc(sizeof(config_key_pair_t));
3221 	key_pair->name = xstrdup("UmeCheckInterval");
3222 	key_pair->value = xstrdup_printf("%u", ume_check_interval);
3223 	list_append(data, key_pair);
3224 
3225 	list_sort(data, (ListCmpF) sort_key_pairs);
3226 
3227 	return;
3228 }
3229 
3230 /*
3231  * Return node "weight" field if reboot required to change mode
3232  */
node_features_p_reboot_weight(void)3233 extern uint32_t node_features_p_reboot_weight(void)
3234 {
3235 	return node_reboot_weight;
3236 }
3237