1 /*****************************************************************************\
2 * node_features_knl_cray.c - Plugin for managing Cray KNL state information
3 *****************************************************************************
4 * Copyright (C) 2016 SchedMD LLC.
5 * Written by Morris Jette <jette@schedmd.com>
6 *
7 * This file is part of Slurm, a resource management program.
8 * For details, see <https://slurm.schedmd.com/>.
9 * Please also read the included file: DISCLAIMER.
10 *
11 * Slurm is free software; you can redistribute it and/or modify it under
12 * the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 * In addition, as a special exception, the copyright holders give permission
17 * to link the code of portions of this program with the OpenSSL library under
18 * certain conditions as described in each individual source file, and
19 * distribute linked combinations including the two. You must obey the GNU
20 * General Public License in all respects for all of the code used other than
21 * OpenSSL. If you modify file(s) with this exception, you may extend this
22 * exception to your version of the file(s), but you are not obligated to do
23 * so. If you do not wish to do so, delete this exception statement from your
24 * version. If you delete this exception statement from all source files in
25 * the program, then also delete it here.
26 *
27 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
30 * details.
31 *
32 * You should have received a copy of the GNU General Public License along
33 * with Slurm; if not, write to the Free Software Foundation, Inc.,
34 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
35 \*****************************************************************************/
36
37 #if HAVE_CONFIG_H
38 # include "config.h"
39 #endif
40
41 #define _GNU_SOURCE /* For POLLRDHUP */
42 #include <ctype.h>
43 #include <fcntl.h>
44 #ifdef HAVE_NUMA
45 #undef NUMA_VERSION1_COMPATIBILITY
46 #include <numa.h>
47 #endif
48 #include <poll.h>
49 #include <signal.h>
50 #include <stdlib.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
53 #include <time.h>
54 #include <unistd.h>
55
56 #if HAVE_JSON_C_INC
57 # include <json-c/json.h>
58 #elif HAVE_JSON_INC
59 # include <json/json.h>
60 #endif
61
62 #if defined(__APPLE__) || defined(__DragonFly__) || defined(__NetBSD__)
63 #define POLLRDHUP POLLHUP
64 #endif
65
66 #include "slurm/slurm.h"
67
68 #include "src/common/assoc_mgr.h"
69 #include "src/common/bitstring.h"
70 #include "src/common/fd.h"
71 #include "src/common/gres.h"
72 #include "src/common/list.h"
73 #include "src/common/macros.h"
74 #include "src/common/pack.h"
75 #include "src/common/parse_config.h"
76 #include "src/common/slurm_protocol_api.h"
77 #include "src/common/slurm_resource_info.h"
78 #include "src/common/timers.h"
79 #include "src/common/uid.h"
80 #include "src/common/xmalloc.h"
81 #include "src/common/xstring.h"
82
83 #include "src/slurmctld/job_scheduler.h"
84 #include "src/slurmctld/locks.h"
85 #include "src/slurmctld/node_scheduler.h"
86 #include "src/slurmctld/read_config.h"
87 #include "src/slurmctld/reservation.h"
88 #include "src/slurmctld/slurmctld.h"
89 #include "src/slurmctld/state_save.h"
90 #include "src/slurmd/slurmd/req.h"
91
/* Maximum poll wait time for child processes, in milliseconds */
#define MAX_POLL_WAIT 500

/*
 * Default and minimum timeout parameters for the capmc command.
 * The two timeout values are expressed in milliseconds.
 */
#define DEFAULT_CAPMC_RETRIES 4
#define DEFAULT_CAPMC_TIMEOUT 60000	/* 60 seconds */
#define MIN_CAPMC_TIMEOUT 1000		/* 1 second */

/*
 * Intel Knights Landing Configuration Modes
 *
 * NUMA modes occupy the low byte (KNL_NUMA_FLAG) and MCDRAM modes the high
 * byte (KNL_MCDRAM_FLAG) of a 16-bit value, one bit per mode.
 * KNL_NUMA_CNT and KNL_MCDRAM_CNT are the number of mode bits defined in
 * each group; several arrays in this file are indexed by a mode's bit
 * position within its group.
 */
#define KNL_NUMA_CNT	5
#define KNL_MCDRAM_CNT	4
#define KNL_NUMA_FLAG	0x00ff	/* Mask of all NUMA mode bits */
#define KNL_ALL2ALL	0x0001	/* "a2a" */
#define KNL_SNC2	0x0002	/* "snc2" */
#define KNL_SNC4	0x0004	/* "snc4" */
#define KNL_HEMI	0x0008	/* "hemi" */
#define KNL_QUAD	0x0010	/* "quad" */
#define KNL_MCDRAM_FLAG	0xff00	/* Mask of all MCDRAM mode bits */
#define KNL_CACHE	0x0100	/* "cache" */
#define KNL_EQUAL	0x0200	/* "equal" */
#define KNL_SPLIT	0x0400	/* "split" */
#define KNL_FLAT	0x0800	/* "flat" */

/* MODPROBE_PATH may be overridden at build time */
#ifndef MODPROBE_PATH
#define MODPROBE_PATH	"/sbin/modprobe"
#endif
#define ZONE_SORT_PATH	"/sys/kernel/zone_sort_free_pages/nodeid"
119
120 /* These are defined here so when we link with something other than
121 * the slurmctld we will have these symbols defined. They will get
122 * overwritten when linking with the slurmctld.
123 */
124 #if defined (__APPLE__)
125 extern slurmctld_config_t slurmctld_config __attribute__((weak_import));
126 extern bitstr_t *avail_node_bitmap __attribute__((weak_import));
127 extern active_feature_list __attribute__((weak_import));
128 #else
129 slurmctld_config_t slurmctld_config;
130 bitstr_t *avail_node_bitmap;
131 List active_feature_list;
132 #endif
133
/*
 * These variables are required by the node_features plugin interface. If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin. There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything. Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *	<application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "node_features" for Slurm node_features) and <method> is a
 * description of how this plugin satisfies that application. Slurm will only
 * load a node_features plugin if the plugin_type string has a prefix of
 * "node_features/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char plugin_name[] = "node_features knl_cray plugin";
const char plugin_type[] = "node_features/knl_cray";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
162
/*
 * Configuration Parameters
 * NOTE(review): most of these presumably mirror the options listed in
 * knl_conf_file_options; the code that loads them is outside this section,
 * so only the initial values shown here are certain.
 */
static uint16_t allow_mcdram = KNL_MCDRAM_FLAG;	/* Initially all MCDRAM bits */
static uint16_t allow_numa = KNL_NUMA_FLAG;	/* Initially all NUMA bits */
static uid_t *allowed_uid = NULL;
static int allowed_uid_cnt = 0;
static uint32_t boot_time = (45 * 60);	/* 45 minute estimated boot time */
static char *capmc_path = NULL;
static uint32_t capmc_poll_freq = 45;	/* capmc state polling frequency */
static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES;
static uint32_t capmc_timeout = 0;	/* capmc command timeout in msec */
static char *cnselect_path = NULL;	/* Program run by _load_mcdram_type()
					 * and _load_numa_type() */
static uint32_t cpu_bind[KNL_NUMA_CNT];	/* Derived from numa_cpu_bind by
					 * _update_cpu_bind(); indexed by the
					 * NUMA mode's bit position */
static bool debug_flag = false;		/* Enables extra info() logging */
static uint16_t default_mcdram = KNL_CACHE;
static uint16_t default_numa = KNL_ALL2ALL;
static char *mc_path = NULL;
static uint32_t node_reboot_weight = (INFINITE - 1);
static char *numa_cpu_bind = NULL;	/* ';' separated "<numa>=<cpu_bind>"
					 * pairs, see _update_cpu_bind() */
static char *syscfg_path = NULL;
static pthread_mutex_t config_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool reconfig = false;
static uint32_t ume_check_interval = 0;
static pthread_mutex_t ume_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ume_thread = 0;
static uint32_t validate_mode = 0;

static bitstr_t *knl_node_bitmap = NULL;	/* KNL nodes found by capmc */
static pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
static char *node_list_queue = NULL;
static time_t node_time_queue = (time_t) 0;
static time_t shutdown_time = (time_t) 0;
static pthread_t queue_thread = 0;

/*
 * Percentage of MCDRAM used for cache by type, updated from capmc.
 * mcdram_pct is indexed by the MCDRAM mode's bit position relative to
 * KNL_CACHE and mcdram_set counts the entries recorded so far (see
 * _update_mcdram_pct(), which only fills entries still holding -1).
 */
static int mcdram_pct[KNL_MCDRAM_CNT];
static int mcdram_set = 0;
static uint64_t *mcdram_per_node = NULL;
200
/*
 * Recognized configuration file keywords and their value types.
 * The table is terminated by a NULL entry and is handed to
 * s_p_hashtbl_create() by _config_make_tbl().
 *
 * NOTE: New knl_cray.conf parameters added below must also be added to the
 * contribs/cray/capmc_suspend.c and contribs/cray/capmc_resume.c files
 */
static s_p_options_t knl_conf_file_options[] = {
	{"AllowMCDRAM", S_P_STRING},
	{"AllowNUMA", S_P_STRING},
	{"AllowUserBoot", S_P_STRING},
	{"BootTime", S_P_UINT32},
	{"CapmcPath", S_P_STRING},
	{"CapmcPollFreq", S_P_UINT32},
	{"CapmcRetries", S_P_UINT32},
	{"CapmcTimeout", S_P_UINT32},
	{"CnselectPath", S_P_STRING},
	{"DefaultMCDRAM", S_P_STRING},
	{"DefaultNUMA", S_P_STRING},
	{"LogFile", S_P_STRING},
	{"McPath", S_P_STRING},
	{"NumaCpuBind", S_P_STRING},
	{"SyscfgPath", S_P_STRING},
	{"NodeRebootWeight", S_P_UINT32},
	{"UmeCheckInterval", S_P_UINT32},
	{"ValidateMode", S_P_UINT32},
	{NULL}
};
224
/* One record filled in by _json_parse_mcdram_cap_object() */
typedef struct mcdram_cap {
	uint32_t nid;		/* Value of the JSON "nid" key */
	char *mcdram_cfg;	/* Comma separated names from the JSON
				 * "mcdram_cfg" key, with the tokens that start
				 * with a digit removed */
} mcdram_cap_t;

/* One record filled in by _json_parse_mcdram_cfg_object() */
typedef struct mcdram_cfg {
	uint64_t dram_size;	/* From "dram_size", NO_VAL if absent */
	uint32_t nid;		/* Value of the JSON "nid" key */
	char *mcdram_cfg;	/* Copy of the JSON "mcdram_cfg" string */
	uint64_t mcdram_size;	/* From "mcdram_size", NO_VAL if absent */
	uint16_t mcdram_pct;	/* From "mcdram_pct", NO_VAL16 if absent */
} mcdram_cfg_t;

/* One record per MCDRAM mode, built by _load_current_mcdram() */
typedef struct mcdram_cfg2 {
	int cache_pct;		/* Copied from mcdram_pct[] for this mode */
	char *mcdram_cfg;	/* Mode name from _knl_mcdram_str() */
	char *nid_str;		/* Response returned by _load_mcdram_type() */
	bitstr_t *node_bitmap;	/* nid_str converted with bit_unfmt(); left
				 * NULL when nid_str is NULL or empty */
} mcdram_cfg2_t;

/* One record filled in by _json_parse_numa_cap_object() */
typedef struct numa_cap {
	uint32_t nid;		/* Value of the JSON "nid" key */
	char *numa_cfg;		/* Copy of the JSON "numa_cfg" string */
} numa_cap_t;

/* One record filled in by _json_parse_numa_cfg_object() */
typedef struct numa_cfg {
	uint32_t nid;		/* Value of the JSON "nid" key */
	char *numa_cfg;		/* Copy of the JSON "numa_cfg" string */
} numa_cfg_t;

/* One record per NUMA mode, built by _load_current_numa() */
typedef struct numa_cfg2 {
	char *nid_str;		/* Response returned by _load_numa_type() */
	bitstr_t *node_bitmap;	/* nid_str converted with bit_unfmt(); left
				 * NULL when nid_str is NULL or empty */
	char *numa_cfg;		/* Mode name ("a2a", "snc2", ...) */
} numa_cfg2_t;
260
/*
 * Forward declarations for the helper functions of this file.
 * NOTE(review): _queue_agent() is the only helper declared "extern" rather
 * than "static"; its definition is outside this section, so confirm whether
 * the external linkage is intentional.
 */
static void _check_node_disabled(void);
static void _check_node_status(void);
static s_p_hashtbl_t *_config_make_tbl(char *filename);
static void _free_script_argv(char **script_argv);
static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key,
						  int *num);
static mcdram_cfg_t *_json_parse_mcdram_cfg_array(json_object *jobj, char *key,
						  int *num);
static void _json_parse_mcdram_cap_object(json_object *jobj, mcdram_cap_t *ent);
static void _json_parse_mcdram_cfg_object(json_object *jobj, mcdram_cfg_t *ent);
static numa_cap_t *_json_parse_numa_cap_array(json_object *jobj, char *key,
					      int *num);
static void _json_parse_numa_cap_object(json_object *jobj, numa_cap_t *ent);
static numa_cfg_t *_json_parse_numa_cfg_array(json_object *jobj, char *key,
					      int *num);
static void _json_parse_numa_cfg_object(json_object *jobj, numa_cfg_t *ent);
static int _knl_mcdram_bits_cnt(uint16_t mcdram_num);
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep);
static char *_knl_mcdram_str(uint16_t mcdram_num);
static uint16_t _knl_mcdram_token(char *token);
static int _knl_numa_bits_cnt(uint16_t numa_num);
static uint16_t _knl_numa_parse(char *numa_str, char *sep);
static char *_knl_numa_str(uint16_t numa_num);
static int _knl_numa_inx(char *token);
static uint16_t _knl_numa_token(char *token);
static mcdram_cfg2_t *_load_current_mcdram(int *num);
static numa_cfg2_t *_load_current_numa(int *num);
static char *_load_mcdram_type(int cache_pct);
static char *_load_numa_type(char *type);
static void _log_script_argv(char **script_argv, char *resp_msg);
static void _mcdram_cap_free(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt);
static void _mcdram_cap_log(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt);
static void _mcdram_cfg_free(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt);
static void _mcdram_cfg2_free(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt);
static void _mcdram_cfg_log(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt);
static void _merge_strings(char **node_features, char *node_cfg,
			   uint16_t allow_types);
static void _numa_cap_free(numa_cap_t *numa_cap, int numa_cap_cnt);
static void _numa_cap_log(numa_cap_t *numa_cap, int numa_cap_cnt);
static void _numa_cfg_free(numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _numa_cfg2_free(numa_cfg2_t *numa_cfg, int numa_cfg2_cnt);
static void _numa_cfg_log(numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _numa_cfg2_log(numa_cfg2_t *numa_cfg, int numa_cfg2_cnt);
static uint64_t _parse_size(char *size_str);
extern void *_queue_agent(void *args);
static int _queue_node_update(char *node_list);
static char *_run_script(char *cmd_path, char **script_argv, int *status);
static void _strip_knl_opts(char **features);
static int _tot_wait (struct timeval *start_time);
static void *_ume_agent(void *args);
static void _update_all_node_features(
				mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
				mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
				numa_cap_t *numa_cap, int numa_cap_cnt,
				numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static void _update_cpu_bind(void);
static void _update_mcdram_pct(char *tok, int mcdram_num);
static void _update_node_features(node_record_t *node_ptr,
				  mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
				  mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
				  numa_cap_t *numa_cap, int numa_cap_cnt,
				  numa_cfg_t *numa_cfg, int numa_cfg_cnt);
static int _update_node_state(char *node_list, bool set_locks);
static void _validate_node_features(node_record_t *node_ptr);

/* Function used both internally and externally */
extern int node_features_p_node_update(char *active_features,
				       bitstr_t *node_bitmap);
329
_config_make_tbl(char * filename)330 static s_p_hashtbl_t *_config_make_tbl(char *filename)
331 {
332 s_p_hashtbl_t *tbl = NULL;
333
334 xassert(filename);
335
336 if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) {
337 error("knl.conf: %s: s_p_hashtbl_create error: %m", __func__);
338 return tbl;
339 }
340
341 if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) {
342 error("knl.conf: %s: s_p_parse_file error: %m", __func__);
343 s_p_hashtbl_destroy(tbl);
344 tbl = NULL;
345 }
346
347 return tbl;
348 }
349
350 /*
351 * Return the count of MCDRAM bits set
352 */
_knl_mcdram_bits_cnt(uint16_t mcdram_num)353 static int _knl_mcdram_bits_cnt(uint16_t mcdram_num)
354 {
355 int cnt = 0, i;
356 uint16_t tmp = 1;
357
358 for (i = 0; i < 16; i++) {
359 if ((mcdram_num & KNL_MCDRAM_FLAG) & tmp)
360 cnt++;
361 tmp = tmp << 1;
362 }
363 return cnt;
364 }
365
/*
 * Convert a list of KNL MCDRAM mode names into the matching bitmask
 * mcdram_str IN - list of mode names, may be NULL
 * sep IN - set of characters separating the names
 * RET OR of the bits of every recognized name; unrecognized names add
 *     nothing and a NULL mcdram_str yields 0
 */
static uint16_t _knl_mcdram_parse(char *mcdram_str, char *sep)
{
	char *copy, *word, *state = NULL;
	uint16_t bits = 0;

	if (mcdram_str == NULL)
		return 0;

	/* strtok_r() writes into its input, so tokenize a private copy */
	copy = xstrdup(mcdram_str);
	for (word = strtok_r(copy, sep, &state); word;
	     word = strtok_r(NULL, sep, &state))
		bits |= _knl_mcdram_token(word);
	xfree(copy);

	return bits;
}
390
391 /*
392 * Translate KNL MCDRAM number to equivalent string value
393 * Caller must free return value
394 */
_knl_mcdram_str(uint16_t mcdram_num)395 static char *_knl_mcdram_str(uint16_t mcdram_num)
396 {
397 char *mcdram_str = NULL, *sep = "";
398
399 if (mcdram_num & KNL_CACHE) {
400 xstrfmtcat(mcdram_str, "%scache", sep);
401 sep = ",";
402 }
403 if (mcdram_num & KNL_SPLIT) {
404 xstrfmtcat(mcdram_str, "%ssplit", sep);
405 sep = ",";
406 }
407 if (mcdram_num & KNL_FLAT) {
408 xstrfmtcat(mcdram_str, "%sflat", sep);
409 sep = ",";
410 }
411 if (mcdram_num & KNL_EQUAL) {
412 xstrfmtcat(mcdram_str, "%sequal", sep);
413 // sep = ","; /* Remove to avoid CLANG error */
414 }
415
416 return mcdram_str;
417 }
418
419 /*
420 * Given a KNL MCDRAM token, return its equivalent numeric value
421 * token IN - String to scan
422 * RET MCDRAM numeric value
423 */
_knl_mcdram_token(char * token)424 static uint16_t _knl_mcdram_token(char *token)
425 {
426 uint16_t mcdram_num = 0;
427
428 if (!xstrcasecmp(token, "cache"))
429 mcdram_num = KNL_CACHE;
430 else if (!xstrcasecmp(token, "split"))
431 mcdram_num = KNL_SPLIT;
432 else if (!xstrcasecmp(token, "flat"))
433 mcdram_num = KNL_FLAT;
434 else if (!xstrcasecmp(token, "equal"))
435 mcdram_num = KNL_EQUAL;
436
437 return mcdram_num;
438 }
439
440 /*
441 * Return the count of NUMA bits set
442 */
_knl_numa_bits_cnt(uint16_t numa_num)443 static int _knl_numa_bits_cnt(uint16_t numa_num)
444 {
445 int cnt = 0, i;
446 uint16_t tmp = 1;
447
448 for (i = 0; i < 16; i++) {
449 if ((numa_num & KNL_NUMA_FLAG) & tmp)
450 cnt++;
451 tmp = tmp << 1;
452 }
453 return cnt;
454 }
455
/*
 * Convert a list of KNL NUMA mode names into the matching bitmask
 * numa_str IN - list of mode names, may be NULL
 * sep IN - set of characters separating the names
 * RET OR of the bits of every recognized name; unrecognized names add
 *     nothing and a NULL numa_str yields 0
 */
static uint16_t _knl_numa_parse(char *numa_str, char *sep)
{
	char *copy, *word, *state = NULL;
	uint16_t bits = 0;

	if (numa_str == NULL)
		return 0;

	/* strtok_r() writes into its input, so tokenize a private copy */
	copy = xstrdup(numa_str);
	for (word = strtok_r(copy, sep, &state); word;
	     word = strtok_r(NULL, sep, &state))
		bits |= _knl_numa_token(word);
	xfree(copy);

	return bits;
}
480
481 /*
482 * Translate KNL NUMA number to equivalent string value
483 * Caller must free return value
484 */
_knl_numa_str(uint16_t numa_num)485 static char *_knl_numa_str(uint16_t numa_num)
486 {
487 char *numa_str = NULL, *sep = "";
488
489 if (numa_num & KNL_ALL2ALL) {
490 xstrfmtcat(numa_str, "%sa2a", sep);
491 sep = ",";
492 }
493 if (numa_num & KNL_SNC2) {
494 xstrfmtcat(numa_str, "%ssnc2", sep);
495 sep = ",";
496 }
497 if (numa_num & KNL_SNC4) {
498 xstrfmtcat(numa_str, "%ssnc4", sep);
499 sep = ",";
500 }
501 if (numa_num & KNL_HEMI) {
502 xstrfmtcat(numa_str, "%shemi", sep);
503 sep = ",";
504 }
505 if (numa_num & KNL_QUAD) {
506 xstrfmtcat(numa_str, "%squad", sep);
507 // sep = ","; /* Remove to avoid CLANG error */
508 }
509
510 return numa_str;
511
512 }
513
514 /*
515 * Given a KNL NUMA token, return its equivalent numeric value
516 * token IN - String to scan
517 * RET NUMA numeric value
518 */
_knl_numa_token(char * token)519 static uint16_t _knl_numa_token(char *token)
520 {
521 uint16_t numa_num = 0;
522
523 if (!xstrcasecmp(token, "a2a"))
524 numa_num |= KNL_ALL2ALL;
525 else if (!xstrcasecmp(token, "snc2"))
526 numa_num |= KNL_SNC2;
527 else if (!xstrcasecmp(token, "snc4"))
528 numa_num |= KNL_SNC4;
529 else if (!xstrcasecmp(token, "hemi"))
530 numa_num |= KNL_HEMI;
531 else if (!xstrcasecmp(token, "quad"))
532 numa_num |= KNL_QUAD;
533
534 return numa_num;
535 }
536
537 /*
538 * Given a KNL NUMA token, return its cpu_bind offset
539 * token IN - String to scan
540 * RET NUMA offset or -1 if not found
541 */
_knl_numa_inx(char * token)542 static int _knl_numa_inx(char *token)
543 {
544 uint16_t numa_num;
545 int i;
546
547 numa_num = _knl_numa_token(token);
548 for (i = 0; i < KNL_NUMA_CNT; i++) {
549 if ((0x01 << i) == numa_num)
550 return i;
551 }
552 return -1;
553 }
554
/*
 * Remove every KNL MCDRAM and NUMA mode name from a comma separated
 * feature list
 * features IN/OUT - address of an xmalloc'd string. The old string is freed
 *	and replaced by a new one holding the surviving names in their
 *	original order, or by NULL if nothing survives. A NULL string is left
 *	untouched.
 */
static void _strip_knl_opts(char **features)
{
	char *copy, *name, *state = NULL;
	char *kept = NULL;
	bool have_prev = false;	/* true once a name has been kept */

	if (!*features)
		return;

	copy = xstrdup(*features);
	for (name = strtok_r(copy, ",", &state); name;
	     name = strtok_r(NULL, ",", &state)) {
		if (_knl_mcdram_token(name) || _knl_numa_token(name))
			continue;	/* KNL mode name, drop it */
		xstrfmtcat(kept, "%s%s", have_prev ? "," : "", name);
		have_prev = true;
	}
	xfree(copy);
	xfree(*features);
	*features = kept;
}
577
/*
 * Compute the time elapsed since "start_time"
 * start_time IN - reference time, as obtained from gettimeofday()
 * RET elapsed time in milliseconds; the microsecond part is rounded by
 *     adding 500 before the division by 1000
 */
static int _tot_wait (struct timeval *start_time)
{
	struct timeval now;
	time_t sec_delta;
	long usec_delta;
	int elapsed_msec;

	gettimeofday(&now, NULL);
	sec_delta = now.tv_sec - start_time->tv_sec;
	usec_delta = now.tv_usec - start_time->tv_usec;

	elapsed_msec = sec_delta * 1000;
	elapsed_msec += (usec_delta + 500) / 1000;
	return elapsed_msec;
}
591
/*
 * Free an argument vector built with xmalloc
 * script_argv IN - array of xmalloc'd strings, itself xmalloc'd. The array
 *	must be NULL terminated: the walk stops at the first NULL entry.
 */
static void _free_script_argv(char **script_argv)
{
	char **entry = script_argv;

	while (*entry) {
		xfree(*entry);
		entry++;
	}
	xfree(script_argv);
}
601
602 /*
603 * Update cpu_bind array from current numa_cpu_bind configuration parameter
604 */
_update_cpu_bind(void)605 static void _update_cpu_bind(void)
606 {
607 char *save_ptr = NULL, *sep, *tok, *tmp;
608 int rc = SLURM_SUCCESS;
609 int i, numa_inx, numa_def;
610 uint32_t cpu_bind_val = 0;
611
612 for (i = 0; i < KNL_NUMA_CNT; i++)
613 cpu_bind[0] = 0;
614
615 if (!numa_cpu_bind)
616 return;
617
618 tmp = xstrdup(numa_cpu_bind);
619 tok = strtok_r(tmp, ";", &save_ptr);
620 while (tok) {
621 sep = strchr(tok, '=');
622 if (!sep) {
623 rc = SLURM_ERROR;
624 break;
625 }
626 sep[0] = '\0';
627 numa_def = _knl_numa_token(tok);
628 if (numa_def == 0) {
629 rc = SLURM_ERROR;
630 break;
631 }
632 if (xlate_cpu_bind_str(sep + 1, &cpu_bind_val) !=
633 SLURM_SUCCESS) {
634 rc = SLURM_ERROR;
635 break;
636 }
637 numa_inx = -1;
638 for (i = 0; i < KNL_NUMA_CNT; i++) {
639 if ((0x1 << i) == numa_def) {
640 numa_inx = i;
641 break;
642 }
643 }
644 if (numa_inx > -1)
645 cpu_bind[numa_inx] = cpu_bind_val;
646 tok = strtok_r(NULL, ";", &save_ptr);
647 }
648 xfree(tmp);
649
650 if (rc != SLURM_SUCCESS) {
651 error("%s: Invalid NumaCpuBind (%s), ignored",
652 plugin_type, numa_cpu_bind);
653 }
654
655 if (debug_flag) {
656 for (i = 0; i < KNL_NUMA_CNT; i++) {
657 char cpu_bind_str[128], *numa_str;
658 if (cpu_bind[i] == 0)
659 continue;
660 numa_str = _knl_numa_str(0x1 << i);
661 slurm_sprint_cpu_bind_type(cpu_bind_str, cpu_bind[i]);
662 info("CpuBind[%s] = %s", numa_str, cpu_bind_str);
663 xfree(numa_str);
664 }
665 }
666 }
667
668 /*
669 * Update our mcdram_pct array with new data.
670 * tok IN - percentage of MCDRAM to be used as cache (string form)
671 * mcdram_num - MCDRAM value (bit from KNL_FLAT, etc.)
672 */
_update_mcdram_pct(char * tok,int mcdram_num)673 static void _update_mcdram_pct(char *tok, int mcdram_num)
674 {
675 int inx;
676
677 if (mcdram_set == KNL_MCDRAM_CNT)
678 return;
679
680 for (inx = 0; inx < KNL_MCDRAM_CNT; inx++) {
681 if ((KNL_CACHE << inx) == mcdram_num)
682 break;
683 }
684 if ((inx >= KNL_MCDRAM_CNT) || (mcdram_pct[inx] != -1))
685 return;
686 mcdram_pct[inx] = strtol(tok, NULL, 10);
687 mcdram_set++;
688 }
689
/*
 * Fill one MCDRAM capability record from a JSON object
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * An integer "nid" value is stored in ent->nid. A string "mcdram_cfg" value
 * is split on commas: tokens starting with a digit are passed to
 * _update_mcdram_pct() together with the mode named by the token just before
 * them, all other tokens are appended to ent->mcdram_cfg as a comma
 * separated list. Other keys and value types are ignored.
 */
static void _json_parse_mcdram_cap_object(json_object *jobj, mcdram_cap_t *ent)
{
	struct json_object_iter iter;
	char *copy, *word, *state = NULL;
	bool have_prev = false;	/* true once a name has been appended */
	int pending_mode = -1;	/* mode bit of the preceding name, if any */

	json_object_object_foreachC(jobj, iter) {
		enum json_type val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			if (xstrcmp(iter.key, "nid") == 0)
				ent->nid = json_object_get_int64(iter.val);
		} else if ((val_type == json_type_string) &&
			   (xstrcmp(iter.key, "mcdram_cfg") == 0)) {
			copy = xstrdup(json_object_get_string(iter.val));
			for (word = strtok_r(copy, ",", &state); word;
			     word = strtok_r(NULL, ",", &state)) {
				if ((word[0] >= '0') && (word[0] <= '9')) {
					_update_mcdram_pct(word, pending_mode);
					pending_mode = -1;
					continue;
				}
				pending_mode = _knl_mcdram_token(word);
				xstrfmtcat(ent->mcdram_cfg, "%s%s",
					   have_prev ? "," : "", word);
				have_prev = true;
			}
			xfree(copy);
		}
	}
}
735
/*
 * Convert a size string with an optional unit suffix into a number
 * size_str IN - decimal number, optionally followed by a unit
 * RET the number multiplied by 1024 for 'k'/'K', 1024^2 for 'm'/'M' and
 *     1024^3 for 'g'/'G'. Only the first character after the number is
 *     examined; any other non-empty suffix is reported with info() and the
 *     number is returned unscaled.
 */
static uint64_t _parse_size(char *size_str)
{
	char *suffix = NULL;
	uint64_t value = (uint64_t) strtol(size_str, &suffix, 10);

	switch (suffix[0]) {
	case '\0':
		break;
	case 'k':
	case 'K':
		value *= 1024;
		break;
	case 'm':
	case 'M':
		value *= (1024 * 1024);
		break;
	case 'g':
	case 'G':
		value *= (1024 * 1024 * 1024);
		break;
	default:
		info("Invalid MCDRAM size: %s", size_str);
		break;
	}

	return value;
}
753
/*
 * Fill one MCDRAM configuration record from a JSON object
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update
 *
 * dram_size and mcdram_size start as NO_VAL and mcdram_pct as NO_VAL16, so
 * keys missing from the object can be recognized afterwards. Integer values
 * are accepted for "nid" and "mcdram_pct"; string values are accepted for
 * "dram_size", "mcdram_pct" and "mcdram_size" (converted by _parse_size())
 * and for "mcdram_cfg" (copied). Other keys and value types are ignored.
 */
static void _json_parse_mcdram_cfg_object(json_object *jobj, mcdram_cfg_t *ent)
{
	struct json_object_iter iter;

	/* Initialize object */
	ent->dram_size = NO_VAL;
	ent->mcdram_pct = NO_VAL16;
	ent->mcdram_size = NO_VAL;

	json_object_object_foreachC(jobj, iter) {
		enum json_type val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			int64_t num = json_object_get_int64(iter.val);

			if (xstrcmp(iter.key, "nid") == 0)
				ent->nid = num;
			else if (xstrcmp(iter.key, "mcdram_pct") == 0)
				ent->mcdram_pct = num;
		} else if (val_type == json_type_string) {
			const char *text = json_object_get_string(iter.val);

			if (xstrcmp(iter.key, "dram_size") == 0)
				ent->dram_size = _parse_size((char *) text);
			else if (xstrcmp(iter.key, "mcdram_cfg") == 0)
				ent->mcdram_cfg = xstrdup(text);
			else if (xstrcmp(iter.key, "mcdram_pct") == 0)
				ent->mcdram_pct = _parse_size((char *) text);
			else if (xstrcmp(iter.key, "mcdram_size") == 0)
				ent->mcdram_size = _parse_size((char *) text);
		}
	}
}
794
/*
 * Fill one NUMA capability record from a JSON object
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update: an integer "nid" value is stored in
 *	ent->nid and a string "numa_cfg" value is copied into ent->numa_cfg.
 *	Other keys and value types are ignored.
 */
static void _json_parse_numa_cap_object(json_object *jobj, numa_cap_t *ent)
{
	struct json_object_iter iter;

	json_object_object_foreachC(jobj, iter) {
		enum json_type val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			if (xstrcmp(iter.key, "nid") == 0)
				ent->nid = json_object_get_int64(iter.val);
		} else if (val_type == json_type_string) {
			if (xstrcmp(iter.key, "numa_cfg") == 0) {
				ent->numa_cfg =
					xstrdup(json_object_get_string(iter.val));
			}
		}
	}
}
822
/*
 * Fill one NUMA configuration record from a JSON object
 * jobj IN - JSON object to scan
 * ent IN/OUT - record to update: an integer "nid" value is stored in
 *	ent->nid and a string "numa_cfg" value is copied into ent->numa_cfg.
 *	Other keys and value types are ignored.
 */
static void _json_parse_numa_cfg_object(json_object *jobj, numa_cfg_t *ent)
{
	struct json_object_iter iter;

	json_object_object_foreachC(jobj, iter) {
		enum json_type val_type = json_object_get_type(iter.val);

		if (val_type == json_type_int) {
			if (xstrcmp(iter.key, "nid") == 0)
				ent->nid = json_object_get_int64(iter.val);
		} else if (val_type == json_type_string) {
			if (xstrcmp(iter.key, "numa_cfg") == 0) {
				ent->numa_cfg =
					xstrdup(json_object_get_string(iter.val));
			}
		}
	}
}
850
_json_parse_mcdram_cap_array(json_object * jobj,char * key,int * num)851 static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key,
852 int *num)
853 {
854 json_object *jarray;
855 json_object *jvalue;
856 mcdram_cap_t *ents;
857 int i;
858
859 jarray = jobj;
860 json_object_object_get_ex(jobj, key, &jarray);
861
862 *num = json_object_array_length(jarray);
863 ents = xmalloc(*num * sizeof(mcdram_cap_t));
864
865 for (i = 0; i < *num; i++) {
866 jvalue = json_object_array_get_idx(jarray, i);
867 _json_parse_mcdram_cap_object(jvalue, &ents[i]);
868 }
869
870 return ents;
871 }
872
873 /* Return NID string for all nodes with specified MCDRAM mode (HBM percentage).
874 * NOTE: Information not returned for nodes which are not up
875 * NOTE: xfree() the return value. */
_load_mcdram_type(int cache_pct)876 static char *_load_mcdram_type(int cache_pct)
877 {
878 char **script_argv, *resp_msg;
879 int i, status = 0;
880 DEF_TIMERS;
881
882 if (cache_pct < 0) /* Unsupported configuration on this system */
883 return NULL;
884 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
885 script_argv[0] = xstrdup("cnselect");
886 script_argv[1] = xstrdup("-e");
887 xstrfmtcat(script_argv[2], "hbmcachepct.eq.%d", cache_pct);
888 START_TIMER;
889 resp_msg = _run_script(cnselect_path, script_argv, &status);
890 END_TIMER;
891 if (debug_flag) {
892 info("%s: %s %s %s ran for %s", __func__,
893 script_argv[0], script_argv[1], script_argv[2], TIME_STR);
894 }
895 if (resp_msg == NULL) {
896 debug("%s: %s %s %s returned no information",
897 __func__, script_argv[0], script_argv[1], script_argv[2]);
898 } else {
899 i = strlen(resp_msg);
900 if (resp_msg[i-1] == '\n')
901 resp_msg[i-1] = '\0';
902 }
903 _log_script_argv(script_argv, resp_msg);
904 if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
905 error("%s: %s %s %s status:%u response:%s", __func__,
906 script_argv[0], script_argv[1], script_argv[2],
907 status, resp_msg);
908 }
909 _free_script_argv(script_argv);
910 return resp_msg;
911 }
912
913 /* Return table of MCDRAM modes and NID string identifying nodes with that mode.
914 * Use _mcdram_cfg2_free() to release returned data structure */
_load_current_mcdram(int * num)915 static mcdram_cfg2_t *_load_current_mcdram(int *num)
916 {
917 mcdram_cfg2_t *mcdram_cfg;
918 int i;
919
920 mcdram_cfg = xmalloc(sizeof(mcdram_cfg2_t) * 4);
921
922 for (i = 0; i < 4; i++) {
923 mcdram_cfg[i].cache_pct = mcdram_pct[i];
924 mcdram_cfg[i].mcdram_cfg = _knl_mcdram_str(KNL_CACHE << i);
925 mcdram_cfg[i].nid_str = _load_mcdram_type(mcdram_cfg[i].cache_pct);
926 if (mcdram_cfg[i].nid_str && mcdram_cfg[i].nid_str[0]) {
927 mcdram_cfg[i].node_bitmap = bit_alloc(100000);
928 (void) bit_unfmt(mcdram_cfg[i].node_bitmap,
929 mcdram_cfg[i].nid_str);
930 }
931 }
932 *num = 4;
933 return mcdram_cfg;
934 }
935
936 /* Return NID string for all nodes with specified NUMA mode.
937 * NOTE: Information not returned for nodes which are not up
938 * NOTE: xfree() the return value. */
_load_numa_type(char * type)939 static char *_load_numa_type(char *type)
940 {
941 char **script_argv, *resp_msg;
942 int i, status = 0;
943 DEF_TIMERS;
944
945 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
946 script_argv[0] = xstrdup("cnselect");
947 script_argv[1] = xstrdup("-e");
948 xstrfmtcat(script_argv[2], "numa_cfg.eq.%s", type);
949 START_TIMER;
950 resp_msg = _run_script(cnselect_path, script_argv, &status);
951 END_TIMER;
952 if (debug_flag) {
953 info("%s: %s %s %s ran for %s", __func__,
954 script_argv[0], script_argv[1], script_argv[2], TIME_STR);
955 }
956 if (resp_msg == NULL) {
957 debug("%s: %s %s %s returned no information",
958 __func__, script_argv[0], script_argv[1], script_argv[2]);
959 } else {
960 i = strlen(resp_msg);
961 if (resp_msg[i-1] == '\n')
962 resp_msg[i-1] = '\0';
963 }
964 _log_script_argv(script_argv, resp_msg);
965 if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) {
966 error("%s: %s %s %s status:%u response:%s", __func__,
967 script_argv[0], script_argv[1], script_argv[2],
968 status, resp_msg);
969 }
970 _free_script_argv(script_argv);
971 return resp_msg;
972 }
973
974 /* Return table of NUMA modes and NID string identifying nodes with that mode.
975 * Use _numa_cfg2_free() to release returned data structure */
_load_current_numa(int * num)976 static numa_cfg2_t *_load_current_numa(int *num)
977 {
978 numa_cfg2_t *numa_cfg2;
979 int i;
980
981 numa_cfg2 = xmalloc(sizeof(numa_cfg2_t) * 5);
982 numa_cfg2[0].numa_cfg = xstrdup("a2a");
983 numa_cfg2[1].numa_cfg = xstrdup("snc2");
984 numa_cfg2[2].numa_cfg = xstrdup("snc4");
985 numa_cfg2[3].numa_cfg = xstrdup("hemi");
986 numa_cfg2[4].numa_cfg = xstrdup("quad");
987
988 for (i = 0; i < 5; i++) {
989 numa_cfg2[i].nid_str = _load_numa_type(numa_cfg2[i].numa_cfg);
990 if (numa_cfg2[i].nid_str && numa_cfg2[i].nid_str[0]) {
991 numa_cfg2[i].node_bitmap = bit_alloc(100000);
992 (void) bit_unfmt(numa_cfg2[i].node_bitmap,
993 numa_cfg2[i].nid_str);
994 }
995 }
996 *num = 5;
997 return numa_cfg2;
998 }
999
_json_parse_mcdram_cfg_array(json_object * jobj,char * key,int * num)1000 static mcdram_cfg_t *_json_parse_mcdram_cfg_array(json_object *jobj, char *key,
1001 int *num)
1002 {
1003 json_object *jarray;
1004 json_object *jvalue;
1005 mcdram_cfg_t *ents;
1006 int i;
1007
1008 jarray = jobj;
1009 json_object_object_get_ex(jobj, key, &jarray);
1010
1011 *num = json_object_array_length(jarray);
1012 ents = xmalloc(*num * sizeof(mcdram_cfg_t));
1013
1014 for (i = 0; i < *num; i++) {
1015 jvalue = json_object_array_get_idx(jarray, i);
1016 _json_parse_mcdram_cfg_object(jvalue, &ents[i]);
1017 }
1018
1019 return ents;
1020 }
1021
_json_parse_numa_cap_array(json_object * jobj,char * key,int * num)1022 static numa_cap_t *_json_parse_numa_cap_array(json_object *jobj, char *key,
1023 int *num)
1024 {
1025 json_object *jarray;
1026 json_object *jvalue;
1027 numa_cap_t *ents;
1028 int i;
1029
1030 jarray = jobj;
1031 json_object_object_get_ex(jobj, key, &jarray);
1032
1033 *num = json_object_array_length(jarray);
1034 ents = xmalloc(*num * sizeof(numa_cap_t));
1035
1036 for (i = 0; i < *num; i++) {
1037 jvalue = json_object_array_get_idx(jarray, i);
1038 _json_parse_numa_cap_object(jvalue, &ents[i]);
1039 }
1040
1041 return ents;
1042 }
1043
_json_parse_numa_cfg_array(json_object * jobj,char * key,int * num)1044 static numa_cfg_t *_json_parse_numa_cfg_array(json_object *jobj, char *key,
1045 int *num)
1046 {
1047 json_object *jarray;
1048 json_object *jvalue;
1049 numa_cfg_t *ents;
1050 int i;
1051
1052 jarray = jobj;
1053 json_object_object_get_ex(jobj, key, &jarray);
1054
1055 *num = json_object_array_length(jarray);
1056 ents = xmalloc(*num * sizeof(numa_cfg_t));
1057
1058 for (i = 0; i < *num; i++) {
1059 jvalue = json_object_array_get_idx(jarray, i);
1060 _json_parse_numa_cfg_object(jvalue, &ents[i]);
1061 }
1062
1063 return ents;
1064 }
1065
1066 /* Log a command's arguments. */
_log_script_argv(char ** script_argv,char * resp_msg)1067 static void _log_script_argv(char **script_argv, char *resp_msg)
1068 {
1069 char *cmd_line = NULL;
1070 int i;
1071
1072 if (!debug_flag)
1073 return;
1074
1075 for (i = 0; script_argv[i]; i++) {
1076 if (i)
1077 xstrcat(cmd_line, " ");
1078 xstrcat(cmd_line, script_argv[i]);
1079 }
1080 info("%s", cmd_line);
1081 if (resp_msg && resp_msg[0])
1082 info("%s", resp_msg);
1083 xfree(cmd_line);
1084 }
1085
/* Free an array of mcdram_cap_t records and the string held by each one */
static void _mcdram_cap_free(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt)
{
	int inx = 0;

	if (mcdram_cap == NULL)
		return;

	while (inx < mcdram_cap_cnt) {
		xfree(mcdram_cap[inx].mcdram_cfg);
		inx++;
	}
	xfree(mcdram_cap);
}
1097
/* Log every record of an mcdram_cap_t array, one line per record */
static void _mcdram_cap_log(mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt)
{
	mcdram_cap_t *ent;
	int inx;

	if (mcdram_cap == NULL)
		return;

	for (inx = 0, ent = mcdram_cap; inx < mcdram_cap_cnt; inx++, ent++) {
		info("MCDRAM_CAP[%d]: nid:%u mcdram_cfg:%s",
		     inx, ent->nid, ent->mcdram_cfg);
	}
}
1109
/* Free an array of mcdram_cfg_t records and the string held by each one */
static void _mcdram_cfg_free(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt)
{
	int inx = 0;

	if (mcdram_cfg == NULL)
		return;

	while (inx < mcdram_cfg_cnt) {
		xfree(mcdram_cfg[inx].mcdram_cfg);
		inx++;
	}
	xfree(mcdram_cfg);
}
1121
/*
 * Free an array of mcdram_cfg2_t records along with the strings and node
 * bitmap held by each one
 */
static void _mcdram_cfg2_free(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt)
{
	int inx = 0;

	if (mcdram_cfg2 == NULL)
		return;

	while (inx < mcdram_cfg2_cnt) {
		xfree(mcdram_cfg2[inx].mcdram_cfg);
		FREE_NULL_BITMAP(mcdram_cfg2[inx].node_bitmap);
		xfree(mcdram_cfg2[inx].nid_str);
		inx++;
	}
	xfree(mcdram_cfg2);
}
1135
/* Log every record of an mcdram_cfg_t array, one line per record */
static void _mcdram_cfg_log(mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt)
{
	mcdram_cfg_t *ent;
	int inx;

	if (mcdram_cfg == NULL)
		return;

	for (inx = 0, ent = mcdram_cfg; inx < mcdram_cfg_cnt; inx++, ent++) {
		info("MCDRAM_CFG[%d]: nid:%u dram_size:%"PRIu64" mcdram_cfg:%s mcdram_pct:%u mcdram_size:%"PRIu64,
		     inx, ent->nid, ent->dram_size,
		     ent->mcdram_cfg, ent->mcdram_pct,
		     ent->mcdram_size);
	}
}
1149
/* Log every record of an mcdram_cfg2_t array, one line per record */
static void _mcdram_cfg2_log(mcdram_cfg2_t *mcdram_cfg2, int mcdram_cfg2_cnt)
{
	mcdram_cfg2_t *ent;
	int inx;

	if (mcdram_cfg2 == NULL)
		return;

	for (inx = 0, ent = mcdram_cfg2; inx < mcdram_cfg2_cnt; inx++, ent++) {
		info("MCDRAM_CFG[%d]: nid_str:%s mcdram_cfg:%s cache_pct:%d",
		     inx, ent->nid_str, ent->mcdram_cfg,
		     ent->cache_pct);
	}
}
1162
/* Free an array of numa_cap_t records and the string held by each one */
static void _numa_cap_free(numa_cap_t *numa_cap, int numa_cap_cnt)
{
	int inx = 0;

	if (numa_cap == NULL)
		return;

	while (inx < numa_cap_cnt) {
		xfree(numa_cap[inx].numa_cfg);
		inx++;
	}
	xfree(numa_cap);
}
1174
/* Log every record of a numa_cap_t array, one line per record */
static void _numa_cap_log(numa_cap_t *numa_cap, int numa_cap_cnt)
{
	numa_cap_t *ent;
	int inx;

	if (numa_cap == NULL)
		return;

	for (inx = 0, ent = numa_cap; inx < numa_cap_cnt; inx++, ent++) {
		info("NUMA_CAP[%d]: nid:%u numa_cfg:%s",
		     inx, ent->nid, ent->numa_cfg);
	}
}
1186
/* Free an array of numa_cfg_t records and the string held by each one */
static void _numa_cfg_free(numa_cfg_t *numa_cfg, int numa_cfg_cnt)
{
	int inx = 0;

	if (numa_cfg == NULL)
		return;

	while (inx < numa_cfg_cnt) {
		xfree(numa_cfg[inx].numa_cfg);
		inx++;
	}
	xfree(numa_cfg);
}
1198
/*
 * Free an array of numa_cfg2_t records along with the strings and node
 * bitmap held by each one
 */
static void _numa_cfg2_free(numa_cfg2_t *numa_cfg2, int numa_cfg2_cnt)
{
	int inx = 0;

	if (numa_cfg2 == NULL)
		return;

	while (inx < numa_cfg2_cnt) {
		xfree(numa_cfg2[inx].nid_str);
		xfree(numa_cfg2[inx].numa_cfg);
		FREE_NULL_BITMAP(numa_cfg2[inx].node_bitmap);
		inx++;
	}
	xfree(numa_cfg2);
}
1212
/* Log every record of a numa_cfg_t array, one line per record */
static void _numa_cfg_log(numa_cfg_t *numa_cfg, int numa_cfg_cnt)
{
	numa_cfg_t *ent;
	int inx;

	if (numa_cfg == NULL)
		return;

	for (inx = 0, ent = numa_cfg; inx < numa_cfg_cnt; inx++, ent++) {
		info("NUMA_CFG[%d]: nid:%u numa_cfg:%s",
		     inx, ent->nid, ent->numa_cfg);
	}
}
1224
/* Log every record of a numa_cfg2_t array, one line per record */
static void _numa_cfg2_log(numa_cfg2_t *numa_cfg2, int numa_cfg2_cnt)
{
	numa_cfg2_t *ent;
	int inx;

	if (numa_cfg2 == NULL)
		return;

	for (inx = 0, ent = numa_cfg2; inx < numa_cfg2_cnt; inx++, ent++) {
		info("NUMA_CFG[%d]: nid_str:%s numa_cfg:%s",
		     inx, ent->nid_str, ent->numa_cfg);
	}
}
1236
1237 /* Run a script and return its stdout plus exit status */
_run_script(char * cmd_path,char ** script_argv,int * status)1238 static char *_run_script(char *cmd_path, char **script_argv, int *status)
1239 {
1240 int cc, i, new_wait, resp_size = 0, resp_offset = 0;
1241 pid_t cpid;
1242 char *resp = NULL;
1243 int pfd[2] = { -1, -1 };
1244
1245 if (access(cmd_path, R_OK | X_OK) < 0) {
1246 error("%s: %s can not be executed: %m", __func__, cmd_path);
1247 *status = 127;
1248 resp = xstrdup("Slurm node_features/knl_cray configuration error");
1249 return resp;
1250 }
1251 if (pipe(pfd) != 0) {
1252 error("%s: pipe(): %m", __func__);
1253 *status = 127;
1254 resp = xstrdup("System error");
1255 return resp;
1256 }
1257
1258 if ((cpid = fork()) == 0) {
1259 cc = sysconf(_SC_OPEN_MAX);
1260 dup2(pfd[1], STDERR_FILENO);
1261 dup2(pfd[1], STDOUT_FILENO);
1262 for (i = 0; i < cc; i++) {
1263 if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
1264 close(i);
1265 }
1266 setpgid(0, 0);
1267 execv(cmd_path, script_argv);
1268 error("%s: execv(%s): %m", __func__, cmd_path);
1269 _exit(127);
1270 } else if (cpid < 0) {
1271 close(pfd[0]);
1272 close(pfd[1]);
1273 error("%s: fork(): %m", __func__);
1274 } else {
1275 struct pollfd fds;
1276 struct timeval tstart;
1277 resp_size = 1024;
1278 resp = xmalloc(resp_size);
1279 close(pfd[1]);
1280 gettimeofday(&tstart, NULL);
1281 while (1) {
1282 if (slurmctld_config.shutdown_time) {
1283 error("%s: killing %s operation on shutdown",
1284 __func__, script_argv[1]);
1285 break;
1286 }
1287 fds.fd = pfd[0];
1288 fds.events = POLLIN | POLLHUP | POLLRDHUP;
1289 fds.revents = 0;
1290 new_wait = capmc_timeout - _tot_wait(&tstart);
1291 if (new_wait <= 0) {
1292 error("%s: %s poll timeout @ %d msec",
1293 __func__, script_argv[1], capmc_timeout);
1294 break;
1295 }
1296 new_wait = MIN(new_wait, MAX_POLL_WAIT);
1297 i = poll(&fds, 1, new_wait);
1298 if (i == 0) {
1299 continue;
1300 } else if (i < 0) {
1301 error("%s: %s poll:%m", __func__,
1302 script_argv[1]);
1303 break;
1304 }
1305 if ((fds.revents & POLLIN) == 0)
1306 break;
1307 i = read(pfd[0], resp + resp_offset,
1308 resp_size - resp_offset);
1309 if (i == 0) {
1310 break;
1311 } else if (i < 0) {
1312 if (errno == EAGAIN)
1313 continue;
1314 error("%s: read(%s): %m", __func__, cmd_path);
1315 break;
1316 } else {
1317 resp_offset += i;
1318 if (resp_offset + 1024 >= resp_size) {
1319 resp_size *= 2;
1320 resp = xrealloc(resp, resp_size);
1321 }
1322 }
1323 }
1324 killpg(cpid, SIGTERM);
1325 usleep(10000);
1326 killpg(cpid, SIGKILL);
1327 waitpid(cpid, status, 0);
1328 close(pfd[0]);
1329 }
1330 return resp;
1331 }
1332
_merge_strings(char ** node_features,char * node_cfg,uint16_t allow_types)1333 static void _merge_strings(char **node_features, char *node_cfg,
1334 uint16_t allow_types)
1335 {
1336 char *tmp_str1, *tok1, *save_ptr1 = NULL;
1337 char *tmp_str2, *tok2, *save_ptr2 = NULL;
1338 bool mcdram_filter = false, numa_filter = false;
1339
1340 if ((node_cfg == NULL) || (node_cfg[0] == '\0'))
1341 return;
1342 if (*node_features == NULL) {
1343 *node_features = xstrdup(node_cfg);
1344 return;
1345 }
1346
1347 if ((allow_types & KNL_MCDRAM_FLAG) &&
1348 (allow_types != KNL_MCDRAM_FLAG))
1349 mcdram_filter = true;
1350 if ((allow_types & KNL_NUMA_FLAG) &&
1351 (allow_types != KNL_NUMA_FLAG))
1352 numa_filter = true;
1353
1354 /* Merge strings and avoid duplicates */
1355 tmp_str1 = xstrdup(node_cfg);
1356 tok1 = strtok_r(tmp_str1, ",", &save_ptr1);
1357 while (tok1) {
1358 bool match = false;
1359 if (mcdram_filter &&
1360 ((_knl_mcdram_token(tok1) & allow_types) == 0))
1361 goto next_tok;
1362 if (numa_filter &&
1363 ((_knl_numa_token(tok1) & allow_types) == 0))
1364 goto next_tok;
1365 tmp_str2 = xstrdup(*node_features);
1366 tok2 = strtok_r(tmp_str2, ",", &save_ptr2);
1367 while (tok2) {
1368 if (!xstrcmp(tok1, tok2)) {
1369 match = true;
1370 break;
1371 }
1372 tok2 = strtok_r(NULL, ",", &save_ptr2);
1373 }
1374 xfree(tmp_str2);
1375 if (!match)
1376 xstrfmtcat(*node_features, ",%s", tok1);
1377 next_tok: tok1 = strtok_r(NULL, ",", &save_ptr1);
1378 }
1379 xfree(tmp_str1);
1380 }
1381
/*
 * Put a node into DRAIN state with reason "Invalid KNL modes".
 *
 * If avail_node_bitmap does not exist yet the node record is modified
 * directly, otherwise the work is handed to drain_nodes().
 */
static void _make_node_down(node_record_t *node_ptr)
{
	if (!avail_node_bitmap) {
		/*
		 * In process of initial slurmctld startup,
		 * node data structures not completely built yet
		 */
		node_ptr->node_state |= NODE_STATE_DRAIN;
		/* Release any previous reason rather than leaking it */
		xfree(node_ptr->reason);
		node_ptr->reason = xstrdup("Invalid KNL modes");
		node_ptr->reason_time = time(NULL);
		node_ptr->reason_uid = getuid();
	} else {
		(void) drain_nodes(node_ptr->name, "Invalid KNL modes",
				   getuid());
	}
}
1398
1399 /*
1400 * Determine that the actual KNL mode matches the available and current node
1401 * features, otherwise DRAIN the node
1402 */
_validate_node_features(node_record_t * node_ptr)1403 static void _validate_node_features(node_record_t *node_ptr)
1404 {
1405 char *tmp_str, *tok, *save_ptr = NULL;
1406 uint16_t actual_mcdram = 0, actual_numa = 0;
1407 uint16_t config_mcdram = 0, config_numa = 0;
1408 uint16_t count_mcdram = 0, count_numa = 0;
1409 uint16_t tmp_mcdram, tmp_numa;
1410
1411 if (!node_ptr->features || IS_NODE_DOWN(node_ptr))
1412 return;
1413
1414 tmp_str = xstrdup(node_ptr->features);
1415 tok = strtok_r(tmp_str, ",", &save_ptr);
1416 while (tok) {
1417 if ((tmp_mcdram = _knl_mcdram_token(tok))) {
1418 config_mcdram |= tmp_mcdram;
1419 count_mcdram++;
1420 } else if ((tmp_numa = _knl_numa_token(tok))) {
1421 config_numa |= tmp_numa;
1422 count_numa++;
1423 }
1424 tok = strtok_r(NULL, ",", &save_ptr);
1425 }
1426 xfree(tmp_str);
1427
1428 tmp_str = xstrdup(node_ptr->features_act);
1429 tok = strtok_r(tmp_str, ",", &save_ptr);
1430 while (tok) {
1431 if ((tmp_mcdram = _knl_mcdram_token(tok)))
1432 actual_mcdram |= tmp_mcdram;
1433 else if ((tmp_numa = _knl_numa_token(tok)))
1434 actual_numa |= tmp_numa;
1435 tok = strtok_r(NULL, ",", &save_ptr);
1436 }
1437 xfree(tmp_str);
1438
1439 if ((config_mcdram != actual_mcdram) || (count_mcdram != 1) ||
1440 (config_numa != actual_numa) || (count_numa != 1)) {
1441 _make_node_down(node_ptr);
1442 error("Invalid KNL modes on node %s", node_ptr->name);
1443 }
1444 }
1445
/*
 * Remove every KNL MCDRAM and NUMA token from a comma separated feature
 * string (used for nodes that are not KNL).
 *
 * IN/OUT node_feature - feature string; replaced by the filtered list only
 *			 if at least one token was removed (it becomes NULL
 *			 when nothing is left)
 * RET count of KNL tokens found
 */
static int _strip_knl_features(char **node_feature)
{
	char *scratch, *tok, *last = NULL;
	char *kept = NULL, *delim = "";
	int removed = 0;

	xassert(node_feature);
	if (!*node_feature)
		return removed;

	scratch = xstrdup(*node_feature);
	for (tok = strtok_r(scratch, ",", &last); tok;
	     tok = strtok_r(NULL, ",", &last)) {
		if (!_knl_mcdram_token(tok) && !_knl_numa_token(tok)) {
			xstrfmtcat(kept, "%s%s", delim, tok);
			delim = ",";
		} else {
			removed++;
		}
	}

	if (removed == 0) {
		/* Nothing stripped, keep the caller's string untouched */
		xfree(kept);
	} else {
		xfree(*node_feature);
		*node_feature = kept;
	}
	xfree(scratch);
	return removed;
}
1479
1480 /* Update features and features_act fields for ALL nodes based upon
1481 * its current configuration provided by capmc */
_update_all_node_features(mcdram_cap_t * mcdram_cap,int mcdram_cap_cnt,mcdram_cfg_t * mcdram_cfg,int mcdram_cfg_cnt,numa_cap_t * numa_cap,int numa_cap_cnt,numa_cfg_t * numa_cfg,int numa_cfg_cnt)1482 static void _update_all_node_features(
1483 mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
1484 mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
1485 numa_cap_t *numa_cap, int numa_cap_cnt,
1486 numa_cfg_t *numa_cfg, int numa_cfg_cnt)
1487 {
1488 node_record_t *node_ptr;
1489 char node_name[32], *prefix;
1490 int i, node_inx, numa_inx, width = 5;
1491 uint64_t mcdram_size;
1492
1493 if ((node_record_table_ptr == NULL) ||
1494 (node_record_table_ptr->name == NULL)) {
1495 prefix = xstrdup("nid");
1496 } else {
1497 prefix = xstrdup(node_record_table_ptr->name);
1498 for (i = 0; prefix[i]; i++) {
1499 if ((prefix[i] >= '0') && (prefix[i] <= '9')) {
1500 prefix[i] = '\0';
1501 width = 1;
1502 for (i++ ; prefix[i]; i++)
1503 width++;
1504 break;
1505 }
1506 }
1507 }
1508 if (mcdram_cap) {
1509 if (!knl_node_bitmap)
1510 knl_node_bitmap = bit_alloc(node_record_count);
1511 for (i = 0; i < mcdram_cap_cnt; i++) {
1512 snprintf(node_name, sizeof(node_name),
1513 "%s%.*d", prefix, width, mcdram_cap[i].nid);
1514 node_ptr = find_node_record(node_name);
1515 if (node_ptr) {
1516 node_inx = node_ptr - node_record_table_ptr;
1517 bit_set(knl_node_bitmap, node_inx);
1518 if (validate_mode == 0) {
1519 _merge_strings(&node_ptr->features,
1520 mcdram_cap[i].mcdram_cfg,
1521 allow_mcdram);
1522 }
1523 }
1524 }
1525 }
1526 if (mcdram_cfg) {
1527 for (i = 0; i < mcdram_cfg_cnt; i++) {
1528 snprintf(node_name, sizeof(node_name),
1529 "%s%.*d", prefix, width, mcdram_cfg[i].nid);
1530 if (!(node_ptr = find_node_record(node_name)))
1531 continue;
1532 mcdram_per_node[node_ptr - node_record_table_ptr] =
1533 mcdram_cfg[i].mcdram_size;
1534 _merge_strings(&node_ptr->features_act,
1535 mcdram_cfg[i].mcdram_cfg,
1536 allow_mcdram);
1537 mcdram_size = mcdram_cfg[i].mcdram_size *
1538 (100 - mcdram_cfg[i].mcdram_pct) / 100;
1539 if (!node_ptr->gres) {
1540 node_ptr->gres =
1541 xstrdup(node_ptr->config_ptr->gres);
1542 }
1543 gres_plugin_node_feature(node_ptr->name, "hbm",
1544 mcdram_size, &node_ptr->gres,
1545 &node_ptr->gres_list);
1546 }
1547 }
1548 if (numa_cap && (validate_mode == 0)) {
1549 for (i = 0; i < numa_cap_cnt; i++) {
1550 snprintf(node_name, sizeof(node_name),
1551 "%s%.*d", prefix, width, numa_cap[i].nid);
1552 node_ptr = find_node_record(node_name);
1553 if (node_ptr) {
1554 _merge_strings(&node_ptr->features,
1555 numa_cap[i].numa_cfg,
1556 allow_numa);
1557 }
1558 }
1559 }
1560 if (numa_cfg) {
1561 for (i = 0; i < numa_cfg_cnt; i++) {
1562 snprintf(node_name, sizeof(node_name),
1563 "%s%.*u", prefix, width, numa_cfg[i].nid);
1564 node_ptr = find_node_record(node_name);
1565 if (node_ptr) {
1566 _merge_strings(&node_ptr->features_act,
1567 numa_cfg[i].numa_cfg,
1568 allow_numa);
1569 numa_inx = _knl_numa_inx(numa_cfg[i].numa_cfg);
1570 if ((numa_inx >= 0) && cpu_bind[numa_inx])
1571 node_ptr->cpu_bind = cpu_bind[numa_inx];
1572 }
1573 }
1574 }
1575
1576 /*
1577 * Make sure that only nodes reported by "capmc get_mcdram_capabilities"
1578 * contain KNL features
1579 */
1580 for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
1581 i++, node_ptr++) {
1582 if (knl_node_bitmap && bit_test(knl_node_bitmap, i)) {
1583 if (validate_mode)
1584 _validate_node_features(node_ptr);
1585 continue;
1586 }
1587 node_inx = _strip_knl_features(&node_ptr->features) +
1588 _strip_knl_features(&node_ptr->features_act);
1589 if (node_inx) {
1590 error("Removed KNL features from non-KNL node %s",
1591 node_ptr->name);
1592 }
1593 if (!node_ptr->gres)
1594 node_ptr->gres = xstrdup(node_ptr->config_ptr->gres);
1595 gres_plugin_node_feature(node_ptr->name, "hbm", 0,
1596 &node_ptr->gres, &node_ptr->gres_list);
1597 }
1598
1599 xfree(prefix);
1600 }
1601
1602 /*
1603 * Update a specific node's features and features_act fields based upon
1604 * its current configuration provided by capmc
1605 */
_update_node_features(node_record_t * node_ptr,mcdram_cap_t * mcdram_cap,int mcdram_cap_cnt,mcdram_cfg_t * mcdram_cfg,int mcdram_cfg_cnt,numa_cap_t * numa_cap,int numa_cap_cnt,numa_cfg_t * numa_cfg,int numa_cfg_cnt)1606 static void _update_node_features(node_record_t *node_ptr,
1607 mcdram_cap_t *mcdram_cap, int mcdram_cap_cnt,
1608 mcdram_cfg_t *mcdram_cfg, int mcdram_cfg_cnt,
1609 numa_cap_t *numa_cap, int numa_cap_cnt,
1610 numa_cfg_t *numa_cfg, int numa_cfg_cnt)
1611 {
1612 int i, nid, node_inx, numa_inx;
1613 char *end_ptr = "";
1614 uint64_t mcdram_size;
1615 bitstr_t *node_bitmap = NULL;
1616 bool is_knl = false;
1617
1618 xassert(node_ptr);
1619 nid = strtol(node_ptr->name + 3, &end_ptr, 10);
1620 if (end_ptr[0] != '\0') {
1621 error("%s: Invalid node name (%s)", __func__, node_ptr->name);
1622 return;
1623 }
1624
1625 _strip_knl_opts(&node_ptr->features);
1626 if (node_ptr->features && !node_ptr->features_act)
1627 node_ptr->features_act = xstrdup(node_ptr->features);
1628 _strip_knl_opts(&node_ptr->features_act);
1629
1630 if (mcdram_cap && (validate_mode == 0)) {
1631 for (i = 0; i < mcdram_cap_cnt; i++) {
1632 if (nid == mcdram_cap[i].nid) {
1633 _merge_strings(&node_ptr->features,
1634 mcdram_cap[i].mcdram_cfg,
1635 allow_mcdram);
1636 is_knl = true;
1637 break;
1638 }
1639 }
1640 }
1641
1642 if (mcdram_cfg) {
1643 for (i = 0; i < mcdram_cfg_cnt; i++) {
1644 if (nid != mcdram_cfg[i].nid)
1645 continue;
1646 _merge_strings(&node_ptr->features_act,
1647 mcdram_cfg[i].mcdram_cfg, allow_mcdram);
1648
1649 mcdram_per_node[node_ptr - node_record_table_ptr] =
1650 mcdram_cfg[i].mcdram_size;
1651 mcdram_size = mcdram_cfg[i].mcdram_size *
1652 (100 - mcdram_cfg[i].mcdram_pct) / 100;
1653 if (!node_ptr->gres) {
1654 node_ptr->gres =
1655 xstrdup(node_ptr->config_ptr->gres);
1656 }
1657 if (!node_ptr->gres) {
1658 node_ptr->gres =
1659 xstrdup(node_ptr->config_ptr->gres);
1660 }
1661 gres_plugin_node_feature(node_ptr->name, "hbm",
1662 mcdram_size, &node_ptr->gres,
1663 &node_ptr->gres_list);
1664 break;
1665 }
1666 }
1667 if (numa_cap && (validate_mode == 0)) {
1668 for (i = 0; i < numa_cap_cnt; i++) {
1669 if (nid == numa_cap[i].nid) {
1670 _merge_strings(&node_ptr->features,
1671 numa_cap[i].numa_cfg,
1672 allow_numa);
1673 break;
1674 }
1675 }
1676 }
1677 if (numa_cfg) {
1678 for (i = 0; i < numa_cfg_cnt; i++) {
1679 if (nid == numa_cfg[i].nid) {
1680 _merge_strings(&node_ptr->features_act,
1681 numa_cfg[i].numa_cfg,
1682 allow_numa);
1683 numa_inx = _knl_numa_inx(numa_cfg[i].numa_cfg);
1684 if ((numa_inx >= 0) && cpu_bind[numa_inx])
1685 node_ptr->cpu_bind = cpu_bind[numa_inx];
1686 break;
1687 }
1688 }
1689 }
1690
1691 /* Make sure that only nodes reported by "capmc get_mcdram_capabilities"
1692 * contain KNL features */
1693 if (is_knl) {
1694 if (validate_mode)
1695 _validate_node_features(node_ptr);
1696 } else {
1697 node_inx = _strip_knl_features(&node_ptr->features) +
1698 _strip_knl_features(&node_ptr->features_act);
1699 if (node_inx) {
1700 error("Removed KNL features from non-KNL node %s",
1701 node_ptr->name);
1702 }
1703 if (!node_ptr->gres) {
1704 node_ptr->gres =
1705 xstrdup(node_ptr->config_ptr->gres);
1706 }
1707 gres_plugin_node_feature(node_ptr->name, "hbm", 0,
1708 &node_ptr->gres, &node_ptr->gres_list);
1709 }
1710
1711 /* Update bitmaps and lists used by slurmctld for scheduling */
1712 node_bitmap = bit_alloc(node_record_count);
1713 bit_set(node_bitmap, (node_ptr - node_record_table_ptr));
1714 update_feature_list(active_feature_list, node_ptr->features_act,
1715 node_bitmap);
1716 (void) node_features_p_node_update(node_ptr->features_act, node_bitmap);
1717 FREE_NULL_BITMAP(node_bitmap);
1718 }
1719
_make_uid_array(char * uid_str)1720 static void _make_uid_array(char *uid_str)
1721 {
1722 char *save_ptr = NULL, *tmp_str, *tok;
1723 int i, uid_cnt = 0;
1724
1725 if (!uid_str)
1726 return;
1727
1728 /* Count the number of users */
1729 for (i = 0; uid_str[i]; i++) {
1730 if (uid_str[i] == ',')
1731 uid_cnt++;
1732 }
1733 uid_cnt++;
1734
1735 allowed_uid = xmalloc(sizeof(uid_t) * uid_cnt);
1736 allowed_uid_cnt = 0;
1737 tmp_str = xstrdup(uid_str);
1738 tok = strtok_r(tmp_str, ",", &save_ptr);
1739 while (tok) {
1740 if (uid_from_string(tok, &allowed_uid[allowed_uid_cnt++]) < 0)
1741 error("knl_cray.conf: Invalid AllowUserBoot: %s", tok);
1742 tok = strtok_r(NULL, ",", &save_ptr);
1743 }
1744 xfree(tmp_str);
1745 }
1746
_make_uid_str(uid_t * uid_array,int uid_cnt)1747 static char *_make_uid_str(uid_t *uid_array, int uid_cnt)
1748 {
1749 char *sep = "", *tmp_str = NULL, *uid_str = NULL;
1750 int i;
1751
1752 if (allowed_uid_cnt == 0) {
1753 uid_str = xstrdup("ALL");
1754 return uid_str;
1755 }
1756
1757 for (i = 0; i < uid_cnt; i++) {
1758 tmp_str = uid_to_string(uid_array[i]);
1759 xstrfmtcat(uid_str, "%s%s(%d)", sep, tmp_str, uid_array[i]);
1760 xfree(tmp_str);
1761 sep = ",";
1762 }
1763
1764 return uid_str;
1765 }
1766
1767 /* Watch for Uncorrectable Memory Errors. Notify jobs if any detected */
_ume_agent(void * args)1768 static void *_ume_agent(void *args)
1769 {
1770 struct timespec req;
1771 int i, mc_num, csrow_num, ue_count, last_ue_count = -1;
1772 int *fd = NULL, fd_cnt = 0, fd_size = 0, ume_path_size;
1773 char buf[8], *ume_path;
1774 ssize_t rd_size;
1775
1776 /* Identify and open array of UME file descriptors */
1777 ume_path_size = strlen(mc_path) + 32;
1778 ume_path = xmalloc(ume_path_size);
1779 for (mc_num = 0; ; mc_num++) {
1780 for (csrow_num = 0; ; csrow_num++) {
1781 if (fd_cnt == fd_size) {
1782 fd_size += 64;
1783 fd = xrealloc(fd, sizeof(int) * fd_size);
1784 }
1785 snprintf(ume_path, ume_path_size,
1786 "%s/mc%d/csrow%d/ue_count",
1787 mc_path, mc_num, csrow_num);
1788 if ((fd[fd_cnt] = open(ume_path, 0)) >= 0)
1789 fd_cnt++;
1790 else
1791 break;
1792 }
1793 if (csrow_num == 0)
1794 break;
1795 }
1796 xfree(ume_path);
1797
1798 while (!shutdown_time) {
1799 /* Get current UME count */
1800 ue_count = 0;
1801 for (i = 0; i < fd_cnt; i++) {
1802 (void) lseek(fd[i], 0, SEEK_SET);
1803 rd_size = read(fd[i], buf, 7);
1804 if (rd_size <= 0)
1805 continue;
1806 buf[rd_size] = '\0';
1807 ue_count += atoi(buf);
1808 }
1809
1810 if (shutdown_time)
1811 break;
1812 /* If UME count changed, notify all steps */
1813 if ((last_ue_count < ue_count) && (last_ue_count != -1)) {
1814 i = ume_notify();
1815 error("UME error detected. Notified %d job steps", i);
1816 }
1817 last_ue_count = ue_count;
1818
1819 if (shutdown_time)
1820 break;
1821 /* Sleep before retry */
1822 req.tv_sec = ume_check_interval / USEC_IN_SEC;
1823 req.tv_nsec = (ume_check_interval % USEC_IN_SEC) *
1824 NSEC_IN_USEC;
1825 (void) nanosleep(&req, NULL);
1826 }
1827
1828 for (i = 0; i < fd_cnt; i++)
1829 (void) close(fd[i]);
1830 xfree(fd);
1831
1832 return NULL;
1833 }
1834
1835 /* Load configuration */
init(void)1836 extern int init(void)
1837 {
1838 char *allow_mcdram_str, *allow_numa_str, *allow_user_str;
1839 char *default_mcdram_str, *default_numa_str;
1840 char *knl_conf_file, *tmp_str = NULL;
1841 s_p_hashtbl_t *tbl;
1842 struct stat stat_buf;
1843 int i;
1844
1845 /* Set default values */
1846 allow_mcdram = KNL_MCDRAM_FLAG;
1847 allow_numa = KNL_NUMA_FLAG;
1848 xfree(allowed_uid);
1849 allowed_uid_cnt = 0;
1850 xfree(capmc_path);
1851 capmc_poll_freq = 45;
1852 capmc_timeout = DEFAULT_CAPMC_TIMEOUT;
1853 for (i = 0; i < KNL_NUMA_CNT; i++)
1854 cpu_bind[i] = 0;
1855 xfree(cnselect_path);
1856 debug_flag = false;
1857 default_mcdram = KNL_CACHE;
1858 default_numa = KNL_ALL2ALL;
1859 xfree(mc_path);
1860 for (i = 0; i < KNL_MCDRAM_CNT; i++)
1861 mcdram_pct[i] = -1;
1862 mcdram_set = 0;
1863 xfree(numa_cpu_bind);
1864 xfree(syscfg_path);
1865
1866 if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
1867 debug_flag = true;
1868
1869 knl_conf_file = get_extra_conf_path("knl_cray.conf");
1870 if ((stat(knl_conf_file, &stat_buf) == 0) &&
1871 (tbl = _config_make_tbl(knl_conf_file))) {
1872 if (s_p_get_string(&tmp_str, "AllowMCDRAM", tbl)) {
1873 allow_mcdram = _knl_mcdram_parse(tmp_str, ",");
1874 if (_knl_mcdram_bits_cnt(allow_mcdram) < 1) {
1875 fatal("knl_cray.conf: Invalid AllowMCDRAM=%s",
1876 tmp_str);
1877 }
1878 xfree(tmp_str);
1879 }
1880 if (s_p_get_string(&tmp_str, "AllowNUMA", tbl)) {
1881 allow_numa = _knl_numa_parse(tmp_str, ",");
1882 if (_knl_numa_bits_cnt(allow_numa) < 1) {
1883 fatal("knl_cray.conf: Invalid AllowNUMA=%s",
1884 tmp_str);
1885 }
1886 xfree(tmp_str);
1887 }
1888 if (s_p_get_string(&tmp_str, "AllowUserBoot", tbl)) {
1889 _make_uid_array(tmp_str);
1890 xfree(tmp_str);
1891 }
1892 (void) s_p_get_uint32(&boot_time, "BootTime", tbl);
1893 (void) s_p_get_string(&capmc_path, "CapmcPath", tbl);
1894 (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl);
1895 (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl);
1896 (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl);
1897 (void) s_p_get_string(&cnselect_path, "CnselectPath", tbl);
1898 if (s_p_get_string(&tmp_str, "DefaultMCDRAM", tbl)) {
1899 default_mcdram = _knl_mcdram_parse(tmp_str, ",");
1900 if (_knl_mcdram_bits_cnt(default_mcdram) != 1) {
1901 fatal("knl_cray.conf: Invalid DefaultMCDRAM=%s",
1902 tmp_str);
1903 }
1904 xfree(tmp_str);
1905 }
1906 if (s_p_get_string(&tmp_str, "DefaultNUMA", tbl)) {
1907 default_numa = _knl_numa_parse(tmp_str, ",");
1908 if (_knl_numa_bits_cnt(default_numa) != 1) {
1909 fatal("knl_cray.conf: Invalid DefaultNUMA=%s",
1910 tmp_str);
1911 }
1912 xfree(tmp_str);
1913 }
1914 (void) s_p_get_string(&mc_path, "McPath", tbl);
1915 (void) s_p_get_uint32(&node_reboot_weight, "NodeRebootWeight",
1916 tbl);
1917 if (s_p_get_string(&numa_cpu_bind, "NumaCpuBind", tbl))
1918 _update_cpu_bind();
1919 (void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
1920 (void) s_p_get_uint32(&ume_check_interval, "UmeCheckInterval",
1921 tbl);
1922 (void) s_p_get_uint32(&validate_mode, "ValidateMode", tbl);
1923 s_p_hashtbl_destroy(tbl);
1924 } else {
1925 error("something wrong with opening/reading knl_cray.conf");
1926 }
1927 xfree(knl_conf_file);
1928 if (!capmc_path)
1929 capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc");
1930 capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT);
1931 if (!cnselect_path)
1932 cnselect_path = xstrdup("/opt/cray/sdb/default/bin/cnselect");
1933 if (!mc_path)
1934 mc_path = xstrdup("/sys/devices/system/edac/mc");
1935 if (!syscfg_path)
1936 verbose("SyscfgPath is not configured");
1937
1938 if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES) {
1939 allow_mcdram_str = _knl_mcdram_str(allow_mcdram);
1940 allow_numa_str = _knl_numa_str(allow_numa);
1941 allow_user_str = _make_uid_str(allowed_uid, allowed_uid_cnt);
1942 default_mcdram_str = _knl_mcdram_str(default_mcdram);
1943 default_numa_str = _knl_numa_str(default_numa);
1944 info("AllowMCDRAM=%s AllowNUMA=%s",
1945 allow_mcdram_str, allow_numa_str);
1946 info("AllowUserBoot=%s", allow_user_str);
1947 info("BootTIme=%u", boot_time);
1948 info("CapmcPath=%s", capmc_path);
1949 info("CapmcPollFreq=%u sec", capmc_poll_freq);
1950 info("CapmcRetries=%u", capmc_retries);
1951 info("CapmcTimeout=%u msec", capmc_timeout);
1952 info("CnselectPath=%s", cnselect_path);
1953 info("DefaultMCDRAM=%s DefaultNUMA=%s",
1954 default_mcdram_str, default_numa_str);
1955 info("McPath=%s", mc_path);
1956 info("NodeRebootWeight=%u", node_reboot_weight);
1957 info("NumaCpuBind=%s", numa_cpu_bind);
1958 info("SyscfgPath=%s", syscfg_path);
1959 info("UmeCheckInterval=%u", ume_check_interval);
1960 info("ValidateMode=%u", validate_mode);
1961 xfree(allow_mcdram_str);
1962 xfree(allow_numa_str);
1963 xfree(allow_user_str);
1964 xfree(default_mcdram_str);
1965 xfree(default_numa_str);
1966 }
1967 gres_plugin_add("hbm");
1968
1969 if (ume_check_interval && running_in_slurmd()) {
1970 slurm_mutex_lock(&ume_mutex);
1971 slurm_thread_create(&ume_thread, _ume_agent, NULL);
1972 slurm_mutex_unlock(&ume_mutex);
1973 }
1974
1975 slurm_mutex_lock(&queue_mutex);
1976 if (queue_thread == 0) {
1977 /* since we do a join on this later we don't make it detached */
1978 slurm_thread_create(&queue_thread, _queue_agent, NULL);
1979 }
1980 slurm_mutex_unlock(&queue_mutex);
1981
1982 return SLURM_SUCCESS;
1983 }
1984
1985 /* Release allocated memory */
fini(void)1986 extern int fini(void)
1987 {
1988 shutdown_time = time(NULL);
1989 slurm_mutex_lock(&ume_mutex);
1990 if (ume_thread) {
1991 pthread_join(ume_thread, NULL);
1992 ume_thread = 0;
1993 }
1994 slurm_mutex_unlock(&ume_mutex);
1995 pthread_join(queue_thread, NULL);
1996 slurm_mutex_lock(&queue_mutex);
1997 xfree(node_list_queue); /* just drop requessts */
1998 shutdown_time = (time_t) 0;
1999 queue_thread = 0;
2000 slurm_mutex_unlock(&queue_mutex);
2001
2002 xfree(allowed_uid);
2003 allowed_uid_cnt = 0;
2004 xfree(capmc_path);
2005 xfree(cnselect_path);
2006 capmc_timeout = 0;
2007 debug_flag = false;
2008 xfree(mc_path);
2009 xfree(mcdram_per_node);
2010 xfree(numa_cpu_bind);
2011 xfree(syscfg_path);
2012 FREE_NULL_BITMAP(knl_node_bitmap);
2013
2014 return SLURM_SUCCESS;
2015 }
2016
2017 /* Reload configuration */
node_features_p_reconfig(void)2018 extern int node_features_p_reconfig(void)
2019 {
2020 slurm_mutex_lock(&config_mutex);
2021 reconfig = true;
2022 slurm_mutex_unlock(&config_mutex);
2023 return SLURM_SUCCESS;
2024 }
2025
2026 /* Put any nodes NOT found by "capmc node_status" into DRAIN state */
_check_node_status(void)2027 static void _check_node_status(void)
2028 {
2029 json_object *j_obj;
2030 json_object_iter iter;
2031 json_object *j_array = NULL;
2032 json_object *j_value;
2033 char *resp_msg, **script_argv;
2034 int i, nid, num_ent, retry, status = 0;
2035 node_record_t *node_ptr;
2036 bitstr_t *capmc_node_bitmap = NULL;
2037 DEF_TIMERS;
2038
2039 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2040 script_argv[0] = xstrdup("capmc");
2041 script_argv[1] = xstrdup("node_status");
2042 for (retry = 0; ; retry++) {
2043 START_TIMER;
2044 resp_msg = _run_script(capmc_path, script_argv, &status);
2045 END_TIMER;
2046 if (debug_flag)
2047 info("%s: node_status ran for %s", __func__, TIME_STR);
2048 _log_script_argv(script_argv, resp_msg);
2049 if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2050 break; /* Success */
2051 error("%s: node_status status:%u response:%s",
2052 __func__, status, resp_msg);
2053 if (resp_msg == NULL) {
2054 info("%s: node_status returned no information",
2055 __func__);
2056 _free_script_argv(script_argv);
2057 return;
2058 }
2059 if (strstr(resp_msg, "Could not lookup") &&
2060 (retry <= capmc_retries)) {
2061 /* State Manager is down. Sleep and retry */
2062 sleep(1);
2063 xfree(resp_msg);
2064 } else {
2065 xfree(resp_msg);
2066 _free_script_argv(script_argv);
2067 return;
2068 }
2069 }
2070 _free_script_argv(script_argv);
2071
2072 j_obj = json_tokener_parse(resp_msg);
2073 if (j_obj == NULL) {
2074 error("%s: json parser failed on %s", __func__, resp_msg);
2075 xfree(resp_msg);
2076 return;
2077 }
2078 xfree(resp_msg);
2079
2080 capmc_node_bitmap = bit_alloc(100000);
2081 json_object_object_foreachC(j_obj, iter) {
2082 /* NOTE: The error number "e" and message "err_msg"
2083 * fields are currently ignored. */
2084 if (!xstrcmp(iter.key, "e") ||
2085 !xstrcmp(iter.key, "err_msg"))
2086 continue;
2087 if (json_object_get_type(iter.val) != json_type_array)
2088 continue;
2089 json_object_object_get_ex(j_obj, iter.key, &j_array);
2090 if (!j_array) {
2091 error("%s: Unable to parse nid specification",
2092 __func__);
2093 FREE_NULL_BITMAP(capmc_node_bitmap);
2094 return;
2095 }
2096 num_ent = json_object_array_length(j_array);
2097 for (i = 0; i < num_ent; i++) {
2098 j_value = json_object_array_get_idx(j_array, i);
2099 if (json_object_get_type(j_value) !=
2100 json_type_int) {
2101 error("%s: Unable to parse nid specification",
2102 __func__);
2103 } else {
2104 nid = json_object_get_int64(j_value);
2105 if ((nid >= 0) && (nid < 100000))
2106 bit_set(capmc_node_bitmap, nid);
2107 }
2108 }
2109 }
2110 json_object_put(j_obj); /* Frees json memory */
2111
2112 for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
2113 i++, node_ptr++) {
2114 nid = atoi(node_ptr->name + 3); /* Skip "nid" */
2115 if ((nid < 0) || (nid >= 100000) ||
2116 bit_test(capmc_node_bitmap, nid))
2117 continue;
2118 info("Node %s not found by \'capmc node_status\', draining it",
2119 node_ptr->name);
2120 if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr))
2121 continue;
2122 node_ptr->node_state |= NODE_STATE_DRAIN;
2123 xfree(node_ptr->reason);
2124 node_ptr->reason = xstrdup("Node not found by capmc");
2125 node_ptr->reason_time = time(NULL);
2126 node_ptr->reason_uid = slurm_get_slurm_user_id();
2127 if (avail_node_bitmap)
2128 bit_clear(avail_node_bitmap, i);
2129 }
2130 FREE_NULL_BITMAP(capmc_node_bitmap);
2131 }
2132
/*
 * Put any disabled nodes into DRAIN state.
 *
 * Currently a placeholder: the function body contains no code, only the
 * implementation plan below.
 */
static void _check_node_disabled(void)
{
	/* FIXME: To be added
	 *
	 * STEP 0 (for testing), disable/enable nodes:
	 * > xtcli disable ${TARGET_NODE}
	 * > xtcli enable ${TARGET_NODE}
	 *
	 * STEP 1: Identify disabled compute nodes
	 * > xtshow --compute --disabled
	 * L1s ...
	 * L0s ...
	 * Nodes ...
	 * c0-0c0s7n0: -| disabled [noflags|]
	 * SeaStars ...
	 * Links ...
	 * c1-0c2s1s1l1: -| disabled [noflags|]
	 *
	 * STEP 2: Map cname to nid name
	 * > rtr -Im ${TARGET_BLADE}
	 *
	 * STEP 3: Drain the disabled compute nodes
	 * See logic in _check_node_status() above.
	 */
}
2159
2160 /* Periodically update node information for specified nodes. We can't do this
2161 * work in real-time since capmc takes multiple seconds to execute. */
_queue_agent(void * args)2162 extern void *_queue_agent(void *args)
2163 {
2164 char *node_list;
2165
2166 while (shutdown_time == 0) {
2167 sleep(1);
2168 if (shutdown_time)
2169 break;
2170
2171 if (node_list_queue &&
2172 (difftime(time(NULL), node_time_queue) >= 30)) {
2173 slurm_mutex_lock(&queue_mutex);
2174 node_list = node_list_queue;
2175 node_list_queue = NULL;
2176 node_time_queue = (time_t) 0;
2177 slurm_mutex_unlock(&queue_mutex);
2178 (void) _update_node_state(node_list, true);
2179 xfree(node_list);
2180 }
2181 }
2182
2183 return NULL;
2184 }
2185
2186 /* Queue request to update node information */
_queue_node_update(char * node_list)2187 static int _queue_node_update(char *node_list)
2188 {
2189 slurm_mutex_lock(&queue_mutex);
2190 if (node_time_queue == 0)
2191 node_time_queue = time(NULL);
2192 if (node_list_queue)
2193 xstrcat(node_list_queue, ",");
2194 xstrcat(node_list_queue, node_list);
2195 slurm_mutex_unlock(&queue_mutex);
2196
2197 return SLURM_SUCCESS;
2198 }
2199
2200 /* Update active and available features on specified nodes.
2201 * If node_list is NULL then update ALL nodes now.
2202 * If node_list is not NULL, then queue a request to update select nodes later.
2203 */
node_features_p_get_node(char * node_list)2204 extern int node_features_p_get_node(char *node_list)
2205 {
2206 if (node_list && /* Selected node to be update */
2207 mcdram_per_node && /* and needed global info is */
2208 (mcdram_pct[0] != -1)) /* already available */
2209 return _queue_node_update(node_list);
2210
2211 return _update_node_state(node_list, false);
2212 }
2213
_update_node_state(char * node_list,bool set_locks)2214 static int _update_node_state(char *node_list, bool set_locks)
2215 {
2216 json_object *j;
2217 json_object_iter iter;
2218 int i, k, rc = SLURM_SUCCESS, retry, status = 0;
2219 DEF_TIMERS;
2220 char *resp_msg, **script_argv;
2221 mcdram_cap_t *mcdram_cap = NULL;
2222 mcdram_cfg_t *mcdram_cfg = NULL;
2223 mcdram_cfg2_t *mcdram_cfg2 = NULL;
2224 numa_cap_t *numa_cap = NULL;
2225 numa_cfg_t *numa_cfg = NULL;
2226 numa_cfg2_t *numa_cfg2 = NULL;
2227 int mcdram_cap_cnt = 0, mcdram_cfg_cnt = 0, mcdram_cfg2_cnt = 0;
2228 int numa_cap_cnt = 0, numa_cfg_cnt = 0, numa_cfg2_cnt = 0;
2229 node_record_t *node_ptr;
2230 hostlist_t host_list;
2231 char *node_name;
2232
2233 slurm_mutex_lock(&config_mutex);
2234 if (reconfig) {
2235 (void) init();
2236 reconfig = false;
2237 }
2238 slurm_mutex_unlock(&config_mutex);
2239
2240 _check_node_status(); /* Drain nodes not found by capmc */
2241 _check_node_disabled(); /* Drain disabled nodes */
2242
2243 if (!mcdram_per_node)
2244 mcdram_per_node = xmalloc(sizeof(uint64_t) * node_record_count);
2245
2246 /*
2247 * Load available MCDRAM capabilities
2248 */
2249 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2250 script_argv[0] = xstrdup("capmc");
2251 script_argv[1] = xstrdup("get_mcdram_capabilities");
2252 for (retry = 0; ; retry++) {
2253 START_TIMER;
2254 resp_msg = _run_script(capmc_path, script_argv, &status);
2255 END_TIMER;
2256 if (debug_flag) {
2257 info("%s: get_mcdram_capabilities ran for %s",
2258 __func__, TIME_STR);
2259 }
2260 _log_script_argv(script_argv, resp_msg);
2261 if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2262 break; /* Success */
2263 error("%s: get_mcdram_capabilities status:%u response:%s",
2264 __func__, status, resp_msg);
2265 if (resp_msg == NULL) {
2266 info("%s: get_mcdram_capabilities returned no information",
2267 __func__);
2268 _free_script_argv(script_argv);
2269 rc = SLURM_ERROR;
2270 goto fini;
2271 }
2272 if (strstr(resp_msg, "Could not lookup") &&
2273 (retry <= capmc_retries)) {
2274 /* State Manager is down. Sleep and retry */
2275 sleep(1);
2276 xfree(resp_msg);
2277 } else {
2278 xfree(resp_msg);
2279 _free_script_argv(script_argv);
2280 rc = SLURM_ERROR;
2281 goto fini;
2282 }
2283 }
2284 _free_script_argv(script_argv);
2285
2286 j = json_tokener_parse(resp_msg);
2287 if (j == NULL) {
2288 error("%s: json parser failed on %s", __func__, resp_msg);
2289 xfree(resp_msg);
2290 rc = SLURM_ERROR;
2291 goto fini;
2292 }
2293 xfree(resp_msg);
2294 json_object_object_foreachC(j, iter) {
2295 if (xstrcmp(iter.key, "nids"))
2296 continue;
2297 mcdram_cap = _json_parse_mcdram_cap_array(j, iter.key,
2298 &mcdram_cap_cnt);
2299 break;
2300 }
2301 json_object_put(j); /* Frees json memory */
2302
2303 /*
2304 * Load current MCDRAM configuration
2305 */
2306 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2307 script_argv[0] = xstrdup("capmc");
2308 script_argv[1] = xstrdup("get_mcdram_cfg");
2309 for (retry = 0; ; retry++) {
2310 START_TIMER;
2311 resp_msg = _run_script(capmc_path, script_argv, &status);
2312 END_TIMER;
2313 if (debug_flag) {
2314 info("%s: get_mcdram_cfg ran for %s",
2315 __func__, TIME_STR);
2316 }
2317 _log_script_argv(script_argv, resp_msg);
2318 if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2319 break; /* Success */
2320 error("%s: get_mcdram_cfg status:%u response:%s",
2321 __func__, status, resp_msg);
2322 if (resp_msg == NULL) {
2323 info("%s: get_mcdram_cfg returned no information",
2324 __func__);
2325 _free_script_argv(script_argv);
2326 rc = SLURM_ERROR;
2327 goto fini;
2328 }
2329 if (strstr(resp_msg, "Could not lookup") &&
2330 (retry <= capmc_retries)) {
2331 /* State Manager is down. Sleep and retry */
2332 sleep(1);
2333 xfree(resp_msg);
2334 } else {
2335 xfree(resp_msg);
2336 _free_script_argv(script_argv);
2337 rc = SLURM_ERROR;
2338 goto fini;
2339 }
2340 }
2341 _free_script_argv(script_argv);
2342
2343 j = json_tokener_parse(resp_msg);
2344 if (j == NULL) {
2345 error("%s: json parser failed on %s", __func__, resp_msg);
2346 xfree(resp_msg);
2347 rc = SLURM_ERROR;
2348 goto fini;
2349 }
2350 xfree(resp_msg);
2351 json_object_object_foreachC(j, iter) {
2352 if (xstrcmp(iter.key, "nids"))
2353 continue;
2354 mcdram_cfg = _json_parse_mcdram_cfg_array(j, iter.key,
2355 &mcdram_cfg_cnt);
2356 break;
2357 }
2358 json_object_put(j); /* Frees json memory */
2359
2360 mcdram_cfg2 = _load_current_mcdram(&mcdram_cfg2_cnt);
2361
2362 /*
2363 * Load available NUMA capabilities
2364 */
2365 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2366 script_argv[0] = xstrdup("capmc");
2367 script_argv[1] = xstrdup("get_numa_capabilities");
2368 for (retry = 0; ; retry++) {
2369 START_TIMER;
2370 resp_msg = _run_script(capmc_path, script_argv, &status);
2371 END_TIMER;
2372 if (debug_flag) {
2373 info("%s: get_numa_capabilities ran for %s",
2374 __func__, TIME_STR);
2375 }
2376 _log_script_argv(script_argv, resp_msg);
2377 if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2378 break; /* Success */
2379 error("%s: get_numa_capabilities status:%u response:%s",
2380 __func__, status, resp_msg);
2381 if (resp_msg == NULL) {
2382 info("%s: get_numa_capabilities returned no information",
2383 __func__);
2384 _free_script_argv(script_argv);
2385 rc = SLURM_ERROR;
2386 goto fini;
2387 }
2388 if (strstr(resp_msg, "Could not lookup") &&
2389 (retry <= capmc_retries)) {
2390 /* State Manager is down. Sleep and retry */
2391 sleep(1);
2392 xfree(resp_msg);
2393 } else {
2394 xfree(resp_msg);
2395 _free_script_argv(script_argv);
2396 rc = SLURM_ERROR;
2397 goto fini;
2398 }
2399 }
2400 _free_script_argv(script_argv);
2401
2402 j = json_tokener_parse(resp_msg);
2403 if (j == NULL) {
2404 error("%s: json parser failed on %s", __func__, resp_msg);
2405 xfree(resp_msg);
2406 rc = SLURM_ERROR;
2407 goto fini;
2408 }
2409 xfree(resp_msg);
2410 json_object_object_foreachC(j, iter) {
2411 if (xstrcmp(iter.key, "nids"))
2412 continue;
2413 numa_cap = _json_parse_numa_cap_array(j, iter.key,
2414 &numa_cap_cnt);
2415 break;
2416 }
2417 json_object_put(j); /* Frees json memory */
2418
2419 /*
2420 * Load current NUMA configuration
2421 */
2422 script_argv = xmalloc(sizeof(char *) * 4); /* NULL terminated */
2423 script_argv[0] = xstrdup("capmc");
2424 script_argv[1] = xstrdup("get_numa_cfg");
2425 for (retry = 0; ; retry++) {
2426 START_TIMER;
2427 resp_msg = _run_script(capmc_path, script_argv, &status);
2428 END_TIMER;
2429 if (debug_flag)
2430 info("%s: get_numa_cfg ran for %s", __func__, TIME_STR);
2431 _log_script_argv(script_argv, resp_msg);
2432 if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
2433 break; /* Success */
2434 error("%s: get_numa_cfg status:%u response:%s",
2435 __func__, status, resp_msg);
2436 if (resp_msg == NULL) {
2437 info("%s: get_numa_cfg returned no information",
2438 __func__);
2439 _free_script_argv(script_argv);
2440 rc = SLURM_ERROR;
2441 goto fini;
2442 }
2443 if (strstr(resp_msg, "Could not lookup") &&
2444 (retry <= capmc_retries)) {
2445 /* State Manager is down. Sleep and retry */
2446 sleep(1);
2447 xfree(resp_msg);
2448 } else {
2449 xfree(resp_msg);
2450 _free_script_argv(script_argv);
2451 rc = SLURM_ERROR;
2452 goto fini;
2453 }
2454 }
2455 _free_script_argv(script_argv);
2456
2457 j = json_tokener_parse(resp_msg);
2458 if (j == NULL) {
2459 error("%s: json parser failed on %s", __func__, resp_msg);
2460 xfree(resp_msg);
2461 rc = SLURM_ERROR;
2462 goto fini;
2463 }
2464 xfree(resp_msg);
2465 json_object_object_foreachC(j, iter) {
2466 if (xstrcmp(iter.key, "nids"))
2467 continue;
2468 numa_cfg = _json_parse_numa_cfg_array(j, iter.key,
2469 &numa_cfg_cnt);
2470 break;
2471 }
2472 json_object_put(j); /* Frees json memory */
2473
2474 numa_cfg2 = _load_current_numa(&numa_cfg2_cnt);
2475
2476 if (debug_flag) {
2477 _mcdram_cap_log(mcdram_cap, mcdram_cap_cnt);
2478 _mcdram_cfg_log(mcdram_cfg, mcdram_cfg_cnt);
2479 _mcdram_cfg2_log(mcdram_cfg2, mcdram_cfg2_cnt);
2480 _numa_cap_log(numa_cap, numa_cap_cnt);
2481 _numa_cfg_log(numa_cfg, numa_cfg_cnt);
2482 _numa_cfg2_log(numa_cfg2, numa_cfg2_cnt);
2483 }
2484 for (i = 0; i < mcdram_cfg_cnt; i++) {
2485 for (k = 0; k < mcdram_cfg2_cnt; k++) {
2486 if (!mcdram_cfg2[k].node_bitmap ||
2487 !bit_test(mcdram_cfg2[k].node_bitmap,
2488 mcdram_cfg[i].nid))
2489 continue;
2490 if (mcdram_cfg[i].mcdram_pct !=
2491 mcdram_cfg2[k].cache_pct) {
2492 if (mcdram_cfg[i].mcdram_pct == NO_VAL16) {
2493 info("%s: No mcdram_pct from capmc for nid %u",
2494 __func__, mcdram_cfg[i].nid);
2495 } else {
2496 info("%s: HBM mismatch between capmc "
2497 "and cnselect for nid %u (%u != %d)",
2498 __func__, mcdram_cfg[i].nid,
2499 mcdram_cfg[i].mcdram_pct,
2500 mcdram_cfg2[k].cache_pct);
2501 }
2502 mcdram_cfg[i].mcdram_pct =
2503 mcdram_cfg2[k].cache_pct;
2504 xfree(mcdram_cfg[i].mcdram_cfg);
2505 mcdram_cfg[i].mcdram_cfg =
2506 xstrdup(mcdram_cfg2[k].mcdram_cfg);
2507 }
2508 break;
2509 }
2510 }
2511 for (i = 0; i < numa_cfg_cnt; i++) {
2512 for (k = 0; k < numa_cfg2_cnt; k++) {
2513 if (!numa_cfg2[k].node_bitmap ||
2514 !bit_test(numa_cfg2[k].node_bitmap,
2515 numa_cfg[i].nid))
2516 continue;
2517 if (xstrcmp(numa_cfg[i].numa_cfg,
2518 numa_cfg2[k].numa_cfg)) {
2519 if (!numa_cfg[i].numa_cfg) {
2520 info("%s: No numa_cfg from capmc for nid %u",
2521 __func__, numa_cfg[i].nid);
2522 } else {
2523 info("%s: NUMA mismatch between capmc "
2524 "and cnselect for nid %u (%s != %s)",
2525 __func__, numa_cfg[i].nid,
2526 numa_cfg[i].numa_cfg,
2527 numa_cfg2[k].numa_cfg);
2528 }
2529 xfree(numa_cfg[i].numa_cfg);
2530 numa_cfg[i].numa_cfg =
2531 xstrdup(numa_cfg2[k].numa_cfg);
2532 }
2533 break;
2534 }
2535 }
2536
2537 START_TIMER;
2538 if (node_list) {
2539 /* Write nodes */
2540 slurmctld_lock_t write_nodes_lock = {
2541 NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK};
2542
2543 if ((host_list = hostlist_create(node_list)) == NULL) {
2544 error ("hostlist_create error on %s: %m", node_list);
2545 goto fini;
2546 }
2547 hostlist_uniq(host_list);
2548
2549 if (set_locks)
2550 lock_slurmctld(write_nodes_lock);
2551 while ((node_name = hostlist_shift(host_list))) {
2552 node_ptr = find_node_record(node_name);
2553 if (node_ptr) {
2554 _update_node_features(node_ptr,
2555 mcdram_cap,mcdram_cap_cnt,
2556 mcdram_cfg,mcdram_cfg_cnt,
2557 numa_cap, numa_cap_cnt,
2558 numa_cfg, numa_cfg_cnt);
2559 }
2560 free(node_name);
2561 }
2562 if (set_locks)
2563 unlock_slurmctld(write_nodes_lock);
2564 hostlist_destroy(host_list);
2565 } else {
2566 time_t now = time(NULL);
2567 for (i = 0, node_ptr = node_record_table_ptr;
2568 i < node_record_count; i++, node_ptr++) {
2569 if ((node_ptr->last_response > now) &&
2570 IS_NODE_NO_RESPOND(node_ptr)) {
2571 /*
2572 * Reboot likely in progress.
2573 * Preserve active KNL features and merge
2574 * with configured non-KNL features
2575 */
2576 _merge_strings(&node_ptr->features_act,
2577 node_ptr->features, 0);
2578 continue;
2579 }
2580 if (validate_mode == 0) {
2581 _strip_knl_opts(&node_ptr->features);
2582 xfree(node_ptr->features_act);
2583 if (node_ptr->features) {
2584 node_ptr->features_act =
2585 xstrdup(node_ptr->features);
2586 }
2587 } else {
2588 if (node_ptr->features) {
2589 node_ptr->features_act =
2590 xstrdup(node_ptr->features);
2591 }
2592 }
2593 }
2594 _update_all_node_features(mcdram_cap, mcdram_cap_cnt,
2595 mcdram_cfg, mcdram_cfg_cnt,
2596 numa_cap, numa_cap_cnt,
2597 numa_cfg, numa_cfg_cnt);
2598 }
2599 END_TIMER;
2600 if (debug_flag)
2601 info("%s: update_node_features ran for %s", __func__, TIME_STR);
2602
2603 last_node_update = time(NULL);
2604
2605 fini: _mcdram_cap_free(mcdram_cap, mcdram_cap_cnt);
2606 _mcdram_cfg_free(mcdram_cfg, mcdram_cfg_cnt);
2607 _mcdram_cfg2_free(mcdram_cfg2, mcdram_cfg2_cnt);
2608 _numa_cap_free(numa_cap, numa_cap_cnt);
2609 _numa_cfg_free(numa_cfg, numa_cfg_cnt);
2610 _numa_cfg2_free(numa_cfg2, numa_cfg2_cnt);
2611
2612 return rc;
2613 }
2614
/*
 * Get this node's current and available MCDRAM and NUMA settings from BIOS.
 * avail_modes IN/OUT - append available modes, must be xfreed
 * current_mode IN/OUT - append current modes, must be xfreed
 *
 * NOTE: Not applicable on Cray systems; can be used on other systems.
 *	 This implementation does nothing and leaves both arguments untouched.
 *
 * NOTES about syscfg (from Intel):
 * To display the BIOS Parameters:
 * >> syscfg /d biossettings <"BIOS variable Name">
 *
 * To Set the BIOS Parameters:
 * >> syscfg /bcs <AdminPw> <"BIOS variable Name"> <Value>
 * Note: If AdminPw is not set use ""
 */
extern void node_features_p_node_state(char **avail_modes, char **current_mode)
{
	/* Not applicable on Cray systems */
	(void) avail_modes;
	(void) current_mode;
}
2633
2634 /* Test if a job's feature specification is valid */
node_features_p_job_valid(char * job_features)2635 extern int node_features_p_job_valid(char *job_features)
2636 {
2637 uint16_t job_mcdram, job_numa;
2638 int mcdram_cnt, numa_cnt;
2639 int last_mcdram_cnt = 0, last_numa_cnt = 0;
2640 int rc = SLURM_SUCCESS;
2641 char last_sep = '\0', *tmp, *tok, *save_ptr = NULL;
2642
2643 if ((job_features == NULL) || (job_features[0] == '\0'))
2644 return SLURM_SUCCESS;
2645
2646 tmp = xstrdup(job_features);
2647 tok = strtok_r(tmp, "[]()|", &save_ptr);
2648 while (tok) {
2649 last_sep = tok[strlen(tok) - 1];
2650 job_mcdram = _knl_mcdram_parse(tok, "&,*");
2651 mcdram_cnt = _knl_mcdram_bits_cnt(job_mcdram) + last_mcdram_cnt;
2652 if (mcdram_cnt > 1) { /* Multiple ANDed MCDRAM options */
2653 rc = ESLURM_INVALID_KNL;
2654 break;
2655 }
2656
2657 job_numa = _knl_numa_parse(tok, "&,*");
2658 numa_cnt = _knl_numa_bits_cnt(job_numa) + last_numa_cnt;
2659 if (numa_cnt > 1) { /* Multiple ANDed NUMA options */
2660 rc = ESLURM_INVALID_KNL;
2661 break;
2662 }
2663 tok = strtok_r(NULL, "[]()|", &save_ptr);
2664 if (tok &&
2665 ((last_sep == '&') || /* e.g. "equal&(flat|cache)" */
2666 (tok[0] == '&'))) { /* e.g. "(flat|cache)&equal" */
2667 last_mcdram_cnt += mcdram_cnt;
2668 last_numa_cnt += numa_cnt;
2669 } else {
2670 last_mcdram_cnt = 0;
2671 last_numa_cnt = 0;
2672 }
2673 }
2674 xfree(tmp);
2675
2676 return rc;
2677 }
2678
/*
 * Translate a job's feature request to the node features needed at boot time.
 * If multiple MCDRAM or NUMA values are ORed, pick the first ones; features
 * that are neither MCDRAM nor NUMA modes are dropped, as is any "*..."
 * suffix on a feature name.
 * IN job_features - job's --constraint specification
 * RET features required on node reboot (comma separated), NULL if none.
 *     Must xfree to release memory
 */
extern char *node_features_p_job_xlate(char *job_features)
{
	char *boot_features = NULL, *sep = "";
	char *work, *tok, *star, *save_ptr = NULL;
	bool have_mcdram = false, have_numa = false, keep;

	if (!job_features || (job_features[0] == '\0'))
		return boot_features;

	work = xstrdup(job_features);
	for (tok = strtok_r(work, "[]()|&", &save_ptr); tok;
	     tok = strtok_r(NULL, "[]()|&", &save_ptr)) {
		keep = false;

		/* Cut the token at '*' */
		star = strchr(tok, '*');
		if (star)
			*star = '\0';

		/* Only the first MCDRAM and the first NUMA mode are kept */
		if (_knl_mcdram_token(tok) && !have_mcdram) {
			have_mcdram = true;
			keep = true;
		}
		if (_knl_numa_token(tok) && !have_numa) {
			have_numa = true;
			keep = true;
		}
		if (!keep)
			continue;

		xstrfmtcat(boot_features, "%s%s", sep, tok);
		sep = ",";
	}
	xfree(work);

	return boot_features;
}
2722
2723 /* Return bitmap of KNL nodes, NULL if none identified */
node_features_p_get_node_bitmap(void)2724 extern bitstr_t *node_features_p_get_node_bitmap(void)
2725 {
2726 if (knl_node_bitmap)
2727 return bit_copy(knl_node_bitmap);
2728 return NULL;
2729 }
2730
2731 /* Return count of overlaping bits in active_bitmap and knl_node_bitmap */
node_features_p_overlap(bitstr_t * active_bitmap)2732 extern int node_features_p_overlap(bitstr_t *active_bitmap)
2733 {
2734 int cnt = 0;
2735
2736 if (!knl_node_bitmap || !active_bitmap ||
2737 !(cnt = bit_overlap(active_bitmap, knl_node_bitmap)))
2738 return 0;
2739
2740 return cnt;
2741 }
2742
/* Return true if the plugin requires PowerSave mode for booting nodes.
 * This plugin always does. */
extern bool node_features_p_node_power(void)
{
	const bool needs_power_save = true;

	return needs_power_save;
}
2748
2749 /* Set's the node's active features based upon job constraints.
2750 * NOTE: Executed by the slurmd daemon.
2751 * NOTE: Not applicable for knl_cray plugin, reconfiguration done by slurmctld
2752 * IN active_features - New active features
2753 * RET error code */
node_features_p_node_set(char * active_features)2754 extern int node_features_p_node_set(char *active_features)
2755 {
2756 return SLURM_SUCCESS;
2757 }
2758
2759 /*
2760 * Note the active features associated with a set of nodes have been updated.
2761 * Specifically update the node's "hbm" GRES and "CpuBind" values as needed.
2762 * IN active_features - New active features
2763 * IN node_bitmap - bitmap of nodes changed
2764 * RET error code
2765 */
node_features_p_node_update(char * active_features,bitstr_t * node_bitmap)2766 extern int node_features_p_node_update(char *active_features,
2767 bitstr_t *node_bitmap)
2768 {
2769 int i, i_first, i_last;
2770 int rc = SLURM_SUCCESS, numa_inx = -1;
2771 int mcdram_inx = 0;
2772 uint64_t mcdram_size;
2773 node_record_t *node_ptr;
2774 char *save_ptr = NULL, *tmp, *tok;
2775
2776 if (mcdram_per_node == NULL)
2777 error("%s: mcdram_per_node == NULL", __func__);
2778
2779 if (active_features) {
2780 tmp = xstrdup(active_features);
2781 tok = strtok_r(tmp, ",", &save_ptr);
2782 while (tok) {
2783 if (numa_inx == -1)
2784 numa_inx = _knl_numa_inx(tok);
2785 mcdram_inx |= _knl_mcdram_token(tok);
2786 tok = strtok_r(NULL, ",", &save_ptr);
2787 }
2788 xfree(tmp);
2789 }
2790
2791 if (mcdram_inx >= 0) {
2792 for (i = 0; i < KNL_MCDRAM_CNT; i++) {
2793 if ((KNL_CACHE << i) == mcdram_inx)
2794 break;
2795 }
2796 if ((i >= KNL_MCDRAM_CNT) || (mcdram_pct[i] == -1))
2797 mcdram_inx = -1;
2798 else
2799 mcdram_inx = i;
2800 } else {
2801 mcdram_inx = -1;
2802 }
2803
2804 xassert(node_bitmap);
2805 i_first = bit_ffs(node_bitmap);
2806 if (i_first >= 0)
2807 i_last = bit_fls(node_bitmap);
2808 else
2809 i_last = i_first - 1;
2810 for (i = i_first; i <= i_last; i++) {
2811 if (!bit_test(node_bitmap, i))
2812 continue;
2813 if (i >= node_record_count) {
2814 error("%s: Invalid node index (%d >= %d)",
2815 __func__, i, node_record_count);
2816 rc = SLURM_ERROR;
2817 break;
2818 }
2819 node_ptr = node_record_table_ptr + i;
2820 if ((numa_inx >= 0) && cpu_bind[numa_inx])
2821 node_ptr->cpu_bind = cpu_bind[numa_inx];
2822 if (mcdram_per_node && (mcdram_inx >= 0)) {
2823 mcdram_size = mcdram_per_node[i] *
2824 (100 - mcdram_pct[mcdram_inx]) / 100;
2825 gres_plugin_node_feature(node_ptr->name, "hbm",
2826 mcdram_size, &node_ptr->gres,
2827 &node_ptr->gres_list);
2828 }
2829 }
2830
2831 return rc;
2832 }
2833
2834 /*
2835 * Return TRUE if the specified node update request is valid with respect
2836 * to features changes (i.e. don't permit a non-KNL node to set KNL features).
2837 *
2838 * arg IN - Pointer to node_record_t record
2839 * update_node_msg IN - Pointer to update request
2840 */
node_features_p_node_update_valid(void * arg,update_node_msg_t * update_node_msg)2841 extern bool node_features_p_node_update_valid(void *arg,
2842 update_node_msg_t *update_node_msg)
2843 {
2844 node_record_t *node_ptr = (node_record_t *) arg;
2845 char *tmp, *save_ptr = NULL, *tok;
2846 bool is_knl = false, invalid_feature = false;
2847
2848 /* No feature changes */
2849 if (!update_node_msg->features && !update_node_msg->features_act)
2850 return true;
2851
2852 /* Determine if this is KNL node based upon current features */
2853 if (node_ptr->features && node_ptr->features[0]) {
2854 tmp = xstrdup(node_ptr->features);
2855 tok = strtok_r(tmp, ",", &save_ptr);
2856 while (tok) {
2857 if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2858 is_knl = true;
2859 break;
2860 }
2861 tok = strtok_r(NULL, ",", &save_ptr);
2862 }
2863 xfree(tmp);
2864 }
2865 if (is_knl)
2866 return true;
2867
2868 /* Validate that AvailableFeatures update request has no KNL modes */
2869 if (update_node_msg->features) {
2870 tmp = xstrdup(update_node_msg->features);
2871 tok = strtok_r(tmp, ",", &save_ptr);
2872 while (tok) {
2873 if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2874 invalid_feature = true;
2875 break;
2876 }
2877 tok = strtok_r(NULL, ",", &save_ptr);
2878 }
2879 xfree(tmp);
2880 if (invalid_feature) {
2881 info("Invalid AvailableFeatures update request (%s) for non-KNL node %s",
2882 update_node_msg->features, node_ptr->name);
2883 return false;
2884 }
2885 }
2886
2887 /* Validate that ActiveFeatures update request has no KNL modes */
2888 if (update_node_msg->features_act) {
2889 tmp = xstrdup(update_node_msg->features_act);
2890 tok = strtok_r(tmp, ",", &save_ptr);
2891 while (tok) {
2892 if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
2893 invalid_feature = true;
2894 break;
2895 }
2896 tok = strtok_r(NULL, ",", &save_ptr);
2897 }
2898 xfree(tmp);
2899 if (invalid_feature) {
2900 info("Invalid ActiveFeatures update request (%s) for non-KNL node %s",
2901 update_node_msg->features_act, node_ptr->name);
2902 return false;
2903 }
2904 }
2905
2906 /*
2907 * For non-KNL node, active and available features must match
2908 */
2909 if (!update_node_msg->features) {
2910 update_node_msg->features =
2911 xstrdup(update_node_msg->features_act);
2912 } else if (!update_node_msg->features_act) {
2913 update_node_msg->features_act =
2914 xstrdup(update_node_msg->features);
2915 } else if (xstrcmp(update_node_msg->features,
2916 update_node_msg->features_act)) {
2917 info("Invalid ActiveFeatures != AvailableFeatures (%s != %s) for non-KNL node %s",
2918 update_node_msg->features, update_node_msg->features_act,
2919 node_ptr->name);
2920 return false;
2921 }
2922
2923 return true;
2924 }
2925
2926 /* Return TRUE if this (one) feature name is under this plugin's control */
node_features_p_changeable_feature(char * feature)2927 extern bool node_features_p_changeable_feature(char *feature)
2928 {
2929 if ((validate_mode == 0) &&
2930 (_knl_mcdram_token(feature) || _knl_numa_token(feature)))
2931 return true;
2932 return false;
2933 }
2934
/*
 * Translate a node's feature specification by replacing any features associated
 * with this plugin in the original value with the new values, preserving
 * any features that are not associated with this plugin
 *
 * A node counts as a KNL node when avail_features contains an MCDRAM or NUMA
 * mode. For a KNL node the result is the non-KNL available features followed
 * by the new MCDRAM and NUMA modes (taken from orig_features when absent
 * from new_features). For other nodes the result is the non-KNL entries of
 * new_features followed by any MCDRAM/NUMA modes found in new_features.
 *
 * IN new_features - newly active features
 * IN orig_features - original active features
 * IN avail_features - original available features
 * IN node_inx - index of node in node table (not used by this plugin)
 * RET node's new merged features, must be xfreed
 */
extern char *node_features_p_node_xlate(char *new_features, char *orig_features,
					char *avail_features, int node_inx)
{
	char *node_features = NULL;
	char *tmp, *save_ptr = NULL, *sep = "", *tok;
	uint16_t new_mcdram = 0, new_numa = 0;
	uint16_t tmp_mcdram, tmp_numa;
	bool is_knl = false;

	if (avail_features) {
		/* Keep non-KNL available features, note any KNL mode */
		tmp = xstrdup(avail_features);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if (_knl_mcdram_token(tok) || _knl_numa_token(tok)) {
				is_knl = true;
			} else {
				xstrfmtcat(node_features, "%s%s", sep, tok);
				sep = ",";
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);
		if (!is_knl) {
			/* Non-KNL node: start over from new_features */
			xfree(node_features);
			sep = "";
		}
	}

	if (!new_features)
		return node_features;

	/* Copy non-KNL features */
	if (!is_knl) {
		tmp = xstrdup(new_features);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if ((_knl_mcdram_token(tok) == 0) &&
			    (_knl_numa_token(tok) == 0)) {
				xstrfmtcat(node_features, "%s%s", sep, tok);
				sep = ",";
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);
	}

	/* Copy new KNL features in MCDRAM/NUMA order */
	tmp = xstrdup(new_features);
	tok = strtok_r(tmp, ",", &save_ptr);
	while (tok) {
		if ((tmp_mcdram = _knl_mcdram_token(tok)))
			new_mcdram |= tmp_mcdram;
		else if ((tmp_numa = _knl_numa_token(tok)))
			new_numa |= tmp_numa;
		tok = strtok_r(NULL, ",", &save_ptr);
	}
	xfree(tmp);

	/*
	 * New active features lacks current MCDRAM or NUMA, copy values from
	 * original. Skipped when there are no original features: with a NULL
	 * string (xstrdup() of NULL is expected to yield NULL) strtok_r()
	 * would resume from save_ptr, which still refers to the buffer that
	 * was just released.
	 */
	if (is_knl && orig_features &&
	    ((new_mcdram == 0) || (new_numa == 0))) {
		tmp = xstrdup(orig_features);
		tok = strtok_r(tmp, ",", &save_ptr);
		while (tok) {
			if ((new_mcdram == 0) &&
			    (tmp_mcdram = _knl_mcdram_token(tok)))
				new_mcdram |= tmp_mcdram;
			else if ((new_numa == 0) &&
				 (tmp_numa = _knl_numa_token(tok)))
				new_numa |= tmp_numa;
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp);
	}

	if (new_mcdram) {
		tmp = _knl_mcdram_str(new_mcdram);
		xstrfmtcat(node_features, "%s%s", sep, tmp);
		xfree(tmp);
		sep = ",";
	}
	if (new_numa) {
		tmp = _knl_numa_str(new_numa);
		xstrfmtcat(node_features, "%s%s", sep, tmp);
		xfree(tmp);
	}

	return node_features;
}
3035
/*
 * Translate a node's new feature specification into a "standard" ordering:
 * features that are not MCDRAM/NUMA modes first (in their original order),
 * then the MCDRAM mode(s), then the NUMA mode(s).
 * IN new_features - comma separated feature list, may be NULL
 * RET node's new merged features, must be xfreed (NULL if new_features is
 *     NULL)
 */
extern char *node_features_p_node_xlate2(char *new_features)
{
	char *merged = NULL, *sep = "";
	char *work, *tok, *mode_str, *save_ptr = NULL;
	uint16_t mcdram_bits = 0, numa_bits = 0, bits;

	if (!new_features)
		return merged;

	/* Pass other features straight through, collect the KNL mode bits */
	work = xstrdup(new_features);
	for (tok = strtok_r(work, ",", &save_ptr); tok;
	     tok = strtok_r(NULL, ",", &save_ptr)) {
		if ((bits = _knl_mcdram_token(tok))) {
			mcdram_bits |= bits;
			continue;
		}
		if ((bits = _knl_numa_token(tok))) {
			numa_bits |= bits;
			continue;
		}
		xstrfmtcat(merged, "%s%s", sep, tok);
		sep = ",";
	}
	xfree(work);

	/* Append the collected modes, MCDRAM before NUMA */
	if (mcdram_bits) {
		mode_str = _knl_mcdram_str(mcdram_bits);
		xstrfmtcat(merged, "%s%s", sep, mode_str);
		xfree(mode_str);
		sep = ",";
	}
	if (numa_bits) {
		mode_str = _knl_numa_str(numa_bits);
		xstrfmtcat(merged, "%s%s", sep, mode_str);
		xfree(mode_str);
	}

	return merged;
}
3075
3076 /* Perform set up for step launch
3077 * mem_sort IN - Trigger sort of memory pages (KNL zonesort)
3078 * numa_bitmap IN - NUMA nodes allocated to this job */
node_features_p_step_config(bool mem_sort,bitstr_t * numa_bitmap)3079 extern void node_features_p_step_config(bool mem_sort, bitstr_t *numa_bitmap)
3080 {
3081 #ifdef HAVE_NUMA
3082 if (mem_sort && (numa_available() != -1)) {
3083 struct stat sb;
3084 int buf_len, fd, i, len, rc;
3085 char buf[12];
3086
3087 if (stat(ZONE_SORT_PATH, &sb) == -1) {
3088 rc = system(MODPROBE_PATH " zonesort_module");
3089 if (rc != -1)
3090 rc = WEXITSTATUS(rc);
3091 if (rc) {
3092 verbose("%s: zonesort execution failure. Return code: %d",
3093 __func__, rc);
3094 }
3095 }
3096 if ((fd = open(ZONE_SORT_PATH, O_WRONLY | O_SYNC)) == -1) {
3097 error("%s: Could not open file %s: %m",
3098 __func__, ZONE_SORT_PATH);
3099 } else {
3100 len = numa_max_node() + 1;
3101 for (i = 0; i < len; i++) {
3102 if (numa_bitmap && !bit_test(numa_bitmap, i))
3103 continue;
3104 snprintf(buf, sizeof(buf), "%d", i);
3105 buf_len = strlen(buf) + 1;
3106 if (write(fd, buf, buf_len) != buf_len) {
3107 error("%s: Could not write file %s: %m",
3108 __func__, ZONE_SORT_PATH);
3109 }
3110 }
3111 (void) close(fd);
3112 }
3113 }
3114 #endif
3115 }
3116
3117 /* Determine if the specified user can modify the currently available node
3118 * features */
node_features_p_user_update(uid_t uid)3119 extern bool node_features_p_user_update(uid_t uid)
3120 {
3121 int i;
3122
3123 if (allowed_uid_cnt == 0) /* Default is ALL users allowed to update */
3124 return true;
3125
3126 for (i = 0; i < allowed_uid_cnt; i++) {
3127 if (allowed_uid[i] == uid)
3128 return true;
3129 }
3130
3131 return false;
3132 }
3133
3134 /* Return estimated reboot time, in seconds */
node_features_p_boot_time(void)3135 extern uint32_t node_features_p_boot_time(void)
3136 {
3137 return boot_time;
3138 }
3139
3140 /* Get node features plugin configuration */
node_features_p_get_config(config_plugin_params_t * p)3141 extern void node_features_p_get_config(config_plugin_params_t *p)
3142 {
3143 config_key_pair_t *key_pair;
3144 List data;
3145
3146 xassert(p);
3147 xstrcat(p->name, plugin_type);
3148 data = p->key_pairs;
3149
3150 key_pair = xmalloc(sizeof(config_key_pair_t));
3151 key_pair->name = xstrdup("AllowMCDRAM");
3152 key_pair->value = _knl_mcdram_str(allow_mcdram);
3153 list_append(data, key_pair);
3154
3155 key_pair = xmalloc(sizeof(config_key_pair_t));
3156 key_pair->name = xstrdup("AllowNUMA");
3157 key_pair->value = _knl_numa_str(allow_numa);
3158 list_append(data, key_pair);
3159
3160 key_pair = xmalloc(sizeof(config_key_pair_t));
3161 key_pair->name = xstrdup("AllowUserBoot");
3162 key_pair->value = _make_uid_str(allowed_uid, allowed_uid_cnt);
3163 list_append(data, key_pair);
3164
3165 key_pair = xmalloc(sizeof(config_key_pair_t));
3166 key_pair->name = xstrdup("BootTime");
3167 key_pair->value = xstrdup_printf("%u", boot_time);
3168 list_append(data, key_pair);
3169
3170 key_pair = xmalloc(sizeof(config_key_pair_t));
3171 key_pair->name = xstrdup("CapmcPath");
3172 key_pair->value = xstrdup(capmc_path);
3173 list_append(data, key_pair);
3174
3175 key_pair = xmalloc(sizeof(config_key_pair_t));
3176 key_pair->name = xstrdup("CapmcPollFreq");
3177 key_pair->value = xstrdup_printf("%u", capmc_poll_freq);
3178 list_append(data, key_pair);
3179
3180 key_pair = xmalloc(sizeof(config_key_pair_t));
3181 key_pair->name = xstrdup("CapmcRetries");
3182 key_pair->value = xstrdup_printf("%u", capmc_retries);
3183 list_append(data, key_pair);
3184
3185 key_pair = xmalloc(sizeof(config_key_pair_t));
3186 key_pair->name = xstrdup("CapmcTimeout");
3187 key_pair->value = xstrdup_printf("%u", capmc_timeout);
3188 list_append(data, key_pair);
3189
3190 key_pair = xmalloc(sizeof(config_key_pair_t));
3191 key_pair->name = xstrdup("CnselectPath");
3192 key_pair->value = xstrdup(cnselect_path);
3193 list_append(data, key_pair);
3194
3195 key_pair = xmalloc(sizeof(config_key_pair_t));
3196 key_pair->name = xstrdup("DefaultMCDRAM");
3197 key_pair->value = _knl_mcdram_str(default_mcdram);
3198 list_append(data, key_pair);
3199
3200 key_pair = xmalloc(sizeof(config_key_pair_t));
3201 key_pair->name = xstrdup("DefaultNUMA");
3202 key_pair->value = _knl_numa_str(default_numa);
3203 list_append(data, key_pair);
3204
3205 key_pair = xmalloc(sizeof(config_key_pair_t));
3206 key_pair->name = xstrdup("McPath");
3207 key_pair->value = xstrdup(mc_path);
3208 list_append(data, key_pair);
3209
3210 key_pair = xmalloc(sizeof(config_key_pair_t));
3211 key_pair->name = xstrdup("NodeRebootWeight");
3212 key_pair->value = xstrdup_printf("%u", node_reboot_weight);
3213 list_append(data, key_pair);
3214
3215 key_pair = xmalloc(sizeof(config_key_pair_t));
3216 key_pair->name = xstrdup("SyscfgPath");
3217 key_pair->value = xstrdup(syscfg_path);
3218 list_append(data, key_pair);
3219
3220 key_pair = xmalloc(sizeof(config_key_pair_t));
3221 key_pair->name = xstrdup("UmeCheckInterval");
3222 key_pair->value = xstrdup_printf("%u", ume_check_interval);
3223 list_append(data, key_pair);
3224
3225 list_sort(data, (ListCmpF) sort_key_pairs);
3226
3227 return;
3228 }
3229
3230 /*
3231 * Return node "weight" field if reboot required to change mode
3232 */
node_features_p_reboot_weight(void)3233 extern uint32_t node_features_p_reboot_weight(void)
3234 {
3235 return node_reboot_weight;
3236 }
3237