1 /*****************************************************************************\
2 * gres.c - driver for gres plugin
3 *****************************************************************************
4 * Copyright (C) 2010 Lawrence Livermore National Security.
5 * Portions Copyright (C) 2014-2019 SchedMD LLC
6 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7 * Written by Morris Jette <jette1@llnl.gov>
8 * CODE-OCEC-09-009. All rights reserved.
9 *
10 * This file is part of Slurm, a resource management program.
11 * For details, see <https://slurm.schedmd.com/>.
12 * Please also read the included file: DISCLAIMER.
13 *
14 * Slurm is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 * In addition, as a special exception, the copyright holders give permission
20 * to link the code of portions of this program with the OpenSSL library under
21 * certain conditions as described in each individual source file, and
22 * distribute linked combinations including the two. You must obey the GNU
23 * General Public License in all respects for all of the code used other than
24 * OpenSSL. If you modify file(s) with this exception, you may extend this
25 * exception to your version of the file(s), but you are not obligated to do
26 * so. If you do not wish to do so, delete this exception statement from your
27 * version. If you delete this exception statement from all source files in
28 * the program, then also delete it here.
29 *
30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
33 * details.
34 *
35 * You should have received a copy of the GNU General Public License along
36 * with Slurm; if not, write to the Free Software Foundation, Inc.,
37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
38 \*****************************************************************************/
39
40 #include "config.h"
41
42 #define _GNU_SOURCE
43
44 #ifdef __FreeBSD__
45 # include <sys/param.h>
46 # include <sys/cpuset.h>
47 typedef cpuset_t cpu_set_t;
48 #endif
49
50 #include <ctype.h>
51 #include <inttypes.h>
52 #include <limits.h>
53 #include <sched.h>
54 #include <stdio.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include <sys/stat.h>
58 #include <sys/types.h>
59
60 #ifdef MAJOR_IN_MKDEV
61 # include <sys/mkdev.h>
62 #endif
63 #ifdef MAJOR_IN_SYSMACROS
64 # include <sys/sysmacros.h>
65 #endif
66
67 #include <math.h>
68
69 #ifdef __NetBSD__
70 #define CPU_ZERO(c) cpuset_zero(*(c))
71 #define CPU_ISSET(i,c) cpuset_isset((i),*(c))
72 #define sched_getaffinity sched_getaffinity_np
73 #endif
74
75 #include "slurm/slurm.h"
76 #include "slurm/slurm_errno.h"
77 #include "src/common/assoc_mgr.h"
78 #include "src/common/bitstring.h"
79 #include "src/common/gres.h"
80 #include "src/common/job_resources.h"
81 #include "src/common/list.h"
82 #include "src/common/log.h"
83 #include "src/common/macros.h"
84 #include "src/common/node_conf.h"
85 #include "src/common/node_select.h"
86 #include "src/common/pack.h"
87 #include "src/common/parse_config.h"
88 #include "src/common/plugin.h"
89 #include "src/common/plugrack.h"
90 #include "src/common/read_config.h"
91 #include "src/common/slurm_protocol_api.h"
92 #include "src/common/strlcpy.h"
93 #include "src/common/xmalloc.h"
94 #include "src/common/xstring.h"
95
96 #define MAX_GRES_BITMAP 1024
97
98 strong_alias(gres_gresid_to_gresname, slurm_gres_gresid_to_gresname);
99 strong_alias(gres_get_node_used, slurm_gres_get_node_used);
100 strong_alias(gres_get_system_cnt, slurm_gres_get_system_cnt);
101 strong_alias(gres_get_value_by_type, slurm_gres_get_value_by_type);
102 strong_alias(gres_get_job_info, slurm_gres_get_job_info);
103 strong_alias(gres_build_job_details, slurm_gres_build_job_details);
104 strong_alias(gres_get_step_info, slurm_gres_get_step_info);
105 strong_alias(gres_get_step_state, slurm_gres_get_step_state);
106 strong_alias(gres_get_job_state, slurm_gres_get_job_state);
107 strong_alias(gres_2_tres_str, slurm_gres_2_tres_str);
108 strong_alias(gres_set_job_tres_cnt, slurm_gres_set_job_tres_cnt);
109 strong_alias(gres_set_node_tres_cnt, slurm_gres_set_node_tres_cnt);
110 strong_alias(gres_device_major, slurm_gres_device_major);
111 strong_alias(destroy_gres_device, slurm_destroy_gres_device);
112 strong_alias(destroy_gres_slurmd_conf, slurm_destroy_gres_slurmd_conf);
113
/* Gres symbols provided by the plugin */
typedef struct slurm_gres_ops {
	/* Load node-local GRES configuration into gres_conf_list */
	int (*node_config_load) ( List gres_conf_list,
				  node_config_load_t *node_conf);
	/* Set job environment variables for allocated GRES */
	void (*job_set_env) ( char ***job_env_ptr,
			      void *gres_ptr, int node_inx );
	/* Set step environment variables for allocated GRES */
	void (*step_set_env) ( char ***job_env_ptr,
			       void *gres_ptr );
	/* Reset step environment for the subset of usable GRES */
	void (*step_reset_env) ( char ***job_env_ptr,
				 void *gres_ptr,
				 bitstr_t *usable_gres );
	/* Send/receive plugin state over a file descriptor (to slurmstepd) */
	void (*send_stepd) ( int fd );
	void (*recv_stepd) ( int fd );
	/* Query a field of job GRES state (see gres_job_data_type) */
	int (*job_info) ( gres_job_state_t *job_gres_data,
			  uint32_t node_inx,
			  enum gres_job_data_type data_type,
			  void *data);
	/* Query a field of step GRES state (see gres_step_data_type) */
	int (*step_info) ( gres_step_state_t *step_gres_data,
			   uint32_t node_inx,
			   enum gres_step_data_type data_type,
			   void *data);
	/* Return List of devices managed by this plugin */
	List (*get_devices) ( void );
	/* Step hardware setup/teardown hooks */
	void (*step_hardware_init) ( bitstr_t *, char * );
	void (*step_hardware_fini) ( void );
	/* Build/apply environment for the job epilog */
	gres_epilog_info_t *(*epilog_build_env)(gres_job_state_t *gres_job_ptr);
	void (*epilog_set_env) ( char ***epilog_env_ptr,
				 gres_epilog_info_t *epilog_info,
				 int node_inx );
} slurm_gres_ops_t;
143
/*
 * Gres plugin context, one for each gres type.
 * Add to gres_context through _add_gres_context().
 */
typedef struct slurm_gres_context {
	plugin_handle_t	cur_plugin;	/* handle from plugin load, or
					 * PLUGIN_INVALID_HANDLE */
	uint8_t		config_flags;	/* See GRES_CONF_* in gres.h */
	char *		gres_name;		/* name (e.g. "gpu") */
	char *		gres_name_colon;	/* name + colon (e.g. "gpu:") */
	int		gres_name_colon_len;	/* size of gres_name_colon */
	char *		gres_type;		/* plugin name (e.g. "gres/gpu") */
	slurm_gres_ops_t ops;			/* pointers to plugin symbols */
	uint32_t	plugin_id;		/* key for searches */
	plugrack_t	*plugin_list;		/* plugrack info */
	uint64_t	total_cnt;		/* Total GRES across all nodes */
} slurm_gres_context_t;
160
/* Generic gres data structure for adding to a list. Depending upon the
 * context, gres_data points to gres_node_state_t, gres_job_state_t or
 * gres_step_state_t */
typedef struct gres_state {
	uint32_t	plugin_id;	/* identifies the owning gres_context */
	void		*gres_data;	/* context-dependent state record */
} gres_state_t;
168
/* Search key for the list-find callbacks on job/step GRES lists */
typedef struct gres_search_key {
	int node_offset;	/* index into per-node allocation arrays */
	uint32_t plugin_id;	/* GRES plugin to match */
	uint32_t type_id;	/* GRES type to match; NO_VAL matches any
				 * type in the job search */
} gres_key_t;
174
/* Pointers to functions in src/slurmd/common/xcpuinfo.h that we may use */
typedef struct xcpuinfo_funcs {
	/* abs(tract) -> mac(hine) CPU range string conversion —
	 * see xcpuinfo.h for exact semantics */
	int (*xcpuinfo_abs_to_mac) (char *abs, char **mac);
} xcpuinfo_funcs_t;
xcpuinfo_funcs_t xcpuinfo_ops;
180
/* Local variables, all protected by gres_context_lock unless noted */
static int gres_context_cnt = -1;	/* -1 until gres_plugin_init() runs */
static uint32_t gres_cpu_cnt = 0;
static bool gres_debug = false;		/* true if DebugFlags=GRES */
static slurm_gres_context_t *gres_context = NULL; /* one entry per GresTypes */
static char *gres_node_name = NULL;
static char *gres_plugin_list = NULL;	/* GresTypes value from slurm.conf */
static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER;
static List gres_conf_list = NULL;
static bool init_run = false;		/* set once gres_plugin_init() done */
static bool have_gpu = false, have_mps = false;
static uint32_t gpu_plugin_id = NO_VAL, mps_plugin_id = NO_VAL;
static volatile uint32_t autodetect_types = GRES_AUTODETECT_NONE;
static uint32_t select_plugin_type = NO_VAL;	/* cached SELECT_CR_PLUGIN */
195
196 /* Local functions */
197 static void _add_gres_context(char *gres_name);
198 static gres_node_state_t *
199 _build_gres_node_state(void);
200 static void _build_node_gres_str(List *gres_list, char **gres_str,
201 int cores_per_sock, int sock_per_node);
202 static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
203 uint8_t overcommit,
204 gres_mc_data_t *tres_mc_ptr,
205 node_record_t *node_table_ptr);
206 static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size);
207 static void _epilog_list_del(void *x);
208 static int _find_job_by_sock_gres(void *x, void *key);
209 static int _find_sock_by_job_gres(void *x, void *key);
210 static void _free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
211 int node_cnt);
212 static void _get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
213 char *gres_name, char *gres_name_colon,
214 int gres_name_colon_len);
215 static uint32_t _get_task_cnt_node(uint32_t **tasks_per_node_socket,
216 int node_inx, int sock_cnt);
217 static uint64_t _get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
218 int *config_type_cnt);
219 static int _gres_find_id(void *x, void *key);
220 static int _gres_find_job_by_key(void *x, void *key);
221 static int _gres_find_step_by_key(void *x, void *key);
222 static void _gres_job_list_delete(void *list_element);
223 static int _job_alloc(void *job_gres_data, void *node_gres_data,
224 int node_cnt, int node_index, int node_offset,
225 char *gres_name, uint32_t job_id, char *node_name,
226 bitstr_t *core_bitmap, uint32_t plugin_id,
227 uint32_t user_id);
228 static void _job_core_filter(void *job_gres_data, void *node_gres_data,
229 bool use_total_gres, bitstr_t *core_bitmap,
230 int core_start_bit, int core_end_bit,
231 char *gres_name, char *node_name,
232 uint32_t plugin_id);
233 static int _job_dealloc(void *job_gres_data, void *node_gres_data,
234 int node_offset, char *gres_name, uint32_t job_id,
235 char *node_name, bool old_job, uint32_t plugin_id,
236 uint32_t user_id, bool job_fini);
237 static void _job_state_delete(void *gres_data);
238 static void * _job_state_dup(void *gres_data);
239 static void * _job_state_dup2(void *gres_data, int node_index);
240 static void _job_state_log(void *gres_data, uint32_t job_id,
241 uint32_t plugin_id);
242 static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
243 bool use_total_gres, bitstr_t *core_bitmap,
244 int core_start_bit, int core_end_bit, bool *topo_set,
245 uint32_t job_id, char *node_name, char *gres_name,
246 uint32_t plugin_id, bool disable_binding);
247 static int _load_gres_plugin(slurm_gres_context_t *plugin_context);
248 static int _log_gres_slurmd_conf(void *x, void *arg);
249 static void _my_stat(char *file_name);
250 static int _node_config_init(char *node_name, char *orig_config,
251 slurm_gres_context_t *context_ptr,
252 gres_state_t *gres_ptr);
253 static char * _node_gres_used(void *gres_data, char *gres_name);
254 static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
255 gres_state_t *gres_ptr, bool config_overrides,
256 slurm_gres_context_t *context_ptr,
257 bool *updated_gpu_cnt);
258 static int _node_reconfig_test(char *node_name, char *new_gres,
259 gres_state_t *gres_ptr,
260 slurm_gres_context_t *context_ptr);
261 static void _node_state_dealloc(gres_state_t *gres_ptr);
262 static void * _node_state_dup(void *gres_data);
263 static void _node_state_log(void *gres_data, char *node_name,
264 char *gres_name);
265 static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
266 const char *key, const char *value,
267 const char *line, char **leftover);
268 static int _parse_gres_config2(void **dest, slurm_parser_enum_t type,
269 const char *key, const char *value,
270 const char *line, char **leftover);
271 static bool _shared_gres(uint32_t plugin_id);
272 static bool _sharing_gres(uint32_t plugin_id);
273 static void _sock_gres_del(void *x);
274 static int _step_alloc(void *step_gres_data, void *job_gres_data,
275 uint32_t plugin_id, int node_offset,
276 bool first_step_node,
277 uint32_t job_id, uint32_t step_id,
278 uint16_t tasks_on_node, uint32_t rem_nodes);
279 static int _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
280 uint32_t job_id, uint32_t step_id);
281 static void * _step_state_dup(void *gres_data);
282 static void * _step_state_dup2(void *gres_data, int node_index);
283 static void _step_state_log(void *gres_data, uint32_t job_id,
284 uint32_t step_id, char *gres_name);
285 static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
286 int node_offset, bool first_step_node,
287 uint16_t cpus_per_task, int max_rem_nodes,
288 bool ignore_alloc,
289 uint32_t job_id, uint32_t step_id,
290 uint32_t plugin_id);
291 static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
292 gres_state_t *gpu_gres_ptr);
293 static int _unload_gres_plugin(slurm_gres_context_t *plugin_context);
294 static void _validate_slurm_conf(List slurm_conf_list,
295 slurm_gres_context_t *context_ptr);
296 static void _validate_gres_conf(List gres_conf_list,
297 slurm_gres_context_t *context_ptr);
298 static int _validate_file(char *path_name, char *gres_name);
299 static void _validate_links(gres_slurmd_conf_t *p);
300 static void _validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
301 int cpus_ctld, char *node_name);
302 static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
303 bool config_overrides, char **reason_down);
304
/*
 * Build a GRES plugin id by folding the bytes of the name into a 32-bit
 * value: each character is added at a shift that advances by 8 bits and
 * wraps at 32. A NULL name yields 0.
 */
extern uint32_t gres_plugin_build_id(char *name)
{
	uint32_t id = 0;
	int shift = 0;
	char *p;

	if (!name)
		return 0;

	for (p = name; *p; p++) {
		id += (*p << shift);
		shift = (shift + 8) % 32;
	}

	return id;
}
320
_gres_find_id(void * x,void * key)321 static int _gres_find_id(void *x, void *key)
322 {
323 uint32_t *plugin_id = (uint32_t *)key;
324 gres_state_t *state_ptr = (gres_state_t *) x;
325 if (state_ptr->plugin_id == *plugin_id)
326 return 1;
327 return 0;
328 }
329
330 /* Find job record with matching name and type */
_gres_find_job_by_key(void * x,void * key)331 static int _gres_find_job_by_key(void *x, void *key)
332 {
333 gres_state_t *state_ptr = (gres_state_t *) x;
334 gres_key_t *job_key = (gres_key_t *) key;
335 gres_job_state_t *gres_data_ptr;
336 gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
337
338 if ((state_ptr->plugin_id == job_key->plugin_id) &&
339 ((job_key->type_id == NO_VAL) ||
340 (gres_data_ptr->type_id == job_key->type_id)))
341 return 1;
342 return 0;
343 }
344
345 /* Find job record with matching name and type */
_gres_find_job_by_key_with_cnt(void * x,void * key)346 static int _gres_find_job_by_key_with_cnt(void *x, void *key)
347 {
348 gres_state_t *state_ptr = (gres_state_t *) x;
349 gres_key_t *job_key = (gres_key_t *) key;
350 gres_job_state_t *gres_data_ptr;
351 gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
352
353 if (!_gres_find_job_by_key(x, key))
354 return 0;
355 /* ignore count on no_consume gres */
356 if (!gres_data_ptr->node_cnt ||
357 gres_data_ptr->gres_cnt_node_alloc[job_key->node_offset])
358 return 1;
359 return 0;
360 }
361
_gres_find_step_by_key(void * x,void * key)362 static int _gres_find_step_by_key(void *x, void *key)
363 {
364 gres_state_t *state_ptr = (gres_state_t *) x;
365 gres_key_t *step_key = (gres_key_t *) key;
366 gres_step_state_t *gres_data_ptr;
367 gres_data_ptr = (gres_step_state_t *)state_ptr->gres_data;
368
369 if ((state_ptr->plugin_id == step_key->plugin_id) &&
370 (gres_data_ptr->type_id == step_key->type_id))
371 return 1;
372 return 0;
373 }
374
_gres_find_name_internal(char * name,char * key,uint32_t plugin_id)375 static int _gres_find_name_internal(char *name, char *key, uint32_t plugin_id)
376 {
377 if (!name) {
378 int i;
379 for (i = 0; i < gres_context_cnt; i++) {
380 if (gres_context[i].plugin_id == plugin_id) {
381 name = gres_context[i].gres_name;
382 break;
383 }
384 }
385
386 if (!name) {
387 debug("%s: couldn't find name", __func__);
388 return 0;
389 }
390 }
391
392 if (!xstrcmp(name, key))
393 return 1;
394 return 0;
395 }
396
_gres_job_find_name(void * x,void * key)397 static int _gres_job_find_name(void *x, void *key)
398 {
399 gres_state_t *state_ptr = (gres_state_t *) x;
400 gres_job_state_t *gres_data_ptr =
401 (gres_job_state_t *)state_ptr->gres_data;
402
403 return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
404 state_ptr->plugin_id);
405 }
406
_gres_step_find_name(void * x,void * key)407 static int _gres_step_find_name(void *x, void *key)
408 {
409 gres_state_t *state_ptr = (gres_state_t *) x;
410 gres_step_state_t *gres_data_ptr =
411 (gres_step_state_t *)state_ptr->gres_data;
412 return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
413 state_ptr->plugin_id);
414 }
415
/*
 * Load the plugin for one GRES context and resolve its symbol table into
 * plugin_context->ops. Count-only contexts (GRES_CONF_COUNT_ONLY) load
 * nothing. Load order: direct load-and-link by plugin type, then a scan
 * of the plugin directory; a context whose plugin cannot be found falls
 * back to count-only tracking. Returns SLURM_SUCCESS or SLURM_ERROR.
 */
static int _load_gres_plugin(slurm_gres_context_t *plugin_context)
{
	/*
	 * Must be synchronized with slurm_gres_ops_t above.
	 */
	static const char *syms[] = {
		"node_config_load",
		"job_set_env",
		"step_set_env",
		"step_reset_env",
		"send_stepd",
		"recv_stepd",
		"job_info",
		"step_info",
		"get_devices",
		"step_hardware_init",
		"step_hardware_fini",
		"epilog_build_env",
		"epilog_set_env"
	};
	int n_syms = sizeof(syms) / sizeof(char *);

	/* Find the correct plugin */
	if (plugin_context->config_flags & GRES_CONF_COUNT_ONLY) {
		/* No plugin symbols needed for count-only GRES */
		debug("Plugin of type %s only tracks gres counts",
		      plugin_context->gres_type);
		return SLURM_SUCCESS;
	}

	/* First attempt: load directly by plugin type name */
	plugin_context->cur_plugin = plugin_load_and_link(
		plugin_context->gres_type,
		n_syms, syms,
		(void **) &plugin_context->ops);
	if (plugin_context->cur_plugin != PLUGIN_INVALID_HANDLE)
		return SLURM_SUCCESS;

	if (errno != EPLUGIN_NOTFOUND) {
		error("Couldn't load specified plugin name for %s: %s",
		      plugin_context->gres_type, plugin_strerror(errno));
		return SLURM_ERROR;
	}

	debug("gres: Couldn't find the specified plugin name for %s looking "
	      "at all files", plugin_context->gres_type);

	/* Get plugin list (lazy-create the plugrack for this context) */
	if (plugin_context->plugin_list == NULL) {
		char *plugin_dir;
		plugin_context->plugin_list = plugrack_create("gres");
		plugin_dir = slurm_get_plugin_dir();
		plugrack_read_dir(plugin_context->plugin_list, plugin_dir);
		xfree(plugin_dir);
	}

	/* Second attempt: search the plugin directory by type */
	plugin_context->cur_plugin = plugrack_use_by_type(
		plugin_context->plugin_list,
		plugin_context->gres_type );
	if (plugin_context->cur_plugin == PLUGIN_INVALID_HANDLE) {
		/* No plugin at all: degrade to count-only tracking */
		debug("Cannot find plugin of type %s, just track gres counts",
		      plugin_context->gres_type);
		plugin_context->config_flags |= GRES_CONF_COUNT_ONLY;
		return SLURM_ERROR;
	}

	/* Dereference the API. */
	if (plugin_get_syms(plugin_context->cur_plugin,
			    n_syms, syms,
			    (void **) &plugin_context->ops ) < n_syms ) {
		error("Incomplete %s plugin detected",
		      plugin_context->gres_type);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
491
/*
 * Unload one GRES context's plugin and release its strings.
 * Returns the plugrack_destroy() result, or SLURM_SUCCESS when the
 * context had no plugrack (direct plugin_unload instead).
 */
static int _unload_gres_plugin(slurm_gres_context_t *plugin_context)
{
	int rc = SLURM_SUCCESS;

	/*
	 * Must check return code here because plugins might still
	 * be loaded and active.
	 */
	if (plugin_context->plugin_list) {
		rc = plugrack_destroy(plugin_context->plugin_list);
	} else {
		plugin_unload(plugin_context->cur_plugin);
	}

	xfree(plugin_context->gres_name);
	xfree(plugin_context->gres_name_colon);
	xfree(plugin_context->gres_type);

	return rc;
}
512
513 /*
514 * Add new gres context to gres_context array and load the plugin.
515 * Must hold gres_context_lock before calling.
516 */
_add_gres_context(char * gres_name)517 static void _add_gres_context(char *gres_name)
518 {
519 slurm_gres_context_t *plugin_context;
520
521 if (!gres_name || !gres_name[0])
522 fatal("%s: invalid empty gres_name", __func__);
523
524 xrecalloc(gres_context, (gres_context_cnt + 1),
525 sizeof(slurm_gres_context_t));
526
527 plugin_context = &gres_context[gres_context_cnt];
528 plugin_context->gres_name = xstrdup(gres_name);
529 plugin_context->plugin_id = gres_plugin_build_id(gres_name);
530 plugin_context->gres_type = xstrdup_printf("gres/%s", gres_name);
531 plugin_context->plugin_list = NULL;
532 plugin_context->cur_plugin = PLUGIN_INVALID_HANDLE;
533
534 gres_context_cnt++;
535 }
536
/*
 * Initialize the GRES plugins.
 *
 * Parses GresTypes from slurm.conf, reorders the names so that "mps"
 * always follows "gpu", and builds one gres_context entry per unique
 * name. Safe to call repeatedly; after the first successful pass it
 * returns immediately.
 *
 * Returns a Slurm errno.
 */
extern int gres_plugin_init(void)
{
	int i, j, rc = SLURM_SUCCESS;
	char *last = NULL, *names, *one_name, *full_name;
	char *sorted_names = NULL, *sep = "";
	bool append_mps = false;

	/* Fast path when a prior call completed initialization */
	if (init_run && (gres_context_cnt >= 0))
		return rc;

	slurm_mutex_lock(&gres_context_lock);
	if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
		gres_debug = true;
	else
		gres_debug = false;

	/* Another thread may have initialized while we waited for the lock */
	if (gres_context_cnt >= 0)
		goto fini;

	gres_plugin_list = slurm_get_gres_plugins();
	gres_context_cnt = 0;
	if ((gres_plugin_list == NULL) || (gres_plugin_list[0] == '\0'))
		goto fini;

	/* Ensure that "gres/mps" follows "gres/gpu" */
	have_gpu = false;
	have_mps = false;
	names = xstrdup(gres_plugin_list);
	one_name = strtok_r(names, ",", &last);
	while (one_name) {
		bool skip_name = false;
		if (!xstrcmp(one_name, "mps")) {
			have_mps = true;
			if (!have_gpu) {
				append_mps = true; /* "mps" must follow "gpu" */
				skip_name = true;
			}
			mps_plugin_id = gres_plugin_build_id("mps");
		} else if (!xstrcmp(one_name, "gpu")) {
			have_gpu = true;
			gpu_plugin_id = gres_plugin_build_id("gpu");
		}
		if (!skip_name) {
			xstrfmtcat(sorted_names, "%s%s", sep, one_name);
			sep = ",";
		}
		one_name = strtok_r(NULL, ",", &last);
	}
	/* Re-append the deferred "mps" entry at the end of the list */
	if (append_mps) {
		if (!have_gpu)
			fatal("GresTypes: gres/mps requires that gres/gpu also be configured");
		xstrfmtcat(sorted_names, "%s%s", sep, "mps");
	}
	xfree(names);

	/* Build a context for each unique name in the sorted list */
	gres_context_cnt = 0;
	one_name = strtok_r(sorted_names, ",", &last);
	while (one_name) {
		full_name = xstrdup("gres/");
		xstrcat(full_name, one_name);
		for (i = 0; i < gres_context_cnt; i++) {
			if (!xstrcmp(full_name, gres_context[i].gres_type))
				break;
		}
		xfree(full_name);
		if (i < gres_context_cnt) {
			error("Duplicate plugin %s ignored",
			      gres_context[i].gres_type);
		} else {
			_add_gres_context(one_name);
		}
		one_name = strtok_r(NULL, ",", &last);
	}
	xfree(sorted_names);

	/* Ensure that plugin_id is valid and unique */
	for (i = 0; i < gres_context_cnt; i++) {
		for (j = i + 1; j < gres_context_cnt; j++) {
			if (gres_context[i].plugin_id !=
			    gres_context[j].plugin_id)
				continue;
			fatal("Gres: Duplicate plugin_id %u for %s and %s, "
			      "change gres name for one of them",
			      gres_context[i].plugin_id,
			      gres_context[i].gres_type,
			      gres_context[j].gres_type);
		}
		xassert(gres_context[i].gres_name);

		/* Pre-build the "name:" prefix used when parsing GRES specs */
		gres_context[i].gres_name_colon =
			xstrdup_printf("%s:", gres_context[i].gres_name);
		gres_context[i].gres_name_colon_len =
			strlen(gres_context[i].gres_name_colon);
	}
	init_run = true;

	/* gres/mps is only usable with select/cons_tres in slurmctld */
	if ((select_plugin_type == NO_VAL) &&
	    (select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
					   &select_plugin_type) != SLURM_SUCCESS)) {
		select_plugin_type = NO_VAL;	/* error */
	}
	if (have_mps && running_in_slurmctld() &&
	    (select_plugin_type != SELECT_TYPE_CONS_TRES)) {
		fatal("Use of gres/mps requires the use of select/cons_tres");
	}

fini:	slurm_mutex_unlock(&gres_context_lock);
	return rc;
}
651
gres_plugin_get_gres_cnt(void)652 extern int gres_plugin_get_gres_cnt(void)
653 {
654 static int cnt = -1;
655
656 if (cnt != -1)
657 return cnt;
658
659 gres_plugin_init();
660
661 slurm_mutex_lock(&gres_context_lock);
662 cnt = gres_context_cnt;
663 slurm_mutex_unlock(&gres_context_lock);
664
665 return cnt;
666 }
667
668 /*
669 * Add a GRES record. This is used by the node_features plugin after the
670 * slurm.conf file is read and the initial GRES records are built by
671 * gres_plugin_init().
672 */
gres_plugin_add(char * gres_name)673 extern void gres_plugin_add(char *gres_name)
674 {
675 int i;
676
677 slurm_mutex_lock(&gres_context_lock);
678 for (i = 0; i < gres_context_cnt; i++) {
679 if (!xstrcmp(gres_context[i].gres_name, gres_name))
680 goto fini;
681 }
682
683 _add_gres_context(gres_name);
684 fini: slurm_mutex_unlock(&gres_context_lock);
685 }
686
687 /* Given a gres_name, return its context index or -1 if not found */
_gres_name_context(char * gres_name)688 static int _gres_name_context(char *gres_name)
689 {
690 int i;
691
692 for (i = 0; i < gres_context_cnt; i++) {
693 if (!xstrcmp(gres_context[i].gres_name, gres_name))
694 return i;
695 }
696
697 return -1;
698 }
699
700 /*
701 * Takes a GRES config line (typically from slurm.conf) and remove any
702 * records for GRES which are not defined in GresTypes.
703 * RET string of valid GRES, Release memory using xfree()
704 */
gres_plugin_name_filter(char * orig_gres,char * nodes)705 extern char *gres_plugin_name_filter(char *orig_gres, char *nodes)
706 {
707 char *new_gres = NULL, *save_ptr = NULL;
708 char *colon, *sep = "", *tmp, *tok, *name;
709
710 slurm_mutex_lock(&gres_context_lock);
711 if (!orig_gres || !orig_gres[0] || !gres_context_cnt) {
712 slurm_mutex_unlock(&gres_context_lock);
713 return new_gres;
714 }
715
716 tmp = xstrdup(orig_gres);
717 tok = strtok_r(tmp, ",", &save_ptr);
718 while (tok) {
719 name = xstrdup(tok);
720 if ((colon = strchr(name, ':')))
721 colon[0] = '\0';
722 if (_gres_name_context(name) != -1) {
723 xstrfmtcat(new_gres, "%s%s", sep, tok);
724 sep = ",";
725 } else {
726 /* Logging may not be initialized at this point */
727 error("Invalid GRES configured on node %s: %s", nodes,
728 tok);
729 }
730 xfree(name);
731 tok = strtok_r(NULL, ",", &save_ptr);
732 }
733 slurm_mutex_unlock(&gres_context_lock);
734 xfree(tmp);
735
736 return new_gres;
737 }
738
739 /*
740 * Terminate the gres plugin. Free memory.
741 *
742 * Returns a Slurm errno.
743 */
gres_plugin_fini(void)744 extern int gres_plugin_fini(void)
745 {
746 int i, j, rc = SLURM_SUCCESS;
747
748 slurm_mutex_lock(&gres_context_lock);
749 xfree(gres_node_name);
750 if (gres_context_cnt < 0)
751 goto fini;
752
753 init_run = false;
754 for (i = 0; i < gres_context_cnt; i++) {
755 j = _unload_gres_plugin(gres_context + i);
756 if (j != SLURM_SUCCESS)
757 rc = j;
758 }
759 xfree(gres_context);
760 xfree(gres_plugin_list);
761 FREE_NULL_LIST(gres_conf_list);
762 gres_context_cnt = -1;
763
764 fini: slurm_mutex_unlock(&gres_context_lock);
765 return rc;
766 }
767
768 /*
769 **************************************************************************
770 * P L U G I N C A L L S *
771 **************************************************************************
772 */
773
774 /*
775 * Return a plugin-specific help message for salloc, sbatch and srun
776 * Result must be xfree()'d.
777 *
778 * NOTE: GRES "type" (e.g. model) information is only available from slurmctld
779 * after slurmd registers. It is not readily available from srun (as used here).
780 */
gres_plugin_help_msg(void)781 extern char *gres_plugin_help_msg(void)
782 {
783 int i;
784 char *msg = xstrdup("Valid gres options are:\n");
785
786 gres_plugin_init();
787
788 slurm_mutex_lock(&gres_context_lock);
789 for (i = 0; i < gres_context_cnt; i++) {
790 xstrcat(msg, gres_context[i].gres_name);
791 xstrcat(msg, "[[:type]:count]\n");
792 }
793 slurm_mutex_unlock(&gres_context_lock);
794
795 return msg;
796 }
797
/*
 * Perform reconfig, re-read any configuration files
 *
 * NOTE: A change to GresPlugins is detected and logged but NOT applied;
 * the old plugins must persist to process saved state, so a slurmctld
 * restart is required instead.
 */
extern int gres_plugin_reconfig(void)
{
	int rc = SLURM_SUCCESS;
	char *plugin_names = slurm_get_gres_plugins();
	bool plugin_change;

	slurm_mutex_lock(&gres_context_lock);
	if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
		gres_debug = true;
	else
		gres_debug = false;

	/* Compare the freshly-read GresPlugins against the cached value */
	if (xstrcmp(plugin_names, gres_plugin_list))
		plugin_change = true;
	else
		plugin_change = false;
	slurm_mutex_unlock(&gres_context_lock);

	if (plugin_change) {
		error("GresPlugins changed from %s to %s ignored",
		      gres_plugin_list, plugin_names);
		error("Restart the slurmctld daemon to change GresPlugins");
#if 0
		/* This logic would load new plugins, but we need the old
		 * plugins to persist in order to process old state
		 * information. */
		rc = gres_plugin_fini();
		if (rc == SLURM_SUCCESS)
			rc = gres_plugin_init();
#endif
	}
	xfree(plugin_names);

	return rc;
}
837
838
839
840 /*
841 * Remove file-less GPUs from the final GRES list, since File is a requirement.
842 */
_remove_fileless_gpus(List gres_conf_list,slurm_gres_context_t * context_ptr)843 static void _remove_fileless_gpus(List gres_conf_list,
844 slurm_gres_context_t *context_ptr)
845 {
846 gres_slurmd_conf_t *gres_conf;
847 ListIterator iter;
848
849 if (!gres_conf_list)
850 return;
851
852 /* Only work in the GPU plugin */
853 if (context_ptr->plugin_id != gres_plugin_build_id("gpu"))
854 return;
855
856 iter = list_iterator_create(gres_conf_list);
857 while ((gres_conf = list_next(iter))) {
858 if (gres_conf->plugin_id != context_ptr->plugin_id)
859 continue;
860
861 if (!gres_conf->file) {
862 debug("Removing file-less GPU %s:%s from final GRES list",
863 gres_conf->name, gres_conf->type_name);
864 list_delete_item(iter);
865 }
866 }
867 list_iterator_destroy(iter);
868 }
869
/*
 * Log the contents of a gres_slurmd_conf_t record.
 * List-for-each callback; always returns 0 so iteration continues.
 * Without DebugFlags=GRES only a short verbose() summary is emitted.
 */
static int _log_gres_slurmd_conf(void *x, void *arg)
{
	gres_slurmd_conf_t *p;
	char *links = NULL;
	int index = -1, offset, mult = 1;

	p = (gres_slurmd_conf_t *) x;
	xassert(p);

	if (!gres_debug) {
		verbose("Gres Name=%s Type=%s Count=%"PRIu64,
			p->name, p->type_name, p->count);
		return 0;
	}

	/*
	 * Derive a device index from the trailing decimal digits of the
	 * File path (e.g. "/dev/nvidia3" -> 3); stays -1 when File unset.
	 */
	if (p->file) {
		index = 0;
		offset = strlen(p->file);
		while (offset > 0) {
			offset--;
			if ((p->file[offset] < '0') || (p->file[offset] > '9'))
				break;
			index += (p->file[offset] - '0') * mult;
			mult *= 10;
		}
	}

	if (p->links)
		xstrfmtcat(links, "Links=%s", p->links);
	/* Emit the most detailed line the populated fields allow */
	if (p->cpus && (index != -1)) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u "
		     "File=%s Cores=%s CoreCnt=%u %s",
		     p->name, p->type_name, p->count, index, p->plugin_id,
		     p->file, p->cpus, p->cpu_cnt, links);
	} else if (index != -1) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s %s",
		     p->name, p->type_name, p->count, index, p->plugin_id,
		     p->file, links);
	} else if (p->file) {
		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u File=%s %s",
		     p->name, p->type_name, p->count, p->plugin_id, p->file,
		     links);
	} else {
		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u %s", p->name,
		     p->type_name, p->count, p->plugin_id, links);
	}
	xfree(links);

	return 0;
}
923
/*
 * Make sure that the specified file name exists. Only meaningful inside
 * slurmd/slurmstepd; retries once per second for up to 20 seconds, then
 * generates a fatal error and exits.
 */
static void _my_stat(char *file_name)
{
	struct stat config_stat;
	bool sent_msg = false;
	int attempt;

	if (!running_in_slurmdstepd())
		return;

	for (attempt = 0; attempt < 20; attempt++) {
		if (attempt)
			sleep(1);
		if (stat(file_name, &config_stat) == 0) {
			if (sent_msg)
				info("gres.conf file %s now exists",
				     file_name);
			return;
		}
		/* Only "file missing" is worth retrying */
		if (errno != ENOENT)
			break;
		if (!sent_msg) {
			error("Waiting for gres.conf file %s", file_name);
			sent_msg = true;
		}
	}
	fatal("can't stat gres.conf file %s: %m", file_name);
}
955
/*
 * Validate a gres.conf File specification and count the devices it names.
 * path_name is a plain path or ends in a hostlist-style bracket
 * expression (e.g. "/dev/nvidia[0-3]"); each expanded file must exist
 * (_my_stat() waits up to 20 seconds, else fatal).
 * Returns the number of files named.
 * (gres_name is currently unused here.)
 */
static int _validate_file(char *path_name, char *gres_name)
{
	char *file_name, *slash, *one_name, *root_path;
	hostlist_t hl;
	int i, file_count = 0;

	/* No trailing "]": a single, literal file */
	i = strlen(path_name);
	if ((i < 3) || (path_name[i-1] != ']')) {
		_my_stat(path_name);
		return 1;
	}

	/* Split into directory prefix and bracketed basename */
	slash = strrchr(path_name, '/');
	if (slash) {
		/* Temporarily truncate to copy just the directory part */
		slash[0] = '\0';
		root_path = xstrdup(path_name);
		xstrcat(root_path, "/");
		slash[0] = '/';
		file_name = slash + 1;
	} else {
		file_name = path_name;
		root_path = NULL;
	}
	hl = hostlist_create(file_name);
	if (hl == NULL)
		fatal("can't parse File=%s", path_name);
	while ((one_name = hostlist_shift(hl))) {
		if (slash) {
			char *formatted_path = NULL;
			xstrfmtcat(formatted_path, "%s/%s",
				   root_path, one_name);
			_my_stat(formatted_path);
			xfree(formatted_path);
		} else {
			_my_stat(one_name);
		}
		file_count++;
		free(one_name);	/* hostlist_shift() result is malloc'd */
	}
	hostlist_destroy(hl);
	xfree(root_path);

	return file_count;
}
1000
1001 /*
1002 * Check that we have a comma-delimited list of numbers
1003 */
_validate_links(gres_slurmd_conf_t * p)1004 static void _validate_links(gres_slurmd_conf_t *p)
1005 {
1006 char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
1007 long int val;
1008
1009 if (!p->links)
1010 return;
1011 if (p->links[0] == '\0') {
1012 xfree(p->links);
1013 return;
1014 }
1015
1016 tmp = xstrdup(p->links);
1017 tok = strtok_r(tmp, ",", &save_ptr);
1018 while (tok) {
1019 val = strtol(tok, &end_ptr, 10);
1020 if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
1021 (end_ptr[0] != '\0')) {
1022 error("gres.conf: Ignoring invalid Link (%s) for Name=%s",
1023 tok, p->name);
1024 xfree(p->links);
1025 break;
1026 }
1027 tok = strtok_r(NULL, ",", &save_ptr);
1028 }
1029 xfree(tmp);
1030 }
1031
/*
 * Return true if count can be greater than 1 for a given file.
 * For example, each GPU can have arbitrary count of MPS elements.
 */
static bool _multi_count_per_file(char *name)
{
	return (xstrcmp(name, "mps") == 0);
}
1042
/*
 * Build gres_slurmd_conf_t record based upon a line from the gres.conf file
 *
 * dest     - (out) newly built gres_slurmd_conf_t record
 * type/key - s_p parser callback arguments (unused here)
 * value    - text following "Name="; NULL when invoked from
 *	      _parse_gres_config2() for a "NodeName=" line, in which case
 *	      an in-line Name= option is required
 * line     - full configuration line (for error messages)
 * leftover - (in/out) remaining unparsed text on the line
 *
 * Returns 1 with *dest set on success, 0 when the record is ignored
 * (missing/unknown name). Calls fatal() on malformed Core/CPU/Count data.
 */
static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
			      const char *key, const char *value,
			      const char *line, char **leftover)
{
	static s_p_options_t _gres_options[] = {
		{"Count", S_P_STRING},	/* Number of Gres available */
		{"CPUs" , S_P_STRING},	/* CPUs to bind to Gres resource
					 * (deprecated, use Cores) */
		{"Cores", S_P_STRING},	/* Cores to bind to Gres resource */
		{"File", S_P_STRING},	/* Path to Gres device */
		{"Files", S_P_STRING},	/* Path to Gres device */
		{"Flags", S_P_STRING},	/* GRES Flags */
		{"Link", S_P_STRING},	/* Communication link IDs */
		{"Links", S_P_STRING},	/* Communication link IDs */
		{"Name", S_P_STRING},	/* Gres name */
		{"Type", S_P_STRING},	/* Gres type (e.g. model name) */
		{NULL}
	};
	int i;
	s_p_hashtbl_t *tbl;
	gres_slurmd_conf_t *p;
	uint64_t tmp_uint64, mult;
	char *tmp_str, *last;
	bool cores_flag = false, cpus_flag = false;
	char *type_str = NULL;

	tbl = s_p_hashtbl_create(_gres_options);
	s_p_parse_line(tbl, *leftover, leftover);

	p = xmalloc(sizeof(gres_slurmd_conf_t));
	if (!value) {
		/* NodeName= form: the GRES name must be given via Name= */
		if (!s_p_get_string(&p->name, "Name", tbl)) {
			error("Invalid GRES data, no type name (%s)", line);
			xfree(p);
			s_p_hashtbl_destroy(tbl);
			return 0;
		}
	} else {
		p->name = xstrdup(value);
	}

	p->cpu_cnt = gres_cpu_cnt;
	/* "Cores" takes precedence over the deprecated "CPUs" keyword */
	if (s_p_get_string(&p->cpus, "Cores", tbl)) {
		cores_flag = true;
		type_str = "Cores";
	} else if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
		cpus_flag = true;
		type_str = "CPUs";
	}
	if (cores_flag || cpus_flag) {
		char *local_cpus = NULL;
		/* Translate abstract core IDs to machine IDs if possible */
		if (xcpuinfo_ops.xcpuinfo_abs_to_mac) {
			i = (xcpuinfo_ops.xcpuinfo_abs_to_mac)
				(p->cpus, &local_cpus);
			/*
			 * Only executed by slurmstepd and we don't want
			 * fatal here. Ignore bad Core/CPU configuration.
			 */
			if (i != SLURM_SUCCESS) {
				error("Invalid GRES data for %s, %s=%s",
				      p->name, type_str, p->cpus);
			}
		} else {
			local_cpus = xstrdup(p->cpus);
			i = SLURM_SUCCESS;
		}
		if (i == SLURM_SUCCESS) {
			/* Build the core bitmap for this GRES record */
			p->cpus_bitmap = bit_alloc(gres_cpu_cnt);
			if ((bit_size(p->cpus_bitmap) == 0) ||
			    bit_unfmt(p->cpus_bitmap, local_cpus) != 0) {
				fatal("Invalid GRES data for %s, %s=%s (only %u CPUs are available)",
				      p->name, type_str, p->cpus, gres_cpu_cnt);
			}
		}
		xfree(local_cpus);
	}

	/* File= implies a count equal to the number of files named */
	if (s_p_get_string(&p->file, "File", tbl) ||
	    s_p_get_string(&p->file, "Files", tbl)) {
		p->count = _validate_file(p->file, p->name);
		p->config_flags |= GRES_CONF_HAS_FILE;
	}

	if (s_p_get_string(&tmp_str, "Flags", tbl)) {
		if (xstrcasestr(tmp_str, "CountOnly"))
			p->config_flags |= GRES_CONF_COUNT_ONLY;
		xfree(tmp_str);
	}

	if (s_p_get_string(&p->links, "Link",  tbl) ||
	    s_p_get_string(&p->links, "Links", tbl)) {
		_validate_links(p);
	}

	if (s_p_get_string(&p->type_name, "Type", tbl)) {
		p->config_flags |= GRES_CONF_HAS_TYPE;
	}

	if (s_p_get_string(&tmp_str, "Count", tbl)) {
		/* Parse count with an optional K/M/G/T/P suffix */
		tmp_uint64 = strtoll(tmp_str, &last, 10);
		if ((tmp_uint64 == LONG_MIN) || (tmp_uint64 == LONG_MAX)) {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		if ((mult = suffix_mult(last)) != NO_VAL64) {
			tmp_uint64 *= mult;
		} else {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		/*
		 * Some GRES can have count > 1 for a given file. For example,
		 * each GPU can have arbitrary count of MPS elements.
		 */
		if (p->count && (p->count != tmp_uint64) &&
		    !_multi_count_per_file(p->name)) {
			fatal("Invalid GRES record for %s, count does not match File value",
			      p->name);
		}
		if (tmp_uint64 >= NO_VAL64) {
			fatal("GRES %s has invalid count value %"PRIu64,
			      p->name, tmp_uint64);
		}
		p->count = tmp_uint64;
		xfree(tmp_str);
	} else if (p->count == 0)
		p->count = 1;	/* Default count when File= absent too */

	s_p_hashtbl_destroy(tbl);

	/* Reject records whose name matches no configured GRES plugin */
	for (i = 0; i < gres_context_cnt; i++) {
		if (xstrcasecmp(p->name, gres_context[i].gres_name) == 0)
			break;
	}
	if (i >= gres_context_cnt) {
		error("Ignoring gres.conf record, invalid name: %s", p->name);
		destroy_gres_slurmd_conf(p);
		return 0;
	}
	p->plugin_id = gres_context[i].plugin_id;
	*dest = (void *)p;
	return 1;
}
/*
 * Parse a "NodeName=" gres.conf line. If the NodeName expression includes
 * this node, hand off to _parse_gres_config(); otherwise consume and
 * discard the rest of the line.
 */
static int _parse_gres_config2(void **dest, slurm_parser_enum_t type,
			       const char *key, const char *value,
			       const char *line, char **leftover)
{
	static s_p_options_t _gres_options[] = {
		{"Count", S_P_STRING},	/* Number of Gres available */
		{"CPUs" , S_P_STRING},	/* CPUs to bind to Gres resource */
		{"Cores", S_P_STRING},	/* Cores to bind to Gres resource */
		{"File", S_P_STRING},	/* Path to Gres device */
		{"Files", S_P_STRING},	/* Path to Gres device */
		{"Flags", S_P_STRING},	/* GRES Flags */
		{"Link", S_P_STRING},	/* Communication link IDs */
		{"Links", S_P_STRING},	/* Communication link IDs */
		{"Name", S_P_STRING},	/* Gres name */
		{"Type", S_P_STRING},	/* Gres type (e.g. model name) */
		{NULL}
	};
	s_p_hashtbl_t *tbl;
	bool node_matches = true;

	if (gres_node_name && value) {
		hostlist_t node_list = hostlist_create(value);

		node_matches = false;
		if (node_list) {
			if (hostlist_find(node_list, gres_node_name) >= 0)
				node_matches = true;
			hostlist_destroy(node_list);
		}
	}
	if (!node_matches) {
		/* Record is for another node: swallow the remaining options */
		debug("skipping GRES for NodeName=%s %s", value, line);
		tbl = s_p_hashtbl_create(_gres_options);
		s_p_parse_line(tbl, *leftover, leftover);
		s_p_hashtbl_destroy(tbl);
		return 0;
	}
	return _parse_gres_config(dest, type, key, NULL, line, leftover);
}
1226
/*
 * Sanity check the slurm.conf GRES records for one plugin. If some records
 * under a name carry a type while others do not, the per-type counts will
 * not sum to gres_cnt_config and we abort with a fatal error.
 *
 * slurm_conf_list - (in) Node's GRES data from slurm.conf.
 * context_ptr - (in) Which GRES plugin we are currently working in.
 */
static void _validate_slurm_conf(List slurm_conf_list,
				 slurm_gres_context_t *context_ptr)
{
	ListIterator iter;
	gres_state_t *gres_ptr;

	if (!slurm_conf_list)
		return;

	iter = list_iterator_create(slurm_conf_list);
	while ((gres_ptr = list_next(iter))) {
		gres_node_state_t *slurm_gres;
		uint64_t tmp_count = 0;

		/* Only look at the GRES under the current plugin (same name) */
		if (gres_ptr->plugin_id != context_ptr->plugin_id)
			continue;

		slurm_gres = (gres_node_state_t *)gres_ptr->gres_data;

		/*
		 * gres_cnt_config should equal the combined count from
		 * type_cnt_avail if there are no untyped GRES
		 */
		for (uint16_t i = 0; i < slurm_gres->type_cnt; i++)
			tmp_count += slurm_gres->type_cnt_avail[i];

		/* Forbid mixing typed and untyped GRES under the same name */
		if (slurm_gres->type_cnt &&
		    slurm_gres->gres_cnt_config > tmp_count)
			fatal("%s: Some %s GRES in slurm.conf have a type while others do not (slurm_gres->gres_cnt_config (%"PRIu64") > tmp_count (%"PRIu64"))",
			      __func__, context_ptr->gres_name,
			      slurm_gres->gres_cnt_config, tmp_count);
	}
	/* Fix: the iterator was previously leaked on return */
	list_iterator_destroy(iter);
}
1262
/*
 * Sanity check the gres.conf records for one GRES plugin and load the
 * plugin. All records for a given name must be consistent: either all or
 * none specify "File=", and either all or none specify "Type=". Updates
 * context_ptr->config_flags to reflect what the records declared.
 *
 * gres_conf_list - (in) GRES data parsed from gres.conf.
 * context_ptr - (in/out) Which GRES plugin we are currently working in.
 */
static void _validate_gres_conf(List gres_conf_list,
				slurm_gres_context_t *context_ptr)
{
	ListIterator iter;
	gres_slurmd_conf_t *gres_slurmd_conf;
	int new_has_file = -1, new_has_type = -1, rec_count = 0;
	bool orig_has_file, orig_has_type;

	iter = list_iterator_create(gres_conf_list);
	while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
		if (gres_slurmd_conf->plugin_id != context_ptr->plugin_id)
			continue;

		/*
		 * If any plugin of this type has this set it will virally set
		 * any other to be the same as we use the context_ptr from here
		 * on out.
		 */
		if (gres_slurmd_conf->config_flags & GRES_CONF_COUNT_ONLY)
			context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;

		/*
		 * Since there could be multiple types of the same plugin we
		 * need to only make sure we load it once.
		 */
		if (!(context_ptr->config_flags & GRES_CONF_LOADED)) {
			/*
			 * Ignore return code, as we will still support the gres
			 * with or without the plugin.
			 */
			if (_load_gres_plugin(context_ptr) == SLURM_SUCCESS)
				context_ptr->config_flags |= GRES_CONF_LOADED;
		}

		rec_count++;
		/* -1 means "no record seen yet"; first record sets baseline */
		orig_has_file = gres_slurmd_conf->config_flags &
			GRES_CONF_HAS_FILE;
		if (new_has_file == -1) {
			if (gres_slurmd_conf->config_flags &
			    GRES_CONF_HAS_FILE) {
				new_has_file = 1;
			} else
				new_has_file = 0;
		} else if (( new_has_file && !orig_has_file) ||
			   (!new_has_file && orig_has_file)) {
			fatal("gres.conf for %s, some records have \"File\" specification while others do not",
			      context_ptr->gres_name);
		}
		orig_has_type = gres_slurmd_conf->config_flags &
			GRES_CONF_HAS_TYPE;
		if (new_has_type == -1) {
			if (gres_slurmd_conf->config_flags &
			    GRES_CONF_HAS_TYPE) {
				new_has_type = 1;
			} else
				new_has_type = 0;
		} else if (( new_has_type && !orig_has_type) ||
			   (!new_has_type && orig_has_type)) {
			fatal("gres.conf for %s, some records have \"Type=\" specification while others do not",
			      context_ptr->gres_name);
		}
		/* Records with neither File nor Type must be unique per name */
		if ((new_has_file == 0) && (new_has_type == 0) &&
		    (rec_count > 1)) {
			fatal("gres.conf duplicate records for %s",
			      context_ptr->gres_name);
		}

		if (new_has_file)
			context_ptr->config_flags |= GRES_CONF_HAS_FILE;
	}
	list_iterator_destroy(iter);

	if (!(context_ptr->config_flags & GRES_CONF_LOADED)) {
		/*
		 * This means there was no gres.conf line for this gres found.
		 * We still need to try to load it for AutoDetect's sake.
		 * If we fail loading we will treat it as a count
		 * only GRES since the stepd will try to load it elsewise.
		 */
		if (_load_gres_plugin(context_ptr) != SLURM_SUCCESS)
			context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;
	} else
		/* Remove as this is only really used locally */
		context_ptr->config_flags &= (~GRES_CONF_LOADED);
}
1348
1349 /*
1350 * Keep track of which gres.conf lines have a count greater than expected
1351 * according to the current slurm.conf GRES. Modify the count of throw-away
1352 * records in gres_conf_list_tmp to keep track of this. Any gres.conf records
1353 * with a count > 0 means that slurm.conf did not account for it completely.
1354 *
1355 * gres_conf_list_tmp - (in/out) The temporary gres.conf list.
1356 * count - (in) The count of the current slurm.conf GRES record.
1357 * type_name - (in) The type of the current slurm.conf GRES record.
1358 */
_compare_conf_counts(List gres_conf_list_tmp,uint64_t count,char * type_name)1359 static void _compare_conf_counts(List gres_conf_list_tmp, uint64_t count,
1360 char *type_name)
1361 {
1362 gres_slurmd_conf_t *gres_conf;
1363 ListIterator iter = list_iterator_create(gres_conf_list_tmp);
1364 while ((gres_conf = list_next(iter))) {
1365 /* Note: plugin type filter already applied */
1366 /* Check that type is the same */
1367 if (xstrcasecmp(gres_conf->type_name, type_name))
1368 continue;
1369 /* Keep track of counts */
1370 if (gres_conf->count > count) {
1371 gres_conf->count -= count;
1372 /* This slurm.conf GRES specification is now used up */
1373 list_iterator_destroy(iter);
1374 return;
1375 } else {
1376 count -= gres_conf->count;
1377 gres_conf->count = 0;
1378 }
1379 }
1380 list_iterator_destroy(iter);
1381 }
1382
/*
 * Loop through each entry in gres.conf and see if there is a corresponding
 * entry in slurm.conf. If so, see if the counts line up. If there are more
 * devices specified in gres.conf than in slurm.conf, emit errors.
 *
 * slurm_conf_list - (in) The slurm.conf GRES list.
 * gres_conf_list - (in) The gres.conf GRES list.
 * context_ptr - (in) Which GRES plugin we are currently working in.
 */
static void _check_conf_mismatch(List slurm_conf_list, List gres_conf_list,
				 slurm_gres_context_t *context_ptr)
{
	ListIterator iter;
	gres_slurmd_conf_t *gres_conf;
	gres_state_t *slurm_conf;
	List gres_conf_list_tmp;

	/* E.g. slurm_conf_list will be NULL in the case of --gpu-bind */
	if (!slurm_conf_list || !gres_conf_list)
		return;

	/*
	 * Duplicate the gres.conf list with records relevant to this GRES plugin
	 * only so we can mangle records. Only add records under the current plugin.
	 */
	gres_conf_list_tmp = list_create(destroy_gres_slurmd_conf);
	iter = list_iterator_create(gres_conf_list);
	while ((gres_conf = list_next(iter))) {
		gres_slurmd_conf_t *gres_conf_tmp;
		if (gres_conf->plugin_id != context_ptr->plugin_id)
			continue;

		/* Shallow copy of just the fields the comparison needs */
		gres_conf_tmp = xmalloc(sizeof(*gres_conf_tmp));
		gres_conf_tmp->name = xstrdup(gres_conf->name);
		gres_conf_tmp->type_name = xstrdup(gres_conf->type_name);
		gres_conf_tmp->count = gres_conf->count;
		list_append(gres_conf_list_tmp, gres_conf_tmp);
	}
	list_iterator_destroy(iter);

	/*
	 * Loop through the slurm.conf list and see if there are more gres.conf
	 * GRES than expected.
	 */
	iter = list_iterator_create(slurm_conf_list);
	while ((slurm_conf = list_next(iter))) {
		gres_node_state_t *slurm_gres;

		if (slurm_conf->plugin_id != context_ptr->plugin_id)
			continue;

		/* Determine if typed or untyped, and act accordingly */
		slurm_gres = (gres_node_state_t *)slurm_conf->gres_data;
		if (!slurm_gres->type_name) {
			_compare_conf_counts(gres_conf_list_tmp,
					     slurm_gres->gres_cnt_config, NULL);
			continue;
		}

		/* Typed GRES: subtract each type's count separately */
		for (int i = 0; i < slurm_gres->type_cnt; ++i) {
			_compare_conf_counts(gres_conf_list_tmp,
					     slurm_gres->type_cnt_avail[i],
					     slurm_gres->type_name[i]);
		}
	}
	list_iterator_destroy(iter);

	/*
	 * Loop through gres_conf_list_tmp to print errors for gres.conf
	 * records that were not completely accounted for in slurm.conf.
	 */
	iter = list_iterator_create(gres_conf_list_tmp);
	while ((gres_conf = list_next(iter)))
		if (gres_conf->count > 0)
			info("WARNING: A line in gres.conf for GRES %s%s%s has %"PRIu64" more configured than expected in slurm.conf. Ignoring extra GRES.",
			     gres_conf->name,
			     (gres_conf->type_name) ? ":" : "",
			     (gres_conf->type_name) ? gres_conf->type_name : "",
			     gres_conf->count);
	list_iterator_destroy(iter);

	FREE_NULL_LIST(gres_conf_list_tmp);
}
1466
1467 /*
1468 * Match the type of a GRES from slurm.conf to a GRES in the gres.conf list. If
1469 * a match is found, pop it off the gres.conf list and return it.
1470 *
1471 * gres_conf_list - (in) The gres.conf list to search through.
1472 * gres_context - (in) Which GRES plugin we are currently working in.
1473 * type_name - (in) The type of the slurm.conf GRES record. If null, then
1474 * it's an untyped GRES.
1475 *
1476 * Returns the first gres.conf record from gres_conf_list with the same type
1477 * name as the slurm.conf record.
1478 */
_match_type(List gres_conf_list,slurm_gres_context_t * gres_context,char * type_name)1479 static gres_slurmd_conf_t *_match_type(List gres_conf_list,
1480 slurm_gres_context_t *gres_context,
1481 char *type_name)
1482 {
1483 ListIterator gres_conf_itr;
1484 gres_slurmd_conf_t *gres_conf = NULL;
1485
1486 gres_conf_itr = list_iterator_create(gres_conf_list);
1487 while ((gres_conf = list_next(gres_conf_itr))) {
1488 if (gres_conf->plugin_id != gres_context->plugin_id)
1489 continue;
1490
1491 /*
1492 * If type_name is NULL we will take the first matching
1493 * gres_conf that we find. This means we also will remove the
1494 * type from the gres_conf to match 18.08 stylings.
1495 */
1496 if (!type_name)
1497 xfree(gres_conf->type_name);
1498 else if (xstrcasecmp(gres_conf->type_name, type_name))
1499 continue;
1500
1501 /* We found a match, so remove from gres_conf_list and break */
1502 list_remove(gres_conf_itr);
1503 break;
1504 }
1505 list_iterator_destroy(gres_conf_itr);
1506
1507 return gres_conf;
1508 }
1509
1510 /*
1511 * Add a GRES conf record with count == 0 to gres_list.
1512 *
1513 * gres_list - (in/out) The gres list to add to.
1514 * gres_context - (in) The GRES plugin to add a GRES record for.
1515 * cpu_cnt - (in) The cpu count configured for the node.
1516 */
_add_gres_config_empty(List gres_list,slurm_gres_context_t * gres_context,uint32_t cpu_cnt)1517 static void _add_gres_config_empty(List gres_list,
1518 slurm_gres_context_t *gres_context,
1519 uint32_t cpu_cnt)
1520 {
1521 gres_slurmd_conf_t *gres_conf = xmalloc(sizeof(*gres_conf));
1522 gres_conf->cpu_cnt = cpu_cnt;
1523 gres_conf->name = xstrdup(gres_context->gres_name);
1524 gres_conf->plugin_id = gres_context->plugin_id;
1525 list_append(gres_list, gres_conf);
1526 }
1527
1528 /*
1529 * Truncate the File hostrange string of a GRES record to be to be at most
1530 * new_count entries. The extra entries will be removed.
1531 *
1532 * gres_conf - (in/out) The GRES record to modify.
1533 * count - (in) The new number of entries in File
1534 */
_set_file_subset(gres_slurmd_conf_t * gres_conf,uint64_t new_count)1535 static void _set_file_subset(gres_slurmd_conf_t *gres_conf, uint64_t new_count)
1536 {
1537 /* Convert file to hostrange */
1538 hostlist_t hl = hostlist_create(gres_conf->file);
1539 unsigned long old_count = hostlist_count(hl);
1540
1541 if (new_count >= old_count) {
1542 hostlist_destroy(hl);
1543 /* Nothing to do */
1544 return;
1545 }
1546
1547 /* Remove all but the first entries */
1548 for (int i = old_count; i > new_count; --i) {
1549 free(hostlist_pop(hl));
1550 }
1551
1552 debug3("%s: Truncating %s:%s File from (%ld) %s", __func__,
1553 gres_conf->name, gres_conf->type_name, old_count,
1554 gres_conf->file);
1555
1556 /* Set file to the new subset */
1557 xfree(gres_conf->file);
1558 gres_conf->file = hostlist_ranged_string_xmalloc(hl);
1559
1560 debug3("%s: to (%"PRIu64") %s", __func__, new_count, gres_conf->file);
1561 hostlist_destroy(hl);
1562 }
1563
/*
 * A continuation of _merge_gres() depending on if the slurm.conf GRES is typed
 * or not. Consumes matching gres.conf records (removed from gres_conf_list
 * via _match_type()) until the slurm.conf count is satisfied; any shortfall
 * becomes a synthesized record with no File/Cores/Links.
 *
 * gres_conf_list - (in) The gres.conf list.
 * new_list - (out) The new merged [slurm|gres].conf list.
 * count - (in) The count of the slurm.conf GRES record.
 * type_name - (in) The type of the slurm.conf GRES record, if it exists.
 * gres_context - (in) Which GRES plugin we are working in.
 * cpu_count - (in) A count of CPUs on the node.
 */
static void _merge_gres2(List gres_conf_list, List new_list, uint64_t count,
			 char *type_name, slurm_gres_context_t *gres_context,
			 uint32_t cpu_count)
{
	gres_slurmd_conf_t *gres_conf, *match;

	/* If slurm.conf count is initially 0, don't waste time on it */
	if (count == 0)
		return;

	/*
	 * There can be multiple gres.conf GRES lines contained within a
	 * single slurm.conf GRES line, due to different values of Cores
	 * and Links. Append them to the list where possible.
	 */
	while ((match = _match_type(gres_conf_list, gres_context, type_name))) {
		/* match has been detached from gres_conf_list; we own it */
		list_append(new_list, match);

		debug3("%s: From gres.conf, using %s:%s:%"PRIu64":%s", __func__,
		       match->name, match->type_name, match->count,
		       match->file);

		/* See if we need to merge with any more gres.conf records. */
		if (match->count > count) {
			/*
			 * Truncate excess count of gres.conf to match total
			 * count of slurm.conf.
			 */
			match->count = count;
			/*
			 * Truncate excess file of gres.conf to match total
			 * count of slurm.conf.
			 */
			if (match->file)
				_set_file_subset(match, count);
			/* Floor to 0 to break out of loop. */
			count = 0;
		} else
			/*
			 * Subtract this gres.conf line count from the
			 * slurm.conf total.
			 */
			count -= match->count;

		/*
		 * All devices outlined by this slurm.conf record have now been
		 * merged with gres.conf records and added to new_list, so exit.
		 */
		if (count == 0)
			break;
	}

	if (count == 0)
		return;

	/*
	 * There are leftover GRES specified in this slurm.conf record that are
	 * not accounted for in gres.conf that still need to be added.
	 */
	gres_conf = xmalloc(sizeof(*gres_conf));
	gres_conf->count = count;
	gres_conf->cpu_cnt = cpu_count;
	gres_conf->name = xstrdup(gres_context->gres_name);
	gres_conf->plugin_id = gres_context->plugin_id;
	if (type_name) {
		gres_conf->config_flags = GRES_CONF_HAS_TYPE;
		gres_conf->type_name = xstrdup(type_name);
	}

	/* Inherit the plugin-wide CountOnly setting (see _validate_gres_conf) */
	if (gres_context->config_flags & GRES_CONF_COUNT_ONLY)
		gres_conf->config_flags |= GRES_CONF_COUNT_ONLY;

	list_append(new_list, gres_conf);
}
1649
1650 /*
1651 * Merge a single slurm.conf GRES specification with any relevant gres.conf
1652 * records and append the result to new_list.
1653 *
1654 * gres_conf_list - (in) The gres.conf list.
1655 * new_list - (out) The new merged [slurm|gres].conf list.
1656 * ptr - (in) A slurm.conf GRES record.
1657 * gres_context - (in) Which GRES plugin we are working in.
1658 * cpu_cnt - (in) A count of CPUs on the node.
1659 */
_merge_gres(List gres_conf_list,List new_list,gres_state_t * ptr,slurm_gres_context_t * gres_context,uint32_t cpu_cnt)1660 static void _merge_gres(List gres_conf_list, List new_list, gres_state_t *ptr,
1661 slurm_gres_context_t *gres_context, uint32_t cpu_cnt)
1662 {
1663 gres_node_state_t *slurm_gres = (gres_node_state_t *)ptr->gres_data;
1664
1665 /* If this GRES has no types, merge in the single untyped GRES */
1666 if (slurm_gres->type_cnt == 0) {
1667 _merge_gres2(gres_conf_list, new_list,
1668 slurm_gres->gres_cnt_config, NULL, gres_context,
1669 cpu_cnt);
1670 return;
1671 }
1672
1673 /* If this GRES has types, merge in each typed GRES */
1674 for (int i = 0; i < slurm_gres->type_cnt; i++) {
1675 _merge_gres2(gres_conf_list, new_list,
1676 slurm_gres->type_cnt_avail[i],
1677 slurm_gres->type_name[i], gres_context, cpu_cnt);
1678 }
1679 }
1680
1681 /*
1682 * Merge slurm.conf and gres.conf GRES configuration.
1683 * gres.conf can only work within what is outlined in slurm.conf. Every
1684 * gres.conf device that does not match up to a device in slurm.conf is
1685 * discarded with an error. If no gres conf found for what is specified in
1686 * slurm.conf, create a zero-count conf record.
1687 *
1688 * node_conf - (in) node configuration info (cpu count).
1689 * gres_conf_list - (in/out) GRES data from gres.conf. This becomes the new
1690 * merged slurm.conf/gres.conf list.
1691 * slurm_conf_list - (in) GRES data from slurm.conf.
1692 */
_merge_config(node_config_load_t * node_conf,List gres_conf_list,List slurm_conf_list)1693 static void _merge_config(node_config_load_t *node_conf, List gres_conf_list,
1694 List slurm_conf_list)
1695 {
1696 int i;
1697 gres_state_t *gres_ptr;
1698 ListIterator iter;
1699 bool found;
1700
1701 List new_gres_list = list_create(destroy_gres_slurmd_conf);
1702
1703 for (i = 0; i < gres_context_cnt; i++) {
1704 /* Copy GRES configuration from slurm.conf */
1705 if (slurm_conf_list) {
1706 found = false;
1707 iter = list_iterator_create(slurm_conf_list);
1708 while ((gres_ptr = (gres_state_t *) list_next(iter))) {
1709 if (gres_ptr->plugin_id !=
1710 gres_context[i].plugin_id)
1711 continue;
1712 found = true;
1713 _merge_gres(gres_conf_list, new_gres_list,
1714 gres_ptr, &gres_context[i],
1715 node_conf->cpu_cnt);
1716 }
1717 list_iterator_destroy(iter);
1718 if (found)
1719 continue;
1720 }
1721
1722 /* Add GRES record with zero count */
1723 _add_gres_config_empty(new_gres_list, &gres_context[i],
1724 node_conf->cpu_cnt);
1725 }
1726 /* Set gres_conf_list to be the new merged list */
1727 list_flush(gres_conf_list);
1728 list_transfer(gres_conf_list, new_gres_list);
1729 FREE_NULL_LIST(new_gres_list);
1730 }
1731
/*
 * Load this node's configuration (how many resources it has, topology, etc.)
 * IN cpu_cnt - Number of CPUs configured on this node
 * IN node_name - Name of this node
 * IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd
 * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() funct, if available
 * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() funct, if available
 * RET SLURM_SUCCESS or an error code from plugin init / node_config_load
 * NOTE: Called from slurmd and slurmstepd
 * NOTE: Rebuilds the global gres_conf_list under gres_context_lock.
 */
extern int gres_plugin_node_config_load(uint32_t cpu_cnt, char *node_name,
					List gres_list,
					void *xcpuinfo_abs_to_mac,
					void *xcpuinfo_mac_to_abs)
{
	static s_p_options_t _gres_options[] = {
		{"AutoDetect", S_P_STRING},
		{"Name", S_P_ARRAY, _parse_gres_config, NULL},
		{"NodeName", S_P_ARRAY, _parse_gres_config2, NULL},
		{NULL}
	};

	int count = 0, i, rc, rc2;
	struct stat config_stat;
	s_p_hashtbl_t *tbl;
	gres_slurmd_conf_t **gres_array;
	char *gres_conf_file;
	char *autodetect_string = NULL;

	node_config_load_t node_conf = {
		.cpu_cnt = cpu_cnt,
		.xcpuinfo_mac_to_abs = xcpuinfo_mac_to_abs
	};

	if (cpu_cnt == 0) {
		error("%s: Invalid cpu_cnt of 0 for node %s",
		      __func__, node_name);
		return SLURM_ERROR;
	}

	if (xcpuinfo_abs_to_mac)
		xcpuinfo_ops.xcpuinfo_abs_to_mac = xcpuinfo_abs_to_mac;

	rc = gres_plugin_init();
	if (gres_context_cnt == 0)
		return SLURM_SUCCESS;

	slurm_mutex_lock(&gres_context_lock);
	/* Discard any previously loaded configuration and start fresh */
	FREE_NULL_LIST(gres_conf_list);
	gres_conf_list = list_create(destroy_gres_slurmd_conf);
	gres_conf_file = get_extra_conf_path("gres.conf");
	if (stat(gres_conf_file, &config_stat) < 0) {
		/* gres.conf is optional; slurm.conf alone may suffice */
		info("Can not stat gres.conf file (%s), using slurm.conf data",
		     gres_conf_file);
	} else {
		if (xstrcmp(gres_node_name, node_name)) {
			xfree(gres_node_name);
			gres_node_name = xstrdup(node_name);
		}

		gres_cpu_cnt = cpu_cnt;
		tbl = s_p_hashtbl_create(_gres_options);
		if (s_p_parse_file(tbl, NULL, gres_conf_file, false) == SLURM_ERROR)
			fatal("error opening/reading %s", gres_conf_file);

		/* Record which AutoDetect mechanisms were requested */
		if (s_p_get_string(&autodetect_string, "Autodetect", tbl)) {
			if (xstrcasestr(autodetect_string, "nvml"))
				autodetect_types |= GRES_AUTODETECT_NVML;
			if (xstrcasestr(autodetect_string, "rsmi"))
				autodetect_types |= GRES_AUTODETECT_RSMI;
			xfree(autodetect_string);
		}

		/* Collect records from both "Name=" and "NodeName=" lines */
		if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
			for (i = 0; i < count; i++) {
				list_append(gres_conf_list, gres_array[i]);
				gres_array[i] = NULL;
			}
		}
		if (s_p_get_array((void ***) &gres_array, &count, "NodeName", tbl)) {
			for (i = 0; i < count; i++) {
				list_append(gres_conf_list, gres_array[i]);
				gres_array[i] = NULL;
			}
		}
		s_p_hashtbl_destroy(tbl);
	}
	xfree(gres_conf_file);

	/* Validate gres.conf and slurm.conf somewhat before merging */
	for (i = 0; i < gres_context_cnt; i++) {
		_validate_slurm_conf(gres_list, &gres_context[i]);
		_validate_gres_conf(gres_conf_list, &gres_context[i]);
		_check_conf_mismatch(gres_list, gres_conf_list,
				     &gres_context[i]);
	}

	/* Merge slurm.conf and gres.conf together into gres_conf_list */
	_merge_config(&node_conf, gres_conf_list, gres_list);

	/* Give each plugin a chance to refine the merged configuration */
	for (i = 0; i < gres_context_cnt; i++) {
		if (gres_context[i].ops.node_config_load == NULL)
			continue;	/* No plugin */
		rc2 = (*(gres_context[i].ops.node_config_load))(gres_conf_list,
								&node_conf);
		/* Keep the first error encountered */
		if (rc == SLURM_SUCCESS)
			rc = rc2;

	}

	/* Postprocess gres_conf_list after all plugins' node_config_load */
	for (i = 0; i < gres_context_cnt; i++) {
		/* Remove every GPU with an empty File */
		_remove_fileless_gpus(gres_conf_list, &gres_context[i]);
	}

	list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL);
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}
1852
1853 /*
1854 * Pack this node's gres configuration into a buffer
1855 * IN/OUT buffer - message buffer to pack
1856 */
extern int gres_plugin_node_config_pack(Buf buffer)
{
	int rc;
	uint32_t magic = GRES_MAGIC;
	uint16_t version = SLURM_PROTOCOL_VERSION;
	uint16_t rec_cnt = 0;
	ListIterator conf_iter;
	gres_slurmd_conf_t *conf_rec;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	/* Header: protocol version, then the number of gres.conf records */
	pack16(version, buffer);
	if (gres_conf_list)
		rec_cnt = list_count(gres_conf_list);
	pack16(rec_cnt, buffer);
	if (rec_cnt != 0) {
		conf_iter = list_iterator_create(gres_conf_list);
		while ((conf_rec = (gres_slurmd_conf_t *)
			list_next(conf_iter))) {
			/* Each record starts with a magic for validation */
			pack32(magic, buffer);
			pack64(conf_rec->count, buffer);
			pack32(conf_rec->cpu_cnt, buffer);
			pack8(conf_rec->config_flags, buffer);
			pack32(conf_rec->plugin_id, buffer);
			packstr(conf_rec->cpus, buffer);
			packstr(conf_rec->links, buffer);
			packstr(conf_rec->name, buffer);
			packstr(conf_rec->type_name, buffer);
		}
		list_iterator_destroy(conf_iter);
	}
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}
1892
1893 /*
1894 * Unpack this node's configuration from a buffer (built/packed by slurmd)
1895 * IN/OUT buffer - message buffer to unpack
1896 * IN node_name - name of node whose data is being unpacked
1897 */
extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name)
{
	int i, j, rc;
	uint32_t cpu_cnt = 0, magic = 0, plugin_id = 0, utmp32 = 0;
	uint64_t count64 = 0;
	uint16_t rec_cnt = 0, protocol_version = 0;
	uint8_t config_flags = 0;
	char *tmp_cpus = NULL, *tmp_links = NULL, *tmp_name = NULL;
	char *tmp_type = NULL;
	gres_slurmd_conf_t *p;

	rc = gres_plugin_init();

	/* Rebuild gres_conf_list from scratch with the unpacked records */
	FREE_NULL_LIST(gres_conf_list);
	gres_conf_list = list_create(destroy_gres_slurmd_conf);

	/*
	 * Take the lock before the first safe_unpack*() call: those macros
	 * "goto unpack_error" on failure, and unpack_error unlocks the mutex.
	 * Previously an unpack failure (or rec_cnt > NO_VAL16) here would
	 * unlock a mutex that was never locked, which is undefined behavior.
	 */
	slurm_mutex_lock(&gres_context_lock);

	safe_unpack16(&protocol_version, buffer);

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0) {
		/* Nothing to unpack */
		slurm_mutex_unlock(&gres_context_lock);
		return SLURM_SUCCESS;
	}
	if (rec_cnt > NO_VAL16)
		goto unpack_error;

	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		goto unpack_error;
	}
	for (i = 0; i < rec_cnt; i++) {
		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;

			safe_unpack64(&count64, buffer);
			safe_unpack32(&cpu_cnt, buffer);
			safe_unpack8(&config_flags, buffer);
			safe_unpack32(&plugin_id, buffer);
			safe_unpackstr_xmalloc(&tmp_cpus, &utmp32, buffer);
			safe_unpackstr_xmalloc(&tmp_links, &utmp32, buffer);
			safe_unpackstr_xmalloc(&tmp_name, &utmp32, buffer);
			safe_unpackstr_xmalloc(&tmp_type, &utmp32, buffer);
		}

		if (slurm_get_debug_flags() & DEBUG_FLAG_GRES) {
			info("Node:%s Gres:%s Type:%s Flags:%s CPU_IDs:%s CPU#:%u Count:%"
			     PRIu64" Links:%s",
			     node_name, tmp_name, tmp_type,
			     gres_flags2str(config_flags), tmp_cpus, cpu_cnt,
			     count64, tmp_links);
		}
		/* Match the record to a configured GRES plugin by ID */
		for (j = 0; j < gres_context_cnt; j++) {
			bool new_has_file, new_has_type;
			bool orig_has_file, orig_has_type;
			if (gres_context[j].plugin_id != plugin_id)
				continue;
			if (xstrcmp(gres_context[j].gres_name, tmp_name)) {
				/*
				 * Should have been caught in
				 * gres_plugin_init()
				 */
				error("%s: gres/%s duplicate plugin ID with %s, unable to process",
				      __func__, tmp_name,
				      gres_context[j].gres_name);
				continue;
			}
			new_has_file  = config_flags & GRES_CONF_HAS_FILE;
			orig_has_file = gres_context[j].config_flags &
					GRES_CONF_HAS_FILE;
			if (orig_has_file && !new_has_file && count64) {
				error("%s: gres/%s lacks \"File=\" parameter for node %s",
				      __func__, tmp_name, node_name);
				config_flags |= GRES_CONF_HAS_FILE;
			}
			if (new_has_file && (count64 > MAX_GRES_BITMAP)) {
				/*
				 * Avoid over-subscribing memory with
				 * huge bitmaps
				 */
				error("%s: gres/%s has \"File=\" plus very large "
				      "\"Count\" (%"PRIu64") for node %s, "
				      "resetting value to %d",
				      __func__, tmp_name, count64,
				      node_name, MAX_GRES_BITMAP);
				count64 = MAX_GRES_BITMAP;
			}
			new_has_type  = config_flags & GRES_CONF_HAS_TYPE;
			orig_has_type = gres_context[j].config_flags &
					GRES_CONF_HAS_TYPE;
			if (orig_has_type && !new_has_type && count64) {
				error("%s: gres/%s lacks \"Type\" parameter for node %s",
				      __func__, tmp_name, node_name);
				config_flags |= GRES_CONF_HAS_TYPE;
			}
			gres_context[j].config_flags |= config_flags;

			/*
			 * On the slurmctld we need to load the plugins to
			 * correctly set env vars. We want to call this only
			 * after we have the config_flags so we can tell if we
			 * are CountOnly or not.
			 */
			if (!(gres_context[j].config_flags &
			      GRES_CONF_LOADED)) {
				(void)_load_gres_plugin(&gres_context[j]);
				gres_context[j].config_flags |=
					GRES_CONF_LOADED;
			}

			break;
		}
		if (j >= gres_context_cnt) {
			/*
			 * GresPlugins is inconsistently configured.
			 * Not a fatal error, but skip this data.
			 */
			error("%s: No plugin configured to process GRES data from node %s (Name:%s Type:%s PluginID:%u Count:%"PRIu64")",
			      __func__, node_name, tmp_name, tmp_type,
			      plugin_id, count64);
			xfree(tmp_cpus);
			xfree(tmp_links);
			xfree(tmp_name);
			xfree(tmp_type);
			continue;
		}
		p = xmalloc(sizeof(gres_slurmd_conf_t));
		p->config_flags = config_flags;
		p->count = count64;
		p->cpu_cnt = cpu_cnt;
		p->cpus = tmp_cpus;
		tmp_cpus = NULL;	/* Nothing left to xfree */
		p->links = tmp_links;
		tmp_links = NULL;	/* Nothing left to xfree */
		p->name = tmp_name;     /* Preserve for accounting! */
		p->type_name = tmp_type;
		tmp_type = NULL;	/* Nothing left to xfree */
		p->plugin_id = plugin_id;
		_validate_links(p);
		list_append(gres_conf_list, p);
	}

	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from node %s", __func__, node_name);
	xfree(tmp_cpus);
	xfree(tmp_links);
	xfree(tmp_name);
	xfree(tmp_type);
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}
2053
/*
 * Release all topology-related state within a gres_node_state_t.
 * The parallel topo_* arrays may individually be NULL, so each array
 * is checked before its per-element contents are accessed.
 */
static void _gres_node_state_delete_topo(gres_node_state_t *gres_node_ptr)
{
	int i;

	for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
		if (gres_node_ptr->topo_gres_bitmap)
			FREE_NULL_BITMAP(gres_node_ptr->topo_gres_bitmap[i]);
		if (gres_node_ptr->topo_core_bitmap)
			FREE_NULL_BITMAP(gres_node_ptr->topo_core_bitmap[i]);
		/*
		 * Guard like the bitmaps above; this was previously an
		 * unconditional dereference of topo_type_name.
		 */
		if (gres_node_ptr->topo_type_name)
			xfree(gres_node_ptr->topo_type_name[i]);
	}
	xfree(gres_node_ptr->topo_gres_bitmap);
	xfree(gres_node_ptr->topo_core_bitmap);
	xfree(gres_node_ptr->topo_gres_cnt_alloc);
	xfree(gres_node_ptr->topo_gres_cnt_avail);
	xfree(gres_node_ptr->topo_type_id);
	xfree(gres_node_ptr->topo_type_name);
}
2072
/*
 * Free a gres_node_state_t and everything it owns: allocation bitmap,
 * usage string, links matrix, topology state and per-Type arrays.
 */
static void _gres_node_state_delete(gres_node_state_t *gres_node_ptr)
{
	int i;

	FREE_NULL_BITMAP(gres_node_ptr->gres_bit_alloc);
	xfree(gres_node_ptr->gres_used);
	if (gres_node_ptr->links_cnt) {
		for (i = 0; i < gres_node_ptr->link_len; i++)
			xfree(gres_node_ptr->links_cnt[i]);
		xfree(gres_node_ptr->links_cnt);
	}

	_gres_node_state_delete_topo(gres_node_ptr);

	/* Guard the array before per-element access (as done for links_cnt) */
	if (gres_node_ptr->type_name) {
		for (i = 0; i < gres_node_ptr->type_cnt; i++) {
			xfree(gres_node_ptr->type_name[i]);
		}
	}
	xfree(gres_node_ptr->type_cnt_alloc);
	xfree(gres_node_ptr->type_cnt_avail);
	xfree(gres_node_ptr->type_id);
	xfree(gres_node_ptr->type_name);
	xfree(gres_node_ptr);
}
2096
2097 /*
 * Delete an element placed on gres_list by _node_config_validate() and
 * free the associated memory
2100 */
_gres_node_list_delete(void * list_element)2101 static void _gres_node_list_delete(void *list_element)
2102 {
2103 gres_state_t *gres_ptr;
2104 gres_node_state_t *gres_node_ptr;
2105
2106 gres_ptr = (gres_state_t *) list_element;
2107 gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
2108 _gres_node_state_delete(gres_node_ptr);
2109 xfree(gres_ptr);
2110 }
2111
_add_gres_type(char * type,gres_node_state_t * gres_data,uint64_t tmp_gres_cnt)2112 static void _add_gres_type(char *type, gres_node_state_t *gres_data,
2113 uint64_t tmp_gres_cnt)
2114 {
2115 int i;
2116 uint32_t type_id;
2117
2118 if (!xstrcasecmp(type, "no_consume")) {
2119 gres_data->no_consume = true;
2120 return;
2121 }
2122
2123 type_id = gres_plugin_build_id(type);
2124 for (i = 0; i < gres_data->type_cnt; i++) {
2125 if (gres_data->type_id[i] != type_id)
2126 continue;
2127 gres_data->type_cnt_avail[i] += tmp_gres_cnt;
2128 break;
2129 }
2130
2131 if (i >= gres_data->type_cnt) {
2132 gres_data->type_cnt++;
2133 gres_data->type_cnt_alloc =
2134 xrealloc(gres_data->type_cnt_alloc,
2135 sizeof(uint64_t) * gres_data->type_cnt);
2136 gres_data->type_cnt_avail =
2137 xrealloc(gres_data->type_cnt_avail,
2138 sizeof(uint64_t) * gres_data->type_cnt);
2139 gres_data->type_id =
2140 xrealloc(gres_data->type_id,
2141 sizeof(uint32_t) * gres_data->type_cnt);
2142 gres_data->type_name =
2143 xrealloc(gres_data->type_name,
2144 sizeof(char *) * gres_data->type_cnt);
2145 gres_data->type_cnt_avail[i] += tmp_gres_cnt;
2146 gres_data->type_id[i] = type_id;
2147 gres_data->type_name[i] = xstrdup(type);
2148 }
2149 }
2150
2151 /*
2152 * Compute the total GRES count for a particular gres_name.
2153 * Note that a given gres_name can appear multiple times in the orig_config
2154 * string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2").
2155 * IN/OUT gres_data - set gres_cnt_config field in this structure
2156 * IN orig_config - gres configuration from slurm.conf
2157 * IN gres_name - name of the gres type (e.g. "gpu")
2158 * IN gres_name_colon - gres name with appended colon
2159 * IN gres_name_colon_len - size of gres_name_colon
 * RET - none; the total configured count is stored in
 *	 gres_data->gres_cnt_config
2161 */
static void _get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
			  char *gres_name, char *gres_name_colon,
			  int gres_name_colon_len)
{
	char *node_gres_config, *tok, *last_tok = NULL;
	char *sub_tok, *last_sub_tok = NULL;
	char *num, *paren, *last_num = NULL;
	uint64_t gres_config_cnt = 0, tmp_gres_cnt = 0, mult;
	int i;

	xassert(gres_data);
	if (orig_config == NULL) {
		gres_data->gres_cnt_config = 0;
		return;
	}

	/* Reset per-Type counts; _add_gres_type() re-accumulates them below */
	for (i = 0; i < gres_data->type_cnt; i++) {
		gres_data->type_cnt_avail[i] = 0;
	}

	/* Parse a copy: strtok_r() and the '\0' patches mutate the string */
	node_gres_config = xstrdup(orig_config);
	tok = strtok_r(node_gres_config, ",", &last_tok);
	while (tok) {
		if (!xstrcmp(tok, gres_name)) {
			/* Bare name (e.g. "gpu") implies a count of 1 */
			gres_config_cnt = 1;
			break;
		}
		if (!xstrncmp(tok, gres_name_colon, gres_name_colon_len)) {
			paren = strrchr(tok, '(');
			if (paren)	/* Ignore socket binding info */
				paren[0] = '\0';
			/* Count (if any) follows the last ':' */
			num = strrchr(tok, ':');
			if (!num) {
				error("Bad GRES configuration: %s", tok);
				break;
			}
			tmp_gres_cnt = strtoll(num + 1, &last_num, 10);
			if ((num[1] < '0') || (num[1] > '9')) {
				/*
				 * Type name, no count (e.g. "gpu:tesla").
				 * assume count of 1.
				 */
				tmp_gres_cnt = 1;
			} else if ((mult = suffix_mult(last_num)) != NO_VAL64) {
				/* Apply any K/M/G/... suffix multiplier */
				tmp_gres_cnt *= mult;
			} else {
				error("Bad GRES configuration: %s", tok);
				break;
			}

			gres_config_cnt += tmp_gres_cnt;
			/* Strip the count, leaving "name[:type...]" */
			num[0] = '\0';

			sub_tok = strtok_r(tok, ":", &last_sub_tok);
			if (sub_tok)	/* Skip GRES name */
				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
			/* Remaining colon-separated fields are Type names */
			while (sub_tok) {
				_add_gres_type(sub_tok, gres_data,
					       tmp_gres_cnt);
				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
			}
		}
		tok = strtok_r(NULL, ",", &last_tok);
	}
	xfree(node_gres_config);

	gres_data->gres_cnt_config = gres_config_cnt;
}
2230
_valid_gres_type(char * gres_name,gres_node_state_t * gres_data,bool config_overrides,char ** reason_down)2231 static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
2232 bool config_overrides, char **reason_down)
2233 {
2234 int i, j;
2235 uint64_t model_cnt;
2236
2237 if (gres_data->type_cnt == 0)
2238 return 0;
2239
2240 for (i = 0; i < gres_data->type_cnt; i++) {
2241 model_cnt = 0;
2242 if (gres_data->type_cnt) {
2243 for (j = 0; j < gres_data->type_cnt; j++) {
2244 if (gres_data->type_id[i] ==
2245 gres_data->type_id[j])
2246 model_cnt +=
2247 gres_data->type_cnt_avail[j];
2248 }
2249 } else {
2250 for (j = 0; j < gres_data->topo_cnt; j++) {
2251 if (gres_data->type_id[i] ==
2252 gres_data->topo_type_id[j])
2253 model_cnt +=
2254 gres_data->topo_gres_cnt_avail[j];
2255 }
2256 }
2257 if (config_overrides) {
2258 gres_data->type_cnt_avail[i] = model_cnt;
2259 } else if (model_cnt < gres_data->type_cnt_avail[i]) {
2260 if (reason_down) {
2261 xstrfmtcat(*reason_down,
2262 "%s:%s count too low "
2263 "(%"PRIu64" < %"PRIu64")",
2264 gres_name, gres_data->type_name[i],
2265 model_cnt,
2266 gres_data->type_cnt_avail[i]);
2267 }
2268 return -1;
2269 }
2270 }
2271 return 0;
2272 }
2273
_build_gres_node_state(void)2274 static gres_node_state_t *_build_gres_node_state(void)
2275 {
2276 gres_node_state_t *gres_data;
2277
2278 gres_data = xmalloc(sizeof(gres_node_state_t));
2279 gres_data->gres_cnt_config = NO_VAL64;
2280 gres_data->gres_cnt_found = NO_VAL64;
2281
2282 return gres_data;
2283 }
2284
2285 /*
2286 * Build a node's gres record based only upon the slurm.conf contents
2287 */
_node_config_init(char * node_name,char * orig_config,slurm_gres_context_t * context_ptr,gres_state_t * gres_ptr)2288 static int _node_config_init(char *node_name, char *orig_config,
2289 slurm_gres_context_t *context_ptr,
2290 gres_state_t *gres_ptr)
2291 {
2292 int rc = SLURM_SUCCESS;
2293 gres_node_state_t *gres_data;
2294
2295 if (!gres_ptr->gres_data)
2296 gres_ptr->gres_data = _build_gres_node_state();
2297 gres_data = (gres_node_state_t *) gres_ptr->gres_data;
2298
2299 /* If the resource isn't configured for use with this node */
2300 if ((orig_config == NULL) || (orig_config[0] == '\0')) {
2301 gres_data->gres_cnt_config = 0;
2302 return rc;
2303 }
2304
2305 _get_gres_cnt(gres_data, orig_config,
2306 context_ptr->gres_name,
2307 context_ptr->gres_name_colon,
2308 context_ptr->gres_name_colon_len);
2309
2310 context_ptr->total_cnt += gres_data->gres_cnt_config;
2311
2312 /* Use count from recovered state, if higher */
2313 gres_data->gres_cnt_avail = MAX(gres_data->gres_cnt_avail,
2314 gres_data->gres_cnt_config);
2315 if ((gres_data->gres_bit_alloc != NULL) &&
2316 (gres_data->gres_cnt_avail >
2317 bit_size(gres_data->gres_bit_alloc)) &&
2318 !_shared_gres(context_ptr->plugin_id)) {
2319 gres_data->gres_bit_alloc =
2320 bit_realloc(gres_data->gres_bit_alloc,
2321 gres_data->gres_cnt_avail);
2322 }
2323
2324 return rc;
2325 }
2326
2327 /*
2328 * Build a node's gres record based only upon the slurm.conf contents
2329 * IN node_name - name of the node for which the gres information applies
2330 * IN orig_config - Gres information supplied from slurm.conf
2331 * IN/OUT gres_list - List of Gres records for this node to track usage
2332 */
extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
					List *gres_list)
{
	int idx, rc, rc2;
	ListIterator iter;
	gres_state_t *gres_ptr;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	/* Create the per-node GRES list on first use */
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);

	for (idx = 0; idx < gres_context_cnt; idx++) {
		/* Locate the matching state record, or append one if absent */
		iter = list_iterator_create(*gres_list);
		while ((gres_ptr = (gres_state_t *) list_next(iter))) {
			if (gres_ptr->plugin_id == gres_context[idx].plugin_id)
				break;
		}
		list_iterator_destroy(iter);
		if (!gres_ptr) {
			gres_ptr = xmalloc(sizeof(gres_state_t));
			gres_ptr->plugin_id = gres_context[idx].plugin_id;
			list_append(*gres_list, gres_ptr);
		}

		rc2 = _node_config_init(node_name, orig_config,
					&gres_context[idx], gres_ptr);
		/* Preserve the first failure code */
		if (rc == SLURM_SUCCESS)
			rc = rc2;
	}
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}
2369
2370 /*
2371 * Determine GRES availability on some node
2372 * plugin_id IN - plugin number to search for
2373 * topo_cnt OUT - count of gres.conf records of this ID found by slurmd
2374 * (each can have different topology)
2375 * config_type_cnt OUT - Count of records for this GRES found in configuration,
 *	each of these represents a different Type of GRES with
 *	this name (e.g. GPU model)
2378 * RET - total number of GRES available of this ID on this node in (sum
2379 * across all records of this ID)
2380 */
static uint64_t _get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
				  int *config_type_cnt)
{
	ListIterator conf_iter;
	gres_slurmd_conf_t *conf;
	uint32_t with_topo = 0, rec_cnt = 0;
	uint64_t total = 0;

	xassert(config_type_cnt);
	xassert(topo_cnt);
	*config_type_cnt = 0;
	*topo_cnt = 0;
	if (!gres_conf_list)
		return total;

	conf_iter = list_iterator_create(gres_conf_list);
	while ((conf = (gres_slurmd_conf_t *) list_next(conf_iter))) {
		if (conf->plugin_id != plugin_id)
			continue;
		total += conf->count;
		rec_cnt++;
		/* Records with CPU binding or a Type carry topology info */
		if (conf->cpus || conf->type_name)
			with_topo++;
	}
	list_iterator_destroy(conf_iter);
	*config_type_cnt = rec_cnt;
	if (with_topo)
		*topo_cnt = rec_cnt;
	return total;
}
2411
2412 /*
2413 * Map a given GRES type ID back to a GRES type name.
2414 * gres_id IN - GRES type ID to search for.
2415 * gres_name IN - Pre-allocated string in which to store the GRES type name.
2416 * gres_name_len - Size of gres_name in bytes
2417 * RET - error code (currently not used--always return SLURM_SUCCESS)
2418 */
gres_gresid_to_gresname(uint32_t gres_id,char * gres_name,int gres_name_len)2419 extern int gres_gresid_to_gresname(uint32_t gres_id, char* gres_name,
2420 int gres_name_len)
2421 {
2422 int rc = SLURM_SUCCESS;
2423 int found = 0;
2424 int i;
2425
2426 /*
2427 * Check GresTypes from slurm.conf (gres_context) for GRES type name
2428 */
2429 slurm_mutex_lock(&gres_context_lock);
2430 for (i = 0; i < gres_context_cnt; ++i) {
2431 if (gres_id == gres_context[i].plugin_id) {
2432 strlcpy(gres_name, gres_context[i].gres_name,
2433 gres_name_len);
2434 found = 1;
2435 break;
2436 }
2437 }
2438 slurm_mutex_unlock(&gres_context_lock);
2439
2440 /*
2441 * If can't find GRES type name, emit error and default to GRES type ID
2442 */
2443 if (!found) {
2444 error("Could not find GRES type name in slurm.conf that corresponds to GRES type ID `%d`. Using ID as GRES type name instead.",
2445 gres_id);
2446 snprintf(gres_name, gres_name_len, "%u", gres_id);
2447 }
2448
2449 return rc;
2450 }
2451
2452 /* Convert comma-delimited array of link counts to an integer array */
_links_str2array(char * links,char * node_name,gres_node_state_t * gres_data,int gres_inx,int gres_cnt)2453 static void _links_str2array(char *links, char *node_name,
2454 gres_node_state_t *gres_data,
2455 int gres_inx, int gres_cnt)
2456 {
2457 char *start_ptr, *end_ptr = NULL;
2458 int i = 0;
2459
2460 if (!links) /* No "Links=" data */
2461 return;
2462 if (gres_inx >= gres_data->link_len) {
2463 error("%s: Invalid GRES index (%d >= %d)", __func__, gres_inx,
2464 gres_cnt);
2465 return;
2466 }
2467
2468 start_ptr = links;
2469 while (1) {
2470 gres_data->links_cnt[gres_inx][i] =
2471 strtol(start_ptr, &end_ptr, 10);
2472 if (gres_data->links_cnt[gres_inx][i] < -2) {
2473 error("%s: Invalid GRES Links value (%s) on node %s:"
2474 "Link value '%d' < -2", __func__, links,
2475 node_name, gres_data->links_cnt[gres_inx][i]);
2476 gres_data->links_cnt[gres_inx][i] = 0;
2477 return;
2478 }
2479 if (end_ptr[0] == '\0')
2480 return;
2481 if (end_ptr[0] != ',') {
2482 error("%s: Invalid GRES Links value (%s) on node %s:"
2483 "end_ptr[0]='%c' != ','", __func__, links,
2484 node_name, end_ptr[0]);
2485 return;
2486 }
2487 if (++i >= gres_data->link_len) {
2488 error("%s: Invalid GRES Links value (%s) on node %s:"
2489 "i=%d >= link_len=%d", __func__, links, node_name,
2490 i, gres_data->link_len);
2491 return;
2492 }
2493 start_ptr = end_ptr + 1;
2494 }
2495 }
2496
_valid_gres_types(char * gres_name,gres_node_state_t * gres_data,char ** reason_down)2497 static bool _valid_gres_types(char *gres_name, gres_node_state_t *gres_data,
2498 char **reason_down)
2499 {
2500 bool rc = true;
2501 uint64_t gres_cnt_found = 0, gres_sum;
2502 int topo_inx, type_inx;
2503
2504 if ((gres_data->type_cnt == 0) || (gres_data->topo_cnt == 0))
2505 return rc;
2506
2507 for (type_inx = 0; type_inx < gres_data->type_cnt; type_inx++) {
2508 gres_cnt_found = 0;
2509 for (topo_inx = 0; topo_inx < gres_data->topo_cnt; topo_inx++) {
2510 if (gres_data->topo_type_id[topo_inx] !=
2511 gres_data->type_id[type_inx])
2512 continue;
2513 gres_sum = gres_cnt_found +
2514 gres_data->topo_gres_cnt_avail[topo_inx];
2515 if (gres_sum > gres_data->type_cnt_avail[type_inx]) {
2516 gres_data->topo_gres_cnt_avail[topo_inx] -=
2517 (gres_sum -
2518 gres_data->type_cnt_avail[type_inx]);
2519 }
2520 gres_cnt_found +=
2521 gres_data->topo_gres_cnt_avail[topo_inx];
2522 }
2523 if (gres_cnt_found < gres_data->type_cnt_avail[type_inx]) {
2524 rc = false;
2525 break;
2526 }
2527 }
2528 if (!rc && reason_down && (*reason_down == NULL)) {
2529 xstrfmtcat(*reason_down,
2530 "%s:%s count too low (%"PRIu64" < %"PRIu64")",
2531 gres_name, gres_data->type_name[type_inx],
2532 gres_cnt_found, gres_data->type_cnt_avail[type_inx]);
2533 }
2534
2535 return rc;
2536 }
2537
static void _gres_bit_alloc_resize(gres_node_state_t *gres_data,
				   uint64_t gres_bits)
{
	/* A zero-bit bitmap is represented as no bitmap at all */
	if (gres_bits == 0) {
		FREE_NULL_BITMAP(gres_data->gres_bit_alloc);
		return;
	}

	if (gres_data->gres_bit_alloc == NULL) {
		gres_data->gres_bit_alloc = bit_alloc(gres_bits);
	} else if (bit_size(gres_data->gres_bit_alloc) != gres_bits) {
		gres_data->gres_bit_alloc =
			bit_realloc(gres_data->gres_bit_alloc, gres_bits);
	}
}
2552
_node_config_validate(char * node_name,char * orig_config,gres_state_t * gres_ptr,int cpu_cnt,int core_cnt,int sock_cnt,bool config_overrides,char ** reason_down,slurm_gres_context_t * context_ptr)2553 static int _node_config_validate(char *node_name, char *orig_config,
2554 gres_state_t *gres_ptr,
2555 int cpu_cnt, int core_cnt, int sock_cnt,
2556 bool config_overrides, char **reason_down,
2557 slurm_gres_context_t *context_ptr)
2558 {
2559 int cpus_config = 0, i, j, gres_inx, rc = SLURM_SUCCESS;
2560 int config_type_cnt = 0;
2561 uint64_t dev_cnt, gres_cnt, topo_cnt = 0;
2562 bool cpu_config_err = false, updated_config = false;
2563 gres_node_state_t *gres_data;
2564 ListIterator iter;
2565 gres_slurmd_conf_t *gres_slurmd_conf;
2566 bool has_file, has_type, rebuild_topo = false;
2567 uint32_t type_id;
2568
2569 xassert(core_cnt);
2570 if (gres_ptr->gres_data == NULL)
2571 gres_ptr->gres_data = _build_gres_node_state();
2572 gres_data = (gres_node_state_t *) gres_ptr->gres_data;
2573 if (gres_data->node_feature)
2574 return rc;
2575
2576 gres_cnt = _get_tot_gres_cnt(context_ptr->plugin_id, &topo_cnt,
2577 &config_type_cnt);
2578 if ((gres_data->gres_cnt_config > gres_cnt) && !config_overrides) {
2579 if (reason_down && (*reason_down == NULL)) {
2580 xstrfmtcat(*reason_down,
2581 "%s count reported lower than configured "
2582 "(%"PRIu64" < %"PRIu64")",
2583 context_ptr->gres_type,
2584 gres_cnt, gres_data->gres_cnt_config);
2585 }
2586 rc = EINVAL;
2587 }
2588 if ((gres_cnt > gres_data->gres_cnt_config)) {
2589 debug("%s: %s: Ignoring excess count on node %s (%"
2590 PRIu64" > %"PRIu64")",
2591 __func__, context_ptr->gres_type, node_name, gres_cnt,
2592 gres_data->gres_cnt_config);
2593 gres_cnt = gres_data->gres_cnt_config;
2594 }
2595 if (gres_data->gres_cnt_found != gres_cnt) {
2596 if (gres_data->gres_cnt_found != NO_VAL64) {
2597 info("%s: %s: Count changed on node %s (%"PRIu64" != %"PRIu64")",
2598 __func__, context_ptr->gres_type, node_name,
2599 gres_data->gres_cnt_found, gres_cnt);
2600 }
2601 if ((gres_data->gres_cnt_found != NO_VAL64) &&
2602 (gres_data->gres_cnt_alloc != 0)) {
2603 if (reason_down && (*reason_down == NULL)) {
2604 xstrfmtcat(*reason_down,
2605 "%s count changed and jobs are using them "
2606 "(%"PRIu64" != %"PRIu64")",
2607 context_ptr->gres_type,
2608 gres_data->gres_cnt_found, gres_cnt);
2609 }
2610 rc = EINVAL;
2611 } else {
2612 gres_data->gres_cnt_found = gres_cnt;
2613 updated_config = true;
2614 }
2615 }
2616 if (!updated_config && gres_data->type_cnt) {
2617 /*
2618 * This is needed to address the GRES specification in
2619 * gres.conf having a Type option, while the GRES specification
2620 * in slurm.conf does not.
2621 */
2622 for (i = 0; i < gres_data->type_cnt; i++) {
2623 if (gres_data->type_cnt_avail[i])
2624 continue;
2625 updated_config = true;
2626 break;
2627 }
2628 }
2629 if (!updated_config)
2630 return rc;
2631 if ((gres_cnt > gres_data->gres_cnt_config) && config_overrides) {
2632 info("%s: %s: count on node %s inconsistent with slurmctld count (%"PRIu64" != %"PRIu64")",
2633 __func__, context_ptr->gres_type, node_name,
2634 gres_cnt, gres_data->gres_cnt_config);
2635 gres_cnt = gres_data->gres_cnt_config; /* Ignore excess GRES */
2636 }
2637 if ((topo_cnt == 0) && (topo_cnt != gres_data->topo_cnt)) {
2638 /* Need to clear topology info */
2639 _gres_node_state_delete_topo(gres_data);
2640
2641 gres_data->topo_cnt = topo_cnt;
2642 }
2643
2644 has_file = context_ptr->config_flags & GRES_CONF_HAS_FILE;
2645 has_type = context_ptr->config_flags & GRES_CONF_HAS_TYPE;
2646 if (_shared_gres(context_ptr->plugin_id))
2647 dev_cnt = topo_cnt;
2648 else
2649 dev_cnt = gres_cnt;
2650 if (has_file && (topo_cnt != gres_data->topo_cnt) && (dev_cnt == 0)) {
2651 /*
2652 * Clear any vestigial GRES node state info.
2653 */
2654 _gres_node_state_delete_topo(gres_data);
2655
2656 xfree(gres_data->gres_bit_alloc);
2657
2658 gres_data->topo_cnt = 0;
2659 } else if (has_file && (topo_cnt != gres_data->topo_cnt)) {
2660 /*
2661 * Need to rebuild topology info.
2662 * Resize the data structures here.
2663 */
2664 rebuild_topo = true;
2665 gres_data->topo_gres_cnt_alloc =
2666 xrealloc(gres_data->topo_gres_cnt_alloc,
2667 topo_cnt * sizeof(uint64_t));
2668 gres_data->topo_gres_cnt_avail =
2669 xrealloc(gres_data->topo_gres_cnt_avail,
2670 topo_cnt * sizeof(uint64_t));
2671 for (i = 0; i < gres_data->topo_cnt; i++) {
2672 if (gres_data->topo_gres_bitmap) {
2673 FREE_NULL_BITMAP(gres_data->
2674 topo_gres_bitmap[i]);
2675 }
2676 if (gres_data->topo_core_bitmap) {
2677 FREE_NULL_BITMAP(gres_data->
2678 topo_core_bitmap[i]);
2679 }
2680 xfree(gres_data->topo_type_name[i]);
2681 }
2682 gres_data->topo_gres_bitmap =
2683 xrealloc(gres_data->topo_gres_bitmap,
2684 topo_cnt * sizeof(bitstr_t *));
2685 gres_data->topo_core_bitmap =
2686 xrealloc(gres_data->topo_core_bitmap,
2687 topo_cnt * sizeof(bitstr_t *));
2688 gres_data->topo_type_id = xrealloc(gres_data->topo_type_id,
2689 topo_cnt * sizeof(uint32_t));
2690 gres_data->topo_type_name = xrealloc(gres_data->topo_type_name,
2691 topo_cnt * sizeof(char *));
2692 if (gres_data->gres_bit_alloc)
2693 gres_data->gres_bit_alloc = bit_realloc(
2694 gres_data->gres_bit_alloc, dev_cnt);
2695 gres_data->topo_cnt = topo_cnt;
2696 } else if (_shared_gres(context_ptr->plugin_id) && gres_data->topo_cnt){
2697 /*
2698 * Need to rebuild topology info to recover state after
2699 * slurmctld restart with running jobs.
2700 */
2701 rebuild_topo = true;
2702 }
2703
2704 if (rebuild_topo) {
2705 iter = list_iterator_create(gres_conf_list);
2706 gres_inx = i = 0;
2707 while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2708 list_next(iter))) {
2709 if (gres_slurmd_conf->plugin_id !=
2710 context_ptr->plugin_id)
2711 continue;
2712 if ((gres_data->gres_bit_alloc) &&
2713 !_shared_gres(context_ptr->plugin_id))
2714 gres_data->topo_gres_cnt_alloc[i] = 0;
2715 gres_data->topo_gres_cnt_avail[i] =
2716 gres_slurmd_conf->count;
2717 if (gres_slurmd_conf->cpus) {
2718 bitstr_t *tmp_bitmap;
2719 tmp_bitmap =
2720 bit_alloc(gres_slurmd_conf->cpu_cnt);
2721 bit_unfmt(tmp_bitmap, gres_slurmd_conf->cpus);
2722 if (gres_slurmd_conf->cpu_cnt == core_cnt) {
2723 gres_data->topo_core_bitmap[i] =
2724 tmp_bitmap;
2725 tmp_bitmap = NULL; /* Nothing to free */
2726 } else if (gres_slurmd_conf->cpu_cnt ==
2727 cpu_cnt) {
2728 /* Translate CPU to core bitmap */
2729 int cpus_per_core = cpu_cnt / core_cnt;
2730 int j, core_inx;
2731 gres_data->topo_core_bitmap[i] =
2732 bit_alloc(core_cnt);
2733 for (j = 0; j < cpu_cnt; j++) {
2734 if (!bit_test(tmp_bitmap, j))
2735 continue;
2736 core_inx = j / cpus_per_core;
2737 bit_set(gres_data->
2738 topo_core_bitmap[i],
2739 core_inx);
2740 }
2741 } else if (i == 0) {
2742 error("%s: %s: invalid GRES cpu count (%u) on node %s",
2743 __func__, context_ptr->gres_type,
2744 gres_slurmd_conf->cpu_cnt,
2745 node_name);
2746 }
2747 FREE_NULL_BITMAP(tmp_bitmap);
2748 cpus_config = core_cnt;
2749 } else if (cpus_config && !cpu_config_err) {
2750 cpu_config_err = true;
2751 error("%s: %s: has CPUs configured for only some of the records on node %s",
2752 __func__, context_ptr->gres_type,
2753 node_name);
2754 }
2755
2756 if (gres_slurmd_conf->links) {
2757 if (gres_data->links_cnt &&
2758 (gres_data->link_len != gres_cnt)) {
2759 /* Size changed, need to rebuild */
2760 for (j = 0; j < gres_data->link_len;j++)
2761 xfree(gres_data->links_cnt[j]);
2762 xfree(gres_data->links_cnt);
2763 }
2764 if (!gres_data->links_cnt) {
2765 gres_data->link_len = gres_cnt;
2766 gres_data->links_cnt =
2767 xcalloc(gres_cnt,
2768 sizeof(int *));
2769 for (j = 0; j < gres_cnt; j++) {
2770 gres_data->links_cnt[j] =
2771 xcalloc(gres_cnt,
2772 sizeof(int));
2773 }
2774 }
2775 }
2776 if (_shared_gres(gres_slurmd_conf->plugin_id)) {
2777 /* If running jobs recovered then already set */
2778 if (!gres_data->topo_gres_bitmap[i]) {
2779 gres_data->topo_gres_bitmap[i] =
2780 bit_alloc(dev_cnt);
2781 bit_set(gres_data->topo_gres_bitmap[i],
2782 gres_inx);
2783 }
2784 gres_inx++;
2785 } else if (dev_cnt == 0) {
2786 /*
2787 * Slurmd found GRES, but slurmctld can't use
2788 * them. Avoid creating zero-size bitmaps.
2789 */
2790 has_file = false;
2791 } else {
2792 gres_data->topo_gres_bitmap[i] =
2793 bit_alloc(dev_cnt);
2794 for (j = 0; j < gres_slurmd_conf->count; j++) {
2795 if (gres_inx >= dev_cnt) {
2796 /* Ignore excess GRES on node */
2797 break;
2798 }
2799 bit_set(gres_data->topo_gres_bitmap[i],
2800 gres_inx);
2801 if (gres_data->gres_bit_alloc &&
2802 bit_test(gres_data->gres_bit_alloc,
2803 gres_inx)) {
2804 /* Set by recovered job */
2805 gres_data->topo_gres_cnt_alloc[i]++;
2806 }
2807 _links_str2array(
2808 gres_slurmd_conf->links,
2809 node_name, gres_data,
2810 gres_inx, gres_cnt);
2811 gres_inx++;
2812 }
2813 }
2814 gres_data->topo_type_id[i] =
2815 gres_plugin_build_id(gres_slurmd_conf->
2816 type_name);
2817 gres_data->topo_type_name[i] =
2818 xstrdup(gres_slurmd_conf->type_name);
2819 i++;
2820 if (i >= gres_data->topo_cnt)
2821 break;
2822 }
2823 list_iterator_destroy(iter);
2824 if (cpu_config_err) {
2825 /*
2826 * Some GRES of this type have "CPUs" configured. Set
2827 * topo_core_bitmap for all others with all bits set.
2828 */
2829 iter = list_iterator_create(gres_conf_list);
2830 while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2831 list_next(iter))) {
2832 if (gres_slurmd_conf->plugin_id !=
2833 context_ptr->plugin_id)
2834 continue;
2835 for (j = 0; j < i; j++) {
2836 if (gres_data->topo_core_bitmap[j])
2837 continue;
2838 gres_data->topo_core_bitmap[j] =
2839 bit_alloc(cpus_config);
2840 bit_set_all(gres_data->
2841 topo_core_bitmap[j]);
2842 }
2843 }
2844 list_iterator_destroy(iter);
2845 }
2846 } else if (!has_file && has_type) {
2847 /* Add GRES Type information as needed */
2848 iter = list_iterator_create(gres_conf_list);
2849 while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2850 list_next(iter))) {
2851 if (gres_slurmd_conf->plugin_id !=
2852 context_ptr->plugin_id)
2853 continue;
2854 type_id = gres_plugin_build_id(
2855 gres_slurmd_conf->type_name);
2856 for (i = 0; i < gres_data->type_cnt; i++) {
2857 if (type_id == gres_data->type_id[i])
2858 break;
2859 }
2860 if (i < gres_data->type_cnt) {
2861 /* Update count as needed */
2862 gres_data->type_cnt_avail[i] =
2863 gres_slurmd_conf->count;
2864 } else {
2865 _add_gres_type(gres_slurmd_conf->type_name,
2866 gres_data,
2867 gres_slurmd_conf->count);
2868 }
2869
2870 }
2871 list_iterator_destroy(iter);
2872 }
2873
2874 if ((orig_config == NULL) || (orig_config[0] == '\0'))
2875 gres_data->gres_cnt_config = 0;
2876 else if (gres_data->gres_cnt_config == NO_VAL64) {
2877 /* This should have been filled in by _node_config_init() */
2878 _get_gres_cnt(gres_data, orig_config,
2879 context_ptr->gres_name,
2880 context_ptr->gres_name_colon,
2881 context_ptr->gres_name_colon_len);
2882 }
2883
2884 gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
2885
2886 if (has_file) {
2887 uint64_t gres_bits;
2888 if (_shared_gres(context_ptr->plugin_id)) {
2889 gres_bits = topo_cnt;
2890 } else {
2891 if (gres_data->gres_cnt_avail > MAX_GRES_BITMAP) {
2892 error("%s: %s has \"File\" plus very large \"Count\" "
2893 "(%"PRIu64") for node %s, resetting value to %u",
2894 __func__, context_ptr->gres_type,
2895 gres_data->gres_cnt_avail, node_name,
2896 MAX_GRES_BITMAP);
2897 gres_data->gres_cnt_avail = MAX_GRES_BITMAP;
2898 gres_data->gres_cnt_found = MAX_GRES_BITMAP;
2899 }
2900 gres_bits = gres_data->gres_cnt_avail;
2901 }
2902
2903 _gres_bit_alloc_resize(gres_data, gres_bits);
2904 }
2905
2906 if ((config_type_cnt > 1) &&
2907 !_valid_gres_types(context_ptr->gres_type, gres_data, reason_down)){
2908 rc = EINVAL;
2909 } else if (!config_overrides &&
2910 (gres_data->gres_cnt_found < gres_data->gres_cnt_config)) {
2911 if (reason_down && (*reason_down == NULL)) {
2912 xstrfmtcat(*reason_down,
2913 "%s count too low (%"PRIu64" < %"PRIu64")",
2914 context_ptr->gres_type,
2915 gres_data->gres_cnt_found,
2916 gres_data->gres_cnt_config);
2917 }
2918 rc = EINVAL;
2919 } else if (_valid_gres_type(context_ptr->gres_type, gres_data,
2920 config_overrides, reason_down)) {
2921 rc = EINVAL;
2922 } else if (config_overrides && gres_data->topo_cnt &&
2923 (gres_data->gres_cnt_found != gres_data->gres_cnt_config)) {
2924 error("%s on node %s configured for %"PRIu64" resources but "
2925 "%"PRIu64" found, ignoring topology support",
2926 context_ptr->gres_type, node_name,
2927 gres_data->gres_cnt_config, gres_data->gres_cnt_found);
2928 if (gres_data->topo_core_bitmap) {
2929 for (i = 0; i < gres_data->topo_cnt; i++) {
2930 if (gres_data->topo_core_bitmap) {
2931 FREE_NULL_BITMAP(gres_data->
2932 topo_core_bitmap[i]);
2933 }
2934 if (gres_data->topo_gres_bitmap) {
2935 FREE_NULL_BITMAP(gres_data->
2936 topo_gres_bitmap[i]);
2937 }
2938 xfree(gres_data->topo_type_name[i]);
2939 }
2940 xfree(gres_data->topo_core_bitmap);
2941 xfree(gres_data->topo_gres_bitmap);
2942 xfree(gres_data->topo_gres_cnt_alloc);
2943 xfree(gres_data->topo_gres_cnt_avail);
2944 xfree(gres_data->topo_type_id);
2945 xfree(gres_data->topo_type_name);
2946 }
2947 gres_data->topo_cnt = 0;
2948 }
2949
2950 return rc;
2951 }
2952
2953 /*
2954 * Validate a node's configuration and put a gres record onto a list
2955 * Called immediately after gres_plugin_node_config_unpack().
2956 * IN node_name - name of the node for which the gres information applies
2957 * IN orig_config - Gres information supplied from merged slurm.conf/gres.conf
2958 * IN/OUT new_config - Updated gres info from slurm.conf
2959 * IN/OUT gres_list - List of Gres records for this node to track usage
2960 * IN threads_per_core - Count of CPUs (threads) per core on this node
2961 * IN cores_per_sock - Count of cores per socket on this node
2962 * IN sock_cnt - Count of sockets on this node
2963 * IN config_overrides - true: Don't validate hardware, use slurm.conf
2964 * configuration
2965 * false: Validate hardware config, but use slurm.conf
2966 * config
2967 * OUT reason_down - set to an explanation of failure, if any, don't set if NULL
2968 */
extern int gres_plugin_node_config_validate(char *node_name,
					    char *orig_config,
					    char **new_config,
					    List *gres_list,
					    int threads_per_core,
					    int cores_per_sock, int sock_cnt,
					    bool config_overrides,
					    char **reason_down)
{
	int ctx_inx, rc, rc_one;
	gres_state_t *node_gres, *gpu_gres = NULL, *mps_gres = NULL;
	int core_cnt = cores_per_sock * sock_cnt;
	int cpu_cnt = threads_per_core * core_cnt;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);
	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		/* Locate this plugin's state record, creating it if absent */
		node_gres = list_find_first(*gres_list, _gres_find_id,
					    &gres_context[ctx_inx].plugin_id);
		if (!node_gres) {
			node_gres = xmalloc(sizeof(gres_state_t));
			node_gres->plugin_id =
				gres_context[ctx_inx].plugin_id;
			list_append(*gres_list, node_gres);
		}
		rc_one = _node_config_validate(node_name, orig_config,
					       node_gres, cpu_cnt, core_cnt,
					       sock_cnt, config_overrides,
					       reason_down,
					       &gres_context[ctx_inx]);
		rc = MAX(rc, rc_one);
		/* Remember the GPU and MPS records for post-loop syncing */
		if (node_gres->plugin_id == gpu_plugin_id)
			gpu_gres = node_gres;
		else if (node_gres->plugin_id == mps_plugin_id)
			mps_gres = node_gres;
	}
	_sync_node_mps_to_gpu(mps_gres, gpu_gres);
	_build_node_gres_str(gres_list, new_config, cores_per_sock, sock_cnt);
	slurm_mutex_unlock(&gres_context_lock);

	return rc;
}
3013
/* Convert number to new value with suffix (e.g. 2048 -> 2K) */
/*
 * Scale a GRES count down by factors of 1024 and select the matching
 * binary suffix (e.g. 2048 -> 2 with suffix "K").
 * IN gres_size    - raw count
 * OUT gres_scaled - count after dividing by 1024 once per suffix step
 * OUT suffix      - "", "K", "M", "G" or "T"; points to a string literal,
 *                   caller must not free it
 */
static void _gres_scale_value(uint64_t gres_size, uint64_t *gres_scaled,
			      char **suffix)
{
	/* Indexed by the number of successful divisions by 1024 */
	static char *suffixes[] = { "", "K", "M", "G", "T" };
	uint64_t tmp_gres_size = gres_size;
	int i;

	/* Divide by 1024 at most 4 times, only while evenly divisible */
	for (i = 0; i < 4; i++) {
		if ((tmp_gres_size != 0) && ((tmp_gres_size % 1024) == 0))
			tmp_gres_size /= 1024;
		else
			break;
	}

	*gres_scaled = tmp_gres_size;
	*suffix = suffixes[i];
}
3041
3042 /*
3043 * Add a GRES from node_feature plugin
3044 * IN node_name - name of the node for which the gres information applies
3045 * IN gres_name - name of the GRES being added or updated from the plugin
3046 * IN gres_size - count of this GRES on this node
3047 * IN/OUT new_config - Updated GRES info from slurm.conf
3048 * IN/OUT gres_list - List of GRES records for this node to track usage
3049 */
extern void gres_plugin_node_feature(char *node_name,
				     char *gres_name, uint64_t gres_size,
				     char **new_config, List *gres_list)
{
	char *filtered = NULL, *token, *last = NULL, *sep = "", *suffix = "";
	gres_state_t *gres_ptr;
	gres_node_state_t *gres_node_ptr;
	uint32_t plugin_id;
	uint64_t gres_scaled = 0;
	int name_len;

	xassert(gres_name);
	name_len = strlen(gres_name);
	plugin_id = gres_plugin_build_id(gres_name);

	/* Rebuild the GRES string, dropping any old record for gres_name */
	if (*new_config) {
		for (token = strtok_r(*new_config, ",", &last); token;
		     token = strtok_r(NULL, ",", &last)) {
			if ((strncmp(token, gres_name, name_len) == 0) &&
			    ((token[name_len] == ':') ||
			     (token[name_len] == '\0')))
				continue;	/* Skip this record */
			xstrfmtcat(filtered, "%s%s", sep, token);
			sep = ",";
		}
	}
	/* Append a fresh record with the scaled count for this GRES */
	_gres_scale_value(gres_size, &gres_scaled, &suffix);
	xstrfmtcat(filtered, "%s%s:%"PRIu64"%s",
		   sep, gres_name, gres_scaled, suffix);
	xfree(*new_config);
	*new_config = filtered;

	slurm_mutex_lock(&gres_context_lock);
	if (gres_context_cnt > 0) {
		if (*gres_list == NULL)
			*gres_list = list_create(_gres_node_list_delete);
		gres_ptr = list_find_first(*gres_list, _gres_find_id,
					   &plugin_id);
		if (!gres_ptr) {
			gres_ptr = xmalloc(sizeof(gres_state_t));
			gres_ptr->plugin_id = plugin_id;
			gres_ptr->gres_data = _build_gres_node_state();
			list_append(*gres_list, gres_ptr);
		}
		gres_node_ptr = gres_ptr->gres_data;
		/* Available count is the new size minus what is allocated */
		if (gres_size >= gres_node_ptr->gres_cnt_alloc) {
			gres_node_ptr->gres_cnt_avail = gres_size -
				gres_node_ptr->gres_cnt_alloc;
		} else {
			error("%s: Changed size count of GRES %s from %"PRIu64
			      " to %"PRIu64", resource over allocated",
			      __func__, gres_name,
			      gres_node_ptr->gres_cnt_avail, gres_size);
			gres_node_ptr->gres_cnt_avail = 0;
		}
		gres_node_ptr->gres_cnt_config = gres_size;
		gres_node_ptr->gres_cnt_found = gres_size;
		gres_node_ptr->node_feature = true;
	}
	slurm_mutex_unlock(&gres_context_lock);
}
3113
3114 /*
3115 * Check validity of a GRES change. Specifically if a GRES type has "Files"
3116 * configured then the only valid new counts are the current count or zero
3117 *
 * RET SLURM_SUCCESS if the requested change is valid, ESLURM_INVALID_GRES if not
3119 */
static int _node_reconfig_test(char *node_name, char *new_gres,
			       gres_state_t *gres_ptr,
			       slurm_gres_context_t *context_ptr)
{
	gres_node_state_t *old_state, *test_state;
	int rc = SLURM_SUCCESS;

	xassert(gres_ptr);
	/* Only GRES with "File" configured restrict how counts may change */
	if (!(context_ptr->config_flags & GRES_CONF_HAS_FILE))
		return SLURM_SUCCESS;

	old_state = gres_ptr->gres_data;
	/* Parse the proposed configuration into a scratch state record */
	test_state = _build_gres_node_state();
	_get_gres_cnt(test_state, new_gres, context_ptr->gres_name,
		      context_ptr->gres_name_colon,
		      context_ptr->gres_name_colon_len);
	/* With File, the only valid new counts are zero or the old count */
	if (test_state->gres_cnt_config &&
	    (test_state->gres_cnt_config != old_state->gres_cnt_config)) {
		error("Attempt to change gres/%s Count on node %s from %"
		      PRIu64" to %"PRIu64" invalid with File configuration",
		      context_ptr->gres_name, node_name,
		      old_state->gres_cnt_config,
		      test_state->gres_cnt_config);
		rc = ESLURM_INVALID_GRES;
	}
	_gres_node_state_delete(test_state);

	return rc;
}
3151
/*
 * Apply an updated GRES specification to a node's state record, adjusting
 * counts and resizing allocation bitmaps as needed.
 * IN node_name   - name of the node being reconfigured
 * IN new_gres    - updated GRES specification string
 * IN/OUT gres_str - node's GRES string (not modified here)
 * IN/OUT gres_ptr - state record for this GRES type; gres_data built if NULL
 * IN config_overrides - currently unused in this function
 * IN context_ptr - plugin context; its total_cnt is updated
 * OUT updated_gpu_cnt - set true if a sharing GRES (GPU) bitmap was resized,
 *                       signalling the caller to resync gres/mps state
 * RET SLURM_SUCCESS (always)
 */
static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
			  gres_state_t *gres_ptr, bool config_overrides,
			  slurm_gres_context_t *context_ptr,
			  bool *updated_gpu_cnt)
{
	int i;
	gres_node_state_t *gres_data;
	uint64_t gres_bits, orig_cnt;

	xassert(gres_ptr);
	xassert(updated_gpu_cnt);
	*updated_gpu_cnt = false;
	if (gres_ptr->gres_data == NULL)
		gres_ptr->gres_data = _build_gres_node_state();
	gres_data = gres_ptr->gres_data;
	orig_cnt = gres_data->gres_cnt_config;

	/* Parse the configured count out of the new GRES specification */
	_get_gres_cnt(gres_data, new_gres,
		      context_ptr->gres_name,
		      context_ptr->gres_name_colon,
		      context_ptr->gres_name_colon_len);

	if (gres_data->gres_cnt_config == orig_cnt)
		return SLURM_SUCCESS;	/* No change in count */

	/* Update count */
	context_ptr->total_cnt -= orig_cnt;
	context_ptr->total_cnt += gres_data->gres_cnt_config;

	/*
	 * Recompute the available count: a zero config zeroes it; otherwise
	 * prefer the count actually found on the node when known
	 */
	if (!gres_data->gres_cnt_config)
		gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
	else if (gres_data->gres_cnt_found != NO_VAL64)
		gres_data->gres_cnt_avail = gres_data->gres_cnt_found;
	else if (gres_data->gres_cnt_avail == NO_VAL64)
		gres_data->gres_cnt_avail = 0;

	if (context_ptr->config_flags & GRES_CONF_HAS_FILE) {
		/*
		 * Shared GRES (e.g. gres/mps) size bitmaps by topo record
		 * count; device-based GRES size them by available count
		 */
		if (_shared_gres(context_ptr->plugin_id))
			gres_bits = gres_data->topo_cnt;
		else
			gres_bits = gres_data->gres_cnt_avail;

		_gres_bit_alloc_resize(gres_data, gres_bits);
	} else if (gres_data->gres_bit_alloc &&
		   !_shared_gres(context_ptr->plugin_id)) {
		/*
		 * If GRES count changed in configuration between reboots,
		 * update bitmap sizes as needed.
		 */
		gres_bits = gres_data->gres_cnt_avail;
		if (gres_bits != bit_size(gres_data->gres_bit_alloc)) {
			info("gres/%s count changed on node %s to %"PRIu64,
			     context_ptr->gres_name, node_name, gres_bits);
			/* A sharing GRES (GPU) change requires MPS resync */
			if (_sharing_gres(context_ptr->plugin_id))
				*updated_gpu_cnt = true;
			gres_data->gres_bit_alloc =
				bit_realloc(gres_data->gres_bit_alloc,
					    gres_bits);
			/* Keep per-topo bitmaps sized like the main bitmap */
			for (i = 0; i < gres_data->topo_cnt; i++) {
				if (gres_data->topo_gres_bitmap &&
				    gres_data->topo_gres_bitmap[i] &&
				    (gres_bits !=
				     bit_size(gres_data->topo_gres_bitmap[i]))){
					gres_data->topo_gres_bitmap[i] =
						bit_realloc(
						gres_data->topo_gres_bitmap[i],
						gres_bits);
				}
			}
		}
	}

	return SLURM_SUCCESS;
}
3226
3227 /* The GPU count on a node changed. Update MPS data structures to match */
static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
				  gres_state_t *gpu_gres_ptr)
{
	gres_node_state_t *gpu_gres_data, *mps_gres_data;
	uint64_t gpu_cnt, mps_alloc = 0, mps_rem;
	int i;

	/* Nothing to sync unless the node has both gres/gpu and gres/mps */
	if (!gpu_gres_ptr || !mps_gres_ptr)
		return;

	gpu_gres_data = gpu_gres_ptr->gres_data;
	mps_gres_data = mps_gres_ptr->gres_data;

	/* gres/mps bitmaps are sized by GPU count, one bit per GPU */
	gpu_cnt = gpu_gres_data->gres_cnt_avail;
	if (mps_gres_data->gres_bit_alloc) {
		if (gpu_cnt == bit_size(mps_gres_data->gres_bit_alloc))
			return;		/* No change for gres/mps */
	}

	if (gpu_cnt == 0)
		return;			/* Still no GPUs */

	/* Free any excess gres/mps topo records */
	for (i = gpu_cnt; i < mps_gres_data->topo_cnt; i++) {
		if (mps_gres_data->topo_core_bitmap)
			FREE_NULL_BITMAP(mps_gres_data->topo_core_bitmap[i]);
		if (mps_gres_data->topo_gres_bitmap)
			FREE_NULL_BITMAP(mps_gres_data->topo_gres_bitmap[i]);
		xfree(mps_gres_data->topo_type_name[i]);
	}

	if (mps_gres_data->gres_cnt_avail == 0) {
		/* No gres/mps on this node */
		mps_gres_data->topo_cnt = 0;
		return;
	}

	/* Size (or resize) the allocation bitmap to the new GPU count */
	if (!mps_gres_data->gres_bit_alloc) {
		mps_gres_data->gres_bit_alloc = bit_alloc(gpu_cnt);
	} else {
		mps_gres_data->gres_bit_alloc =
			bit_realloc(mps_gres_data->gres_bit_alloc,
				    gpu_cnt);
	}

	/* Add any additional required gres/mps topo records */
	if (mps_gres_data->topo_cnt) {
		/* Grow existing topo arrays to one entry per GPU */
		mps_gres_data->topo_core_bitmap =
			xrealloc(mps_gres_data->topo_core_bitmap,
				 sizeof(bitstr_t *) * gpu_cnt);
		mps_gres_data->topo_gres_bitmap =
			xrealloc(mps_gres_data->topo_gres_bitmap,
				 sizeof(bitstr_t *) * gpu_cnt);
		mps_gres_data->topo_gres_cnt_alloc =
			xrealloc(mps_gres_data->topo_gres_cnt_alloc,
				 sizeof(uint64_t) * gpu_cnt);
		mps_gres_data->topo_gres_cnt_avail =
			xrealloc(mps_gres_data->topo_gres_cnt_avail,
				 sizeof(uint64_t) * gpu_cnt);
		mps_gres_data->topo_type_id =
			xrealloc(mps_gres_data->topo_type_id,
				 sizeof(uint32_t) * gpu_cnt);
		mps_gres_data->topo_type_name =
			xrealloc(mps_gres_data->topo_type_name,
				 sizeof(char *) * gpu_cnt);
	} else {
		/* No topo records yet; build zero-filled arrays from scratch */
		mps_gres_data->topo_core_bitmap =
			xcalloc(gpu_cnt, sizeof(bitstr_t *));
		mps_gres_data->topo_gres_bitmap =
			xcalloc(gpu_cnt, sizeof(bitstr_t *));
		mps_gres_data->topo_gres_cnt_alloc =
			xcalloc(gpu_cnt, sizeof(uint64_t));
		mps_gres_data->topo_gres_cnt_avail =
			xcalloc(gpu_cnt, sizeof(uint64_t));
		mps_gres_data->topo_type_id =
			xcalloc(gpu_cnt, sizeof(uint32_t));
		mps_gres_data->topo_type_name =
			xcalloc(gpu_cnt, sizeof(char *));
	}

	/*
	 * Evenly distribute any remaining MPS counts.
	 * Counts get reset as needed when the node registers.
	 */
	for (i = 0; i < mps_gres_data->topo_cnt; i++)
		mps_alloc += mps_gres_data->topo_gres_cnt_avail[i];
	if (mps_alloc >= mps_gres_data->gres_cnt_avail)
		mps_rem = 0;
	else
		mps_rem = mps_gres_data->gres_cnt_avail - mps_alloc;
	for (i = mps_gres_data->topo_cnt; i < gpu_cnt; i++) {
		/* New topo record: one GPU bit plus an even share of MPS */
		mps_gres_data->topo_gres_bitmap[i] = bit_alloc(gpu_cnt);
		bit_set(mps_gres_data->topo_gres_bitmap[i], i);
		mps_alloc = mps_rem / (gpu_cnt - i);
		mps_gres_data->topo_gres_cnt_avail[i] = mps_alloc;
		mps_rem -= mps_alloc;
	}
	mps_gres_data->topo_cnt = gpu_cnt;

	/* Resize any pre-existing per-topo bitmaps to the new GPU count */
	for (i = 0; i < mps_gres_data->topo_cnt; i++) {
		if (mps_gres_data->topo_gres_bitmap &&
		    mps_gres_data->topo_gres_bitmap[i] &&
		    (gpu_cnt != bit_size(mps_gres_data->topo_gres_bitmap[i]))) {
			mps_gres_data->topo_gres_bitmap[i] =
				bit_realloc(mps_gres_data->topo_gres_bitmap[i],
					    gpu_cnt);
		}
	}
}
3337
3338 /* Convert core bitmap into socket string, xfree return value */
static char *_core_bitmap2str(bitstr_t *core_map, int cores_per_sock,
			      int sock_per_node)
{
	char *sock_info = NULL, tmp[256];
	bitstr_t *sock_map;
	int core, sock, bit_inx, last_core;
	bool found_any = false;

	xassert(core_map);
	last_core = bit_size(core_map) - 1;
	sock_map = bit_alloc(sock_per_node);
	/* Mark each socket that has at least one core set in core_map */
	for (sock = 0; sock < sock_per_node; sock++) {
		bit_inx = sock * cores_per_sock;
		for (core = 0; core < cores_per_sock; core++) {
			if (bit_inx > last_core) {
				error("%s: bad core offset (%d >= %d)",
				      __func__, bit_inx, last_core);
				break;
			}
			if (bit_test(core_map, bit_inx++)) {
				bit_set(sock_map, sock);
				found_any = true;
				break;
			}
		}
	}
	if (found_any) {
		bit_fmt(tmp, sizeof(tmp), sock_map);
		xstrfmtcat(sock_info, "(S:%s)", tmp);
	} else {
		/* We have a core bitmap with no bits set */
		sock_info = xstrdup("");
	}
	bit_free(sock_map);

	return sock_info;
}
3376
3377 /* Given a count, modify it as needed and return suffix (e.g. "M" for mega ) */
static char *_get_suffix(uint64_t *count)
{
	/* Largest divisor first so the biggest applicable suffix is used */
	static const struct {
		uint64_t div;
		char *sfx;
	} units[] = {
		{ (uint64_t)1024 * 1024 * 1024 * 1024 * 1024, "P" },
		{ (uint64_t)1024 * 1024 * 1024 * 1024, "T" },
		{ (uint64_t)1024 * 1024 * 1024, "G" },
		{ (uint64_t)1024 * 1024, "M" },
		{ (uint64_t)1024, "K" },
	};
	int i;

	if (*count == 0)
		return "";

	for (i = 0; i < 5; i++) {
		if ((*count % units[i].div) == 0) {
			*count /= units[i].div;
			return units[i].sfx;
		}
	}
	return "";
}
3401
3402 /* Build node's GRES string based upon data in that node's GRES list */
static void _build_node_gres_str(List *gres_list, char **gres_str,
				 int cores_per_sock, int sock_per_node)
{
	gres_state_t *gres_ptr;
	gres_node_state_t *gres_node_state;
	bitstr_t *done_topo, *core_map;
	uint64_t gres_sum;
	char *sep = "", *suffix, *sock_info = NULL, *sock_str;
	int c, i, j;

	xassert(gres_str);
	xfree(*gres_str);
	for (c = 0; c < gres_context_cnt; c++) {
		/* Find gres_state entry on the list */
		gres_ptr = list_find_first(*gres_list, _gres_find_id,
					   &gres_context[c].plugin_id);
		if (gres_ptr == NULL)
			continue;	/* Node has none of this GRES */

		gres_node_state = (gres_node_state_t *) gres_ptr->gres_data;
		if (gres_node_state->topo_cnt &&
		    gres_node_state->gres_cnt_avail) {
			/*
			 * Topology records exist: emit one entry per GRES
			 * type, merging all topo records of the same type
			 * into a single count and core (socket) bitmap
			 */
			done_topo = bit_alloc(gres_node_state->topo_cnt);
			for (i = 0; i < gres_node_state->topo_cnt; i++) {
				if (bit_test(done_topo, i))
					continue;
				bit_set(done_topo, i);
				gres_sum = gres_node_state->
					   topo_gres_cnt_avail[i];
				if (gres_node_state->topo_core_bitmap[i]) {
					core_map = bit_copy(
							gres_node_state->
							topo_core_bitmap[i]);
				} else
					core_map = NULL;
				/* Fold in later records of the same type */
				for (j = 0; j < gres_node_state->topo_cnt; j++){
					if (gres_node_state->topo_type_id[i] !=
					    gres_node_state->topo_type_id[j])
						continue;
					if (bit_test(done_topo, j))
						continue;
					bit_set(done_topo, j);
					gres_sum += gres_node_state->
						    topo_gres_cnt_avail[j];
					if (core_map &&
					    gres_node_state->
					    topo_core_bitmap[j]) {
						bit_or(core_map,
						       gres_node_state->
						       topo_core_bitmap[j]);
					} else if (gres_node_state->
						   topo_core_bitmap[j]) {
						core_map = bit_copy(
							   gres_node_state->
							   topo_core_bitmap[j]);
					}
				}
				/* Render merged cores as "(S:...)" sockets */
				if (core_map) {
					sock_info = _core_bitmap2str(core_map,
							cores_per_sock,
							sock_per_node);
					bit_free(core_map);
					sock_str = sock_info;
				} else
					sock_str = "";
				suffix = _get_suffix(&gres_sum);
				if (gres_node_state->topo_type_name[i]) {
					xstrfmtcat(*gres_str,
						   "%s%s:%s:%"PRIu64"%s%s", sep,
						   gres_context[c].gres_name,
						   gres_node_state->
						   topo_type_name[i],
						   gres_sum, suffix, sock_str);
				} else {
					xstrfmtcat(*gres_str,
						   "%s%s:%"PRIu64"%s%s", sep,
						   gres_context[c].gres_name,
						   gres_sum, suffix, sock_str);
				}
				xfree(sock_info);
				sep = ",";
			}
			bit_free(done_topo);
		} else if (gres_node_state->type_cnt &&
			   gres_node_state->gres_cnt_avail) {
			/* No topology, but typed GRES: one entry per type */
			for (i = 0; i < gres_node_state->type_cnt; i++) {
				gres_sum = gres_node_state->type_cnt_avail[i];
				suffix = _get_suffix(&gres_sum);
				xstrfmtcat(*gres_str, "%s%s:%s:%"PRIu64"%s",
					   sep, gres_context[c].gres_name,
					   gres_node_state->type_name[i],
					   gres_sum, suffix);
				sep = ",";
			}
		} else if (gres_node_state->gres_cnt_avail) {
			/* Untyped GRES: single name:count entry */
			gres_sum = gres_node_state->gres_cnt_avail;
			suffix = _get_suffix(&gres_sum);
			xstrfmtcat(*gres_str, "%s%s:%"PRIu64"%s",
				   sep, gres_context[c].gres_name,
				   gres_sum, suffix);
			sep = ",";
		}
	}
}
3507
3508 /*
 * Note that a node's configuration has been modified (e.g. "scontrol update ..")
3510 * IN node_name - name of the node for which the gres information applies
3511 * IN new_gres - Updated GRES information supplied from slurm.conf or scontrol
3512 * IN/OUT gres_str - Node's current GRES string, updated as needed
3513 * IN/OUT gres_list - List of Gres records for this node to track usage
3514 * IN config_overrides - true: Don't validate hardware, use slurm.conf
3515 * configuration
3516 * false: Validate hardware config, but use slurm.conf
3517 * config
3518 * IN cores_per_sock - Number of cores per socket on this node
3519 * IN sock_per_node - Total count of sockets on this node (on any board)
3520 */
extern int gres_plugin_node_reconfig(char *node_name,
				     char *new_gres,
				     char **gres_str,
				     List *gres_list,
				     bool config_overrides,
				     int cores_per_sock,
				     int sock_per_node)
{
	int i, rc;
	ListIterator gres_iter;
	gres_state_t *gres_ptr = NULL, **gres_ptr_array;
	gres_state_t *gpu_gres_ptr = NULL, *mps_gres_ptr = NULL;

	rc = gres_plugin_init();
	slurm_mutex_lock(&gres_context_lock);
	gres_ptr_array = xcalloc(gres_context_cnt, sizeof(gres_state_t *));
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);

	/* First validate all of the requested GRES changes */
	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
		/* Find gres_state entry on the list */
		gres_ptr = list_find_first(*gres_list, _gres_find_id,
					   &gres_context[i].plugin_id);
		if (gres_ptr == NULL)
			continue;
		gres_ptr_array[i] = gres_ptr;
		rc = _node_reconfig_test(node_name, new_gres, gres_ptr,
					 &gres_context[i]);
	}

	/* Now update the GRES counts */
	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
		bool updated_gpu_cnt = false;
		if (gres_ptr_array[i] == NULL)
			continue;
		rc = _node_reconfig(node_name, new_gres, gres_str,
				    gres_ptr_array[i], config_overrides,
				    &gres_context[i], &updated_gpu_cnt);
		/*
		 * Bug fix: remember the record actually reconfigured.
		 * Previously this assigned "gres_ptr", which is a stale
		 * leftover from the first (validation) loop and need not
		 * correspond to context i.
		 */
		if (updated_gpu_cnt)
			gpu_gres_ptr = gres_ptr_array[i];
	}

	/* Now synchronize gres/gpu and gres/mps state */
	if (gpu_gres_ptr && have_mps) {
		/* Update gres/mps counts and bitmaps to match gres/gpu */
		gres_iter = list_iterator_create(*gres_list);
		while ((mps_gres_ptr = (gres_state_t *) list_next(gres_iter))) {
			if (_shared_gres(mps_gres_ptr->plugin_id))
				break;
		}
		list_iterator_destroy(gres_iter);
		_sync_node_mps_to_gpu(mps_gres_ptr, gpu_gres_ptr);
	}

	/* Build new per-node gres_str */
	_build_node_gres_str(gres_list, gres_str, cores_per_sock,sock_per_node);
	slurm_mutex_unlock(&gres_context_lock);
	xfree(gres_ptr_array);

	return rc;
}
3583
3584 /*
3585 * Pack a node's current gres status, called from slurmctld for save/restore
3586 * IN gres_list - generated by gres_plugin_node_config_validate()
3587 * IN/OUT buffer - location to write state to
3588 * IN node_name - name of the node for which the gres information applies
3589 */
extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
				       char *node_name)
{
	int rc = SLURM_SUCCESS;
	uint32_t start_offset, end_offset;
	uint32_t magic = GRES_MAGIC;
	uint16_t bitmap_size, rec_cnt = 0;
	ListIterator iter;
	gres_state_t *gres_ptr;
	gres_node_state_t *node_state;

	if (!gres_list) {
		pack16(rec_cnt, buffer);
		return rc;
	}

	/* Write a placeholder count; overwritten once records are counted */
	start_offset = get_buf_offset(buffer);
	pack16(rec_cnt, buffer);

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(iter))) {
		node_state = (gres_node_state_t *) gres_ptr->gres_data;
		pack32(magic, buffer);
		pack32(gres_ptr->plugin_id, buffer);
		pack64(node_state->gres_cnt_avail, buffer);
		/*
		 * Only note whether gres_bit_alloc exists (via its size);
		 * the bitmap itself is rebuilt from recovered job state
		 */
		if (node_state->gres_bit_alloc)
			bitmap_size = bit_size(node_state->gres_bit_alloc);
		else
			bitmap_size = 0;
		pack16(bitmap_size, buffer);
		rec_cnt++;
	}
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);

	/* Rewind, fill in the real record count, then restore the offset */
	end_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, start_offset);
	pack16(rec_cnt, buffer);
	set_buf_offset(buffer, end_offset);

	return rc;
}
3639
3640 /*
3641 * Unpack a node's current gres status, called from slurmctld for save/restore
3642 * OUT gres_list - restored state stored by gres_plugin_node_state_pack()
3643 * IN/OUT buffer - location to read state from
3644 * IN node_name - name of the node for which the gres information applies
3645 */
extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
					 char *node_name,
					 uint16_t protocol_version)
{
	int i, rc;
	uint32_t magic = 0, plugin_id = 0;
	uint64_t gres_cnt_avail = 0;
	uint16_t gres_bitmap_size = 0, rec_cnt = 0;
	uint8_t has_bitmap = 0;
	gres_state_t *gres_ptr;
	gres_node_state_t *gres_node_ptr;

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;	/* No GRES records were packed */

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	if ((gres_context_cnt > 0) && (*gres_list == NULL))
		*gres_list = list_create(_gres_node_list_delete);

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		/* Stop early if the buffer is exhausted */
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;
		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			safe_unpack64(&gres_cnt_avail, buffer);
			safe_unpack16(&gres_bitmap_size, buffer);
		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/*
			 * Older format packed only a has-bitmap flag; the
			 * bitmap size defaults to the available count
			 */
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			safe_unpack64(&gres_cnt_avail, buffer);
			safe_unpack8(&has_bitmap, buffer);
			if (has_bitmap)
				gres_bitmap_size = gres_cnt_avail;
			else
				gres_bitmap_size = 0;
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}
		/* Match the plugin id against the configured contexts */
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_context[i].plugin_id == plugin_id)
				break;
		}
		if (i >= gres_context_cnt) {
			error("%s: no plugin configured to unpack data type %u from node %s",
			      __func__, plugin_id, node_name);
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			continue;
		}
		/* Rebuild the node state record from the unpacked fields */
		gres_node_ptr = _build_gres_node_state();
		gres_node_ptr->gres_cnt_avail = gres_cnt_avail;
		if (gres_bitmap_size) {
			/* Empty bitmap; bits are restored from job state */
			gres_node_ptr->gres_bit_alloc =
				bit_alloc(gres_bitmap_size);
		}
		gres_ptr = xmalloc(sizeof(gres_state_t));
		gres_ptr->plugin_id = gres_context[i].plugin_id;
		gres_ptr->gres_data = gres_node_ptr;
		list_append(*gres_list, gres_ptr);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from node %s", __func__, node_name);
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}
3727
_node_state_dup(void * gres_data)3728 static void *_node_state_dup(void *gres_data)
3729 {
3730 int i, j;
3731 gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data;
3732 gres_node_state_t *new_gres;
3733
3734 if (gres_ptr == NULL)
3735 return NULL;
3736
3737 new_gres = xmalloc(sizeof(gres_node_state_t));
3738 new_gres->gres_cnt_found = gres_ptr->gres_cnt_found;
3739 new_gres->gres_cnt_config = gres_ptr->gres_cnt_config;
3740 new_gres->gres_cnt_avail = gres_ptr->gres_cnt_avail;
3741 new_gres->gres_cnt_alloc = gres_ptr->gres_cnt_alloc;
3742 new_gres->no_consume = gres_ptr->no_consume;
3743 if (gres_ptr->gres_bit_alloc)
3744 new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc);
3745
3746 if (gres_ptr->links_cnt && gres_ptr->link_len) {
3747 new_gres->links_cnt = xcalloc(gres_ptr->link_len,
3748 sizeof(int *));
3749 j = sizeof(int) * gres_ptr->link_len;
3750 for (i = 0; i < gres_ptr->link_len; i++) {
3751 new_gres->links_cnt[i] = xmalloc(j);
3752 memcpy(new_gres->links_cnt[i],gres_ptr->links_cnt[i],j);
3753 }
3754 new_gres->link_len = gres_ptr->link_len;
3755 }
3756
3757 if (gres_ptr->topo_cnt) {
3758 new_gres->topo_cnt = gres_ptr->topo_cnt;
3759 new_gres->topo_core_bitmap = xcalloc(gres_ptr->topo_cnt,
3760 sizeof(bitstr_t *));
3761 new_gres->topo_gres_bitmap = xcalloc(gres_ptr->topo_cnt,
3762 sizeof(bitstr_t *));
3763 new_gres->topo_gres_cnt_alloc = xcalloc(gres_ptr->topo_cnt,
3764 sizeof(uint64_t));
3765 new_gres->topo_gres_cnt_avail = xcalloc(gres_ptr->topo_cnt,
3766 sizeof(uint64_t));
3767 new_gres->topo_type_id = xcalloc(gres_ptr->topo_cnt,
3768 sizeof(uint32_t));
3769 new_gres->topo_type_name = xcalloc(gres_ptr->topo_cnt,
3770 sizeof(char *));
3771 for (i = 0; i < gres_ptr->topo_cnt; i++) {
3772 if (gres_ptr->topo_core_bitmap[i]) {
3773 new_gres->topo_core_bitmap[i] =
3774 bit_copy(gres_ptr->topo_core_bitmap[i]);
3775 }
3776 new_gres->topo_gres_bitmap[i] =
3777 bit_copy(gres_ptr->topo_gres_bitmap[i]);
3778 new_gres->topo_gres_cnt_alloc[i] =
3779 gres_ptr->topo_gres_cnt_alloc[i];
3780 new_gres->topo_gres_cnt_avail[i] =
3781 gres_ptr->topo_gres_cnt_avail[i];
3782 new_gres->topo_type_id[i] = gres_ptr->topo_type_id[i];
3783 new_gres->topo_type_name[i] =
3784 xstrdup(gres_ptr->topo_type_name[i]);
3785 }
3786 }
3787
3788 if (gres_ptr->type_cnt) {
3789 new_gres->type_cnt = gres_ptr->type_cnt;
3790 new_gres->type_cnt_alloc = xcalloc(gres_ptr->type_cnt,
3791 sizeof(uint64_t));
3792 new_gres->type_cnt_avail = xcalloc(gres_ptr->type_cnt,
3793 sizeof(uint64_t));
3794 new_gres->type_id = xcalloc(gres_ptr->type_cnt,
3795 sizeof(uint32_t));
3796 new_gres->type_name = xcalloc(gres_ptr->type_cnt,
3797 sizeof(char *));
3798 for (i = 0; i < gres_ptr->type_cnt; i++) {
3799 new_gres->type_cnt_alloc[i] =
3800 gres_ptr->type_cnt_alloc[i];
3801 new_gres->type_cnt_avail[i] =
3802 gres_ptr->type_cnt_avail[i];
3803 new_gres->type_id[i] = gres_ptr->type_id[i];
3804 new_gres->type_name[i] =
3805 xstrdup(gres_ptr->type_name[i]);
3806 }
3807 }
3808
3809 return new_gres;
3810 }
3811
3812 /*
3813 * Duplicate a node gres status (used for will-run logic)
3814 * IN gres_list - node gres state information
3815 * RET a copy of gres_list or NULL on failure
3816 */
gres_plugin_node_state_dup(List gres_list)3817 extern List gres_plugin_node_state_dup(List gres_list)
3818 {
3819 int i;
3820 List new_list = NULL;
3821 ListIterator gres_iter;
3822 gres_state_t *gres_ptr, *new_gres;
3823 void *gres_data;
3824
3825 if (gres_list == NULL)
3826 return new_list;
3827
3828 (void) gres_plugin_init();
3829
3830 slurm_mutex_lock(&gres_context_lock);
3831 if ((gres_context_cnt > 0)) {
3832 new_list = list_create(_gres_node_list_delete);
3833 }
3834 gres_iter = list_iterator_create(gres_list);
3835 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3836 for (i=0; i<gres_context_cnt; i++) {
3837 if (gres_ptr->plugin_id != gres_context[i].plugin_id)
3838 continue;
3839 gres_data = _node_state_dup(gres_ptr->gres_data);
3840 if (gres_data) {
3841 new_gres = xmalloc(sizeof(gres_state_t));
3842 new_gres->plugin_id = gres_ptr->plugin_id;
3843 new_gres->gres_data = gres_data;
3844 list_append(new_list, new_gres);
3845 }
3846 break;
3847 }
3848 if (i >= gres_context_cnt) {
3849 error("Could not find plugin id %u to dup node record",
3850 gres_ptr->plugin_id);
3851 }
3852 }
3853 list_iterator_destroy(gres_iter);
3854 slurm_mutex_unlock(&gres_context_lock);
3855
3856 return new_list;
3857 }
3858
/*
 * Clear all allocation counters and bitmaps in one node GRES record,
 * returning its resources to the "free" state.
 * IN gres_ptr - node GRES record to reset
 */
static void _node_state_dealloc(gres_state_t *gres_ptr)
{
	int i;
	gres_node_state_t *gres_node_ptr;
	char *gres_name = NULL;

	gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
	gres_node_ptr->gres_cnt_alloc = 0;
	if (gres_node_ptr->gres_bit_alloc) {
		/* use a distinct name; previous version shadowed "i" here */
		int last_bit = bit_size(gres_node_ptr->gres_bit_alloc) - 1;
		if (last_bit >= 0)
			bit_nclear(gres_node_ptr->gres_bit_alloc, 0, last_bit);
	}

	if (gres_node_ptr->topo_cnt && !gres_node_ptr->topo_gres_cnt_alloc) {
		/* Resolve GRES name solely for the error message below */
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_ptr->plugin_id == gres_context[i].plugin_id) {
				gres_name = gres_context[i].gres_name;
				break;
			}
		}
		/* Use __func__ like the rest of this file; the old message
		 * hardcoded a caller's name */
		error("%s: gres/%s topo_cnt!=0 and topo_gres_cnt_alloc is NULL",
		      __func__, gres_name);
	} else if (gres_node_ptr->topo_cnt) {
		for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
			gres_node_ptr->topo_gres_cnt_alloc[i] = 0;
		}
	} else {
		/*
		 * This array can be set at startup if a job has been allocated
		 * specific GRES and the node has not registered with the
		 * details needed to track individual GRES (rather than only
		 * a GRES count).
		 */
		xfree(gres_node_ptr->topo_gres_cnt_alloc);
	}

	for (i = 0; i < gres_node_ptr->type_cnt; i++) {
		gres_node_ptr->type_cnt_alloc[i] = 0;
	}
}
3900
3901 /*
3902 * Deallocate all resources on this node previous allocated to any jobs.
3903 * This function isused to synchronize state after slurmctld restarts or
3904 * is reconfigured.
3905 * IN gres_list - node gres state information
3906 */
gres_plugin_node_state_dealloc_all(List gres_list)3907 extern void gres_plugin_node_state_dealloc_all(List gres_list)
3908 {
3909 ListIterator gres_iter;
3910 gres_state_t *gres_ptr;
3911
3912 if (gres_list == NULL)
3913 return;
3914
3915 (void) gres_plugin_init();
3916
3917 slurm_mutex_lock(&gres_context_lock);
3918 gres_iter = list_iterator_create(gres_list);
3919 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3920 _node_state_dealloc(gres_ptr);
3921 }
3922 list_iterator_destroy(gres_iter);
3923 slurm_mutex_unlock(&gres_context_lock);
3924 }
3925
/*
 * Build a string describing a node's GRES currently in use, e.g.
 * "gpu:tesla:2(IDX:0-1)". The result is cached in gres_node_ptr->gres_used
 * and owned by that structure; callers must NOT xfree the returned pointer.
 */
static char *_node_gres_used(void *gres_data, char *gres_name)
{
	gres_node_state_t *gres_node_ptr;
	char *sep = "";
	int i, j;

	xassert(gres_data);
	gres_node_ptr = (gres_node_state_t *) gres_data;

	if ((gres_node_ptr->topo_cnt != 0) &&
	    (gres_node_ptr->no_consume == false)) {
		/* Per-topology accounting: emit one entry per GRES type,
		 * merging all topo records that share the same type_id */
		bitstr_t *topo_printed = bit_alloc(gres_node_ptr->topo_cnt);
		xfree(gres_node_ptr->gres_used); /* Free any cached value */
		for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
			bitstr_t *topo_gres_bitmap = NULL;
			uint64_t gres_alloc_cnt = 0;
			char *gres_alloc_idx, tmp_str[64];
			if (bit_test(topo_printed, i))
				continue; /* merged into an earlier entry */
			bit_set(topo_printed, i);
			if (gres_node_ptr->topo_gres_bitmap[i]) {
				topo_gres_bitmap =
					bit_copy(gres_node_ptr->
						 topo_gres_bitmap[i]);
			}
			/* OR in the device bitmaps of all later records of
			 * the same type (size mismatches are skipped) */
			for (j = i + 1; j < gres_node_ptr->topo_cnt; j++) {
				if (bit_test(topo_printed, j))
					continue;
				if (gres_node_ptr->topo_type_id[i] !=
				    gres_node_ptr->topo_type_id[j])
					continue;
				bit_set(topo_printed, j);
				if (gres_node_ptr->topo_gres_bitmap[j]) {
					if (!topo_gres_bitmap) {
						topo_gres_bitmap =
							bit_copy(gres_node_ptr->
								 topo_gres_bitmap[j]);
					} else if (bit_size(topo_gres_bitmap) ==
						   bit_size(gres_node_ptr->
							    topo_gres_bitmap[j])){
						bit_or(topo_gres_bitmap,
						       gres_node_ptr->
						       topo_gres_bitmap[j]);
					}
				}
			}
			/* Intersect with the node's allocated-device bitmap
			 * to count devices of this type in use; on a size
			 * mismatch the count stays zero */
			if (gres_node_ptr->gres_bit_alloc && topo_gres_bitmap &&
			    (bit_size(topo_gres_bitmap) ==
			     bit_size(gres_node_ptr->gres_bit_alloc))) {
				bit_and(topo_gres_bitmap,
					gres_node_ptr->gres_bit_alloc);
				gres_alloc_cnt = bit_set_count(topo_gres_bitmap);
			}
			if (gres_alloc_cnt > 0) {
				bit_fmt(tmp_str, sizeof(tmp_str),
					topo_gres_bitmap);
				gres_alloc_idx = tmp_str;
			} else {
				gres_alloc_idx = "N/A";
			}
			xstrfmtcat(gres_node_ptr->gres_used,
				   "%s%s:%s:%"PRIu64"(IDX:%s)", sep, gres_name,
				   gres_node_ptr->topo_type_name[i],
				   gres_alloc_cnt, gres_alloc_idx);
			sep = ",";
			FREE_NULL_BITMAP(topo_gres_bitmap);
		}
		FREE_NULL_BITMAP(topo_printed);
	} else if (gres_node_ptr->gres_used) {
		; /* Used cached value */
	} else if (gres_node_ptr->type_cnt == 0) {
		/* Untyped GRES: report a single aggregate count */
		if (gres_node_ptr->no_consume) {
			xstrfmtcat(gres_node_ptr->gres_used, "%s:0", gres_name);
		} else {
			xstrfmtcat(gres_node_ptr->gres_used, "%s:%"PRIu64,
				   gres_name, gres_node_ptr->gres_cnt_alloc);
		}
	} else {
		/* Typed GRES without per-topology data: per-type counts */
		for (i = 0; i < gres_node_ptr->type_cnt; i++) {
			if (gres_node_ptr->no_consume) {
				xstrfmtcat(gres_node_ptr->gres_used,
					   "%s%s:%s:0", sep, gres_name,
					   gres_node_ptr->type_name[i]);
			} else {
				xstrfmtcat(gres_node_ptr->gres_used,
					   "%s%s:%s:%"PRIu64, sep, gres_name,
					   gres_node_ptr->type_name[i],
					   gres_node_ptr->type_cnt_alloc[i]);
			}
			sep = ",";
		}
	}

	return gres_node_ptr->gres_used;
}
4021
/*
 * Log one node's GRES state for a single plugin via info(). Debug aid
 * invoked from gres_plugin_node_state_log().
 * IN gres_data - gres_node_state_t for this node/plugin
 * IN node_name - node the state belongs to
 * IN gres_name - GRES name (e.g. "gpu") used in message prefixes
 */
static void _node_state_log(void *gres_data, char *node_name, char *gres_name)
{
	gres_node_state_t *gres_node_ptr;
	int i, j;
	char *buf = NULL, *sep, tmp_str[128];

	xassert(gres_data);
	gres_node_ptr = (gres_node_state_t *) gres_data;

	info("gres/%s: state for %s", gres_name, node_name);
	/* NO_VAL64 means the node has not yet reported an actual count */
	if (gres_node_ptr->gres_cnt_found == NO_VAL64) {
		snprintf(tmp_str, sizeof(tmp_str), "TBD");
	} else {
		snprintf(tmp_str, sizeof(tmp_str), "%"PRIu64,
			 gres_node_ptr->gres_cnt_found);
	}

	if (gres_node_ptr->no_consume) {
		info(" gres_cnt found:%s configured:%"PRIu64" "
		     "avail:%"PRIu64" no_consume",
		     tmp_str, gres_node_ptr->gres_cnt_config,
		     gres_node_ptr->gres_cnt_avail);
	} else {
		info(" gres_cnt found:%s configured:%"PRIu64" "
		     "avail:%"PRIu64" alloc:%"PRIu64"",
		     tmp_str, gres_node_ptr->gres_cnt_config,
		     gres_node_ptr->gres_cnt_avail,
		     gres_node_ptr->gres_cnt_alloc);
	}

	if (gres_node_ptr->gres_bit_alloc) {
		bit_fmt(tmp_str, sizeof(tmp_str),gres_node_ptr->gres_bit_alloc);
		info(" gres_bit_alloc:%s of %d",
		     tmp_str, (int) bit_size(gres_node_ptr->gres_bit_alloc));
	} else {
		info(" gres_bit_alloc:NULL");
	}

	info(" gres_used:%s", gres_node_ptr->gres_used);

	/* Log the link_len x link_len link count matrix, one row per line */
	if (gres_node_ptr->links_cnt && gres_node_ptr->link_len) {
		for (i = 0; i < gres_node_ptr->link_len; i++) {
			sep = "";
			for (j = 0; j < gres_node_ptr->link_len; j++) {
				xstrfmtcat(buf, "%s%d", sep,
					   gres_node_ptr->links_cnt[i][j]);
				sep = ", ";
			}
			info(" links[%d]:%s", i, buf);
			xfree(buf);
		}
	}

	/* Per-topology records: type, core/device bitmaps, counts */
	for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
		info(" topo[%d]:%s(%u)", i, gres_node_ptr->topo_type_name[i],
		     gres_node_ptr->topo_type_id[i]);
		if (gres_node_ptr->topo_core_bitmap[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_node_ptr->topo_core_bitmap[i]);
			info("  topo_core_bitmap[%d]:%s of %d", i, tmp_str,
			     (int)bit_size(gres_node_ptr->topo_core_bitmap[i]));
		} else
			info("  topo_core_bitmap[%d]:NULL", i);
		if (gres_node_ptr->topo_gres_bitmap[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_node_ptr->topo_gres_bitmap[i]);
			info("  topo_gres_bitmap[%d]:%s of %d", i, tmp_str,
			     (int)bit_size(gres_node_ptr->topo_gres_bitmap[i]));
		} else
			info("  topo_gres_bitmap[%d]:NULL", i);
		info("  topo_gres_cnt_alloc[%d]:%"PRIu64"", i,
		     gres_node_ptr->topo_gres_cnt_alloc[i]);
		info("  topo_gres_cnt_avail[%d]:%"PRIu64"", i,
		     gres_node_ptr->topo_gres_cnt_avail[i]);
	}

	/* Per-type records: name/id plus allocated and available counts */
	for (i = 0; i < gres_node_ptr->type_cnt; i++) {
		info(" type[%d]:%s(%u)", i, gres_node_ptr->type_name[i],
		     gres_node_ptr->type_id[i]);
		info("  type_cnt_alloc[%d]:%"PRIu64, i,
		     gres_node_ptr->type_cnt_alloc[i]);
		info("  type_cnt_avail[%d]:%"PRIu64, i,
		     gres_node_ptr->type_cnt_avail[i]);
	}
}
4107
4108 /*
4109 * Log a node's current gres state
4110 * IN gres_list - generated by gres_plugin_node_config_validate()
4111 * IN node_name - name of the node for which the gres information applies
4112 */
gres_plugin_node_state_log(List gres_list,char * node_name)4113 extern void gres_plugin_node_state_log(List gres_list, char *node_name)
4114 {
4115 int i;
4116 ListIterator gres_iter;
4117 gres_state_t *gres_ptr;
4118
4119 if (!gres_debug || (gres_list == NULL))
4120 return;
4121
4122 (void) gres_plugin_init();
4123
4124 slurm_mutex_lock(&gres_context_lock);
4125 gres_iter = list_iterator_create(gres_list);
4126 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
4127 for (i = 0; i < gres_context_cnt; i++) {
4128 if (gres_ptr->plugin_id !=
4129 gres_context[i].plugin_id)
4130 continue;
4131 _node_state_log(gres_ptr->gres_data, node_name,
4132 gres_context[i].gres_name);
4133 break;
4134 }
4135 }
4136 list_iterator_destroy(gres_iter);
4137 slurm_mutex_unlock(&gres_context_lock);
4138 }
4139
4140 /*
4141 * Build a string indicating a node's drained GRES
4142 * IN gres_list - generated by gres_plugin_node_config_validate()
4143 * RET - string, must be xfreed by caller
4144 */
gres_get_node_drain(List gres_list)4145 extern char *gres_get_node_drain(List gres_list)
4146 {
4147 char *node_drain = xstrdup("N/A");
4148
4149 return node_drain;
4150 }
4151
4152 /*
4153 * Build a string indicating a node's used GRES
4154 * IN gres_list - generated by gres_plugin_node_config_validate()
4155 * RET - string, must be xfreed by caller
4156 */
gres_get_node_used(List gres_list)4157 extern char *gres_get_node_used(List gres_list)
4158 {
4159 int i;
4160 ListIterator gres_iter;
4161 gres_state_t *gres_ptr;
4162 char *gres_used = NULL, *tmp;
4163
4164 if (!gres_list)
4165 return gres_used;
4166
4167 (void) gres_plugin_init();
4168
4169 slurm_mutex_lock(&gres_context_lock);
4170 gres_iter = list_iterator_create(gres_list);
4171 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
4172 for (i = 0; i < gres_context_cnt; i++) {
4173 if (gres_ptr->plugin_id !=
4174 gres_context[i].plugin_id)
4175 continue;
4176 tmp = _node_gres_used(gres_ptr->gres_data,
4177 gres_context[i].gres_name);
4178 if (!tmp)
4179 continue;
4180 if (gres_used)
4181 xstrcat(gres_used, ",");
4182 xstrcat(gres_used, tmp);
4183 break;
4184 }
4185 }
4186 list_iterator_destroy(gres_iter);
4187 slurm_mutex_unlock(&gres_context_lock);
4188
4189 return gres_used;
4190 }
4191
4192 /*
4193 * Give the total system count of a given GRES
4194 * Returns NO_VAL64 if name not found
4195 */
gres_get_system_cnt(char * name)4196 extern uint64_t gres_get_system_cnt(char *name)
4197 {
4198 uint64_t count = NO_VAL64;
4199 int i;
4200
4201 if (!name)
4202 return NO_VAL64;
4203
4204 (void) gres_plugin_init();
4205
4206 slurm_mutex_lock(&gres_context_lock);
4207 for (i = 0; i < gres_context_cnt; i++) {
4208 if (!xstrcmp(gres_context[i].gres_name, name)) {
4209 count = gres_context[i].total_cnt;
4210 break;
4211 }
4212 }
4213 slurm_mutex_unlock(&gres_context_lock);
4214 return count;
4215 }
4216
4217
4218 /*
4219 * Get the count of a node's GRES
4220 * IN gres_list - List of Gres records for this node to track usage
4221 * IN name - name of gres
4222 */
gres_plugin_node_config_cnt(List gres_list,char * name)4223 extern uint64_t gres_plugin_node_config_cnt(List gres_list, char *name)
4224 {
4225 int i;
4226 gres_state_t *gres_ptr;
4227 gres_node_state_t *data_ptr;
4228 uint64_t count = 0;
4229
4230 if (!gres_list || !name || !list_count(gres_list))
4231 return count;
4232
4233 (void) gres_plugin_init();
4234
4235 slurm_mutex_lock(&gres_context_lock);
4236 for (i = 0; i < gres_context_cnt; i++) {
4237 if (!xstrcmp(gres_context[i].gres_name, name)) {
4238 /* Find or create gres_state entry on the list */
4239 gres_ptr = list_find_first(gres_list, _gres_find_id,
4240 &gres_context[i].plugin_id);
4241
4242 if (!gres_ptr || !gres_ptr->gres_data)
4243 break;
4244 data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
4245 count = data_ptr->gres_cnt_config;
4246 break;
4247 } else if (!xstrncmp(name, gres_context[i].gres_name_colon,
4248 gres_context[i].gres_name_colon_len)) {
4249 int type;
4250 uint32_t type_id;
4251 char *type_str = NULL;
4252
4253 if (!(type_str = strchr(name, ':'))) {
4254 error("Invalid gres name '%s'", name);
4255 break;
4256 }
4257 type_str++;
4258
4259 gres_ptr = list_find_first(gres_list, _gres_find_id,
4260 &gres_context[i].plugin_id);
4261
4262 if (!gres_ptr || !gres_ptr->gres_data)
4263 break;
4264 data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
4265 type_id = gres_plugin_build_id(type_str);
4266 for (type = 0; type < data_ptr->type_cnt; type++) {
4267 if (data_ptr->type_id[type] == type_id) {
4268 count = data_ptr->type_cnt_avail[type];
4269 break;
4270 }
4271 }
4272 break;
4273 }
4274 }
4275 slurm_mutex_unlock(&gres_context_lock);
4276
4277 return count;
4278 }
4279
_job_state_delete(void * gres_data)4280 static void _job_state_delete(void *gres_data)
4281 {
4282 int i;
4283 gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
4284
4285 if (gres_ptr == NULL)
4286 return;
4287
4288 for (i = 0; i < gres_ptr->node_cnt; i++) {
4289 if (gres_ptr->gres_bit_alloc)
4290 FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
4291 if (gres_ptr->gres_bit_step_alloc)
4292 FREE_NULL_BITMAP(gres_ptr->gres_bit_step_alloc[i]);
4293 }
4294 xfree(gres_ptr->gres_bit_alloc);
4295 xfree(gres_ptr->gres_cnt_node_alloc);
4296 xfree(gres_ptr->gres_bit_step_alloc);
4297 xfree(gres_ptr->gres_cnt_step_alloc);
4298 if (gres_ptr->gres_bit_select) {
4299 for (i = 0; i < gres_ptr->total_node_cnt; i++)
4300 FREE_NULL_BITMAP(gres_ptr->gres_bit_select[i]);
4301 xfree(gres_ptr->gres_bit_select);
4302 }
4303 xfree(gres_ptr->gres_cnt_node_alloc);
4304 xfree(gres_ptr->gres_cnt_node_select);
4305 xfree(gres_ptr->gres_name);
4306 xfree(gres_ptr->type_name);
4307 xfree(gres_ptr);
4308 }
4309
_gres_job_list_delete(void * list_element)4310 static void _gres_job_list_delete(void *list_element)
4311 {
4312 gres_state_t *gres_ptr;
4313
4314 if (gres_plugin_init() != SLURM_SUCCESS)
4315 return;
4316
4317 gres_ptr = (gres_state_t *) list_element;
4318 slurm_mutex_lock(&gres_context_lock);
4319 _job_state_delete(gres_ptr->gres_data);
4320 xfree(gres_ptr);
4321 slurm_mutex_unlock(&gres_context_lock);
4322 }
4323
_clear_cpus_per_gres(void * x,void * arg)4324 static int _clear_cpus_per_gres(void *x, void *arg)
4325 {
4326 gres_state_t *gres_ptr = (gres_state_t *) x;
4327 gres_job_state_t *job_gres_data;
4328 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4329 job_gres_data->cpus_per_gres = 0;
4330 return 0;
4331 }
_clear_gres_per_job(void * x,void * arg)4332 static int _clear_gres_per_job(void *x, void *arg)
4333 {
4334 gres_state_t *gres_ptr = (gres_state_t *) x;
4335 gres_job_state_t *job_gres_data;
4336 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4337 job_gres_data->gres_per_job = 0;
4338 return 0;
4339 }
_clear_gres_per_node(void * x,void * arg)4340 static int _clear_gres_per_node(void *x, void *arg)
4341 {
4342 gres_state_t *gres_ptr = (gres_state_t *) x;
4343 gres_job_state_t *job_gres_data;
4344 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4345 job_gres_data->gres_per_node = 0;
4346 return 0;
4347 }
_clear_gres_per_socket(void * x,void * arg)4348 static int _clear_gres_per_socket(void *x, void *arg)
4349 {
4350 gres_state_t *gres_ptr = (gres_state_t *) x;
4351 gres_job_state_t *job_gres_data;
4352 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4353 job_gres_data->gres_per_socket = 0;
4354 return 0;
4355 }
_clear_gres_per_task(void * x,void * arg)4356 static int _clear_gres_per_task(void *x, void *arg)
4357 {
4358 gres_state_t *gres_ptr = (gres_state_t *) x;
4359 gres_job_state_t *job_gres_data;
4360 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4361 job_gres_data->gres_per_task = 0;
4362 return 0;
4363 }
_clear_mem_per_gres(void * x,void * arg)4364 static int _clear_mem_per_gres(void *x, void *arg)
4365 {
4366 gres_state_t *gres_ptr = (gres_state_t *) x;
4367 gres_job_state_t *job_gres_data;
4368 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4369 job_gres_data->mem_per_gres = 0;
4370 return 0;
4371 }
_clear_total_gres(void * x,void * arg)4372 static int _clear_total_gres(void *x, void *arg)
4373 {
4374 gres_state_t *gres_ptr = (gres_state_t *) x;
4375 gres_job_state_t *job_gres_data;
4376 job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4377 job_gres_data->total_gres = 0;
4378 return 0;
4379 }
4380
4381 /*
4382 * Ensure consistency of gres_per_* options
4383 * Modify task and node count as needed for consistentcy with GRES options
4384 * RET -1 on failure, 0 on success
4385 */
_test_gres_cnt(gres_job_state_t * job_gres_data,uint32_t * num_tasks,uint32_t * min_nodes,uint32_t * max_nodes,uint16_t * ntasks_per_node,uint16_t * ntasks_per_socket,uint16_t * sockets_per_node,uint16_t * cpus_per_task)4386 static int _test_gres_cnt(gres_job_state_t *job_gres_data,
4387 uint32_t *num_tasks,
4388 uint32_t *min_nodes, uint32_t *max_nodes,
4389 uint16_t *ntasks_per_node,
4390 uint16_t *ntasks_per_socket,
4391 uint16_t *sockets_per_node,
4392 uint16_t *cpus_per_task)
4393 {
4394 int req_nodes, req_tasks, req_tasks_per_node, req_tasks_per_socket;
4395 int req_sockets, req_cpus_per_task;
4396 uint16_t cpus_per_gres;
4397
4398 /* Ensure gres_per_job >= gres_per_node >= gres_per_socket */
4399 if (job_gres_data->gres_per_job &&
4400 ((job_gres_data->gres_per_node &&
4401 (job_gres_data->gres_per_node > job_gres_data->gres_per_job)) ||
4402 (job_gres_data->gres_per_task &&
4403 (job_gres_data->gres_per_task > job_gres_data->gres_per_job)) ||
4404 (job_gres_data->gres_per_socket &&
4405 (job_gres_data->gres_per_socket > job_gres_data->gres_per_job))))
4406 return -1;
4407
4408 /* Ensure gres_per_job >= gres_per_task */
4409 if (job_gres_data->gres_per_node &&
4410 ((job_gres_data->gres_per_task &&
4411 (job_gres_data->gres_per_task > job_gres_data->gres_per_node)) ||
4412 (job_gres_data->gres_per_socket &&
4413 (job_gres_data->gres_per_socket > job_gres_data->gres_per_node))))
4414 return -1;
4415
4416 /* gres_per_socket requires sockets-per-node count specification */
4417 if (job_gres_data->gres_per_socket) {
4418 if (*sockets_per_node == NO_VAL16)
4419 return -1;
4420 }
4421
4422 /*
4423 * Ensure gres_per_job is multiple of gres_per_node
4424 * Ensure node count is consistent with GRES parameters
4425 */
4426 if (job_gres_data->gres_per_job && job_gres_data->gres_per_node) {
4427 if (job_gres_data->gres_per_job % job_gres_data->gres_per_node){
4428 /* gres_per_job not multiple of gres_per_node */
4429 return -1;
4430 }
4431 req_nodes = job_gres_data->gres_per_job /
4432 job_gres_data->gres_per_node;
4433 if ((req_nodes < *min_nodes) || (req_nodes > *max_nodes))
4434 return -1;
4435 *min_nodes = *max_nodes = req_nodes;
4436 }
4437
4438 /*
4439 * Ensure gres_per_node is multiple of gres_per_socket
4440 * Ensure task count is consistent with GRES parameters
4441 */
4442 if (job_gres_data->gres_per_node && job_gres_data->gres_per_socket) {
4443 if (job_gres_data->gres_per_node %
4444 job_gres_data->gres_per_socket) {
4445 /* gres_per_node not multiple of gres_per_socket */
4446 return -1;
4447 }
4448 req_sockets = job_gres_data->gres_per_node /
4449 job_gres_data->gres_per_socket;
4450 if (*sockets_per_node == NO_VAL16)
4451 *sockets_per_node = req_sockets;
4452 else if (*sockets_per_node != req_sockets)
4453 return -1;
4454 }
4455 /*
4456 * Ensure gres_per_job is multiple of gres_per_task
4457 * Ensure task count is consistent with GRES parameters
4458 */
4459 if (job_gres_data->gres_per_task) {
4460 if(job_gres_data->gres_per_job) {
4461 if (job_gres_data->gres_per_job %
4462 job_gres_data->gres_per_task) {
4463 /* gres_per_job not multiple of gres_per_task */
4464 return -1;
4465 }
4466 req_tasks = job_gres_data->gres_per_job /
4467 job_gres_data->gres_per_task;
4468 if (*num_tasks == NO_VAL)
4469 *num_tasks = req_tasks;
4470 else if (*num_tasks != req_tasks)
4471 return -1;
4472 } else if (*num_tasks != NO_VAL) {
4473 job_gres_data->gres_per_job = *num_tasks *
4474 job_gres_data->gres_per_task;
4475 } else {
4476 return -1;
4477 }
4478 }
4479
4480 /*
4481 * Ensure gres_per_node is multiple of gres_per_task
4482 * Ensure tasks_per_node is consistent with GRES parameters
4483 */
4484 if (job_gres_data->gres_per_node && job_gres_data->gres_per_task) {
4485 if (job_gres_data->gres_per_node %
4486 job_gres_data->gres_per_task) {
4487 /* gres_per_node not multiple of gres_per_task */
4488 return -1;
4489 }
4490 req_tasks_per_node = job_gres_data->gres_per_node /
4491 job_gres_data->gres_per_task;
4492 if ((*ntasks_per_node == NO_VAL16) ||
4493 (*ntasks_per_node == 0))
4494 *ntasks_per_node = req_tasks_per_node;
4495 else if (*ntasks_per_node != req_tasks_per_node)
4496 return -1;
4497 }
4498
4499 /*
4500 * Ensure gres_per_socket is multiple of gres_per_task
4501 * Ensure ntasks_per_socket is consistent with GRES parameters
4502 */
4503 if (job_gres_data->gres_per_socket && job_gres_data->gres_per_task) {
4504 if (job_gres_data->gres_per_socket %
4505 job_gres_data->gres_per_task) {
4506 /* gres_per_socket not multiple of gres_per_task */
4507 return -1;
4508 }
4509 req_tasks_per_socket = job_gres_data->gres_per_socket /
4510 job_gres_data->gres_per_task;
4511 if ((*ntasks_per_socket == NO_VAL16) ||
4512 (*ntasks_per_socket == 0))
4513 *ntasks_per_socket = req_tasks_per_socket;
4514 else if (*ntasks_per_socket != req_tasks_per_socket)
4515 return -1;
4516 }
4517
4518 /* Ensure that cpus_per_gres * gres_per_task == cpus_per_task */
4519 if (job_gres_data->cpus_per_gres)
4520 cpus_per_gres = job_gres_data->cpus_per_gres;
4521 else
4522 cpus_per_gres = job_gres_data->def_cpus_per_gres;
4523 if (cpus_per_gres && job_gres_data->gres_per_task) {
4524 req_cpus_per_task = cpus_per_gres *job_gres_data->gres_per_task;
4525 if ((*cpus_per_task == NO_VAL16) ||
4526 (*cpus_per_task == 0))
4527 *cpus_per_task = req_cpus_per_task;
4528 else if (*cpus_per_task != req_cpus_per_task)
4529 return -1;
4530 }
4531
4532 /* Ensure tres_per_job >= node count */
4533 if (job_gres_data->gres_per_job) {
4534 if (job_gres_data->gres_per_job < *min_nodes)
4535 return -1;
4536 if (job_gres_data->gres_per_job < *max_nodes)
4537 *max_nodes = job_gres_data->gres_per_job;
4538 }
4539
4540 return 0;
4541 }
4542
4543 /*
4544 * Translate a string, with optional suffix, into its equivalent numeric value
4545 * tok IN - the string to translate
4546 * value IN - numeric value
4547 * RET true if "tok" is a valid number
4548 */
_is_valid_number(char * tok,unsigned long long int * value)4549 static bool _is_valid_number(char *tok, unsigned long long int *value)
4550 {
4551 unsigned long long int tmp_val;
4552 uint64_t mult;
4553 char *end_ptr = NULL;
4554
4555 tmp_val = strtoull(tok, &end_ptr, 10);
4556 if (tmp_val == ULLONG_MAX)
4557 return false;
4558 if ((mult = suffix_mult(end_ptr)) == NO_VAL64)
4559 return false;
4560 tmp_val *= mult;
4561 *value = tmp_val;
4562 return true;
4563 }
4564
4565 /*
4566 * Reentrant TRES specification parse logic
4567 * in_val IN - initial input string
4568 * type OUT - must be xfreed by caller
4569 * cnt OUT - count of values
4570 * flags OUT - user flags (GRES_NO_CONSUME)
4571 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
4572 * RET rc - error code
4573 */
_get_next_gres(char * in_val,char ** type_ptr,int * context_inx_ptr,uint64_t * cnt,uint16_t * flags,char ** save_ptr)4574 static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr,
4575 uint64_t *cnt, uint16_t *flags, char **save_ptr)
4576 {
4577 char *comma, *sep, *sep2, *name = NULL, *type = NULL;
4578 int i, rc = SLURM_SUCCESS;
4579 unsigned long long int value = 0;
4580
4581 xassert(cnt);
4582 xassert(flags);
4583 xassert(save_ptr);
4584 *flags = 0;
4585
4586 if (!in_val && (*save_ptr == NULL)) {
4587 return rc;
4588 }
4589
4590 if (*save_ptr == NULL) {
4591 *save_ptr = in_val;
4592 }
4593
4594 next: if (*save_ptr[0] == '\0') { /* Empty input token */
4595 *save_ptr = NULL;
4596 goto fini;
4597 }
4598
4599 name = xstrdup(*save_ptr);
4600 comma = strchr(name, ',');
4601 if (comma) {
4602 *save_ptr += (comma - name + 1);
4603 comma[0] = '\0';
4604 } else {
4605 *save_ptr += strlen(name);
4606 }
4607
4608 if (name[0] == '\0') {
4609 /* Nothing but a comma */
4610 xfree(name);
4611 goto next;
4612 }
4613
4614 sep = strchr(name, ':');
4615 if (sep) {
4616 sep[0] = '\0';
4617 sep++;
4618 sep2 = strchr(sep, ':');
4619 if (sep2) {
4620 sep2[0] = '\0';
4621 sep2++;
4622 }
4623 } else {
4624 sep2 = NULL;
4625 }
4626
4627 if (sep2) { /* Two colons */
4628 /* We have both type and count */
4629 if ((sep[0] == '\0') || (sep2[0] == '\0')) {
4630 /* Bad format (e.g. "gpu:tesla:" or "gpu::1") */
4631 rc = ESLURM_INVALID_GRES;
4632 goto fini;
4633 }
4634 type = xstrdup(sep);
4635 if (!_is_valid_number(sep2, &value)) {
4636 debug("%s: Invalid count value GRES %s:%s:%s", __func__,
4637 name, type, sep2);
4638 rc = ESLURM_INVALID_GRES;
4639 goto fini;
4640 }
4641 } else if (sep) { /* One colon */
4642 if (sep[0] == '\0') {
4643 /* Bad format (e.g. "gpu:") */
4644 rc = ESLURM_INVALID_GRES;
4645 goto fini;
4646 } else if (_is_valid_number(sep, &value)) {
4647 /* We have count, but no type */
4648 type = NULL;
4649 } else {
4650 /* We have type with implicit count of 1 */
4651 type = xstrdup(sep);
4652 value = 1;
4653 }
4654 } else { /* No colon */
4655 /* We have no type and implicit count of 1 */
4656 type = NULL;
4657 value = 1;
4658 }
4659 if (value == 0) {
4660 xfree(name);
4661 xfree(type);
4662 goto next;
4663 }
4664
4665 for (i = 0; i < gres_context_cnt; i++) {
4666 if (!xstrcmp(name, gres_context[i].gres_name) ||
4667 !xstrncmp(name, gres_context[i].gres_name_colon,
4668 gres_context[i].gres_name_colon_len))
4669 break; /* GRES name match found */
4670 }
4671 if (i >= gres_context_cnt) {
4672 debug("%s: Failed to locate GRES %s", __func__, name);
4673 rc = ESLURM_INVALID_GRES;
4674 goto fini;
4675 }
4676 *context_inx_ptr = i;
4677
4678 fini: if (rc != SLURM_SUCCESS) {
4679 *save_ptr = NULL;
4680 if (rc == ESLURM_INVALID_GRES) {
4681 info("%s: Invalid GRES job specification %s", __func__,
4682 in_val);
4683 }
4684 xfree(type);
4685 *type_ptr = NULL;
4686 } else {
4687 *cnt = value;
4688 *type_ptr = type;
4689 }
4690 xfree(name);
4691
4692 return rc;
4693 }
4694
4695 /*
4696 * TRES specification parse logic
4697 * in_val IN - initial input string
4698 * cnt OUT - count of values
4699 * gres_list IN/OUT - where to search for (or add) new job TRES record
4700 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
4701 * rc OUT - unchanged or an error code
4702 * RET gres - job record to set value in, found or created by this function
4703 */
_get_next_job_gres(char * in_val,uint64_t * cnt,List gres_list,char ** save_ptr,int * rc)4704 static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
4705 List gres_list, char **save_ptr,
4706 int *rc)
4707 {
4708 static char *prev_save_ptr = NULL;
4709 int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
4710 gres_job_state_t *job_gres_data = NULL;
4711 gres_state_t *gres_ptr;
4712 gres_key_t job_search_key;
4713 char *type = NULL, *name = NULL;
4714 uint16_t flags = 0;
4715
4716 xassert(save_ptr);
4717 if (!in_val && (*save_ptr == NULL)) {
4718 return NULL;
4719 }
4720
4721 if (*save_ptr == NULL) {
4722 prev_save_ptr = in_val;
4723 } else if (*save_ptr != prev_save_ptr) {
4724 error("%s: parsing error", __func__);
4725 my_rc = SLURM_ERROR;
4726 goto fini;
4727 }
4728
4729 if (prev_save_ptr[0] == '\0') { /* Empty input token */
4730 *save_ptr = NULL;
4731 return NULL;
4732 }
4733
4734 if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
4735 cnt, &flags, &prev_save_ptr)) ||
4736 (context_inx == NO_VAL)) {
4737 prev_save_ptr = NULL;
4738 goto fini;
4739 }
4740
4741 /* Find the job GRES record */
4742 job_search_key.plugin_id = gres_context[context_inx].plugin_id;
4743 job_search_key.type_id = gres_plugin_build_id(type);
4744 gres_ptr = list_find_first(gres_list, _gres_find_job_by_key,
4745 &job_search_key);
4746
4747 if (gres_ptr) {
4748 job_gres_data = gres_ptr->gres_data;
4749 } else {
4750 job_gres_data = xmalloc(sizeof(gres_job_state_t));
4751 job_gres_data->gres_name =
4752 xstrdup(gres_context[context_inx].gres_name);
4753 job_gres_data->type_id = gres_plugin_build_id(type);
4754 job_gres_data->type_name = type;
4755 type = NULL; /* String moved above */
4756 gres_ptr = xmalloc(sizeof(gres_state_t));
4757 gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
4758 gres_ptr->gres_data = job_gres_data;
4759 list_append(gres_list, gres_ptr);
4760 }
4761 job_gres_data->flags = flags;
4762
4763 fini: xfree(name);
4764 xfree(type);
4765 if (my_rc != SLURM_SUCCESS) {
4766 prev_save_ptr = NULL;
4767 if (my_rc == ESLURM_INVALID_GRES) {
4768 info("%s: Invalid GRES job specification %s", __func__,
4769 in_val);
4770 }
4771 *rc = my_rc;
4772 }
4773 *save_ptr = prev_save_ptr;
4774 return job_gres_data;
4775 }
4776
4777 /* Return true if job specification only includes cpus_per_gres or mem_per_gres
4778 * Return false if any other field set
4779 */
_generic_job_state(gres_job_state_t * job_state)4780 static bool _generic_job_state(gres_job_state_t *job_state)
4781 {
4782 if (job_state->gres_per_job ||
4783 job_state->gres_per_node ||
4784 job_state->gres_per_socket ||
4785 job_state->gres_per_task)
4786 return false;
4787 return true;
4788 }
4789
/*
 * Given a job's requested GRES configuration, validate it and build a GRES list
 * Note: This function can be used for a new request with gres_list==NULL or
 *	 used to update an existing job, in which case gres_list is a copy
 *	 of the job's original value (so we can clear fields as needed)
 * IN *tres* - job requested gres input string
 * IN/OUT num_tasks - requested task count, may be reset to provide
 *		      consistent gres_per_node/task values
 * IN/OUT min_nodes - requested minimum node count, may be reset to provide
 *		      consistent gres_per_node/task values
 * IN/OUT max_nodes - requested maximum node count, may be reset to provide
 *		      consistent gres_per_node/task values
 * IN/OUT ntasks_per_node - requested tasks_per_node count, may be reset to
 *		      provide consistent gres_per_node/task values
 * IN/OUT ntasks_per_socket - requested ntasks_per_socket count, may be reset to
 *		      provide consistent gres_per_node/task values
 * IN/OUT sockets_per_node - requested sockets_per_node count, may be reset to
 *		      provide consistent gres_per_socket/node values
 * IN/OUT cpus_per_task - requested cpus_per_task count, may be reset to
 *		      provide consistent gres_per_task/cpus_per_gres values
 * OUT gres_list - List of GRES records for this job to track usage
 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
 */
extern int gres_plugin_job_state_validate(char *cpus_per_tres,
					  char *tres_freq,
					  char *tres_per_job,
					  char *tres_per_node,
					  char *tres_per_socket,
					  char *tres_per_task,
					  char *mem_per_tres,
					  uint32_t *num_tasks,
					  uint32_t *min_nodes,
					  uint32_t *max_nodes,
					  uint16_t *ntasks_per_node,
					  uint16_t *ntasks_per_socket,
					  uint16_t *sockets_per_node,
					  uint16_t *cpus_per_task,
					  List *gres_list)
{
	/*
	 * Per-plugin bookkeeping used to detect requests that mix typed
	 * ("gpu:tesla:1") and untyped ("gpu:2") records of the same GRES.
	 */
	typedef struct overlap_check {
		gres_job_state_t *without_model_state;
		uint32_t plugin_id;
		bool with_model;
		bool without_model;
	} overlap_check_t;
	overlap_check_t *over_list;
	int i, over_count = 0, rc = SLURM_SUCCESS, size;
	bool have_gres_gpu = false, have_gres_mps = false;
	bool overlap_merge = false;
	gres_state_t *gres_state;
	gres_job_state_t *job_gres_data;
	uint64_t cnt = 0;
	ListIterator iter;

	/* Nothing requested: nothing to validate or build */
	if (!cpus_per_tres && !tres_per_job && !tres_per_node &&
	    !tres_per_socket && !tres_per_task && !mem_per_tres)
		return SLURM_SUCCESS;

	if (tres_per_task && (*num_tasks == NO_VAL) &&
	    (*min_nodes != NO_VAL) && (*min_nodes == *max_nodes)) {
		/* Implicitly set task count */
		if (*ntasks_per_node != NO_VAL16)
			*num_tasks = *min_nodes * *ntasks_per_node;
		else if (*cpus_per_task == NO_VAL16)
			*num_tasks = *min_nodes;
	}

	if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
		return rc;

	/*
	 * Only select/cons_tres understands these options; reject them for
	 * any other select plugin (tres_per_node alone is still supported).
	 */
	if ((select_plugin_type != SELECT_TYPE_CONS_TRES) &&
	    (cpus_per_tres || tres_per_job || tres_per_socket ||
	     tres_per_task || mem_per_tres))
		return ESLURM_UNSUPPORTED_GRES;

	/*
	 * Clear fields as requested by job update (i.e. input value is "")
	 */
	if (*gres_list)
		(void) list_for_each(*gres_list, _clear_total_gres, NULL);
	if (*gres_list && cpus_per_tres && (cpus_per_tres[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_cpus_per_gres, NULL);
		cpus_per_tres = NULL;
	}
	if (*gres_list && tres_per_job && (tres_per_job[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_gres_per_job, NULL);
		tres_per_job = NULL;
	}
	if (*gres_list && tres_per_node && (tres_per_node[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_gres_per_node, NULL);
		tres_per_node = NULL;
	}
	if (*gres_list && tres_per_socket && (tres_per_socket[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_gres_per_socket, NULL);
		tres_per_socket = NULL;
	}
	if (*gres_list && tres_per_task && (tres_per_task[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_gres_per_task, NULL);
		tres_per_task = NULL;
	}
	if (*gres_list && mem_per_tres && (mem_per_tres[0] == '\0')) {
		(void) list_for_each(*gres_list, _clear_mem_per_gres, NULL);
		mem_per_tres = NULL;
	}

	/*
	 * Set new values as requested
	 */
	if (*gres_list == NULL)
		*gres_list = list_create(_gres_job_list_delete);
	slurm_mutex_lock(&gres_context_lock);
	if (cpus_per_tres) {
		char *in_val = cpus_per_tres, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->cpus_per_gres = cnt;
			in_val = NULL;
		}
	}
	if (tres_per_job) {
		char *in_val = tres_per_job, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->gres_per_job = cnt;
			in_val = NULL;
			/* total_gres keeps the largest implied total seen */
			job_gres_data->total_gres =
				MAX(job_gres_data->total_gres, cnt);
		}
	}
	if (tres_per_node) {
		char *in_val = tres_per_node, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->gres_per_node = cnt;
			in_val = NULL;
			/* Scale per-node count by node count when known */
			if (*min_nodes != NO_VAL)
				cnt *= *min_nodes;
			job_gres_data->total_gres =
				MAX(job_gres_data->total_gres, cnt);
		}
	}
	if (tres_per_socket) {
		char *in_val = tres_per_socket, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->gres_per_socket = cnt;
			in_val = NULL;
			/*
			 * Estimate socket count either from the node layout
			 * or from the task layout (rounding tasks up to
			 * whole sockets).
			 */
			if ((*min_nodes != NO_VAL) &&
			    (*sockets_per_node != NO_VAL16)) {
				cnt *= (*min_nodes * *sockets_per_node);
			} else if ((*num_tasks != NO_VAL) &&
				   (*ntasks_per_socket != NO_VAL16)) {
				cnt *= ((*num_tasks + *ntasks_per_socket - 1) /
					*ntasks_per_socket);
			}
		}
	}
	if (tres_per_task) {
		char *in_val = tres_per_task, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->gres_per_task = cnt;
			in_val = NULL;
			if (*num_tasks != NO_VAL)
				cnt *= *num_tasks;
			job_gres_data->total_gres =
				MAX(job_gres_data->total_gres, cnt);
		}
	}
	if (mem_per_tres) {
		char *in_val = mem_per_tres, *save_ptr = NULL;
		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
							   *gres_list,
							   &save_ptr, &rc))) {
			job_gres_data->mem_per_gres = cnt;
			in_val = NULL;
		}
	}
	slurm_mutex_unlock(&gres_context_lock);

	if (rc != SLURM_SUCCESS)
		return rc;
	size = list_count(*gres_list);
	if (size == 0) {
		FREE_NULL_LIST(*gres_list);
		return rc;
	}

	/*
	 * Check for record overlap (e.g. "gpu:2,gpu:tesla:1")
	 * Ensure tres_per_job >= tres_per_node >= tres_per_socket
	 */
	over_list = xcalloc(size, sizeof(overlap_check_t));
	iter = list_iterator_create(*gres_list);
	while ((gres_state = (gres_state_t *) list_next(iter))) {
		job_gres_data = (gres_job_state_t *) gres_state->gres_data;
		if (_test_gres_cnt(job_gres_data, num_tasks, min_nodes,
				   max_nodes, ntasks_per_node,
				   ntasks_per_socket, sockets_per_node,
				   cpus_per_task) != 0) {
			rc = ESLURM_INVALID_GRES;
			break;
		}
		if (!have_gres_gpu && !xstrcmp(job_gres_data->gres_name, "gpu"))
			have_gres_gpu = true;
		if (!xstrcmp(job_gres_data->gres_name, "mps")) {
			have_gres_mps = true;
			/*
			 * gres/mps only supports a per-node count,
			 * set either explicitly or implicitly.
			 */
			if (job_gres_data->gres_per_job &&
			    (*max_nodes != 1)) {
				rc = ESLURM_INVALID_GRES;
				break;
			}
			if (job_gres_data->gres_per_socket &&
			    (*sockets_per_node != 1)) {
				rc = ESLURM_INVALID_GRES;
				break;
			}
			if (job_gres_data->gres_per_task && (*num_tasks != 1)) {
				rc = ESLURM_INVALID_GRES;
				break;
			}
		}
		/* gres/gpu and gres/mps are mutually exclusive in a job */
		if (have_gres_gpu && have_gres_mps) {
			rc = ESLURM_INVALID_GRES;
			break;
		}

		/* Locate (or add) this plugin's entry in over_list */
		for (i = 0; i < over_count; i++) {
			if (over_list[i].plugin_id == gres_state->plugin_id)
				break;
		}
		if (i >= over_count) {
			/* New entry; i == over_count before the increment,
			 * so over_list[i] below addresses the new slot */
			over_list[over_count++].plugin_id =
				gres_state->plugin_id;
			if (job_gres_data->type_name) {
				over_list[i].with_model = true;
			} else {
				over_list[i].without_model = true;
				over_list[i].without_model_state =
					job_gres_data;
			}
		} else if (job_gres_data->type_name) {
			over_list[i].with_model = true;
			if (over_list[i].without_model)
				overlap_merge = true;
		} else {
			over_list[i].without_model = true;
			over_list[i].without_model_state = job_gres_data;
			if (over_list[i].with_model)
				overlap_merge = true;
		}
	}
	/* gres/mps jobs may not also request a GPU frequency */
	if (have_gres_mps && (rc == SLURM_SUCCESS) && tres_freq &&
	    strstr(tres_freq, "gpu")) {
		rc = ESLURM_INVALID_GRES;
	}

	if (overlap_merge) {	/* Merge generic data if possible */
		uint16_t cpus_per_gres;
		uint64_t mem_per_gres;
		for (i = 0; i < over_count; i++) {
			if (!over_list[i].with_model ||
			    !over_list[i].without_model_state)
				continue;
			/* Untyped record with explicit counts can't merge */
			if (!_generic_job_state(
					over_list[i].without_model_state)) {
				rc = ESLURM_INVALID_GRES_TYPE;
				break;
			}
			/* Propagate generic parameters */
			cpus_per_gres =
				over_list[i].without_model_state->cpus_per_gres;
			mem_per_gres =
				over_list[i].without_model_state->mem_per_gres;
			list_iterator_reset(iter);
			while ((gres_state = (gres_state_t *)list_next(iter))) {
				job_gres_data = (gres_job_state_t *)
					gres_state->gres_data;
				if (over_list[i].plugin_id !=
				    gres_state->plugin_id)
					continue;
				/* Drop the generic record itself */
				if (job_gres_data ==
				    over_list[i].without_model_state) {
					list_remove(iter);
					continue;
				}
				if (job_gres_data->cpus_per_gres == 0) {
					job_gres_data->cpus_per_gres =
						cpus_per_gres;
				}
				if (job_gres_data->mem_per_gres == 0) {
					job_gres_data->mem_per_gres =
						mem_per_gres;
				}
			}
		}
	}
	list_iterator_destroy(iter);
	xfree(over_list);

	return rc;
}
5101
5102 /*
5103 * Determine if a job's specified GRES can be supported. This is designed to
5104 * prevent the running of a job using the GRES options only supported by the
5105 * select/cons_tres plugin when switching (on slurmctld restart) from the
5106 * cons_tres plugin to any other select plugin.
5107 *
5108 * IN gres_list - List of GRES records for this job to track usage
5109 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
5110 */
gres_plugin_job_revalidate(List gres_list)5111 extern int gres_plugin_job_revalidate(List gres_list)
5112 {
5113 gres_state_t *gres_state;
5114 gres_job_state_t *job_gres_data;
5115 ListIterator iter;
5116 int rc = SLURM_SUCCESS;
5117
5118 if (!gres_list || (select_plugin_type == SELECT_TYPE_CONS_TRES))
5119 return SLURM_SUCCESS;
5120
5121 iter = list_iterator_create(gres_list);
5122 while ((gres_state = (gres_state_t *) list_next(iter))) {
5123 job_gres_data = (gres_job_state_t *) gres_state->gres_data;
5124 if (job_gres_data->gres_per_job ||
5125 job_gres_data->gres_per_socket ||
5126 job_gres_data->gres_per_task) {
5127 rc = ESLURM_UNSUPPORTED_GRES;
5128 break;
5129 }
5130 }
5131 list_iterator_destroy(iter);
5132
5133 return rc;
5134 }
5135
5136 /*
5137 * Return TRUE if any of this job's GRES has a populated gres_bit_alloc element.
5138 * This indicates the allocated GRES has a File configuration parameter and is
5139 * tracking individual file assignments.
5140 */
_job_has_gres_bits(List job_gres_list)5141 static bool _job_has_gres_bits(List job_gres_list)
5142 {
5143 ListIterator job_gres_iter;
5144 gres_state_t *gres_ptr;
5145 gres_job_state_t *job_gres_ptr;
5146 bool rc = false;
5147 int i;
5148
5149 if (!job_gres_list)
5150 return false;
5151
5152 job_gres_iter = list_iterator_create(job_gres_list);
5153 while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
5154 job_gres_ptr = gres_ptr->gres_data;
5155 if (!job_gres_ptr)
5156 continue;
5157 for (i = 0; i < job_gres_ptr->node_cnt; i++) {
5158 if (job_gres_ptr->gres_bit_alloc &&
5159 job_gres_ptr->gres_bit_alloc[i]) {
5160 rc = true;
5161 break;
5162 }
5163 }
5164 if (rc)
5165 break;
5166 }
5167 list_iterator_destroy(job_gres_iter);
5168
5169 return rc;
5170 }
5171
5172 /*
5173 * Return count of configured GRES.
5174 * NOTE: For gres/mps return count of gres/gpu
5175 */
_get_node_gres_cnt(List node_gres_list,uint32_t plugin_id)5176 static int _get_node_gres_cnt(List node_gres_list, uint32_t plugin_id)
5177 {
5178 ListIterator node_gres_iter;
5179 gres_node_state_t *gres_node_ptr;
5180 gres_state_t *gres_ptr;
5181 int gres_cnt = 0;
5182
5183 if (!node_gres_list)
5184 return 0;
5185
5186 if (plugin_id == mps_plugin_id)
5187 plugin_id = gpu_plugin_id;
5188 node_gres_iter = list_iterator_create(node_gres_list);
5189 while ((gres_ptr = (gres_state_t *) list_next(node_gres_iter))) {
5190 if (gres_ptr->plugin_id != plugin_id)
5191 continue;
5192 gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
5193 gres_cnt = (int) gres_node_ptr->gres_cnt_config;
5194 break;
5195 }
5196 list_iterator_destroy(node_gres_iter);
5197
5198 return gres_cnt;
5199 }
5200
5201 /*
5202 * Return TRUE if the identified node in the job allocation can satisfy the
5203 * job's GRES specification without change in its bitmaps. In other words,
5204 * return FALSE if the job allocation identifies specific GRES devices and the
5205 * count of those devices on this node has changed.
5206 *
5207 * IN job_gres_list - List of GRES records for this job to track usage
5208 * IN node_inx - zero-origin index into this job's node allocation
5209 * IN node_gres_list - List of GRES records for this node
5210 */
_validate_node_gres_cnt(uint32_t job_id,List job_gres_list,int node_inx,List node_gres_list,char * node_name)5211 static bool _validate_node_gres_cnt(uint32_t job_id, List job_gres_list,
5212 int node_inx, List node_gres_list,
5213 char *node_name)
5214 {
5215 ListIterator job_gres_iter;
5216 gres_state_t *gres_ptr;
5217 gres_job_state_t *job_gres_ptr;
5218 bool rc = true;
5219 int job_gres_cnt, node_gres_cnt;
5220
5221 if (!job_gres_list)
5222 return true;
5223
5224 (void) gres_plugin_init();
5225
5226 job_gres_iter = list_iterator_create(job_gres_list);
5227 while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
5228 job_gres_ptr = gres_ptr->gres_data;
5229 if (!job_gres_ptr || !job_gres_ptr->gres_bit_alloc)
5230 continue;
5231 if ((node_inx >= job_gres_ptr->node_cnt) ||
5232 !job_gres_ptr->gres_bit_alloc[node_inx])
5233 continue;
5234 job_gres_cnt = bit_size(job_gres_ptr->gres_bit_alloc[node_inx]);
5235 node_gres_cnt = _get_node_gres_cnt(node_gres_list,
5236 gres_ptr->plugin_id);
5237 if (job_gres_cnt != node_gres_cnt) {
5238 error("%s: Killing job %u: gres/%s count mismatch on node "
5239 "%s (%d != %d)",
5240 __func__, job_id, job_gres_ptr->gres_name,
5241 node_name, job_gres_cnt, node_gres_cnt);
5242 rc = false;
5243 break;
5244 }
5245 }
5246 list_iterator_destroy(job_gres_iter);
5247
5248 return rc;
5249 }
5250
5251 /*
5252 * Determine if a job's specified GRES are currently valid. This is designed to
5253 * manage jobs allocated GRES which are either no longer supported or a GRES
5254 * configured with the "File" option in gres.conf where the count has changed,
5255 * in which case we don't know how to map the job's old GRES bitmap onto the
5256 * current GRES bitmaps.
5257 *
5258 * IN job_id - ID of job being validated (used for logging)
5259 * IN job_gres_list - List of GRES records for this job to track usage
5260 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
5261 */
gres_plugin_job_revalidate2(uint32_t job_id,List job_gres_list,bitstr_t * node_bitmap)5262 extern int gres_plugin_job_revalidate2(uint32_t job_id, List job_gres_list,
5263 bitstr_t *node_bitmap)
5264 {
5265 node_record_t *node_ptr;
5266 int rc = SLURM_SUCCESS;
5267 int i_first, i_last, i;
5268 int node_inx = -1;
5269
5270 if (!job_gres_list || !node_bitmap ||
5271 !_job_has_gres_bits(job_gres_list))
5272 return SLURM_SUCCESS;
5273
5274 i_first = bit_ffs(node_bitmap);
5275 if (i_first >= 0)
5276 i_last = bit_fls(node_bitmap);
5277 else
5278 i_last = -2;
5279 for (i = i_first; i <= i_last; i++) {
5280 if (!bit_test(node_bitmap, i))
5281 continue;
5282 node_ptr = node_record_table_ptr + i;
5283 node_inx++;
5284 if (!_validate_node_gres_cnt(job_id, job_gres_list, node_inx,
5285 node_ptr->gres_list,
5286 node_ptr->name)) {
5287 rc = ESLURM_INVALID_GRES;
5288 break;
5289 }
5290 }
5291
5292 return rc;
5293 }
5294
5295 /*
5296 * Find a sock_gres_t record in a list by matching the plugin_id and type_id
5297 * from a gres_state_t job record
5298 * IN x - a sock_gres_t record to test
5299 * IN key - the gres_state_t record (from a job) we want to match
5300 * RET 1 on match, otherwise 0
5301 */
_find_sock_by_job_gres(void * x,void * key)5302 static int _find_sock_by_job_gres(void *x, void *key)
5303 {
5304 sock_gres_t *sock_data = (sock_gres_t *) x;
5305 gres_state_t *job_gres_state = (gres_state_t *) key;
5306 gres_job_state_t *job_data;
5307
5308 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5309 if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
5310 (sock_data->type_id == job_data->type_id))
5311 return 1;
5312 return 0;
5313 }
5314
5315 /*
5316 * Find a gres_state_t job record in a list by matching the plugin_id and
5317 * type_id from a sock_gres_t record
5318 * IN x - a gres_state_t record (from a job) to test
5319 * IN key - the sock_gres_t record we want to match
5320 * RET 1 on match, otherwise 0
5321 */
_find_job_by_sock_gres(void * x,void * key)5322 static int _find_job_by_sock_gres(void *x, void *key)
5323 {
5324 gres_state_t *job_gres_state = (gres_state_t *) x;
5325 gres_job_state_t *job_data;
5326 sock_gres_t *sock_data = (sock_gres_t *) key;
5327
5328 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5329 if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
5330 (sock_data->type_id == job_data->type_id))
5331 return 1;
5332 return 0;
5333 }
5334
5335 /*
5336 * Clear GRES allocation info for all job GRES at start of scheduling cycle
5337 * Return TRUE if any gres_per_job constraints to satisfy
5338 */
gres_plugin_job_sched_init(List job_gres_list)5339 extern bool gres_plugin_job_sched_init(List job_gres_list)
5340 {
5341 ListIterator iter;
5342 gres_state_t *job_gres_state;
5343 gres_job_state_t *job_data;
5344 bool rc = false;
5345
5346 if (!job_gres_list)
5347 return rc;
5348
5349 iter = list_iterator_create(job_gres_list);
5350 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5351 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5352 if (!job_data->gres_per_job)
5353 continue;
5354 job_data->total_gres = 0;
5355 rc = true;
5356 }
5357 list_iterator_destroy(iter);
5358
5359 return rc;
5360 }
5361
5362 /*
5363 * Return TRUE if all gres_per_job specifications are satisfied
5364 */
gres_plugin_job_sched_test(List job_gres_list,uint32_t job_id)5365 extern bool gres_plugin_job_sched_test(List job_gres_list, uint32_t job_id)
5366 {
5367 ListIterator iter;
5368 gres_state_t *job_gres_state;
5369 gres_job_state_t *job_data;
5370 bool rc = true;
5371
5372 if (!job_gres_list)
5373 return rc;
5374
5375 iter = list_iterator_create(job_gres_list);
5376 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5377 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5378 if (job_data->gres_per_job &&
5379 (job_data->gres_per_job > job_data->total_gres)) {
5380 rc = false;
5381 break;
5382 }
5383 }
5384 list_iterator_destroy(iter);
5385
5386 return rc;
5387 }
5388
5389 /*
5390 * Return TRUE if all gres_per_job specifications will be satisfied with
5391 * the addtitional resources provided by a single node
5392 * IN job_gres_list - List of job's GRES requirements (job_gres_state_t)
5393 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5394 * IN job_id - The job being tested
5395 */
gres_plugin_job_sched_test2(List job_gres_list,List sock_gres_list,uint32_t job_id)5396 extern bool gres_plugin_job_sched_test2(List job_gres_list, List sock_gres_list,
5397 uint32_t job_id)
5398 {
5399 ListIterator iter;
5400 gres_state_t *job_gres_state;
5401 gres_job_state_t *job_data;
5402 sock_gres_t *sock_data;
5403 bool rc = true;
5404
5405 if (!job_gres_list)
5406 return rc;
5407
5408 iter = list_iterator_create(job_gres_list);
5409 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5410 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5411 if ((job_data->gres_per_job == 0) ||
5412 (job_data->gres_per_job < job_data->total_gres))
5413 continue;
5414 sock_data = list_find_first(sock_gres_list,
5415 _find_sock_by_job_gres,
5416 job_gres_state);
5417 if (!sock_data ||
5418 (job_data->gres_per_job >
5419 (job_data->total_gres + sock_data->total_cnt))) {
5420 rc = false;
5421 break;
5422 }
5423 }
5424 list_iterator_destroy(iter);
5425
5426 return rc;
5427 }
5428
5429 /*
5430 * Update a job's total_gres counter as we add a node to potential allocation
5431 * IN job_gres_list - List of job's GRES requirements (job_gres_state_t)
5432 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5433 * IN avail_cpus - CPUs currently available on this node
5434 */
gres_plugin_job_sched_add(List job_gres_list,List sock_gres_list,uint16_t avail_cpus)5435 extern void gres_plugin_job_sched_add(List job_gres_list, List sock_gres_list,
5436 uint16_t avail_cpus)
5437 {
5438 ListIterator iter;
5439 gres_state_t *job_gres_state;
5440 gres_job_state_t *job_data;
5441 sock_gres_t *sock_data;
5442 uint64_t gres_limit;
5443
5444 if (!job_gres_list)
5445 return;
5446
5447 iter = list_iterator_create(job_gres_list);
5448 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5449 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5450 if (!job_data->gres_per_job) /* Don't care about totals */
5451 continue;
5452 sock_data = list_find_first(sock_gres_list,
5453 _find_sock_by_job_gres,
5454 job_gres_state);
5455 if (!sock_data) /* None of this GRES available */
5456 continue;
5457 if (job_data->cpus_per_gres) {
5458 gres_limit = avail_cpus / job_data->cpus_per_gres;
5459 gres_limit = MIN(gres_limit, sock_data->total_cnt);
5460 } else
5461 gres_limit = sock_data->total_cnt;
5462 job_data->total_gres += gres_limit;
5463 }
5464 list_iterator_destroy(iter);
5465 }
5466
5467 /*
5468 * Create/update List GRES that can be made available on the specified node
5469 * IN/OUT consec_gres - List of sock_gres_t that can be made available on
5470 * a set of nodes
5471 * IN job_gres_list - List of job's GRES requirements (gres_job_state_t)
5472 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5473 */
gres_plugin_job_sched_consec(List * consec_gres,List job_gres_list,List sock_gres_list)5474 extern void gres_plugin_job_sched_consec(List *consec_gres, List job_gres_list,
5475 List sock_gres_list)
5476 {
5477 ListIterator iter;
5478 gres_state_t *job_gres_state;
5479 gres_job_state_t *job_data;
5480 sock_gres_t *sock_data, *consec_data;
5481
5482 if (!job_gres_list)
5483 return;
5484
5485 iter = list_iterator_create(job_gres_list);
5486 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5487 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5488 if (!job_data->gres_per_job) /* Don't care about totals */
5489 continue;
5490 sock_data = list_find_first(sock_gres_list,
5491 _find_sock_by_job_gres,
5492 job_gres_state);
5493 if (!sock_data) /* None of this GRES available */
5494 continue;
5495 if (*consec_gres == NULL)
5496 *consec_gres = list_create(_sock_gres_del);
5497 consec_data = list_find_first(*consec_gres,
5498 _find_sock_by_job_gres,
5499 job_gres_state);
5500 if (!consec_data) {
5501 consec_data = xmalloc(sizeof(sock_gres_t));
5502 consec_data->plugin_id = sock_data->plugin_id;
5503 consec_data->type_id = sock_data->type_id;
5504 list_append(*consec_gres, consec_data);
5505 }
5506 consec_data->total_cnt += sock_data->total_cnt;
5507 }
5508 list_iterator_destroy(iter);
5509 }
5510
5511 /*
5512 * Determine if the additional sock_gres_list resources will result in
5513 * satisfying the job's gres_per_job constraints
5514 * IN job_gres_list - job's GRES requirements
5515 * IN sock_gres_list - available GRES in a set of nodes, data structure built
5516 * by gres_plugin_job_sched_consec()
5517 */
gres_plugin_job_sched_sufficient(List job_gres_list,List sock_gres_list)5518 extern bool gres_plugin_job_sched_sufficient(List job_gres_list,
5519 List sock_gres_list)
5520 {
5521 ListIterator iter;
5522 gres_state_t *job_gres_state;
5523 gres_job_state_t *job_data;
5524 sock_gres_t *sock_data;
5525 bool rc = true;
5526
5527 if (!job_gres_list)
5528 return true;
5529 if (!sock_gres_list)
5530 return false;
5531
5532 iter = list_iterator_create(job_gres_list);
5533 while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5534 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5535 if (!job_data->gres_per_job) /* Don't care about totals */
5536 continue;
5537 if (job_data->total_gres >= job_data->gres_per_job)
5538 continue;
5539 sock_data = list_find_first(sock_gres_list,
5540 _find_sock_by_job_gres,
5541 job_gres_state);
5542 if (!sock_data) { /* None of this GRES available */
5543 rc = false;
5544 break;
5545 }
5546 if ((job_data->total_gres + sock_data->total_cnt) <
5547 job_data->gres_per_job) {
5548 rc = false;
5549 break;
5550 }
5551 }
5552 list_iterator_destroy(iter);
5553
5554 return rc;
5555 }
5556
5557 /*
5558 * Given a List of sock_gres_t entries, return a string identifying the
5559 * count of each GRES available on this set of nodes
5560 * IN sock_gres_list - count of GRES available in this group of nodes
5561 * IN job_gres_list - job GRES specification, used only to get GRES name/type
5562 * RET xfree the returned string
5563 */
gres_plugin_job_sched_str(List sock_gres_list,List job_gres_list)5564 extern char *gres_plugin_job_sched_str(List sock_gres_list, List job_gres_list)
5565 {
5566 ListIterator iter;
5567 sock_gres_t *sock_data;
5568 gres_state_t *job_gres_state;
5569 gres_job_state_t *job_data;
5570 char *out_str = NULL, *sep;
5571
5572 if (!sock_gres_list)
5573 return NULL;
5574
5575 iter = list_iterator_create(sock_gres_list);
5576 while ((sock_data = (sock_gres_t *) list_next(iter))) {
5577 job_gres_state = list_find_first(job_gres_list,
5578 _find_job_by_sock_gres, sock_data);
5579 if (!job_gres_state) { /* Should never happen */
5580 error("%s: Could not find job GRES for type %u:%u",
5581 __func__, sock_data->plugin_id,
5582 sock_data->type_id);
5583 continue;
5584 }
5585 job_data = (gres_job_state_t *) job_gres_state->gres_data;
5586 if (out_str)
5587 sep = ",";
5588 else
5589 sep = "GRES:";
5590 if (job_data->type_name) {
5591 xstrfmtcat(out_str, "%s%s:%s:%"PRIu64, sep,
5592 job_data->gres_name, job_data->type_name,
5593 sock_data->total_cnt);
5594 } else {
5595 xstrfmtcat(out_str, "%s%s:%"PRIu64, sep,
5596 job_data->gres_name, sock_data->total_cnt);
5597 }
5598 }
5599 list_iterator_destroy(iter);
5600
5601 return out_str;
5602 }
5603
5604 /*
5605 * Create a (partial) copy of a job's gres state for job binding
5606 * IN gres_list - List of Gres records for this job to track usage
5607 * RET The copy or NULL on failure
5608 * NOTE: Only job details are copied, NOT the job step details
5609 */
gres_plugin_job_state_dup(List gres_list)5610 extern List gres_plugin_job_state_dup(List gres_list)
5611 {
5612 return gres_plugin_job_state_extract(gres_list, -1);
5613 }
5614
5615 /* Copy gres_job_state_t record for ALL nodes */
_job_state_dup(void * gres_data)5616 static void *_job_state_dup(void *gres_data)
5617 {
5618
5619 int i;
5620 gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
5621 gres_job_state_t *new_gres_ptr;
5622
5623 if (gres_ptr == NULL)
5624 return NULL;
5625
5626 new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
5627 new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
5628 new_gres_ptr->gres_name = xstrdup(gres_ptr->gres_name);
5629 new_gres_ptr->gres_per_job = gres_ptr->gres_per_job;
5630 new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
5631 new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
5632 new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
5633 new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
5634 new_gres_ptr->node_cnt = gres_ptr->node_cnt;
5635 new_gres_ptr->total_gres = gres_ptr->total_gres;
5636 new_gres_ptr->type_id = gres_ptr->type_id;
5637 new_gres_ptr->type_name = xstrdup(gres_ptr->type_name);
5638
5639 if (gres_ptr->gres_cnt_node_alloc) {
5640 i = sizeof(uint64_t) * gres_ptr->node_cnt;
5641 new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
5642 memcpy(new_gres_ptr->gres_cnt_node_alloc,
5643 gres_ptr->gres_cnt_node_alloc, i);
5644 }
5645 if (gres_ptr->gres_bit_alloc) {
5646 new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
5647 sizeof(bitstr_t *));
5648 for (i = 0; i < gres_ptr->node_cnt; i++) {
5649 if (gres_ptr->gres_bit_alloc[i] == NULL)
5650 continue;
5651 new_gres_ptr->gres_bit_alloc[i] =
5652 bit_copy(gres_ptr->gres_bit_alloc[i]);
5653 }
5654 }
5655 return new_gres_ptr;
5656 }
5657
5658 /* Copy gres_job_state_t record for one specific node */
_job_state_dup2(void * gres_data,int node_index)5659 static void *_job_state_dup2(void *gres_data, int node_index)
5660 {
5661
5662 gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
5663 gres_job_state_t *new_gres_ptr;
5664
5665 if (gres_ptr == NULL)
5666 return NULL;
5667
5668 new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
5669 new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
5670 new_gres_ptr->gres_name = xstrdup(gres_ptr->gres_name);
5671 new_gres_ptr->gres_per_job = gres_ptr->gres_per_job;
5672 new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
5673 new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
5674 new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
5675 new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
5676 new_gres_ptr->node_cnt = 1;
5677 new_gres_ptr->total_gres = gres_ptr->total_gres;
5678 new_gres_ptr->type_id = gres_ptr->type_id;
5679 new_gres_ptr->type_name = xstrdup(gres_ptr->type_name);
5680
5681 if (gres_ptr->gres_cnt_node_alloc) {
5682 new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
5683 new_gres_ptr->gres_cnt_node_alloc[0] =
5684 gres_ptr->gres_cnt_node_alloc[node_index];
5685 }
5686 if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[node_index]) {
5687 new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
5688 new_gres_ptr->gres_bit_alloc[0] =
5689 bit_copy(gres_ptr->gres_bit_alloc[node_index]);
5690 }
5691 return new_gres_ptr;
5692 }
5693
5694 /*
5695 * Create a (partial) copy of a job's gres state for a particular node index
5696 * IN gres_list - List of Gres records for this job to track usage
5697 * IN node_index - zero-origin index to the node
5698 * RET The copy or NULL on failure
5699 */
gres_plugin_job_state_extract(List gres_list,int node_index)5700 extern List gres_plugin_job_state_extract(List gres_list, int node_index)
5701 {
5702 ListIterator gres_iter;
5703 gres_state_t *gres_ptr, *new_gres_state;
5704 List new_gres_list = NULL;
5705 void *new_gres_data;
5706
5707 if (gres_list == NULL)
5708 return new_gres_list;
5709
5710 (void) gres_plugin_init();
5711
5712 slurm_mutex_lock(&gres_context_lock);
5713 gres_iter = list_iterator_create(gres_list);
5714 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
5715 if (node_index == -1)
5716 new_gres_data = _job_state_dup(gres_ptr->gres_data);
5717 else {
5718 new_gres_data = _job_state_dup2(gres_ptr->gres_data,
5719 node_index);
5720 }
5721 if (new_gres_data == NULL)
5722 break;
5723 if (new_gres_list == NULL) {
5724 new_gres_list = list_create(_gres_job_list_delete);
5725 }
5726 new_gres_state = xmalloc(sizeof(gres_state_t));
5727 new_gres_state->plugin_id = gres_ptr->plugin_id;
5728 new_gres_state->gres_data = new_gres_data;
5729 list_append(new_gres_list, new_gres_state);
5730 }
5731 list_iterator_destroy(gres_iter);
5732 slurm_mutex_unlock(&gres_context_lock);
5733
5734 return new_gres_list;
5735 }
5736
/*
 * Pack a job's current gres status, called from slurmctld for save/restore
 * IN gres_list - generated by gres_plugin_job_config_validate()
 * IN/OUT buffer - location to write state to
 * IN job_id - job's ID
 * IN details - if set then pack job step allocation details (only needed to
 *	   save/restore job state, not needed in job credential for
 *	   slurmd task binding)
 *
 * NOTE: A job's allocation to steps is not recorded here, but recovered with
 *	 the job step state information upon slurmctld restart.
 */
extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
				      uint32_t job_id, bool details,
				      uint16_t protocol_version)
{
	int i, rc = SLURM_SUCCESS;
	uint32_t top_offset, tail_offset;
	uint32_t magic = GRES_MAGIC;
	uint16_t rec_cnt = 0;
	ListIterator gres_iter;
	gres_state_t *gres_ptr;
	gres_job_state_t *gres_job_ptr;

	/*
	 * The record count is unknown until the list is walked: pack a
	 * zero placeholder now, remember its offset, and back-patch the
	 * real count at the end.
	 */
	top_offset = get_buf_offset(buffer);
	pack16(rec_cnt, buffer);	/* placeholder if data */

	if (gres_list == NULL)
		return rc;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	gres_iter = list_iterator_create(gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
		gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;

		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			/* 19.05+ format: includes the flags field */
			pack32(magic, buffer);
			pack32(gres_ptr->plugin_id, buffer);
			pack16(gres_job_ptr->cpus_per_gres, buffer);
			pack16(gres_job_ptr->flags, buffer);
			pack64(gres_job_ptr->gres_per_job, buffer);
			pack64(gres_job_ptr->gres_per_node, buffer);
			pack64(gres_job_ptr->gres_per_socket, buffer);
			pack64(gres_job_ptr->gres_per_task, buffer);
			pack64(gres_job_ptr->mem_per_gres, buffer);
			pack64(gres_job_ptr->total_gres, buffer);
			packstr(gres_job_ptr->type_name, buffer);
			pack32(gres_job_ptr->node_cnt, buffer);

			/*
			 * Each optional array is preceded by a one-byte
			 * presence flag so the unpack side knows whether
			 * to expect it.
			 */
			if (gres_job_ptr->gres_cnt_node_alloc) {
				pack8((uint8_t) 1, buffer);
				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
					     gres_job_ptr->node_cnt, buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}

			if (gres_job_ptr->gres_bit_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack_bit_str_hex(gres_job_ptr->
							 gres_bit_alloc[i],
							 buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			/* Step allocation detail only when "details" set */
			if (details && gres_job_ptr->gres_bit_step_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack_bit_str_hex(gres_job_ptr->
							 gres_bit_step_alloc[i],
							 buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (details && gres_job_ptr->gres_cnt_step_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack64(gres_job_ptr->
					       gres_cnt_step_alloc[i],
					       buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			rec_cnt++;
		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/* Older format: identical except no flags field */
			pack32(magic, buffer);
			pack32(gres_ptr->plugin_id, buffer);
			pack16(gres_job_ptr->cpus_per_gres, buffer);
			pack64(gres_job_ptr->gres_per_job, buffer);
			pack64(gres_job_ptr->gres_per_node, buffer);
			pack64(gres_job_ptr->gres_per_socket, buffer);
			pack64(gres_job_ptr->gres_per_task, buffer);
			pack64(gres_job_ptr->mem_per_gres, buffer);
			pack64(gres_job_ptr->total_gres, buffer);
			packstr(gres_job_ptr->type_name, buffer);
			pack32(gres_job_ptr->node_cnt, buffer);

			if (gres_job_ptr->gres_cnt_node_alloc) {
				pack8((uint8_t) 1, buffer);
				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
					     gres_job_ptr->node_cnt, buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}

			if (gres_job_ptr->gres_bit_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack_bit_str_hex(gres_job_ptr->
							 gres_bit_alloc[i],
							 buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (details && gres_job_ptr->gres_bit_step_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack_bit_str_hex(gres_job_ptr->
							 gres_bit_step_alloc[i],
							 buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (details && gres_job_ptr->gres_cnt_step_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack64(gres_job_ptr->
					       gres_cnt_step_alloc[i],
					       buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			rec_cnt++;
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			break;
		}
	}
	list_iterator_destroy(gres_iter);
	slurm_mutex_unlock(&gres_context_lock);

	/* Back-patch the real record count over the placeholder */
	tail_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, top_offset);
	pack16(rec_cnt, buffer);
	set_buf_offset(buffer, tail_offset);

	return rc;
}
5895
/*
 * Unpack a job's current gres status, called from slurmctld for save/restore
 * OUT gres_list - restored state stored by gres_plugin_job_state_pack()
 * IN/OUT buffer - location to read state from
 * IN job_id - job's ID
 *
 * Field order must mirror gres_plugin_job_state_pack() exactly for each
 * protocol version.  Records whose plugin is no longer configured are
 * logged and skipped rather than failing the whole unpack.
 */
extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
					uint32_t job_id,
					uint16_t protocol_version)
{
	int i = 0, rc;
	uint32_t magic = 0, plugin_id = 0, utmp32 = 0;
	uint16_t rec_cnt = 0;
	uint8_t has_more = 0;
	gres_state_t *gres_ptr;
	gres_job_state_t *gres_job_ptr = NULL;

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
		*gres_list = list_create(_gres_job_list_delete);
	}

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		/* Stop early if the buffer is exhausted */
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;

		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
			safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
			/* flags field exists only in the 19.05+ format */
			safe_unpack16(&gres_job_ptr->flags, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
			safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
			safe_unpack64(&gres_job_ptr->total_gres, buffer);
			safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
					       &utmp32, buffer);
			/* type_id is derived from type_name, not packed */
			gres_job_ptr->type_id =
				gres_plugin_build_id(gres_job_ptr->type_name);
			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
			if (gres_job_ptr->node_cnt > NO_VAL)
				goto unpack_error;

			/*
			 * Each optional array was packed behind a one-byte
			 * presence flag; unpack only when the flag is set.
			 */
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_unpack64_array(
					&gres_job_ptr->gres_cnt_node_alloc,
					&utmp32, buffer);
			}

			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_job_ptr->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_job_ptr->
							   gres_bit_step_alloc[i],
							   buffer);
				}
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(uint64_t));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					safe_unpack64(&gres_job_ptr->
						      gres_cnt_step_alloc[i],
						      buffer);
				}
			}
		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/* Older format: same layout, minus the flags field */
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
			safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
			safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
			safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
			safe_unpack64(&gres_job_ptr->total_gres, buffer);
			safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
					       &utmp32, buffer);
			gres_job_ptr->type_id =
				gres_plugin_build_id(gres_job_ptr->type_name);
			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
			if (gres_job_ptr->node_cnt > NO_VAL)
				goto unpack_error;

			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_unpack64_array(
					&gres_job_ptr->gres_cnt_node_alloc,
					&utmp32, buffer);
			}

			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_job_ptr->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_job_ptr->
							   gres_bit_step_alloc[i],
							   buffer);
				}
			}
			safe_unpack8(&has_more, buffer);
			if (has_more) {
				safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(uint64_t));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					safe_unpack64(&gres_job_ptr->
						      gres_cnt_step_alloc[i],
						      buffer);
				}
			}
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		/* Map plugin_id back to a configured gres_context entry */
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_context[i].plugin_id == plugin_id)
				break;
		}
		if (i >= gres_context_cnt) {
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			error("%s: no plugin configured to unpack data type %u from job %u",
			      __func__, plugin_id, job_id);
			_job_state_delete(gres_job_ptr);
			continue;
		}
		gres_job_ptr->gres_name = xstrdup(gres_context[i].gres_name);
		gres_ptr = xmalloc(sizeof(gres_state_t));
		gres_ptr->plugin_id = gres_context[i].plugin_id;
		gres_ptr->gres_data = gres_job_ptr;
		gres_job_ptr = NULL;	  /* nothing left to free on error */
		list_append(*gres_list, gres_ptr);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from job %u", __func__, job_id);
	if (gres_job_ptr)
		_job_state_delete(gres_job_ptr);
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}
6089
/*
 * Pack a job's allocated gres information for use by prolog/epilog
 * IN gres_list - generated by gres_plugin_job_config_validate()
 * IN/OUT buffer - location to write state to
 *
 * Packs a list of gres_epilog_info_t (not gres_state_t); only the
 * 19.05+ wire format is supported.
 */
extern int gres_plugin_job_alloc_pack(List gres_list, Buf buffer,
				      uint16_t protocol_version)
{
	int i, rc = SLURM_SUCCESS;
	uint32_t top_offset, tail_offset;
	uint32_t magic = GRES_MAGIC;
	uint16_t rec_cnt = 0;
	ListIterator gres_iter;
	gres_epilog_info_t *gres_job_ptr;

	/* Pack a zero record count now; back-patch the real value later */
	top_offset = get_buf_offset(buffer);
	pack16(rec_cnt, buffer);	/* placeholder if data */

	if (gres_list == NULL)
		return rc;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	gres_iter = list_iterator_create(gres_list);
	while ((gres_job_ptr = (gres_epilog_info_t *) list_next(gres_iter))) {
		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			pack32(magic, buffer);
			pack32(gres_job_ptr->plugin_id, buffer);
			pack32(gres_job_ptr->node_cnt, buffer);
			/* One-byte presence flag guards each optional array */
			if (gres_job_ptr->gres_cnt_node_alloc) {
				pack8((uint8_t) 1, buffer);
				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
					     gres_job_ptr->node_cnt, buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (gres_job_ptr->gres_bit_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					pack_bit_str_hex(gres_job_ptr->
							 gres_bit_alloc[i],
							 buffer);
				}
			} else {
				pack8((uint8_t) 0, buffer);
			}
			rec_cnt++;
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			break;
		}
	}
	list_iterator_destroy(gres_iter);
	slurm_mutex_unlock(&gres_context_lock);

	/* Back-patch the real record count over the placeholder */
	tail_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, top_offset);
	pack16(rec_cnt, buffer);
	set_buf_offset(buffer, tail_offset);

	return rc;
}
6154
_epilog_list_del(void * x)6155 static void _epilog_list_del(void *x)
6156 {
6157 gres_epilog_info_t *epilog_info = (gres_epilog_info_t *) x;
6158 int i;
6159
6160 if (!epilog_info)
6161 return;
6162
6163 if (epilog_info->gres_bit_alloc) {
6164 for (i = 0; i < epilog_info->node_cnt; i++)
6165 FREE_NULL_BITMAP(epilog_info->gres_bit_alloc[i]);
6166 xfree(epilog_info->gres_bit_alloc);
6167 }
6168 xfree(epilog_info->gres_cnt_node_alloc);
6169 xfree(epilog_info->node_list);
6170 xfree(epilog_info);
6171 }
6172
/*
 * Unpack a job's allocated gres information for use by prolog/epilog
 * OUT gres_list - restored state stored by gres_plugin_job_alloc_pack()
 * IN/OUT buffer - location to read state from
 *
 * Field order must mirror gres_plugin_job_alloc_pack().  Records for
 * plugins no longer configured are logged and skipped.
 */
extern int gres_plugin_job_alloc_unpack(List *gres_list, Buf buffer,
					uint16_t protocol_version)
{
	int i = 0, rc;
	uint32_t magic = 0, utmp32 = 0;
	uint16_t rec_cnt = 0;
	uint8_t filled = 0;
	gres_epilog_info_t *gres_job_ptr = NULL;

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
		*gres_list = list_create(_epilog_list_del);
	}

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		/* Stop early if the buffer is exhausted */
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;

		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			gres_job_ptr = xmalloc(sizeof(gres_epilog_info_t));
			safe_unpack32(&gres_job_ptr->plugin_id, buffer);
			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
			if (gres_job_ptr->node_cnt > NO_VAL)
				goto unpack_error;
			/* Optional arrays follow one-byte presence flags */
			safe_unpack8(&filled, buffer);
			if (filled) {
				safe_unpack64_array(
					&gres_job_ptr->gres_cnt_node_alloc,
					&utmp32, buffer);
			}
			safe_unpack8(&filled, buffer);
			if (filled) {
				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
					     gres_job_ptr->node_cnt,
					     sizeof(bitstr_t *));
				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_job_ptr->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		/* Verify the plugin is still configured before keeping it */
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_context[i].plugin_id ==
			    gres_job_ptr->plugin_id)
				break;
		}
		if (i >= gres_context_cnt) {
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			error("%s: no plugin configured to unpack data type %u",
			      __func__, gres_job_ptr->plugin_id);
			_epilog_list_del(gres_job_ptr);
			continue;
		}
		list_append(*gres_list, gres_job_ptr);
		gres_job_ptr = NULL;	/* ownership moved to the list */
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error", __func__);
	if (gres_job_ptr)
		_epilog_list_del(gres_job_ptr);
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}
6263
6264 /*
6265 * Build List of information needed to set job's Prolog or Epilog environment
6266 * variables
6267 *
6268 * IN job_gres_list - job's GRES allocation info
6269 * IN hostlist - list of nodes associated with the job
6270 * RET information about the job's GRES allocation needed by Prolog or Epilog
6271 */
gres_plugin_epilog_build_env(List job_gres_list,char * node_list)6272 extern List gres_plugin_epilog_build_env(List job_gres_list, char *node_list)
6273 {
6274 int i;
6275 ListIterator gres_iter;
6276 gres_state_t *gres_ptr = NULL;
6277 gres_epilog_info_t *epilog_info;
6278 List epilog_gres_list = NULL;
6279
6280 if (!job_gres_list)
6281 return NULL;
6282
6283 (void) gres_plugin_init();
6284
6285 slurm_mutex_lock(&gres_context_lock);
6286 gres_iter = list_iterator_create(job_gres_list);
6287 while ((gres_ptr = list_next(gres_iter))) {
6288 for (i = 0; i < gres_context_cnt; i++) {
6289 if (gres_ptr->plugin_id == gres_context[i].plugin_id)
6290 break;
6291 }
6292 if (i >= gres_context_cnt) {
6293 error("%s: gres not found in context. This should never happen",
6294 __func__);
6295 continue;
6296 }
6297
6298 if (!gres_context[i].ops.epilog_build_env)
6299 continue; /* No plugin to call */
6300 epilog_info = (*(gres_context[i].ops.epilog_build_env))
6301 (gres_ptr->gres_data);
6302 if (!epilog_info)
6303 continue; /* No info to add for this plugin */
6304 if (!epilog_gres_list)
6305 epilog_gres_list = list_create(_epilog_list_del);
6306 epilog_info->plugin_id = gres_context[i].plugin_id;
6307 epilog_info->node_list = xstrdup(node_list);
6308 list_append(epilog_gres_list, epilog_info);
6309 }
6310 list_iterator_destroy(gres_iter);
6311 slurm_mutex_unlock(&gres_context_lock);
6312
6313 return epilog_gres_list;
6314 }
6315
6316 /*
6317 * Set environment variables as appropriate for a job's prolog or epilog based
6318 * GRES allocated to the job.
6319 *
6320 * IN/OUT epilog_env_ptr - environment variable array
6321 * IN epilog_gres_list - generated by TBD
6322 * IN node_inx - zero origin node index
6323 */
gres_plugin_epilog_set_env(char *** epilog_env_ptr,List epilog_gres_list,int node_inx)6324 extern void gres_plugin_epilog_set_env(char ***epilog_env_ptr,
6325 List epilog_gres_list, int node_inx)
6326 {
6327 int i;
6328 ListIterator epilog_iter;
6329 gres_epilog_info_t *epilog_info;
6330
6331 *epilog_env_ptr = NULL;
6332 if (!epilog_gres_list)
6333 return;
6334
6335 (void) gres_plugin_init();
6336
6337 slurm_mutex_lock(&gres_context_lock);
6338 epilog_iter = list_iterator_create(epilog_gres_list);
6339 while ((epilog_info = list_next(epilog_iter))) {
6340 for (i = 0; i < gres_context_cnt; i++) {
6341 if (epilog_info->plugin_id == gres_context[i].plugin_id)
6342 break;
6343 }
6344 if (i >= gres_context_cnt) {
6345 error("%s: GRES ID %u not found in context",
6346 __func__, epilog_info->plugin_id);
6347 continue;
6348 }
6349
6350 if (!gres_context[i].ops.epilog_set_env)
6351 continue; /* No plugin to call */
6352 (*(gres_context[i].ops.epilog_set_env))
6353 (epilog_env_ptr, epilog_info, node_inx);
6354 }
6355 list_iterator_destroy(epilog_iter);
6356 slurm_mutex_unlock(&gres_context_lock);
6357 }
6358
6359 /*
6360 * If core bitmap from slurmd differs in size from that in slurmctld,
6361 * then modify bitmap from slurmd so we can use bit_and, bit_or, etc.
6362 */
_core_bitmap_rebuild(bitstr_t * old_core_bitmap,int new_size)6363 static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size)
6364 {
6365 int i, j, old_size, ratio;
6366 bitstr_t *new_core_bitmap;
6367
6368 new_core_bitmap = bit_alloc(new_size);
6369 old_size = bit_size(old_core_bitmap);
6370 if (old_size > new_size) {
6371 ratio = old_size / new_size;
6372 for (i = 0; i < new_size; i++) {
6373 for (j = 0; j < ratio; j++) {
6374 if (bit_test(old_core_bitmap, i*ratio+j)) {
6375 bit_set(new_core_bitmap, i);
6376 break;
6377 }
6378 }
6379 }
6380 } else {
6381 ratio = new_size / old_size;
6382 for (i = 0; i < old_size; i++) {
6383 if (!bit_test(old_core_bitmap, i))
6384 continue;
6385 for (j = 0; j < ratio; j++) {
6386 bit_set(new_core_bitmap, i*ratio+j);
6387 }
6388 }
6389 }
6390
6391 return new_core_bitmap;
6392 }
6393
/*
 * Verify that each per-topology core bitmap matches the core count known
 * to slurmctld (cores_ctld); rebuild any bitmap whose size differs.
 */
static void _validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
				      int cores_ctld, char *node_name)
{
	int log_mismatch = true;

	if (node_gres_ptr->topo_cnt == 0)
		return;

	if (node_gres_ptr->topo_core_bitmap == NULL) {
		error("Gres topo_core_bitmap is NULL on node %s", node_name);
		return;
	}

	for (int i = 0; i < node_gres_ptr->topo_cnt; i++) {
		int cores_slurmd;
		bitstr_t *rebuilt;

		if (!node_gres_ptr->topo_core_bitmap[i])
			continue;
		cores_slurmd = bit_size(node_gres_ptr->topo_core_bitmap[i]);
		if (cores_slurmd == cores_ctld)
			continue;
		if (log_mismatch) {
			/* Log at most once per call to avoid log spam */
			debug("Rebuilding node %s gres core bitmap (%d != %d)",
			      node_name, cores_slurmd, cores_ctld);
			log_mismatch = false;
		}
		rebuilt = _core_bitmap_rebuild(
			node_gres_ptr->topo_core_bitmap[i], cores_ctld);
		FREE_NULL_BITMAP(node_gres_ptr->topo_core_bitmap[i]);
		node_gres_ptr->topo_core_bitmap[i] = rebuilt;
	}
}
6428
/*
 * Clear from core_bitmap (within this node's slice,
 * [core_start_bit, core_end_bit]) every core that is not usable by any
 * GRES topology entry satisfying this job's request.  No-op when the
 * node has no topology info, no core bitmap was given, or the job
 * requests no per-node GRES.
 */
static void _job_core_filter(void *job_gres_data, void *node_gres_data,
			     bool use_total_gres, bitstr_t *core_bitmap,
			     int core_start_bit, int core_end_bit,
			     char *gres_name, char *node_name,
			     uint32_t plugin_id)
{
	int i, j, core_ctld;
	gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
	gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
	bitstr_t *avail_core_bitmap = NULL;
	bool use_busy_dev = false;

	if (!node_gres_ptr->topo_cnt || !core_bitmap ||	/* No topology info */
	    !job_gres_ptr->gres_per_node)		/* No job GRES */
		return;

	if (!use_total_gres &&
	    (plugin_id == mps_plugin_id) &&
	    (node_gres_ptr->gres_cnt_alloc != 0)) {
		/* We must use the ONE already active GRES of this type */
		use_busy_dev = true;
	}

	/* Determine which specific cores can be used */
	avail_core_bitmap = bit_copy(core_bitmap);
	/* Start with this node's slice cleared; set bits back as usable
	 * topology entries are found */
	bit_nclear(avail_core_bitmap, core_start_bit, core_end_bit);
	for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
		if (node_gres_ptr->topo_gres_cnt_avail[i] == 0)
			continue;
		/* Skip fully-allocated entries unless counting total GRES */
		if (!use_total_gres &&
		    (node_gres_ptr->topo_gres_cnt_alloc[i] >=
		     node_gres_ptr->topo_gres_cnt_avail[i]))
			continue;
		if (use_busy_dev &&
		    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
			continue;
		/* Honor the job's requested GRES type, if any */
		if (job_gres_ptr->type_name &&
		    (!node_gres_ptr->topo_type_name[i] ||
		     (job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i])))
			continue;
		if (!node_gres_ptr->topo_core_bitmap[i]) {
			/* Entry not bound to specific cores: no filtering
			 * possible, leave core_bitmap untouched */
			FREE_NULL_BITMAP(avail_core_bitmap);	/* No filter */
			return;
		}
		core_ctld = core_end_bit - core_start_bit + 1;
		_validate_gres_node_cores(node_gres_ptr, core_ctld, node_name);
		core_ctld = bit_size(node_gres_ptr->topo_core_bitmap[i]);
		for (j = 0; j < core_ctld; j++) {
			if (bit_test(node_gres_ptr->topo_core_bitmap[i], j)) {
				bit_set(avail_core_bitmap, core_start_bit + j);
			}
		}
	}
	/* Restrict the caller's bitmap to the usable cores found above */
	bit_and(core_bitmap, avail_core_bitmap);
	FREE_NULL_BITMAP(avail_core_bitmap);
}
6485
_job_test(void * job_gres_data,void * node_gres_data,bool use_total_gres,bitstr_t * core_bitmap,int core_start_bit,int core_end_bit,bool * topo_set,uint32_t job_id,char * node_name,char * gres_name,uint32_t plugin_id,bool disable_binding)6486 static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
6487 bool use_total_gres, bitstr_t *core_bitmap,
6488 int core_start_bit, int core_end_bit, bool *topo_set,
6489 uint32_t job_id, char *node_name, char *gres_name,
6490 uint32_t plugin_id, bool disable_binding)
6491 {
6492 int i, j, core_size, core_ctld, top_inx = -1;
6493 uint64_t gres_avail = 0, gres_max = 0, gres_total, gres_tmp;
6494 uint64_t min_gres_node = 0;
6495 gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
6496 gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
6497 uint32_t *cores_addnt = NULL; /* Additional cores avail from this GRES */
6498 uint32_t *cores_avail = NULL; /* cores initially avail from this GRES */
6499 uint32_t core_cnt = 0;
6500 bitstr_t *alloc_core_bitmap = NULL;
6501 bitstr_t *avail_core_bitmap = NULL;
6502 bool shared_gres = _shared_gres(plugin_id);
6503 bool use_busy_dev = false;
6504
6505 if (node_gres_ptr->no_consume)
6506 use_total_gres = true;
6507
6508 if (!use_total_gres &&
6509 (plugin_id == mps_plugin_id) &&
6510 (node_gres_ptr->gres_cnt_alloc != 0)) {
6511 /* We must use the ONE already active GRES of this type */
6512 use_busy_dev = true;
6513 }
6514
6515 /* Determine minimum GRES count needed on this node */
6516 if (job_gres_ptr->gres_per_job)
6517 min_gres_node = 1;
6518 min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_node);
6519 min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_socket);
6520 min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_task);
6521
6522 if (min_gres_node && node_gres_ptr->topo_cnt && *topo_set) {
6523 /*
6524 * Need to determine how many GRES available for these
6525 * specific cores
6526 */
6527 if (core_bitmap) {
6528 core_ctld = core_end_bit - core_start_bit + 1;
6529 if (core_ctld < 1) {
6530 error("gres/%s: job %u cores on node %s < 1",
6531 gres_name, job_id, node_name);
6532 return (uint32_t) 0;
6533 }
6534 _validate_gres_node_cores(node_gres_ptr, core_ctld,
6535 node_name);
6536 }
6537 for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
6538 if (job_gres_ptr->type_name &&
6539 (!node_gres_ptr->topo_type_name[i] ||
6540 (node_gres_ptr->topo_type_id[i] !=
6541 job_gres_ptr->type_id)))
6542 continue;
6543 if (use_busy_dev &&
6544 (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
6545 continue;
6546 if (!node_gres_ptr->topo_core_bitmap[i]) {
6547 gres_avail += node_gres_ptr->
6548 topo_gres_cnt_avail[i];
6549 if (!use_total_gres) {
6550 gres_avail -= node_gres_ptr->
6551 topo_gres_cnt_alloc[i];
6552 }
6553 if (shared_gres)
6554 gres_max = MAX(gres_max, gres_avail);
6555 continue;
6556 }
6557 core_ctld = bit_size(node_gres_ptr->
6558 topo_core_bitmap[i]);
6559 for (j = 0; j < core_ctld; j++) {
6560 if (core_bitmap &&
6561 !bit_test(core_bitmap, core_start_bit + j))
6562 continue;
6563 if (!bit_test(node_gres_ptr->
6564 topo_core_bitmap[i], j))
6565 continue; /* not avail for this gres */
6566 gres_avail += node_gres_ptr->
6567 topo_gres_cnt_avail[i];
6568 if (!use_total_gres) {
6569 gres_avail -= node_gres_ptr->
6570 topo_gres_cnt_alloc[i];
6571 }
6572 if (shared_gres)
6573 gres_max = MAX(gres_max, gres_avail);
6574 break;
6575 }
6576 }
6577 if (shared_gres)
6578 gres_avail = gres_max;
6579 if (min_gres_node > gres_avail)
6580 return (uint32_t) 0; /* insufficient GRES avail */
6581 return NO_VAL;
6582 } else if (min_gres_node && node_gres_ptr->topo_cnt &&
6583 !disable_binding) {
6584 /* Need to determine which specific cores can be used */
6585 gres_avail = node_gres_ptr->gres_cnt_avail;
6586 if (!use_total_gres)
6587 gres_avail -= node_gres_ptr->gres_cnt_alloc;
6588 if (min_gres_node > gres_avail)
6589 return (uint32_t) 0; /* insufficient GRES avail */
6590
6591 core_ctld = core_end_bit - core_start_bit + 1;
6592 if (core_bitmap) {
6593 if (core_ctld < 1) {
6594 error("gres/%s: job %u cores on node %s < 1",
6595 gres_name, job_id, node_name);
6596 return (uint32_t) 0;
6597 }
6598 _validate_gres_node_cores(node_gres_ptr, core_ctld,
6599 node_name);
6600 } else {
6601 for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
6602 if (!node_gres_ptr->topo_core_bitmap[i])
6603 continue;
6604 core_ctld = bit_size(node_gres_ptr->
6605 topo_core_bitmap[i]);
6606 break;
6607 }
6608 }
6609
6610 alloc_core_bitmap = bit_alloc(core_ctld);
6611 if (core_bitmap) {
6612 for (j = 0; j < core_ctld; j++) {
6613 if (bit_test(core_bitmap, core_start_bit + j))
6614 bit_set(alloc_core_bitmap, j);
6615 }
6616 } else {
6617 bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
6618 }
6619
6620 avail_core_bitmap = bit_copy(alloc_core_bitmap);
6621 cores_addnt = xcalloc(node_gres_ptr->topo_cnt,
6622 sizeof(uint32_t));
6623 cores_avail = xcalloc(node_gres_ptr->topo_cnt,
6624 sizeof(uint32_t));
6625 for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
6626 if (node_gres_ptr->topo_gres_cnt_avail[i] == 0)
6627 continue;
6628 if (use_busy_dev &&
6629 (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
6630 continue;
6631 if (!use_total_gres &&
6632 (node_gres_ptr->topo_gres_cnt_alloc[i] >=
6633 node_gres_ptr->topo_gres_cnt_avail[i]))
6634 continue;
6635 if (job_gres_ptr->type_name &&
6636 (!node_gres_ptr->topo_type_name[i] ||
6637 (node_gres_ptr->topo_type_id[i] !=
6638 job_gres_ptr->type_id)))
6639 continue;
6640 if (!node_gres_ptr->topo_core_bitmap[i]) {
6641 cores_avail[i] = core_end_bit -
6642 core_start_bit + 1;
6643 continue;
6644 }
6645 core_size = bit_size(node_gres_ptr->topo_core_bitmap[i]);
6646 for (j = 0; j < core_size; j++) {
6647 if (core_bitmap &&
6648 !bit_test(core_bitmap, core_start_bit + j))
6649 continue;
6650 if (bit_test(node_gres_ptr->
6651 topo_core_bitmap[i], j)) {
6652 cores_avail[i]++;
6653 }
6654 }
6655 }
6656
6657 /* Pick the topology entries with the most cores available */
6658 gres_avail = 0;
6659 gres_total = 0;
6660 while (gres_avail < min_gres_node) {
6661 top_inx = -1;
6662 for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
6663 if ((gres_avail == 0) || (cores_avail[j] == 0) ||
6664 !node_gres_ptr->topo_core_bitmap[j]) {
6665 cores_addnt[j] = cores_avail[j];
6666 } else {
6667 cores_addnt[j] = cores_avail[j] -
6668 bit_overlap(alloc_core_bitmap,
6669 node_gres_ptr->
6670 topo_core_bitmap[j]);
6671 }
6672
6673 if (top_inx == -1) {
6674 if (cores_avail[j])
6675 top_inx = j;
6676 } else if (cores_addnt[j] > cores_addnt[top_inx])
6677 top_inx = j;
6678 }
6679 if ((top_inx < 0) || (cores_avail[top_inx] == 0)) {
6680 if (gres_total < min_gres_node)
6681 core_cnt = 0;
6682 break;
6683 }
6684 cores_avail[top_inx] = 0; /* Flag as used */
6685 gres_tmp = node_gres_ptr->topo_gres_cnt_avail[top_inx];
6686 if (!use_total_gres &&
6687 (gres_tmp >=
6688 node_gres_ptr->topo_gres_cnt_alloc[top_inx])) {
6689 gres_tmp -= node_gres_ptr->
6690 topo_gres_cnt_alloc[top_inx];
6691 } else if (!use_total_gres) {
6692 gres_tmp = 0;
6693 }
6694 if (gres_tmp == 0) {
6695 error("gres/%s: topology allocation error on node %s",
6696 gres_name, node_name);
6697 break;
6698 }
6699 /* update counts of allocated cores and GRES */
6700 if (shared_gres) {
6701 /*
6702 * Process outside of loop after specific
6703 * device selected
6704 */
6705 } else if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
6706 bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
6707 } else if (gres_avail) {
6708 bit_or(alloc_core_bitmap,
6709 node_gres_ptr->
6710 topo_core_bitmap[top_inx]);
6711 if (core_bitmap)
6712 bit_and(alloc_core_bitmap,
6713 avail_core_bitmap);
6714 } else {
6715 bit_and(alloc_core_bitmap,
6716 node_gres_ptr->
6717 topo_core_bitmap[top_inx]);
6718 }
6719 if (shared_gres) {
6720 gres_total = MAX(gres_total, gres_tmp);
6721 gres_avail = gres_total;
6722 } else {
6723 /*
6724 * Available GRES count is up to gres_tmp,
6725 * but take 1 per loop to maximize available
6726 * core count
6727 */
6728 gres_avail += 1;
6729 gres_total += gres_tmp;
6730 core_cnt = bit_set_count(alloc_core_bitmap);
6731 }
6732 }
6733 if (shared_gres && (top_inx >= 0) &&
6734 (gres_avail >= min_gres_node)) {
6735 if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
6736 bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
6737 } else {
6738 bit_or(alloc_core_bitmap,
6739 node_gres_ptr->
6740 topo_core_bitmap[top_inx]);
6741 if (core_bitmap)
6742 bit_and(alloc_core_bitmap,
6743 avail_core_bitmap);
6744 }
6745 core_cnt = bit_set_count(alloc_core_bitmap);
6746 }
6747 if (core_bitmap && (core_cnt > 0)) {
6748 *topo_set = true;
6749 for (i = 0; i < core_ctld; i++) {
6750 if (!bit_test(alloc_core_bitmap, i)) {
6751 bit_clear(core_bitmap,
6752 core_start_bit + i);
6753 }
6754 }
6755 }
6756 FREE_NULL_BITMAP(alloc_core_bitmap);
6757 FREE_NULL_BITMAP(avail_core_bitmap);
6758 xfree(cores_addnt);
6759 xfree(cores_avail);
6760 return core_cnt;
6761 } else if (job_gres_ptr->type_name) {
6762 for (i = 0; i < node_gres_ptr->type_cnt; i++) {
6763 if (node_gres_ptr->type_name[i] &&
6764 (node_gres_ptr->type_id[i] ==
6765 job_gres_ptr->type_id))
6766 break;
6767 }
6768 if (i >= node_gres_ptr->type_cnt)
6769 return (uint32_t) 0; /* no such type */
6770 gres_avail = node_gres_ptr->type_cnt_avail[i];
6771 if (!use_total_gres)
6772 gres_avail -= node_gres_ptr->type_cnt_alloc[i];
6773 gres_tmp = node_gres_ptr->gres_cnt_avail;
6774 if (!use_total_gres)
6775 gres_tmp -= node_gres_ptr->gres_cnt_alloc;
6776 gres_avail = MIN(gres_avail, gres_tmp);
6777 if (min_gres_node > gres_avail)
6778 return (uint32_t) 0; /* insufficient GRES avail */
6779 return NO_VAL;
6780 } else {
6781 gres_avail = node_gres_ptr->gres_cnt_avail;
6782 if (!use_total_gres)
6783 gres_avail -= node_gres_ptr->gres_cnt_alloc;
6784 if (min_gres_node > gres_avail)
6785 return (uint32_t) 0; /* insufficient GRES avail */
6786 return NO_VAL;
6787 }
6788 }
6789
6790 /*
6791 * Clear the core_bitmap for cores which are not usable by this job (i.e. for
6792 * cores which are already bound to other jobs or lack GRES)
6793 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
6794 * IN node_gres_list - node's gres_list built by
6795 * gres_plugin_node_config_validate()
6796 * IN use_total_gres - if set then consider all GRES resources as available,
 *			and none are committed to running jobs
6798 * IN/OUT core_bitmap - Identification of available cores (NULL if no restriction)
6799 * IN core_start_bit - index into core_bitmap for this node's first cores
6800 * IN core_end_bit - index into core_bitmap for this node's last cores
6801 */
extern void gres_plugin_job_core_filter(List job_gres_list, List node_gres_list,
					bool use_total_gres,
					bitstr_t *core_bitmap,
					int core_start_bit, int core_end_bit,
					char *node_name)
{
	ListIterator iter;
	gres_state_t *job_gres_state, *node_gres_state;
	int ctx;

	/* Nothing to filter without job GRES or a core bitmap to modify */
	if (!job_gres_list || !core_bitmap)
		return;
	/* A node with no GRES at all can not satisfy any GRES request */
	if (!node_gres_list) {
		bit_nclear(core_bitmap, core_start_bit, core_end_bit);
		return;
	}

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(job_gres_list);
	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
		node_gres_state = list_find_first(node_gres_list,
						  _gres_find_id,
						  &job_gres_state->plugin_id);
		if (!node_gres_state) {
			/* Node lacks a GRES type required by the job */
			bit_nclear(core_bitmap, core_start_bit, core_end_bit);
			break;
		}

		/* Locate the matching plugin context, then filter cores */
		for (ctx = 0; ctx < gres_context_cnt; ctx++) {
			if (job_gres_state->plugin_id ==
			    gres_context[ctx].plugin_id)
				break;
		}
		if (ctx < gres_context_cnt) {
			_job_core_filter(job_gres_state->gres_data,
					 node_gres_state->gres_data,
					 use_total_gres, core_bitmap,
					 core_start_bit, core_end_bit,
					 gres_context[ctx].gres_name,
					 node_name,
					 job_gres_state->plugin_id);
		}
	}
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);
}
6850
6851 /*
6852 * Determine how many cores on the node can be used by this job
6853 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
6854 * IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate()
6855 * IN use_total_gres - if set then consider all gres resources as available,
 *	and none are committed to running jobs
6857 * IN core_bitmap - Identification of available cores (NULL if no restriction)
6858 * IN core_start_bit - index into core_bitmap for this node's first core
6859 * IN core_end_bit - index into core_bitmap for this node's last core
6860 * IN job_id - job's ID (for logging)
6861 * IN node_name - name of the node (for logging)
 * IN disable_binding - --gres-flags=disable-binding
6863 * RET: NO_VAL - All cores on node are available
6864 * otherwise - Count of available cores
6865 */
extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list,
				     bool use_total_gres, bitstr_t *core_bitmap,
				     int core_start_bit, int core_end_bit,
				     uint32_t job_id, char *node_name,
				     bool disable_binding)
{
	uint32_t min_core_cnt = NO_VAL;
	ListIterator iter;
	gres_state_t *job_gres_state, *node_gres_state;
	bool topo_set = false;
	int ctx;

	if (!job_gres_list)
		return NO_VAL;	/* No GRES requested: no core restriction */
	if (!node_gres_list)
		return 0;	/* Node has no GRES: no usable cores */

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(job_gres_list);
	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
		uint32_t cnt;
		node_gres_state = list_find_first(node_gres_list,
						  _gres_find_id,
						  &job_gres_state->plugin_id);
		if (!node_gres_state) {
			/* Node lacks a GRES type required by the job */
			min_core_cnt = 0;
			break;
		}

		/* Locate the matching plugin context for this GRES */
		for (ctx = 0; ctx < gres_context_cnt; ctx++) {
			if (job_gres_state->plugin_id ==
			    gres_context[ctx].plugin_id)
				break;
		}
		if (ctx < gres_context_cnt) {
			cnt = _job_test(job_gres_state->gres_data,
					node_gres_state->gres_data,
					use_total_gres, core_bitmap,
					core_start_bit, core_end_bit,
					&topo_set, job_id, node_name,
					gres_context[ctx].gres_name,
					gres_context[ctx].plugin_id,
					disable_binding);
			/* Track the most restrictive core count seen */
			if (cnt != NO_VAL) {
				if (min_core_cnt == NO_VAL)
					min_core_cnt = cnt;
				else
					min_core_cnt = MIN(cnt, min_core_cnt);
			}
		}
		if (min_core_cnt == 0)
			break;	/* Already unusable; no point continuing */
	}
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);

	return min_core_cnt;
}
6925
_sock_gres_del(void * x)6926 static void _sock_gres_del(void *x)
6927 {
6928 sock_gres_t *sock_gres = (sock_gres_t *) x;
6929 int s;
6930
6931 if (sock_gres) {
6932 FREE_NULL_BITMAP(sock_gres->bits_any_sock);
6933 if (sock_gres->bits_by_sock) {
6934 for (s = 0; s < sock_gres->sock_cnt; s++)
6935 FREE_NULL_BITMAP(sock_gres->bits_by_sock[s]);
6936 xfree(sock_gres->bits_by_sock);
6937 }
6938 xfree(sock_gres->cnt_by_sock);
6939 xfree(sock_gres->gres_name);
6940 /* NOTE: sock_gres->job_specs is just a pointer, do not free */
6941 xfree(sock_gres->type_name);
6942 xfree(sock_gres);
6943 }
6944 }
6945
6946 /*
6947 * Build a string containing the GRES details for a given node and socket
6948 * sock_gres_list IN - List of sock_gres_t entries
6949 * sock_inx IN - zero-origin socket for which information is to be returned
6950 * if value < 0, then report GRES unconstrained by core
6951 * RET string, must call xfree() to release memory
6952 */
extern char *gres_plugin_sock_str(List sock_gres_list, int sock_inx)
{
	ListIterator iter;
	sock_gres_t *sock_gres;
	char *gres_str = NULL, *sep = "";
	uint64_t cnt;

	if (!sock_gres_list)
		return NULL;

	iter = list_iterator_create(sock_gres_list);
	while ((sock_gres = (sock_gres_t *) list_next(iter))) {
		if (sock_inx < 0) {
			/* Negative index: report GRES unconstrained by core */
			cnt = sock_gres->cnt_any_sock;
			if (cnt == 0)
				continue;
		} else {
			/* Report GRES bound to the requested socket */
			if (!sock_gres->cnt_by_sock)
				continue;
			cnt = sock_gres->cnt_by_sock[sock_inx];
			if (cnt == 0)
				continue;
		}
		if (sock_gres->type_name) {
			xstrfmtcat(gres_str, "%s%s:%s:%"PRIu64, sep,
				   sock_gres->gres_name, sock_gres->type_name,
				   cnt);
		} else {
			xstrfmtcat(gres_str, "%s%s:%"PRIu64, sep,
				   sock_gres->gres_name, cnt);
		}
		sep = " ";
	}
	list_iterator_destroy(iter);
	return gres_str;
}
6997
6998 /*
6999 * Determine how many GRES of a given type can be used by this job on a
7000 * given node and return a structure with the details. Note that multiple
7001 * GRES of a given type model can be distributed over multiple topo structures,
7002 * so we need to OR the core_bitmap over all of them.
7003 */
static sock_gres_t *_build_sock_gres_by_topo(gres_job_state_t *job_gres_ptr,
				gres_node_state_t *node_gres_ptr,
				bool use_total_gres, bitstr_t *core_bitmap,
				uint16_t sockets, uint16_t cores_per_sock,
				uint32_t job_id, char *node_name,
				bool enforce_binding, uint32_t s_p_n,
				bitstr_t **req_sock_map,
				uint32_t main_plugin_id, uint32_t alt_plugin_id,
				gres_node_state_t *alt_node_gres_ptr,
				uint32_t user_id, const uint32_t node_inx)
{
	/*
	 * NOTE(review): job_id, user_id and node_inx are not referenced in
	 * this body; presumably kept for interface symmetry with the other
	 * _build_sock_gres_* helpers — confirm before removing.
	 */
	int i, j, s, c, tot_cores;
	sock_gres_t *sock_gres;
	int64_t add_gres;
	uint64_t avail_gres, min_gres = 1;
	bool match = false;
	bool use_busy_dev = false;

	/* No GRES configured on this node at all */
	if (node_gres_ptr->gres_cnt_avail == 0)
		return NULL;

	if (!use_total_gres &&
	    (main_plugin_id == mps_plugin_id) &&
	    (node_gres_ptr->gres_cnt_alloc != 0)) {
		/* We must use the ONE already active GRES of this type */
		use_busy_dev = true;
	}

	/* Accumulator for per-socket GRES availability (zero-filled) */
	sock_gres = xmalloc(sizeof(sock_gres_t));
	sock_gres->sock_cnt = sockets;
	sock_gres->bits_by_sock = xcalloc(sockets, sizeof(bitstr_t *));
	sock_gres->cnt_by_sock = xcalloc(sockets, sizeof(uint64_t));

	/* Pass 1: fold each topology record into per-socket counts/bitmaps */
	for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
		bool use_all_sockets = false;
		if (job_gres_ptr->type_name &&
		    (job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i]))
			continue;	/* Wrong type_model */
		if (use_busy_dev &&
		    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
			continue;
		if (!use_total_gres && !node_gres_ptr->no_consume &&
		    (node_gres_ptr->topo_gres_cnt_alloc[i] >=
		     node_gres_ptr->topo_gres_cnt_avail[i])) {
			continue;	/* No GRES remaining */
		}

		/* Count still-unallocated GRES in this topology record */
		if (!use_total_gres && !node_gres_ptr->no_consume) {
			avail_gres = node_gres_ptr->topo_gres_cnt_avail[i] -
				     node_gres_ptr->topo_gres_cnt_alloc[i];
		} else {
			avail_gres = node_gres_ptr->topo_gres_cnt_avail[i];
		}
		if (avail_gres == 0)
			continue;

		/*
		 * Job requested GPUs or MPS. Filter out resources already
		 * allocated to the other GRES type.
		 */
		if (alt_node_gres_ptr && alt_node_gres_ptr->gres_bit_alloc &&
		    node_gres_ptr->topo_gres_bitmap[i]) {
			c = bit_overlap(node_gres_ptr->topo_gres_bitmap[i],
					alt_node_gres_ptr->gres_bit_alloc);
			if ((alt_plugin_id == gpu_plugin_id) && (c > 0))
				continue;
			if ((alt_plugin_id == mps_plugin_id) && (c > 0)) {
				avail_gres -= c;
				if (avail_gres == 0)
					continue;
			}
		}

		/* gres/mps can only use one GPU per node */
		if ((main_plugin_id == mps_plugin_id) &&
		    (avail_gres > sock_gres->max_node_gres))
			sock_gres->max_node_gres = avail_gres;

		/*
		 * If some GRES is available on every socket,
		 * treat like no topo_core_bitmap is specified
		 */
		tot_cores = sockets * cores_per_sock;
		if (node_gres_ptr->topo_core_bitmap &&
		    node_gres_ptr->topo_core_bitmap[i]) {
			use_all_sockets = true;
			for (s = 0; s < sockets; s++) {
				bool use_this_socket = false;
				for (c = 0; c < cores_per_sock; c++) {
					j = (s * cores_per_sock) + c;
					if (bit_test(node_gres_ptr->
						     topo_core_bitmap[i], j)) {
						use_this_socket = true;
						break;
					}
				}
				if (!use_this_socket) {
					use_all_sockets = false;
					break;
				}
			}
		}

		if (!node_gres_ptr->topo_core_bitmap ||
		    !node_gres_ptr->topo_core_bitmap[i] ||
		    use_all_sockets) {
			/*
			 * Not constrained by core, but only specific
			 * GRES may be available (save their bitmap)
			 */
			sock_gres->cnt_any_sock += avail_gres;
			sock_gres->total_cnt += avail_gres;
			if (!sock_gres->bits_any_sock) {
				sock_gres->bits_any_sock =
					bit_copy(node_gres_ptr->
						 topo_gres_bitmap[i]);
			} else {
				bit_or(sock_gres->bits_any_sock,
				       node_gres_ptr->topo_gres_bitmap[i]);
			}
			match = true;
			continue;
		}

		/* Constrained by core */
		if (core_bitmap)
			tot_cores = MIN(tot_cores, bit_size(core_bitmap));
		if (node_gres_ptr->topo_core_bitmap[i]) {
			tot_cores = MIN(tot_cores,
					bit_size(node_gres_ptr->
						 topo_core_bitmap[i]));
		}
		/* Credit this record's GRES to the first usable socket only */
		for (s = 0; ((s < sockets) && avail_gres); s++) {
			if (enforce_binding && core_bitmap) {
				for (c = 0; c < cores_per_sock; c++) {
					j = (s * cores_per_sock) + c;
					if (bit_test(core_bitmap, j))
						break;
				}
				if (c >= cores_per_sock) {
					/* No available cores on this socket */
					continue;
				}
			}
			for (c = 0; c < cores_per_sock; c++) {
				j = (s * cores_per_sock) + c;
				if (j >= tot_cores)
					break;	/* Off end of core bitmap */
				if (node_gres_ptr->topo_core_bitmap[i] &&
				    !bit_test(node_gres_ptr->topo_core_bitmap[i],
					      j))
					continue;
				if (!node_gres_ptr->topo_gres_bitmap[i]) {
					error("%s: topo_gres_bitmap NULL on node %s",
					      __func__, node_name);
					continue;
				}
				if (!sock_gres->bits_by_sock[s]) {
					sock_gres->bits_by_sock[s] =
						bit_copy(node_gres_ptr->
							 topo_gres_bitmap[i]);
				} else {
					bit_or(sock_gres->bits_by_sock[s],
					       node_gres_ptr->topo_gres_bitmap[i]);
				}
				sock_gres->cnt_by_sock[s] += avail_gres;
				sock_gres->total_cnt += avail_gres;
				avail_gres = 0;	/* all credited to socket s */
				match = true;
				break;
			}
		}
	}

	/* Process per-GRES limits */
	if (match && job_gres_ptr->gres_per_socket) {
		/*
		 * Clear core bitmap on sockets with insufficient GRES
		 * and disable excess GRES per socket
		 */
		for (s = 0; s < sockets; s++) {
			if (sock_gres->cnt_by_sock[s] <
			    job_gres_ptr->gres_per_socket) {
				/* Insufficient GRES, clear count */
				sock_gres->total_cnt -=
					sock_gres->cnt_by_sock[s];
				sock_gres->cnt_by_sock[s] = 0;
				if (enforce_binding && core_bitmap) {
					i = s * cores_per_sock;
					bit_nclear(core_bitmap, i,
						   i + cores_per_sock - 1);
				}
			} else if (sock_gres->cnt_by_sock[s] >
				   job_gres_ptr->gres_per_socket) {
				/* Excess GRES, reduce count */
				i = sock_gres->cnt_by_sock[s] -
				    job_gres_ptr->gres_per_socket;
				sock_gres->cnt_by_sock[s] =
					job_gres_ptr->gres_per_socket;
				sock_gres->total_cnt -= i;
			}
		}
	}

	/*
	 * Satisfy sockets-per-node (s_p_n) limit by selecting the sockets with
	 * the most GRES. Sockets with low GRES counts have their core_bitmap
	 * cleared so that _allocate_sc() in cons_tres/job_test.c does not
	 * remove sockets needed to satisfy the job's GRES specification.
	 */
	if (match && enforce_binding && core_bitmap && (s_p_n < sockets)) {
		int avail_sock = 0;
		bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
		/* Mark sockets that have both GRES and at least one core */
		for (s = 0; s < sockets; s++) {
			if (sock_gres->cnt_by_sock[s] == 0)
				continue;
			for (c = 0; c < cores_per_sock; c++) {
				i = (s * cores_per_sock) + c;
				if (!bit_test(core_bitmap, i))
					continue;
				avail_sock++;
				avail_sock_flag[s] = true;
				break;
			}
		}
		/* Repeatedly drop the socket with the fewest GRES */
		while (avail_sock > s_p_n) {
			int low_gres_sock_inx = -1;
			for (s = 0; s < sockets; s++) {
				if (!avail_sock_flag[s])
					continue;
				if ((low_gres_sock_inx == -1) ||
				    (sock_gres->cnt_by_sock[s] <
				     sock_gres->cnt_by_sock[low_gres_sock_inx]))
					low_gres_sock_inx = s;
			}
			if (low_gres_sock_inx == -1)
				break;
			s = low_gres_sock_inx;
			i = s * cores_per_sock;
			bit_nclear(core_bitmap, i, i + cores_per_sock - 1);
			sock_gres->total_cnt -= sock_gres->cnt_by_sock[s];
			sock_gres->cnt_by_sock[s] = 0;
			avail_sock--;
			avail_sock_flag[s] = false;
		}
		xfree(avail_sock_flag);
	}

	/* Verify the node can still supply the job's minimum GRES need */
	if (match) {
		if (job_gres_ptr->gres_per_node)
			min_gres = job_gres_ptr->gres_per_node;
		if (job_gres_ptr->gres_per_task)
			min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
		if (sock_gres->total_cnt < min_gres)
			match = false;
	}


	/*
	 * If sockets-per-node (s_p_n) not specified then identify sockets
	 * which are required to satisfy gres_per_node or task specification
	 * so that allocated tasks can be distributed over multiple sockets
	 * if necessary.
	 */
	add_gres = min_gres - sock_gres->cnt_any_sock;
	if (match && core_bitmap && (s_p_n == NO_VAL) && (add_gres > 0) &&
	    job_gres_ptr->gres_per_node) {
		int avail_sock = 0, best_sock_inx = -1;
		bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
		for (s = 0; s < sockets; s++) {
			if (sock_gres->cnt_by_sock[s] == 0)
				continue;
			for (c = 0; c < cores_per_sock; c++) {
				i = (s * cores_per_sock) + c;
				if (!bit_test(core_bitmap, i))
					continue;
				avail_sock++;
				avail_sock_flag[s] = true;
				if ((best_sock_inx == -1) ||
				    (sock_gres->cnt_by_sock[s] >
				     sock_gres->cnt_by_sock[best_sock_inx])) {
					best_sock_inx = s;
				}
				break;
			}
		}
		/* Greedily mark highest-GRES sockets until need is met */
		while ((best_sock_inx != -1) && (add_gres > 0)) {
			if (*req_sock_map == NULL)
				*req_sock_map = bit_alloc(sockets);
			bit_set(*req_sock_map, best_sock_inx);
			add_gres -= sock_gres->cnt_by_sock[best_sock_inx];
			avail_sock_flag[best_sock_inx] = false;
			if (add_gres <= 0)
				break;
			/* Find next best socket */
			best_sock_inx = -1;
			for (s = 0; s < sockets; s++) {
				if ((sock_gres->cnt_by_sock[s] == 0) ||
				    !avail_sock_flag[s])
					continue;
				if ((best_sock_inx == -1) ||
				    (sock_gres->cnt_by_sock[s] >
				     sock_gres->cnt_by_sock[best_sock_inx])) {
					best_sock_inx = s;
				}
			}
		}
		xfree(avail_sock_flag);
	}

	if (match) {
		sock_gres->type_id = job_gres_ptr->type_id;
		sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
	} else {
		/* No usable GRES found; release the accumulator */
		_sock_gres_del(sock_gres);
		sock_gres = NULL;
	}
	return sock_gres;
}
7322
7323 /*
7324 * Determine how many GRES of a given type can be used by this job on a
7325 * given node and return a structure with the details. Note that multiple
7326 * GRES of a given type model can be configured, so pick the right one.
7327 */
_build_sock_gres_by_type(gres_job_state_t * job_gres_ptr,gres_node_state_t * node_gres_ptr,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name)7328 static sock_gres_t *_build_sock_gres_by_type(gres_job_state_t *job_gres_ptr,
7329 gres_node_state_t *node_gres_ptr,
7330 bool use_total_gres, bitstr_t *core_bitmap,
7331 uint16_t sockets, uint16_t cores_per_sock,
7332 uint32_t job_id, char *node_name)
7333 {
7334 int i;
7335 sock_gres_t *sock_gres;
7336 uint64_t avail_gres, min_gres = 1, gres_tmp;
7337 bool match = false;
7338
7339 if (job_gres_ptr->gres_per_node)
7340 min_gres = job_gres_ptr-> gres_per_node;
7341 if (job_gres_ptr->gres_per_socket)
7342 min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
7343 if (job_gres_ptr->gres_per_task)
7344 min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
7345 sock_gres = xmalloc(sizeof(sock_gres_t));
7346 for (i = 0; i < node_gres_ptr->type_cnt; i++) {
7347 if (job_gres_ptr->type_name &&
7348 (job_gres_ptr->type_id != node_gres_ptr->type_id[i]))
7349 continue; /* Wrong type_model */
7350 if (!use_total_gres &&
7351 (node_gres_ptr->type_cnt_alloc[i] >=
7352 node_gres_ptr->type_cnt_avail[i])) {
7353 continue; /* No GRES remaining */
7354 } else if (!use_total_gres) {
7355 avail_gres = node_gres_ptr->type_cnt_avail[i] -
7356 node_gres_ptr->type_cnt_alloc[i];
7357 } else {
7358 avail_gres = node_gres_ptr->type_cnt_avail[i];
7359 }
7360 gres_tmp = node_gres_ptr->gres_cnt_avail;
7361 if (!use_total_gres)
7362 gres_tmp -= node_gres_ptr->gres_cnt_alloc;
7363 avail_gres = MIN(avail_gres, gres_tmp);
7364 if (avail_gres < min_gres)
7365 continue; /* Insufficient GRES remaining */
7366 sock_gres->cnt_any_sock += avail_gres;
7367 sock_gres->total_cnt += avail_gres;
7368 match = true;
7369 }
7370 if (match) {
7371 sock_gres->type_id = job_gres_ptr->type_id;
7372 sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
7373 } else
7374 xfree(sock_gres);
7375
7376 return sock_gres;
7377 }
7378
7379 /*
7380 * Determine how many GRES of a given type can be used by this job on a
7381 * given node and return a structure with the details. No GRES type.
7382 */
_build_sock_gres_basic(gres_job_state_t * job_gres_ptr,gres_node_state_t * node_gres_ptr,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name)7383 static sock_gres_t *_build_sock_gres_basic(gres_job_state_t *job_gres_ptr,
7384 gres_node_state_t *node_gres_ptr,
7385 bool use_total_gres, bitstr_t *core_bitmap,
7386 uint16_t sockets, uint16_t cores_per_sock,
7387 uint32_t job_id, char *node_name)
7388 {
7389 sock_gres_t *sock_gres;
7390 uint64_t avail_gres, min_gres = 1;
7391
7392 if (job_gres_ptr->type_name)
7393 return NULL;
7394 if (!use_total_gres &&
7395 (node_gres_ptr->gres_cnt_alloc >= node_gres_ptr->gres_cnt_avail))
7396 return NULL; /* No GRES remaining */
7397
7398 if (job_gres_ptr->gres_per_node)
7399 min_gres = job_gres_ptr-> gres_per_node;
7400 if (job_gres_ptr->gres_per_socket)
7401 min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
7402 if (job_gres_ptr->gres_per_task)
7403 min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
7404 if (!use_total_gres) {
7405 avail_gres = node_gres_ptr->gres_cnt_avail -
7406 node_gres_ptr->gres_cnt_alloc;
7407 } else
7408 avail_gres = node_gres_ptr->gres_cnt_avail;
7409 if (avail_gres < min_gres)
7410 return NULL; /* Insufficient GRES remaining */
7411
7412 sock_gres = xmalloc(sizeof(sock_gres_t));
7413 sock_gres->cnt_any_sock += avail_gres;
7414 sock_gres->total_cnt += avail_gres;
7415
7416 return sock_gres;
7417 }
7418
/*
 * Log the contents of a sock_gres_t list for a node (debugging aid).
 * Fix: tmp/len are now reset for EVERY list entry; previously an entry
 * without bits_any_sock reported the stale bitmap string and size left
 * over from a prior entry's per-socket loop in its Sock[ANY] line.
 */
static void _sock_gres_log(List sock_gres_list, char *node_name)
{
	sock_gres_t *sock_gres;
	ListIterator iter;
	int i, len;
	char tmp[32];

	if (!sock_gres_list)
		return;

	info("Sock_gres state for %s", node_name);
	iter = list_iterator_create(sock_gres_list);
	while ((sock_gres = (sock_gres_t *) list_next(iter))) {
		/* Reset per entry so no stale bitmap info is reported */
		tmp[0] = '\0';
		len = -1;
		info("Gres:%s Type:%s TotalCnt:%"PRIu64" MaxNodeGres:%"PRIu64,
		     sock_gres->gres_name, sock_gres->type_name,
		     sock_gres->total_cnt, sock_gres->max_node_gres);
		if (sock_gres->bits_any_sock) {
			bit_fmt(tmp, sizeof(tmp), sock_gres->bits_any_sock);
			len = bit_size(sock_gres->bits_any_sock);
		}
		info("  Sock[ANY]Cnt:%"PRIu64" Bits:%s of %d",
		     sock_gres->cnt_any_sock, tmp, len);

		for (i = 0; i < sock_gres->sock_cnt; i++) {
			if (sock_gres->cnt_by_sock[i] == 0)
				continue;
			tmp[0] = '\0';
			len = -1;
			if (sock_gres->bits_by_sock &&
			    sock_gres->bits_by_sock[i]) {
				bit_fmt(tmp, sizeof(tmp),
					sock_gres->bits_by_sock[i]);
				len = bit_size(sock_gres->bits_by_sock[i]);
			}
			info("  Sock[%d]Cnt:%"PRIu64" Bits:%s of %d", i,
			     sock_gres->cnt_by_sock[i], tmp, len);
		}
	}
	list_iterator_destroy(iter);
}
7459
7460 /*
7461 * Determine how many cores on each socket of a node can be used by this job
7462 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
7463 * IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate()
7464 * IN use_total_gres - if set then consider all gres resources as available,
 *	and none are committed to running jobs
7466 * IN/OUT core_bitmap - Identification of available cores on this node
7467 * IN sockets - Count of sockets on the node
7468 * IN cores_per_sock - Count of cores per socket on this node
7469 * IN job_id - job's ID (for logging)
7470 * IN node_name - name of the node (for logging)
7471 * IN enforce_binding - if true then only use GRES with direct access to cores
7472 * IN s_p_n - Expected sockets_per_node (NO_VAL if not limited)
 * OUT req_sock_map - bitmap of specific required sockets
7474 * IN user_id - job's user ID
7475 * IN node_inx - index of node to be evaluated
7476 * RET: List of sock_gres_t entries identifying what resources are available on
7477 * each socket. Returns NULL if none available. Call FREE_NULL_LIST() to
7478 * release memory.
7479 */
extern List gres_plugin_job_test2(List job_gres_list, List node_gres_list,
				  bool use_total_gres, bitstr_t *core_bitmap,
				  uint16_t sockets, uint16_t cores_per_sock,
				  uint32_t job_id, char *node_name,
				  bool enforce_binding, uint32_t s_p_n,
				  bitstr_t **req_sock_map, uint32_t user_id,
				  const uint32_t node_inx)
{
	List sock_gres_list = NULL;
	ListIterator job_gres_iter;
	gres_state_t *job_gres_ptr, *node_gres_ptr;
	gres_job_state_t *job_data_ptr;
	gres_node_state_t *node_data_ptr;
	uint32_t local_s_p_n;

	if (!job_gres_list || (list_count(job_gres_list) == 0))
		return sock_gres_list;
	if (!node_gres_list)	/* Node lacks GRES to match */
		return sock_gres_list;
	(void) gres_plugin_init();

	sock_gres_list = list_create(_sock_gres_del);
	slurm_mutex_lock(&gres_context_lock);
	job_gres_iter = list_iterator_create(job_gres_list);
	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
		sock_gres_t *sock_gres = NULL;
		node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
						&job_gres_ptr->plugin_id);
		if (node_gres_ptr == NULL) {
			/* node lacks GRES of type required by the job */
			FREE_NULL_LIST(sock_gres_list);
			break;
		}
		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
		node_data_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;

		if (job_data_ptr->gres_per_job &&
		    !job_data_ptr->gres_per_socket)
			local_s_p_n = s_p_n;	/* Maximize GRES per node */
		else
			local_s_p_n = NO_VAL;	/* No need to optimize socket */
		if (core_bitmap && (bit_ffs(core_bitmap) == -1)) {
			sock_gres = NULL;	/* No cores available */
		} else if (node_data_ptr->topo_cnt) {
			uint32_t alt_plugin_id = 0;
			gres_node_state_t *alt_node_data_ptr = NULL;
			/*
			 * GPU and MPS share devices; identify the "other"
			 * plugin so already-allocated devices are excluded
			 */
			if (!use_total_gres && have_gpu && have_mps) {
				if (job_gres_ptr->plugin_id == gpu_plugin_id)
					alt_plugin_id = mps_plugin_id;
				if (job_gres_ptr->plugin_id == mps_plugin_id)
					alt_plugin_id = gpu_plugin_id;
			}
			if (alt_plugin_id) {
				node_gres_ptr = list_find_first(node_gres_list,
								_gres_find_id,
								&alt_plugin_id);
			}
			if (alt_plugin_id && node_gres_ptr) {
				alt_node_data_ptr = (gres_node_state_t *)
						    node_gres_ptr->gres_data;
			} else {
				/* GRES of interest not on this node */
				alt_plugin_id = 0;
			}
			sock_gres = _build_sock_gres_by_topo(job_data_ptr,
					node_data_ptr, use_total_gres,
					core_bitmap, sockets, cores_per_sock,
					job_id, node_name, enforce_binding,
					local_s_p_n, req_sock_map,
					job_gres_ptr->plugin_id,
					alt_plugin_id, alt_node_data_ptr,
					user_id, node_inx);
		} else if (node_data_ptr->type_cnt) {
			sock_gres = _build_sock_gres_by_type(job_data_ptr,
					node_data_ptr, use_total_gres,
					core_bitmap, sockets, cores_per_sock,
					job_id, node_name);
		} else {
			sock_gres = _build_sock_gres_basic(job_data_ptr,
					node_data_ptr, use_total_gres,
					core_bitmap, sockets, cores_per_sock,
					job_id, node_name);
		}
		if (!sock_gres) {
			/* node lacks available resources required by the job */
			if (core_bitmap)  /* may be NULL (no core restriction) */
				bit_clear_all(core_bitmap);
			FREE_NULL_LIST(sock_gres_list);
			break;
		}
		sock_gres->job_specs = job_data_ptr;
		sock_gres->gres_name = xstrdup(job_data_ptr->gres_name);
		sock_gres->node_specs = node_data_ptr;
		sock_gres->plugin_id = job_gres_ptr->plugin_id;
		list_append(sock_gres_list, sock_gres);
	}
	list_iterator_destroy(job_gres_iter);
	slurm_mutex_unlock(&gres_context_lock);

	if (gres_debug)
		_sock_gres_log(sock_gres_list, node_name);

	return sock_gres_list;
}
7583
_build_avail_cores_by_sock(bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock)7584 static bool *_build_avail_cores_by_sock(bitstr_t *core_bitmap,
7585 uint16_t sockets,
7586 uint16_t cores_per_sock)
7587 {
7588 bool *avail_cores_by_sock = xcalloc(sockets, sizeof(bool));
7589 int s, c, i, lim = 0;
7590
7591 lim = bit_size(core_bitmap);
7592 for (s = 0; s < sockets; s++) {
7593 for (c = 0; c < cores_per_sock; c++) {
7594 i = (s * cores_per_sock) + c;
7595 if (i >= lim)
7596 goto fini; /* should never happen */
7597 if (bit_test(core_bitmap, i)) {
7598 avail_cores_by_sock[s] = true;
7599 break;
7600 }
7601 }
7602 }
7603
7604 fini: return avail_cores_by_sock;
7605 }
7606
/*
 * Determine which GRES can be used on this node given the available cores.
 * Filter out unusable GRES.
 * IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2()
 * IN avail_mem - memory available for the job
 * IN max_cpus - maximum CPUs available on this node (limited by
 *		 specialized cores and partition CPUs-per-node)
 * IN enforce_binding - GRES must be co-allocated with cores
 * IN core_bitmap - Identification of available cores on this node
 * IN sockets - Count of sockets on the node
 * IN cores_per_sock - Count of cores per socket on this node
 * IN cpus_per_core - Count of CPUs per core on this node
 * IN sock_per_node - sockets requested by job per node or NO_VAL
 * IN task_per_node - tasks requested by job per node or NO_VAL16
 * IN whole_node - we are requesting the whole node or not
 * OUT avail_gpus - Count of available GPUs on this node
 * OUT near_gpus - Count of GPUs available on sockets with available CPUs
 * RET - 0 if job can use this node, -1 otherwise (some GRES limit prevents use)
 */
extern int gres_plugin_job_core_filter2(List sock_gres_list, uint64_t avail_mem,
					uint16_t max_cpus,
					bool enforce_binding,
					bitstr_t *core_bitmap,
					uint16_t sockets,
					uint16_t cores_per_sock,
					uint16_t cpus_per_core,
					uint32_t sock_per_node,
					uint16_t task_per_node,
					bool whole_node,
					uint16_t *avail_gpus,
					uint16_t *near_gpus)
{
	ListIterator sock_gres_iter;
	sock_gres_t *sock_gres;
	bool *avail_cores_by_sock = NULL;
	uint64_t max_gres, mem_per_gres = 0, near_gres_cnt = 0;
	uint16_t cpus_per_gres;
	int s, rc = 0;

	*avail_gpus = 0;
	*near_gpus = 0;
	/* Nothing to filter if no cores or no GRES requested */
	if (!core_bitmap || !sock_gres_list ||
	    (list_count(sock_gres_list) == 0))
		return rc;

	sock_gres_iter = list_iterator_create(sock_gres_list);
	while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
		uint64_t min_gres = 1, tmp_u64;
		/*
		 * Compute min_gres: the smallest GRES count of this type the
		 * job needs on this node, from per-node/socket/task requests.
		 */
		if (sock_gres->job_specs) {
			gres_job_state_t *job_gres_ptr = sock_gres->job_specs;
			if (whole_node)
				min_gres = sock_gres->total_cnt;
			else if (job_gres_ptr->gres_per_node)
				min_gres = job_gres_ptr->gres_per_node;
			if (job_gres_ptr->gres_per_socket) {
				tmp_u64 = job_gres_ptr->gres_per_socket;
				if (sock_per_node != NO_VAL)
					tmp_u64 *= sock_per_node;
				min_gres = MAX(min_gres, tmp_u64);
			}
			if (job_gres_ptr->gres_per_task) {
				tmp_u64 = job_gres_ptr->gres_per_task;
				if (task_per_node != NO_VAL16)
					tmp_u64 *= task_per_node;
				min_gres = MAX(min_gres, tmp_u64);
			}
		}
		/* Explicit request overrides configured default */
		if (!sock_gres->job_specs)
			cpus_per_gres = 0;
		else if (sock_gres->job_specs->cpus_per_gres)
			cpus_per_gres = sock_gres->job_specs->cpus_per_gres;
		else
			cpus_per_gres = sock_gres->job_specs->def_cpus_per_gres;
		if (cpus_per_gres) {
			/* cpus_per_gres != 0 implies job_specs != NULL */
			max_gres = max_cpus / cpus_per_gres;
			if ((max_gres == 0) ||
			    (sock_gres->job_specs->gres_per_node > max_gres) ||
			    (sock_gres->job_specs->gres_per_task > max_gres) ||
			    (sock_gres->job_specs->gres_per_socket > max_gres)){
				/* Insufficient CPUs for any GRES */
				rc = -1;
				break;
			}
		}
		if (!sock_gres->job_specs)
			mem_per_gres = 0;
		else if (sock_gres->job_specs->mem_per_gres)
			mem_per_gres = sock_gres->job_specs->mem_per_gres;
		else
			mem_per_gres = sock_gres->job_specs->def_mem_per_gres;
		/*
		 * Memory limits the usable GRES count on this node.
		 * NOTE: skipped entirely when avail_mem == 0.
		 */
		if (mem_per_gres && avail_mem) {
			if (mem_per_gres <= avail_mem) {
				sock_gres->max_node_gres = avail_mem /
							   mem_per_gres;
			} else { /* Insufficient memory for any GRES */
				rc = -1;
				break;
			}
		}
		/* Lazily build per-socket core availability flags */
		if (sock_gres->cnt_by_sock || enforce_binding) {
			if (!avail_cores_by_sock) {
				avail_cores_by_sock =_build_avail_cores_by_sock(
							core_bitmap, sockets,
							cores_per_sock);
			}
		}
		/*
		 * NOTE: gres_per_socket enforcement is performed by
		 * _build_sock_gres_by_topo(), called by gres_plugin_job_test2()
		 */
		if (sock_gres->cnt_by_sock && enforce_binding) {
			/*
			 * With enforced binding, GRES on sockets lacking
			 * available cores are permanently removed.
			 */
			for (s = 0; s < sockets; s++) {
				if (avail_cores_by_sock[s] == 0) {
					sock_gres->total_cnt -=
						sock_gres->cnt_by_sock[s];
					sock_gres->cnt_by_sock[s] = 0;
				}
			}
			near_gres_cnt = sock_gres->total_cnt;
		} else if (sock_gres->cnt_by_sock) { /* NO enforce_binding */
			/* Only the "near" count excludes core-less sockets */
			near_gres_cnt = sock_gres->total_cnt;
			for (s = 0; s < sockets; s++) {
				if (avail_cores_by_sock[s] == 0) {
					near_gres_cnt -=
						sock_gres->cnt_by_sock[s];
				}
			}
		} else {
			near_gres_cnt = sock_gres->total_cnt;
		}
		/* Cap max_node_gres at the per-node request (if smaller) */
		if (sock_gres->job_specs && !whole_node &&
		    sock_gres->job_specs->gres_per_node) {
			if ((sock_gres->max_node_gres == 0) ||
			    (sock_gres->max_node_gres >
			     sock_gres->job_specs->gres_per_node)) {
				sock_gres->max_node_gres =
					sock_gres->job_specs->gres_per_node;
			}
		}
		/* Cap max_node_gres at what the available CPUs can drive */
		if (cpus_per_gres) {
			int cpu_cnt;
			cpu_cnt = bit_set_count(core_bitmap);
			cpu_cnt *= cpus_per_core;
			max_gres = cpu_cnt / cpus_per_gres;
			if (max_gres == 0) {
				rc = -1;
				break;
			} else if ((sock_gres->max_node_gres == 0) ||
				   (sock_gres->max_node_gres > max_gres)) {
				sock_gres->max_node_gres = max_gres;
			}
		}
		/*
		 * Cap total_cnt at what memory allows.
		 * NOTE(review): with avail_mem == 0 this zeroes total_cnt and
		 * rejects the node below, unlike the earlier avail_mem check
		 * which is skipped — confirm this asymmetry is intended.
		 */
		if (mem_per_gres) {
			max_gres = avail_mem / mem_per_gres;
			sock_gres->total_cnt = MIN(sock_gres->total_cnt,
						   max_gres);
		}
		/* Reject node if it cannot supply the minimum GRES needed */
		if ((sock_gres->total_cnt < min_gres) ||
		    ((sock_gres->max_node_gres != 0) &&
		     (sock_gres->max_node_gres < min_gres))) {
			rc = -1;
			break;
		}

		/* Accumulate GPU counts for GPU-sharing GRES (e.g. gpu/mps) */
		if (_sharing_gres(sock_gres->plugin_id)) {
			*avail_gpus += sock_gres->total_cnt;
			if (sock_gres->max_node_gres &&
			    (sock_gres->max_node_gres < near_gres_cnt))
				near_gres_cnt = sock_gres->max_node_gres;
			if (*near_gpus < 0xff)	/* avoid overflow */
				*near_gpus += near_gres_cnt;
		}
	}
	list_iterator_destroy(sock_gres_iter);
	xfree(avail_cores_by_sock);

	return rc;
}
7785
7786 /* Order GRES scheduling. Schedule GRES requiring specific sockets first */
_sock_gres_sort(void * x,void * y)7787 static int _sock_gres_sort(void *x, void *y)
7788 {
7789 sock_gres_t *sock_gres1 = *(sock_gres_t **) x;
7790 sock_gres_t *sock_gres2 = *(sock_gres_t **) y;
7791 int weight1 = 0, weight2 = 0;
7792
7793 if (sock_gres1->node_specs && !sock_gres1->node_specs->topo_cnt)
7794 weight1 += 0x02;
7795 if (sock_gres1->job_specs && !sock_gres1->job_specs->gres_per_socket)
7796 weight1 += 0x01;
7797
7798 if (sock_gres2->node_specs && !sock_gres2->node_specs->topo_cnt)
7799 weight2 += 0x02;
7800 if (sock_gres2->job_specs && !sock_gres2->job_specs->gres_per_socket)
7801 weight2 += 0x01;
7802
7803 return weight1 - weight2;
7804 }
7805
/*
 * qsort_r() comparator: order socket indexes so that sockets with MORE
 * available cores come first (descending by available-core count).
 * x, y - pointers to int socket indexes
 * socket_avail_cores - uint16_t array of available core counts per socket
 */
int _sort_sockets_by_avail_cores(const void *x, const void *y,
				 void *socket_avail_cores)
{
	uint16_t *avail_cores = (uint16_t *) socket_avail_cores;
	int inx_a = *(int *) x;
	int inx_b = *(int *) y;

	/* Descending order: compare b's count against a's */
	return (int) avail_cores[inx_b] - (int) avail_cores[inx_a];
}
7812
7813 /*
7814 * Determine how many tasks can be started on a given node and which
7815 * sockets/cores are required
7816 * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero
7817 * IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2()
7818 * IN sockets - Count of sockets on the node
7819 * IN cores_per_socket - Count of cores per socket on the node
7820 * IN cpus_per_core - Count of CPUs per core on the node
7821 * IN avail_cpus - Count of available CPUs on the node, UPDATED
7822 * IN min_tasks_this_node - Minimum count of tasks that can be started on this
7823 * node, UPDATED
7824 * IN max_tasks_this_node - Maximum count of tasks that can be started on this
7825 * node or NO_VAL, UPDATED
7826 * IN rem_nodes - desired additional node count to allocate, including this node
7827 * IN enforce_binding - GRES must be co-allocated with cores
7828 * IN first_pass - set if first scheduling attempt for this job, use
7829 * co-located GRES and cores if possible
7830 * IN avail_core - cores available on this node, UPDATED
7831 */
gres_plugin_job_core_filter3(gres_mc_data_t * mc_ptr,List sock_gres_list,uint16_t sockets,uint16_t cores_per_socket,uint16_t cpus_per_core,uint16_t * avail_cpus,uint32_t * min_tasks_this_node,uint32_t * max_tasks_this_node,int rem_nodes,bool enforce_binding,bool first_pass,bitstr_t * avail_core)7832 extern void gres_plugin_job_core_filter3(gres_mc_data_t *mc_ptr,
7833 List sock_gres_list,
7834 uint16_t sockets,
7835 uint16_t cores_per_socket,
7836 uint16_t cpus_per_core,
7837 uint16_t *avail_cpus,
7838 uint32_t *min_tasks_this_node,
7839 uint32_t *max_tasks_this_node,
7840 int rem_nodes,
7841 bool enforce_binding,
7842 bool first_pass,
7843 bitstr_t *avail_core)
7844 {
7845 static uint16_t select_type_param = NO_VAL16;
7846 ListIterator sock_gres_iter;
7847 sock_gres_t *sock_gres;
7848 gres_job_state_t *job_specs;
7849 int i, j, c, s, sock_cnt = 0, req_cores, rem_sockets, full_socket;
7850 int tot_core_cnt = 0, min_core_cnt = 1;
7851 uint64_t cnt_avail_sock, cnt_avail_total, max_gres = 0, rem_gres = 0;
7852 uint64_t tot_gres_sock, max_tasks;
7853 uint32_t task_cnt_incr;
7854 bool *req_sock = NULL; /* Required socket */
7855 int *socket_index = NULL; /* Socket indexes */
7856 uint16_t *avail_cores_per_sock, cpus_per_gres;
7857 uint16_t avail_cores_tot;
7858
7859 if (*max_tasks_this_node == 0)
7860 return;
7861
7862 xassert(avail_core);
7863 avail_cores_per_sock = xcalloc(sockets, sizeof(uint16_t));
7864 for (s = 0; s < sockets; s++) {
7865 for (c = 0; c < cores_per_socket; c++) {
7866 i = (s * cores_per_socket) + c;
7867 if (bit_test(avail_core, i))
7868 avail_cores_per_sock[s]++;
7869 }
7870 tot_core_cnt += avail_cores_per_sock[s];
7871 }
7872
7873 task_cnt_incr = *min_tasks_this_node;
7874 req_sock = xcalloc(sockets, sizeof(bool));
7875 socket_index = xcalloc(sockets, sizeof(int));
7876
7877 list_sort(sock_gres_list, _sock_gres_sort);
7878 sock_gres_iter = list_iterator_create(sock_gres_list);
7879 while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
7880 bool sufficient_gres;
7881 job_specs = sock_gres->job_specs;
7882 if (!job_specs)
7883 continue;
7884 if (job_specs->gres_per_job &&
7885 (job_specs->total_gres < job_specs->gres_per_job)) {
7886 rem_gres = job_specs->gres_per_job -
7887 job_specs->total_gres;
7888 }
7889
7890 /*
7891 * gres_plugin_job_core_filter2() sets sock_gres->max_node_gres
7892 * for mem_per_gres enforcement; use it to set GRES limit for
7893 * this node (max_gres).
7894 */
7895 if (sock_gres->max_node_gres) {
7896 if (rem_gres && (rem_gres < sock_gres->max_node_gres))
7897 max_gres = rem_gres;
7898 else
7899 max_gres = sock_gres->max_node_gres;
7900 }
7901 rem_nodes = MAX(rem_nodes, 1);
7902 rem_sockets = MAX(1, mc_ptr->sockets_per_node);
7903 if (max_gres &&
7904 ((job_specs->gres_per_node > max_gres) ||
7905 ((job_specs->gres_per_socket * rem_sockets) > max_gres))) {
7906 *max_tasks_this_node = 0;
7907 break;
7908 }
7909 if (job_specs->gres_per_node && job_specs->gres_per_task) {
7910 max_tasks = job_specs->gres_per_node /
7911 job_specs->gres_per_task;
7912 if ((max_tasks == 0) ||
7913 (max_tasks > *max_tasks_this_node) ||
7914 (max_tasks < *min_tasks_this_node)) {
7915 *max_tasks_this_node = 0;
7916 break;
7917 }
7918 if ((*max_tasks_this_node == NO_VAL) ||
7919 (*max_tasks_this_node > max_tasks))
7920 *max_tasks_this_node = max_gres;
7921 }
7922
7923 min_core_cnt = MAX(*min_tasks_this_node, 1) *
7924 MAX(mc_ptr->cpus_per_task, 1);
7925 min_core_cnt = (min_core_cnt + cpus_per_core - 1) /
7926 cpus_per_core;
7927
7928 if (job_specs->cpus_per_gres)
7929 cpus_per_gres = job_specs->cpus_per_gres;
7930 else
7931 cpus_per_gres = job_specs->def_cpus_per_gres;
7932
7933 /* Filter out unusable GRES by socket */
7934 avail_cores_tot = 0;
7935 cnt_avail_total = sock_gres->cnt_any_sock;
7936 sufficient_gres = false;
7937 for (s = 0; s < sockets; s++)
7938 socket_index[s] = s;
7939 qsort_r(socket_index, sockets, sizeof(int),
7940 _sort_sockets_by_avail_cores, avail_cores_per_sock);
7941
7942 for (j = 0; j < sockets; j++) {
7943 /*
7944 * Test for sufficient gres_per_socket
7945 *
7946 * Start with socket with most cores available,
7947 * so we know that we have max number of cores on socket
7948 * with allocated GRES.
7949 */
7950 s = socket_index[j];
7951
7952 if (sock_gres->cnt_by_sock) {
7953 cnt_avail_sock = sock_gres->cnt_by_sock[s];
7954 } else
7955 cnt_avail_sock = 0;
7956
7957 /*
7958 * If enforce binding number of gres allocated per
7959 * socket has to be limited by cpus_per_gres
7960 */
7961 if ((enforce_binding || first_pass) && cpus_per_gres) {
7962 int max_gres_socket = (avail_cores_per_sock[s] *
7963 cpus_per_core) /
7964 cpus_per_gres;
7965 cnt_avail_sock = MIN(cnt_avail_sock,
7966 max_gres_socket);
7967 }
7968
7969 tot_gres_sock = sock_gres->cnt_any_sock +
7970 cnt_avail_sock;
7971 if ((job_specs->gres_per_socket > tot_gres_sock) ||
7972 (tot_gres_sock == 0)) {
7973 /*
7974 * Insufficient GRES on this socket
7975 * GRES removed here won't be used in 2nd pass
7976 */
7977 if (((job_specs->gres_per_socket >
7978 tot_gres_sock) ||
7979 enforce_binding) &&
7980 sock_gres->cnt_by_sock) {
7981 sock_gres->total_cnt -=
7982 sock_gres->cnt_by_sock[s];
7983 sock_gres->cnt_by_sock[s] = 0;
7984 }
7985 if (first_pass &&
7986 (tot_core_cnt > min_core_cnt)) {
7987 for (c = cores_per_socket - 1;
7988 c >= 0; c--) {
7989 i = (s * cores_per_socket) + c;
7990 if (!bit_test(avail_core, i))
7991 continue;
7992 bit_clear(avail_core, i);
7993
7994 avail_cores_per_sock[s]--;
7995 if (bit_set_count(avail_core) *
7996 cpus_per_core <
7997 *avail_cpus) {
7998 *avail_cpus -=
7999 cpus_per_core;
8000 }
8001 if (--tot_core_cnt <=
8002 min_core_cnt)
8003 break;
8004 }
8005 }
8006 }
8007
8008 avail_cores_tot += avail_cores_per_sock[s];
8009 /* Test for available cores on this socket */
8010 if ((enforce_binding || first_pass) &&
8011 (avail_cores_per_sock[s] == 0))
8012 continue;
8013
8014 cnt_avail_total += cnt_avail_sock;
8015 if (!sufficient_gres) {
8016 req_sock[s] = true;
8017 sock_cnt++;
8018 }
8019
8020 if (job_specs->gres_per_node &&
8021 (cnt_avail_total >= job_specs->gres_per_node) &&
8022 !sock_gres->cnt_any_sock) {
8023 /*
8024 * Sufficient gres will leave remaining CPUs as
8025 * !req_sock. We do this only when we
8026 * collected enough and all collected gres of
8027 * considered type are bound to socket.
8028 */
8029 sufficient_gres = true;
8030 }
8031 }
8032
8033 if (cpus_per_gres) {
8034 max_gres = *avail_cpus / cpus_per_gres;
8035 cnt_avail_total = MIN(cnt_avail_total, max_gres);
8036 }
8037 if ((cnt_avail_total == 0) ||
8038 (job_specs->gres_per_node > cnt_avail_total) ||
8039 (job_specs->gres_per_task > cnt_avail_total)) {
8040 *max_tasks_this_node = 0;
8041 }
8042 if (job_specs->gres_per_task) {
8043 max_tasks = cnt_avail_total / job_specs->gres_per_task;
8044 *max_tasks_this_node = MIN(*max_tasks_this_node,
8045 max_tasks);
8046 }
8047
8048 /*
8049 * min_tasks_this_node and max_tasks_this_node must be multiple
8050 * of original min_tasks_this_node value. This is to support
8051 * ntasks_per_* option and we just need to select a count of
8052 * tasks, sockets, etc. Round the values down.
8053 */
8054 *min_tasks_this_node = (*min_tasks_this_node / task_cnt_incr) *
8055 task_cnt_incr;
8056 *max_tasks_this_node = (*max_tasks_this_node / task_cnt_incr) *
8057 task_cnt_incr;
8058
8059 if (*max_tasks_this_node == 0)
8060 break;
8061
8062 /*
8063 * Remove cores on not required sockets when enforce-binding,
8064 * this has to happen also when max_tasks_this_node == NO_VAL
8065 */
8066 if (enforce_binding || first_pass) {
8067 for (s = 0; s < sockets; s++) {
8068 if (req_sock[s])
8069 continue;
8070 for (c = cores_per_socket - 1; c >= 0; c--) {
8071 i = (s * cores_per_socket) + c;
8072 if (!bit_test(avail_core, i))
8073 continue;
8074 bit_clear(avail_core, i);
8075 if (bit_set_count(avail_core) *
8076 cpus_per_core < *avail_cpus) {
8077 *avail_cpus -= cpus_per_core;
8078 }
8079 avail_cores_tot--;
8080 avail_cores_per_sock[s]--;
8081 }
8082 }
8083 }
8084
8085 if (*max_tasks_this_node == NO_VAL) {
8086 if (cpus_per_gres) {
8087 i = *avail_cpus / cpus_per_gres;
8088 sock_gres->total_cnt =
8089 MIN(i, sock_gres->total_cnt);
8090 }
8091 log_flag(GRES, "%s: max_tasks_this_node is set to NO_VAL, won't clear non-needed cores",
8092 __func__);
8093 continue;
8094 }
8095 if (*max_tasks_this_node < *min_tasks_this_node) {
8096 error("%s: min_tasks_this_node:%u > max_tasks_this_node:%u",
8097 __func__,
8098 *min_tasks_this_node,
8099 *max_tasks_this_node);
8100 }
8101
8102 /*
8103 * Determine how many cores are needed for this job.
8104 * Consider rounding errors if cpus_per_task not divisible
8105 * by cpus_per_core
8106 */
8107 req_cores = *max_tasks_this_node;
8108 if (mc_ptr->cpus_per_task) {
8109 int threads_per_core, removed_tasks = 0;
8110
8111 if (mc_ptr->threads_per_core)
8112 threads_per_core =
8113 MIN(cpus_per_core,
8114 mc_ptr->threads_per_core);
8115 else
8116 threads_per_core = cpus_per_core;
8117
8118 req_cores *= mc_ptr->cpus_per_task;
8119
8120 while (*max_tasks_this_node >= *min_tasks_this_node) {
8121 /* round up by full threads per core */
8122 req_cores += threads_per_core - 1;
8123 req_cores /= threads_per_core;
8124 if (req_cores <= avail_cores_tot) {
8125 if (removed_tasks)
8126 log_flag(GRES, "%s: settings required_cores=%d by max_tasks_this_node=%u(reduced=%d) cpus_per_task=%d cpus_per_core=%d threads_per_core:%d",
8127 __func__,
8128 req_cores,
8129 *max_tasks_this_node,
8130 removed_tasks,
8131 mc_ptr->cpus_per_task,
8132 cpus_per_core,
8133 mc_ptr->
8134 threads_per_core);
8135 break;
8136 }
8137 removed_tasks++;
8138 (*max_tasks_this_node)--;
8139 req_cores = *max_tasks_this_node;
8140 }
8141 }
8142 if (cpus_per_gres) {
8143 if (job_specs->gres_per_node) {
8144 i = job_specs->gres_per_node;
8145 log_flag(GRES, "%s: estimating req_cores gres_per_node=%"PRIu64,
8146 __func__, job_specs->gres_per_node);
8147 } else if (job_specs->gres_per_socket) {
8148 i = job_specs->gres_per_socket * sock_cnt;
8149 log_flag(GRES, "%s: estimating req_cores gres_per_socket=%"PRIu64,
8150 __func__, job_specs->gres_per_socket);
8151 } else if (job_specs->gres_per_task) {
8152 i = job_specs->gres_per_task *
8153 *max_tasks_this_node;
8154 log_flag(GRES, "%s: estimating req_cores max_tasks_this_node=%u gres_per_task=%"PRIu64,
8155 __func__,
8156 *max_tasks_this_node,
8157 job_specs->gres_per_task);
8158 } else if (cnt_avail_total) {
8159 i = cnt_avail_total;
8160 log_flag(GRES, "%s: estimating req_cores cnt_avail_total=%"PRIu64,
8161 __func__, cnt_avail_total);
8162 } else {
8163 i = 1;
8164 log_flag(GRES, "%s: estimating req_cores default to 1 task",
8165 __func__);
8166 }
8167 i *= cpus_per_gres;
8168 i = (i + cpus_per_core - 1) / cpus_per_core;
8169 if (req_cores < i)
8170 log_flag(GRES, "%s: Increasing req_cores=%d from cpus_per_gres=%d cpus_per_core=%"PRIu16,
8171 __func__, i, cpus_per_gres,
8172 cpus_per_core);
8173 req_cores = MAX(req_cores, i);
8174 }
8175
8176 if (req_cores > avail_cores_tot) {
8177 log_flag(GRES, "%s: Job cannot run on node req_cores:%d > aval_cores_tot:%d",
8178 __func__, req_cores, avail_cores_tot);
8179 *max_tasks_this_node = 0;
8180 break;
8181 }
8182
8183 /*
8184 * Clear extra avail_core bits on sockets we don't need
8185 * up to required number of cores based on max_tasks_this_node.
8186 * In case of enforce-binding those are already cleared.
8187 */
8188 if ((avail_cores_tot > req_cores) &&
8189 !enforce_binding && !first_pass) {
8190 for (s = 0; s < sockets; s++) {
8191 if (avail_cores_tot == req_cores)
8192 break;
8193 if (req_sock[s])
8194 continue;
8195 for (c = cores_per_socket - 1; c >= 0; c--) {
8196 i = (s * cores_per_socket) + c;
8197 if (!bit_test(avail_core, i))
8198 continue;
8199 bit_clear(avail_core, i);
8200 if (bit_set_count(avail_core) *
8201 cpus_per_core < *avail_cpus) {
8202 *avail_cpus -= cpus_per_core;
8203 }
8204 avail_cores_tot--;
8205 avail_cores_per_sock[s]--;
8206 if (avail_cores_tot == req_cores)
8207 break;
8208 }
8209 }
8210 }
8211
8212 /*
8213 * Clear extra avail_core bits on sockets we do need, but
8214 * spread them out so that every socket has some cores
8215 * available to use with the nearby GRES that we do need.
8216 */
8217 while (avail_cores_tot > req_cores) {
8218 full_socket = -1;
8219 for (s = 0; s < sockets; s++) {
8220 if (avail_cores_tot == req_cores)
8221 break;
8222 if (!req_sock[s] ||
8223 (avail_cores_per_sock[s] == 0))
8224 continue;
8225 if ((full_socket == -1) ||
8226 (avail_cores_per_sock[full_socket] <
8227 avail_cores_per_sock[s])) {
8228 full_socket = s;
8229 }
8230 }
8231 if (full_socket == -1)
8232 break;
8233 s = full_socket;
8234 for (c = cores_per_socket - 1; c >= 0; c--) {
8235 i = (s * cores_per_socket) + c;
8236 if (!bit_test(avail_core, i))
8237 continue;
8238 bit_clear(avail_core, i);
8239 if (bit_set_count(avail_core) * cpus_per_core <
8240 *avail_cpus) {
8241 *avail_cpus -= cpus_per_core;
8242 }
8243 avail_cores_per_sock[s]--;
8244 avail_cores_tot--;
8245 break;
8246 }
8247 }
8248 if (cpus_per_gres) {
8249 i = *avail_cpus / cpus_per_gres;
8250 sock_gres->total_cnt = MIN(i, sock_gres->total_cnt);
8251 if ((job_specs->gres_per_node > sock_gres->total_cnt) ||
8252 (job_specs->gres_per_task > sock_gres->total_cnt)) {
8253 *max_tasks_this_node = 0;
8254 }
8255 }
8256 }
8257 list_iterator_destroy(sock_gres_iter);
8258 xfree(avail_cores_per_sock);
8259 xfree(req_sock);
8260 xfree(socket_index);
8261
8262
8263 if (select_type_param == NO_VAL16)
8264 select_type_param = slurm_get_select_type_param();
8265 if ((mc_ptr->cpus_per_task > 1) ||
8266 ((select_type_param & CR_ONE_TASK_PER_CORE) == 0)) {
8267 /*
8268 * Only adjust *avail_cpus for the maximum task count if
8269 * cpus_per_task is explicitly set. There is currently no way
8270 * to tell if cpus_per_task==1 is explicitly set by the job
8271 * when SelectTypeParameters includes CR_ONE_TASK_PER_CORE.
8272 */
8273 *avail_cpus = MIN(*avail_cpus,
8274 *max_tasks_this_node * mc_ptr->cpus_per_task);
8275 }
8276 }
8277
8278 /*
8279 * Return the maximum number of tasks that can be started on a node with
8280 * sock_gres_list (per-socket GRES details for some node)
8281 */
gres_plugin_get_task_limit(List sock_gres_list)8282 extern uint32_t gres_plugin_get_task_limit(List sock_gres_list)
8283 {
8284 ListIterator sock_gres_iter;
8285 sock_gres_t *sock_gres;
8286 uint32_t max_tasks = NO_VAL;
8287 uint64_t task_limit;
8288
8289 sock_gres_iter = list_iterator_create(sock_gres_list);
8290 while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
8291 xassert(sock_gres->job_specs);
8292 if (sock_gres->job_specs->gres_per_task == 0)
8293 continue;
8294 task_limit = sock_gres->total_cnt /
8295 sock_gres->job_specs->gres_per_task;
8296 max_tasks = MIN(max_tasks, task_limit);
8297 }
8298 list_iterator_destroy(sock_gres_iter);
8299
8300 return max_tasks;
8301 }
8302
8303 /*
8304 * Return count of sockets allocated to this job on this node
8305 * job_res IN - job resource allocation
8306 * node_inx IN - global node index
8307 * job_node_inx IN - node index for this job's allocation
8308 * RET socket count
8309 */
_get_sock_cnt(struct job_resources * job_res,int node_inx,int job_node_inx)8310 static int _get_sock_cnt(struct job_resources *job_res, int node_inx,
8311 int job_node_inx)
8312 {
8313 int core_offset, used_sock_cnt = 0;
8314 uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8315 int c, i, rc, s;
8316
8317 rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8318 &cores_per_socket_cnt);
8319 if (rc != SLURM_SUCCESS) {
8320 error("%s: Invalid socket/core count", __func__);
8321 return 1;
8322 }
8323 core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8324 if (core_offset < 0) {
8325 error("%s: Invalid core offset", __func__);
8326 return 1;
8327 }
8328 for (s = 0; s < sock_cnt; s++) {
8329 for (c = 0; c < cores_per_socket_cnt; c++) {
8330 i = (s * cores_per_socket_cnt) + c;
8331 if (bit_test(job_res->core_bitmap, (core_offset + i)))
8332 used_sock_cnt++;
8333 }
8334 }
8335 if (used_sock_cnt == 0) {
8336 error("%s: No allocated cores found", __func__);
8337 return 1;
8338 }
8339 return used_sock_cnt;
8340 }
8341
/*
 * Select specific GRES (set GRES bitmap) for this job on this node based upon
 * per-job resource specification. Use only socket-local GRES
 * job_res IN - job resource allocation
 * node_inx IN - global node index
 * job_node_inx IN - node index for this job's allocation
 * rem_nodes IN - count of nodes remaining to place resources on
 * sock_gres IN - job/node GRES specifications; UPDATED: set bits in
 *		  sock_gres->job_specs->gres_bit_select
 * job_id IN - job ID for logging
 * tres_mc_ptr IN - job's multi-core options
 * cpus_per_core IN - CPUs per core on this node
 * RET 0:more work, 1:fini
 */
static int _set_job_bits1(struct job_resources *job_res, int node_inx,
			  int job_node_inx, int rem_nodes,
			  sock_gres_t *sock_gres, uint32_t job_id,
			  gres_mc_data_t *tres_mc_ptr, uint16_t cpus_per_core)
{
	int core_offset, gres_cnt;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
	int c, i, g, rc, s;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int *cores_on_sock = NULL, alloc_gres_cnt = 0;
	int max_gres, pick_gres, total_cores = 0;
	int fini = 0;

	job_specs = sock_gres->job_specs;
	node_specs = sock_gres->node_specs;
	/* Whole-job GRES request already satisfied by prior nodes */
	if (job_specs->gres_per_job == job_specs->total_gres)
		fini = 1;
	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
				   &cores_per_socket_cnt);
	if (rc != SLURM_SUCCESS) {
		error("%s: Invalid socket/core count for job %u on node %d",
		      __func__, job_id, node_inx);
		return rc;
	}
	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
	if (core_offset < 0) {
		error("%s: Invalid core offset for job %u on node %d",
		      __func__, job_id, node_inx);
		/*
		 * NOTE(review): rc is still SLURM_SUCCESS here, so this
		 * error path returns 0 ("more work") — confirm intended.
		 */
		return rc;
	}
	i = sock_gres->sock_cnt;
	if ((i != 0) && (i != sock_cnt)) {
		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
		      __func__, i, sock_cnt, job_id, node_inx);
		sock_cnt = MIN(sock_cnt, i);
	}
	xassert(job_res->core_bitmap);
	if (job_node_inx == 0)
		job_specs->total_gres = 0;
	/*
	 * Upper bound on GRES to take from this node: leave at least one
	 * GRES for each of the remaining (rem_nodes - 1) nodes.
	 */
	max_gres = job_specs->gres_per_job - job_specs->total_gres -
		   (rem_nodes - 1);
	cores_on_sock = xcalloc(sock_cnt, sizeof(int));
	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
	/* Count this job's allocated cores per socket */
	for (s = 0; s < sock_cnt; s++) {
		for (c = 0; c < cores_per_socket_cnt; c++) {
			i = (s * cores_per_socket_cnt) + c;
			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
				cores_on_sock[s]++;
				total_cores++;
			}
		}
	}
	/* CPU availability further limits usable GRES count */
	if (job_specs->cpus_per_gres) {
		max_gres = MIN(max_gres,
			       ((total_cores * cpus_per_core) /
				job_specs->cpus_per_gres));
	}
	/*
	 * If link data covers every GRES, over-select first (pick_gres
	 * effectively unbounded) and trim to the best-linked subset below.
	 */
	if ((max_gres > 1) && (node_specs->link_len == gres_cnt))
		pick_gres  = NO_VAL16;
	else
		pick_gres = max_gres;
	/*
	 * Now pick specific GRES for these sockets.
	 * First select all GRES that we might possibly use, starting with
	 * those not constrained by socket, then contrained by socket.
	 * Then remove those which are not required and not "best".
	 */
	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
	     ((s < sock_cnt) && (alloc_gres_cnt < pick_gres)); s++) {
		if ((s >= 0) && !cores_on_sock[s])
			continue;	/* no allocated cores on socket */
		for (g = 0; ((g < gres_cnt) && (alloc_gres_cnt < pick_gres));
		     g++) {
			if ((s == -1) &&
			    (!sock_gres->bits_any_sock ||
			     !bit_test(sock_gres->bits_any_sock, g)))
				continue;   /* GRES not avail any socket */
			if ((s >= 0) &&
			    (!sock_gres->bits_by_sock ||
			     !sock_gres->bits_by_sock[s] ||
			     !bit_test(sock_gres->bits_by_sock[s], g)))
				continue;   /* GRES not on this socket */
			if (bit_test(node_specs->gres_bit_alloc, g) ||
			    bit_test(job_specs->gres_bit_select[node_inx], g))
				continue;   /* Already allocated GRES */
			bit_set(job_specs->gres_bit_select[node_inx], g);
			job_specs->gres_cnt_node_select[node_inx]++;
			alloc_gres_cnt++;
			job_specs->total_gres++;
		}
	}
	if (alloc_gres_cnt == 0) {
		/*
		 * Fallback: nothing found socket-local; take one GRES from a
		 * socket without allocated cores (those were skipped above).
		 */
		for (s = 0; ((s < sock_cnt) && (alloc_gres_cnt == 0)); s++) {
			if (cores_on_sock[s])
				continue;
			for (g = 0; g < gres_cnt; g++) {
				if (!sock_gres->bits_by_sock	||
				    !sock_gres->bits_by_sock[s] ||
				    !bit_test(sock_gres->bits_by_sock[s], g))
					continue;   /* GRES not on this socket */
				if (bit_test(node_specs->gres_bit_alloc, g) ||
				    bit_test(job_specs->
					     gres_bit_select[node_inx], g))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx]++;
				alloc_gres_cnt++;
				job_specs->total_gres++;
				break;
			}
		}
	}
	if (alloc_gres_cnt == 0) {
		error("%s: job %u failed to find any available GRES on node %d",
		      __func__, job_id, node_inx);
	}
	/* Now pick the "best" max_gres GRES with respect to link counts. */
	if (alloc_gres_cnt > max_gres) {
		int best_link_cnt = -1, best_inx = -1;
		/* Find the selected pair with the highest link count */
		for (s = 0; s < gres_cnt; s++) {
			if (!bit_test(job_specs->gres_bit_select[node_inx], s))
				continue;
			for (g = s + 1; g < gres_cnt; g++) {
				if (!bit_test(job_specs->
					      gres_bit_select[node_inx], g))
					continue;
				if (node_specs->links_cnt[s][g] <=
				    best_link_cnt)
					continue;
				best_link_cnt = node_specs->links_cnt[s][g];
				best_inx = s;
			}
		}
		/* Drop the GRES least-linked to best_inx until within limit */
		while ((alloc_gres_cnt > max_gres) && (best_link_cnt != -1)) {
			int worst_inx = -1, worst_link_cnt = NO_VAL16;
			for (g = 0; g < gres_cnt; g++) {
				if (g == best_inx)
					continue;
				if (!bit_test(job_specs->
					      gres_bit_select[node_inx], g))
					continue;
				if (node_specs->links_cnt[best_inx][g] >=
				    worst_link_cnt)
					continue;
				worst_link_cnt =
					node_specs->links_cnt[best_inx][g];
				worst_inx = g;
			}
			if (worst_inx == -1) {
				error("%s: error managing links_cnt", __func__);
				break;
			}
			bit_clear(job_specs->gres_bit_select[node_inx],
				  worst_inx);
			job_specs->gres_cnt_node_select[node_inx]--;
			alloc_gres_cnt--;
			job_specs->total_gres--;
		}
	}

	xfree(cores_on_sock);
	if (job_specs->total_gres >= job_specs->gres_per_job)
		fini = 1;
	return fini;
}
8523
8524 /*
8525 * Select specific GRES (set GRES bitmap) for this job on this node based upon
8526 * per-job resource specification. Use any GRES on the node
8527 * job_res IN - job resource allocation
8528 * node_inx IN - global node index
8529 * job_node_inx IN - node index for this job's allocation
8530 * job_specs IN - job request specifications, UPDATED: set bits in
8531 * gres_bit_select
8532 * node_specs IN - node resource request specifications
8533 * job_id IN - job ID for logging
8534 * tres_mc_ptr IN - job's multi-core options
8535 * RET 0:more work, 1:fini
8536 */
static int _set_job_bits2(struct job_resources *job_res, int node_inx,
			  int job_node_inx, sock_gres_t *sock_gres,
			  uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
	int core_offset, gres_cnt;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
	int i, g, l, rc, s;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int fini = 0;
	int best_link_cnt = 0, best_inx = -1;

	job_specs = sock_gres->job_specs;
	node_specs = sock_gres->node_specs;
	/* Job's gres_per_job requirement already satisfied: nothing to do */
	if (job_specs->gres_per_job == job_specs->total_gres) {
		fini = 1;
		return fini;
	}
	if (!job_specs->gres_bit_select ||
	    !job_specs->gres_bit_select[node_inx]) {
		error("%s: gres_bit_select NULL for job %u on node %d",
		      __func__, job_id, node_inx);
		/*
		 * NOTE(review): SLURM_ERROR (<0) escapes a function otherwise
		 * documented to return 0 (more work) or 1 (fini) — confirm
		 * callers treat any non-1 value as "not finished".
		 */
		return SLURM_ERROR;
	}
	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
				   &cores_per_socket_cnt);
	if (rc != SLURM_SUCCESS) {
		error("%s: Invalid socket/core count for job %u on node %d",
		      __func__, job_id, node_inx);
		return rc;
	}
	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
	if (core_offset < 0) {
		error("%s: Invalid core offset for job %u on node %d",
		      __func__, job_id, node_inx);
		/*
		 * NOTE(review): rc is SLURM_SUCCESS (0) here, so this error
		 * path reports "more work" rather than a failure — confirm
		 * this is intentional.
		 */
		return rc;
	}
	/* Reconcile this plugin's socket count with the job allocation's */
	i = sock_gres->sock_cnt;
	if ((i != 0) && (i != sock_cnt)) {
		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
		      __func__, i, sock_cnt, job_id, node_inx);
		sock_cnt = MIN(sock_cnt, i);
	}

	/*
	 * Identify the GRES (if any) that we want to use as a basis for
	 * maximizing link count (connectivity of the GRES).
	 */
	xassert(job_res->core_bitmap);
	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
	if ((job_specs->gres_per_job > job_specs->total_gres) &&
	    (node_specs->link_len == gres_cnt)) {
		/* Use the first already-selected GRES as the link anchor */
		for (g = 0; g < gres_cnt; g++) {
			if (!bit_test(job_specs->gres_bit_select[node_inx], g))
				continue;
			best_inx = g;
			for (s = 0; s < gres_cnt; s++) {
				best_link_cnt = MAX(node_specs->links_cnt[s][g],
						    best_link_cnt);
			}
			break;
		}
	}

	/*
	 * Now pick specific GRES for these sockets.
	 * Start with GRES available from any socket, then specific sockets
	 */
	/* Outer loop relaxes the required link count from best down to 0 */
	for (l = best_link_cnt;
	     ((l >= 0) && (job_specs->gres_per_job > job_specs->total_gres));
	     l--) {
		for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
		     ((s < sock_cnt) &&
		      (job_specs->gres_per_job > job_specs->total_gres)); s++) {
			for (g = 0;
			     ((g < gres_cnt) &&
			      (job_specs->gres_per_job >job_specs->total_gres));
			     g++) {
				if ((l > 0) &&
				    (node_specs->links_cnt[best_inx][g] < l))
					continue;   /* Want better link count */
				if ((s == -1) &&
				    (!sock_gres->bits_any_sock ||
				     !bit_test(sock_gres->bits_any_sock, g)))
					continue;  /* GRES not avail any sock */
				if ((s >= 0) &&
				    (!sock_gres->bits_by_sock ||
				     !sock_gres->bits_by_sock[s] ||
				     !bit_test(sock_gres->bits_by_sock[s], g)))
					continue;  /* GRES not on this socket */
				if (bit_test(node_specs->gres_bit_alloc, g) ||
				    bit_test(job_specs->gres_bit_select[node_inx],
					     g))
					continue;   /* Already allocated GRES */
				/* Select this GRES for the job on this node */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx]++;
				job_specs->total_gres++;
			}
		}
	}
	if (job_specs->gres_per_job == job_specs->total_gres)
		fini = 1;
	return fini;
}
8641
8642 /*
8643 * Select specific GRES (set GRES bitmap) for this job on this node based upon
8644 * per-node resource specification
8645 * job_res IN - job resource allocation
8646 * node_inx IN - global node index
8647 * job_node_inx IN - node index for this job's allocation
8648 * job_specs IN - job request specifications, UPDATED: set bits in
8649 * gres_bit_select
8650 * node_specs IN - node resource request specifications
8651 * job_id IN - job ID for logging
8652 * tres_mc_ptr IN - job's multi-core options
8653 */
static void _set_node_bits(struct job_resources *job_res, int node_inx,
			   int job_node_inx, sock_gres_t *sock_gres,
			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
	int core_offset, gres_cnt;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
	int c, i, g, l, rc, s;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int *used_sock = NULL, alloc_gres_cnt = 0;
	int *links_cnt = NULL, best_link_cnt = 0;
	uint64_t gres_per_bit = 1;

	job_specs = sock_gres->job_specs;
	node_specs = sock_gres->node_specs;
	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
				   &cores_per_socket_cnt);
	if (rc != SLURM_SUCCESS) {
		error("%s: Invalid socket/core count for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
	if (core_offset < 0) {
		error("%s: Invalid core offset for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	/* Reconcile this plugin's socket count with the job allocation's */
	i = sock_gres->sock_cnt;
	if ((i != 0) && (i != sock_cnt)) {
		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
		      __func__, i, sock_cnt, job_id, node_inx);
		sock_cnt = MIN(sock_cnt, i);
	}

	xassert(job_res->core_bitmap);
	/* used_sock[s] != 0 iff the job was allocated any core on socket s */
	used_sock = xcalloc(sock_cnt, sizeof(int));
	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
	for (s = 0; s < sock_cnt; s++) {
		for (c = 0; c < cores_per_socket_cnt; c++) {
			i = (s * cores_per_socket_cnt) + c;
			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
				used_sock[s]++;
				break;
			}
		}
	}

	/*
	 * Now pick specific GRES for these sockets.
	 * First: Try to place one GRES per socket in this job's allocation.
	 * Second: Try to place additional GRES on allocated sockets.
	 * Third: Use any additional available GRES.
	 */
	/* Link tracking only possible when the link table matches gres_cnt */
	if (node_specs->link_len == gres_cnt)
		links_cnt = xcalloc(gres_cnt, sizeof(int));
	/* Shared GRES (e.g. MPS): one selected bit counts gres_per_node */
	if (_shared_gres(sock_gres->plugin_id))
		gres_per_bit = job_specs->gres_per_node;
	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
	     ((s < sock_cnt) && (alloc_gres_cnt < job_specs->gres_per_node));
	     s++) {
		if ((s >= 0) && !used_sock[s])
			continue;
		for (g = 0; g < gres_cnt; g++) {
			if ((s == -1) &&
			    (!sock_gres->bits_any_sock ||
			     !bit_test(sock_gres->bits_any_sock, g)))
				continue;  /* GRES not avail any socket */
			if ((s >= 0) &&
			    (!sock_gres->bits_by_sock ||
			     !sock_gres->bits_by_sock[s] ||
			     !bit_test(sock_gres->bits_by_sock[s], g)))
				continue;   /* GRES not on this socket */
			/* Shared GRES may be re-selected while in use */
			if (bit_test(job_specs->gres_bit_select[node_inx], g) ||
			    ((gres_per_bit == 1) &&
			     bit_test(node_specs->gres_bit_alloc, g)))
				continue;   /* Already allocated GRES */
			bit_set(job_specs->gres_bit_select[node_inx], g);
			job_specs->gres_cnt_node_select[node_inx] +=
				gres_per_bit;
			alloc_gres_cnt += gres_per_bit;
			/* Accumulate link counts relative to selected GRES */
			for (l = 0; links_cnt && (l < gres_cnt); l++) {
				if ((l == g) ||
				    bit_test(node_specs->gres_bit_alloc, l))
					continue;
				links_cnt[l] += node_specs->links_cnt[g][l];
			}
			break;	/* Placed one GRES on this socket; move on */
		}
	}

	if (links_cnt) {
		for (l = 0; l < gres_cnt; l++)
			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
		if (best_link_cnt > 4) {
			/* Scale down to reasonable iteration count (<= 4) */
			g = (best_link_cnt + 3) / 4;
			best_link_cnt = 0;
			for (l = 0; l < gres_cnt; l++) {
				links_cnt[l] /= g;
				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
			}
		}
	}

	/*
	 * Try to place additional GRES on allocated sockets. Favor use of
	 * GRES which are best linked to GRES which have already been selected.
	 */
	/* Outer loop relaxes the required link count from best down to 0 */
	for (l = best_link_cnt;
	     ((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
		for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
		     ((s < sock_cnt) &&
		      (alloc_gres_cnt < job_specs->gres_per_node)); s++) {
			if ((s >= 0) && !used_sock[s])
				continue;
			for (g = 0; g < gres_cnt; g++) {
				if (links_cnt && (links_cnt[g] < l))
					continue;
				if ((s == -1) &&
				    (!sock_gres->bits_any_sock ||
				     !bit_test(sock_gres->bits_any_sock, g)))
					continue;/* GRES not avail any socket */
				if ((s >= 0) &&
				    (!sock_gres->bits_by_sock ||
				     !sock_gres->bits_by_sock[s] ||
				     !bit_test(sock_gres->bits_by_sock[s], g)))
					continue; /* GRES not on this socket */
				if (bit_test(job_specs->gres_bit_select[node_inx],
					     g) ||
				    ((gres_per_bit == 1) &&
				     bit_test(node_specs->gres_bit_alloc, g)))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx] +=
					gres_per_bit;
				alloc_gres_cnt += gres_per_bit;
				if (alloc_gres_cnt >= job_specs->gres_per_node)
					break;
			}
		}
	}

	/*
	 * Use any additional available GRES. Again, favor use of GRES
	 * which are best linked to GRES which have already been selected.
	 */
	/* Same scan, but now over sockets NOT in the job's allocation */
	for (l = best_link_cnt;
	     ((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
		for (s = 0;
		     ((s < sock_cnt) &&
		      (alloc_gres_cnt < job_specs->gres_per_node)); s++) {
			if (used_sock[s])
				continue;
			for (g = 0; g < gres_cnt; g++) {
				if (links_cnt && (links_cnt[g] < l))
					continue;
				if (!sock_gres->bits_by_sock ||
				    !sock_gres->bits_by_sock[s] ||
				    !bit_test(sock_gres->bits_by_sock[s], g))
					continue; /* GRES not on this socket */
				if (bit_test(job_specs->gres_bit_select[node_inx],
					     g) ||
				    ((gres_per_bit == 1) &&
				     bit_test(node_specs->gres_bit_alloc, g)))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx] +=
					gres_per_bit;
				alloc_gres_cnt += gres_per_bit;
				if (alloc_gres_cnt >= job_specs->gres_per_node)
					break;
			}
		}
	}

	xfree(links_cnt);
	xfree(used_sock);
}
8833
8834 /*
8835 * Select one specific GRES topo entry (set GRES bitmap) for this job on this
8836 * node based upon per-node resource specification
8837 * job_res IN - job resource allocation
8838 * node_inx IN - global node index
8839 * job_node_inx IN - node index for this job's allocation
8840 * job_specs IN - job request specifications, UPDATED: set bits in
8841 * gres_bit_select
8842 * node_specs IN - node resource request specifications
8843 * job_id IN - job ID for logging
8844 * tres_mc_ptr IN - job's multi-core options
8845 */
static void _pick_specific_topo(struct job_resources *job_res, int node_inx,
				int job_node_inx, sock_gres_t *sock_gres,
				uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
	int core_offset;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
	int c, i, rc, s, t;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int *used_sock = NULL, alloc_gres_cnt = 0;
	uint64_t gres_per_bit;
	bool use_busy_dev = false;

	job_specs = sock_gres->job_specs;
	/* One selected topo bit satisfies the whole per-node request */
	gres_per_bit = job_specs->gres_per_node;
	node_specs = sock_gres->node_specs;
	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
				   &cores_per_socket_cnt);
	if (rc != SLURM_SUCCESS) {
		error("%s: Invalid socket/core count for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
	if (core_offset < 0) {
		error("%s: Invalid core offset for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	/* Reconcile this plugin's socket count with the job allocation's */
	i = sock_gres->sock_cnt;
	if ((i != 0) && (i != sock_cnt)) {
		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
		      __func__, i, sock_cnt, job_id, node_inx);
		sock_cnt = MIN(sock_cnt, i);
	}

	xassert(job_res->core_bitmap);
	/* used_sock[s] != 0 iff the job was allocated any core on socket s */
	used_sock = xcalloc(sock_cnt, sizeof(int));
	for (s = 0; s < sock_cnt; s++) {
		for (c = 0; c < cores_per_socket_cnt; c++) {
			i = (s * cores_per_socket_cnt) + c;
			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
				used_sock[s]++;
				break;
			}
		}
	}

	if ((sock_gres->plugin_id == mps_plugin_id) &&
	    (node_specs->gres_cnt_alloc != 0)) {
		/* We must use the ONE already active GRES of this type */
		use_busy_dev = true;
	}

	/*
	 * Now pick specific GRES for these sockets.
	 * First: Try to select a GRES local to allocated socket with
	 *	  sufficient resources.
	 * Second: Use available GRES with sufficient resources.
	 * Third: Use any available GRES.
	 */
	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
	     (s < sock_cnt) && (alloc_gres_cnt == 0); s++) {
		if ((s >= 0) && !used_sock[s])
			continue;
		for (t = 0; t < node_specs->topo_cnt; t++) {
			if (use_busy_dev &&
			    (node_specs->topo_gres_cnt_alloc[t] == 0))
				continue;
			if (node_specs->topo_gres_cnt_alloc &&
			    node_specs->topo_gres_cnt_avail &&
			    ((node_specs->topo_gres_cnt_avail[t] -
			      node_specs->topo_gres_cnt_alloc[t]) <
			     gres_per_bit))
				continue;	/* Insufficient resources */
			if ((s == -1) &&
			    (!sock_gres->bits_any_sock ||
			     !bit_test(sock_gres->bits_any_sock, t)))
				continue;  /* GRES not avail any socket */
			if ((s >= 0) &&
			    (!sock_gres->bits_by_sock ||
			     !sock_gres->bits_by_sock[s] ||
			     !bit_test(sock_gres->bits_by_sock[s], t)))
				continue;   /* GRES not on this socket */
			bit_set(job_specs->gres_bit_select[node_inx], t);
			job_specs->gres_cnt_node_select[node_inx] +=
				gres_per_bit;
			alloc_gres_cnt += gres_per_bit;
			break;
		}
	}

	/* Select available GRES with sufficient resources */
	/*
	 * NOTE(review): unlike the loop above, the "Insufficient resources"
	 * test here also requires topo_gres_cnt_avail[t] to be non-zero, so
	 * entries with zero avail pass through — confirm intent.
	 */
	for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
		if (use_busy_dev &&
		    (node_specs->topo_gres_cnt_alloc[t] == 0))
			continue;
		if (node_specs->topo_gres_cnt_alloc &&
		    node_specs->topo_gres_cnt_avail &&
		    node_specs->topo_gres_cnt_avail[t] &&
		    ((node_specs->topo_gres_cnt_avail[t] -
		      node_specs->topo_gres_cnt_alloc[t]) < gres_per_bit))
			continue;	/* Insufficient resources */
		bit_set(job_specs->gres_bit_select[node_inx], t);
		job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
		alloc_gres_cnt += gres_per_bit;
		break;
	}

	/* Select available GRES with any resources */
	/*
	 * NOTE(review): this skips entries whose avail count is NON-zero,
	 * which contradicts the "No resources" comment; it also omits the
	 * use_busy_dev guard used by both loops above. Confirm against
	 * upstream before changing — left as-is here.
	 */
	for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
		if (node_specs->topo_gres_cnt_alloc &&
		    node_specs->topo_gres_cnt_avail &&
		    node_specs->topo_gres_cnt_avail[t])
			continue;	/* No resources */
		bit_set(job_specs->gres_bit_select[node_inx], t);
		job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
		alloc_gres_cnt += gres_per_bit;
	}

	xfree(used_sock);
}
8968
8969 /*
8970 * Select specific GRES (set GRES bitmap) for this job on this node based upon
8971 * per-socket resource specification
8972 * job_res IN - job resource allocation
8973 * node_inx IN - global node index
8974 * job_node_inx IN - node index for this job's allocation
8975 * job_specs IN - job request specifications, UPDATED: set bits in
8976 * gres_bit_select
8977 * node_specs IN - node resource request specifications
8978 * job_id IN - job ID for logging
8979 * tres_mc_ptr IN - job's multi-core options
8980 */
static void _set_sock_bits(struct job_resources *job_res, int node_inx,
			   int job_node_inx, sock_gres_t *sock_gres,
			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
	int core_offset, gres_cnt;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
	int c, i, g, l, rc, s;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int *used_sock = NULL, used_sock_cnt = 0;
	int *links_cnt = NULL, best_link_cnt = 0;

	job_specs = sock_gres->job_specs;
	node_specs = sock_gres->node_specs;
	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
				   &cores_per_socket_cnt);
	if (rc != SLURM_SUCCESS) {
		error("%s: Invalid socket/core count for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
	if (core_offset < 0) {
		error("%s: Invalid core offset for job %u on node %d",
		      __func__, job_id, node_inx);
		return;
	}
	/* Reconcile this plugin's socket count with the job allocation's */
	i = sock_gres->sock_cnt;
	if ((i != 0) && (i != sock_cnt)) {
		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
		      __func__, i, sock_cnt, job_id, node_inx);
		sock_cnt = MIN(sock_cnt, i);
	}

	xassert(job_res->core_bitmap);
	/* used_sock[s] != 0 iff the job was allocated any core on socket s */
	used_sock = xcalloc(sock_cnt, sizeof(int));
	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
	for (s = 0; s < sock_cnt; s++) {
		for (c = 0; c < cores_per_socket_cnt; c++) {
			i = (s * cores_per_socket_cnt) + c;
			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
				used_sock[s]++;
				used_sock_cnt++;
				break;
			}
		}
	}
	/*
	 * Reconcile requested sockets_per_node with the sockets actually
	 * holding allocated cores; used_sock[] is repurposed below to hold
	 * free GRES counts per socket while adjusting the socket set.
	 */
	if (tres_mc_ptr && tres_mc_ptr->sockets_per_node &&
	    (tres_mc_ptr->sockets_per_node != used_sock_cnt) &&
	    node_specs->gres_bit_alloc && sock_gres->bits_by_sock) {
		if (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
			/* Somehow we have too few sockets in job allocation */
			error("%s: Inconsistent requested/allocated socket count "
			      "(%d > %d) for job %u on node %d",
			      __func__, tres_mc_ptr->sockets_per_node,
			      used_sock_cnt, job_id, node_inx);
			/* Add unallocated sockets that can satisfy the need */
			for (s = 0; s < sock_cnt; s++) {
				if (used_sock[s] || !sock_gres->bits_by_sock[s])
					continue;
				/* Determine currently free GRES by socket */
				used_sock[s] = bit_set_count(
					sock_gres->bits_by_sock[s]) -
					bit_overlap(
						sock_gres->bits_by_sock[s],
						node_specs->gres_bit_alloc);
				if ((used_sock[s] == 0) ||
				    (used_sock[s] < job_specs->gres_per_socket)){
					used_sock[s] = 0;
				} else if (++used_sock_cnt ==
					   tres_mc_ptr->sockets_per_node) {
					break;
				}
			}
		} else {
			/* May have needed extra CPUs, exceeding socket count */
			debug("%s: Inconsistent requested/allocated socket count "
			      "(%d < %d) for job %u on node %d",
			      __func__, tres_mc_ptr->sockets_per_node,
			      used_sock_cnt, job_id, node_inx);
			for (s = 0; s < sock_cnt; s++) {
				if (!used_sock[s] ||
				    !sock_gres->bits_by_sock[s])
					continue;
				/* Determine currently free GRES by socket */
				used_sock[s] = bit_set_count(
					sock_gres->bits_by_sock[s]) -
					bit_overlap(
						sock_gres->bits_by_sock[s],
						node_specs->gres_bit_alloc);
				if (used_sock[s] == 0)
					used_sock_cnt--;
			}
			/* Exclude sockets with low GRES counts */
			while (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
				int low_sock_inx = -1;
				/* Drop the socket with the fewest free GRES */
				for (s = sock_cnt - 1; s >= 0; s--) {
					if (used_sock[s] == 0)
						continue;
					if ((low_sock_inx == -1) ||
					    (used_sock[s] <
					     used_sock[low_sock_inx]))
						low_sock_inx = s;
				}
				if (low_sock_inx == -1)
					break;
				used_sock[low_sock_inx] = 0;
				used_sock_cnt--;
			}
		}
	}

	/*
	 * Identify the available GRES with best connectivity
	 * (i.e. higher link_cnt)
	 */
	if (node_specs->link_len == gres_cnt) {
		links_cnt = xcalloc(gres_cnt, sizeof(int));
		/* Sum link counts from every unallocated GRES */
		for (g = 0; g < gres_cnt; g++) {
			if (bit_test(node_specs->gres_bit_alloc, g))
				continue;
			for (l = 0; l < gres_cnt; l++) {
				if ((l == g) ||
				    bit_test(node_specs->gres_bit_alloc, l))
					continue;
				links_cnt[l] += node_specs->links_cnt[g][l];
			}
		}
		for (l = 0; l < gres_cnt; l++)
			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
		if (best_link_cnt > 4) {
			/* Scale down to reasonable iteration count (<= 4) */
			g = (best_link_cnt + 3) / 4;
			best_link_cnt = 0;
			for (l = 0; l < gres_cnt; l++) {
				links_cnt[l] /= g;
				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
			}
		}
	}

	/*
	 * Now pick specific GRES for these sockets.
	 * Try to use GRES with best connectivity (higher link_cnt values)
	 */
	for (s = 0; s < sock_cnt; s++) {
		if (!used_sock[s])
			continue;
		i = 0;	/* GRES selected so far on this socket */
		for (l = best_link_cnt;
		     ((l >= 0) && (i < job_specs->gres_per_socket)); l--) {
			for (g = 0; g < gres_cnt; g++) {
				if (!sock_gres->bits_by_sock ||
				    !sock_gres->bits_by_sock[s] ||
				    !bit_test(sock_gres->bits_by_sock[s], g))
					continue;  /* GRES not on this socket */
				if (node_specs->gres_bit_alloc &&
				    bit_test(node_specs->gres_bit_alloc, g))
					continue;   /* Already allocated GRES */
				if (job_specs->gres_bit_select[node_inx] &&
				    bit_test(job_specs->gres_bit_select[node_inx],
					     g))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx]++;
				if (++i == job_specs->gres_per_socket)
					break;
			}
		}
		if ((i < job_specs->gres_per_socket) &&
		    sock_gres->bits_any_sock) {
			/* Add GRES unconstrained by socket as needed */
			for (g = 0; g < gres_cnt; g++) {
				if (!sock_gres->bits_any_sock ||
				    !bit_test(sock_gres->bits_any_sock, g))
					continue;  /* GRES not on this socket */
				if (node_specs->gres_bit_alloc &&
				    bit_test(node_specs->gres_bit_alloc, g))
					continue;   /* Already allocated GRES */
				if (job_specs->gres_bit_select[node_inx] &&
				    bit_test(job_specs->gres_bit_select[node_inx],
					     g))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx]++;
				if (++i == job_specs->gres_per_socket)
					break;
			}
		}
	}
	xfree(links_cnt);
	xfree(used_sock);
}
9173
9174 /*
9175 * Select specific GRES (set GRES bitmap) for this job on this node based upon
9176 * per-task resource specification
9177 * job_res IN - job resource allocation
9178 * node_inx IN - global node index
9179 * job_node_inx IN - node index for this job's allocation
9180 * job_specs IN - job request specifications, UPDATED: set bits in
9181 * gres_bit_select
9182 * node_specs IN - node resource request specifications
9183 * job_id IN - job ID for logging
9184 * tres_mc_ptr IN - job's multi-core options
9185 */
static void _set_task_bits(struct job_resources *job_res, int node_inx,
			   int job_node_inx, sock_gres_t *sock_gres,
			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr,
			   uint32_t **tasks_per_node_socket)
{
	uint16_t sock_cnt = 0;
	int gres_cnt, g, l, s;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	uint32_t total_tasks = 0;
	uint64_t total_gres_cnt = 0, total_gres_goal;
	int *links_cnt = NULL, best_link_cnt = 0;

	job_specs = sock_gres->job_specs;
	node_specs = sock_gres->node_specs;
	sock_cnt = sock_gres->sock_cnt;
	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
	/* Link tracking only possible when the link table matches gres_cnt */
	if (node_specs->link_len == gres_cnt)
		links_cnt = xcalloc(gres_cnt, sizeof(int));

	/* First pick GRES for active sockets */
	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
	     s < sock_cnt; s++) {
		/*
		 * FIX: use (s >= 0) so socket 0 is also skipped when it has
		 * no tasks (the old (s > 0) test exempted socket 0), and
		 * guard the accumulation so s == -1 never indexes
		 * tasks_per_node_socket[node_inx][-1] (out-of-bounds read).
		 */
		if ((s >= 0) &&
		    (!tasks_per_node_socket[node_inx] ||
		     (tasks_per_node_socket[node_inx][s] == 0)))
			continue;	/* No tasks on this socket */
		if (s >= 0)
			total_tasks += tasks_per_node_socket[node_inx][s];
		total_gres_goal = total_tasks * job_specs->gres_per_task;
		for (g = 0; g < gres_cnt; g++) {
			if (total_gres_cnt >= total_gres_goal)
				break;
			if ((s == -1) &&
			    (!sock_gres->bits_any_sock ||
			     !bit_test(sock_gres->bits_any_sock, g)))
				continue;  /* GRES not avail any sock */
			if ((s >= 0) &&
			    (!sock_gres->bits_by_sock ||
			     !sock_gres->bits_by_sock[s] ||
			     !bit_test(sock_gres->bits_by_sock[s], g)))
				continue;   /* GRES not on this socket */
			/*
			 * Single combined availability test (the original
			 * tested gres_bit_alloc twice in a row).
			 */
			if (bit_test(node_specs->gres_bit_alloc, g) ||
			    bit_test(job_specs->gres_bit_select[node_inx], g))
				continue;   /* Already allocated GRES */
			bit_set(job_specs->gres_bit_select[node_inx], g);
			job_specs->gres_cnt_node_select[node_inx]++;
			total_gres_cnt++;
			/* Accumulate link counts relative to selected GRES */
			for (l = 0; links_cnt && (l < gres_cnt); l++) {
				if ((l == g) ||
				    bit_test(node_specs->gres_bit_alloc, l))
					continue;
				links_cnt[l] += node_specs->links_cnt[g][l];
			}
		}
	}

	if (links_cnt) {
		for (l = 0; l < gres_cnt; l++)
			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
		if (best_link_cnt > 4) {
			/* Scale down to reasonable iteration count (<= 4) */
			g = (best_link_cnt + 3) / 4;
			best_link_cnt = 0;
			for (l = 0; l < gres_cnt; l++) {
				links_cnt[l] /= g;
				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
			}
		}
	}

	/*
	 * Next pick additional GRES as needed. Favor use of GRES which
	 * are best linked to GRES which have already been selected.
	 */
	total_gres_goal = total_tasks * job_specs->gres_per_task;
	for (l = best_link_cnt;
	     ((l >= 0) && (total_gres_cnt < total_gres_goal)); l--) {
		for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
		     ((s < sock_cnt) && (total_gres_cnt < total_gres_goal));
		     s++) {
			for (g = 0;
			     ((g < gres_cnt) &&
			      (total_gres_cnt < total_gres_goal)); g++) {
				if (links_cnt && (links_cnt[g] < l))
					continue;
				if ((s == -1) &&
				    (!sock_gres->bits_any_sock ||
				     !bit_test(sock_gres->bits_any_sock, g)))
					continue;  /* GRES not avail any sock */
				if ((s >= 0) &&
				    (!sock_gres->bits_by_sock ||
				     !sock_gres->bits_by_sock[s] ||
				     !bit_test(sock_gres->bits_by_sock[s], g)))
					continue;  /* GRES not on this socket */
				if (bit_test(node_specs->gres_bit_alloc, g) ||
				    bit_test(job_specs->gres_bit_select[node_inx],
					     g))
					continue;   /* Already allocated GRES */
				bit_set(job_specs->gres_bit_select[node_inx],g);
				job_specs->gres_cnt_node_select[node_inx]++;
				total_gres_cnt++;
			}
		}
	}
	xfree(links_cnt);

	if (total_gres_cnt < total_gres_goal) {
		/* Something bad happened on task layout for this GRES type */
		error("%s: Insufficient gres/%s allocated for job %u on node_inx %u "
		      "(%"PRIu64" < %"PRIu64")", __func__,
		      sock_gres->gres_name, job_id, node_inx,
		      total_gres_cnt, total_gres_goal);
	}
}
9302
9303 /* Build array to identify task count for each node-socket pair */
static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
					     uint8_t overcommit,
					     gres_mc_data_t *tres_mc_ptr,
					     node_record_t *node_table_ptr)
{
	uint32_t **tasks_per_node_socket;
	int i, i_first, i_last, j, node_cnt, job_node_inx = 0;
	int c, s, core_offset;
	int cpus_per_task = 1, cpus_per_node, cpus_per_core;
	int task_per_node_limit = 0;
	int32_t rem_tasks, excess_tasks;
	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;

	rem_tasks = tres_mc_ptr->ntasks_per_job;
	node_cnt = bit_size(job_res->node_bitmap);
	tasks_per_node_socket = xcalloc(node_cnt, sizeof(uint32_t *));
	i_first = bit_ffs(job_res->node_bitmap);
	if (i_first != -1)
		i_last  = bit_fls(job_res->node_bitmap);
	else
		i_last = -2;	/* Empty bitmap: keep the loop body unreached */
	for (i = i_first; i <= i_last; i++) {
		int tasks_per_node = 0;
		if (!bit_test(job_res->node_bitmap, i))
			continue;
		if (get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
					  &cores_per_socket_cnt)) {
			error("%s: failed to get socket/core count", __func__);
			/* Set default of 1 task on socket 0 */
			tasks_per_node_socket[i] = xmalloc(sizeof(uint32_t));
			tasks_per_node_socket[i][0] = 1;
			rem_tasks--;
			continue;
		}
		tasks_per_node_socket[i] = xcalloc(sock_cnt, sizeof(uint32_t));
		/* Determine this node's task limit, by decreasing priority */
		if (tres_mc_ptr->ntasks_per_node) {
			task_per_node_limit = tres_mc_ptr->ntasks_per_node;
		} else if (job_res->tasks_per_node &&
			   job_res->tasks_per_node[job_node_inx]) {
			task_per_node_limit =
				job_res->tasks_per_node[job_node_inx];
		} else {
			/*
			 * NOTE: We should never get here.
			 * cpus_per_node reports CPUs actually used by this
			 * job on this node. Divide by cpus_per_task to yield
			 * valid task count on this node. This can be bad on
			 * cores with more than one thread and job fails to
			 * use all threads.
			 */
			error("%s: tasks_per_node not set", __func__);
			cpus_per_node = get_job_resources_cpus(job_res,
							       job_node_inx);
			if (cpus_per_node < 1) {
				error("%s: failed to get cpus_per_node count",
				      __func__);
				/* Set default of 1 task on socket 0 */
				tasks_per_node_socket[i][0] = 1;
				rem_tasks--;
				continue;
			}
			if (tres_mc_ptr->cpus_per_task)
				cpus_per_task = tres_mc_ptr->cpus_per_task;
			else
				cpus_per_task = 1;
			task_per_node_limit = cpus_per_node / cpus_per_task;
		}
		core_offset = get_job_resources_offset(job_res, job_node_inx++,
						       0, 0);
		if (node_table_ptr[i].cores) {
			cpus_per_core = node_table_ptr[i].cpus /
					node_table_ptr[i].cores;
		} else
			cpus_per_core = 1;
		/* Distribute tasks over this node's allocated cores */
		for (s = 0; s < sock_cnt; s++) {
			int tasks_per_socket = 0, tpc, skip_cores = 0;
			for (c = 0; c < cores_per_socket_cnt; c++) {
				j = (s * cores_per_socket_cnt) + c;
				j += core_offset;
				if (!bit_test(job_res->core_bitmap, j))
					continue;
				/* Multi-core tasks consume several cores */
				if (skip_cores > 0) {
					skip_cores--;
					continue;
				}
				if (tres_mc_ptr->ntasks_per_core) {
					tpc = tres_mc_ptr->ntasks_per_core;
				} else {
					tpc = cpus_per_core / cpus_per_task;
					if (tpc < 1) {
						tpc = 1;
						skip_cores = cpus_per_task /
							     cpus_per_core;
						skip_cores--; /* This core */
					}
					/* Start with 1 task per core */
				}
				tasks_per_node_socket[i][s] += tpc;
				tasks_per_node += tpc;
				tasks_per_socket += tpc;
				rem_tasks -= tpc;
				/* Claw back any tasks beyond the node limit */
				if (task_per_node_limit) {
					if (tasks_per_node >
					    task_per_node_limit) {
						excess_tasks = tasks_per_node -
							task_per_node_limit;
						tasks_per_node_socket[i][s] -=
							excess_tasks;
						rem_tasks += excess_tasks;
					}
					if (tasks_per_node >=
					    task_per_node_limit) {
						s = sock_cnt;
						break;
					}
				}
				/* NOTE: No support for ntasks_per_board */
				/* Likewise enforce the per-socket task limit */
				if (tres_mc_ptr->ntasks_per_socket) {
					if (tasks_per_socket >
					    tres_mc_ptr->ntasks_per_socket) {
						excess_tasks = tasks_per_socket-
							tres_mc_ptr->ntasks_per_socket;
						tasks_per_node_socket[i][s] -=
							excess_tasks;
						rem_tasks += excess_tasks;
					}
					if (tasks_per_socket >=
					    tres_mc_ptr->ntasks_per_socket) {
						break;
					}
				}
			}
		}
	}
	/*
	 * Spread any remaining tasks round-robin over allocated cores.
	 * NOTE(review): this pass reuses sock_cnt/cores_per_socket_cnt from
	 * the LAST node iterated above and computes j without core_offset —
	 * confirm this is acceptable for heterogeneous allocations.
	 */
	while ((rem_tasks > 0) && overcommit) {
		for (i = i_first; (rem_tasks > 0) && (i <= i_last); i++) {
			if (!bit_test(job_res->node_bitmap, i))
				continue;
			for (s = 0; (rem_tasks > 0) && (s < sock_cnt); s++) {
				for (c = 0; c < cores_per_socket_cnt; c++) {
					j = (s * cores_per_socket_cnt) + c;
					if (!bit_test(job_res->core_bitmap, j))
						continue;
					tasks_per_node_socket[i][s]++;
					rem_tasks--;
					break;
				}
			}
		}
	}
	if (rem_tasks > 0)	/* This should never happen */
		error("%s: rem_tasks not zero (%d > 0)", __func__, rem_tasks);

	return tasks_per_node_socket;
}
9459
/* Release the per-node/per-socket task count table built by
 * _build_tasks_per_node_sock(). Safe to call with a NULL table. */
static void _free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
				      int node_cnt)
{
	int node_inx;

	if (tasks_per_node_socket == NULL)
		return;

	/* Free each node's socket array, then the outer table */
	for (node_inx = 0; node_inx < node_cnt; node_inx++)
		xfree(tasks_per_node_socket[node_inx]);
	xfree(tasks_per_node_socket);
}
9472
9473 /* Return the count of tasks for a job on a given node */
_get_task_cnt_node(uint32_t ** tasks_per_node_socket,int node_inx,int sock_cnt)9474 static uint32_t _get_task_cnt_node(uint32_t **tasks_per_node_socket,
9475 int node_inx, int sock_cnt)
9476 {
9477 uint32_t task_cnt = 0;
9478 int s;
9479
9480 if (!tasks_per_node_socket || !tasks_per_node_socket[node_inx]) {
9481 error("%s: tasks_per_node_socket is NULL", __func__);
9482 return 1; /* Best guess if no data structure */
9483 }
9484
9485 for (s = 0; s < sock_cnt; s++)
9486 task_cnt += tasks_per_node_socket[node_inx][s];
9487
9488 return task_cnt;
9489 }
9490
9491 /* Determine maximum GRES allocation count on this node; no topology */
_get_job_cnt(sock_gres_t * sock_gres,gres_node_state_t * node_specs,int rem_node_cnt)9492 static uint64_t _get_job_cnt(sock_gres_t *sock_gres,
9493 gres_node_state_t *node_specs, int rem_node_cnt)
9494 {
9495 uint64_t avail_gres, max_gres;
9496 gres_job_state_t *job_specs = sock_gres->job_specs;
9497
9498 avail_gres = node_specs->gres_cnt_avail - node_specs->gres_cnt_alloc;
9499 /* Ensure at least one GRES per node on remaining nodes */
9500 max_gres = job_specs->gres_per_job - job_specs->total_gres -
9501 (rem_node_cnt - 1);
9502 max_gres = MIN(avail_gres, max_gres);
9503
9504 return max_gres;
9505 }
9506
9507 /* Return count of GRES on this node */
_get_gres_node_cnt(gres_node_state_t * node_specs,int node_inx)9508 static int _get_gres_node_cnt(gres_node_state_t *node_specs, int node_inx)
9509 {
9510 int i, gres_cnt = 0;
9511
9512 if (node_specs->gres_bit_alloc) {
9513 gres_cnt = bit_size(node_specs->gres_bit_alloc);
9514 return gres_cnt;
9515 }
9516
9517 /* This logic should be redundant */
9518 if (node_specs->topo_gres_bitmap && node_specs->topo_gres_bitmap[0]) {
9519 gres_cnt = bit_size(node_specs->topo_gres_bitmap[0]);
9520 return gres_cnt;
9521 }
9522
9523 /* This logic should also be redundant */
9524 gres_cnt = 0;
9525 for (i = 0; i < node_specs->topo_cnt; i++)
9526 gres_cnt += node_specs->topo_gres_cnt_avail[i];
9527 return gres_cnt;
9528 }
9529
9530 /*
9531 * Make final GRES selection for the job
9532 * sock_gres_list IN - per-socket GRES details, one record per allocated node
9533 * job_id IN - job ID for logging
9534 * job_res IN - job resource allocation
9535 * overcommit IN - job's ability to overcommit resources
9536 * tres_mc_ptr IN - job's multi-core options
9537 * node_table_ptr IN - slurmctld's node records
9538 * RET SLURM_SUCCESS or error code
9539 */
extern int gres_plugin_job_core_filter4(List *sock_gres_list, uint32_t job_id,
					struct job_resources *job_res,
					uint8_t overcommit,
					gres_mc_data_t *tres_mc_ptr,
					node_record_t *node_table_ptr)
{
	ListIterator sock_gres_iter;
	sock_gres_t *sock_gres;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	/* i indexes the global node table (job_res->node_bitmap);
	 * node_inx indexes the per-allocated-node sock_gres_list array */
	int i, i_first, i_last, node_inx = -1, gres_cnt;
	int node_cnt, rem_node_cnt;
	int job_fini = -1;	/* -1: not applicable, 0: more work, 1: fini */
	uint32_t **tasks_per_node_socket = NULL;
	int rc = SLURM_SUCCESS;

	if (!job_res || !job_res->node_bitmap)
		return SLURM_ERROR;

	node_cnt = bit_size(job_res->node_bitmap);
	rem_node_cnt = bit_set_count(job_res->node_bitmap);
	i_first = bit_ffs(job_res->node_bitmap);
	if (i_first != -1)
		i_last = bit_fls(job_res->node_bitmap);
	else
		i_last = -2;	/* Empty bitmap: node loop never executes */
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_res->node_bitmap, i))
			continue;
		sock_gres_iter =
			list_iterator_create(sock_gres_list[++node_inx]);
		while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))){
			job_specs = sock_gres->job_specs;
			node_specs = sock_gres->node_specs;
			if (!job_specs || !node_specs)
				continue;
			/* Lazily build the task distribution table only once
			 * a gres-per-task request makes it necessary */
			if (job_specs->gres_per_task &&	/* Data needed */
			    !tasks_per_node_socket) {	/* Not built yet */
				tasks_per_node_socket =
					_build_tasks_per_node_sock(job_res,
								overcommit,
								tres_mc_ptr,
								node_table_ptr);
			}
			if (job_specs->total_node_cnt == 0) {
				job_specs->total_node_cnt = node_cnt;
				job_specs->total_gres = 0;
			}
			if (!job_specs->gres_cnt_node_select) {
				job_specs->gres_cnt_node_select =
					xcalloc(node_cnt, sizeof(uint64_t));
			}
			if (i == i_first)	/* Reinitialize counter */
				job_specs->total_gres = 0;

			if (node_specs->topo_cnt == 0) {
				/* No topology, just set a count.
				 * Select by whichever GRES specification the
				 * job supplied: per-node, per-socket,
				 * per-task or per-job. */
				if (job_specs->gres_per_node) {
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_node;
				} else if (job_specs->gres_per_socket) {
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_socket;
					job_specs->gres_cnt_node_select[i] *=
						_get_sock_cnt(job_res, i,
							      node_inx);
				} else if (job_specs->gres_per_task) {
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_task;
					job_specs->gres_cnt_node_select[i] *=
						_get_task_cnt_node(
						tasks_per_node_socket, i,
						node_table_ptr[i].sockets);
				} else if (job_specs->gres_per_job) {
					job_specs->gres_cnt_node_select[i] =
						_get_job_cnt(sock_gres,
							     node_specs,
							     rem_node_cnt);
				}
				job_specs->total_gres +=
					job_specs->gres_cnt_node_select[i];
				continue;
			}

			/* Working with topology, need to pick specific GRES */
			if (!job_specs->gres_bit_select) {
				job_specs->gres_bit_select =
					xcalloc(node_cnt, sizeof(bitstr_t *));
			}
			gres_cnt = _get_gres_node_cnt(node_specs, node_inx);
			FREE_NULL_BITMAP(job_specs->gres_bit_select[i]);
			job_specs->gres_bit_select[i] = bit_alloc(gres_cnt);
			job_specs->gres_cnt_node_select[i] = 0;

			if (job_specs->gres_per_node &&
			    _shared_gres(sock_gres->plugin_id)) {
				/* gres/mps: select specific topo bit for job */
				_pick_specific_topo(job_res, i, node_inx,
						    sock_gres, job_id,
						    tres_mc_ptr);
			} else if (job_specs->gres_per_node) {
				_set_node_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr);
			} else if (job_specs->gres_per_socket) {
				_set_sock_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr);
			} else if (job_specs->gres_per_task) {
				_set_task_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr,
					       tasks_per_node_socket);
			} else if (job_specs->gres_per_job) {
				/* CPUs per core derived from the node's
				 * hardware layout (threads per core) */
				uint16_t cpus_per_core;
				cpus_per_core = node_table_ptr[i].cpus /
						node_table_ptr[i].boards /
						node_table_ptr[i].sockets /
						node_table_ptr[i].cores;
				job_fini = _set_job_bits1(job_res, i, node_inx,
							  rem_node_cnt, sock_gres,
							  job_id, tres_mc_ptr,
							  cpus_per_core);
			} else {
				error("%s job %u job_spec lacks GRES counter",
				      __func__, job_id);
			}
			if (job_fini == -1) {
				/*
				 * _set_job_bits1() updates total_gres counter,
				 * this handles the other cases.
				 */
				job_specs->total_gres +=
					job_specs->gres_cnt_node_select[i];
			}
		}
		rem_node_cnt--;
		list_iterator_destroy(sock_gres_iter);
	}

	if (job_fini == 0) {
		/*
		 * Need more GRES to satisfy gres-per-job option with bitmaps.
		 * This logic will make use of GRES that are not on allocated
		 * sockets and are thus generally less desirable to use.
		 */
		node_inx = -1;
		for (i = i_first; i <= i_last; i++) {
			if (!bit_test(job_res->node_bitmap, i))
				continue;
			sock_gres_iter =
				list_iterator_create(sock_gres_list[++node_inx]);
			while ((sock_gres = (sock_gres_t *)
					    list_next(sock_gres_iter))) {
				job_specs = sock_gres->job_specs;
				node_specs = sock_gres->node_specs;
				if (!job_specs || !node_specs)
					continue;
				job_fini = _set_job_bits2(job_res, i, node_inx,
							  sock_gres, job_id,
							  tres_mc_ptr);
				if (job_fini == 1)
					break;
			}
			list_iterator_destroy(sock_gres_iter);
			if (job_fini == 1)
				break;
		}
		if (job_fini == 0) {
			error("%s job %u failed to satisfy gres-per-job counter",
			      __func__, job_id);
			rc = ESLURM_NODE_NOT_AVAIL;
		}
	}
	_free_tasks_per_node_sock(tasks_per_node_socket, node_cnt);

	return rc;
}
9715
9716 /*
9717 * Determine if job GRES specification includes a tres-per-task specification
9718 * RET TRUE if any GRES requested by the job include a tres-per-task option
9719 */
gres_plugin_job_tres_per_task(List job_gres_list)9720 extern bool gres_plugin_job_tres_per_task(List job_gres_list)
9721 {
9722 ListIterator job_gres_iter;
9723 gres_state_t *job_gres_ptr;
9724 gres_job_state_t *job_data_ptr;
9725 bool have_gres_per_task = false;
9726
9727 if (!job_gres_list)
9728 return false;
9729
9730 job_gres_iter = list_iterator_create(job_gres_list);
9731 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9732 job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9733 if (job_data_ptr->gres_per_task == 0)
9734 continue;
9735 have_gres_per_task = true;
9736 break;
9737 }
9738 list_iterator_destroy(job_gres_iter);
9739
9740 return have_gres_per_task;
9741 }
9742
9743 /*
9744 * Determine if the job GRES specification includes a mem-per-tres specification
9745 * RET largest mem-per-tres specification found
9746 */
gres_plugin_job_mem_max(List job_gres_list)9747 extern uint64_t gres_plugin_job_mem_max(List job_gres_list)
9748 {
9749 ListIterator job_gres_iter;
9750 gres_state_t *job_gres_ptr;
9751 gres_job_state_t *job_data_ptr;
9752 uint64_t mem_max = 0, mem_per_gres;
9753
9754 if (!job_gres_list)
9755 return 0;
9756
9757 job_gres_iter = list_iterator_create(job_gres_list);
9758 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9759 job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9760 if (job_data_ptr->mem_per_gres)
9761 mem_per_gres = job_data_ptr->mem_per_gres;
9762 else
9763 mem_per_gres = job_data_ptr->def_mem_per_gres;
9764 mem_max = MAX(mem_max, mem_per_gres);
9765 }
9766 list_iterator_destroy(job_gres_iter);
9767
9768 return mem_max;
9769 }
9770
9771 /*
9772 * Set per-node memory limits based upon GRES assignments
9773 * RET TRUE if mem-per-tres specification used to set memory limits
9774 */
extern bool gres_plugin_job_mem_set(List job_gres_list,
				    job_resources_t *job_res)
{
	ListIterator job_gres_iter;
	gres_state_t *job_gres_ptr;
	gres_job_state_t *job_data_ptr;
	/* first_set: first qualifying GRES record overwrites any prior
	 * memory_allocated values; later records accumulate onto them */
	bool rc = false, first_set = true;
	uint64_t gres_cnt, mem_size, mem_per_gres;
	int i, i_first, i_last, node_off;

	if (!job_gres_list)
		return false;

	i_first = bit_ffs(job_res->node_bitmap);
	if (i_first < 0)
		return false;	/* No nodes allocated */
	i_last = bit_fls(job_res->node_bitmap);

	job_gres_iter = list_iterator_create(job_gres_list);
	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
		/* Explicit request takes precedence over configured default */
		if (job_data_ptr->mem_per_gres)
			mem_per_gres = job_data_ptr->mem_per_gres;
		else
			mem_per_gres = job_data_ptr->def_mem_per_gres;
		/*
		 * The logic below is correct because the only mem_per_gres
		 * is --mem-per-gpu adding another option will require change
		 * to take MAX of mem_per_gres for all types.
		 */
		if ((mem_per_gres == 0) || !job_data_ptr->gres_cnt_node_select)
			continue;
		rc = true;
		/* node_off indexes job_res arrays (allocated nodes only);
		 * i indexes the global node table */
		node_off = -1;
		for (i = i_first; i <= i_last; i++) {
			if (!bit_test(job_res->node_bitmap, i))
				continue;
			node_off++;
			if (job_res->whole_node == 1) {
				/* Whole node allocated: charge memory for
				 * every GRES on the node, not just those
				 * selected for the job */
				gres_state_t *node_gres_ptr;
				gres_node_state_t *node_state_ptr;

				node_gres_ptr = list_find_first(
					node_record_table_ptr[i].gres_list,
					_gres_find_id,
					&job_gres_ptr->plugin_id);
				if (!node_gres_ptr)
					continue;
				node_state_ptr = node_gres_ptr->gres_data;
				gres_cnt = node_state_ptr->gres_cnt_avail;
			} else
				gres_cnt =
					job_data_ptr->gres_cnt_node_select[i];
			mem_size = mem_per_gres * gres_cnt;
			if (first_set)
				job_res->memory_allocated[node_off] = mem_size;
			else
				job_res->memory_allocated[node_off] += mem_size;
		}
		first_set = false;
	}
	list_iterator_destroy(job_gres_iter);

	return rc;
}
9840
9841 /*
 * Determine the minimum number of CPUs required to satisfy the job's GRES
9843 * request (based upon total GRES times cpus_per_gres value)
9844 * node_count IN - count of nodes in job allocation
9845 * sockets_per_node IN - count of sockets per node in job allocation
9846 * task_count IN - count of tasks in job allocation
9847 * job_gres_list IN - job GRES specification
9848 * RET count of required CPUs for the job
9849 */
extern int gres_plugin_job_min_cpus(uint32_t node_count,
				    uint32_t sockets_per_node,
				    uint32_t task_count,
				    List job_gres_list)
{
	ListIterator iter;
	gres_state_t *gres_ptr;
	int min_cpus = 0;

	if (!job_gres_list || (list_count(job_gres_list) == 0))
		return 0;

	iter = list_iterator_create(job_gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(iter))) {
		int req_cpus;
		uint64_t total_gres;
		gres_job_state_t *job_data =
			(gres_job_state_t *) gres_ptr->gres_data;
		/* Explicit request takes precedence over configured default */
		uint16_t cpus_per_gres = job_data->cpus_per_gres ?
			job_data->cpus_per_gres : job_data->def_cpus_per_gres;
		if (cpus_per_gres == 0)
			continue;
		/* Derive the job-wide GRES total from whichever
		 * specification was supplied */
		if (job_data->gres_per_job)
			total_gres = job_data->gres_per_job;
		else if (job_data->gres_per_node)
			total_gres = job_data->gres_per_node * node_count;
		else if (job_data->gres_per_socket)
			total_gres = job_data->gres_per_socket *
				     node_count * sockets_per_node;
		else if (job_data->gres_per_task)
			total_gres = job_data->gres_per_task * task_count;
		else
			continue;
		req_cpus = cpus_per_gres * total_gres;
		min_cpus = MAX(min_cpus, req_cpus);
	}
	list_iterator_destroy(iter);
	return min_cpus;
}
9892
9893 /*
 * Determine the minimum number of CPUs required to satisfy the job's GRES
9895 * request on one node
9896 * sockets_per_node IN - count of sockets per node in job allocation
9897 * tasks_per_node IN - count of tasks per node in job allocation
9898 * job_gres_list IN - job GRES specification
9899 * RET count of required CPUs for the job
9900 */
extern int gres_plugin_job_min_cpu_node(uint32_t sockets_per_node,
					uint32_t tasks_per_node,
					List job_gres_list)
{
	ListIterator job_gres_iter;
	gres_state_t *job_gres_ptr;
	gres_job_state_t *job_data_ptr;
	int tmp, min_cpus = 0;
	uint16_t cpus_per_gres;

	if (!job_gres_list || (list_count(job_gres_list) == 0))
		return 0;

	job_gres_iter = list_iterator_create(job_gres_list);
	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
		uint64_t total_gres = 0;
		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
		/* Explicit request takes precedence over configured default */
		if (job_data_ptr->cpus_per_gres)
			cpus_per_gres = job_data_ptr->cpus_per_gres;
		else
			cpus_per_gres = job_data_ptr->def_cpus_per_gres;
		if (cpus_per_gres == 0)
			continue;
		/* Derive per-node GRES count from whichever specification
		 * was supplied; assume 1 when no count is given */
		if (job_data_ptr->gres_per_node) {
			total_gres = job_data_ptr->gres_per_node;
		} else if (job_data_ptr->gres_per_socket) {
			total_gres = job_data_ptr->gres_per_socket *
				     sockets_per_node;
		} else if (job_data_ptr->gres_per_task) {
			total_gres = job_data_ptr->gres_per_task *
				     tasks_per_node;
		} else
			total_gres = 1;
		tmp = cpus_per_gres * total_gres;
		min_cpus = MAX(min_cpus, tmp);
	}
	/* Fix: the iterator was previously leaked here; destroy it as the
	 * sibling gres_plugin_job_min_cpus() does */
	list_iterator_destroy(job_gres_iter);
	return min_cpus;
}
9939
9940 /*
9941 * Determine if specific GRES index on node is available to a job's allocated
9942 * cores
9943 * IN core_bitmap - bitmap of cores allocated to the job on this node
9944 * IN/OUT alloc_core_bitmap - cores already allocated, NULL if don't care,
9945 * updated when the function returns true
9946 * IN node_gres_ptr - GRES data for this node
9947 * IN gres_inx - index of GRES being considered for use
9948 * IN job_gres_ptr - GRES data for this job
9949 * RET true if available to those core, false otherwise
9950 */
static bool _cores_on_gres(bitstr_t *core_bitmap, bitstr_t *alloc_core_bitmap,
			   gres_node_state_t *node_gres_ptr, int gres_inx,
			   gres_job_state_t *job_gres_ptr)
{
	int i, avail_cores;

	/* With no core restriction or no topology data, any GRES is usable */
	if ((core_bitmap == NULL) || (node_gres_ptr->topo_cnt == 0))
		return true;

	/* Find the topology record covering this GRES index and test its
	 * core affinity against the job's allocated cores */
	for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
		if (!node_gres_ptr->topo_gres_bitmap[i])
			continue;
		if (bit_size(node_gres_ptr->topo_gres_bitmap[i]) < gres_inx)
			continue;
		if (!bit_test(node_gres_ptr->topo_gres_bitmap[i], gres_inx))
			continue;
		/* Skip topology records of the wrong GRES type */
		if (job_gres_ptr->type_name &&
		    (!node_gres_ptr->topo_type_name[i] ||
		     (job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i])))
			continue;
		/* No core binding recorded: GRES usable from any core */
		if (!node_gres_ptr->topo_core_bitmap[i])
			return true;
		if (bit_size(node_gres_ptr->topo_core_bitmap[i]) !=
		    bit_size(core_bitmap))
			break;	/* Inconsistent bitmaps, give up */
		avail_cores = bit_overlap(node_gres_ptr->topo_core_bitmap[i],
					  core_bitmap);
		if (avail_cores && alloc_core_bitmap) {
			/* Prefer cores not yet claimed by other GRES; if any
			 * remain, mark this GRES's cores as allocated */
			avail_cores -= bit_overlap(node_gres_ptr->
						   topo_core_bitmap[i],
						   alloc_core_bitmap);
			if (avail_cores) {
				bit_or(alloc_core_bitmap,
				       node_gres_ptr->topo_core_bitmap[i]);
			}
		}
		if (avail_cores)
			return true;
	}
	return false;
}
9992
9993 /* Clear any vestigial job gres state. This may be needed on job requeue. */
gres_plugin_job_clear(List job_gres_list)9994 extern void gres_plugin_job_clear(List job_gres_list)
9995 {
9996 int i;
9997 ListIterator job_gres_iter;
9998 gres_state_t *job_gres_ptr;
9999 gres_job_state_t *job_state_ptr;
10000
10001 if (job_gres_list == NULL)
10002 return;
10003
10004 (void) gres_plugin_init();
10005 slurm_mutex_lock(&gres_context_lock);
10006 job_gres_iter = list_iterator_create(job_gres_list);
10007 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
10008 job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
10009 for (i = 0; i < job_state_ptr->node_cnt; i++) {
10010 if (job_state_ptr->gres_bit_alloc) {
10011 FREE_NULL_BITMAP(job_state_ptr->
10012 gres_bit_alloc[i]);
10013 }
10014 if (job_state_ptr->gres_bit_step_alloc) {
10015 FREE_NULL_BITMAP(job_state_ptr->
10016 gres_bit_step_alloc[i]);
10017 }
10018 }
10019 xfree(job_state_ptr->gres_bit_alloc);
10020 xfree(job_state_ptr->gres_bit_step_alloc);
10021 xfree(job_state_ptr->gres_cnt_step_alloc);
10022 xfree(job_state_ptr->gres_cnt_node_alloc);
10023 job_state_ptr->node_cnt = 0;
10024 }
10025 list_iterator_destroy(job_gres_iter);
10026 slurm_mutex_unlock(&gres_context_lock);
10027 }
10028
_job_alloc(void * job_gres_data,void * node_gres_data,int node_cnt,int node_index,int node_offset,char * gres_name,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t plugin_id,uint32_t user_id)10029 static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt,
10030 int node_index, int node_offset, char *gres_name,
10031 uint32_t job_id, char *node_name,
10032 bitstr_t *core_bitmap, uint32_t plugin_id,
10033 uint32_t user_id)
10034 {
10035 int j, sz1, sz2;
10036 int64_t gres_cnt, i;
10037 gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
10038 gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
10039 bool type_array_updated = false;
10040 bitstr_t *alloc_core_bitmap = NULL;
10041 uint64_t gres_per_bit = 1;
10042 bool log_cnt_err = true;
10043 char *log_type;
10044 bool shared_gres = false, use_busy_dev = false;
10045
10046 /*
10047 * Validate data structures. Either job_gres_data->node_cnt and
10048 * job_gres_data->gres_bit_alloc are both set or both zero/NULL.
10049 */
10050 xassert(node_cnt);
10051 xassert(node_offset >= 0);
10052 xassert(job_gres_ptr);
10053 xassert(node_gres_ptr);
10054
10055 if (node_gres_ptr->no_consume) {
10056 job_gres_ptr->total_gres = NO_CONSUME_VAL64;
10057 return SLURM_SUCCESS;
10058 }
10059
10060 if (_shared_gres(plugin_id)) {
10061 shared_gres = true;
10062 gres_per_bit = job_gres_ptr->gres_per_node;
10063 }
10064 if ((plugin_id == mps_plugin_id) &&
10065 (node_gres_ptr->gres_cnt_alloc != 0)) {
10066 /* We must use the ONE already active GRES of this type */
10067 use_busy_dev = true;
10068 }
10069
10070 if (job_gres_ptr->type_name && !job_gres_ptr->type_name[0])
10071 xfree(job_gres_ptr->type_name);
10072
10073 xfree(node_gres_ptr->gres_used); /* Clear cache */
10074 if (job_gres_ptr->node_cnt == 0) {
10075 job_gres_ptr->node_cnt = node_cnt;
10076 if (job_gres_ptr->gres_bit_alloc) {
10077 error("gres/%s: job %u node_cnt==0 and gres_bit_alloc is set",
10078 gres_name, job_id);
10079 xfree(job_gres_ptr->gres_bit_alloc);
10080 }
10081 }
10082 /*
10083 * These next 2 checks were added long before job resizing was allowed.
10084 * They are not errors as we need to keep the original size around for
10085 * any steps that might still be out there with the larger size. If the
10086 * job was sized up the gres_plugin_job_merge() function handles the
10087 * resize so we are set there.
10088 */
10089 else if (job_gres_ptr->node_cnt < node_cnt) {
10090 debug2("gres/%s: job %u node_cnt is now larger than it was when allocated from %u to %d",
10091 gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
10092 if (node_offset >= job_gres_ptr->node_cnt)
10093 return SLURM_ERROR;
10094 } else if (job_gres_ptr->node_cnt > node_cnt) {
10095 debug2("gres/%s: job %u node_cnt is now smaller than it was when allocated %u to %d",
10096 gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
10097 }
10098
10099 if (!job_gres_ptr->gres_bit_alloc) {
10100 job_gres_ptr->gres_bit_alloc = xcalloc(node_cnt,
10101 sizeof(bitstr_t *));
10102 }
10103 if (!job_gres_ptr->gres_cnt_node_alloc) {
10104 job_gres_ptr->gres_cnt_node_alloc = xcalloc(node_cnt,
10105 sizeof(uint64_t));
10106 }
10107
10108 /*
10109 * select/cons_tres pre-selects the resources and we just need to update
10110 * the data structures to reflect the selected GRES.
10111 */
10112 if (job_gres_ptr->total_node_cnt) {
10113 /* Resuming job */
10114 if (job_gres_ptr->gres_cnt_node_alloc[node_offset]) {
10115 gres_cnt = job_gres_ptr->
10116 gres_cnt_node_alloc[node_offset];
10117 } else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10118 gres_cnt = bit_set_count(
10119 job_gres_ptr->gres_bit_alloc[node_offset]);
10120 gres_cnt *= gres_per_bit;
10121 /* Using pre-selected GRES */
10122 } else if (job_gres_ptr->gres_cnt_node_select &&
10123 job_gres_ptr->gres_cnt_node_select[node_index]) {
10124 gres_cnt = job_gres_ptr->
10125 gres_cnt_node_select[node_index];
10126 } else if (job_gres_ptr->gres_bit_select &&
10127 job_gres_ptr->gres_bit_select[node_index]) {
10128 gres_cnt = bit_set_count(
10129 job_gres_ptr->gres_bit_select[node_index]);
10130 gres_cnt *= gres_per_bit;
10131 } else {
10132 error("gres/%s: job %u node %s no resources selected",
10133 gres_name, job_id, node_name);
10134 return SLURM_ERROR;
10135 }
10136 } else {
10137 gres_cnt = job_gres_ptr->gres_per_node;
10138 }
10139
10140 /*
10141 * Check that sufficient resources exist on this node
10142 */
10143 job_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_cnt;
10144 i = node_gres_ptr->gres_cnt_alloc + gres_cnt;
10145 if (i > node_gres_ptr->gres_cnt_avail) {
10146 error("gres/%s: job %u node %s overallocated resources by %"
10147 PRIu64", (%"PRIu64" > %"PRIu64")",
10148 gres_name, job_id, node_name,
10149 i - node_gres_ptr->gres_cnt_avail,
10150 i, node_gres_ptr->gres_cnt_avail);
10151 /* proceed with request, give job what is available */
10152 }
10153
10154 if (!node_offset && job_gres_ptr->gres_cnt_step_alloc) {
10155 uint64_t *tmp = xcalloc(job_gres_ptr->node_cnt,
10156 sizeof(uint64_t));
10157 memcpy(tmp, job_gres_ptr->gres_cnt_step_alloc,
10158 sizeof(uint64_t) * MIN(node_cnt,
10159 job_gres_ptr->node_cnt));
10160 xfree(job_gres_ptr->gres_cnt_step_alloc);
10161 job_gres_ptr->gres_cnt_step_alloc = tmp;
10162 }
10163 if (job_gres_ptr->gres_cnt_step_alloc == NULL) {
10164 job_gres_ptr->gres_cnt_step_alloc =
10165 xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
10166 }
10167
10168 /*
10169 * Select and/or allocate specific resources for this job.
10170 */
10171 if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10172 /*
10173 * Restarted slurmctld with active job or resuming a suspended
10174 * job. In any case, the resources already selected.
10175 */
10176 if (node_gres_ptr->gres_bit_alloc == NULL) {
10177 node_gres_ptr->gres_bit_alloc =
10178 bit_copy(job_gres_ptr->
10179 gres_bit_alloc[node_offset]);
10180 node_gres_ptr->gres_cnt_alloc +=
10181 bit_set_count(node_gres_ptr->gres_bit_alloc);
10182 node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
10183 } else if (node_gres_ptr->gres_bit_alloc) {
10184 gres_cnt = (int64_t)MIN(
10185 bit_size(node_gres_ptr->gres_bit_alloc),
10186 bit_size(job_gres_ptr->
10187 gres_bit_alloc[node_offset]));
10188 for (i = 0; i < gres_cnt; i++) {
10189 if (bit_test(job_gres_ptr->
10190 gres_bit_alloc[node_offset], i) &&
10191 (shared_gres ||
10192 !bit_test(node_gres_ptr->gres_bit_alloc,
10193 i))) {
10194 bit_set(node_gres_ptr->gres_bit_alloc,i);
10195 node_gres_ptr->gres_cnt_alloc +=
10196 gres_per_bit;
10197 }
10198 }
10199 }
10200 } else if (job_gres_ptr->total_node_cnt &&
10201 job_gres_ptr->gres_bit_select &&
10202 job_gres_ptr->gres_bit_select[node_index] &&
10203 job_gres_ptr->gres_cnt_node_select) {
10204 /* Specific GRES already selected, update the node record */
10205 bool job_mod = false;
10206 sz1 = bit_size(job_gres_ptr->gres_bit_select[node_index]);
10207 sz2 = bit_size(node_gres_ptr->gres_bit_alloc);
10208 if (sz1 > sz2) {
10209 error("gres/%s: job %u node %s gres bitmap size bad (%d > %d)",
10210 gres_name, job_id, node_name, sz1, sz2);
10211 job_gres_ptr->gres_bit_select[node_index] =
10212 bit_realloc(
10213 job_gres_ptr->gres_bit_select[node_index], sz2);
10214 job_mod = true;
10215 } else if (sz1 < sz2) {
10216 error("gres/%s: job %u node %s gres bitmap size bad (%d < %d)",
10217 gres_name, job_id, node_name, sz1, sz2);
10218 job_gres_ptr->gres_bit_select[node_index] =
10219 bit_realloc(
10220 job_gres_ptr->gres_bit_select[node_index], sz2);
10221 }
10222
10223 if (!shared_gres &&
10224 bit_overlap_any(job_gres_ptr->gres_bit_select[node_index],
10225 node_gres_ptr->gres_bit_alloc)) {
10226 error("gres/%s: job %u node %s gres bitmap overlap",
10227 gres_name, job_id, node_name);
10228 bit_and_not(job_gres_ptr->gres_bit_select[node_index],
10229 node_gres_ptr->gres_bit_alloc);
10230 }
10231 job_gres_ptr->gres_bit_alloc[node_offset] =
10232 bit_copy(job_gres_ptr->gres_bit_select[node_index]);
10233 job_gres_ptr->gres_cnt_node_alloc[node_offset] =
10234 job_gres_ptr->gres_cnt_node_select[node_index];
10235 if (!node_gres_ptr->gres_bit_alloc) {
10236 node_gres_ptr->gres_bit_alloc =
10237 bit_copy(job_gres_ptr->
10238 gres_bit_alloc[node_offset]);
10239 } else {
10240 bit_or(node_gres_ptr->gres_bit_alloc,
10241 job_gres_ptr->gres_bit_alloc[node_offset]);
10242 }
10243 if (job_mod) {
10244 node_gres_ptr->gres_cnt_alloc =
10245 bit_set_count(node_gres_ptr->gres_bit_alloc);
10246 node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
10247 } else {
10248 node_gres_ptr->gres_cnt_alloc += gres_cnt;
10249 }
10250 } else if (node_gres_ptr->gres_bit_alloc) {
10251 int64_t gres_avail = node_gres_ptr->gres_cnt_avail;
10252
10253 i = bit_size(node_gres_ptr->gres_bit_alloc);
10254 if (plugin_id == mps_plugin_id)
10255 gres_avail = i;
10256 else if (i < gres_avail) {
10257 error("gres/%s: node %s gres bitmap size bad (%"PRIi64" < %"PRIi64")",
10258 gres_name, node_name,
10259 i, gres_avail);
10260 node_gres_ptr->gres_bit_alloc =
10261 bit_realloc(node_gres_ptr->gres_bit_alloc,
10262 gres_avail);
10263 }
10264
10265 job_gres_ptr->gres_bit_alloc[node_offset] =
10266 bit_alloc(gres_avail);
10267
10268 if (core_bitmap)
10269 alloc_core_bitmap = bit_alloc(bit_size(core_bitmap));
10270 /* Pass 1: Allocate GRES overlapping all allocated cores */
10271 for (i=0; i<gres_avail && gres_cnt>0; i++) {
10272 if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10273 continue;
10274 if (!_cores_on_gres(core_bitmap, alloc_core_bitmap,
10275 node_gres_ptr, i, job_gres_ptr))
10276 continue;
10277 bit_set(node_gres_ptr->gres_bit_alloc, i);
10278 bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10279 node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10280 gres_cnt -= gres_per_bit;
10281 }
10282 FREE_NULL_BITMAP(alloc_core_bitmap);
10283 /* Pass 2: Allocate GRES overlapping any allocated cores */
10284 for (i=0; i<gres_avail && gres_cnt>0; i++) {
10285 if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10286 continue;
10287 if (!_cores_on_gres(core_bitmap, NULL, node_gres_ptr, i,
10288 job_gres_ptr))
10289 continue;
10290 bit_set(node_gres_ptr->gres_bit_alloc, i);
10291 bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10292 node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10293 gres_cnt -= gres_per_bit;
10294 }
10295 if (gres_cnt) {
10296 verbose("gres/%s topology sub-optimal for job %u",
10297 gres_name, job_id);
10298 }
10299 /* Pass 3: Allocate any available GRES */
10300 for (i=0; i<gres_avail && gres_cnt>0; i++) {
10301 if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10302 continue;
10303 bit_set(node_gres_ptr->gres_bit_alloc, i);
10304 bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10305 node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10306 gres_cnt -= gres_per_bit;
10307 }
10308 } else {
10309 node_gres_ptr->gres_cnt_alloc += gres_cnt;
10310 }
10311
10312 if (job_gres_ptr->gres_bit_alloc[node_offset] &&
10313 node_gres_ptr->topo_gres_bitmap &&
10314 node_gres_ptr->topo_gres_cnt_alloc) {
10315 for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
10316 if (job_gres_ptr->type_name &&
10317 (!node_gres_ptr->topo_type_name[i] ||
10318 (job_gres_ptr->type_id !=
10319 node_gres_ptr->topo_type_id[i])))
10320 continue;
10321 if (use_busy_dev &&
10322 (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
10323 continue;
10324 sz1 = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10325 sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
10326
10327 if ((sz1 != sz2) && log_cnt_err) {
10328 if (_shared_gres(plugin_id))
10329 log_type = "File";
10330 else
10331 log_type = "Count";
10332 /* Avoid abort on bit_overlap below */
10333 error("gres/%s %s mismatch for node %s (%d != %d)",
10334 gres_name, log_type, node_name, sz1, sz2);
10335 log_cnt_err = false;
10336 }
10337 if (sz1 != sz2)
10338 continue; /* See error above */
10339 gres_cnt = bit_overlap(job_gres_ptr->
10340 gres_bit_alloc[node_offset],
10341 node_gres_ptr->
10342 topo_gres_bitmap[i]);
10343 gres_cnt *= gres_per_bit;
10344 node_gres_ptr->topo_gres_cnt_alloc[i] += gres_cnt;
10345 if ((node_gres_ptr->type_cnt == 0) ||
10346 (node_gres_ptr->topo_type_name == NULL) ||
10347 (node_gres_ptr->topo_type_name[i] == NULL))
10348 continue;
10349 for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10350 if (!node_gres_ptr->type_name[j] ||
10351 (node_gres_ptr->topo_type_id[i] !=
10352 node_gres_ptr->type_id[j]))
10353 continue;
10354 node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
10355 break;
10356 }
10357 }
10358 type_array_updated = true;
10359 } else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10360 int len; /* length of the gres bitmap on this node */
10361 len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10362 if (!node_gres_ptr->topo_gres_cnt_alloc) {
10363 node_gres_ptr->topo_gres_cnt_alloc =
10364 xcalloc(len, sizeof(uint64_t));
10365 } else {
10366 len = MIN(len, node_gres_ptr->gres_cnt_config);
10367 }
10368
10369 if ((node_gres_ptr->topo_cnt == 0) && shared_gres) {
10370 /*
10371 * Need to add node topo arrays for slurmctld restart
10372 * and job state recovery (with GRES counts per topo)
10373 */
10374 node_gres_ptr->topo_cnt =
10375 bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10376 node_gres_ptr->topo_core_bitmap =
10377 xcalloc(node_gres_ptr->topo_cnt,
10378 sizeof(bitstr_t *));
10379 node_gres_ptr->topo_gres_bitmap =
10380 xcalloc(node_gres_ptr->topo_cnt,
10381 sizeof(bitstr_t *));
10382 node_gres_ptr->topo_gres_cnt_alloc =
10383 xcalloc(node_gres_ptr->topo_cnt,
10384 sizeof(uint64_t));
10385 node_gres_ptr->topo_gres_cnt_avail =
10386 xcalloc(node_gres_ptr->topo_cnt,
10387 sizeof(uint64_t));
10388 node_gres_ptr->topo_type_id =
10389 xcalloc(node_gres_ptr->topo_cnt,
10390 sizeof(uint32_t));
10391 node_gres_ptr->topo_type_name =
10392 xcalloc(node_gres_ptr->topo_cnt,
10393 sizeof(char *));
10394 for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
10395 node_gres_ptr->topo_gres_bitmap[i] =
10396 bit_alloc(node_gres_ptr->topo_cnt);
10397 bit_set(node_gres_ptr->topo_gres_bitmap[i], i);
10398 }
10399 }
10400
10401 for (i = 0; i < len; i++) {
10402 gres_cnt = 0;
10403 if (!bit_test(job_gres_ptr->
10404 gres_bit_alloc[node_offset], i))
10405 continue;
10406 /*
10407 * NOTE: Immediately after slurmctld restart and before
10408 * the node's registration, the GRES type and topology
10409 * information will not be available and we will be
10410 * unable to update topo_gres_cnt_alloc or
10411 * type_cnt_alloc. This results in some incorrect
10412 * internal bookkeeping, but does not cause failures
10413 * in terms of allocating GRES to jobs.
10414 */
10415 for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
10416 if (use_busy_dev &&
10417 (node_gres_ptr->topo_gres_cnt_alloc[j] == 0))
10418 continue;
10419 if (node_gres_ptr->topo_gres_bitmap &&
10420 node_gres_ptr->topo_gres_bitmap[j] &&
10421 bit_test(node_gres_ptr->topo_gres_bitmap[j],
10422 i)) {
10423 node_gres_ptr->topo_gres_cnt_alloc[i] +=
10424 gres_per_bit;
10425 gres_cnt += gres_per_bit;
10426 }
10427 }
10428 if ((node_gres_ptr->type_cnt == 0) ||
10429 (node_gres_ptr->topo_type_name == NULL) ||
10430 (node_gres_ptr->topo_type_name[i] == NULL))
10431 continue;
10432 for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10433 if (!node_gres_ptr->type_name[j] ||
10434 (node_gres_ptr->topo_type_id[i] !=
10435 node_gres_ptr->type_id[j]))
10436 continue;
10437 node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
10438 break;
10439 }
10440 }
10441 type_array_updated = true;
10442 if (job_gres_ptr->type_name && job_gres_ptr->type_name[0]) {
10443 /*
10444 * We may not know how many GRES of this type will be
10445 * available on this node, but need to track how many
10446 * are allocated to this job from here to avoid
10447 * underflows when this job is deallocated
10448 */
10449 _add_gres_type(job_gres_ptr->type_name, node_gres_ptr,
10450 0);
10451 for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10452 if (job_gres_ptr->type_id !=
10453 node_gres_ptr->type_id[j])
10454 continue;
10455 node_gres_ptr->type_cnt_alloc[j] +=
10456 job_gres_ptr->gres_per_node;
10457 break;
10458 }
10459 }
10460 }
10461
10462 if (!type_array_updated && job_gres_ptr->type_name) {
10463 gres_cnt = job_gres_ptr->gres_per_node;
10464 for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10465 int64_t k;
10466 if (job_gres_ptr->type_id !=
10467 node_gres_ptr->type_id[j])
10468 continue;
10469 k = node_gres_ptr->type_cnt_avail[j] -
10470 node_gres_ptr->type_cnt_alloc[j];
10471 k = MIN(gres_cnt, k);
10472 node_gres_ptr->type_cnt_alloc[j] += k;
10473 gres_cnt -= k;
10474 if (gres_cnt == 0)
10475 break;
10476 }
10477 }
10478
10479 return SLURM_SUCCESS;
10480 }
10481
/*
 * Find (or build) the job GRES record matching job_search_key within
 * job_gres_list, then add this node's available GRES count to its
 * total_gres tally.
 * IN job_search_key - plugin_id/type_id pair identifying the GRES record
 * IN node_state_ptr - node's GRES state supplying the available counts
 * IN type_inx - index into node's type arrays, or -1 if GRES is untyped
 * IN context_inx - index into gres_context for the GRES name
 * IN/OUT job_gres_list - job's GRES list, record appended if not found
 */
static void _job_select_whole_node_internal(
	gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
	int type_inx, int context_inx, List job_gres_list)
{
	gres_state_t *job_gres_ptr = list_find_first(job_gres_list,
						     _gres_find_job_by_key,
						     job_search_key);
	gres_job_state_t *job_state_ptr;

	if (job_gres_ptr) {
		job_state_ptr = job_gres_ptr->gres_data;
	} else {
		/* First record for this GRES/type on the job: create it */
		job_state_ptr = xmalloc(sizeof(gres_job_state_t));
		job_state_ptr->gres_name =
			xstrdup(gres_context[context_inx].gres_name);
		job_state_ptr->type_id = job_search_key->type_id;
		if (type_inx != -1)
			job_state_ptr->type_name =
				xstrdup(node_state_ptr->type_name[type_inx]);

		job_gres_ptr = xmalloc(sizeof(gres_state_t));
		job_gres_ptr->plugin_id = job_search_key->plugin_id;
		job_gres_ptr->gres_data = job_state_ptr;
		list_append(job_gres_list, job_gres_ptr);
	}

	/*
	 * Add the total_gres here but no count, that will be done after
	 * allocation.
	 */
	if (node_state_ptr->no_consume)
		job_state_ptr->total_gres = NO_CONSUME_VAL64;
	else if (type_inx != -1)
		job_state_ptr->total_gres +=
			node_state_ptr->type_cnt_avail[type_inx];
	else
		job_state_ptr->total_gres += node_state_ptr->gres_cnt_avail;
}
10520
/*
 * Locate the job's GRES record matching job_search_key and allocate this
 * node's entire available GRES count to the job via _job_alloc().
 * IN job_search_key - plugin_id/type_id pair identifying the GRES record
 * IN node_state_ptr - node's GRES state supplying the available counts
 * IN/OUT job_gres_list - job's GRES list (record must already exist)
 * IN node_cnt/node_index/node_offset - allocation geometry for _job_alloc()
 * IN type_index - index into node's type arrays, or -1 if GRES is untyped
 * IN job_id/node_name - for logging
 * IN core_bitmap - cores allocated to this job on this node (or NULL)
 * IN user_id - job's user ID
 * RET SLURM_SUCCESS or error code
 */
static int _job_alloc_whole_node_internal(
	gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
	List job_gres_list, int node_cnt, int node_index, int node_offset,
	int type_index, uint32_t job_id, char *node_name,
	bitstr_t *core_bitmap, uint32_t user_id)
{
	gres_state_t *job_gres_ptr = list_find_first(job_gres_list,
						     _gres_find_job_by_key,
						     job_search_key);
	gres_job_state_t *job_state_ptr;

	if (!job_gres_ptr) {
		error("%s: This should never happen, we couldn't find the gres %u:%u",
		      __func__,
		      job_search_key->plugin_id,
		      job_search_key->type_id);
		return SLURM_ERROR;
	}

	job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;

	/*
	 * As the amount of gres on each node could
	 * differ. We need to set the gres_per_node
	 * correctly here to avoid heterogeneous node
	 * issues.
	 */
	job_state_ptr->gres_per_node = (type_index == -1) ?
		node_state_ptr->gres_cnt_avail :
		node_state_ptr->type_cnt_avail[type_index];

	return _job_alloc(job_state_ptr, node_state_ptr, node_cnt, node_index,
			  node_offset, job_state_ptr->gres_name, job_id,
			  node_name, core_bitmap, job_gres_ptr->plugin_id,
			  user_id);
}
10561
10562 /*
10563 * Select and allocate GRES to a job and update node and job GRES information
10564 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
10565 * IN node_gres_list - node's gres_list built by
10566 * gres_plugin_node_config_validate()
10567 * IN node_cnt - total number of nodes originally allocated to the job
10568 * IN node_index - zero-origin global node index
10569 * IN node_offset - zero-origin index in job allocation to the node of interest
10570 * IN job_id - job's ID (for logging)
10571 * IN node_name - name of the node (for logging)
10572 * IN core_bitmap - cores allocated to this job on this node (NULL if not
10573 * available)
10574 * IN user_id - job's user ID
10575 * RET SLURM_SUCCESS or error code
10576 */
gres_plugin_job_alloc(List job_gres_list,List node_gres_list,int node_cnt,int node_index,int node_offset,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t user_id)10577 extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list,
10578 int node_cnt, int node_index, int node_offset,
10579 uint32_t job_id, char *node_name,
10580 bitstr_t *core_bitmap, uint32_t user_id)
10581 {
10582 int i, rc, rc2;
10583 ListIterator job_gres_iter, node_gres_iter;
10584 gres_state_t *job_gres_ptr, *node_gres_ptr;
10585
10586 if (job_gres_list == NULL)
10587 return SLURM_SUCCESS;
10588 if (node_gres_list == NULL) {
10589 error("%s: job %u has gres specification while node %s has none",
10590 __func__, job_id, node_name);
10591 return SLURM_ERROR;
10592 }
10593
10594 rc = gres_plugin_init();
10595
10596 slurm_mutex_lock(&gres_context_lock);
10597 job_gres_iter = list_iterator_create(job_gres_list);
10598 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
10599 for (i = 0; i < gres_context_cnt; i++) {
10600 if (job_gres_ptr->plugin_id ==
10601 gres_context[i].plugin_id)
10602 break;
10603 }
10604 if (i >= gres_context_cnt) {
10605 error("%s: no plugin configured for data type %u for job %u and node %s",
10606 __func__, job_gres_ptr->plugin_id, job_id,
10607 node_name);
10608 /* A likely sign that GresPlugins has changed */
10609 continue;
10610 }
10611
10612 node_gres_iter = list_iterator_create(node_gres_list);
10613 while ((node_gres_ptr = (gres_state_t *)
10614 list_next(node_gres_iter))) {
10615 if (job_gres_ptr->plugin_id == node_gres_ptr->plugin_id)
10616 break;
10617 }
10618 list_iterator_destroy(node_gres_iter);
10619 if (node_gres_ptr == NULL) {
10620 error("%s: job %u allocated gres/%s on node %s lacking that gres",
10621 __func__, job_id, gres_context[i].gres_name,
10622 node_name);
10623 continue;
10624 }
10625
10626 rc2 = _job_alloc(job_gres_ptr->gres_data,
10627 node_gres_ptr->gres_data, node_cnt, node_index,
10628 node_offset, gres_context[i].gres_name,
10629 job_id, node_name, core_bitmap,
10630 job_gres_ptr->plugin_id, user_id);
10631 if (rc2 != SLURM_SUCCESS)
10632 rc = rc2;
10633 }
10634 list_iterator_destroy(job_gres_iter);
10635 slurm_mutex_unlock(&gres_context_lock);
10636
10637 return rc;
10638 }
10639
10640 /*
10641 * Fill in job_gres_list with the total amount of GRES on a node.
10642 * OUT job_gres_list - This list will be destroyed and remade with all GRES on
10643 * node.
10644 * IN node_gres_list - node's gres_list built by
10645 * gres_plugin_node_config_validate()
10646 * IN job_id - job's ID (for logging)
10647 * IN node_name - name of the node (for logging)
10648 * RET SLURM_SUCCESS or error code
10649 */
gres_plugin_job_select_whole_node(List * job_gres_list,List node_gres_list,uint32_t job_id,char * node_name)10650 extern int gres_plugin_job_select_whole_node(
10651 List *job_gres_list, List node_gres_list,
10652 uint32_t job_id, char *node_name)
10653 {
10654 int i;
10655 ListIterator node_gres_iter;
10656 gres_state_t *node_gres_ptr;
10657 gres_node_state_t *node_state_ptr;
10658
10659 if (job_gres_list == NULL)
10660 return SLURM_SUCCESS;
10661 if (node_gres_list == NULL) {
10662 error("%s: job %u has gres specification while node %s has none",
10663 __func__, job_id, node_name);
10664 return SLURM_ERROR;
10665 }
10666
10667 if (!*job_gres_list)
10668 *job_gres_list = list_create(_gres_job_list_delete);
10669
10670 if (gres_plugin_init() != SLURM_SUCCESS)
10671 return SLURM_ERROR;
10672
10673 slurm_mutex_lock(&gres_context_lock);
10674 node_gres_iter = list_iterator_create(node_gres_list);
10675 while ((node_gres_ptr = list_next(node_gres_iter))) {
10676 gres_key_t job_search_key;
10677 node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
10678
10679 /*
10680 * Don't check for no_consume here, we need them added here and
10681 * will filter them out in gres_plugin_job_alloc_whole_node()
10682 */
10683 if (!node_state_ptr->gres_cnt_config)
10684 continue;
10685
10686 for (i = 0; i < gres_context_cnt; i++) {
10687 if (node_gres_ptr->plugin_id ==
10688 gres_context[i].plugin_id)
10689 break;
10690 }
10691 if (i >= gres_context_cnt) {
10692 error("%s: no plugin configured for data type %u for job %u and node %s",
10693 __func__, node_gres_ptr->plugin_id, job_id,
10694 node_name);
10695 /* A likely sign that GresPlugins has changed */
10696 continue;
10697 }
10698
10699 job_search_key.plugin_id = node_gres_ptr->plugin_id;
10700
10701 if (!node_state_ptr->type_cnt) {
10702 job_search_key.type_id = 0;
10703 _job_select_whole_node_internal(
10704 &job_search_key, node_state_ptr,
10705 -1, i, *job_gres_list);
10706 } else {
10707 for (int j = 0; j < node_state_ptr->type_cnt; j++) {
10708 job_search_key.type_id = gres_plugin_build_id(
10709 node_state_ptr->type_name[j]);
10710 _job_select_whole_node_internal(
10711 &job_search_key, node_state_ptr,
10712 j, i, *job_gres_list);
10713 }
10714 }
10715 }
10716 list_iterator_destroy(node_gres_iter);
10717 slurm_mutex_unlock(&gres_context_lock);
10718
10719 return SLURM_SUCCESS;
10720 }
10721
10722 /*
10723 * Select and allocate all GRES on a node to a job and update node and job GRES
10724 * information
10725 * IN job_gres_list - job's gres_list built by gres_plugin_job_whole_node().
10726 * IN node_gres_list - node's gres_list built by
10727 * gres_plugin_node_config_validate()
10728 * IN node_cnt - total number of nodes originally allocated to the job
10729 * IN node_index - zero-origin global node index
10730 * IN node_offset - zero-origin index in job allocation to the node of interest
10731 * IN job_id - job's ID (for logging)
10732 * IN node_name - name of the node (for logging)
10733 * IN core_bitmap - cores allocated to this job on this node (NULL if not
10734 * available)
10735 * IN user_id - job's user ID
10736 * RET SLURM_SUCCESS or error code
10737 */
gres_plugin_job_alloc_whole_node(List job_gres_list,List node_gres_list,int node_cnt,int node_index,int node_offset,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t user_id)10738 extern int gres_plugin_job_alloc_whole_node(
10739 List job_gres_list, List node_gres_list,
10740 int node_cnt, int node_index, int node_offset,
10741 uint32_t job_id, char *node_name,
10742 bitstr_t *core_bitmap, uint32_t user_id)
10743 {
10744 int i, rc, rc2;
10745 ListIterator node_gres_iter;
10746 gres_state_t *node_gres_ptr;
10747 gres_node_state_t *node_state_ptr;
10748
10749 if (job_gres_list == NULL)
10750 return SLURM_SUCCESS;
10751 if (node_gres_list == NULL) {
10752 error("%s: job %u has gres specification while node %s has none",
10753 __func__, job_id, node_name);
10754 return SLURM_ERROR;
10755 }
10756
10757 rc = gres_plugin_init();
10758
10759 slurm_mutex_lock(&gres_context_lock);
10760 node_gres_iter = list_iterator_create(node_gres_list);
10761 while ((node_gres_ptr = list_next(node_gres_iter))) {
10762 gres_key_t job_search_key;
10763 node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
10764
10765 if (node_state_ptr->no_consume ||
10766 !node_state_ptr->gres_cnt_config)
10767 continue;
10768
10769 for (i = 0; i < gres_context_cnt; i++) {
10770 if (node_gres_ptr->plugin_id ==
10771 gres_context[i].plugin_id)
10772 break;
10773 }
10774 if (i >= gres_context_cnt) {
10775 error("%s: no plugin configured for data type %u for job %u and node %s",
10776 __func__, node_gres_ptr->plugin_id, job_id,
10777 node_name);
10778 /* A likely sign that GresPlugins has changed */
10779 continue;
10780 }
10781
10782 job_search_key.plugin_id = node_gres_ptr->plugin_id;
10783
10784 if (!node_state_ptr->type_cnt) {
10785 job_search_key.type_id = 0;
10786 rc2 = _job_alloc_whole_node_internal(
10787 &job_search_key, node_state_ptr,
10788 job_gres_list, node_cnt, node_index,
10789 node_offset, -1, job_id, node_name,
10790 core_bitmap, user_id);
10791 if (rc2 != SLURM_SUCCESS)
10792 rc = rc2;
10793 } else {
10794 for (int j = 0; j < node_state_ptr->type_cnt; j++) {
10795 job_search_key.type_id = gres_plugin_build_id(
10796 node_state_ptr->type_name[j]);
10797 rc2 = _job_alloc_whole_node_internal(
10798 &job_search_key, node_state_ptr,
10799 job_gres_list, node_cnt, node_index,
10800 node_offset, j, job_id, node_name,
10801 core_bitmap, user_id);
10802 if (rc2 != SLURM_SUCCESS)
10803 rc = rc2;
10804 }
10805 }
10806 }
10807 list_iterator_destroy(node_gres_iter);
10808 slurm_mutex_unlock(&gres_context_lock);
10809
10810 return rc;
10811 }
10812
/*
 * Deallocate GRES from one node of a job, reversing the bookkeeping done at
 * allocation time: clear bits in the node's allocation bitmap and decrement
 * the node's overall, per-topology, and per-type allocation counters.
 * Underflows are logged and clamped to zero rather than treated as fatal.
 * IN job_gres_data - job's gres_job_state_t for this GRES
 * IN node_gres_data - node's gres_node_state_t for this GRES
 * IN node_offset - zero-origin index of this node in the job's allocation
 * IN gres_name - GRES name (for logging)
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * IN old_job - job predates last slurmctld restart; topo/type counters may
 *              be stale, so suppress underflow error logging for them
 * IN plugin_id - GRES plugin ID (identifies shared GRES, e.g. MPS)
 * IN user_id/job_fini - not referenced in this function's body; kept for
 *                       interface symmetry with _job_alloc()
 * RET SLURM_SUCCESS or error code
 */
static int _job_dealloc(void *job_gres_data, void *node_gres_data,
			int node_offset, char *gres_name, uint32_t job_id,
			char *node_name, bool old_job, uint32_t plugin_id,
			uint32_t user_id, bool job_fini)
{
	int i, j, len, sz1, sz2;
	gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
	gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
	bool type_array_updated = false;
	uint64_t gres_cnt = 0, k;
	uint64_t gres_per_bit = 1;

	/*
	 * Validate data structures. Either job_gres_data->node_cnt and
	 * job_gres_data->gres_bit_alloc are both set or both zero/NULL.
	 */
	xassert(node_offset >= 0);
	xassert(job_gres_ptr);
	xassert(node_gres_ptr);

	/* No-consume GRES was never counted as allocated on the node */
	if (node_gres_ptr->no_consume)
		return SLURM_SUCCESS;

	if (job_gres_ptr->node_cnt <= node_offset) {
		error("gres/%s: job %u dealloc of node %s bad node_offset %d "
		      "count is %u", gres_name, job_id, node_name, node_offset,
		      job_gres_ptr->node_cnt);
		return SLURM_ERROR;
	}

	/* For shared GRES each allocation bit represents gres_per_node GRES */
	if (_shared_gres(plugin_id))
		gres_per_bit = job_gres_ptr->gres_per_node;

	xfree(node_gres_ptr->gres_used);	/* Clear cache */
	if (node_gres_ptr->gres_bit_alloc && job_gres_ptr->gres_bit_alloc &&
	    job_gres_ptr->gres_bit_alloc[node_offset]) {
		/*
		 * Bitmap-based deallocation: clear this job's bits from the
		 * node bitmap, decrementing the node count per bit cleared.
		 */
		len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
		i = bit_size(node_gres_ptr->gres_bit_alloc);
		if (i != len) {
			error("gres/%s: job %u and node %s bitmap sizes differ "
			      "(%d != %d)", gres_name, job_id, node_name, len,
			      i);
			len = MIN(len, i);
			/* proceed with request, make best effort */
		}
		for (i = 0; i < len; i++) {
			if (!bit_test(job_gres_ptr->gres_bit_alloc[node_offset],
				      i)) {
				continue;
			}
			bit_clear(node_gres_ptr->gres_bit_alloc, i);

			/*
			 * NOTE: Do not clear bit from
			 * job_gres_ptr->gres_bit_alloc[node_offset]
			 * since this may only be an emulated deallocate
			 */
			if (node_gres_ptr->gres_cnt_alloc >= gres_per_bit) {
				node_gres_ptr->gres_cnt_alloc -= gres_per_bit;
			} else {
				error("gres/%s: job %u dealloc node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
				      gres_name, job_id, node_name,
				      node_gres_ptr->gres_cnt_alloc,
				      gres_per_bit);
				node_gres_ptr->gres_cnt_alloc = 0;
			}
		}
	} else if (job_gres_ptr->gres_cnt_node_alloc) {
		/* Count-based: per-node allocation count was recorded */
		gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
	} else {
		/* Count-based fallback: uniform count on every node */
		gres_cnt = job_gres_ptr->gres_per_node;
	}
	/* gres_cnt remains 0 if the bitmap branch above was taken */
	if (gres_cnt && (node_gres_ptr->gres_cnt_alloc >= gres_cnt))
		node_gres_ptr->gres_cnt_alloc -= gres_cnt;
	else if (gres_cnt) {
		error("gres/%s: job %u node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
		      gres_name, job_id, node_name,
		      node_gres_ptr->gres_cnt_alloc, gres_cnt);
		node_gres_ptr->gres_cnt_alloc = 0;
	}

	if (job_gres_ptr->gres_bit_alloc &&
	    job_gres_ptr->gres_bit_alloc[node_offset] &&
	    node_gres_ptr->topo_gres_bitmap &&
	    node_gres_ptr->topo_gres_cnt_alloc) {
		/*
		 * Topology info available: decrement each topo record's
		 * allocation count by its overlap with the job's bitmap,
		 * then the matching per-type count.
		 */
		for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
			sz1 = bit_size(
				job_gres_ptr->gres_bit_alloc[node_offset]);
			sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
			if (sz1 != sz2)
				continue;
			gres_cnt = (uint64_t)bit_overlap(
				job_gres_ptr->gres_bit_alloc[node_offset],
				node_gres_ptr->topo_gres_bitmap[i]);
			gres_cnt *= gres_per_bit;
			if (node_gres_ptr->topo_gres_cnt_alloc[i] >= gres_cnt) {
				node_gres_ptr->topo_gres_cnt_alloc[i] -=
					gres_cnt;
			} else if (old_job) {
				/* Stale counters after restart are expected */
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			} else {
				error("gres/%s: job %u dealloc node %s topo gres count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name,
				      node_gres_ptr->topo_gres_cnt_alloc[i],
				      gres_cnt);
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			}
			if ((node_gres_ptr->type_cnt == 0) ||
			    (node_gres_ptr->topo_type_name == NULL) ||
			    (node_gres_ptr->topo_type_name[i] == NULL))
				continue;
			/* Decrement the type counter matching this topo */
			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
				if (!node_gres_ptr->type_name[j] ||
				    (node_gres_ptr->topo_type_id[i] !=
				     node_gres_ptr->type_id[j]))
					continue;
				if (node_gres_ptr->type_cnt_alloc[j] >=
				    gres_cnt) {
					node_gres_ptr->type_cnt_alloc[j] -=
						gres_cnt;
				} else if (old_job) {
					node_gres_ptr->type_cnt_alloc[j] = 0;
				} else {
					error("gres/%s: job %u dealloc node %s type %s gres count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      node_gres_ptr->type_name[j],
					      node_gres_ptr->type_cnt_alloc[j],
					      gres_cnt);
					node_gres_ptr->type_cnt_alloc[j] = 0;
				}
			}
		}
		type_array_updated = true;
	} else if (job_gres_ptr->gres_bit_alloc &&
		   job_gres_ptr->gres_bit_alloc[node_offset] &&
		   node_gres_ptr->topo_gres_cnt_alloc) {
		/*
		 * Topo counts exist but no topo bitmaps: treat each job
		 * allocation bit i as mapping directly to topo index i.
		 */
		/* Avoid crash if configuration inconsistent */
		len = MIN(node_gres_ptr->gres_cnt_config,
			  bit_size(job_gres_ptr->
				   gres_bit_alloc[node_offset]));
		for (i = 0; i < len; i++) {
			if (!bit_test(job_gres_ptr->
				      gres_bit_alloc[node_offset], i) ||
			    !node_gres_ptr->topo_gres_cnt_alloc[i])
				continue;
			if (node_gres_ptr->topo_gres_cnt_alloc[i] >=
			    gres_per_bit) {
				node_gres_ptr->topo_gres_cnt_alloc[i] -=
					gres_per_bit;
			} else {
				error("gres/%s: job %u dealloc node %s "
				      "topo_gres_cnt_alloc[%d] count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name, i,
				      node_gres_ptr->topo_gres_cnt_alloc[i],
				      gres_per_bit);
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			}
			if ((node_gres_ptr->type_cnt == 0) ||
			    (node_gres_ptr->topo_type_name == NULL) ||
			    (node_gres_ptr->topo_type_name[i] == NULL))
				continue;
			/* Decrement the type counter matching this topo */
			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
				if (!node_gres_ptr->type_name[j] ||
				    (node_gres_ptr->topo_type_id[i] !=
				     node_gres_ptr->type_id[j]))
					continue;
				if (node_gres_ptr->type_cnt_alloc[j] >=
				    gres_per_bit) {
					node_gres_ptr->type_cnt_alloc[j] -=
						gres_per_bit;
				} else {
					error("gres/%s: job %u dealloc node %s "
					      "type %s type_cnt_alloc count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      node_gres_ptr->type_name[j],
					      node_gres_ptr->type_cnt_alloc[j],
					      gres_per_bit);
					node_gres_ptr->type_cnt_alloc[j] = 0;
				}
			}
		}
		type_array_updated = true;
	}

	if (!type_array_updated && job_gres_ptr->type_name) {
		/*
		 * Type counters were not handled above; decrement the type
		 * matching the job's request, clamping at zero.
		 */
		gres_cnt = job_gres_ptr->gres_per_node;
		for (j = 0; j < node_gres_ptr->type_cnt; j++) {
			if (job_gres_ptr->type_id !=
			    node_gres_ptr->type_id[j])
				continue;
			k = MIN(gres_cnt, node_gres_ptr->type_cnt_alloc[j]);
			node_gres_ptr->type_cnt_alloc[j] -= k;
			gres_cnt -= k;
			if (gres_cnt == 0)
				break;
		}
	}

	return SLURM_SUCCESS;
}
11017
11018 /*
11019 * Deallocate resource from a job and update node and job gres information
11020 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
11021 * IN node_gres_list - node's gres_list built by
11022 * gres_plugin_node_config_validate()
11023 * IN node_offset - zero-origin index to the node of interest
11024 * IN job_id - job's ID (for logging)
11025 * IN node_name - name of the node (for logging)
11026 * IN old_job - true if job started before last slurmctld reboot.
11027 * Immediately after slurmctld restart and before the node's
11028 * registration, the GRES type and topology. This results in
11029 * some incorrect internal bookkeeping, but does not cause
11030 * failures in terms of allocating GRES to jobs.
11031 * IN user_id - job's user ID
11032 * IN: job_fini - job fully terminating on this node (not just a test)
11033 * RET SLURM_SUCCESS or error code
11034 */
gres_plugin_job_dealloc(List job_gres_list,List node_gres_list,int node_offset,uint32_t job_id,char * node_name,bool old_job,uint32_t user_id,bool job_fini)11035 extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
11036 int node_offset, uint32_t job_id,
11037 char *node_name, bool old_job,
11038 uint32_t user_id, bool job_fini)
11039 {
11040 int i, rc, rc2;
11041 ListIterator job_gres_iter;
11042 gres_state_t *job_gres_ptr, *node_gres_ptr;
11043 char *gres_name = NULL;
11044
11045 if (job_gres_list == NULL)
11046 return SLURM_SUCCESS;
11047 if (node_gres_list == NULL) {
11048 error("%s: job %u has gres specification while node %s has none",
11049 __func__, job_id, node_name);
11050 return SLURM_ERROR;
11051 }
11052
11053 rc = gres_plugin_init();
11054
11055 slurm_mutex_lock(&gres_context_lock);
11056 job_gres_iter = list_iterator_create(job_gres_list);
11057 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
11058 for (i = 0; i < gres_context_cnt; i++) {
11059 if (job_gres_ptr->plugin_id ==
11060 gres_context[i].plugin_id)
11061 break;
11062 }
11063 if (i >= gres_context_cnt) {
11064 error("%s: no plugin configured for data type %u for job %u and node %s",
11065 __func__, job_gres_ptr->plugin_id, job_id,
11066 node_name);
11067 /* A likely sign that GresPlugins has changed */
11068 gres_name = "UNKNOWN";
11069 } else
11070 gres_name = gres_context[i].gres_name;
11071
11072 node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
11073 &job_gres_ptr->plugin_id);
11074
11075 if (node_gres_ptr == NULL) {
11076 error("%s: node %s lacks gres/%s for job %u", __func__,
11077 node_name, gres_name , job_id);
11078 continue;
11079 }
11080
11081 rc2 = _job_dealloc(job_gres_ptr->gres_data,
11082 node_gres_ptr->gres_data, node_offset,
11083 gres_name, job_id, node_name, old_job,
11084 job_gres_ptr->plugin_id, user_id, job_fini);
11085 if (rc2 != SLURM_SUCCESS)
11086 rc = rc2;
11087 }
11088 list_iterator_destroy(job_gres_iter);
11089 slurm_mutex_unlock(&gres_context_lock);
11090
11091 return rc;
11092 }
11093
11094 /*
11095 * Merge one job's gres allocation into another job's gres allocation.
11096 * IN from_job_gres_list - List of gres records for the job being merged
11097 * into another job
11098 * IN from_job_node_bitmap - bitmap of nodes for the job being merged into
11099 * another job
11100 * IN/OUT to_job_gres_list - List of gres records for the job being merged
11101 * into job
11102 * IN to_job_node_bitmap - bitmap of nodes for the job being merged into
11103 */
gres_plugin_job_merge(List from_job_gres_list,bitstr_t * from_job_node_bitmap,List to_job_gres_list,bitstr_t * to_job_node_bitmap)11104 extern void gres_plugin_job_merge(List from_job_gres_list,
11105 bitstr_t *from_job_node_bitmap,
11106 List to_job_gres_list,
11107 bitstr_t *to_job_node_bitmap)
11108 {
11109 static int select_hetero = -1;
11110 ListIterator gres_iter;
11111 gres_state_t *gres_ptr, *gres_ptr2;
11112 gres_job_state_t *gres_job_ptr, *gres_job_ptr2;
11113 int new_node_cnt;
11114 int i_first, i_last, i;
11115 int from_inx, to_inx, new_inx;
11116 bitstr_t **new_gres_bit_alloc, **new_gres_bit_step_alloc;
11117 uint64_t *new_gres_cnt_step_alloc, *new_gres_cnt_node_alloc;
11118
11119 if (select_hetero == -1) {
11120 /*
11121 * Determine if the select plugin supports heterogeneous
11122 * GRES allocations (count differ by node): 1=yes, 0=no
11123 */
11124 char *select_type = slurm_get_select_type();
11125 if (select_type &&
11126 (strstr(select_type, "cons_tres") ||
11127 (strstr(select_type, "cray_aries") &&
11128 (slurm_get_select_type_param() & CR_OTHER_CONS_TRES)))) {
11129 select_hetero = 1;
11130 } else
11131 select_hetero = 0;
11132 xfree(select_type);
11133 }
11134
11135 (void) gres_plugin_init();
11136 new_node_cnt = bit_set_count(from_job_node_bitmap) +
11137 bit_set_count(to_job_node_bitmap) -
11138 bit_overlap(from_job_node_bitmap, to_job_node_bitmap);
11139 i_first = MIN(bit_ffs(from_job_node_bitmap),
11140 bit_ffs(to_job_node_bitmap));
11141 i_first = MAX(i_first, 0);
11142 i_last = MAX(bit_fls(from_job_node_bitmap),
11143 bit_fls(to_job_node_bitmap));
11144 if (i_last == -1) {
11145 error("%s: node_bitmaps are empty", __func__);
11146 return;
11147 }
11148
11149 slurm_mutex_lock(&gres_context_lock);
11150
11151 /* Step one - Expand the gres data structures in "to" job */
11152 if (!to_job_gres_list)
11153 goto step2;
11154 gres_iter = list_iterator_create(to_job_gres_list);
11155 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11156 gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
11157 new_gres_bit_alloc = xcalloc(new_node_cnt, sizeof(bitstr_t *));
11158 new_gres_cnt_node_alloc = xcalloc(new_node_cnt,
11159 sizeof(uint64_t));
11160 new_gres_bit_step_alloc = xcalloc(new_node_cnt,
11161 sizeof(bitstr_t *));
11162 new_gres_cnt_step_alloc = xcalloc(new_node_cnt,
11163 sizeof(uint64_t));
11164
11165 from_inx = to_inx = new_inx = -1;
11166 for (i = i_first; i <= i_last; i++) {
11167 bool from_match = false, to_match = false;
11168 if (bit_test(to_job_node_bitmap, i)) {
11169 to_match = true;
11170 to_inx++;
11171 }
11172 if (bit_test(from_job_node_bitmap, i)) {
11173 from_match = true;
11174 from_inx++;
11175 }
11176 if (from_match || to_match)
11177 new_inx++;
11178 if (to_match) {
11179 if (gres_job_ptr->gres_bit_alloc) {
11180 new_gres_bit_alloc[new_inx] =
11181 gres_job_ptr->
11182 gres_bit_alloc[to_inx];
11183 }
11184 if (gres_job_ptr->gres_cnt_node_alloc) {
11185 new_gres_cnt_node_alloc[new_inx] =
11186 gres_job_ptr->
11187 gres_cnt_node_alloc[to_inx];
11188 }
11189 if (gres_job_ptr->gres_bit_step_alloc) {
11190 new_gres_bit_step_alloc[new_inx] =
11191 gres_job_ptr->
11192 gres_bit_step_alloc[to_inx];
11193 }
11194 if (gres_job_ptr->gres_cnt_step_alloc) {
11195 new_gres_cnt_step_alloc[new_inx] =
11196 gres_job_ptr->
11197 gres_cnt_step_alloc[to_inx];
11198 }
11199 }
11200 }
11201 gres_job_ptr->node_cnt = new_node_cnt;
11202 xfree(gres_job_ptr->gres_bit_alloc);
11203 gres_job_ptr->gres_bit_alloc = new_gres_bit_alloc;
11204 xfree(gres_job_ptr->gres_cnt_node_alloc);
11205 gres_job_ptr->gres_cnt_node_alloc = new_gres_cnt_node_alloc;
11206 xfree(gres_job_ptr->gres_bit_step_alloc);
11207 gres_job_ptr->gres_bit_step_alloc = new_gres_bit_step_alloc;
11208 xfree(gres_job_ptr->gres_cnt_step_alloc);
11209 gres_job_ptr->gres_cnt_step_alloc = new_gres_cnt_step_alloc;
11210 }
11211 list_iterator_destroy(gres_iter);
11212
11213 /*
11214 * Step two - Merge the gres information from the "from" job into the
11215 * existing gres information for the "to" job
11216 */
11217 step2: if (!from_job_gres_list)
11218 goto step3;
11219 if (!to_job_gres_list) {
11220 to_job_gres_list = list_create(_gres_job_list_delete);
11221 }
11222 gres_iter = list_iterator_create(from_job_gres_list);
11223 while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11224 gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
11225 gres_ptr2 = list_find_first(to_job_gres_list, _gres_find_id,
11226 &gres_ptr->plugin_id);
11227 if (gres_ptr2) {
11228 gres_job_ptr2 = gres_ptr2->gres_data;
11229 } else {
11230 gres_ptr2 = xmalloc(sizeof(gres_state_t));
11231 gres_job_ptr2 = xmalloc(sizeof(gres_job_state_t));
11232 gres_ptr2->plugin_id = gres_ptr->plugin_id;
11233 gres_ptr2->gres_data = gres_job_ptr2;
11234 gres_job_ptr2->gres_name =
11235 xstrdup(gres_job_ptr->gres_name);
11236 gres_job_ptr2->cpus_per_gres =
11237 gres_job_ptr->cpus_per_gres;
11238 gres_job_ptr2->gres_per_job =
11239 gres_job_ptr->gres_per_job;
11240 gres_job_ptr2->gres_per_job =
11241 gres_job_ptr->gres_per_job;
11242 gres_job_ptr2->gres_per_socket =
11243 gres_job_ptr->gres_per_socket;
11244 gres_job_ptr2->gres_per_task =
11245 gres_job_ptr->gres_per_task;
11246 gres_job_ptr2->mem_per_gres =
11247 gres_job_ptr->mem_per_gres;
11248 gres_job_ptr2->node_cnt = new_node_cnt;
11249 gres_job_ptr2->gres_bit_alloc =
11250 xcalloc(new_node_cnt, sizeof(bitstr_t *));
11251 gres_job_ptr2->gres_cnt_node_alloc =
11252 xcalloc(new_node_cnt, sizeof(uint64_t));
11253 gres_job_ptr2->gres_bit_step_alloc =
11254 xcalloc(new_node_cnt, sizeof(bitstr_t *));
11255 gres_job_ptr2->gres_cnt_step_alloc =
11256 xcalloc(new_node_cnt, sizeof(uint64_t));
11257 list_append(to_job_gres_list, gres_ptr2);
11258 }
11259 from_inx = to_inx = new_inx = -1;
11260 for (i = i_first; i <= i_last; i++) {
11261 bool from_match = false, to_match = false;
11262 if (bit_test(to_job_node_bitmap, i)) {
11263 to_match = true;
11264 to_inx++;
11265 }
11266 if (bit_test(from_job_node_bitmap, i)) {
11267 from_match = true;
11268 from_inx++;
11269 }
11270 if (from_match || to_match)
11271 new_inx++;
11272 if (from_match) {
11273 if (!gres_job_ptr->gres_bit_alloc) {
11274 ;
11275 } else if (select_hetero &&
11276 gres_job_ptr2->
11277 gres_bit_alloc[new_inx] &&
11278 gres_job_ptr->gres_bit_alloc &&
11279 gres_job_ptr->
11280 gres_bit_alloc[new_inx]) {
11281 /* Merge job's GRES bitmaps */
11282 bit_or(gres_job_ptr2->
11283 gres_bit_alloc[new_inx],
11284 gres_job_ptr->
11285 gres_bit_alloc[from_inx]);
11286 } else if (gres_job_ptr2->
11287 gres_bit_alloc[new_inx]) {
11288 /* Keep original job's GRES bitmap */
11289 } else {
11290 gres_job_ptr2->gres_bit_alloc[new_inx] =
11291 gres_job_ptr->
11292 gres_bit_alloc[from_inx];
11293 gres_job_ptr->
11294 gres_bit_alloc
11295 [from_inx] = NULL;
11296 }
11297 if (!gres_job_ptr->gres_bit_alloc) {
11298 ;
11299 } else if (select_hetero &&
11300 gres_job_ptr2->
11301 gres_cnt_node_alloc[new_inx] &&
11302 gres_job_ptr->gres_cnt_node_alloc &&
11303 gres_job_ptr->
11304 gres_cnt_node_alloc[new_inx]) {
11305 gres_job_ptr2->
11306 gres_cnt_node_alloc[new_inx] +=
11307 gres_job_ptr->
11308 gres_cnt_node_alloc[from_inx];
11309 } else if (gres_job_ptr2->
11310 gres_cnt_node_alloc[new_inx]) {
11311 /* Keep original job's GRES bitmap */
11312 } else {
11313 gres_job_ptr2->
11314 gres_cnt_node_alloc[new_inx] =
11315 gres_job_ptr->
11316 gres_cnt_node_alloc[from_inx];
11317 gres_job_ptr->
11318 gres_cnt_node_alloc[from_inx]=0;
11319 }
11320 if (gres_job_ptr->gres_cnt_step_alloc &&
11321 gres_job_ptr->
11322 gres_cnt_step_alloc[from_inx]) {
11323 error("Attempt to merge gres, from "
11324 "job has active steps");
11325 }
11326 }
11327 }
11328 }
11329 list_iterator_destroy(gres_iter);
11330
11331 step3: slurm_mutex_unlock(&gres_context_lock);
11332 return;
11333 }
11334
11335 /*
11336 * Set environment variables as required for a batch job
11337 * IN/OUT job_env_ptr - environment variable array
11338 * IN gres_list - generated by gres_plugin_job_alloc()
11339 * IN node_inx - zero origin node index
11340 */
extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list,
				    int node_inx)
{
	int ctx_inx;
	ListIterator iter;
	gres_state_t *gres_state;
	bool matched;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		if (gres_context[ctx_inx].ops.job_set_env == NULL)
			continue;	/* Plugin has no job_set_env hook */
		matched = false;
		if (job_gres_list) {
			iter = list_iterator_create(job_gres_list);
			while ((gres_state = (gres_state_t *)
					     list_next(iter))) {
				if (gres_state->plugin_id !=
				    gres_context[ctx_inx].plugin_id)
					continue;
				(*(gres_context[ctx_inx].ops.job_set_env))
					(job_env_ptr, gres_state->gres_data,
					 node_inx);
				matched = true;
			}
			list_iterator_destroy(iter);
		}
		/*
		 * Invoke job_set_env even when this GRES was not requested by
		 * the job. Some plugins use this to set an env variable
		 * indicating the GRES is not available.
		 */
		if (!matched) {
			(*(gres_context[ctx_inx].ops.job_set_env))
				(job_env_ptr, NULL, node_inx);
		}
	}
	slurm_mutex_unlock(&gres_context_lock);
}
11383
11384 /*
11385 * Set job default parameters in a given element of a list
11386 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
11387 * IN gres_name - name of gres, apply defaults to all elements (e.g. updates to
11388 * gres_name="gpu" would apply to "gpu:tesla", "gpu:volta", etc.)
11389 * IN cpu_per_gpu - value to set as default
11390 * IN mem_per_gpu - value to set as default
11391 */
extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name,
				     uint64_t cpu_per_gpu,
				     uint64_t mem_per_gpu)
{
	uint32_t target_id;
	ListIterator iter;
	gres_state_t *gres_state;
	gres_job_state_t *job_data;

	if (!job_gres_list)
		return;

	/*
	 * The plugin ID is built from the base GRES name, so these defaults
	 * apply to every typed variant of that GRES (e.g. "gpu:tesla").
	 */
	target_id = gres_plugin_build_id(gres_name);
	iter = list_iterator_create(job_gres_list);
	while ((gres_state = (gres_state_t *) list_next(iter))) {
		if (gres_state->plugin_id != target_id)
			continue;
		job_data = (gres_job_state_t *) gres_state->gres_data;
		if (!job_data)
			continue;
		job_data->def_cpus_per_gres = cpu_per_gpu;
		job_data->def_mem_per_gres = mem_per_gpu;
	}
	list_iterator_destroy(iter);
}
11417
11418 /*
11419 * Translate GRES flag to string.
11420 * NOT reentrant
11421 */
static char *_gres_flags_str(uint16_t flags)
{
	/* Returns a static string; do not free, NOT reentrant */
	return (flags & GRES_NO_CONSUME) ? "no_consume" : "";
}
11428
/*
 * Log one plugin's job GRES state record at "info" level (debug aid).
 * IN gres_data - a gres_job_state_t record for one GRES plugin
 * IN job_id    - job the record belongs to
 * IN plugin_id - numeric ID of the GRES plugin, logged for identification
 */
static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id)
{
	gres_job_state_t *gres_ptr;
	char *sparse_msg = "", tmp_str[128];
	int i;

	xassert(gres_data);
	gres_ptr = (gres_job_state_t *) gres_data;
	info("gres:%s(%u) type:%s(%u) job:%u flags:%s state",
	     gres_ptr->gres_name, plugin_id, gres_ptr->type_name,
	     gres_ptr->type_id, job_id, _gres_flags_str(gres_ptr->flags));
	/* Explicit per-job value takes precedence over the default */
	if (gres_ptr->cpus_per_gres)
		info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres);
	else if (gres_ptr->def_cpus_per_gres)
		info(" def_cpus_per_gres:%u", gres_ptr->def_cpus_per_gres);
	if (gres_ptr->gres_per_job)
		info(" gres_per_job:%"PRIu64, gres_ptr->gres_per_job);
	if (gres_ptr->gres_per_node) {
		info(" gres_per_node:%"PRIu64" node_cnt:%u",
		     gres_ptr->gres_per_node, gres_ptr->node_cnt);
	}
	if (gres_ptr->gres_per_socket)
		info(" gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
	if (gres_ptr->gres_per_task)
		info(" gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
	if (gres_ptr->mem_per_gres)
		info(" mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
	else if (gres_ptr->def_mem_per_gres)
		info(" def_mem_per_gres:%"PRIu64, gres_ptr->def_mem_per_gres);

	/* Per-node arrays below are only meaningful once nodes are counted */
	if (gres_ptr->node_cnt == 0)
		return;
	if (gres_ptr->gres_bit_alloc == NULL)
		info(" gres_bit_alloc:NULL");
	if (gres_ptr->gres_cnt_node_alloc == NULL)
		info(" gres_cnt_node_alloc:NULL");
	if (gres_ptr->gres_bit_step_alloc == NULL)
		info(" gres_bit_step_alloc:NULL");
	if (gres_ptr->gres_cnt_step_alloc == NULL)
		info(" gres_cnt_step_alloc:NULL");
	if (gres_ptr->gres_bit_select == NULL)
		info(" gres_bit_select:NULL");
	if (gres_ptr->gres_cnt_node_select == NULL)
		info(" gres_cnt_node_select:NULL");

	/* Dump the per-node allocation state (job and step usage) */
	for (i = 0; i < gres_ptr->node_cnt; i++) {
		if (gres_ptr->gres_cnt_node_alloc &&
		    gres_ptr->gres_cnt_node_alloc[i]) {
			info(" gres_cnt_node_alloc[%d]:%"PRIu64,
			     i, gres_ptr->gres_cnt_node_alloc[i]);
		} else if (gres_ptr->gres_cnt_node_alloc)
			info(" gres_cnt_node_alloc[%d]:NULL", i);

		if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_ptr->gres_bit_alloc[i]);
			info(" gres_bit_alloc[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_ptr->gres_bit_alloc[i]));
		} else if (gres_ptr->gres_bit_alloc)
			info(" gres_bit_alloc[%d]:NULL", i);

		if (gres_ptr->gres_bit_step_alloc &&
		    gres_ptr->gres_bit_step_alloc[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_ptr->gres_bit_step_alloc[i]);
			info(" gres_bit_step_alloc[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_ptr->gres_bit_step_alloc[i]));
		} else if (gres_ptr->gres_bit_step_alloc)
			info(" gres_bit_step_alloc[%d]:NULL", i);

		if (gres_ptr->gres_cnt_step_alloc) {
			info(" gres_cnt_step_alloc[%d]:%"PRIu64"", i,
			     gres_ptr->gres_cnt_step_alloc[i]);
		}
	}

	/*
	 * These arrays are only used for resource selection and may include
	 * data for many nodes not used in the resources eventually allocated
	 * to this job.
	 */
	if (gres_ptr->total_node_cnt)
		sparse_msg = " (sparsely populated for resource selection)";
	info(" total_node_cnt:%u%s", gres_ptr->total_node_cnt, sparse_msg);
	for (i = 0; i < gres_ptr->total_node_cnt; i++) {
		if (gres_ptr->gres_cnt_node_select &&
		    gres_ptr->gres_cnt_node_select[i]) {
			info(" gres_cnt_node_select[%d]:%"PRIu64,
			     i, gres_ptr->gres_cnt_node_select[i]);
		}
		if (gres_ptr->gres_bit_select &&
		    gres_ptr->gres_bit_select[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_ptr->gres_bit_select[i]);
			info(" gres_bit_select[%d]:%s of %d", i, tmp_str,
			     (int) bit_size(gres_ptr->gres_bit_select[i]));
		}
	}
}
11528
11529 /*
11530 * Extract from the job record's gres_list the count of allocated resources of
11531 * the named gres type.
11532 * IN job_gres_list - job record's gres_list.
11533 * IN gres_name_type - the name of the gres type to retrieve the associated
11534 * value from.
11535 * RET The value associated with the gres type or NO_VAL if not found.
11536 */
gres_plugin_get_job_value_by_type(List job_gres_list,char * gres_name_type)11537 extern uint64_t gres_plugin_get_job_value_by_type(List job_gres_list,
11538 char *gres_name_type)
11539 {
11540 uint64_t gres_val;
11541 uint32_t gres_name_type_id;
11542 ListIterator job_gres_iter;
11543 gres_state_t *job_gres_ptr;
11544
11545 if (job_gres_list == NULL)
11546 return NO_VAL64;
11547
11548 slurm_mutex_lock(&gres_context_lock);
11549 gres_name_type_id = gres_plugin_build_id(gres_name_type);
11550 gres_val = NO_VAL64;
11551
11552 job_gres_iter = list_iterator_create(job_gres_list);
11553 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
11554 if (job_gres_ptr->plugin_id == gres_name_type_id) {
11555 gres_val = ((gres_job_state_t *)
11556 (job_gres_ptr->gres_data))->gres_per_node;
11557 break;
11558 }
11559 }
11560 list_iterator_destroy(job_gres_iter);
11561
11562 slurm_mutex_unlock(&gres_context_lock);
11563
11564 return gres_val;
11565 }
11566
11567 /*
11568 * Log a job's current gres state
11569 * IN gres_list - generated by gres_plugin_job_state_validate()
11570 * IN job_id - job's ID
11571 */
extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id)
{
	ListIterator iter;
	gres_state_t *gres_state;

	/* Nothing to log unless GRES debugging is enabled */
	if (!gres_debug || (gres_list == NULL))
		return;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(gres_list);
	while ((gres_state = (gres_state_t *) list_next(iter)))
		_job_state_log(gres_state->gres_data, job_id,
			       gres_state->plugin_id);
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);
}
11591
_find_device(void * x,void * key)11592 static int _find_device(void *x, void *key)
11593 {
11594 gres_device_t *device_x = (gres_device_t *)x;
11595 gres_device_t *device_key = (gres_device_t *)key;
11596
11597 if (!xstrcmp(device_x->path, device_key->path))
11598 return 1;
11599
11600 return 0;
11601 }
11602
/*
 * Build a list of every GRES device file known to the loaded plugins, with
 * each device's "alloc" flag set to 1 only if the given job/step GRES list
 * shows it allocated on this (single) node.
 * IN gres_list - job or step GRES list (NULL means deny-all list returned)
 * IN is_job - true if gres_list holds gres_job_state_t records,
 *             false for gres_step_state_t records
 * RET list of gres_device_t (not owned by caller's plugins list entries are
 *     shared; list itself created with no destructor), or NULL if no plugin
 *     reports any devices
 */
extern List gres_plugin_get_allocated_devices(List gres_list, bool is_job)
{
	int i, j;
	ListIterator gres_itr, dev_itr;
	gres_state_t *gres_ptr;
	bitstr_t **local_bit_alloc = NULL;
	uint32_t node_cnt;
	gres_device_t *gres_device;
	List gres_devices;
	List device_list = NULL;

	(void) gres_plugin_init();

	/*
	 * Create a unique device list of all possible GRES device files.
	 * Initialize each device to deny.
	 */
	for (j = 0; j < gres_context_cnt; j++) {
		if (!gres_context[j].ops.get_devices)
			continue;
		gres_devices = (*(gres_context[j].ops.get_devices))();
		if (!gres_devices || !list_count(gres_devices))
			continue;
		dev_itr = list_iterator_create(gres_devices);
		while ((gres_device = list_next(dev_itr))) {
			/* Created lazily so NULL is returned when no devices */
			if (!device_list)
				device_list = list_create(NULL);
			gres_device->alloc = 0;
			/*
			 * Keep the list unique by not adding duplicates (in the
			 * case of MPS and GPU)
			 */
			if (!list_find_first(device_list, _find_device,
					     gres_device))
				list_append(device_list, gres_device);
		}
		list_iterator_destroy(dev_itr);
	}

	if (!gres_list)
		return device_list;

	slurm_mutex_lock(&gres_context_lock);
	gres_itr = list_iterator_create(gres_list);
	while ((gres_ptr = list_next(gres_itr))) {
		/* Map this record's plugin_id to its gres_context slot */
		for (j = 0; j < gres_context_cnt; j++) {
			if (gres_ptr->plugin_id == gres_context[j].plugin_id)
				break;
		}

		if (j >= gres_context_cnt) {
			error("We were unable to find the gres in the context!!! This should never happen");
			continue;
		}

		if (!gres_ptr->gres_data)
			continue;

		/* Job and step records store the same fields we need */
		if (is_job) {
			gres_job_state_t *gres_data_ptr =
				(gres_job_state_t *)gres_ptr->gres_data;
			local_bit_alloc = gres_data_ptr->gres_bit_alloc;
			node_cnt = gres_data_ptr->node_cnt;
		} else {
			gres_step_state_t *gres_data_ptr =
				(gres_step_state_t *)gres_ptr->gres_data;
			local_bit_alloc = gres_data_ptr->gres_bit_alloc;
			node_cnt = gres_data_ptr->node_cnt;
		}

		/*
		 * Only a single-node allocation with a bitmap for node 0 can
		 * be mapped onto this node's device files.
		 */
		if ((node_cnt != 1) ||
		    !local_bit_alloc ||
		    !local_bit_alloc[0] ||
		    !gres_context[j].ops.get_devices)
			continue;

		gres_devices = (*(gres_context[j].ops.get_devices))();
		if (!gres_devices) {
			error("We should had got gres_devices, but for some reason none were set in the plugin.");
			continue;
		} else if ((int)bit_size(local_bit_alloc[0]) !=
			   list_count(gres_devices)) {
			/* Bitmap must map 1:1 onto the plugin's device list */
			error("We got %d gres devices when we were only told about %d. This should never happen.",
			      list_count(gres_devices),
			      (int)bit_size(local_bit_alloc[0]));
			continue;

		}

		/* Mark each device whose bit is set as allocated */
		dev_itr = list_iterator_create(gres_devices);
		i = 0;
		while ((gres_device = list_next(dev_itr))) {
			if (bit_test(local_bit_alloc[0], i)) {
				gres_device_t *gres_device2;
				/*
				 * search for the device among the unique
				 * devices list (since two plugins could have
				 * device records that point to the same file,
				 * like with GPU and MPS)
				 */
				gres_device2 = list_find_first(device_list,
							       _find_device,
							       gres_device);
				/*
				 * Set both, in case they point to different
				 * records
				 */
				gres_device->alloc = 1;
				if (gres_device2)
					gres_device2->alloc = 1;
			}
			//info("%d is %d", i, gres_device->alloc);
			i++;
		}
		list_iterator_destroy(dev_itr);
	}
	list_iterator_destroy(gres_itr);
	slurm_mutex_unlock(&gres_context_lock);

	return device_list;
}
11724
_step_state_delete(void * gres_data)11725 static void _step_state_delete(void *gres_data)
11726 {
11727 int i;
11728 gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
11729
11730 if (gres_ptr == NULL)
11731 return;
11732
11733 FREE_NULL_BITMAP(gres_ptr->node_in_use);
11734 if (gres_ptr->gres_bit_alloc) {
11735 for (i = 0; i < gres_ptr->node_cnt; i++)
11736 FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
11737 xfree(gres_ptr->gres_bit_alloc);
11738 }
11739 xfree(gres_ptr->gres_cnt_node_alloc);
11740 xfree(gres_ptr->type_name);
11741 xfree(gres_ptr);
11742 }
11743
_gres_step_list_delete(void * list_element)11744 static void _gres_step_list_delete(void *list_element)
11745 {
11746 gres_state_t *gres_ptr = (gres_state_t *) list_element;
11747
11748 _step_state_delete(gres_ptr->gres_data);
11749 xfree(gres_ptr);
11750 }
11751
/*
 * Determine how many cores on one node can be used by a step given the job's
 * GRES allocation and any GRES already reserved by other steps.
 * IN step_gres_data - step's GRES request (gres_step_state_t)
 * IN job_gres_data - job's GRES allocation (gres_job_state_t)
 * IN node_offset - zero-origin index of this node within the job allocation
 * IN first_step_node - if true, reset the step's running GRES totals
 * IN cpus_per_task - CPUs required per task (used with gres_per_task)
 * IN max_rem_nodes - count of nodes remaining for the step's allocation
 * IN ignore_alloc - if true, accumulate into gross_gres and ignore GRES
 *                   already allocated to other steps
 * IN job_id, step_id - IDs used only in log messages
 * IN plugin_id - GRES plugin ID, used to detect shared GRES (e.g. MPS)
 * RET 0 if the node cannot satisfy the step's minimum GRES need,
 *     NO_VAL64 if GRES imposes no core limit, otherwise a core count
 *
 * Side effect: adds the usable GRES count to step_gres_ptr->gross_gres or
 * ->total_gres (selected by ignore_alloc) unless the node is unusable.
 */
static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
			   int node_offset, bool first_step_node,
			   uint16_t cpus_per_task, int max_rem_nodes,
			   bool ignore_alloc,
			   uint32_t job_id, uint32_t step_id,
			   uint32_t plugin_id)
{
	gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
	gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
	uint64_t core_cnt, gres_cnt, min_gres = 1, task_cnt;

	xassert(job_gres_ptr);
	xassert(step_gres_ptr);

	/* node_cnt == 0 is valid for no_consume GRES; any offset is accepted */
	if ((node_offset >= job_gres_ptr->node_cnt) &&
	    (job_gres_ptr->node_cnt != 0)) {	/* GRES is type no_consume */
		error("gres/%s: %s %u.%u node offset invalid (%d >= %u)",
		      job_gres_ptr->gres_name, __func__, job_id,
		      step_id, node_offset,
		      job_gres_ptr->node_cnt);
		return 0;
	}

	/* Restart the running totals when evaluating the step's first node */
	if (first_step_node) {
		if (ignore_alloc)
			step_gres_ptr->gross_gres = 0;
		else
			step_gres_ptr->total_gres = 0;
	}
	/* min_gres = largest per-node GRES requirement from the request */
	if (step_gres_ptr->gres_per_node)
		min_gres = step_gres_ptr-> gres_per_node;
	if (step_gres_ptr->gres_per_socket)
		min_gres = MAX(min_gres, step_gres_ptr->gres_per_socket);
	if (step_gres_ptr->gres_per_task)
		min_gres = MAX(min_gres, step_gres_ptr->gres_per_task);
	/*
	 * On the last candidate node, this node must cover whatever remains
	 * of a gres_per_step requirement.
	 */
	if (step_gres_ptr->gres_per_step &&
	    (step_gres_ptr->gres_per_step > step_gres_ptr->total_gres) &&
	    (max_rem_nodes == 1)) {
		gres_cnt = step_gres_ptr->gres_per_step;
		if (ignore_alloc)
			gres_cnt -= step_gres_ptr->gross_gres;
		else
			gres_cnt -= step_gres_ptr->total_gres;
		min_gres = MAX(min_gres, gres_cnt);
	}

	/* Prefer bitmap accounting for non-shared GRES when available */
	if (!_shared_gres(plugin_id) &&
	    job_gres_ptr->gres_bit_alloc &&
	    job_gres_ptr->gres_bit_alloc[node_offset]) {
		gres_cnt = bit_set_count(job_gres_ptr->
					 gres_bit_alloc[node_offset]);
		/* Subtract GRES already claimed by other steps */
		if (!ignore_alloc &&
		    job_gres_ptr->gres_bit_step_alloc &&
		    job_gres_ptr->gres_bit_step_alloc[node_offset]) {
			gres_cnt -= bit_set_count(job_gres_ptr->
						  gres_bit_step_alloc
						  [node_offset]);
		}
		if (min_gres > gres_cnt) {
			core_cnt = 0;
		} else if (step_gres_ptr->gres_per_task) {
			/* Ceiling division: tasks this GRES count supports */
			task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
				   / step_gres_ptr->gres_per_task;
			core_cnt = task_cnt * cpus_per_task;
		} else
			core_cnt = NO_VAL64;
	} else if (job_gres_ptr->gres_cnt_node_alloc &&
		   job_gres_ptr->gres_cnt_step_alloc) {
		/* Fall back to count-based accounting */
		gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
		if (!ignore_alloc) {
			gres_cnt -= job_gres_ptr->
				    gres_cnt_step_alloc[node_offset];
		}
		if (min_gres > gres_cnt) {
			core_cnt = 0;
		} else if (step_gres_ptr->gres_per_task) {
			task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
				   / step_gres_ptr->gres_per_task;
			core_cnt = task_cnt * cpus_per_task;
		} else
			core_cnt = NO_VAL64;
	} else {
		/* No allocation data at all; report no GRES-imposed limit */
		debug3("gres/%s: %s %u.%u gres_bit_alloc and gres_cnt_node_alloc are NULL",
		       job_gres_ptr->gres_name, __func__, job_id, step_id);
		gres_cnt = 0;
		core_cnt = NO_VAL64;
	}
	/* Accumulate usable GRES into the step's running total */
	if (core_cnt != 0) {
		if (ignore_alloc)
			step_gres_ptr->gross_gres += gres_cnt;
		else
			step_gres_ptr->total_gres += gres_cnt;
	}

	return core_cnt;
}
11848
11849 /*
11850 * TRES specification parse logic
11851 * in_val IN - initial input string
11852 * cnt OUT - count of values
11853 * gres_list IN/OUT - where to search for (or add) new step TRES record
11854 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
11855 * rc OUT - unchanged or an error code
11856 * RET gres - step record to set value in, found or created by this function
11857 */
_get_next_step_gres(char * in_val,uint64_t * cnt,List gres_list,char ** save_ptr,int * rc)11858 static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
11859 List gres_list, char **save_ptr,
11860 int *rc)
11861 {
11862 static char *prev_save_ptr = NULL;
11863 int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
11864 gres_step_state_t *step_gres_data = NULL;
11865 gres_state_t *gres_ptr;
11866 gres_key_t step_search_key;
11867 char *type = NULL, *name = NULL;
11868 uint16_t flags = 0;
11869
11870 xassert(save_ptr);
11871 if (!in_val && (*save_ptr == NULL)) {
11872 return NULL;
11873 }
11874
11875 if (*save_ptr == NULL) {
11876 prev_save_ptr = in_val;
11877 } else if (*save_ptr != prev_save_ptr) {
11878 error("%s: parsing error", __func__);
11879 my_rc = SLURM_ERROR;
11880 goto fini;
11881 }
11882
11883 if (prev_save_ptr[0] == '\0') { /* Empty input token */
11884 *save_ptr = NULL;
11885 return NULL;
11886 }
11887
11888 if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
11889 cnt, &flags, &prev_save_ptr)) ||
11890 (context_inx == NO_VAL)) {
11891 prev_save_ptr = NULL;
11892 goto fini;
11893 }
11894
11895 /* Find the step GRES record */
11896 step_search_key.plugin_id = gres_context[context_inx].plugin_id;
11897 step_search_key.type_id = gres_plugin_build_id(type);
11898 gres_ptr = list_find_first(gres_list, _gres_find_step_by_key,
11899 &step_search_key);
11900
11901 if (gres_ptr) {
11902 step_gres_data = gres_ptr->gres_data;
11903 } else {
11904 step_gres_data = xmalloc(sizeof(gres_step_state_t));
11905 step_gres_data->type_id = gres_plugin_build_id(type);
11906 step_gres_data->type_name = type;
11907 type = NULL; /* String moved above */
11908 gres_ptr = xmalloc(sizeof(gres_state_t));
11909 gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
11910 gres_ptr->gres_data = step_gres_data;
11911 list_append(gres_list, gres_ptr);
11912 }
11913 step_gres_data->flags = flags;
11914
11915 fini: xfree(name);
11916 xfree(type);
11917 if (my_rc != SLURM_SUCCESS) {
11918 prev_save_ptr = NULL;
11919 if (my_rc == ESLURM_INVALID_GRES)
11920 info("Invalid GRES job specification %s", in_val);
11921 *rc = my_rc;
11922 }
11923 *save_ptr = prev_save_ptr;
11924 return step_gres_data;
11925 }
11926
11927 /* Test that the step does not request more GRES than the job contains */
/*
 * Test that the step does not request more GRES than the job contains.
 * IN step_gres_list - step's GRES request records
 * IN job_gres_list - job's GRES request records
 * OUT rc - set to ESLURM_INVALID_GRES on the first violation found,
 *          otherwise left unchanged
 */
static void _validate_step_counts(List step_gres_list, List job_gres_list,
				  int *rc)
{
	ListIterator iter;
	gres_state_t *job_gres_ptr, *step_gres_ptr;
	gres_job_state_t *job_gres_data;
	gres_step_state_t *step_gres_data;
	gres_key_t job_search_key;
	uint16_t cpus_per_gres;
	uint64_t mem_per_gres;

	if (!step_gres_list || (list_count(step_gres_list) == 0))
		return;
	/* Step requests GRES but the job has none at all */
	if (!job_gres_list || (list_count(job_gres_list) == 0)) {
		*rc = ESLURM_INVALID_GRES;
		return;
	}

	iter = list_iterator_create(step_gres_list);
	while ((step_gres_ptr = (gres_state_t *) list_next(iter))) {
		step_gres_data = (gres_step_state_t *) step_gres_ptr->gres_data;
		job_search_key.plugin_id = step_gres_ptr->plugin_id;
		/* Untyped step request matches any type of this GRES */
		if (step_gres_data->type_id == 0)
			job_search_key.type_id = NO_VAL;
		else
			job_search_key.type_id = step_gres_data->type_id;
		job_gres_ptr = list_find_first(job_gres_list,
					       _gres_find_job_by_key,
					       &job_search_key);
		/* Step requests a GRES type the job was not allocated */
		if (!job_gres_ptr || !job_gres_ptr->gres_data) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
		/* Explicit job value takes precedence over the default */
		if (job_gres_data->cpus_per_gres)
			cpus_per_gres = job_gres_data->cpus_per_gres;
		else
			cpus_per_gres = job_gres_data->def_cpus_per_gres;
		if (cpus_per_gres && step_gres_data->cpus_per_gres &&
		    (cpus_per_gres < step_gres_data->cpus_per_gres)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		/* Each limit below: step may not exceed the job's value */
		if (job_gres_data->gres_per_job &&
		    step_gres_data->gres_per_step &&
		    (job_gres_data->gres_per_job <
		     step_gres_data->gres_per_step)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		if (job_gres_data->gres_per_node &&
		    step_gres_data->gres_per_node &&
		    (job_gres_data->gres_per_node <
		     step_gres_data->gres_per_node)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		if (job_gres_data->gres_per_socket &&
		    step_gres_data->gres_per_socket &&
		    (job_gres_data->gres_per_socket <
		     step_gres_data->gres_per_socket)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		if (job_gres_data->gres_per_task &&
		    step_gres_data->gres_per_task &&
		    (job_gres_data->gres_per_task <
		     step_gres_data->gres_per_task)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}
		if (job_gres_data->mem_per_gres)
			mem_per_gres = job_gres_data->mem_per_gres;
		else
			mem_per_gres = job_gres_data->def_mem_per_gres;
		if (mem_per_gres && step_gres_data->mem_per_gres &&
		    (mem_per_gres < step_gres_data->mem_per_gres)) {
			*rc = ESLURM_INVALID_GRES;
			break;
		}

	}
	list_iterator_destroy(iter);
}
12012
12013 /*
12014 * Given a step's requested gres configuration, validate it and build gres list
12015 * IN *tres* - step's requested gres input string
12016 * OUT step_gres_list - List of Gres records for this step to track usage
12017 * IN job_gres_list - List of Gres records for this job
12018 * IN job_id, step_id - ID of the step being allocated.
12019 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
12020 */
extern int gres_plugin_step_state_validate(char *cpus_per_tres,
					   char *tres_per_step,
					   char *tres_per_node,
					   char *tres_per_socket,
					   char *tres_per_task,
					   char *mem_per_tres,
					   List *step_gres_list,
					   List job_gres_list, uint32_t job_id,
					   uint32_t step_id)
{
	int rc;
	gres_step_state_t *step_data;
	List step_list;
	uint64_t count = 0;

	*step_gres_list = NULL;
	if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
		return rc;

	slurm_mutex_lock(&gres_context_lock);
	step_list = list_create(_gres_step_list_delete);

	/*
	 * Each input string below is parsed token by token; every token
	 * finds or creates a step GRES record and sets one field of it.
	 */
	if (cpus_per_tres) {
		char *spec = cpus_per_tres, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->cpus_per_gres = count;
			spec = NULL;
		}
	}
	if (tres_per_step) {
		char *spec = tres_per_step, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->gres_per_step = count;
			spec = NULL;
		}
	}
	if (tres_per_node) {
		char *spec = tres_per_node, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->gres_per_node = count;
			spec = NULL;
		}
	}
	if (tres_per_socket) {
		char *spec = tres_per_socket, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->gres_per_socket = count;
			spec = NULL;
		}
	}
	if (tres_per_task) {
		char *spec = tres_per_task, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->gres_per_task = count;
			spec = NULL;
		}
	}
	if (mem_per_tres) {
		char *spec = mem_per_tres, *pos = NULL;
		while ((step_data = _get_next_step_gres(spec, &count,
							step_list,
							&pos, &rc))) {
			step_data->mem_per_gres = count;
			spec = NULL;
		}
	}

	if (list_count(step_list) == 0) {
		/* Nothing was requested; report no list to the caller */
		FREE_NULL_LIST(step_list);
	} else {
		/* Reject any step request exceeding the job's allocation */
		if (rc == SLURM_SUCCESS)
			_validate_step_counts(step_list, job_gres_list,
					      &rc);
		if (rc == SLURM_SUCCESS)
			*step_gres_list = step_list;
		else
			FREE_NULL_LIST(step_list);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;
}
12110
_step_state_dup(void * gres_data)12111 static void *_step_state_dup(void *gres_data)
12112 {
12113
12114 int i;
12115 gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12116 gres_step_state_t *new_gres_ptr;
12117
12118 xassert(gres_ptr);
12119 new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
12120 new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
12121 new_gres_ptr->gres_per_step = gres_ptr->gres_per_step;
12122 new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
12123 new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
12124 new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
12125 new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
12126 new_gres_ptr->node_cnt = gres_ptr->node_cnt;
12127 new_gres_ptr->total_gres = gres_ptr->total_gres;
12128
12129 if (gres_ptr->node_in_use)
12130 new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
12131
12132 if (gres_ptr->gres_cnt_node_alloc) {
12133 i = sizeof(uint64_t) * gres_ptr->node_cnt;
12134 new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
12135 memcpy(new_gres_ptr->gres_cnt_node_alloc,
12136 gres_ptr->gres_cnt_node_alloc, i);
12137 }
12138 if (gres_ptr->gres_bit_alloc) {
12139 new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
12140 sizeof(bitstr_t *));
12141 for (i = 0; i < gres_ptr->node_cnt; i++) {
12142 if (gres_ptr->gres_bit_alloc[i] == NULL)
12143 continue;
12144 new_gres_ptr->gres_bit_alloc[i] =
12145 bit_copy(gres_ptr->gres_bit_alloc[i]);
12146 }
12147 }
12148 return new_gres_ptr;
12149 }
12150
12151 uint64_t *gres_cnt_node_alloc; /* Per node GRES allocated, */
12152
_step_state_dup2(void * gres_data,int node_index)12153 static void *_step_state_dup2(void *gres_data, int node_index)
12154 {
12155
12156 gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12157 gres_step_state_t *new_gres_ptr;
12158
12159 xassert(gres_ptr);
12160 new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
12161 new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
12162 new_gres_ptr->gres_per_step = gres_ptr->gres_per_step;
12163 new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
12164 new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
12165 new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
12166 new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
12167 new_gres_ptr->node_cnt = 1;
12168 new_gres_ptr->total_gres = gres_ptr->total_gres;
12169
12170 if (gres_ptr->node_in_use)
12171 new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
12172
12173 if (gres_ptr->gres_cnt_node_alloc) {
12174 new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
12175 new_gres_ptr->gres_cnt_node_alloc[0] =
12176 gres_ptr->gres_cnt_node_alloc[node_index];
12177 }
12178
12179 if ((node_index < gres_ptr->node_cnt) && gres_ptr->gres_bit_alloc &&
12180 gres_ptr->gres_bit_alloc[node_index]) {
12181 new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
12182 new_gres_ptr->gres_bit_alloc[0] =
12183 bit_copy(gres_ptr->gres_bit_alloc[node_index]);
12184 }
12185 return new_gres_ptr;
12186 }
12187
12188 /*
12189 * Create a copy of a step's gres state
12190 * IN gres_list - List of Gres records for this step to track usage
12191 * RET The copy or NULL on failure
12192 */
List gres_plugin_step_state_dup(List gres_list)
{
	/* node_index of -1 selects the state for all nodes of the step */
	return gres_plugin_step_state_extract(gres_list, -1);
}
12197
12198 /*
12199 * Create a copy of a step's gres state for a particular node index
12200 * IN gres_list - List of Gres records for this step to track usage
12201 * IN node_index - zero-origin index to the node
12202 * RET The copy or NULL on failure
12203 */
List gres_plugin_step_state_extract(List gres_list, int node_index)
{
	ListIterator iter;
	gres_state_t *src_state, *dup_state;
	List dup_list = NULL;
	void *dup_data;

	if (gres_list == NULL)
		return dup_list;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(gres_list);
	while ((src_state = (gres_state_t *) list_next(iter))) {
		/* -1 duplicates state for every node, otherwise just one */
		dup_data = (node_index == -1) ?
			_step_state_dup(src_state->gres_data) :
			_step_state_dup2(src_state->gres_data, node_index);
		/* Create the result list lazily, on first record */
		if (dup_list == NULL)
			dup_list = list_create(_gres_step_list_delete);
		dup_state = xmalloc(sizeof(gres_state_t));
		dup_state->plugin_id = src_state->plugin_id;
		dup_state->gres_data = dup_data;
		list_append(dup_list, dup_state);
	}
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);

	return dup_list;
}
12238
12239 /*
12240 * A job allocation size has changed. Update the job step gres information
12241 * bitmaps and other data structures.
12242 * IN gres_list - List of Gres records for this step to track usage
12243 * IN orig_job_node_bitmap - bitmap of nodes in the original job allocation
12244 * IN new_job_node_bitmap - bitmap of nodes in the new job allocation
12245 */
void gres_plugin_step_state_rebase(List gres_list,
				   bitstr_t *orig_job_node_bitmap,
				   bitstr_t *new_job_node_bitmap)
{
	ListIterator gres_iter;
	gres_state_t *gres_ptr;
	gres_step_state_t *gres_step_ptr;
	int new_node_cnt;
	int i_first, i_last, i;
	int old_inx, new_inx;
	bitstr_t *new_node_in_use;

	if (gres_list == NULL)
		return;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	gres_iter = list_iterator_create(gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
		/*
		 * Must be (re)initialized to NULL for every GRES record.
		 * Previously this was set once outside the loop, so a
		 * second GRES record would write into (and take joint
		 * ownership of) the array already installed in the first
		 * record's gres_bit_alloc, corrupting it and setting up a
		 * double-free.
		 */
		bitstr_t **new_gres_bit_alloc = NULL;

		gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
		if (!gres_step_ptr)
			continue;
		if (!gres_step_ptr->node_in_use) {
			error("gres_plugin_step_state_rebase: node_in_use is NULL");
			continue;
		}
		new_node_cnt = bit_set_count(new_job_node_bitmap);
		/* Scan the union of the old and new job node bitmaps */
		i_first = MIN(bit_ffs(orig_job_node_bitmap),
			      bit_ffs(new_job_node_bitmap));
		i_first = MAX(i_first, 0);
		i_last  = MAX(bit_fls(orig_job_node_bitmap),
			      bit_fls(new_job_node_bitmap));
		if (i_last == -1) {
			error("gres_plugin_step_state_rebase: node_bitmaps "
			      "are empty");
			continue;
		}
		new_node_in_use = bit_alloc(new_node_cnt);

		/* Translate per-node data from old to new node indices */
		old_inx = new_inx = -1;
		for (i = i_first; i <= i_last; i++) {
			bool old_match = false, new_match = false;
			if (bit_test(orig_job_node_bitmap, i)) {
				old_match = true;
				old_inx++;
			}
			if (bit_test(new_job_node_bitmap, i)) {
				new_match = true;
				new_inx++;
			}
			if (old_match && new_match) {
				/* Node retained: move its bitmap across */
				bit_set(new_node_in_use, new_inx);
				if (gres_step_ptr->gres_bit_alloc) {
					if (!new_gres_bit_alloc) {
						new_gres_bit_alloc =
							xcalloc(new_node_cnt,
								sizeof(bitstr_t *));
					}
					new_gres_bit_alloc[new_inx] =
						gres_step_ptr->gres_bit_alloc[old_inx];
				}
			} else if (old_match &&
				   gres_step_ptr->gres_bit_alloc &&
				   gres_step_ptr->gres_bit_alloc[old_inx]) {
				/* Node removed from job allocation,
				 * release step's resources */
				bit_free(gres_step_ptr->
					 gres_bit_alloc[old_inx]);
			}
		}

		/* Install rebuilt structures; old containers are released */
		gres_step_ptr->node_cnt = new_node_cnt;
		bit_free(gres_step_ptr->node_in_use);
		gres_step_ptr->node_in_use = new_node_in_use;
		xfree(gres_step_ptr->gres_bit_alloc);
		gres_step_ptr->gres_bit_alloc = new_gres_bit_alloc;
	}
	list_iterator_destroy(gres_iter);
	slurm_mutex_unlock(&gres_context_lock);

	return;
}
12330
12331 /*
12332 * Pack a step's current gres status, called from slurmctld for save/restore
12333 * IN gres_list - generated by gres_plugin_step_alloc()
12334 * IN/OUT buffer - location to write state to
12335 * IN job_id, step_id - job and step ID for logging
12336 */
extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
				       uint32_t job_id, uint32_t step_id,
				       uint16_t protocol_version)
{
	int i, rc = SLURM_SUCCESS;
	uint32_t top_offset, tail_offset, magic = GRES_MAGIC;
	uint16_t rec_cnt = 0;
	ListIterator gres_iter;
	gres_state_t *gres_ptr;
	gres_step_state_t *gres_step_ptr;

	/*
	 * Remember where the record count goes; a placeholder is packed
	 * now and back-patched with the real count after the loop.
	 */
	top_offset = get_buf_offset(buffer);
	pack16(rec_cnt, buffer);	/* placeholder if data */

	if (gres_list == NULL)
		return rc;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	gres_iter = list_iterator_create(gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
		gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;

		/*
		 * Field order in each branch must stay in lock-step with
		 * gres_plugin_step_state_unpack() for the same protocol
		 * version. The 19.05 format adds the 16-bit flags field.
		 */
		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			pack32(magic, buffer);
			pack32(gres_ptr->plugin_id, buffer);
			pack16(gres_step_ptr->cpus_per_gres, buffer);
			pack16(gres_step_ptr->flags, buffer);
			pack64(gres_step_ptr->gres_per_step, buffer);
			pack64(gres_step_ptr->gres_per_node, buffer);
			pack64(gres_step_ptr->gres_per_socket, buffer);
			pack64(gres_step_ptr->gres_per_task, buffer);
			pack64(gres_step_ptr->mem_per_gres, buffer);
			pack64(gres_step_ptr->total_gres, buffer);
			pack32(gres_step_ptr->node_cnt, buffer);
			pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
			/* A leading 8-bit flag marks each optional array */
			if (gres_step_ptr->gres_cnt_node_alloc) {
				pack8((uint8_t) 1, buffer);
				pack64_array(gres_step_ptr->gres_cnt_node_alloc,
					     gres_step_ptr->node_cnt, buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (gres_step_ptr->gres_bit_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_step_ptr->node_cnt; i++)
					pack_bit_str_hex(gres_step_ptr->
							 gres_bit_alloc[i],
							 buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}
			rec_cnt++;
		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/* Older format: identical except no flags field */
			pack32(magic, buffer);
			pack32(gres_ptr->plugin_id, buffer);
			pack16(gres_step_ptr->cpus_per_gres, buffer);
			pack64(gres_step_ptr->gres_per_step, buffer);
			pack64(gres_step_ptr->gres_per_node, buffer);
			pack64(gres_step_ptr->gres_per_socket, buffer);
			pack64(gres_step_ptr->gres_per_task, buffer);
			pack64(gres_step_ptr->mem_per_gres, buffer);
			pack64(gres_step_ptr->total_gres, buffer);
			pack32(gres_step_ptr->node_cnt, buffer);
			pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
			if (gres_step_ptr->gres_cnt_node_alloc) {
				pack8((uint8_t) 1, buffer);
				pack64_array(gres_step_ptr->gres_cnt_node_alloc,
					     gres_step_ptr->node_cnt, buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}
			if (gres_step_ptr->gres_bit_alloc) {
				pack8((uint8_t) 1, buffer);
				for (i = 0; i < gres_step_ptr->node_cnt; i++)
					pack_bit_str_hex(gres_step_ptr->
							 gres_bit_alloc[i],
							 buffer);
			} else {
				pack8((uint8_t) 0, buffer);
			}
			rec_cnt++;
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			break;
		}
	}
	list_iterator_destroy(gres_iter);
	slurm_mutex_unlock(&gres_context_lock);

	/* Back-patch the real record count over the placeholder */
	tail_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, top_offset);
	pack16(rec_cnt, buffer);
	set_buf_offset(buffer, tail_offset);

	return rc;
}
12436
12437 /*
12438 * Unpack a step's current gres status, called from slurmctld for save/restore
12439 * OUT gres_list - restored state stored by gres_plugin_step_state_pack()
12440 * IN/OUT buffer - location to read state from
12441 * IN job_id, step_id - job and step ID for logging
12442 */
extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
					 uint32_t job_id, uint32_t step_id,
					 uint16_t protocol_version)
{
	int i, rc;
	uint32_t magic = 0, plugin_id = 0, uint32_tmp = 0;
	uint16_t rec_cnt = 0;
	uint8_t data_flag = 0;
	gres_state_t *gres_ptr;
	gres_step_state_t *gres_step_ptr = NULL;

	safe_unpack16(&rec_cnt, buffer);
	if (rec_cnt == 0)
		return SLURM_SUCCESS;

	rc = gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
		*gres_list = list_create(_gres_step_list_delete);
	}

	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
			break;
		rec_cnt--;
		/*
		 * Field order in each branch must mirror
		 * gres_plugin_step_state_pack() for the same protocol
		 * version; the safe_unpack* macros jump to unpack_error
		 * on truncated input.
		 */
		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
			safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
			safe_unpack16(&gres_step_ptr->flags, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
			safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
			safe_unpack64(&gres_step_ptr->total_gres, buffer);
			safe_unpack32(&gres_step_ptr->node_cnt, buffer);
			if (gres_step_ptr->node_cnt > NO_VAL)
				goto unpack_error;
			unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
			/* 8-bit flag marks presence of each optional array */
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				/*
				 * NOTE(review): the unpacked array length
				 * (uint32_tmp) is not validated against
				 * node_cnt — confirm packer guarantees they
				 * match.
				 */
				safe_unpack64_array(
					&gres_step_ptr->gres_cnt_node_alloc,
					&uint32_tmp, buffer);
			}
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				gres_step_ptr->gres_bit_alloc =
					xcalloc(gres_step_ptr->node_cnt,
						sizeof(bitstr_t *));
				for (i = 0; i < gres_step_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_step_ptr->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
			/* Older format: identical except no flags field */
			safe_unpack32(&magic, buffer);
			if (magic != GRES_MAGIC)
				goto unpack_error;
			safe_unpack32(&plugin_id, buffer);
			gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
			safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
			safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
			safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
			safe_unpack64(&gres_step_ptr->total_gres, buffer);
			safe_unpack32(&gres_step_ptr->node_cnt, buffer);
			if (gres_step_ptr->node_cnt > NO_VAL)
				goto unpack_error;
			unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				safe_unpack64_array(
					&gres_step_ptr->gres_cnt_node_alloc,
					&uint32_tmp, buffer);
			}
			safe_unpack8(&data_flag, buffer);
			if (data_flag) {
				gres_step_ptr->gres_bit_alloc =
					xcalloc(gres_step_ptr->node_cnt,
						sizeof(bitstr_t *));
				for (i = 0; i < gres_step_ptr->node_cnt; i++) {
					unpack_bit_str_hex(&gres_step_ptr->
							   gres_bit_alloc[i],
							   buffer);
				}
			}
		} else {
			error("%s: protocol_version %hu not supported",
			      __func__, protocol_version);
			goto unpack_error;
		}

		/* Map the packed plugin_id back to a loaded plugin */
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_context[i].plugin_id == plugin_id)
				break;
		}
		if (i >= gres_context_cnt) {
			/*
			 * A likely sign that GresPlugins has changed.
			 * Not a fatal error, skip over the data.
			 */
			info("%s: no plugin configured to unpack data type %u from step %u.%u",
			     __func__, plugin_id, job_id, step_id);
			_step_state_delete(gres_step_ptr);
			gres_step_ptr = NULL;
			continue;
		}
		gres_ptr = xmalloc(sizeof(gres_state_t));
		gres_ptr->plugin_id = gres_context[i].plugin_id;
		gres_ptr->gres_data = gres_step_ptr;
		gres_step_ptr = NULL;	/* ownership moved to gres_ptr */
		list_append(*gres_list, gres_ptr);
	}
	slurm_mutex_unlock(&gres_context_lock);
	return rc;

unpack_error:
	error("%s: unpack error from step %u.%u", __func__, job_id, step_id);
	if (gres_step_ptr)
		_step_state_delete(gres_step_ptr);
	slurm_mutex_unlock(&gres_context_lock);
	return SLURM_ERROR;
}
12575
12576 /* Return the count of GRES of a specific name on this machine
12577 * IN step_gres_list - generated by gres_plugin_step_alloc()
12578 * IN gres_name - name of the GRES to match
12579 * RET count of GRES of this specific name available to the job or NO_VAL64
12580 */
extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name)
{
	uint64_t total = NO_VAL64;
	gres_state_t *state_ptr = NULL;
	gres_step_state_t *step_state = NULL;
	ListIterator iter;
	int ctx;

	if (!step_gres_list)
		return total;

	slurm_mutex_lock(&gres_context_lock);
	/* Find the context whose name matches, then sum its records */
	for (ctx = 0; ctx < gres_context_cnt; ctx++) {
		if (xstrcmp(gres_context[ctx].gres_name, gres_name))
			continue;
		iter = list_iterator_create(step_gres_list);
		while ((state_ptr = (gres_state_t *) list_next(iter))) {
			if (state_ptr->plugin_id !=
			    gres_context[ctx].plugin_id)
				continue;
			step_state = (gres_step_state_t *)
				     state_ptr->gres_data;
			/* First match replaces the NO_VAL64 sentinel */
			if (total == NO_VAL64)
				total = step_state->gres_per_node;
			else
				total += step_state->gres_per_node;
		}
		list_iterator_destroy(iter);
		break;
	}
	slurm_mutex_unlock(&gres_context_lock);

	return total;
}
12613
12614 /*
12615 * Given a GRES context index, return a bitmap representing those GRES
12616 * which are available from the CPUs current allocated to this process.
12617 * This function only works with task/cgroup and constrained devices or
12618 * if the job step has access to the entire node's resources.
12619 */
static bitstr_t * _get_usable_gres(int context_inx)
{
#if defined(__APPLE__)
	/* No affinity API available; binding unsupported on macOS */
	return NULL;
#else
#ifdef __NetBSD__
	// On NetBSD, cpuset_t is an opaque data type
	cpuset_t *mask = cpuset_create();
#else
	cpu_set_t mask;
#endif
	bitstr_t *usable_gres = NULL;
	int i, i_last, rc;
	ListIterator iter;
	gres_slurmd_conf_t *gres_slurmd_conf;
	int gres_inx = 0;	/* running offset into the global GRES bitmap */

	if (!gres_conf_list) {
		error("gres_conf_list is null!");
		return NULL;
	}

	/*
	 * NOTE(review): on NetBSD `mask` is a pointer, so CPU_ZERO(&mask)
	 * and sched_getaffinity(..., &mask) below operate on the pointer
	 * object itself, and `mask` leaks on the early returns above and
	 * below — confirm whether this path ever builds/runs on NetBSD.
	 */
	CPU_ZERO(&mask);
#ifdef __FreeBSD__
	rc = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				sizeof(mask), &mask);
#else
	rc = sched_getaffinity(0, sizeof(mask), &mask);
#endif
	if (rc) {
		error("sched_getaffinity error: %m");
		return usable_gres;
	}

	usable_gres = bit_alloc(MAX_GRES_BITMAP);
	iter = list_iterator_create(gres_conf_list);
	/* Walk every configured GRES record for this plugin */
	while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
		if (gres_slurmd_conf->plugin_id !=
		    gres_context[context_inx].plugin_id)
			continue;
		if ((gres_inx + gres_slurmd_conf->count) >= MAX_GRES_BITMAP) {
			error("GRES %s bitmap overflow ((%d + %"PRIu64") >= %d)",
			      gres_slurmd_conf->name, gres_inx,
			      gres_slurmd_conf->count, MAX_GRES_BITMAP);
			continue;
		}
		if (!gres_slurmd_conf->cpus_bitmap) {
			/* No CPU binding configured: all devices usable */
			bit_nset(usable_gres, gres_inx,
				 gres_inx + gres_slurmd_conf->count - 1);
		} else {
			/*
			 * Mark this record's devices usable if ANY of its
			 * bound CPUs is in our current affinity mask.
			 */
			i_last = bit_fls(gres_slurmd_conf->cpus_bitmap);
			for (i = 0; i <= i_last; i++) {
				if (!bit_test(gres_slurmd_conf->cpus_bitmap, i))
					continue;
				if (!CPU_ISSET(i, &mask))
					continue;
				bit_nset(usable_gres, gres_inx,
					 gres_inx + gres_slurmd_conf->count -1);
				break;
			}
		}
		gres_inx += gres_slurmd_conf->count;
	}
	list_iterator_destroy(iter);

#ifdef __NetBSD__
	cpuset_destroy(mask);
#endif

	return usable_gres;
#endif
}
12692
12693 /*
12694 * Configure the GRES hardware allocated to the current step while privileged
12695 *
12696 * IN step_gres_list - Step's GRES specification
12697 * IN node_id - relative position of this node in step
12698 * IN settings - string containing configuration settings for the hardware
12699 */
extern void gres_plugin_step_hardware_init(List step_gres_list,
					   uint32_t node_id, char *settings)
{
	int ctx;
	ListIterator iter;
	gres_state_t *gres_ptr;
	gres_step_state_t *step_state;
	bitstr_t *devices;

	if (!step_gres_list)
		return;

	(void) gres_plugin_init();
	slurm_mutex_lock(&gres_context_lock);
	for (ctx = 0; ctx < gres_context_cnt; ctx++) {
		if (!gres_context[ctx].ops.step_hardware_init)
			continue;	/* plugin has no init hook */

		/* Locate this plugin's record in the step's GRES list */
		iter = list_iterator_create(step_gres_list);
		while ((gres_ptr = list_next(iter))) {
			if (gres_ptr->plugin_id ==
			    gres_context[ctx].plugin_id)
				break;
		}
		list_iterator_destroy(iter);
		if (!gres_ptr || !gres_ptr->gres_data)
			continue;
		step_state = (gres_step_state_t *) gres_ptr->gres_data;
		/* Only usable with single-node state carrying a bitmap */
		if ((step_state->node_cnt != 1) ||
		    !step_state->gres_bit_alloc ||
		    !step_state->gres_bit_alloc[0])
			continue;

		devices = step_state->gres_bit_alloc[0];
		if (settings)
			debug2("settings: %s", settings);
		if (devices) {
			char *dev_str = bit_fmt_full(devices);
			info("devices: %s", dev_str);
			xfree(dev_str);
		}
		(*(gres_context[ctx].ops.step_hardware_init))(devices,
							      settings);
	}
	slurm_mutex_unlock(&gres_context_lock);
}
12744
12745 /*
12746 * Optionally undo GRES hardware configuration while privileged
12747 */
gres_plugin_step_hardware_fini(void)12748 extern void gres_plugin_step_hardware_fini(void)
12749 {
12750 int i;
12751 (void) gres_plugin_init();
12752 slurm_mutex_lock(&gres_context_lock);
12753 for (i = 0; i < gres_context_cnt; i++) {
12754 if (gres_context[i].ops.step_hardware_fini == NULL) {
12755 continue;
12756 }
12757 (*(gres_context[i].ops.step_hardware_fini)) ();
12758 }
12759 slurm_mutex_unlock(&gres_context_lock);
12760 }
12761
12762 /*
12763 * Given a set GRES maps and the local process ID, return the bitmap of
12764 * GRES that should be available to this task.
12765 */
_get_gres_map(char * map_gres,int local_proc_id)12766 static bitstr_t *_get_gres_map(char *map_gres, int local_proc_id)
12767 {
12768 bitstr_t *usable_gres = NULL;
12769 char *tmp, *tok, *save_ptr = NULL, *mult;
12770 int task_offset = 0, task_mult;
12771 int map_value;
12772
12773 if (!map_gres || !map_gres[0])
12774 return NULL;
12775
12776 while (usable_gres == NULL) {
12777 tmp = xstrdup(map_gres);
12778 tok = strtok_r(tmp, ",", &save_ptr);
12779 while (tok) {
12780 if ((mult = strchr(tok, '*'))) {
12781 mult[0] = '\0';
12782 task_mult = atoi(mult + 1);
12783 } else
12784 task_mult = 1;
12785 if (task_mult == 0)
12786 task_mult = 1;
12787 if ((local_proc_id >= task_offset) &&
12788 (local_proc_id <= (task_offset + task_mult - 1))) {
12789 map_value = strtol(tok, NULL, 0);
12790 if ((map_value < 0) ||
12791 (map_value >= MAX_GRES_BITMAP)) {
12792 xfree(tmp);
12793 goto end; /* Bad value */
12794 }
12795 usable_gres = bit_alloc(MAX_GRES_BITMAP);
12796 bit_set(usable_gres, map_value);
12797 break; /* All done */
12798 } else {
12799 task_offset += task_mult;
12800 }
12801 tok = strtok_r(NULL, ",", &save_ptr);
12802 }
12803 xfree(tmp);
12804 }
12805 end:
12806
12807 return usable_gres;
12808 }
12809
12810 /*
12811 * Given a set GRES masks and the local process ID, return the bitmap of
12812 * GRES that should be available to this task.
12813 */
_get_gres_mask(char * mask_gres,int local_proc_id)12814 static bitstr_t * _get_gres_mask(char *mask_gres, int local_proc_id)
12815 {
12816 bitstr_t *usable_gres = NULL;
12817 char *tmp, *tok, *save_ptr = NULL, *mult;
12818 int i, task_offset = 0, task_mult;
12819 uint64_t mask_value;
12820
12821 if (!mask_gres || !mask_gres[0])
12822 return NULL;
12823
12824 tmp = xstrdup(mask_gres);
12825 tok = strtok_r(tmp, ",", &save_ptr);
12826 while (tok) {
12827 if ((mult = strchr(tok, '*')))
12828 task_mult = atoi(mult + 1);
12829 else
12830 task_mult = 1;
12831 if ((local_proc_id >= task_offset) &&
12832 (local_proc_id <= (task_offset + task_mult - 1))) {
12833 mask_value = strtol(tok, NULL, 0);
12834 if ((mask_value <= 0) || (mask_value >= 0xffffffff))
12835 break; /* Bad value */
12836 usable_gres = bit_alloc(MAX_GRES_BITMAP);
12837 for (i = 0; i < 64; i++) {
12838 if ((mask_value >> i) & 0x1)
12839 bit_set(usable_gres, i);
12840 }
12841 break; /* All done */
12842 } else {
12843 task_offset += task_mult;
12844 }
12845 tok = strtok_r(NULL, ",", &save_ptr);
12846 }
12847 xfree(tmp);
12848
12849 return usable_gres;
12850 }
12851
12852 /*
12853 * Set environment as required for all tasks of a job step
12854 * IN/OUT job_env_ptr - environment variable array
12855 * IN step_gres_list - generated by gres_plugin_step_alloc()
12856 * IN accel_bind_type - GRES binding options (old format, a bitmap)
12857 * IN tres_bind - TRES binding directives (new format, a string)
12858 * IN local_proc_id - task rank, local to this compute node only
12859 */
extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list,
				     uint16_t accel_bind_type, char *tres_bind,
				     int local_proc_id)
{
	int i;
	ListIterator gres_iter;
	gres_state_t *gres_ptr = NULL;
	bool bind_gpu = accel_bind_type & ACCEL_BIND_CLOSEST_GPU;
	bool bind_nic = accel_bind_type & ACCEL_BIND_CLOSEST_NIC;
	bool bind_mic = accel_bind_type & ACCEL_BIND_CLOSEST_MIC;
	char *sep, *map_gpu = NULL, *mask_gpu = NULL;
	bitstr_t *usable_gres = NULL;
	bool found;

	/*
	 * Parse the new-format "gpu:..." directive from tres_bind; it can
	 * request closest-GPU binding or an explicit per-task map/mask.
	 */
	if (!bind_gpu && tres_bind && (sep = strstr(tres_bind, "gpu:"))) {
		sep += 4;
		if (!strncasecmp(sep, "closest", 7))
			bind_gpu = true;
		else if (!strncasecmp(sep, "map_gpu:", 8))
			map_gpu = sep + 8;
		else if (!strncasecmp(sep, "mask_gpu:", 9))
			mask_gpu = sep + 9;
	}

	(void) gres_plugin_init();
	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		if (!gres_context[i].ops.step_set_env)
			continue;	/* No plugin to call */
		/*
		 * With any binding request, compute usable_gres for the
		 * gpu/mic/nic plugins and skip all other plugins entirely.
		 */
		if (bind_gpu || bind_mic || bind_nic || map_gpu || mask_gpu) {
			if (!xstrcmp(gres_context[i].gres_name, "gpu")) {
				if (map_gpu) {
					usable_gres = _get_gres_map(map_gpu,
							local_proc_id);
				} else if (mask_gpu) {
					usable_gres = _get_gres_mask(mask_gpu,
							local_proc_id);
				} else if (bind_gpu)
					usable_gres = _get_usable_gres(i);
				else
					continue;
			} else if (!xstrcmp(gres_context[i].gres_name,
					    "mic")) {
				if (bind_mic)
					usable_gres = _get_usable_gres(i);
				else
					continue;
			} else if (!xstrcmp(gres_context[i].gres_name,
					    "nic")) {
				if (bind_nic)
					usable_gres = _get_usable_gres(i);
				else
					continue;
			} else {
				continue;
			}
		}
		found = false;
		if (step_gres_list) {
			gres_iter = list_iterator_create(step_gres_list);
			while ((gres_ptr = (gres_state_t *)
					   list_next(gres_iter))) {
				if (gres_ptr->plugin_id !=
				    gres_context[i].plugin_id)
					continue;
				/* reset_env when any binding is in effect,
				 * plain set_env otherwise */
				if (accel_bind_type || tres_bind) {
					(*(gres_context[i].ops.step_reset_env))
						(job_env_ptr,
						 gres_ptr->gres_data,
						 usable_gres);
				} else {
					(*(gres_context[i].ops.step_set_env))
						(job_env_ptr,
						 gres_ptr->gres_data);
				}
				found = true;
			}
			list_iterator_destroy(gres_iter);
		}
		if (!found) { /* No data found */
			/* Let the plugin clear/initialize its variables */
			if (accel_bind_type || tres_bind) {
				(*(gres_context[i].ops.step_reset_env))
					(job_env_ptr, NULL, NULL);
			} else {
				(*(gres_context[i].ops.step_set_env))
					(job_env_ptr, NULL);
			}
		}
		/* Sets usable_gres back to NULL for the next iteration */
		FREE_NULL_BITMAP(usable_gres);
	}
	slurm_mutex_unlock(&gres_context_lock);
	/* No-op here (already NULL); kept as belt-and-braces cleanup */
	FREE_NULL_BITMAP(usable_gres);
}
12953
_step_state_log(void * gres_data,uint32_t job_id,uint32_t step_id,char * gres_name)12954 static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
12955 char *gres_name)
12956 {
12957 gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12958 char tmp_str[128];
12959 int i;
12960
12961 xassert(gres_ptr);
12962 info("gres:%s type:%s(%u) step:%u.%u flags:%s state", gres_name,
12963 gres_ptr->type_name, gres_ptr->type_id, job_id, step_id,
12964 _gres_flags_str(gres_ptr->flags));
12965 if (gres_ptr->cpus_per_gres)
12966 info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres);
12967 if (gres_ptr->gres_per_step)
12968 info(" gres_per_step:%"PRIu64, gres_ptr->gres_per_step);
12969 if (gres_ptr->gres_per_node) {
12970 info(" gres_per_node:%"PRIu64" node_cnt:%u",
12971 gres_ptr->gres_per_node, gres_ptr->node_cnt);
12972 }
12973 if (gres_ptr->gres_per_socket)
12974 info(" gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
12975 if (gres_ptr->gres_per_task)
12976 info(" gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
12977 if (gres_ptr->mem_per_gres)
12978 info(" mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
12979
12980 if (gres_ptr->node_in_use == NULL)
12981 info(" node_in_use:NULL");
12982 else if (gres_ptr->gres_bit_alloc == NULL)
12983 info(" gres_bit_alloc:NULL");
12984 else {
12985 for (i = 0; i < gres_ptr->node_cnt; i++) {
12986 if (!bit_test(gres_ptr->node_in_use, i))
12987 continue;
12988 if (gres_ptr->gres_bit_alloc[i]) {
12989 bit_fmt(tmp_str, sizeof(tmp_str),
12990 gres_ptr->gres_bit_alloc[i]);
12991 info(" gres_bit_alloc[%d]:%s of %d", i,
12992 tmp_str,
12993 (int)bit_size(gres_ptr->gres_bit_alloc[i]));
12994 } else
12995 info(" gres_bit_alloc[%d]:NULL", i);
12996 }
12997 }
12998 }
12999
13000 /*
13001 * Log a step's current gres state
13002 * IN gres_list - generated by gres_plugin_step_alloc()
13003 * IN job_id - job's ID
13004 */
extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
				       uint32_t step_id)
{
	int ctx;
	ListIterator iter;
	gres_state_t *gres_ptr;

	/* Only log when GRES debugging is enabled */
	if (!gres_debug || (gres_list == NULL))
		return;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	iter = list_iterator_create(gres_list);
	while ((gres_ptr = (gres_state_t *) list_next(iter))) {
		/* Resolve the plugin name, then log the record */
		for (ctx = 0; ctx < gres_context_cnt; ctx++) {
			if (gres_context[ctx].plugin_id !=
			    gres_ptr->plugin_id)
				continue;
			_step_state_log(gres_ptr->gres_data, job_id, step_id,
					gres_context[ctx].gres_name);
			break;
		}
	}
	list_iterator_destroy(iter);
	slurm_mutex_unlock(&gres_context_lock);
}
13031
13032 /*
13033 * Determine how many cores of a job's allocation can be allocated to a step
13034 * on a specific node
13035 * IN job_gres_list - a running job's gres info
13036 * IN/OUT step_gres_list - a pending job step's gres requirements
13037 * IN node_offset - index into the job's node allocation
13038 * IN first_step_node - true if this is node zero of the step (do initialization)
13039 * IN cpus_per_task - number of CPUs required per task
13040 * IN max_rem_nodes - maximum nodes remaining for step (including this one)
13041 * IN ignore_alloc - if set ignore resources already allocated to running steps
13042 * IN job_id, step_id - ID of the step being allocated.
13043 * RET Count of available cores on this node (sort of):
13044 * NO_VAL64 if no limit or 0 if node is not usable
13045 */
gres_plugin_step_test(List step_gres_list,List job_gres_list,int node_offset,bool first_step_node,uint16_t cpus_per_task,int max_rem_nodes,bool ignore_alloc,uint32_t job_id,uint32_t step_id)13046 extern uint64_t gres_plugin_step_test(List step_gres_list, List job_gres_list,
13047 int node_offset, bool first_step_node,
13048 uint16_t cpus_per_task, int max_rem_nodes,
13049 bool ignore_alloc,
13050 uint32_t job_id, uint32_t step_id)
13051 {
13052 uint64_t core_cnt, tmp_cnt;
13053 ListIterator step_gres_iter;
13054 gres_state_t *job_gres_ptr, *step_gres_ptr;
13055 gres_step_state_t *step_data_ptr = NULL;
13056
13057 if (step_gres_list == NULL)
13058 return NO_VAL64;
13059 if (job_gres_list == NULL)
13060 return 0;
13061
13062 if (cpus_per_task == 0)
13063 cpus_per_task = 1;
13064 core_cnt = NO_VAL64;
13065 (void) gres_plugin_init();
13066
13067 slurm_mutex_lock(&gres_context_lock);
13068 step_gres_iter = list_iterator_create(step_gres_list);
13069 while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13070 gres_key_t job_search_key;
13071 step_data_ptr = (gres_step_state_t *)step_gres_ptr->gres_data;
13072 job_search_key.plugin_id = step_gres_ptr->plugin_id;
13073 if (step_data_ptr->type_name)
13074 job_search_key.type_id = step_data_ptr->type_id;
13075 else
13076 job_search_key.type_id = NO_VAL;
13077
13078 job_search_key.node_offset = node_offset;
13079 if (!(job_gres_ptr = list_find_first(
13080 job_gres_list,
13081 _gres_find_job_by_key_with_cnt,
13082 &job_search_key))) {
13083 /* job lack resources required by the step */
13084 core_cnt = 0;
13085 break;
13086 }
13087
13088 tmp_cnt = _step_test(step_data_ptr,
13089 job_gres_ptr->gres_data,
13090 node_offset, first_step_node,
13091 cpus_per_task, max_rem_nodes,
13092 ignore_alloc,
13093 job_id, step_id,
13094 step_gres_ptr->plugin_id);
13095 if ((tmp_cnt != NO_VAL64) && (tmp_cnt < core_cnt))
13096 core_cnt = tmp_cnt;
13097
13098 if (core_cnt == 0)
13099 break;
13100 }
13101 list_iterator_destroy(step_gres_iter);
13102 slurm_mutex_unlock(&gres_context_lock);
13103
13104 return core_cnt;
13105 }
13106
13107 /*
13108 * Return TRUE if this plugin ID consumes GRES count > 1 for a single device
13109 * file (e.g. MPS)
13110 */
_shared_gres(uint32_t plugin_id)13111 static bool _shared_gres(uint32_t plugin_id)
13112 {
13113 if (plugin_id == mps_plugin_id)
13114 return true;
13115 return false;
13116 }
13117 /*
13118 * Return TRUE if this plugin ID shares resources with another GRES that
13119 * consumes subsets of its resources (e.g. GPU)
13120 */
_sharing_gres(uint32_t plugin_id)13121 static bool _sharing_gres(uint32_t plugin_id)
13122 {
13123 if (plugin_id == gpu_plugin_id)
13124 return true;
13125 return false;
13126 }
13127
_step_alloc(void * step_gres_data,void * job_gres_data,uint32_t plugin_id,int node_offset,bool first_step_node,uint32_t job_id,uint32_t step_id,uint16_t tasks_on_node,uint32_t rem_nodes)13128 static int _step_alloc(void *step_gres_data, void *job_gres_data,
13129 uint32_t plugin_id, int node_offset,
13130 bool first_step_node,
13131 uint32_t job_id, uint32_t step_id,
13132 uint16_t tasks_on_node, uint32_t rem_nodes)
13133 {
13134 gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
13135 gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
13136 uint64_t gres_needed, gres_avail, max_gres = 0;
13137 bitstr_t *gres_bit_alloc;
13138 int i, len;
13139
13140 xassert(job_gres_ptr);
13141 xassert(step_gres_ptr);
13142
13143 if (job_gres_ptr->node_cnt == 0) /* no_consume */
13144 return SLURM_SUCCESS;
13145
13146 if (node_offset >= job_gres_ptr->node_cnt) {
13147 error("gres/%s: %s for %u.%u, node offset invalid (%d >= %u)",
13148 job_gres_ptr->gres_name, __func__, job_id,
13149 step_id, node_offset,
13150 job_gres_ptr->node_cnt);
13151 return SLURM_ERROR;
13152 }
13153
13154 if (first_step_node)
13155 step_gres_ptr->total_gres = 0;
13156 if (step_gres_ptr->gres_per_node) {
13157 gres_needed = step_gres_ptr->gres_per_node;
13158 } else if (step_gres_ptr->gres_per_task) {
13159 gres_needed = step_gres_ptr->gres_per_task * tasks_on_node;
13160 } else if (step_gres_ptr->gres_per_step && (rem_nodes == 1)) {
13161 gres_needed = step_gres_ptr->gres_per_step -
13162 step_gres_ptr->total_gres;
13163 } else if (step_gres_ptr->gres_per_step) {
13164 /* Leave at least one GRES per remaining node */
13165 max_gres = step_gres_ptr->gres_per_step -
13166 step_gres_ptr->total_gres - (rem_nodes - 1);
13167 gres_needed = 1;
13168 } else {
13169 /*
13170 * No explicit step GRES specification.
13171 * Note that gres_per_socket is not supported for steps
13172 */
13173 gres_needed = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13174 }
13175 if (step_gres_ptr->node_cnt == 0)
13176 step_gres_ptr->node_cnt = job_gres_ptr->node_cnt;
13177 if (!step_gres_ptr->gres_cnt_node_alloc) {
13178 step_gres_ptr->gres_cnt_node_alloc =
13179 xcalloc(step_gres_ptr->node_cnt, sizeof(uint64_t));
13180 }
13181
13182 if (job_gres_ptr->gres_cnt_node_alloc &&
13183 job_gres_ptr->gres_cnt_node_alloc[node_offset])
13184 gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13185 else if (job_gres_ptr->gres_bit_select &&
13186 job_gres_ptr->gres_bit_select[node_offset])
13187 gres_avail = bit_set_count(
13188 job_gres_ptr->gres_bit_select[node_offset]);
13189 else if (job_gres_ptr->gres_cnt_node_alloc)
13190 gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13191 else
13192 gres_avail = job_gres_ptr->gres_per_node;
13193 if (gres_needed > gres_avail) {
13194 error("gres/%s: %s for %u.%u, step's > job's "
13195 "for node %d (%"PRIu64" > %"PRIu64")",
13196 job_gres_ptr->gres_name, __func__, job_id,
13197 step_id, node_offset, gres_needed, gres_avail);
13198 return SLURM_ERROR;
13199 }
13200
13201 if (!job_gres_ptr->gres_cnt_step_alloc) {
13202 job_gres_ptr->gres_cnt_step_alloc =
13203 xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
13204 }
13205
13206 if (gres_needed >
13207 (gres_avail - job_gres_ptr->gres_cnt_step_alloc[node_offset])) {
13208 error("gres/%s: %s for %u.%u, step's > job's "
13209 "remaining for node %d (%"PRIu64" > "
13210 "(%"PRIu64" - %"PRIu64"))",
13211 job_gres_ptr->gres_name, __func__, job_id,
13212 step_id, node_offset, gres_needed, gres_avail,
13213 job_gres_ptr->gres_cnt_step_alloc[node_offset]);
13214 return SLURM_ERROR;
13215 }
13216 gres_avail -= job_gres_ptr->gres_cnt_step_alloc[node_offset];
13217 if (max_gres)
13218 gres_needed = MIN(gres_avail, max_gres);
13219
13220 if (step_gres_ptr->gres_cnt_node_alloc &&
13221 (node_offset < step_gres_ptr->node_cnt))
13222 step_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_needed;
13223 step_gres_ptr->total_gres += gres_needed;
13224
13225 if (step_gres_ptr->node_in_use == NULL) {
13226 step_gres_ptr->node_in_use = bit_alloc(job_gres_ptr->node_cnt);
13227 }
13228 bit_set(step_gres_ptr->node_in_use, node_offset);
13229 job_gres_ptr->gres_cnt_step_alloc[node_offset] += gres_needed;
13230
13231 if ((job_gres_ptr->gres_bit_alloc == NULL) ||
13232 (job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) {
13233 debug3("gres/%s: %s gres_bit_alloc for %u.%u is NULL",
13234 job_gres_ptr->gres_name, __func__, job_id, step_id);
13235 return SLURM_SUCCESS;
13236 }
13237
13238 gres_bit_alloc = bit_copy(job_gres_ptr->gres_bit_alloc[node_offset]);
13239 len = bit_size(gres_bit_alloc);
13240 if (_shared_gres(plugin_id)) {
13241 for (i = 0; i < len; i++) {
13242 if (gres_needed > 0) {
13243 if (bit_test(gres_bit_alloc, i))
13244 gres_needed = 0;
13245 } else {
13246 bit_clear(gres_bit_alloc, i);
13247 }
13248 }
13249 } else {
13250 if (job_gres_ptr->gres_bit_step_alloc &&
13251 job_gres_ptr->gres_bit_step_alloc[node_offset]) {
13252 bit_and_not(gres_bit_alloc,
13253 job_gres_ptr->gres_bit_step_alloc[node_offset]);
13254 }
13255 for (i = 0; i < len; i++) {
13256 if (gres_needed > 0) {
13257 if (bit_test(gres_bit_alloc, i))
13258 gres_needed--;
13259 } else {
13260 bit_clear(gres_bit_alloc, i);
13261 }
13262 }
13263 }
13264 if (gres_needed) {
13265 error("gres/%s: %s step %u.%u oversubscribed resources on node %d",
13266 job_gres_ptr->gres_name, __func__,
13267 job_id, step_id, node_offset);
13268 }
13269
13270 if (job_gres_ptr->gres_bit_step_alloc == NULL) {
13271 job_gres_ptr->gres_bit_step_alloc =
13272 xcalloc(job_gres_ptr->node_cnt, sizeof(bitstr_t *));
13273 }
13274 if (job_gres_ptr->gres_bit_step_alloc[node_offset]) {
13275 bit_or(job_gres_ptr->gres_bit_step_alloc[node_offset],
13276 gres_bit_alloc);
13277 } else {
13278 job_gres_ptr->gres_bit_step_alloc[node_offset] =
13279 bit_copy(gres_bit_alloc);
13280 }
13281 if (step_gres_ptr->gres_bit_alloc == NULL) {
13282 step_gres_ptr->gres_bit_alloc = xcalloc(job_gres_ptr->node_cnt,
13283 sizeof(bitstr_t *));
13284 }
13285 if (step_gres_ptr->gres_bit_alloc[node_offset]) {
13286 error("gres/%s: %s step %u.%u bit_alloc already exists",
13287 job_gres_ptr->gres_name, __func__, job_id, step_id);
13288 bit_or(step_gres_ptr->gres_bit_alloc[node_offset],
13289 gres_bit_alloc);
13290 FREE_NULL_BITMAP(gres_bit_alloc);
13291 } else {
13292 step_gres_ptr->gres_bit_alloc[node_offset] = gres_bit_alloc;
13293 }
13294
13295 return SLURM_SUCCESS;
13296 }
13297
13298 /*
13299 * Allocate resource to a step and update job and step gres information
13300 * IN step_gres_list - step's gres_list built by
13301 * gres_plugin_step_state_validate()
13302 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13303 * IN node_offset - job's zero-origin index to the node of interest
13304 * IN first_step_node - true if this is the first node in the step's allocation
13305 * IN tasks_on_node - number of tasks to be launched on this node
13306 * IN rem_nodes - desired additional node count to allocate, including this node
13307 * IN job_id, step_id - ID of the step being allocated.
13308 * RET SLURM_SUCCESS or error code
13309 */
gres_plugin_step_alloc(List step_gres_list,List job_gres_list,int node_offset,bool first_step_node,uint16_t tasks_on_node,uint32_t rem_nodes,uint32_t job_id,uint32_t step_id)13310 extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list,
13311 int node_offset, bool first_step_node,
13312 uint16_t tasks_on_node, uint32_t rem_nodes,
13313 uint32_t job_id, uint32_t step_id)
13314 {
13315 int rc, rc2;
13316 ListIterator step_gres_iter;
13317 gres_state_t *step_gres_ptr, *job_gres_ptr;
13318
13319 if (step_gres_list == NULL)
13320 return SLURM_SUCCESS;
13321 if (job_gres_list == NULL) {
13322 error("%s: step allocates GRES, but job %u has none",
13323 __func__, job_id);
13324 return SLURM_ERROR;
13325 }
13326
13327 rc = gres_plugin_init();
13328
13329 slurm_mutex_lock(&gres_context_lock);
13330 step_gres_iter = list_iterator_create(step_gres_list);
13331 while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13332 gres_step_state_t *step_data_ptr =
13333 (gres_step_state_t *) step_gres_ptr->gres_data;
13334 gres_key_t job_search_key;
13335 step_data_ptr = (gres_step_state_t *)step_gres_ptr->gres_data;
13336 job_search_key.plugin_id = step_gres_ptr->plugin_id;
13337 if (step_data_ptr->type_name)
13338 job_search_key.type_id = step_data_ptr->type_id;
13339 else
13340 job_search_key.type_id = NO_VAL;
13341
13342 job_search_key.node_offset = node_offset;
13343 if (!(job_gres_ptr = list_find_first(
13344 job_gres_list,
13345 _gres_find_job_by_key_with_cnt,
13346 &job_search_key))) {
13347 /* job lack resources required by the step */
13348 rc = ESLURM_INVALID_GRES;
13349 break;
13350 }
13351
13352 rc2 = _step_alloc(step_data_ptr,
13353 job_gres_ptr->gres_data,
13354 step_gres_ptr->plugin_id, node_offset,
13355 first_step_node,
13356 job_id, step_id, tasks_on_node, rem_nodes);
13357 if (rc2 != SLURM_SUCCESS)
13358 rc = rc2;
13359 }
13360 list_iterator_destroy(step_gres_iter);
13361 slurm_mutex_unlock(&gres_context_lock);
13362
13363 return rc;
13364 }
13365
13366
_step_dealloc(gres_state_t * step_gres_ptr,List job_gres_list,uint32_t job_id,uint32_t step_id)13367 static int _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
13368 uint32_t job_id, uint32_t step_id)
13369 {
13370 gres_state_t *job_gres_ptr;
13371 gres_step_state_t *step_data_ptr =
13372 (gres_step_state_t *)step_gres_ptr->gres_data;
13373 gres_job_state_t *job_data_ptr;
13374 uint32_t i, j;
13375 uint64_t gres_cnt;
13376 int len_j, len_s;
13377 gres_key_t job_search_key;
13378
13379 xassert(job_gres_list);
13380 xassert(step_data_ptr);
13381
13382 job_search_key.plugin_id = step_gres_ptr->plugin_id;
13383 if (step_data_ptr->type_name)
13384 job_search_key.type_id = step_data_ptr->type_id;
13385 else
13386 job_search_key.type_id = NO_VAL;
13387 for (i = 0; i < step_data_ptr->node_cnt; i++) {
13388 job_search_key.node_offset = i;
13389 if (!(job_gres_ptr = list_find_first(
13390 job_gres_list,
13391 _gres_find_job_by_key_with_cnt,
13392 &job_search_key)))
13393 continue;
13394
13395 job_data_ptr = (gres_job_state_t *)job_gres_ptr->gres_data;
13396 if (job_data_ptr->node_cnt == 0) { /* no_consume */
13397 xassert(!step_data_ptr->node_in_use);
13398 xassert(!step_data_ptr->gres_bit_alloc);
13399 return SLURM_SUCCESS;
13400 } else if (job_data_ptr->node_cnt < i)
13401 return SLURM_SUCCESS;
13402
13403 if (!step_data_ptr->node_in_use) {
13404 error("gres/%s: %s step %u.%u dealloc, node_in_use is NULL",
13405 job_data_ptr->gres_name, __func__,
13406 job_id, step_id);
13407 return SLURM_ERROR;
13408 }
13409
13410 if (!bit_test(step_data_ptr->node_in_use, i))
13411 continue;
13412
13413 if (step_data_ptr->gres_cnt_node_alloc)
13414 gres_cnt = step_data_ptr->gres_cnt_node_alloc[i];
13415 else
13416 gres_cnt = step_data_ptr->gres_per_node;
13417
13418 if (job_data_ptr->gres_cnt_step_alloc) {
13419 if (job_data_ptr->gres_cnt_step_alloc[i] >=
13420 gres_cnt) {
13421 job_data_ptr->gres_cnt_step_alloc[i] -=
13422 gres_cnt;
13423 } else {
13424 error("gres/%s: %s step %u.%u dealloc count underflow",
13425 job_data_ptr->gres_name, __func__,
13426 job_id, step_id);
13427 job_data_ptr->gres_cnt_step_alloc[i] = 0;
13428 }
13429 }
13430 if ((step_data_ptr->gres_bit_alloc == NULL) ||
13431 (step_data_ptr->gres_bit_alloc[i] == NULL))
13432 continue;
13433 if (job_data_ptr->gres_bit_alloc[i] == NULL) {
13434 error("gres/%s: %s job %u gres_bit_alloc[%d] is NULL",
13435 job_data_ptr->gres_name, __func__, job_id, i);
13436 continue;
13437 }
13438 len_j = bit_size(job_data_ptr->gres_bit_alloc[i]);
13439 len_s = bit_size(step_data_ptr->gres_bit_alloc[i]);
13440 if (len_j != len_s) {
13441 error("gres/%s: %s step %u.%u dealloc, bit_alloc[%d] size mis-match (%d != %d)",
13442 job_data_ptr->gres_name, __func__,
13443 job_id, step_id, i, len_j, len_s);
13444 len_j = MIN(len_j, len_s);
13445 }
13446 for (j = 0; j < len_j; j++) {
13447 if (!bit_test(step_data_ptr->gres_bit_alloc[i], j))
13448 continue;
13449 if (job_data_ptr->gres_bit_step_alloc &&
13450 job_data_ptr->gres_bit_step_alloc[i]) {
13451 bit_clear(job_data_ptr->gres_bit_step_alloc[i],
13452 j);
13453 }
13454 }
13455 FREE_NULL_BITMAP(step_data_ptr->gres_bit_alloc[i]);
13456 }
13457
13458 return SLURM_SUCCESS;
13459 }
13460
13461 /*
13462 * Deallocate resource to a step and update job and step gres information
13463 * IN step_gres_list - step's gres_list built by
13464 * gres_plugin_step_state_validate()
13465 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13466 * IN job_id, step_id - ID of the step being allocated.
13467 * RET SLURM_SUCCESS or error code
13468 */
gres_plugin_step_dealloc(List step_gres_list,List job_gres_list,uint32_t job_id,uint32_t step_id)13469 extern int gres_plugin_step_dealloc(List step_gres_list, List job_gres_list,
13470 uint32_t job_id, uint32_t step_id)
13471 {
13472 int rc, rc2;
13473 ListIterator step_gres_iter;
13474 gres_state_t *step_gres_ptr;
13475
13476 if (step_gres_list == NULL)
13477 return SLURM_SUCCESS;
13478 if (job_gres_list == NULL) {
13479 error("%s: step deallocates gres, but job %u has none",
13480 __func__, job_id);
13481 return SLURM_ERROR;
13482 }
13483
13484 rc = gres_plugin_init();
13485
13486 slurm_mutex_lock(&gres_context_lock);
13487 step_gres_iter = list_iterator_create(step_gres_list);
13488 while ((step_gres_ptr = list_next(step_gres_iter))) {
13489 rc2 = _step_dealloc(step_gres_ptr,
13490 job_gres_list,
13491 job_id, step_id);
13492 if (rc2 != SLURM_SUCCESS)
13493 rc = rc2;
13494 }
13495 list_iterator_destroy(step_gres_iter);
13496 slurm_mutex_unlock(&gres_context_lock);
13497
13498 return rc;
13499 }
13500
13501 /*
13502 * Determine total count GRES of a given type are allocated to a job across
13503 * all nodes
13504 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13505 * IN gres_name - name of a GRES type
13506 * RET count of this GRES allocated to this job
13507 */
gres_get_value_by_type(List job_gres_list,char * gres_name)13508 extern uint64_t gres_get_value_by_type(List job_gres_list, char *gres_name)
13509 {
13510 int i;
13511 uint32_t plugin_id;
13512 uint64_t gres_cnt = 0;
13513 ListIterator job_gres_iter;
13514 gres_state_t *job_gres_ptr;
13515 gres_job_state_t *job_gres_data;
13516
13517 if (job_gres_list == NULL)
13518 return NO_VAL64;
13519
13520 gres_cnt = NO_VAL64;
13521 (void) gres_plugin_init();
13522 plugin_id = gres_plugin_build_id(gres_name);
13523
13524 slurm_mutex_lock(&gres_context_lock);
13525 job_gres_iter = list_iterator_create(job_gres_list);
13526 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13527 for (i = 0; i < gres_context_cnt; i++) {
13528 if (job_gres_ptr->plugin_id != plugin_id)
13529 continue;
13530 job_gres_data = (gres_job_state_t *)
13531 job_gres_ptr->gres_data;
13532 gres_cnt = job_gres_data->gres_per_node;
13533 break;
13534 }
13535 }
13536 list_iterator_destroy(job_gres_iter);
13537 slurm_mutex_unlock(&gres_context_lock);
13538
13539 return gres_cnt;
13540 }
13541
13542 /*
13543 * Fill in an array of GRES type IDs contained within the given job gres_list
13544 * and an array of corresponding counts of those GRES types.
13545 * IN gres_list - a List of GRES types allocated to a job.
13546 * IN arr_len - Length of the arrays (the number of elements in the gres_list).
13547 * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found
13548 * in the gres_list.
13549 * RET SLURM_SUCCESS or error code
13550 */
gres_plugin_job_count(List gres_list,int arr_len,uint32_t * gres_count_ids,uint64_t * gres_count_vals)13551 extern int gres_plugin_job_count(List gres_list, int arr_len,
13552 uint32_t *gres_count_ids,
13553 uint64_t *gres_count_vals)
13554 {
13555 ListIterator job_gres_iter;
13556 gres_state_t *job_gres_ptr;
13557 void *job_gres_data;
13558 int rc, ix = 0;
13559
13560 rc = gres_plugin_init();
13561 if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
13562 rc = EINVAL;
13563 if (rc != SLURM_SUCCESS)
13564 return rc;
13565
13566 slurm_mutex_lock(&gres_context_lock);
13567
13568 job_gres_iter = list_iterator_create(gres_list);
13569 while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
13570 gres_job_state_t *job_gres_state_ptr;
13571 job_gres_data = job_gres_ptr->gres_data;
13572 job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
13573 xassert(job_gres_state_ptr);
13574
13575 gres_count_ids[ix] = job_gres_ptr->plugin_id;
13576 if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
13577 gres_count_vals[ix] = 0;
13578 else
13579 gres_count_vals[ix] = job_gres_state_ptr->total_gres;
13580 if (++ix >= arr_len)
13581 break;
13582 }
13583 list_iterator_destroy(job_gres_iter);
13584
13585 slurm_mutex_unlock(&gres_context_lock);
13586
13587 return rc;
13588 }
13589
13590 /*
13591 * Build a string identifying total GRES counts of each type
13592 * IN gres_list - a List of GRES types allocated to a job.
13593 * RET string containing comma-separated list of gres type:model:count
13594 * must release memory using xfree()
13595 */
gres_plugin_job_alloc_count(List gres_list)13596 extern char *gres_plugin_job_alloc_count(List gres_list)
13597 {
13598 ListIterator job_gres_iter;
13599 gres_state_t *job_gres_ptr;
13600 void *job_gres_data;
13601 char *gres_alloc = NULL, *gres_name, *sep = "";
13602 int i;
13603
13604 (void) gres_plugin_init();
13605 slurm_mutex_lock(&gres_context_lock);
13606
13607 job_gres_iter = list_iterator_create(gres_list);
13608 while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
13609 gres_job_state_t *job_gres_state_ptr;
13610 uint64_t total_gres;
13611
13612 job_gres_data = job_gres_ptr->gres_data;
13613 job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
13614 if (!job_gres_state_ptr) {
13615 error("%s: job gres_data is NULL", __func__);
13616 continue;
13617 }
13618 gres_name = "UNKNOWN";
13619 for (i = 0; i < gres_context_cnt; i++) {
13620 if (gres_context[i].plugin_id !=
13621 job_gres_ptr->plugin_id)
13622 continue;
13623 gres_name = gres_context[i].gres_name;
13624 }
13625
13626 if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
13627 total_gres = 0;
13628 else
13629 total_gres = job_gres_state_ptr->total_gres;
13630
13631 if (job_gres_state_ptr->type_name) {
13632 xstrfmtcat(gres_alloc, "%s%s:%s:%"PRIu64, sep,
13633 gres_name, job_gres_state_ptr->type_name,
13634 total_gres);
13635 } else {
13636 xstrfmtcat(gres_alloc, "%s%s:%"PRIu64, sep, gres_name,
13637 total_gres);
13638 }
13639 sep = ",";
13640 }
13641 list_iterator_destroy(job_gres_iter);
13642
13643 slurm_mutex_unlock(&gres_context_lock);
13644
13645 return gres_alloc;
13646 }
13647 /*
13648 * Fill in an array of GRES type ids contained within the given node gres_list
13649 * and an array of corresponding counts of those GRES types.
13650 * IN gres_list - a List of GRES types found on a node.
13651 * IN arrlen - Length of the arrays (the number of elements in the gres_list).
13652 * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found
13653 * in the gres_list.
13654 * IN val_type - Type of value desired, see GRES_VAL_TYPE_*
13655 * RET SLURM_SUCCESS or error code
13656 */
gres_plugin_node_count(List gres_list,int arr_len,uint32_t * gres_count_ids,uint64_t * gres_count_vals,int val_type)13657 extern int gres_plugin_node_count(List gres_list, int arr_len,
13658 uint32_t *gres_count_ids,
13659 uint64_t *gres_count_vals,
13660 int val_type)
13661 {
13662 ListIterator node_gres_iter;
13663 gres_state_t* node_gres_ptr;
13664 void* node_gres_data;
13665 uint64_t val;
13666 int rc, ix = 0;
13667
13668 rc = gres_plugin_init();
13669 if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
13670 rc = EINVAL;
13671 if (rc != SLURM_SUCCESS)
13672 return rc;
13673
13674 slurm_mutex_lock(&gres_context_lock);
13675
13676 node_gres_iter = list_iterator_create(gres_list);
13677 while ((node_gres_ptr = (gres_state_t*) list_next(node_gres_iter))) {
13678 gres_node_state_t *node_gres_state_ptr;
13679 val = 0;
13680 node_gres_data = node_gres_ptr->gres_data;
13681 node_gres_state_ptr = (gres_node_state_t *) node_gres_data;
13682 xassert(node_gres_state_ptr);
13683
13684 switch (val_type) {
13685 case (GRES_VAL_TYPE_FOUND):
13686 val = node_gres_state_ptr->gres_cnt_found;
13687 break;
13688 case (GRES_VAL_TYPE_CONFIG):
13689 val = node_gres_state_ptr->gres_cnt_config;
13690 break;
13691 case (GRES_VAL_TYPE_AVAIL):
13692 val = node_gres_state_ptr->gres_cnt_avail;
13693 break;
13694 case (GRES_VAL_TYPE_ALLOC):
13695 val = node_gres_state_ptr->gres_cnt_alloc;
13696 }
13697
13698 gres_count_ids[ix] = node_gres_ptr->plugin_id;
13699 gres_count_vals[ix] = val;
13700 if (++ix >= arr_len)
13701 break;
13702 }
13703 list_iterator_destroy(node_gres_iter);
13704
13705 slurm_mutex_unlock(&gres_context_lock);
13706
13707 return rc;
13708 }
13709
13710 /* Send GRES information to slurmstepd on the specified file descriptor */
gres_plugin_send_stepd(int fd)13711 extern void gres_plugin_send_stepd(int fd)
13712 {
13713 int i;
13714
13715 (void) gres_plugin_init();
13716
13717 slurm_mutex_lock(&gres_context_lock);
13718 for (i = 0; i < gres_context_cnt; i++) {
13719 safe_write(fd, &gres_context[i].config_flags, sizeof(uint8_t));
13720 if (gres_context[i].ops.send_stepd == NULL)
13721 continue; /* No plugin to call */
13722 (*(gres_context[i].ops.send_stepd)) (fd);
13723 }
13724 slurm_mutex_unlock(&gres_context_lock);
13725
13726 return;
13727 rwfail:
13728 error("%s: failed", __func__);
13729 slurm_mutex_unlock(&gres_context_lock);
13730 }
13731
13732 /* Receive GRES information from slurmd on the specified file descriptor */
gres_plugin_recv_stepd(int fd)13733 extern void gres_plugin_recv_stepd(int fd)
13734 {
13735 int i;
13736
13737 (void) gres_plugin_init();
13738
13739 slurm_mutex_lock(&gres_context_lock);
13740 for (i = 0; i < gres_context_cnt; i++) {
13741 safe_read(fd, &gres_context[i].config_flags, sizeof(uint8_t));
13742 (void)_load_gres_plugin(&gres_context[i]);
13743
13744 if (gres_context[i].ops.recv_stepd == NULL)
13745 continue; /* No plugin to call */
13746 (*(gres_context[i].ops.recv_stepd)) (fd);
13747 }
13748 slurm_mutex_unlock(&gres_context_lock);
13749
13750 return;
13751 rwfail:
13752 error("%s: failed", __func__);
13753 slurm_mutex_unlock(&gres_context_lock);
13754 }
13755
13756 /* Get generic GRES data types here. Call the plugin for others */
_get_job_info(int gres_inx,gres_job_state_t * job_gres_data,uint32_t node_inx,enum gres_job_data_type data_type,void * data)13757 static int _get_job_info(int gres_inx, gres_job_state_t *job_gres_data,
13758 uint32_t node_inx, enum gres_job_data_type data_type,
13759 void *data)
13760 {
13761 uint64_t *u64_data = (uint64_t *) data;
13762 bitstr_t **bit_data = (bitstr_t **) data;
13763 int rc = SLURM_SUCCESS;
13764
13765 if (!job_gres_data || !data)
13766 return EINVAL;
13767 if (node_inx >= job_gres_data->node_cnt)
13768 return ESLURM_INVALID_NODE_COUNT;
13769 if (data_type == GRES_JOB_DATA_COUNT) {
13770 *u64_data = job_gres_data->gres_per_node;
13771 } else if (data_type == GRES_JOB_DATA_BITMAP) {
13772 if (job_gres_data->gres_bit_alloc)
13773 *bit_data = job_gres_data->gres_bit_alloc[node_inx];
13774 else
13775 *bit_data = NULL;
13776 } else {
13777 /* Support here for plugin-specific data types */
13778 rc = (*(gres_context[gres_inx].ops.job_info))
13779 (job_gres_data, node_inx, data_type, data);
13780 }
13781
13782 return rc;
13783 }
13784
13785 /*
13786 * get data from a job's GRES data structure
13787 * IN job_gres_list - job's GRES data structure
13788 * IN gres_name - name of a GRES type
13789 * IN node_inx - zero-origin index of the node within the job's allocation
13790 * for which data is desired
13791 * IN data_type - type of data to get from the job's data
13792 * OUT data - pointer to the data from job's GRES data structure
13793 * DO NOT FREE: This is a pointer into the job's data structure
13794 * RET - SLURM_SUCCESS or error code
13795 */
gres_get_job_info(List job_gres_list,char * gres_name,uint32_t node_inx,enum gres_job_data_type data_type,void * data)13796 extern int gres_get_job_info(List job_gres_list, char *gres_name,
13797 uint32_t node_inx,
13798 enum gres_job_data_type data_type, void *data)
13799 {
13800 int i, rc = ESLURM_INVALID_GRES;
13801 uint32_t plugin_id;
13802 ListIterator job_gres_iter;
13803 gres_state_t *job_gres_ptr;
13804 gres_job_state_t *job_gres_data;
13805
13806 if (data == NULL)
13807 return EINVAL;
13808 if (job_gres_list == NULL) /* No GRES allocated */
13809 return ESLURM_INVALID_GRES;
13810
13811 (void) gres_plugin_init();
13812 plugin_id = gres_plugin_build_id(gres_name);
13813
13814 slurm_mutex_lock(&gres_context_lock);
13815 job_gres_iter = list_iterator_create(job_gres_list);
13816 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13817 for (i = 0; i < gres_context_cnt; i++) {
13818 if (job_gres_ptr->plugin_id != plugin_id)
13819 continue;
13820 job_gres_data = (gres_job_state_t *)
13821 job_gres_ptr->gres_data;
13822 rc = _get_job_info(i, job_gres_data, node_inx,
13823 data_type, data);
13824 break;
13825 }
13826 }
13827 list_iterator_destroy(job_gres_iter);
13828 slurm_mutex_unlock(&gres_context_lock);
13829
13830 return rc;
13831 }
13832
13833 /* Given a job's GRES data structure, return the indecies for selected elements
13834 * IN job_gres_list - job's GRES data structure
13835 * OUT gres_detail_cnt - Number of elements (nodes) in gres_detail_str
13836 * OUT gres_detail_str - Description of GRES on each node
13837 * OUT total_gres_str - String containing all gres in the job and counts.
13838 */
gres_build_job_details(List job_gres_list,uint32_t * gres_detail_cnt,char *** gres_detail_str,char ** total_gres_str)13839 extern void gres_build_job_details(List job_gres_list,
13840 uint32_t *gres_detail_cnt,
13841 char ***gres_detail_str,
13842 char **total_gres_str)
13843 {
13844 int i, j;
13845 ListIterator job_gres_iter;
13846 gres_state_t *job_gres_ptr;
13847 gres_job_state_t *job_gres_data;
13848 char *sep1, *sep2, tmp_str[128], *type, **my_gres_details = NULL;
13849 uint32_t my_gres_cnt = 0;
13850 char *gres_name, *gres_str = NULL;
13851 uint64_t gres_cnt;
13852
13853 /* Release any vestigial data (e.g. from job requeue) */
13854 for (i = 0; i < *gres_detail_cnt; i++)
13855 xfree(gres_detail_str[0][i]);
13856 xfree(*gres_detail_str);
13857 xfree(*total_gres_str);
13858 *gres_detail_cnt = 0;
13859
13860 if (job_gres_list == NULL) /* No GRES allocated */
13861 return;
13862
13863 (void) gres_plugin_init();
13864
13865 job_gres_iter = list_iterator_create(job_gres_list);
13866 while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13867 job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
13868 if (job_gres_data->gres_bit_alloc == NULL)
13869 continue;
13870 if (my_gres_details == NULL) {
13871 my_gres_cnt = job_gres_data->node_cnt;
13872 my_gres_details = xcalloc(my_gres_cnt, sizeof(char *));
13873 }
13874
13875 if (job_gres_data->type_name) {
13876 sep2 = ":";
13877 type = job_gres_data->type_name;
13878 } else {
13879 sep2 = "";
13880 type = "";
13881 }
13882
13883 gres_name = xstrdup_printf(
13884 "%s%s%s",
13885 job_gres_data->gres_name, sep2, type);
13886 gres_cnt = 0;
13887
13888 for (j = 0; j < my_gres_cnt; j++) {
13889 if (j >= job_gres_data->node_cnt)
13890 break; /* node count mismatch */
13891 if (my_gres_details[j])
13892 sep1 = ",";
13893 else
13894 sep1 = "";
13895
13896 gres_cnt += job_gres_data->gres_cnt_node_alloc[j];
13897
13898 if (job_gres_data->gres_bit_alloc[j]) {
13899 bit_fmt(tmp_str, sizeof(tmp_str),
13900 job_gres_data->gres_bit_alloc[j]);
13901 xstrfmtcat(my_gres_details[j],
13902 "%s%s:%"PRIu64"(IDX:%s)",
13903 sep1, gres_name,
13904 job_gres_data->
13905 gres_cnt_node_alloc[j],
13906 tmp_str);
13907 } else if (job_gres_data->gres_cnt_node_alloc[j]) {
13908 xstrfmtcat(my_gres_details[j],
13909 "%s%s(CNT:%"PRIu64")",
13910 sep1, gres_name,
13911 job_gres_data->
13912 gres_cnt_node_alloc[j]);
13913 }
13914 }
13915
13916 xstrfmtcat(gres_str, "%s%s:%"PRIu64,
13917 gres_str ? "," : "", gres_name, gres_cnt);
13918 xfree(gres_name);
13919 }
13920 list_iterator_destroy(job_gres_iter);
13921 *gres_detail_cnt = my_gres_cnt;
13922 *gres_detail_str = my_gres_details;
13923 *total_gres_str = gres_str;
13924 }
13925
13926 /* Get generic GRES data types here. Call the plugin for others */
_get_step_info(int gres_inx,gres_step_state_t * step_gres_data,uint32_t node_inx,enum gres_step_data_type data_type,void * data)13927 static int _get_step_info(int gres_inx, gres_step_state_t *step_gres_data,
13928 uint32_t node_inx, enum gres_step_data_type data_type,
13929 void *data)
13930 {
13931 uint64_t *u64_data = (uint64_t *) data;
13932 bitstr_t **bit_data = (bitstr_t **) data;
13933 int rc = SLURM_SUCCESS;
13934
13935 if (!step_gres_data || !data)
13936 return EINVAL;
13937 if (node_inx >= step_gres_data->node_cnt)
13938 return ESLURM_INVALID_NODE_COUNT;
13939 if (data_type == GRES_STEP_DATA_COUNT) {
13940 *u64_data = step_gres_data->gres_per_node;
13941 } else if (data_type == GRES_STEP_DATA_BITMAP) {
13942 if (step_gres_data->gres_bit_alloc)
13943 *bit_data = step_gres_data->gres_bit_alloc[node_inx];
13944 else
13945 *bit_data = NULL;
13946 } else {
13947 /* Support here for plugin-specific data types */
13948 rc = (*(gres_context[gres_inx].ops.step_info))
13949 (step_gres_data, node_inx, data_type, data);
13950 }
13951
13952 return rc;
13953 }
13954
13955 /*
13956 * get data from a step's GRES data structure
13957 * IN step_gres_list - step's GRES data structure
13958 * IN gres_name - name of a GRES type
13959 * IN node_inx - zero-origin index of the node within the job's allocation
13960 * for which data is desired. Note this can differ from the step's
13961 * node allocation index.
13962 * IN data_type - type of data to get from the step's data
13963 * OUT data - pointer to the data from step's GRES data structure
13964 * DO NOT FREE: This is a pointer into the step's data structure
13965 * RET - SLURM_SUCCESS or error code
13966 */
gres_get_step_info(List step_gres_list,char * gres_name,uint32_t node_inx,enum gres_step_data_type data_type,void * data)13967 extern int gres_get_step_info(List step_gres_list, char *gres_name,
13968 uint32_t node_inx,
13969 enum gres_step_data_type data_type, void *data)
13970 {
13971 int i, rc = ESLURM_INVALID_GRES;
13972 uint32_t plugin_id;
13973 ListIterator step_gres_iter;
13974 gres_state_t *step_gres_ptr;
13975 gres_step_state_t *step_gres_data;
13976
13977 if (data == NULL)
13978 return EINVAL;
13979 if (step_gres_list == NULL) /* No GRES allocated */
13980 return ESLURM_INVALID_GRES;
13981
13982 (void) gres_plugin_init();
13983 plugin_id = gres_plugin_build_id(gres_name);
13984
13985 slurm_mutex_lock(&gres_context_lock);
13986 step_gres_iter = list_iterator_create(step_gres_list);
13987 while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13988 for (i = 0; i < gres_context_cnt; i++) {
13989 if (step_gres_ptr->plugin_id != plugin_id)
13990 continue;
13991 step_gres_data = (gres_step_state_t *)
13992 step_gres_ptr->gres_data;
13993 rc = _get_step_info(i, step_gres_data, node_inx,
13994 data_type, data);
13995 break;
13996 }
13997 }
13998 list_iterator_destroy(step_gres_iter);
13999 slurm_mutex_unlock(&gres_context_lock);
14000
14001 return rc;
14002 }
14003
gres_get_step_state(List gres_list,char * name)14004 extern gres_step_state_t *gres_get_step_state(List gres_list, char *name)
14005 {
14006 gres_state_t *gres_state_ptr;
14007
14008 if (!gres_list || !name || !list_count(gres_list))
14009 return NULL;
14010
14011 slurm_mutex_lock(&gres_context_lock);
14012 gres_state_ptr = list_find_first(gres_list, _gres_step_find_name, name);
14013 slurm_mutex_unlock(&gres_context_lock);
14014
14015 if (!gres_state_ptr)
14016 return NULL;
14017
14018 return (gres_step_state_t *)gres_state_ptr->gres_data;
14019 }
14020
gres_get_job_state(List gres_list,char * name)14021 extern gres_job_state_t *gres_get_job_state(List gres_list, char *name)
14022 {
14023 gres_state_t *gres_state_ptr;
14024
14025 if (!gres_list || !name || !list_count(gres_list))
14026 return NULL;
14027
14028 slurm_mutex_lock(&gres_context_lock);
14029 gres_state_ptr = list_find_first(gres_list, _gres_job_find_name, name);
14030 slurm_mutex_unlock(&gres_context_lock);
14031
14032 if (!gres_state_ptr)
14033 return NULL;
14034
14035 return (gres_job_state_t *)gres_state_ptr->gres_data;
14036 }
14037
/*
 * Accessor for the file-scope autodetect_types value (presumably a bitmask
 * of AutoDetect mechanisms set during configuration parsing — confirm at
 * the point where autodetect_types is assigned).
 */
extern uint32_t gres_get_autodetect_types(void)
{
	return autodetect_types;
}
14042
/*
 * Translate a job's or step's GRES allocation into a TRES string of the
 * form "<tres_id>=<count>[,<tres_id>=<count>...]".
 *
 * IN gres_list - job's or step's GRES list
 * IN is_job - true if gres_list holds gres_job_state_t records,
 *	false for gres_step_state_t records
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock
 * RET TRES string, caller must xfree(), or NULL
 */
extern char *gres_2_tres_str(List gres_list, bool is_job, bool locked)
{
	ListIterator itr;
	slurmdb_tres_rec_t *tres_rec;
	gres_state_t *gres_state_ptr;
	int i;
	uint64_t count;
	char *col_name = NULL;
	char *tres_str = NULL;
	static bool first_run = true;
	static slurmdb_tres_rec_t tres_req;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	/* we only need to init this once */
	if (first_run) {
		first_run = false;
		memset(&tres_req, 0, sizeof(slurmdb_tres_rec_t));
		tres_req.type = "gres";
	}

	if (!gres_list)
		return NULL;

	/* assoc_mgr lock must be taken before gres_context_lock!!! */
	if (!locked)
		assoc_mgr_lock(&locks);

	slurm_mutex_lock(&gres_context_lock);
	itr = list_iterator_create(gres_list);
	while ((gres_state_ptr = list_next(itr))) {
		if (is_job) {
			gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
				gres_state_ptr->gres_data;
			col_name = gres_data_ptr->type_name;
			count = gres_data_ptr->total_gres;
		} else {
			gres_step_state_t *gres_data_ptr = (gres_step_state_t *)
				gres_state_ptr->gres_data;
			col_name = gres_data_ptr->type_name;
			count = gres_data_ptr->total_gres;
		}

		/*
		 * tres_req is static: reset the name each iteration so a
		 * failed plugin lookup below is not masked by a stale name
		 * left over from a previous GRES record.
		 */
		tres_req.name = NULL;
		for (i = 0; i < gres_context_cnt; i++) {
			if (gres_context[i].plugin_id ==
			    gres_state_ptr->plugin_id) {
				tres_req.name = gres_context[i].gres_name;
				break;
			}
		}

		if (!tres_req.name) {
			debug("%s: couldn't find name", __func__);
			continue;
		}

		/* If we are no_consume, print a 0 */
		if (count == NO_CONSUME_VAL64)
			count = 0;

		tres_rec = assoc_mgr_find_tres_rec(&tres_req);

		if (tres_rec &&
		    slurmdb_find_tres_count_in_string(
			    tres_str, tres_rec->id) == INFINITE64)
			/* New gres */
			xstrfmtcat(tres_str, "%s%u=%"PRIu64,
				   tres_str ? "," : "",
				   tres_rec->id, count);

		if (i < gres_context_cnt) {
			if (col_name) {
				/*
				 * Now add the ":<type>" TRES if we are
				 * tracking it as well. This would be handy
				 * for GRES like "gpu:tesla", where you might
				 * want to track both as TRES.
				 */
				tres_req.name = xstrdup_printf(
					"%s%s",
					gres_context[i].gres_name_colon,
					col_name);
				tres_rec = assoc_mgr_find_tres_rec(&tres_req);
				xfree(tres_req.name);
				if (tres_rec &&
				    slurmdb_find_tres_count_in_string(
					    tres_str, tres_rec->id) == INFINITE64)
					/* New GRES */
					xstrfmtcat(tres_str, "%s%u=%"PRIu64,
						   tres_str ? "," : "",
						   tres_rec->id, count);
			} else {
				/*
				 * Job allocated GRES without "type"
				 * specification, but Slurm is only accounting
				 * for this GRES by specific "type", so pick
				 * some valid "type" to get some accounting.
				 * Although the reported "type" may not be
				 * accurate, it is better than nothing...
				 */
				tres_req.name = xstrdup_printf(
					"%s", gres_context[i].gres_name);
				tres_rec = assoc_mgr_find_tres_rec2(&tres_req);
				xfree(tres_req.name);
				if (tres_rec &&
				    slurmdb_find_tres_count_in_string(
					    tres_str, tres_rec->id) == INFINITE64)
					/* New GRES */
					xstrfmtcat(tres_str, "%s%u=%"PRIu64,
						   tres_str ? "," : "",
						   tres_rec->id, count);
			}
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&gres_context_lock);

	if (!locked)
		assoc_mgr_unlock(&locks);

	return tres_str;
}
14164
14165 /* Fill in job/node TRES arrays with allocated GRES. */
_set_type_tres_cnt(gres_state_type_enum_t state_type,List gres_list,uint32_t node_cnt,uint64_t * tres_cnt,bool locked)14166 static void _set_type_tres_cnt(gres_state_type_enum_t state_type,
14167 List gres_list,
14168 uint32_t node_cnt,
14169 uint64_t *tres_cnt,
14170 bool locked)
14171 {
14172 ListIterator itr;
14173 gres_state_t *gres_state_ptr;
14174 static bool first_run = 1;
14175 static slurmdb_tres_rec_t tres_rec;
14176 char *col_name = NULL;
14177 uint64_t count;
14178 int i, tres_pos;
14179 assoc_mgr_lock_t locks = { .tres = READ_LOCK };
14180
14181 /* we only need to init this once */
14182 if (first_run) {
14183 first_run = 0;
14184 memset(&tres_rec, 0, sizeof(slurmdb_tres_rec_t));
14185 tres_rec.type = "gres";
14186 }
14187
14188 if (!gres_list || !tres_cnt ||
14189 ((state_type == GRES_STATE_TYPE_JOB) &&
14190 (!node_cnt || (node_cnt == NO_VAL))))
14191 return;
14192
14193 /* must be locked first before gres_contrex_lock!!! */
14194 if (!locked)
14195 assoc_mgr_lock(&locks);
14196
14197 slurm_mutex_lock(&gres_context_lock);
14198 /* Initialize all GRES counters to zero. Increment them later. */
14199 for (i = 0; i < gres_context_cnt; i++) {
14200 tres_rec.name = gres_context[i].gres_name;
14201 if (tres_rec.name &&
14202 ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec,true)) !=-1))
14203 tres_cnt[tres_pos] = 0;
14204 }
14205
14206 itr = list_iterator_create(gres_list);
14207 while ((gres_state_ptr = list_next(itr))) {
14208 bool set_total = false;
14209 for (i = 0; i < gres_context_cnt; i++) {
14210 if (gres_context[i].plugin_id ==
14211 gres_state_ptr->plugin_id) {
14212 tres_rec.name = gres_context[i].gres_name;
14213 break;
14214 }
14215 }
14216 if (!tres_rec.name) {
14217 debug("%s: couldn't find name", __func__);
14218 continue;
14219 }
14220
14221 /* Get alloc count for main GRES. */
14222 switch (state_type) {
14223 case GRES_STATE_TYPE_JOB:
14224 {
14225 gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
14226 gres_state_ptr->gres_data;
14227 count = gres_data_ptr->total_gres;
14228 break;
14229 }
14230 case GRES_STATE_TYPE_NODE:
14231 {
14232 gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
14233 gres_state_ptr->gres_data;
14234 count = gres_data_ptr->gres_cnt_alloc;
14235 break;
14236 }
14237 default:
14238 error("%s: unsupported state type %d", __func__,
14239 state_type);
14240 continue;
14241 }
14242 /*
14243 * Set main TRES's count (i.e. if no GRES "type" is being
14244 * accounted for). We need to increment counter since the job
14245 * may have been allocated multiple GRES types, but Slurm is
14246 * only configured to track the total count. For example, a job
14247 * allocated 1 GPU of type "tesla" and 1 GPU of type "volta",
14248 * but we want to record that the job was allocated a total of
14249 * 2 GPUs.
14250 */
14251 if ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec,true)) != -1){
14252 if (count == NO_CONSUME_VAL64)
14253 tres_cnt[tres_pos] = NO_CONSUME_VAL64;
14254 else
14255 tres_cnt[tres_pos] += count;
14256 set_total = true;
14257 }
14258
14259 /*
14260 * Set TRES count for GRES model types. This would be handy for
14261 * GRES like "gpu:tesla", where you might want to track both as
14262 * TRES.
14263 */
14264 switch (state_type) {
14265 case GRES_STATE_TYPE_JOB:
14266 {
14267 gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
14268 gres_state_ptr->gres_data;
14269
14270 col_name = gres_data_ptr->type_name;
14271 if (col_name) {
14272 tres_rec.name = xstrdup_printf(
14273 "%s%s",
14274 gres_context[i].gres_name_colon,
14275 col_name);
14276 if ((tres_pos = assoc_mgr_find_tres_pos(
14277 &tres_rec, true)) != -1)
14278 tres_cnt[tres_pos] = count;
14279 xfree(tres_rec.name);
14280 } else if (!set_total) {
14281 /*
14282 * Job allocated GRES without "type"
14283 * specification, but Slurm is only accounting
14284 * for this GRES by specific "type", so pick
14285 * some valid "type" to get some accounting.
14286 * Although the reported "type" may not be
14287 * accurate, it is better than nothing...
14288 */
14289 tres_rec.name = xstrdup_printf(
14290 "%s", gres_context[i].gres_name);
14291 if ((tres_pos = assoc_mgr_find_tres_pos2(
14292 &tres_rec, true)) != -1)
14293 tres_cnt[tres_pos] = count;
14294 xfree(tres_rec.name);
14295 }
14296 break;
14297 }
14298 case GRES_STATE_TYPE_NODE:
14299 {
14300 int type;
14301 gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
14302 gres_state_ptr->gres_data;
14303
14304 for (type = 0; type < gres_data_ptr->type_cnt; type++) {
14305 col_name = gres_data_ptr->type_name[type];
14306 if (!col_name)
14307 continue;
14308
14309 tres_rec.name = xstrdup_printf(
14310 "%s%s",
14311 gres_context[i].gres_name_colon,
14312 col_name);
14313
14314 count = gres_data_ptr->type_cnt_alloc[type];
14315
14316 if ((tres_pos = assoc_mgr_find_tres_pos(
14317 &tres_rec, true)) != -1)
14318 tres_cnt[tres_pos] = count;
14319 xfree(tres_rec.name);
14320 }
14321 break;
14322 }
14323 default:
14324 error("%s: unsupported state type %d", __func__,
14325 state_type);
14326 continue;
14327 }
14328 }
14329 list_iterator_destroy(itr);
14330 slurm_mutex_unlock(&gres_context_lock);
14331
14332 if (!locked)
14333 assoc_mgr_unlock(&locks);
14334
14335 return;
14336 }
14337
/*
 * Fill in a job's TRES counter array from its allocated GRES.
 * Thin wrapper around _set_type_tres_cnt() with GRES_STATE_TYPE_JOB.
 * IN gres_list - job's GRES list
 * IN node_cnt - number of nodes in the job's allocation
 * IN/OUT tres_cnt - TRES counter array to update
 * IN locked - true if caller already holds the assoc_mgr TRES read lock
 */
extern void gres_set_job_tres_cnt(List gres_list,
				  uint32_t node_cnt,
				  uint64_t *tres_cnt,
				  bool locked)
{
	_set_type_tres_cnt(GRES_STATE_TYPE_JOB,
			   gres_list, node_cnt, tres_cnt, locked);
}
14346
/*
 * Fill in a node's TRES counter array from its allocated GRES.
 * Thin wrapper around _set_type_tres_cnt() with GRES_STATE_TYPE_NODE
 * (node_cnt is irrelevant for nodes, hence 0).
 * IN gres_list - node's GRES list
 * IN/OUT tres_cnt - TRES counter array to update
 * IN locked - true if caller already holds the assoc_mgr TRES read lock
 */
extern void gres_set_node_tres_cnt(List gres_list,
				   uint64_t *tres_cnt,
				   bool locked)
{
	_set_type_tres_cnt(GRES_STATE_TYPE_NODE,
			   gres_list, 0, tres_cnt, locked);
}
14354
gres_device_major(char * dev_path)14355 extern char *gres_device_major(char *dev_path)
14356 {
14357 int loc_major, loc_minor;
14358 char *ret_major = NULL;
14359 struct stat fs;
14360
14361 if (stat(dev_path, &fs) < 0) {
14362 error("%s: stat(%s): %m", __func__, dev_path);
14363 return NULL;
14364 }
14365 loc_major = (int)major(fs.st_rdev);
14366 loc_minor = (int)minor(fs.st_rdev);
14367 debug3("%s : %s major %d, minor %d",
14368 __func__, dev_path, loc_major, loc_minor);
14369 if (S_ISBLK(fs.st_mode)) {
14370 xstrfmtcat(ret_major, "b %d:", loc_major);
14371 //info("device is block ");
14372 }
14373 if (S_ISCHR(fs.st_mode)) {
14374 xstrfmtcat(ret_major, "c %d:", loc_major);
14375 //info("device is character ");
14376 }
14377 xstrfmtcat(ret_major, "%d rwm", loc_minor);
14378
14379 return ret_major;
14380 }
14381
14382 /* Free memory for gres_device_t record */
destroy_gres_device(void * gres_device_ptr)14383 extern void destroy_gres_device(void *gres_device_ptr)
14384 {
14385 gres_device_t *gres_device = (gres_device_t *) gres_device_ptr;
14386
14387 if (!gres_device)
14388 return;
14389 xfree(gres_device->path);
14390 xfree(gres_device->major);
14391 xfree(gres_device);
14392 }
14393
14394 /* Destroy a gres_slurmd_conf_t record, free it's memory */
destroy_gres_slurmd_conf(void * x)14395 extern void destroy_gres_slurmd_conf(void *x)
14396 {
14397 gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;
14398
14399 xassert(p);
14400 xfree(p->cpus);
14401 FREE_NULL_BITMAP(p->cpus_bitmap);
14402 xfree(p->file); /* Only used by slurmd */
14403 xfree(p->links);
14404 xfree(p->name);
14405 xfree(p->type_name);
14406 xfree(p);
14407 }
14408
14409
14410 /*
14411 * Convert GRES config_flags to a string. The pointer returned references local
14412 * storage in this function, which is not re-entrant.
14413 */
gres_flags2str(uint8_t config_flags)14414 extern char *gres_flags2str(uint8_t config_flags)
14415 {
14416 static char flag_str[128];
14417 char *sep = "";
14418
14419 flag_str[0] = '\0';
14420 if (config_flags & GRES_CONF_COUNT_ONLY) {
14421 strcat(flag_str, sep);
14422 strcat(flag_str, "CountOnly");
14423 sep = ",";
14424 }
14425
14426 if (config_flags & GRES_CONF_HAS_FILE) {
14427 strcat(flag_str, sep);
14428 strcat(flag_str, "HAS_FILE");
14429 sep = ",";
14430 }
14431
14432 if (config_flags & GRES_CONF_LOADED) {
14433 strcat(flag_str, sep);
14434 strcat(flag_str, "LOADED");
14435 sep = ",";
14436 }
14437
14438 if (config_flags & GRES_CONF_HAS_TYPE) {
14439 strcat(flag_str, sep);
14440 strcat(flag_str, "HAS_TYPE");
14441 sep = ",";
14442 }
14443
14444 return flag_str;
14445 }
14446
14447 /*
14448 * Creates a gres_slurmd_conf_t record to add to a list of gres_slurmd_conf_t
14449 * records
14450 */
add_gres_to_list(List gres_list,char * name,uint64_t device_cnt,int cpu_cnt,char * cpu_aff_abs_range,bitstr_t * cpu_aff_mac_bitstr,char * device_file,char * type,char * links)14451 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt,
14452 int cpu_cnt, char *cpu_aff_abs_range,
14453 bitstr_t *cpu_aff_mac_bitstr, char *device_file,
14454 char *type, char *links)
14455 {
14456 gres_slurmd_conf_t *gpu_record;
14457 bool use_empty_first_record = false;
14458 ListIterator itr = list_iterator_create(gres_list);
14459
14460 /*
14461 * If the first record already exists and has a count of 0 then
14462 * overwrite it.
14463 * This is a placeholder record created in _merge_config()
14464 */
14465 gpu_record = list_next(itr);
14466 if (gpu_record && (gpu_record->count == 0))
14467 use_empty_first_record = true;
14468 else
14469 gpu_record = xmalloc(sizeof(gres_slurmd_conf_t));
14470 gpu_record->cpu_cnt = cpu_cnt;
14471 if (cpu_aff_mac_bitstr)
14472 gpu_record->cpus_bitmap = bit_copy(cpu_aff_mac_bitstr);
14473 if (device_file)
14474 gpu_record->config_flags |= GRES_CONF_HAS_FILE;
14475 if (type)
14476 gpu_record->config_flags |= GRES_CONF_HAS_TYPE;
14477 gpu_record->cpus = xstrdup(cpu_aff_abs_range);
14478 gpu_record->type_name = xstrdup(type);
14479 gpu_record->name = xstrdup(name);
14480 gpu_record->file = xstrdup(device_file);
14481 gpu_record->links = xstrdup(links);
14482 gpu_record->count = device_cnt;
14483 gpu_record->plugin_id = gres_plugin_build_id(name);
14484 if (!use_empty_first_record)
14485 list_append(gres_list, gpu_record);
14486 list_iterator_destroy(itr);
14487 }
14488