1 /*****************************************************************************\
2  *  gres.c - driver for gres plugin
3  *****************************************************************************
4  *  Copyright (C) 2010 Lawrence Livermore National Security.
5  *  Portions Copyright (C) 2014-2019 SchedMD LLC
6  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7  *  Written by Morris Jette <jette1@llnl.gov>
8  *  CODE-OCEC-09-009. All rights reserved.
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 
40 #include "config.h"
41 
42 #define _GNU_SOURCE
43 
44 #ifdef __FreeBSD__
45 #  include <sys/param.h>
46 #  include <sys/cpuset.h>
47 typedef cpuset_t cpu_set_t;
48 #endif
49 
50 #include <ctype.h>
51 #include <inttypes.h>
52 #include <limits.h>
53 #include <sched.h>
54 #include <stdio.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include <sys/stat.h>
58 #include <sys/types.h>
59 
60 #ifdef MAJOR_IN_MKDEV
61 #  include <sys/mkdev.h>
62 #endif
63 #ifdef MAJOR_IN_SYSMACROS
64 #  include <sys/sysmacros.h>
65 #endif
66 
67 #include <math.h>
68 
69 #ifdef __NetBSD__
70 #define CPU_ZERO(c) cpuset_zero(*(c))
71 #define CPU_ISSET(i,c) cpuset_isset((i),*(c))
72 #define sched_getaffinity sched_getaffinity_np
73 #endif
74 
75 #include "slurm/slurm.h"
76 #include "slurm/slurm_errno.h"
77 #include "src/common/assoc_mgr.h"
78 #include "src/common/bitstring.h"
79 #include "src/common/gres.h"
80 #include "src/common/job_resources.h"
81 #include "src/common/list.h"
82 #include "src/common/log.h"
83 #include "src/common/macros.h"
84 #include "src/common/node_conf.h"
85 #include "src/common/node_select.h"
86 #include "src/common/pack.h"
87 #include "src/common/parse_config.h"
88 #include "src/common/plugin.h"
89 #include "src/common/plugrack.h"
90 #include "src/common/read_config.h"
91 #include "src/common/slurm_protocol_api.h"
92 #include "src/common/strlcpy.h"
93 #include "src/common/xmalloc.h"
94 #include "src/common/xstring.h"
95 
/*
 * NOTE(review): MAX_GRES_BITMAP is not referenced in the portion of the file
 * reviewed here; presumably an upper bound on GRES bitmap size - confirm at
 * its point of use.
 */
#define MAX_GRES_BITMAP 1024

/*
 * Pair each gres function below with a second, "slurm_"-prefixed symbol name.
 * NOTE(review): strong_alias() is defined outside this file (presumably in
 * src/common/macros.h); confirm its exact linkage semantics there.
 */
strong_alias(gres_gresid_to_gresname, slurm_gres_gresid_to_gresname);
strong_alias(gres_get_node_used, slurm_gres_get_node_used);
strong_alias(gres_get_system_cnt, slurm_gres_get_system_cnt);
strong_alias(gres_get_value_by_type, slurm_gres_get_value_by_type);
strong_alias(gres_get_job_info, slurm_gres_get_job_info);
strong_alias(gres_build_job_details, slurm_gres_build_job_details);
strong_alias(gres_get_step_info, slurm_gres_get_step_info);
strong_alias(gres_get_step_state, slurm_gres_get_step_state);
strong_alias(gres_get_job_state, slurm_gres_get_job_state);
strong_alias(gres_2_tres_str, slurm_gres_2_tres_str);
strong_alias(gres_set_job_tres_cnt, slurm_gres_set_job_tres_cnt);
strong_alias(gres_set_node_tres_cnt, slurm_gres_set_node_tres_cnt);
strong_alias(gres_device_major, slurm_gres_device_major);
strong_alias(destroy_gres_device, slurm_destroy_gres_device);
strong_alias(destroy_gres_slurmd_conf, slurm_destroy_gres_slurmd_conf);
113 
114 /* Gres symbols provided by the plugin */
115 typedef struct slurm_gres_ops {
116 	int		(*node_config_load)	( List gres_conf_list,
117 						  node_config_load_t *node_conf);
118 	void		(*job_set_env)		( char ***job_env_ptr,
119 						  void *gres_ptr, int node_inx );
120 	void		(*step_set_env)		( char ***job_env_ptr,
121 						  void *gres_ptr );
122 	void		(*step_reset_env)	( char ***job_env_ptr,
123 						  void *gres_ptr,
124 						  bitstr_t *usable_gres );
125 	void		(*send_stepd)		( int fd );
126 	void		(*recv_stepd)		( int fd );
127 	int		(*job_info)		( gres_job_state_t *job_gres_data,
128 						  uint32_t node_inx,
129 						  enum gres_job_data_type data_type,
130 						  void *data);
131 	int		(*step_info)		( gres_step_state_t *step_gres_data,
132 						  uint32_t node_inx,
133 						  enum gres_step_data_type data_type,
134 						  void *data);
135 	List            (*get_devices)		( void );
136 	void            (*step_hardware_init)	( bitstr_t *, char * );
137 	void            (*step_hardware_fini)	( void );
138 	gres_epilog_info_t *(*epilog_build_env)(gres_job_state_t *gres_job_ptr);
139 	void            (*epilog_set_env)	( char ***epilog_env_ptr,
140 						gres_epilog_info_t *epilog_info,
141 						int node_inx );
142 } slurm_gres_ops_t;
143 
144 /*
145  * Gres plugin context, one for each gres type.
146  * Add to gres_context through _add_gres_context().
147  */
148 typedef struct slurm_gres_context {
149 	plugin_handle_t	cur_plugin;
150 	uint8_t		config_flags;		/* See GRES_CONF_* in gres.h */
151 	char *		gres_name;		/* name (e.g. "gpu") */
152 	char *		gres_name_colon;	/* name + colon (e.g. "gpu:") */
153 	int		gres_name_colon_len;	/* size of gres_name_colon */
154 	char *		gres_type;		/* plugin name (e.g. "gres/gpu") */
155 	slurm_gres_ops_t ops;			/* pointers to plugin symbols */
156 	uint32_t	plugin_id;		/* key for searches */
157 	plugrack_t	*plugin_list;		/* plugrack info */
158 	uint64_t        total_cnt;		/* Total GRES across all nodes */
159 } slurm_gres_context_t;
160 
/*
 * Generic gres record used as a list element. What gres_data points to
 * depends upon the context: gres_node_state_t, gres_job_state_t or
 * gres_step_state_t.
 */
typedef struct gres_state {
	uint32_t plugin_id;	/* identifies the owning gres type */
	void *gres_data;	/* node, job or step state (see above) */
} gres_state_t;
168 
/*
 * Search key handed to the _gres_find_*_by_key() match functions.
 */
typedef struct gres_search_key {
	int node_offset;	/* index into per-node allocation arrays */
	uint32_t plugin_id;	/* gres type to match */
	uint32_t type_id;	/* gres sub-type to match */
} gres_key_t;
174 
/*
 * Pointers to functions in src/slurmd/common/xcpuinfo.h that we may use.
 * xcpuinfo_ops is a non-static global, so it starts out zeroed (NULL).
 */
typedef struct xcpuinfo_funcs {
	int (*xcpuinfo_abs_to_mac)(char *abs, char **mac);
} xcpuinfo_funcs_t;

xcpuinfo_funcs_t xcpuinfo_ops;
180 
181 /* Local variables */
182 static int gres_context_cnt = -1;
183 static uint32_t gres_cpu_cnt = 0;
184 static bool gres_debug = false;
185 static slurm_gres_context_t *gres_context = NULL;
186 static char *gres_node_name = NULL;
187 static char *gres_plugin_list = NULL;
188 static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER;
189 static List gres_conf_list = NULL;
190 static bool init_run = false;
191 static bool have_gpu = false, have_mps = false;
192 static uint32_t gpu_plugin_id = NO_VAL, mps_plugin_id = NO_VAL;
193 static volatile uint32_t autodetect_types = GRES_AUTODETECT_NONE;
194 static uint32_t select_plugin_type = NO_VAL;
195 
/*
 * Local functions
 *
 * Forward declarations of file-local (static) helpers, kept in roughly
 * alphabetical order.
 */
static void _add_gres_context(char *gres_name);
static gres_node_state_t *
		_build_gres_node_state(void);
static void	_build_node_gres_str(List *gres_list, char **gres_str,
				     int cores_per_sock, int sock_per_node);
static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
					    uint8_t overcommit,
					    gres_mc_data_t *tres_mc_ptr,
					    node_record_t *node_table_ptr);
static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size);
static void	_epilog_list_del(void *x);
static int	_find_job_by_sock_gres(void *x, void *key);
static int	_find_sock_by_job_gres(void *x, void *key);
static void	_free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
					  int node_cnt);
static void	_get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
			      char *gres_name, char *gres_name_colon,
			      int gres_name_colon_len);
static uint32_t	_get_task_cnt_node(uint32_t **tasks_per_node_socket,
				   int node_inx, int sock_cnt);
static uint64_t	_get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
				  int *config_type_cnt);
static int	_gres_find_id(void *x, void *key);
static int	_gres_find_job_by_key(void *x, void *key);
static int	_gres_find_step_by_key(void *x, void *key);
static void	_gres_job_list_delete(void *list_element);
static int	_job_alloc(void *job_gres_data, void *node_gres_data,
			   int node_cnt, int node_index, int node_offset,
			   char *gres_name, uint32_t job_id, char *node_name,
			   bitstr_t *core_bitmap, uint32_t plugin_id,
			   uint32_t user_id);
static void	_job_core_filter(void *job_gres_data, void *node_gres_data,
				 bool use_total_gres, bitstr_t *core_bitmap,
				 int core_start_bit, int core_end_bit,
				 char *gres_name, char *node_name,
				 uint32_t plugin_id);
static int	_job_dealloc(void *job_gres_data, void *node_gres_data,
			     int node_offset, char *gres_name, uint32_t job_id,
			     char *node_name, bool old_job, uint32_t plugin_id,
			     uint32_t user_id, bool job_fini);
static void	_job_state_delete(void *gres_data);
static void *	_job_state_dup(void *gres_data);
static void *	_job_state_dup2(void *gres_data, int node_index);
static void	_job_state_log(void *gres_data, uint32_t job_id,
			       uint32_t plugin_id);
static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
			  bool use_total_gres, bitstr_t *core_bitmap,
			  int core_start_bit, int core_end_bit, bool *topo_set,
			  uint32_t job_id, char *node_name, char *gres_name,
			  uint32_t plugin_id, bool disable_binding);
static int	_load_gres_plugin(slurm_gres_context_t *plugin_context);
static int	_log_gres_slurmd_conf(void *x, void *arg);
static void	_my_stat(char *file_name);
static int	_node_config_init(char *node_name, char *orig_config,
				  slurm_gres_context_t *context_ptr,
				  gres_state_t *gres_ptr);
static char *	_node_gres_used(void *gres_data, char *gres_name);
static int	_node_reconfig(char *node_name, char *new_gres, char **gres_str,
			       gres_state_t *gres_ptr, bool config_overrides,
			       slurm_gres_context_t *context_ptr,
			       bool *updated_gpu_cnt);
static int	_node_reconfig_test(char *node_name, char *new_gres,
				    gres_state_t *gres_ptr,
				    slurm_gres_context_t *context_ptr);
static void	_node_state_dealloc(gres_state_t *gres_ptr);
static void *	_node_state_dup(void *gres_data);
static void	_node_state_log(void *gres_data, char *node_name,
				char *gres_name);
static int	_parse_gres_config(void **dest, slurm_parser_enum_t type,
				   const char *key, const char *value,
				   const char *line, char **leftover);
static int	_parse_gres_config2(void **dest, slurm_parser_enum_t type,
				    const char *key, const char *value,
				    const char *line, char **leftover);
static bool	_shared_gres(uint32_t plugin_id);
static bool	_sharing_gres(uint32_t plugin_id);
static void	_sock_gres_del(void *x);
static int	_step_alloc(void *step_gres_data, void *job_gres_data,
			    uint32_t plugin_id, int node_offset,
			    bool first_step_node,
			    uint32_t job_id, uint32_t step_id,
			    uint16_t tasks_on_node, uint32_t rem_nodes);
static int      _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
			      uint32_t job_id, uint32_t step_id);
static void *	_step_state_dup(void *gres_data);
static void *	_step_state_dup2(void *gres_data, int node_index);
static void	_step_state_log(void *gres_data, uint32_t job_id,
				uint32_t step_id, char *gres_name);
static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
			   int node_offset, bool first_step_node,
			   uint16_t cpus_per_task, int max_rem_nodes,
			   bool ignore_alloc,
			   uint32_t job_id, uint32_t step_id,
			   uint32_t plugin_id);
static void	_sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
				      gres_state_t *gpu_gres_ptr);
static int	_unload_gres_plugin(slurm_gres_context_t *plugin_context);
static void	_validate_slurm_conf(List slurm_conf_list,
				     slurm_gres_context_t *context_ptr);
static void	_validate_gres_conf(List gres_conf_list,
				    slurm_gres_context_t *context_ptr);
static int	_validate_file(char *path_name, char *gres_name);
static void	_validate_links(gres_slurmd_conf_t *p);
static void	_validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
					  int cpus_ctld, char *node_name);
static int	_valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
				 bool config_overrides, char **reason_down);

304 
/*
 * Build a 32-bit plugin identifier from a GRES name.
 *
 * Every character of the name is added into the result after being shifted
 * left by 0, 8, 16, 24, 0, 8, ... bits, so e.g. "gpu" yields 0x757067.
 *
 * IN name - GRES name (e.g. "gpu"), may be NULL
 * RET the identifier, 0 if name is NULL or empty
 */
extern uint32_t gres_plugin_build_id(char *name)
{
	uint32_t id = 0;
	unsigned int shift = 0;

	if (!name)
		return id;

	for (; *name; name++) {
		/*
		 * Shift as uint32_t rather than as a promoted int: moving a
		 * negative char, or any bit into the sign bit, is undefined
		 * behavior for signed operands. The conversion keeps the same
		 * bit pattern, so identifiers are unchanged.
		 */
		id += (uint32_t) *name << shift;
		shift = (shift + 8) % 32;
	}

	return id;
}
320 
_gres_find_id(void * x,void * key)321 static int _gres_find_id(void *x, void *key)
322 {
323 	uint32_t *plugin_id = (uint32_t *)key;
324 	gres_state_t *state_ptr = (gres_state_t *) x;
325 	if (state_ptr->plugin_id == *plugin_id)
326 		return 1;
327 	return 0;
328 }
329 
330 /* Find job record with matching name and type */
_gres_find_job_by_key(void * x,void * key)331 static int _gres_find_job_by_key(void *x, void *key)
332 {
333 	gres_state_t *state_ptr = (gres_state_t *) x;
334 	gres_key_t *job_key = (gres_key_t *) key;
335 	gres_job_state_t *gres_data_ptr;
336 	gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
337 
338 	if ((state_ptr->plugin_id == job_key->plugin_id) &&
339 	    ((job_key->type_id == NO_VAL) ||
340 	     (gres_data_ptr->type_id == job_key->type_id)))
341 		return 1;
342 	return 0;
343 }
344 
345 /* Find job record with matching name and type */
_gres_find_job_by_key_with_cnt(void * x,void * key)346 static int _gres_find_job_by_key_with_cnt(void *x, void *key)
347 {
348 	gres_state_t *state_ptr = (gres_state_t *) x;
349 	gres_key_t *job_key = (gres_key_t *) key;
350 	gres_job_state_t *gres_data_ptr;
351 	gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
352 
353 	if (!_gres_find_job_by_key(x, key))
354 		return 0;
355 	/* ignore count on no_consume gres */
356 	if (!gres_data_ptr->node_cnt ||
357 	    gres_data_ptr->gres_cnt_node_alloc[job_key->node_offset])
358 		return 1;
359 	return 0;
360 }
361 
_gres_find_step_by_key(void * x,void * key)362 static int _gres_find_step_by_key(void *x, void *key)
363 {
364 	gres_state_t *state_ptr = (gres_state_t *) x;
365 	gres_key_t *step_key = (gres_key_t *) key;
366 	gres_step_state_t *gres_data_ptr;
367 	gres_data_ptr = (gres_step_state_t *)state_ptr->gres_data;
368 
369 	if ((state_ptr->plugin_id == step_key->plugin_id) &&
370 	    (gres_data_ptr->type_id == step_key->type_id))
371 		return 1;
372 	return 0;
373 }
374 
_gres_find_name_internal(char * name,char * key,uint32_t plugin_id)375 static int _gres_find_name_internal(char *name, char *key, uint32_t plugin_id)
376 {
377 	if (!name) {
378 		int i;
379 		for (i = 0; i < gres_context_cnt; i++) {
380 			if (gres_context[i].plugin_id == plugin_id) {
381 				name = gres_context[i].gres_name;
382 				break;
383 			}
384 		}
385 
386 		if (!name) {
387 			debug("%s: couldn't find name", __func__);
388 			return 0;
389 		}
390 	}
391 
392 	if (!xstrcmp(name, key))
393 		return 1;
394 	return 0;
395 }
396 
_gres_job_find_name(void * x,void * key)397 static int _gres_job_find_name(void *x, void *key)
398 {
399 	gres_state_t *state_ptr = (gres_state_t *) x;
400 	gres_job_state_t *gres_data_ptr =
401 		(gres_job_state_t *)state_ptr->gres_data;
402 
403 	return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
404 					state_ptr->plugin_id);
405 }
406 
_gres_step_find_name(void * x,void * key)407 static int _gres_step_find_name(void *x, void *key)
408 {
409 	gres_state_t *state_ptr = (gres_state_t *) x;
410 	gres_step_state_t *gres_data_ptr =
411 		(gres_step_state_t *)state_ptr->gres_data;
412 	return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
413 					state_ptr->plugin_id);
414 }
415 
/*
 * Load the plugin for one gres context and resolve its symbols into
 * plugin_context->ops.
 *
 * Contexts flagged GRES_CONF_COUNT_ONLY are left without a plugin. A direct
 * load by type name is tried first; only if that fails with
 * EPLUGIN_NOTFOUND is the plugin directory scanned. If the plugin still
 * cannot be found the context is flagged GRES_CONF_COUNT_ONLY.
 *
 * IN/OUT plugin_context - context to load the plugin for
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _load_gres_plugin(slurm_gres_context_t *plugin_context)
{
	/*
	 * Must be synchronized with slurm_gres_ops_t above.
	 */
	static const char *syms[] = {
		"node_config_load",
		"job_set_env",
		"step_set_env",
		"step_reset_env",
		"send_stepd",
		"recv_stepd",
		"job_info",
		"step_info",
		"get_devices",
		"step_hardware_init",
		"step_hardware_fini",
		"epilog_build_env",
		"epilog_set_env"
	};
	int n_syms = sizeof(syms) / sizeof(syms[0]);
	void **ops_ptr = (void **) &plugin_context->ops;

	/* Nothing to load for count-only gres */
	if (plugin_context->config_flags & GRES_CONF_COUNT_ONLY) {
		debug("Plugin of type %s only tracks gres counts",
		      plugin_context->gres_type);
		return SLURM_SUCCESS;
	}

	/* First attempt: load directly by plugin type name */
	plugin_context->cur_plugin =
		plugin_load_and_link(plugin_context->gres_type, n_syms, syms,
				     ops_ptr);
	if (plugin_context->cur_plugin != PLUGIN_INVALID_HANDLE)
		return SLURM_SUCCESS;

	/* errno is the one left behind by the failed load above */
	if (errno != EPLUGIN_NOTFOUND) {
		error("Couldn't load specified plugin name for %s: %s",
		      plugin_context->gres_type, plugin_strerror(errno));
		return SLURM_ERROR;
	}

	debug("gres: Couldn't find the specified plugin name for %s looking at all files",
	      plugin_context->gres_type);

	/* Second attempt: scan the plugin directory (rack built only once) */
	if (!plugin_context->plugin_list) {
		char *plugin_dir;

		plugin_context->plugin_list = plugrack_create("gres");
		plugin_dir = slurm_get_plugin_dir();
		plugrack_read_dir(plugin_context->plugin_list, plugin_dir);
		xfree(plugin_dir);
	}

	plugin_context->cur_plugin =
		plugrack_use_by_type(plugin_context->plugin_list,
				     plugin_context->gres_type);
	if (plugin_context->cur_plugin == PLUGIN_INVALID_HANDLE) {
		debug("Cannot find plugin of type %s, just track gres counts",
		      plugin_context->gres_type);
		plugin_context->config_flags |= GRES_CONF_COUNT_ONLY;
		return SLURM_ERROR;
	}

	/* Dereference the API. */
	if (plugin_get_syms(plugin_context->cur_plugin, n_syms, syms,
			    ops_ptr) < n_syms) {
		error("Incomplete %s plugin detected",
		      plugin_context->gres_type);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
491 
/*
 * Release the plugin and the name strings owned by one gres context.
 *
 * IN/OUT plugin_context - context to tear down
 * RET result of plugrack_destroy() if the context owns a plugrack,
 *	otherwise SLURM_SUCCESS
 */
static int _unload_gres_plugin(slurm_gres_context_t *plugin_context)
{
	int rc = SLURM_SUCCESS;

	if (plugin_context->plugin_list) {
		/*
		 * Must check return code here because plugins might still
		 * be loaded and active.
		 */
		rc = plugrack_destroy(plugin_context->plugin_list);
	} else {
		plugin_unload(plugin_context->cur_plugin);
	}

	xfree(plugin_context->gres_name);
	xfree(plugin_context->gres_name_colon);
	xfree(plugin_context->gres_type);

	return rc;
}
512 
513 /*
514  * Add new gres context to gres_context array and load the plugin.
515  * Must hold gres_context_lock before calling.
516  */
_add_gres_context(char * gres_name)517 static void _add_gres_context(char *gres_name)
518 {
519 	slurm_gres_context_t *plugin_context;
520 
521 	if (!gres_name || !gres_name[0])
522 		fatal("%s: invalid empty gres_name", __func__);
523 
524 	xrecalloc(gres_context, (gres_context_cnt + 1),
525 		  sizeof(slurm_gres_context_t));
526 
527 	plugin_context = &gres_context[gres_context_cnt];
528 	plugin_context->gres_name = xstrdup(gres_name);
529 	plugin_context->plugin_id = gres_plugin_build_id(gres_name);
530 	plugin_context->gres_type = xstrdup_printf("gres/%s", gres_name);
531 	plugin_context->plugin_list = NULL;
532 	plugin_context->cur_plugin = PLUGIN_INVALID_HANDLE;
533 
534 	gres_context_cnt++;
535 }
536 
537 /*
538  * Initialize the GRES plugins.
539  *
540  * Returns a Slurm errno.
541  */
gres_plugin_init(void)542 extern int gres_plugin_init(void)
543 {
544 	int i, j, rc = SLURM_SUCCESS;
545 	char *last = NULL, *names, *one_name, *full_name;
546 	char *sorted_names = NULL, *sep = "";
547 	bool append_mps = false;
548 
549 	if (init_run && (gres_context_cnt >= 0))
550 		return rc;
551 
552 	slurm_mutex_lock(&gres_context_lock);
553 	if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
554 		gres_debug = true;
555 	else
556 		gres_debug = false;
557 
558 	if (gres_context_cnt >= 0)
559 		goto fini;
560 
561 	gres_plugin_list = slurm_get_gres_plugins();
562 	gres_context_cnt = 0;
563 	if ((gres_plugin_list == NULL) || (gres_plugin_list[0] == '\0'))
564 		goto fini;
565 
566 	/* Ensure that "gres/mps" follows "gres/gpu" */
567 	have_gpu = false;
568 	have_mps = false;
569 	names = xstrdup(gres_plugin_list);
570 	one_name = strtok_r(names, ",", &last);
571 	while (one_name) {
572 		bool skip_name = false;
573 		if (!xstrcmp(one_name, "mps")) {
574 			have_mps = true;
575 			if (!have_gpu) {
576 				append_mps = true; /* "mps" must follow "gpu" */
577 				skip_name = true;
578 			}
579 			mps_plugin_id = gres_plugin_build_id("mps");
580 		} else if (!xstrcmp(one_name, "gpu")) {
581 			have_gpu = true;
582 			gpu_plugin_id = gres_plugin_build_id("gpu");
583 		}
584 		if (!skip_name) {
585 			xstrfmtcat(sorted_names, "%s%s", sep, one_name);
586 			sep = ",";
587 		}
588 		one_name = strtok_r(NULL, ",", &last);
589 	}
590 	if (append_mps) {
591 		if (!have_gpu)
592 			fatal("GresTypes: gres/mps requires that gres/gpu also be configured");
593 		xstrfmtcat(sorted_names, "%s%s", sep, "mps");
594 	}
595 	xfree(names);
596 
597 	gres_context_cnt = 0;
598 	one_name = strtok_r(sorted_names, ",", &last);
599 	while (one_name) {
600 		full_name = xstrdup("gres/");
601 		xstrcat(full_name, one_name);
602 		for (i = 0; i < gres_context_cnt; i++) {
603 			if (!xstrcmp(full_name, gres_context[i].gres_type))
604 				break;
605 		}
606 		xfree(full_name);
607 		if (i < gres_context_cnt) {
608 			error("Duplicate plugin %s ignored",
609 			      gres_context[i].gres_type);
610 		} else {
611 			_add_gres_context(one_name);
612 		}
613 		one_name = strtok_r(NULL, ",", &last);
614 	}
615 	xfree(sorted_names);
616 
617 	/* Ensure that plugin_id is valid and unique */
618 	for (i = 0; i < gres_context_cnt; i++) {
619 		for (j = i + 1; j < gres_context_cnt; j++) {
620 			if (gres_context[i].plugin_id !=
621 			    gres_context[j].plugin_id)
622 				continue;
623 			fatal("Gres: Duplicate plugin_id %u for %s and %s, "
624 			      "change gres name for one of them",
625 			      gres_context[i].plugin_id,
626 			      gres_context[i].gres_type,
627 			      gres_context[j].gres_type);
628 		}
629 		xassert(gres_context[i].gres_name);
630 
631 		gres_context[i].gres_name_colon =
632 			xstrdup_printf("%s:", gres_context[i].gres_name);
633 		gres_context[i].gres_name_colon_len =
634 			strlen(gres_context[i].gres_name_colon);
635 	}
636 	init_run = true;
637 
638 	if ((select_plugin_type == NO_VAL) &&
639 	    (select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
640 				&select_plugin_type) != SLURM_SUCCESS)) {
641 		select_plugin_type = NO_VAL;	/* error */
642 	}
643 	if (have_mps && running_in_slurmctld() &&
644 	    (select_plugin_type != SELECT_TYPE_CONS_TRES)) {
645 		fatal("Use of gres/mps requires the use of select/cons_tres");
646 	}
647 
648 fini:	slurm_mutex_unlock(&gres_context_lock);
649 	return rc;
650 }
651 
gres_plugin_get_gres_cnt(void)652 extern int gres_plugin_get_gres_cnt(void)
653 {
654 	static int cnt = -1;
655 
656 	if (cnt != -1)
657 		return cnt;
658 
659 	gres_plugin_init();
660 
661 	slurm_mutex_lock(&gres_context_lock);
662 	cnt = gres_context_cnt;
663 	slurm_mutex_unlock(&gres_context_lock);
664 
665 	return cnt;
666 }
667 
668 /*
669  * Add a GRES record. This is used by the node_features plugin after the
670  * slurm.conf file is read and the initial GRES records are built by
671  * gres_plugin_init().
672  */
gres_plugin_add(char * gres_name)673 extern void gres_plugin_add(char *gres_name)
674 {
675 	int i;
676 
677 	slurm_mutex_lock(&gres_context_lock);
678 	for (i = 0; i < gres_context_cnt; i++) {
679 		if (!xstrcmp(gres_context[i].gres_name, gres_name))
680 			goto fini;
681 	}
682 
683 	_add_gres_context(gres_name);
684 fini:	slurm_mutex_unlock(&gres_context_lock);
685 }
686 
687 /* Given a gres_name, return its context index or -1 if not found */
_gres_name_context(char * gres_name)688 static int _gres_name_context(char *gres_name)
689 {
690 	int i;
691 
692 	for (i = 0; i < gres_context_cnt; i++) {
693 		if (!xstrcmp(gres_context[i].gres_name, gres_name))
694 			return i;
695 	}
696 
697 	return -1;
698 }
699 
700 /*
701  * Takes a GRES config line (typically from slurm.conf) and remove any
702  * records for GRES which are not defined in GresTypes.
703  * RET string of valid GRES, Release memory using xfree()
704  */
gres_plugin_name_filter(char * orig_gres,char * nodes)705 extern char *gres_plugin_name_filter(char *orig_gres, char *nodes)
706 {
707 	char *new_gres = NULL, *save_ptr = NULL;
708 	char *colon, *sep = "", *tmp, *tok, *name;
709 
710 	slurm_mutex_lock(&gres_context_lock);
711 	if (!orig_gres || !orig_gres[0] || !gres_context_cnt) {
712 		slurm_mutex_unlock(&gres_context_lock);
713 		return new_gres;
714 	}
715 
716 	tmp = xstrdup(orig_gres);
717 	tok = strtok_r(tmp, ",", &save_ptr);
718 	while (tok) {
719 		name = xstrdup(tok);
720 		if ((colon = strchr(name, ':')))
721 			colon[0] = '\0';
722 		if (_gres_name_context(name) != -1) {
723 			xstrfmtcat(new_gres, "%s%s", sep, tok);
724 			sep = ",";
725 		} else {
726 			/* Logging may not be initialized at this point */
727 			error("Invalid GRES configured on node %s: %s", nodes,
728 			      tok);
729 		}
730 		xfree(name);
731 		tok = strtok_r(NULL, ",", &save_ptr);
732 	}
733 	slurm_mutex_unlock(&gres_context_lock);
734 	xfree(tmp);
735 
736 	return new_gres;
737 }
738 
739 /*
740  * Terminate the gres plugin. Free memory.
741  *
742  * Returns a Slurm errno.
743  */
gres_plugin_fini(void)744 extern int gres_plugin_fini(void)
745 {
746 	int i, j, rc = SLURM_SUCCESS;
747 
748 	slurm_mutex_lock(&gres_context_lock);
749 	xfree(gres_node_name);
750 	if (gres_context_cnt < 0)
751 		goto fini;
752 
753 	init_run = false;
754 	for (i = 0; i < gres_context_cnt; i++) {
755 		j = _unload_gres_plugin(gres_context + i);
756 		if (j != SLURM_SUCCESS)
757 			rc = j;
758 	}
759 	xfree(gres_context);
760 	xfree(gres_plugin_list);
761 	FREE_NULL_LIST(gres_conf_list);
762 	gres_context_cnt = -1;
763 
764 fini:	slurm_mutex_unlock(&gres_context_lock);
765 	return rc;
766 }
767 
768 /*
769  **************************************************************************
770  *                          P L U G I N   C A L L S                       *
771  **************************************************************************
772  */
773 
774 /*
775  * Return a plugin-specific help message for salloc, sbatch and srun
776  * Result must be xfree()'d.
777  *
778  * NOTE: GRES "type" (e.g. model) information is only available from slurmctld
779  * after slurmd registers. It is not readily available from srun (as used here).
780  */
gres_plugin_help_msg(void)781 extern char *gres_plugin_help_msg(void)
782 {
783 	int i;
784 	char *msg = xstrdup("Valid gres options are:\n");
785 
786 	gres_plugin_init();
787 
788 	slurm_mutex_lock(&gres_context_lock);
789 	for (i = 0; i < gres_context_cnt; i++) {
790 		xstrcat(msg, gres_context[i].gres_name);
791 		xstrcat(msg, "[[:type]:count]\n");
792 	}
793 	slurm_mutex_unlock(&gres_context_lock);
794 
795 	return msg;
796 }
797 
798 /*
799  * Perform reconfig, re-read any configuration files
800  * OUT did_change - set if gres configuration changed
801  */
gres_plugin_reconfig(void)802 extern int gres_plugin_reconfig(void)
803 {
804 	int rc = SLURM_SUCCESS;
805 	char *plugin_names = slurm_get_gres_plugins();
806 	bool plugin_change;
807 
808 	slurm_mutex_lock(&gres_context_lock);
809 	if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
810 		gres_debug = true;
811 	else
812 		gres_debug = false;
813 
814 	if (xstrcmp(plugin_names, gres_plugin_list))
815 		plugin_change = true;
816 	else
817 		plugin_change = false;
818 	slurm_mutex_unlock(&gres_context_lock);
819 
820 	if (plugin_change) {
821 		error("GresPlugins changed from %s to %s ignored",
822 		     gres_plugin_list, plugin_names);
823 		error("Restart the slurmctld daemon to change GresPlugins");
824 #if 0
825 		/* This logic would load new plugins, but we need the old
826 		 * plugins to persist in order to process old state
827 		 * information. */
828 		rc = gres_plugin_fini();
829 		if (rc == SLURM_SUCCESS)
830 			rc = gres_plugin_init();
831 #endif
832 	}
833 	xfree(plugin_names);
834 
835 	return rc;
836 }
837 
838 
839 
840 /*
841  * Remove file-less GPUs from the final GRES list, since File is a requirement.
842  */
_remove_fileless_gpus(List gres_conf_list,slurm_gres_context_t * context_ptr)843 static void _remove_fileless_gpus(List gres_conf_list,
844 				  slurm_gres_context_t *context_ptr)
845 {
846 	gres_slurmd_conf_t *gres_conf;
847 	ListIterator iter;
848 
849 	if (!gres_conf_list)
850 		return;
851 
852 	/* Only work in the GPU plugin */
853 	if (context_ptr->plugin_id != gres_plugin_build_id("gpu"))
854 		return;
855 
856 	iter = list_iterator_create(gres_conf_list);
857 	while ((gres_conf = list_next(iter))) {
858 		if (gres_conf->plugin_id != context_ptr->plugin_id)
859 			continue;
860 
861 		if (!gres_conf->file) {
862 			debug("Removing file-less GPU %s:%s from final GRES list",
863 			      gres_conf->name, gres_conf->type_name);
864 			list_delete_item(iter);
865 		}
866 	}
867 	list_iterator_destroy(iter);
868 }
869 
870 /*
871  * Log the contents of a gres_slurmd_conf_t record
872  */
_log_gres_slurmd_conf(void * x,void * arg)873 static int _log_gres_slurmd_conf(void *x, void *arg)
874 {
875 	gres_slurmd_conf_t *p;
876 	char *links = NULL;
877 	int index = -1, offset, mult = 1;
878 
879 	p = (gres_slurmd_conf_t *) x;
880 	xassert(p);
881 
882 	if (!gres_debug) {
883 		verbose("Gres Name=%s Type=%s Count=%"PRIu64,
884 			p->name, p->type_name, p->count);
885 		return 0;
886 	}
887 
888 	if (p->file) {
889 		index = 0;
890 		offset = strlen(p->file);
891 		while (offset > 0) {
892 			offset--;
893 			if ((p->file[offset] < '0') || (p->file[offset] > '9'))
894 				break;
895 			index += (p->file[offset] - '0') * mult;
896 			mult *= 10;
897 		}
898 	}
899 
900 	if (p->links)
901 		xstrfmtcat(links, "Links=%s", p->links);
902 	if (p->cpus && (index != -1)) {
903 		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u "
904 		     "File=%s Cores=%s CoreCnt=%u %s",
905 		     p->name, p->type_name, p->count, index, p->plugin_id,
906 		     p->file, p->cpus, p->cpu_cnt, links);
907 	} else if (index != -1) {
908 		info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s %s",
909 		     p->name, p->type_name, p->count, index, p->plugin_id,
910 		     p->file, links);
911 	} else if (p->file) {
912 		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u File=%s %s",
913 		     p->name, p->type_name, p->count, p->plugin_id, p->file,
914 		    links);
915 	} else {
916 		info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u %s", p->name,
917 		     p->type_name, p->count, p->plugin_id, links);
918 	}
919 	xfree(links);
920 
921 	return 0;
922 }
923 
/*
 * Make sure that the specified file exists.
 *
 * Does nothing unless running_in_slurmdstepd() returns true. Otherwise
 * stat() is attempted up to 20 times, sleeping one second between attempts,
 * for as long as the failure is ENOENT. If the file never shows up, or
 * stat() fails for any other reason, fatal() is called.
 *
 * file_name - (in) Path of the file to check.
 */
static void _my_stat(char *file_name)
{
	struct stat st_buf;
	bool waiting_logged = false;
	int attempt = 0;

	if (!running_in_slurmdstepd())
		return;

	while (attempt < 20) {
		/* No delay before the very first attempt */
		if (attempt > 0)
			sleep(1);
		attempt++;

		if (!stat(file_name, &st_buf)) {
			if (waiting_logged)
				info("gres.conf file %s now exists", file_name);
			return;
		}

		/* Only a missing file is worth waiting for */
		if (errno != ENOENT)
			break;

		/* Report the wait just once */
		if (waiting_logged)
			continue;
		error("Waiting for gres.conf file %s", file_name);
		waiting_logged = true;
	}
	fatal("can't stat gres.conf file %s: %m", file_name);
}
955 
_validate_file(char * path_name,char * gres_name)956 static int _validate_file(char *path_name, char *gres_name)
957 {
958 	char *file_name, *slash, *one_name, *root_path;
959 	hostlist_t hl;
960 	int i, file_count = 0;
961 
962 	i = strlen(path_name);
963 	if ((i < 3) || (path_name[i-1] != ']')) {
964 		_my_stat(path_name);
965 		return 1;
966 	}
967 
968 	slash = strrchr(path_name, '/');
969 	if (slash) {
970 		slash[0] = '\0';
971 		root_path = xstrdup(path_name);
972 		xstrcat(root_path, "/");
973 		slash[0] = '/';
974 		file_name = slash + 1;
975 	} else {
976 		file_name = path_name;
977 		root_path = NULL;
978 	}
979 	hl = hostlist_create(file_name);
980 	if (hl == NULL)
981 		fatal("can't parse File=%s", path_name);
982 	while ((one_name = hostlist_shift(hl))) {
983 		if (slash) {
984 			char *formatted_path = NULL;
985 			xstrfmtcat(formatted_path, "%s/%s",
986 				   root_path, one_name);
987 			_my_stat(formatted_path);
988 			xfree(formatted_path);
989 		} else {
990 			_my_stat(one_name);
991 		}
992 		file_count++;
993 		free(one_name);
994 	}
995 	hostlist_destroy(hl);
996 	xfree(root_path);
997 
998 	return file_count;
999 }
1000 
1001 /*
1002  * Check that we have a comma-delimited list of numbers
1003  */
_validate_links(gres_slurmd_conf_t * p)1004 static void _validate_links(gres_slurmd_conf_t *p)
1005 {
1006 	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
1007 	long int val;
1008 
1009 	if (!p->links)
1010 		return;
1011 	if (p->links[0] == '\0') {
1012 		xfree(p->links);
1013 		return;
1014 	}
1015 
1016 	tmp = xstrdup(p->links);
1017 	tok = strtok_r(tmp, ",", &save_ptr);
1018 	while (tok) {
1019 		val = strtol(tok, &end_ptr, 10);
1020 		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
1021 		    (end_ptr[0] != '\0')) {
1022 			error("gres.conf: Ignoring invalid Link (%s) for Name=%s",
1023 			      tok, p->name);
1024 			xfree(p->links);
1025 			break;
1026 		}
1027 		tok = strtok_r(NULL, ",", &save_ptr);
1028 	}
1029 	xfree(tmp);
1030 }
1031 
/*
 * Tell whether a GRES may have a count greater than 1 for a single file.
 * For example, each GPU can have an arbitrary count of MPS elements.
 *
 * name - (in) The GRES name.
 *
 * Returns true only for the "mps" GRES.
 */
static bool _multi_count_per_file(char *name)
{
	return (xstrcmp(name, "mps") == 0);
}
1042 
/*
 * Build gres_slurmd_conf_t record based upon a line from the gres.conf file
 *
 * dest     - (out) Set to the newly built gres_slurmd_conf_t on success.
 * type     - (in) Not referenced by this function.
 * key      - (in) Not referenced by this function.
 * value    - (in) GRES name to use. If NULL, the name is taken from the
 *	      "Name" option parsed out of the line instead.
 * line     - (in) The configuration line, used only in an error message.
 * leftover - (in/out) Text to parse; handed to s_p_parse_line() as both the
 *	      input string and the leftover output.
 *
 * Returns 1 if a record was built and stored in *dest, or 0 if the record was
 * discarded (no name available, or the name matches no entry in
 * gres_context[]). Several kinds of invalid data (Cores/CPUs that do not fit
 * the bitmap, an invalid Count, or a Count that disagrees with File) are
 * reported with fatal().
 */
static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
			      const char *key, const char *value,
			      const char *line, char **leftover)
{
	static s_p_options_t _gres_options[] = {
		{"Count", S_P_STRING},	/* Number of Gres available */
		{"CPUs" , S_P_STRING},	/* CPUs to bind to Gres resource
					 * (deprecated, use Cores) */
		{"Cores", S_P_STRING},	/* Cores to bind to Gres resource */
		{"File",  S_P_STRING},	/* Path to Gres device */
		{"Files", S_P_STRING},	/* Path to Gres device */
		{"Flags", S_P_STRING},	/* GRES Flags */
		{"Link",  S_P_STRING},	/* Communication link IDs */
		{"Links", S_P_STRING},	/* Communication link IDs */
		{"Name",  S_P_STRING},	/* Gres name */
		{"Type",  S_P_STRING},	/* Gres type (e.g. model name) */
		{NULL}
	};
	int i;
	s_p_hashtbl_t *tbl;
	gres_slurmd_conf_t *p;
	uint64_t tmp_uint64, mult;
	char *tmp_str, *last;
	bool cores_flag = false, cpus_flag = false;
	char *type_str = NULL;

	tbl = s_p_hashtbl_create(_gres_options);
	s_p_parse_line(tbl, *leftover, leftover);

	p = xmalloc(sizeof(gres_slurmd_conf_t));
	/* Name comes from the caller's value if given, else from "Name=" */
	if (!value) {
		if (!s_p_get_string(&p->name, "Name", tbl)) {
			error("Invalid GRES data, no type name (%s)", line);
			xfree(p);
			s_p_hashtbl_destroy(tbl);
			return 0;
		}
	} else {
		p->name = xstrdup(value);
	}

	p->cpu_cnt = gres_cpu_cnt;
	/* "Cores" takes precedence over the deprecated "CPUs" */
	if (s_p_get_string(&p->cpus, "Cores", tbl)) {
		cores_flag = true;
		type_str = "Cores";
	} else if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
		cpus_flag = true;
		type_str = "CPUs";
	}
	if (cores_flag || cpus_flag) {
		char *local_cpus = NULL;
		/* Translate the core list if a conversion hook is installed */
		if (xcpuinfo_ops.xcpuinfo_abs_to_mac) {
			i = (xcpuinfo_ops.xcpuinfo_abs_to_mac)
				(p->cpus, &local_cpus);
			/*
			 * Only executed by slurmstepd and we don't want
			 * fatal here. Ignore bad Core/CPU configuration.
			 */
			if (i != SLURM_SUCCESS) {
				error("Invalid GRES data for %s, %s=%s",
				      p->name, type_str, p->cpus);
			}
		} else {
			local_cpus = xstrdup(p->cpus);
			i = SLURM_SUCCESS;
		}
		/* Build the core bitmap; a list that does not fit is fatal */
		if (i == SLURM_SUCCESS) {
			p->cpus_bitmap = bit_alloc(gres_cpu_cnt);
			if ((bit_size(p->cpus_bitmap) == 0) ||
			    bit_unfmt(p->cpus_bitmap, local_cpus) != 0) {
				fatal("Invalid GRES data for %s, %s=%s (only %u CPUs are available)",
				      p->name, type_str, p->cpus, gres_cpu_cnt);
			}
		}
		xfree(local_cpus);
	}

	/* With a File, the initial count is the number of files it names */
	if (s_p_get_string(&p->file, "File", tbl) ||
	    s_p_get_string(&p->file, "Files", tbl)) {
		p->count = _validate_file(p->file, p->name);
		p->config_flags |= GRES_CONF_HAS_FILE;
	}

	if (s_p_get_string(&tmp_str, "Flags", tbl)) {
		if (xstrcasestr(tmp_str, "CountOnly"))
			p->config_flags |= GRES_CONF_COUNT_ONLY;
		xfree(tmp_str);
	}

	/* _validate_links() frees p->links if the value is not usable */
	if (s_p_get_string(&p->links, "Link",  tbl) ||
	    s_p_get_string(&p->links, "Links", tbl)) {
		_validate_links(p);
	}

	if (s_p_get_string(&p->type_name, "Type", tbl)) {
		p->config_flags |= GRES_CONF_HAS_TYPE;
	}

	if (s_p_get_string(&tmp_str, "Count", tbl)) {
		tmp_uint64 = strtoll(tmp_str, &last, 10);
		if ((tmp_uint64 == LONG_MIN) || (tmp_uint64 == LONG_MAX)) {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		/* Whatever follows the digits is handed to suffix_mult() */
		if ((mult = suffix_mult(last)) != NO_VAL64) {
			tmp_uint64 *= mult;
		} else {
			fatal("Invalid GRES record for %s, invalid count %s",
			      p->name, tmp_str);
		}
		/*
		 * Some GRES can have count > 1 for a given file. For example,
		 * each GPU can have arbitrary count of MPS elements.
		 */
		if (p->count && (p->count != tmp_uint64) &&
		    !_multi_count_per_file(p->name)) {
			fatal("Invalid GRES record for %s, count does not match File value",
			      p->name);
		}
		if (tmp_uint64 >= NO_VAL64) {
			fatal("GRES %s has invalid count value %"PRIu64,
			      p->name, tmp_uint64);
		}
		p->count = tmp_uint64;
		xfree(tmp_str);
	} else if (p->count == 0)
		/* Neither Count nor File supplied a count: default to 1 */
		p->count = 1;

	s_p_hashtbl_destroy(tbl);

	/* The name must match one of the configured GRES contexts */
	for (i = 0; i < gres_context_cnt; i++) {
		if (xstrcasecmp(p->name, gres_context[i].gres_name) == 0)
			break;
	}
	if (i >= gres_context_cnt) {
		error("Ignoring gres.conf record, invalid name: %s", p->name);
		destroy_gres_slurmd_conf(p);
		return 0;
	}
	p->plugin_id = gres_context[i].plugin_id;
	*dest = (void *)p;
	return 1;
}
/*
 * Parse a gres.conf line keyed by NodeName. If the node expression in value
 * does not include gres_node_name, the rest of the line is consumed and
 * discarded. Otherwise the line is handed to _parse_gres_config() with a
 * NULL value.
 *
 * Parameters are the same as for _parse_gres_config(); value holds the
 * NodeName expression.
 *
 * Returns 0 if the line was skipped, else the result of _parse_gres_config().
 */
static int _parse_gres_config2(void **dest, slurm_parser_enum_t type,
			       const char *key, const char *value,
			       const char *line, char **leftover)
{
	static s_p_options_t _gres_options[] = {
		{"Count", S_P_STRING},	/* Number of Gres available */
		{"CPUs" , S_P_STRING},	/* CPUs to bind to Gres resource */
		{"Cores", S_P_STRING},	/* Cores to bind to Gres resource */
		{"File",  S_P_STRING},	/* Path to Gres device */
		{"Files",  S_P_STRING},	/* Path to Gres device */
		{"Flags", S_P_STRING},	/* GRES Flags */
		{"Link",  S_P_STRING},	/* Communication link IDs */
		{"Links", S_P_STRING},	/* Communication link IDs */
		{"Name",  S_P_STRING},	/* Gres name */
		{"Type",  S_P_STRING},	/* Gres type (e.g. model name) */
		{NULL}
	};
	bool on_this_node = false;
	hostlist_t node_hl;
	s_p_hashtbl_t *skip_tbl;

	/* Without a node name to compare against, accept the line as is */
	if (!gres_node_name || !value)
		return _parse_gres_config(dest, type, key, NULL, line,
					  leftover);

	node_hl = hostlist_create(value);
	if (node_hl) {
		on_this_node = (hostlist_find(node_hl, gres_node_name) >= 0);
		hostlist_destroy(node_hl);
	}
	if (on_this_node)
		return _parse_gres_config(dest, type, key, NULL, line,
					  leftover);

	/* Not for this node: parse the remainder only to consume it */
	debug("skipping GRES for NodeName=%s %s", value, line);
	skip_tbl = s_p_hashtbl_create(_gres_options);
	s_p_parse_line(skip_tbl, *leftover, leftover);
	s_p_hashtbl_destroy(skip_tbl);
	return 0;
}
1226 
/*
 * Sanity check the slurm.conf GRES records belonging to one GRES plugin.
 * Calls fatal() if a GRES has types but its configured count exceeds the
 * sum of the per-type counts, i.e. typed and untyped GRES were mixed under
 * the same name.
 *
 * slurm_conf_list - (in) The slurm.conf GRES list. May be NULL, in which
 *		     case nothing is checked.
 * context_ptr     - (in) Which GRES plugin we are currently working in.
 */
static void _validate_slurm_conf(List slurm_conf_list,
				 slurm_gres_context_t *context_ptr)
{
	ListIterator iter;
	gres_state_t *gres_ptr;

	if (!slurm_conf_list)
		return;

	iter = list_iterator_create(slurm_conf_list);
	while ((gres_ptr = list_next(iter))) {
		gres_node_state_t *slurm_gres;
		uint64_t tmp_count = 0;

		/* Only look at the GRES under the current plugin (same name) */
		if (gres_ptr->plugin_id != context_ptr->plugin_id)
			continue;

		slurm_gres = (gres_node_state_t *)gres_ptr->gres_data;

		/*
		 * gres_cnt_config should equal the combined count from
		 * type_cnt_avail if there are no untyped GRES
		 */
		for (uint16_t i = 0; i < slurm_gres->type_cnt; i++)
			tmp_count += slurm_gres->type_cnt_avail[i];

		/* Forbid mixing typed and untyped GRES under the same name */
		if (slurm_gres->type_cnt &&
		    slurm_gres->gres_cnt_config > tmp_count)
			fatal("%s: Some %s GRES in slurm.conf have a type while others do not (slurm_gres->gres_cnt_config (%"PRIu64") > tmp_count (%"PRIu64"))",
			      __func__, context_ptr->gres_name,
			      slurm_gres->gres_cnt_config, tmp_count);
	}
	/* Release the iterator; it was previously leaked on every call */
	list_iterator_destroy(iter);
}
1262 
/*
 * Sanity check the gres.conf records belonging to one GRES plugin and update
 * the plugin context accordingly.
 *
 * For the records of this plugin:
 *  - CountOnly on any record is propagated to the context.
 *  - The plugin is loaded (at most one successful load).
 *  - All records must agree on having a File, and on having a Type,
 *    otherwise fatal() is called.
 *  - More than one record with neither File nor Type is a fatal duplicate.
 *  - GRES_CONF_HAS_FILE is set on the context if the records have a File.
 *
 * gres_conf_list - (in) The gres.conf GRES list.
 * context_ptr    - (in/out) Which GRES plugin we are currently working in.
 */
static void _validate_gres_conf(List gres_conf_list,
				slurm_gres_context_t *context_ptr)
{
	ListIterator conf_itr;
	gres_slurmd_conf_t *rec;
	/* -1 until the first record of this plugin has been seen */
	int first_has_file = -1, first_has_type = -1;
	int matched = 0;
	bool has_file, has_type;

	conf_itr = list_iterator_create(gres_conf_list);
	while ((rec = (gres_slurmd_conf_t *) list_next(conf_itr))) {
		if (rec->plugin_id != context_ptr->plugin_id)
			continue;

		/*
		 * CountOnly is contagious: one record having it marks the
		 * whole context, which is what gets used from here on.
		 */
		if (rec->config_flags & GRES_CONF_COUNT_ONLY)
			context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;

		/*
		 * Several records may share one plugin, so only load it
		 * until it succeeds once. A failed load is tolerated, as the
		 * gres is supported with or without the plugin.
		 */
		if (!(context_ptr->config_flags & GRES_CONF_LOADED) &&
		    (_load_gres_plugin(context_ptr) == SLURM_SUCCESS))
			context_ptr->config_flags |= GRES_CONF_LOADED;

		matched++;

		/* Every record must agree with the first one about File */
		has_file = rec->config_flags & GRES_CONF_HAS_FILE;
		if (first_has_file == -1) {
			first_has_file = has_file ? 1 : 0;
		} else if ((first_has_file != 0) != has_file) {
			fatal("gres.conf for %s, some records have \"File\" specification while others do not",
			      context_ptr->gres_name);
		}

		/* ... and likewise about Type */
		has_type = rec->config_flags & GRES_CONF_HAS_TYPE;
		if (first_has_type == -1) {
			first_has_type = has_type ? 1 : 0;
		} else if ((first_has_type != 0) != has_type) {
			fatal("gres.conf for %s, some records have \"Type=\" specification while others do not",
			      context_ptr->gres_name);
		}

		if ((first_has_file == 0) && (first_has_type == 0) &&
		    (matched > 1)) {
			fatal("gres.conf duplicate records for %s",
			      context_ptr->gres_name);
		}

		if (first_has_file)
			context_ptr->config_flags |= GRES_CONF_HAS_FILE;
	}
	list_iterator_destroy(conf_itr);

	if (context_ptr->config_flags & GRES_CONF_LOADED) {
		/* The flag is only really used locally, so drop it again */
		context_ptr->config_flags &= (~GRES_CONF_LOADED);
	} else if (_load_gres_plugin(context_ptr) != SLURM_SUCCESS) {
		/*
		 * No gres.conf line for this gres was found (or loading
		 * failed above). We still need to try to load it for
		 * AutoDetect's sake. If loading fails, treat it as a count
		 * only GRES since the stepd will try to load it otherwise.
		 */
		context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;
	}
}
1348 
1349 /*
1350  * Keep track of which gres.conf lines have a count greater than expected
1351  * according to the current slurm.conf GRES. Modify the count of throw-away
1352  * records in gres_conf_list_tmp to keep track of this. Any gres.conf records
1353  * with a count > 0 means that slurm.conf did not account for it completely.
1354  *
1355  * gres_conf_list_tmp - (in/out) The temporary gres.conf list.
1356  * count              - (in) The count of the current slurm.conf GRES record.
1357  * type_name          - (in) The type of the current slurm.conf GRES record.
1358  */
_compare_conf_counts(List gres_conf_list_tmp,uint64_t count,char * type_name)1359 static void _compare_conf_counts(List gres_conf_list_tmp, uint64_t count,
1360 				 char *type_name)
1361 {
1362 	gres_slurmd_conf_t *gres_conf;
1363 	ListIterator iter = list_iterator_create(gres_conf_list_tmp);
1364 	while ((gres_conf = list_next(iter))) {
1365 		/* Note: plugin type filter already applied */
1366 		/* Check that type is the same */
1367 		if (xstrcasecmp(gres_conf->type_name, type_name))
1368 			continue;
1369 		/* Keep track of counts */
1370 		if (gres_conf->count > count) {
1371 			gres_conf->count -= count;
1372 			/* This slurm.conf GRES specification is now used up */
1373 			list_iterator_destroy(iter);
1374 			return;
1375 		} else {
1376 			count -= gres_conf->count;
1377 			gres_conf->count = 0;
1378 		}
1379 	}
1380 	list_iterator_destroy(iter);
1381 }
1382 
1383 /*
1384  * Loop through each entry in gres.conf and see if there is a corresponding
1385  * entry in slurm.conf. If so, see if the counts line up. If there are more
1386  * devices specified in gres.conf than in slurm.conf, emit errors.
1387  *
1388  * slurm_conf_list - (in) The slurm.conf GRES list.
1389  * gres_conf_list  - (in) The gres.conf GRES list.
1390  * context_ptr     - (in) Which GRES plugin we are currently working in.
1391  */
_check_conf_mismatch(List slurm_conf_list,List gres_conf_list,slurm_gres_context_t * context_ptr)1392 static void _check_conf_mismatch(List slurm_conf_list, List gres_conf_list,
1393 				 slurm_gres_context_t *context_ptr)
1394 {
1395 	ListIterator iter;
1396 	gres_slurmd_conf_t *gres_conf;
1397 	gres_state_t *slurm_conf;
1398 	List gres_conf_list_tmp;
1399 
1400 	/* E.g. slurm_conf_list will be NULL in the case of --gpu-bind */
1401 	if (!slurm_conf_list || !gres_conf_list)
1402 		return;
1403 
1404 	/*
1405 	 * Duplicate the gres.conf list with records relevant to this GRES plugin
1406 	 * only so we can mangle records. Only add records under the current plugin.
1407 	 */
1408 	gres_conf_list_tmp = list_create(destroy_gres_slurmd_conf);
1409 	iter = list_iterator_create(gres_conf_list);
1410 	while ((gres_conf = list_next(iter))) {
1411 		gres_slurmd_conf_t *gres_conf_tmp;
1412 		if (gres_conf->plugin_id != context_ptr->plugin_id)
1413 			continue;
1414 
1415 		gres_conf_tmp = xmalloc(sizeof(*gres_conf_tmp));
1416 		gres_conf_tmp->name = xstrdup(gres_conf->name);
1417 		gres_conf_tmp->type_name = xstrdup(gres_conf->type_name);
1418 		gres_conf_tmp->count = gres_conf->count;
1419 		list_append(gres_conf_list_tmp, gres_conf_tmp);
1420 	}
1421 	list_iterator_destroy(iter);
1422 
1423 	/*
1424 	 * Loop through the slurm.conf list and see if there are more gres.conf
1425 	 * GRES than expected.
1426 	 */
1427 	iter = list_iterator_create(slurm_conf_list);
1428 	while ((slurm_conf = list_next(iter))) {
1429 		gres_node_state_t *slurm_gres;
1430 
1431 		if (slurm_conf->plugin_id != context_ptr->plugin_id)
1432 			continue;
1433 
1434 		/* Determine if typed or untyped, and act accordingly */
1435 		slurm_gres = (gres_node_state_t *)slurm_conf->gres_data;
1436 		if (!slurm_gres->type_name) {
1437 			_compare_conf_counts(gres_conf_list_tmp,
1438 					     slurm_gres->gres_cnt_config, NULL);
1439 			continue;
1440 		}
1441 
1442 		for (int i = 0; i < slurm_gres->type_cnt; ++i) {
1443 			_compare_conf_counts(gres_conf_list_tmp,
1444 					     slurm_gres->type_cnt_avail[i],
1445 					     slurm_gres->type_name[i]);
1446 		}
1447 	}
1448 	list_iterator_destroy(iter);
1449 
1450 	/*
1451 	 * Loop through gres_conf_list_tmp to print errors for gres.conf
1452 	 * records that were not completely accounted for in slurm.conf.
1453 	 */
1454 	iter = list_iterator_create(gres_conf_list_tmp);
1455 	while ((gres_conf = list_next(iter)))
1456 		if (gres_conf->count > 0)
1457 			info("WARNING: A line in gres.conf for GRES %s%s%s has %"PRIu64" more configured than expected in slurm.conf. Ignoring extra GRES.",
1458 			     gres_conf->name,
1459 			     (gres_conf->type_name) ? ":" : "",
1460 			     (gres_conf->type_name) ? gres_conf->type_name : "",
1461 			     gres_conf->count);
1462 	list_iterator_destroy(iter);
1463 
1464 	FREE_NULL_LIST(gres_conf_list_tmp);
1465 }
1466 
1467 /*
1468  * Match the type of a GRES from slurm.conf to a GRES in the gres.conf list. If
1469  * a match is found, pop it off the gres.conf list and return it.
1470  *
1471  * gres_conf_list - (in) The gres.conf list to search through.
1472  * gres_context   - (in) Which GRES plugin we are currently working in.
1473  * type_name      - (in) The type of the slurm.conf GRES record. If null, then
1474  * 			 it's an untyped GRES.
1475  *
1476  * Returns the first gres.conf record from gres_conf_list with the same type
1477  * name as the slurm.conf record.
1478  */
_match_type(List gres_conf_list,slurm_gres_context_t * gres_context,char * type_name)1479 static gres_slurmd_conf_t *_match_type(List gres_conf_list,
1480 				       slurm_gres_context_t *gres_context,
1481 				       char *type_name)
1482 {
1483 	ListIterator gres_conf_itr;
1484 	gres_slurmd_conf_t *gres_conf = NULL;
1485 
1486 	gres_conf_itr = list_iterator_create(gres_conf_list);
1487 	while ((gres_conf = list_next(gres_conf_itr))) {
1488 		if (gres_conf->plugin_id != gres_context->plugin_id)
1489 			continue;
1490 
1491 		/*
1492 		 * If type_name is NULL we will take the first matching
1493 		 * gres_conf that we find.  This means we also will remove the
1494 		 * type from the gres_conf to match 18.08 stylings.
1495 		 */
1496 		if (!type_name)
1497 			xfree(gres_conf->type_name);
1498 		else if (xstrcasecmp(gres_conf->type_name, type_name))
1499 			continue;
1500 
1501 		/* We found a match, so remove from gres_conf_list and break */
1502 		list_remove(gres_conf_itr);
1503 		break;
1504 	}
1505 	list_iterator_destroy(gres_conf_itr);
1506 
1507 	return gres_conf;
1508 }
1509 
1510 /*
1511  * Add a GRES conf record with count == 0 to gres_list.
1512  *
1513  * gres_list    - (in/out) The gres list to add to.
1514  * gres_context - (in) The GRES plugin to add a GRES record for.
1515  * cpu_cnt      - (in) The cpu count configured for the node.
1516  */
_add_gres_config_empty(List gres_list,slurm_gres_context_t * gres_context,uint32_t cpu_cnt)1517 static void _add_gres_config_empty(List gres_list,
1518 				   slurm_gres_context_t *gres_context,
1519 				   uint32_t cpu_cnt)
1520 {
1521 	gres_slurmd_conf_t *gres_conf = xmalloc(sizeof(*gres_conf));
1522 	gres_conf->cpu_cnt = cpu_cnt;
1523 	gres_conf->name = xstrdup(gres_context->gres_name);
1524 	gres_conf->plugin_id = gres_context->plugin_id;
1525 	list_append(gres_list, gres_conf);
1526 }
1527 
1528 /*
1529  * Truncate the File hostrange string of a GRES record to be to be at most
1530  * new_count entries. The extra entries will be removed.
1531  *
1532  * gres_conf - (in/out) The GRES record to modify.
1533  * count     - (in) The new number of entries in File
1534  */
_set_file_subset(gres_slurmd_conf_t * gres_conf,uint64_t new_count)1535 static void _set_file_subset(gres_slurmd_conf_t *gres_conf, uint64_t new_count)
1536 {
1537 	/* Convert file to hostrange */
1538 	hostlist_t hl = hostlist_create(gres_conf->file);
1539 	unsigned long old_count = hostlist_count(hl);
1540 
1541 	if (new_count >= old_count) {
1542 		hostlist_destroy(hl);
1543 		/* Nothing to do */
1544 		return;
1545 	}
1546 
1547 	/* Remove all but the first entries */
1548 	for (int i = old_count; i > new_count; --i) {
1549 		free(hostlist_pop(hl));
1550 	}
1551 
1552 	debug3("%s: Truncating %s:%s File from (%ld) %s", __func__,
1553 	       gres_conf->name, gres_conf->type_name, old_count,
1554 	       gres_conf->file);
1555 
1556 	/* Set file to the new subset */
1557 	xfree(gres_conf->file);
1558 	gres_conf->file = hostlist_ranged_string_xmalloc(hl);
1559 
1560 	debug3("%s: to (%"PRIu64") %s", __func__, new_count, gres_conf->file);
1561 	hostlist_destroy(hl);
1562 }
1563 
1564 /*
1565  * A continuation of _merge_gres() depending on if the slurm.conf GRES is typed
1566  * or not.
1567  *
1568  * gres_conf_list - (in) The gres.conf list.
1569  * new_list       - (out) The new merged [slurm|gres].conf list.
1570  * count          - (in) The count of the slurm.conf GRES record.
1571  * type_name      - (in) The type of the slurm.conf GRES record, if it exists.
1572  * gres_context   - (in) Which GRES plugin we are working in.
1573  * cpu_cnt        - (in) A count of CPUs on the node.
1574  */
_merge_gres2(List gres_conf_list,List new_list,uint64_t count,char * type_name,slurm_gres_context_t * gres_context,uint32_t cpu_count)1575 static void _merge_gres2(List gres_conf_list, List new_list, uint64_t count,
1576 			 char *type_name, slurm_gres_context_t *gres_context,
1577 			 uint32_t cpu_count)
1578 {
1579 	gres_slurmd_conf_t *gres_conf, *match;
1580 
1581 	/* If slurm.conf count is initially 0, don't waste time on it */
1582 	if (count == 0)
1583 		return;
1584 
1585 	/*
1586 	 * There can be multiple gres.conf GRES lines contained within a
1587 	 * single slurm.conf GRES line, due to different values of Cores
1588 	 * and Links. Append them to the list where possible.
1589 	 */
1590 	while ((match = _match_type(gres_conf_list, gres_context, type_name))) {
1591 		list_append(new_list, match);
1592 
1593 		debug3("%s: From gres.conf, using %s:%s:%"PRIu64":%s", __func__,
1594 		       match->name, match->type_name, match->count,
1595 		       match->file);
1596 
1597 		/* See if we need to merge with any more gres.conf records. */
1598 		if (match->count > count) {
1599 			/*
1600 			 * Truncate excess count of gres.conf to match total
1601 			 * count of slurm.conf.
1602 			 */
1603 			match->count = count;
1604 			/*
1605 			 * Truncate excess file of gres.conf to match total
1606 			 * count of slurm.conf.
1607 			 */
1608 			if (match->file)
1609 				_set_file_subset(match, count);
1610 			/* Floor to 0 to break out of loop. */
1611 			count = 0;
1612 		} else
1613 			/*
1614 			 * Subtract this gres.conf line count from the
1615 			 * slurm.conf total.
1616 			 */
1617 			count -= match->count;
1618 
1619 		/*
1620 		 * All devices outlined by this slurm.conf record have now been
1621 		 * merged with gres.conf records and added to new_list, so exit.
1622 		 */
1623 		if (count == 0)
1624 			break;
1625 	}
1626 
1627 	if (count == 0)
1628 		return;
1629 
1630 	/*
1631 	 * There are leftover GRES specified in this slurm.conf record that are
1632 	 * not accounted for in gres.conf that still need to be added.
1633 	 */
1634 	gres_conf = xmalloc(sizeof(*gres_conf));
1635 	gres_conf->count = count;
1636 	gres_conf->cpu_cnt = cpu_count;
1637 	gres_conf->name = xstrdup(gres_context->gres_name);
1638 	gres_conf->plugin_id = gres_context->plugin_id;
1639 	if (type_name) {
1640 		gres_conf->config_flags = GRES_CONF_HAS_TYPE;
1641 		gres_conf->type_name = xstrdup(type_name);
1642 	}
1643 
1644 	if (gres_context->config_flags & GRES_CONF_COUNT_ONLY)
1645 		gres_conf->config_flags |= GRES_CONF_COUNT_ONLY;
1646 
1647 	list_append(new_list, gres_conf);
1648 }
1649 
1650 /*
1651  * Merge a single slurm.conf GRES specification with any relevant gres.conf
1652  * records and append the result to new_list.
1653  *
1654  * gres_conf_list - (in) The gres.conf list.
1655  * new_list       - (out) The new merged [slurm|gres].conf list.
1656  * ptr            - (in) A slurm.conf GRES record.
1657  * gres_context   - (in) Which GRES plugin we are working in.
1658  * cpu_cnt        - (in) A count of CPUs on the node.
1659  */
_merge_gres(List gres_conf_list,List new_list,gres_state_t * ptr,slurm_gres_context_t * gres_context,uint32_t cpu_cnt)1660 static void _merge_gres(List gres_conf_list, List new_list, gres_state_t *ptr,
1661 			slurm_gres_context_t *gres_context, uint32_t cpu_cnt)
1662 {
1663 	gres_node_state_t *slurm_gres = (gres_node_state_t *)ptr->gres_data;
1664 
1665 	/* If this GRES has no types, merge in the single untyped GRES */
1666 	if (slurm_gres->type_cnt == 0) {
1667 		_merge_gres2(gres_conf_list, new_list,
1668 			     slurm_gres->gres_cnt_config, NULL, gres_context,
1669 			     cpu_cnt);
1670 		return;
1671 	}
1672 
1673 	/* If this GRES has types, merge in each typed GRES */
1674 	for (int i = 0; i < slurm_gres->type_cnt; i++) {
1675 		_merge_gres2(gres_conf_list, new_list,
1676 			     slurm_gres->type_cnt_avail[i],
1677 			     slurm_gres->type_name[i], gres_context, cpu_cnt);
1678 	}
1679 }
1680 
1681 /*
1682  * Merge slurm.conf and gres.conf GRES configuration.
1683  * gres.conf can only work within what is outlined in slurm.conf. Every
1684  * gres.conf device that does not match up to a device in slurm.conf is
1685  * discarded with an error. If no gres conf found for what is specified in
1686  * slurm.conf, create a zero-count conf record.
1687  *
1688  * node_conf       - (in) node configuration info (cpu count).
1689  * gres_conf_list  - (in/out) GRES data from gres.conf. This becomes the new
1690  * 		     merged slurm.conf/gres.conf list.
1691  * slurm_conf_list - (in) GRES data from slurm.conf.
1692  */
_merge_config(node_config_load_t * node_conf,List gres_conf_list,List slurm_conf_list)1693 static void _merge_config(node_config_load_t *node_conf, List gres_conf_list,
1694 			  List slurm_conf_list)
1695 {
1696 	int i;
1697 	gres_state_t *gres_ptr;
1698 	ListIterator iter;
1699 	bool found;
1700 
1701 	List new_gres_list = list_create(destroy_gres_slurmd_conf);
1702 
1703 	for (i = 0; i < gres_context_cnt; i++) {
1704 		/* Copy GRES configuration from slurm.conf */
1705 		if (slurm_conf_list) {
1706 			found = false;
1707 			iter = list_iterator_create(slurm_conf_list);
1708 			while ((gres_ptr = (gres_state_t *) list_next(iter))) {
1709 				if (gres_ptr->plugin_id !=
1710 				    gres_context[i].plugin_id)
1711 					continue;
1712 				found = true;
1713 				_merge_gres(gres_conf_list, new_gres_list,
1714 					    gres_ptr, &gres_context[i],
1715 					    node_conf->cpu_cnt);
1716 			}
1717 			list_iterator_destroy(iter);
1718 			if (found)
1719 				continue;
1720 		}
1721 
1722 		/* Add GRES record with zero count */
1723 		_add_gres_config_empty(new_gres_list, &gres_context[i],
1724 				       node_conf->cpu_cnt);
1725 	}
1726 	/* Set gres_conf_list to be the new merged list */
1727 	list_flush(gres_conf_list);
1728 	list_transfer(gres_conf_list, new_gres_list);
1729 	FREE_NULL_LIST(new_gres_list);
1730 }
1731 
1732 /*
1733  * Load this node's configuration (how many resources it has, topology, etc.)
1734  * IN cpu_cnt - Number of CPUs configured on this node
1735  * IN node_name - Name of this node
1736  * IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd
1737  * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() funct, if available
1738  * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() funct, if available
1739  * NOTE: Called from slurmd and slurmstepd
1740  */
gres_plugin_node_config_load(uint32_t cpu_cnt,char * node_name,List gres_list,void * xcpuinfo_abs_to_mac,void * xcpuinfo_mac_to_abs)1741 extern int gres_plugin_node_config_load(uint32_t cpu_cnt, char *node_name,
1742 					List gres_list,
1743 					void *xcpuinfo_abs_to_mac,
1744 					void *xcpuinfo_mac_to_abs)
1745 {
1746 	static s_p_options_t _gres_options[] = {
1747 		{"AutoDetect", S_P_STRING},
1748 		{"Name",     S_P_ARRAY, _parse_gres_config,  NULL},
1749 		{"NodeName", S_P_ARRAY, _parse_gres_config2, NULL},
1750 		{NULL}
1751 	};
1752 
1753 	int count = 0, i, rc, rc2;
1754 	struct stat config_stat;
1755 	s_p_hashtbl_t *tbl;
1756 	gres_slurmd_conf_t **gres_array;
1757 	char *gres_conf_file;
1758 	char *autodetect_string = NULL;
1759 
1760 	node_config_load_t node_conf = {
1761 		.cpu_cnt = cpu_cnt,
1762 		.xcpuinfo_mac_to_abs = xcpuinfo_mac_to_abs
1763 	};
1764 
1765 	if (cpu_cnt == 0) {
1766 		error("%s: Invalid cpu_cnt of 0 for node %s",
1767 		      __func__, node_name);
1768 		return SLURM_ERROR;
1769 	}
1770 
1771 	if (xcpuinfo_abs_to_mac)
1772 		xcpuinfo_ops.xcpuinfo_abs_to_mac = xcpuinfo_abs_to_mac;
1773 
1774 	rc = gres_plugin_init();
1775 	if (gres_context_cnt == 0)
1776 		return SLURM_SUCCESS;
1777 
1778 	slurm_mutex_lock(&gres_context_lock);
1779 	FREE_NULL_LIST(gres_conf_list);
1780 	gres_conf_list = list_create(destroy_gres_slurmd_conf);
1781 	gres_conf_file = get_extra_conf_path("gres.conf");
1782 	if (stat(gres_conf_file, &config_stat) < 0) {
1783 		info("Can not stat gres.conf file (%s), using slurm.conf data",
1784 		      gres_conf_file);
1785 	} else {
1786 		if (xstrcmp(gres_node_name, node_name)) {
1787 			xfree(gres_node_name);
1788 			gres_node_name = xstrdup(node_name);
1789 		}
1790 
1791 		gres_cpu_cnt = cpu_cnt;
1792 		tbl = s_p_hashtbl_create(_gres_options);
1793 		if (s_p_parse_file(tbl, NULL, gres_conf_file, false) == SLURM_ERROR)
1794 			fatal("error opening/reading %s", gres_conf_file);
1795 
1796 		if (s_p_get_string(&autodetect_string, "Autodetect", tbl)) {
1797 			if (xstrcasestr(autodetect_string, "nvml"))
1798 				autodetect_types |= GRES_AUTODETECT_NVML;
1799 			if (xstrcasestr(autodetect_string, "rsmi"))
1800 				autodetect_types |= GRES_AUTODETECT_RSMI;
1801 			xfree(autodetect_string);
1802 		}
1803 
1804 		if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
1805 			for (i = 0; i < count; i++) {
1806 				list_append(gres_conf_list, gres_array[i]);
1807 				gres_array[i] = NULL;
1808 			}
1809 		}
1810 		if (s_p_get_array((void ***) &gres_array, &count, "NodeName", tbl)) {
1811 			for (i = 0; i < count; i++) {
1812 				list_append(gres_conf_list, gres_array[i]);
1813 				gres_array[i] = NULL;
1814 			}
1815 		}
1816 		s_p_hashtbl_destroy(tbl);
1817 	}
1818 	xfree(gres_conf_file);
1819 
1820 	/* Validate gres.conf and slurm.conf somewhat before merging */
1821 	for (i = 0; i < gres_context_cnt; i++) {
1822 		_validate_slurm_conf(gres_list, &gres_context[i]);
1823 		_validate_gres_conf(gres_conf_list, &gres_context[i]);
1824 		_check_conf_mismatch(gres_list, gres_conf_list,
1825 				     &gres_context[i]);
1826 	}
1827 
1828 	/* Merge slurm.conf and gres.conf together into gres_conf_list */
1829 	_merge_config(&node_conf, gres_conf_list, gres_list);
1830 
1831 	for (i = 0; i < gres_context_cnt; i++) {
1832 		if (gres_context[i].ops.node_config_load == NULL)
1833 			continue;	/* No plugin */
1834 		rc2 = (*(gres_context[i].ops.node_config_load))(gres_conf_list,
1835 								&node_conf);
1836 		if (rc == SLURM_SUCCESS)
1837 			rc = rc2;
1838 
1839 	}
1840 
1841 	/* Postprocess gres_conf_list after all plugins' node_config_load */
1842 	for (i = 0; i < gres_context_cnt; i++) {
1843 		/* Remove every GPU with an empty File */
1844 		_remove_fileless_gpus(gres_conf_list, &gres_context[i]);
1845 	}
1846 
1847 	list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL);
1848 	slurm_mutex_unlock(&gres_context_lock);
1849 
1850 	return rc;
1851 }
1852 
1853 /*
1854  * Pack this node's gres configuration into a buffer
1855  * IN/OUT buffer - message buffer to pack
1856  */
gres_plugin_node_config_pack(Buf buffer)1857 extern int gres_plugin_node_config_pack(Buf buffer)
1858 {
1859 	int rc;
1860 	uint32_t magic = GRES_MAGIC;
1861 	uint16_t rec_cnt = 0, version = SLURM_PROTOCOL_VERSION;
1862 	ListIterator iter;
1863 	gres_slurmd_conf_t *gres_slurmd_conf;
1864 
1865 	rc = gres_plugin_init();
1866 
1867 	slurm_mutex_lock(&gres_context_lock);
1868 	pack16(version, buffer);
1869 	if (gres_conf_list)
1870 		rec_cnt = list_count(gres_conf_list);
1871 	pack16(rec_cnt, buffer);
1872 	if (rec_cnt) {
1873 		iter = list_iterator_create(gres_conf_list);
1874 		while ((gres_slurmd_conf =
1875 			(gres_slurmd_conf_t *) list_next(iter))) {
1876 			pack32(magic, buffer);
1877 			pack64(gres_slurmd_conf->count, buffer);
1878 			pack32(gres_slurmd_conf->cpu_cnt, buffer);
1879 			pack8(gres_slurmd_conf->config_flags, buffer);
1880 			pack32(gres_slurmd_conf->plugin_id, buffer);
1881 			packstr(gres_slurmd_conf->cpus, buffer);
1882 			packstr(gres_slurmd_conf->links, buffer);
1883 			packstr(gres_slurmd_conf->name, buffer);
1884 			packstr(gres_slurmd_conf->type_name, buffer);
1885 		}
1886 		list_iterator_destroy(iter);
1887 	}
1888 	slurm_mutex_unlock(&gres_context_lock);
1889 
1890 	return rc;
1891 }
1892 
1893 /*
1894  * Unpack this node's configuration from a buffer (built/packed by slurmd)
1895  * IN/OUT buffer - message buffer to unpack
1896  * IN node_name - name of node whose data is being unpacked
1897  */
gres_plugin_node_config_unpack(Buf buffer,char * node_name)1898 extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name)
1899 {
1900 	int i, j, rc;
1901 	uint32_t cpu_cnt = 0, magic = 0, plugin_id = 0, utmp32 = 0;
1902 	uint64_t count64 = 0;
1903 	uint16_t rec_cnt = 0, protocol_version = 0;
1904 	uint8_t config_flags = 0;
1905 	char *tmp_cpus = NULL, *tmp_links = NULL, *tmp_name = NULL;
1906 	char *tmp_type = NULL;
1907 	gres_slurmd_conf_t *p;
1908 
1909 	rc = gres_plugin_init();
1910 
1911 	FREE_NULL_LIST(gres_conf_list);
1912 	gres_conf_list = list_create(destroy_gres_slurmd_conf);
1913 
1914 	safe_unpack16(&protocol_version, buffer);
1915 
1916 	safe_unpack16(&rec_cnt, buffer);
1917 	if (rec_cnt == 0)
1918 		return SLURM_SUCCESS;
1919 	if (rec_cnt > NO_VAL16)
1920 		goto unpack_error;
1921 
1922 	slurm_mutex_lock(&gres_context_lock);
1923 	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
1924 		error("%s: protocol_version %hu not supported",
1925 		      __func__, protocol_version);
1926 		goto unpack_error;
1927 	}
1928 	for (i = 0; i < rec_cnt; i++) {
1929 		if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
1930 			safe_unpack32(&magic, buffer);
1931 			if (magic != GRES_MAGIC)
1932 				goto unpack_error;
1933 
1934 			safe_unpack64(&count64, buffer);
1935 			safe_unpack32(&cpu_cnt, buffer);
1936 			safe_unpack8(&config_flags, buffer);
1937 			safe_unpack32(&plugin_id, buffer);
1938 			safe_unpackstr_xmalloc(&tmp_cpus, &utmp32, buffer);
1939 			safe_unpackstr_xmalloc(&tmp_links, &utmp32, buffer);
1940 			safe_unpackstr_xmalloc(&tmp_name, &utmp32, buffer);
1941 			safe_unpackstr_xmalloc(&tmp_type, &utmp32, buffer);
1942 		}
1943 
1944 		if (slurm_get_debug_flags() & DEBUG_FLAG_GRES) {
1945 			info("Node:%s Gres:%s Type:%s Flags:%s CPU_IDs:%s CPU#:%u Count:%"
1946 			     PRIu64" Links:%s",
1947 			     node_name, tmp_name, tmp_type,
1948 			     gres_flags2str(config_flags), tmp_cpus, cpu_cnt,
1949 			     count64, tmp_links);
1950 		}
1951 	 	for (j = 0; j < gres_context_cnt; j++) {
1952 			bool new_has_file,  new_has_type;
1953 			bool orig_has_file, orig_has_type;
1954 	 		if (gres_context[j].plugin_id != plugin_id)
1955 				continue;
1956 			if (xstrcmp(gres_context[j].gres_name, tmp_name)) {
1957 				/*
1958 				 * Should have been caught in
1959 				 * gres_plugin_init()
1960 				 */
1961 				error("%s: gres/%s duplicate plugin ID with %s, unable to process",
1962 				      __func__, tmp_name,
1963 				      gres_context[j].gres_name);
1964 				continue;
1965 			}
1966 			new_has_file  = config_flags & GRES_CONF_HAS_FILE;
1967 			orig_has_file = gres_context[j].config_flags &
1968 					GRES_CONF_HAS_FILE;
1969 			if (orig_has_file && !new_has_file && count64) {
1970 				error("%s: gres/%s lacks \"File=\" parameter for node %s",
1971 				      __func__, tmp_name, node_name);
1972 				config_flags |= GRES_CONF_HAS_FILE;
1973 			}
1974 			if (new_has_file && (count64 > MAX_GRES_BITMAP)) {
1975 				/*
1976 				 * Avoid over-subscribing memory with
1977 				 * huge bitmaps
1978 				 */
1979 				error("%s: gres/%s has \"File=\" plus very large "
1980 				      "\"Count\" (%"PRIu64") for node %s, "
1981 				      "resetting value to %d",
1982 				      __func__, tmp_name, count64,
1983 				      node_name, MAX_GRES_BITMAP);
1984 				count64 = MAX_GRES_BITMAP;
1985 			}
1986 			new_has_type  = config_flags & GRES_CONF_HAS_TYPE;
1987 			orig_has_type = gres_context[j].config_flags &
1988 					GRES_CONF_HAS_TYPE;
1989 			if (orig_has_type && !new_has_type && count64) {
1990 				error("%s: gres/%s lacks \"Type\" parameter for node %s",
1991 				      __func__, tmp_name, node_name);
1992 				config_flags |= GRES_CONF_HAS_TYPE;
1993 			}
1994 			gres_context[j].config_flags |= config_flags;
1995 
1996 			/*
1997 			 * On the slurmctld we need to load the plugins to
1998 			 * correctly set env vars.  We want to call this only
1999 			 * after we have the config_flags so we can tell if we
2000 			 * are CountOnly or not.
2001 			 */
2002 			if (!(gres_context[j].config_flags &
2003 			      GRES_CONF_LOADED)) {
2004 				(void)_load_gres_plugin(&gres_context[j]);
2005 				gres_context[j].config_flags |=
2006 					GRES_CONF_LOADED;
2007 			}
2008 
2009 			break;
2010 	 	}
2011 		if (j >= gres_context_cnt) {
2012 			/*
2013 			 * GresPlugins is inconsistently configured.
2014 			 * Not a fatal error, but skip this data.
2015 			 */
2016 			error("%s: No plugin configured to process GRES data from node %s (Name:%s Type:%s PluginID:%u Count:%"PRIu64")",
2017 			      __func__, node_name, tmp_name, tmp_type,
2018 			      plugin_id, count64);
2019 			xfree(tmp_cpus);
2020 			xfree(tmp_links);
2021 			xfree(tmp_name);
2022 			xfree(tmp_type);
2023 			continue;
2024 		}
2025 		p = xmalloc(sizeof(gres_slurmd_conf_t));
2026 		p->config_flags = config_flags;
2027 		p->count = count64;
2028 		p->cpu_cnt = cpu_cnt;
2029 		p->cpus = tmp_cpus;
2030 		tmp_cpus = NULL;	/* Nothing left to xfree */
2031 		p->links = tmp_links;
2032 		tmp_links = NULL;	/* Nothing left to xfree */
2033 		p->name = tmp_name;     /* Preserve for accounting! */
2034 		p->type_name = tmp_type;
2035 		tmp_type = NULL;	/* Nothing left to xfree */
2036 		p->plugin_id = plugin_id;
2037 		_validate_links(p);
2038 		list_append(gres_conf_list, p);
2039 	}
2040 
2041 	slurm_mutex_unlock(&gres_context_lock);
2042 	return rc;
2043 
2044 unpack_error:
2045 	error("%s: unpack error from node %s", __func__, node_name);
2046 	xfree(tmp_cpus);
2047 	xfree(tmp_links);
2048 	xfree(tmp_name);
2049 	xfree(tmp_type);
2050 	slurm_mutex_unlock(&gres_context_lock);
2051 	return SLURM_ERROR;
2052 }
2053 
/*
 * Free all topology related arrays of a node's GRES state.
 * The per-entry bitmaps and type names are released first, then the arrays
 * themselves.  topo_cnt is left untouched.
 * IN/OUT gres_node_ptr - node GRES state whose topo_* members are released
 */
static void _gres_node_state_delete_topo(gres_node_state_t *gres_node_ptr)
{
	int i;

	for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
		if (gres_node_ptr->topo_gres_bitmap)
			FREE_NULL_BITMAP(gres_node_ptr->topo_gres_bitmap[i]);
		if (gres_node_ptr->topo_core_bitmap)
			FREE_NULL_BITMAP(gres_node_ptr->topo_core_bitmap[i]);
		/* Same NULL guard as the bitmap arrays above */
		if (gres_node_ptr->topo_type_name)
			xfree(gres_node_ptr->topo_type_name[i]);
	}
	xfree(gres_node_ptr->topo_gres_bitmap);
	xfree(gres_node_ptr->topo_core_bitmap);
	xfree(gres_node_ptr->topo_gres_cnt_alloc);
	xfree(gres_node_ptr->topo_gres_cnt_avail);
	xfree(gres_node_ptr->topo_type_id);
	xfree(gres_node_ptr->topo_type_name);
}
2072 
/*
 * Free a node's GRES state record together with everything it owns:
 * allocation bitmap, usage string, link matrix, topology arrays and the
 * per-type arrays.
 * IN gres_node_ptr - record to destroy
 */
static void _gres_node_state_delete(gres_node_state_t *gres_node_ptr)
{
	int inx;

	FREE_NULL_BITMAP(gres_node_ptr->gres_bit_alloc);
	xfree(gres_node_ptr->gres_used);

	/* Link matrix: one row per entry, link_len rows in total */
	if (gres_node_ptr->links_cnt != NULL) {
		inx = 0;
		while (inx < gres_node_ptr->link_len) {
			xfree(gres_node_ptr->links_cnt[inx]);
			inx++;
		}
		xfree(gres_node_ptr->links_cnt);
	}

	_gres_node_state_delete_topo(gres_node_ptr);

	/* Type names are individually allocated strings */
	inx = 0;
	while (inx < gres_node_ptr->type_cnt) {
		xfree(gres_node_ptr->type_name[inx]);
		inx++;
	}
	xfree(gres_node_ptr->type_cnt_alloc);
	xfree(gres_node_ptr->type_cnt_avail);
	xfree(gres_node_ptr->type_id);
	xfree(gres_node_ptr->type_name);

	xfree(gres_node_ptr);
}
2096 
2097 /*
2098  * Delete an element placed on gres_list by _node_config_validate()
2099  * free associated memory
2100  */
_gres_node_list_delete(void * list_element)2101 static void _gres_node_list_delete(void *list_element)
2102 {
2103 	gres_state_t *gres_ptr;
2104 	gres_node_state_t *gres_node_ptr;
2105 
2106 	gres_ptr = (gres_state_t *) list_element;
2107 	gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
2108 	_gres_node_state_delete(gres_node_ptr);
2109 	xfree(gres_ptr);
2110 }
2111 
/*
 * Credit tmp_gres_cnt GRES to the named type in a node's per-type arrays,
 * appending a new entry when the type is not present yet.
 * The pseudo type "no_consume" (case insensitive) is not stored as a type;
 * it only sets the no_consume flag.
 * IN type - GRES type name
 * IN/OUT gres_data - node GRES state to update
 * IN tmp_gres_cnt - count to add to the type's available count
 */
static void _add_gres_type(char *type, gres_node_state_t *gres_data,
			   uint64_t tmp_gres_cnt)
{
	uint32_t wanted_id;
	int slot;

	if (xstrcasecmp(type, "no_consume") == 0) {
		gres_data->no_consume = true;
		return;
	}

	wanted_id = gres_plugin_build_id(type);

	/* Existing type: just bump its available count */
	for (slot = 0; slot < gres_data->type_cnt; slot++) {
		if (gres_data->type_id[slot] == wanted_id) {
			gres_data->type_cnt_avail[slot] += tmp_gres_cnt;
			return;
		}
	}

	/* New type: grow every per-type array by one, "slot" is the new entry */
	gres_data->type_cnt++;
	gres_data->type_cnt_alloc = xrealloc(gres_data->type_cnt_alloc,
					     sizeof(uint64_t) *
					     gres_data->type_cnt);
	gres_data->type_cnt_avail = xrealloc(gres_data->type_cnt_avail,
					     sizeof(uint64_t) *
					     gres_data->type_cnt);
	gres_data->type_id = xrealloc(gres_data->type_id,
				      sizeof(uint32_t) * gres_data->type_cnt);
	gres_data->type_name = xrealloc(gres_data->type_name,
					sizeof(char *) * gres_data->type_cnt);

	gres_data->type_cnt_avail[slot] += tmp_gres_cnt;
	gres_data->type_id[slot] = wanted_id;
	gres_data->type_name[slot] = xstrdup(type);
}
2150 
2151 /*
2152  * Compute the total GRES count for a particular gres_name.
2153  * Note that a given gres_name can appear multiple times in the orig_config
2154  * string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2").
2155  * IN/OUT gres_data - set gres_cnt_config field in this structure
2156  * IN orig_config - gres configuration from slurm.conf
2157  * IN gres_name - name of the gres type (e.g. "gpu")
2158  * IN gres_name_colon - gres name with appended colon
2159  * IN gres_name_colon_len - size of gres_name_colon
2160  * RET - Total configured count for this GRES type
2161  */
_get_gres_cnt(gres_node_state_t * gres_data,char * orig_config,char * gres_name,char * gres_name_colon,int gres_name_colon_len)2162 static void _get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
2163 			  char *gres_name, char *gres_name_colon,
2164 			  int gres_name_colon_len)
2165 {
2166 	char *node_gres_config, *tok, *last_tok = NULL;
2167 	char *sub_tok, *last_sub_tok = NULL;
2168 	char *num, *paren, *last_num = NULL;
2169 	uint64_t gres_config_cnt = 0, tmp_gres_cnt = 0, mult;
2170 	int i;
2171 
2172 	xassert(gres_data);
2173 	if (orig_config == NULL) {
2174 		gres_data->gres_cnt_config = 0;
2175 		return;
2176 	}
2177 
2178 	for (i = 0; i < gres_data->type_cnt; i++) {
2179 		gres_data->type_cnt_avail[i] = 0;
2180 	}
2181 
2182 	node_gres_config = xstrdup(orig_config);
2183 	tok = strtok_r(node_gres_config, ",", &last_tok);
2184 	while (tok) {
2185 		if (!xstrcmp(tok, gres_name)) {
2186 			gres_config_cnt = 1;
2187 			break;
2188 		}
2189 		if (!xstrncmp(tok, gres_name_colon, gres_name_colon_len)) {
2190 			paren = strrchr(tok, '(');
2191 			if (paren)	/* Ignore socket binding info */
2192 				paren[0] = '\0';
2193 			num = strrchr(tok, ':');
2194 			if (!num) {
2195 				error("Bad GRES configuration: %s", tok);
2196 				break;
2197 			}
2198 			tmp_gres_cnt = strtoll(num + 1, &last_num, 10);
2199 			if ((num[1] < '0') || (num[1] > '9')) {
2200 				/*
2201 				 * Type name, no count (e.g. "gpu:tesla").
2202 				 * assume count of 1.
2203 				 */
2204 				tmp_gres_cnt = 1;
2205 			} else if ((mult = suffix_mult(last_num)) != NO_VAL64) {
2206 				tmp_gres_cnt *= mult;
2207 			} else {
2208 				error("Bad GRES configuration: %s", tok);
2209 				break;
2210 			}
2211 
2212 			gres_config_cnt += tmp_gres_cnt;
2213 			num[0] = '\0';
2214 
2215 			sub_tok = strtok_r(tok, ":", &last_sub_tok);
2216 			if (sub_tok)	/* Skip GRES name */
2217 				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
2218 			while (sub_tok) {
2219 				_add_gres_type(sub_tok, gres_data,
2220 					       tmp_gres_cnt);
2221 				sub_tok = strtok_r(NULL, ":", &last_sub_tok);
2222 			}
2223 		}
2224 		tok = strtok_r(NULL, ",", &last_tok);
2225 	}
2226 	xfree(node_gres_config);
2227 
2228 	gres_data->gres_cnt_config = gres_config_cnt;
2229 }
2230 
/*
 * Check every GRES type's available count against the sum of
 * type_cnt_avail[] over all entries carrying the same type ID.
 * IN gres_name - GRES name, used only in the reason_down message
 * IN/OUT gres_data - node GRES state; with config_overrides set,
 *                    type_cnt_avail[] is overwritten with the computed sum
 * IN config_overrides - adopt the computed sum instead of failing
 * IN/OUT reason_down - if not NULL, a description of the shortfall is appended
 * RET 0 if valid (or no types are defined), -1 if some count is too low
 */
static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
			    bool config_overrides, char **reason_down)
{
	int i, j;
	uint64_t model_cnt;

	if (gres_data->type_cnt == 0)
		return 0;

	for (i = 0; i < gres_data->type_cnt; i++) {
		/*
		 * type_cnt is known to be non-zero here, so the count always
		 * comes from the per-type arrays.  The former fallback to the
		 * topo_* arrays could never execute and has been removed.
		 */
		model_cnt = 0;
		for (j = 0; j < gres_data->type_cnt; j++) {
			if (gres_data->type_id[i] == gres_data->type_id[j])
				model_cnt += gres_data->type_cnt_avail[j];
		}

		if (config_overrides) {
			gres_data->type_cnt_avail[i] = model_cnt;
		} else if (model_cnt < gres_data->type_cnt_avail[i]) {
			if (reason_down) {
				xstrfmtcat(*reason_down,
					   "%s:%s count too low "
					   "(%"PRIu64" < %"PRIu64")",
					   gres_name, gres_data->type_name[i],
					   model_cnt,
					   gres_data->type_cnt_avail[i]);
			}
			return -1;
		}
	}
	return 0;
}
2273 
_build_gres_node_state(void)2274 static gres_node_state_t *_build_gres_node_state(void)
2275 {
2276 	gres_node_state_t *gres_data;
2277 
2278 	gres_data = xmalloc(sizeof(gres_node_state_t));
2279 	gres_data->gres_cnt_config = NO_VAL64;
2280 	gres_data->gres_cnt_found  = NO_VAL64;
2281 
2282 	return gres_data;
2283 }
2284 
2285 /*
2286  * Build a node's gres record based only upon the slurm.conf contents
2287  */
_node_config_init(char * node_name,char * orig_config,slurm_gres_context_t * context_ptr,gres_state_t * gres_ptr)2288 static int _node_config_init(char *node_name, char *orig_config,
2289 			     slurm_gres_context_t *context_ptr,
2290 			     gres_state_t *gres_ptr)
2291 {
2292 	int rc = SLURM_SUCCESS;
2293 	gres_node_state_t *gres_data;
2294 
2295 	if (!gres_ptr->gres_data)
2296 		gres_ptr->gres_data = _build_gres_node_state();
2297 	gres_data = (gres_node_state_t *) gres_ptr->gres_data;
2298 
2299 	/* If the resource isn't configured for use with this node */
2300 	if ((orig_config == NULL) || (orig_config[0] == '\0')) {
2301 		gres_data->gres_cnt_config = 0;
2302 		return rc;
2303 	}
2304 
2305 	_get_gres_cnt(gres_data, orig_config,
2306 		      context_ptr->gres_name,
2307 		      context_ptr->gres_name_colon,
2308 		      context_ptr->gres_name_colon_len);
2309 
2310 	context_ptr->total_cnt += gres_data->gres_cnt_config;
2311 
2312 	/* Use count from recovered state, if higher */
2313 	gres_data->gres_cnt_avail = MAX(gres_data->gres_cnt_avail,
2314 					gres_data->gres_cnt_config);
2315 	if ((gres_data->gres_bit_alloc != NULL) &&
2316 	    (gres_data->gres_cnt_avail >
2317 	     bit_size(gres_data->gres_bit_alloc)) &&
2318 	    !_shared_gres(context_ptr->plugin_id)) {
2319 		gres_data->gres_bit_alloc =
2320 			bit_realloc(gres_data->gres_bit_alloc,
2321 				    gres_data->gres_cnt_avail);
2322 	}
2323 
2324 	return rc;
2325 }
2326 
2327 /*
2328  * Build a node's gres record based only upon the slurm.conf contents
2329  * IN node_name - name of the node for which the gres information applies
2330  * IN orig_config - Gres information supplied from slurm.conf
2331  * IN/OUT gres_list - List of Gres records for this node to track usage
2332  */
gres_plugin_init_node_config(char * node_name,char * orig_config,List * gres_list)2333 extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
2334 					List *gres_list)
2335 {
2336 	int i, rc, rc2;
2337 	ListIterator gres_iter;
2338 	gres_state_t *gres_ptr;
2339 
2340 	rc = gres_plugin_init();
2341 
2342 	slurm_mutex_lock(&gres_context_lock);
2343 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
2344 		*gres_list = list_create(_gres_node_list_delete);
2345 	}
2346 	for (i = 0; i < gres_context_cnt; i++) {
2347 		/* Find or create gres_state entry on the list */
2348 		gres_iter = list_iterator_create(*gres_list);
2349 		while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
2350 			if (gres_ptr->plugin_id == gres_context[i].plugin_id)
2351 				break;
2352 		}
2353 		list_iterator_destroy(gres_iter);
2354 		if (gres_ptr == NULL) {
2355 			gres_ptr = xmalloc(sizeof(gres_state_t));
2356 			gres_ptr->plugin_id = gres_context[i].plugin_id;
2357 			list_append(*gres_list, gres_ptr);
2358 		}
2359 
2360 		rc2 = _node_config_init(node_name, orig_config,
2361 					&gres_context[i], gres_ptr);
2362 		if (rc == SLURM_SUCCESS)
2363 			rc = rc2;
2364 	}
2365 	slurm_mutex_unlock(&gres_context_lock);
2366 
2367 	return rc;
2368 }
2369 
2370 /*
2371  * Determine GRES availability on some node
2372  * plugin_id IN - plugin number to search for
2373  * topo_cnt OUT - count of gres.conf records of this ID found by slurmd
2374  *		  (each can have different topology)
2375  * config_type_cnt OUT - Count of records for this GRES found in configuration,
2376  *		  each of this represesents a different Type of of GRES with
2377  *		  with this name (e.g. GPU model)
2378  * RET - total number of GRES available of this ID on this node in (sum
2379  *	 across all records of this ID)
2380  */
_get_tot_gres_cnt(uint32_t plugin_id,uint64_t * topo_cnt,int * config_type_cnt)2381 static uint64_t _get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
2382 				  int *config_type_cnt)
2383 {
2384 	ListIterator iter;
2385 	gres_slurmd_conf_t *gres_slurmd_conf;
2386 	uint32_t cpu_set_cnt = 0, rec_cnt = 0;
2387 	uint64_t gres_cnt = 0;
2388 
2389 	xassert(config_type_cnt);
2390 	xassert(topo_cnt);
2391 	*config_type_cnt = 0;
2392 	*topo_cnt = 0;
2393 	if (gres_conf_list == NULL)
2394 		return gres_cnt;
2395 
2396 	iter = list_iterator_create(gres_conf_list);
2397 	while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
2398 		if (gres_slurmd_conf->plugin_id != plugin_id)
2399 			continue;
2400 		gres_cnt += gres_slurmd_conf->count;
2401 		rec_cnt++;
2402 		if (gres_slurmd_conf->cpus || gres_slurmd_conf->type_name)
2403 			cpu_set_cnt++;
2404 	}
2405 	list_iterator_destroy(iter);
2406 	*config_type_cnt = rec_cnt;
2407 	if (cpu_set_cnt)
2408 		*topo_cnt = rec_cnt;
2409 	return gres_cnt;
2410 }
2411 
2412 /*
2413  * Map a given GRES type ID back to a GRES type name.
2414  * gres_id IN - GRES type ID to search for.
2415  * gres_name IN - Pre-allocated string in which to store the GRES type name.
2416  * gres_name_len - Size of gres_name in bytes
2417  * RET - error code (currently not used--always return SLURM_SUCCESS)
2418  */
gres_gresid_to_gresname(uint32_t gres_id,char * gres_name,int gres_name_len)2419 extern int gres_gresid_to_gresname(uint32_t gres_id, char* gres_name,
2420 				   int gres_name_len)
2421 {
2422 	int rc = SLURM_SUCCESS;
2423 	int      found = 0;
2424 	int i;
2425 
2426 	/*
2427 	 * Check GresTypes from slurm.conf (gres_context) for GRES type name
2428 	 */
2429 	slurm_mutex_lock(&gres_context_lock);
2430 	for (i = 0; i < gres_context_cnt; ++i) {
2431 		if (gres_id == gres_context[i].plugin_id) {
2432 			strlcpy(gres_name, gres_context[i].gres_name,
2433 				gres_name_len);
2434 			found = 1;
2435 			break;
2436 		}
2437 	}
2438 	slurm_mutex_unlock(&gres_context_lock);
2439 
2440 	/*
2441 	 * If can't find GRES type name, emit error and default to GRES type ID
2442 	 */
2443 	if (!found) {
2444 		error("Could not find GRES type name in slurm.conf that corresponds to GRES type ID `%d`.  Using ID as GRES type name instead.",
2445 		      gres_id);
2446 		snprintf(gres_name, gres_name_len, "%u", gres_id);
2447 	}
2448 
2449 	return rc;
2450 }
2451 
2452 /* Convert comma-delimited array of link counts to an integer array */
_links_str2array(char * links,char * node_name,gres_node_state_t * gres_data,int gres_inx,int gres_cnt)2453 static void _links_str2array(char *links, char *node_name,
2454 			     gres_node_state_t *gres_data,
2455 			     int gres_inx, int gres_cnt)
2456 {
2457 	char *start_ptr, *end_ptr = NULL;
2458 	int i = 0;
2459 
2460 	if (!links)	/* No "Links=" data */
2461 		return;
2462 	if (gres_inx >= gres_data->link_len) {
2463 		error("%s: Invalid GRES index (%d >= %d)", __func__, gres_inx,
2464 		      gres_cnt);
2465 		return;
2466 	}
2467 
2468 	start_ptr = links;
2469 	while (1) {
2470 		gres_data->links_cnt[gres_inx][i] =
2471 			strtol(start_ptr, &end_ptr, 10);
2472 		if (gres_data->links_cnt[gres_inx][i] < -2) {
2473 			error("%s: Invalid GRES Links value (%s) on node %s:"
2474 			      "Link value '%d' < -2", __func__, links,
2475 			      node_name, gres_data->links_cnt[gres_inx][i]);
2476 			gres_data->links_cnt[gres_inx][i] = 0;
2477 			return;
2478 		}
2479 		if (end_ptr[0] == '\0')
2480 			return;
2481 		if (end_ptr[0] != ',') {
2482 			error("%s: Invalid GRES Links value (%s) on node %s:"
2483 			      "end_ptr[0]='%c' != ','", __func__, links,
2484 			      node_name, end_ptr[0]);
2485 			return;
2486 		}
2487 		if (++i >= gres_data->link_len) {
2488 			error("%s: Invalid GRES Links value (%s) on node %s:"
2489 			      "i=%d >= link_len=%d", __func__, links, node_name,
2490 			      i, gres_data->link_len);
2491 			return;
2492 		}
2493 		start_ptr = end_ptr + 1;
2494 	}
2495 }
2496 
/*
 * Check that the topology records supply enough GRES for every type.
 * While summing, a topology record that would push a type above its
 * type_cnt_avail value has its topo_gres_cnt_avail trimmed to fit.
 * IN gres_name - GRES name, used only in the reason_down message
 * IN/OUT gres_data - node GRES state, topo_gres_cnt_avail[] may be reduced
 * IN/OUT reason_down - if not NULL and *reason_down is still NULL, set to a
 *                      description of the first type that falls short
 * RET true if all types are satisfied or there are no types or no topology
 *     records, false otherwise
 */
static bool _valid_gres_types(char *gres_name, gres_node_state_t *gres_data,
			      char **reason_down)
{
	uint64_t type_total, with_rec;
	int t, r;

	if (!gres_data->type_cnt || !gres_data->topo_cnt)
		return true;

	for (t = 0; t < gres_data->type_cnt; t++) {
		type_total = 0;
		for (r = 0; r < gres_data->topo_cnt; r++) {
			if (gres_data->topo_type_id[r] == gres_data->type_id[t]) {
				/* Trim whatever exceeds the type's limit */
				with_rec = type_total +
					   gres_data->topo_gres_cnt_avail[r];
				if (with_rec > gres_data->type_cnt_avail[t]) {
					gres_data->topo_gres_cnt_avail[r] -=
						(with_rec -
						 gres_data->type_cnt_avail[t]);
				}
				type_total += gres_data->topo_gres_cnt_avail[r];
			}
		}

		if (type_total >= gres_data->type_cnt_avail[t])
			continue;

		/* First type that falls short ends the check */
		if (reason_down && !*reason_down) {
			xstrfmtcat(*reason_down,
				   "%s:%s count too low (%"PRIu64" < %"PRIu64")",
				   gres_name, gres_data->type_name[t],
				   type_total, gres_data->type_cnt_avail[t]);
		}
		return false;
	}

	return true;
}
2537 
/*
 * Make a node's gres_bit_alloc bitmap hold exactly gres_bits bits.
 * A size of zero frees the bitmap; a missing bitmap is allocated; an
 * existing bitmap of a different size is reallocated.
 * IN/OUT gres_data - node GRES state owning the bitmap
 * IN gres_bits - wanted bitmap size
 */
static void _gres_bit_alloc_resize(gres_node_state_t *gres_data,
				   uint64_t gres_bits)
{
	if (gres_bits == 0) {
		FREE_NULL_BITMAP(gres_data->gres_bit_alloc);
		return;
	}

	if (gres_data->gres_bit_alloc == NULL) {
		gres_data->gres_bit_alloc = bit_alloc(gres_bits);
		return;
	}

	if (bit_size(gres_data->gres_bit_alloc) != gres_bits) {
		gres_data->gres_bit_alloc =
			bit_realloc(gres_data->gres_bit_alloc, gres_bits);
	}
}
2552 
_node_config_validate(char * node_name,char * orig_config,gres_state_t * gres_ptr,int cpu_cnt,int core_cnt,int sock_cnt,bool config_overrides,char ** reason_down,slurm_gres_context_t * context_ptr)2553 static int _node_config_validate(char *node_name, char *orig_config,
2554 				 gres_state_t *gres_ptr,
2555 				 int cpu_cnt, int core_cnt, int sock_cnt,
2556 				 bool config_overrides, char **reason_down,
2557 				 slurm_gres_context_t *context_ptr)
2558 {
2559 	int cpus_config = 0, i, j, gres_inx, rc = SLURM_SUCCESS;
2560 	int config_type_cnt = 0;
2561 	uint64_t dev_cnt, gres_cnt, topo_cnt = 0;
2562 	bool cpu_config_err = false, updated_config = false;
2563 	gres_node_state_t *gres_data;
2564 	ListIterator iter;
2565 	gres_slurmd_conf_t *gres_slurmd_conf;
2566 	bool has_file, has_type, rebuild_topo = false;
2567 	uint32_t type_id;
2568 
2569 	xassert(core_cnt);
2570 	if (gres_ptr->gres_data == NULL)
2571 		gres_ptr->gres_data = _build_gres_node_state();
2572 	gres_data = (gres_node_state_t *) gres_ptr->gres_data;
2573 	if (gres_data->node_feature)
2574 		return rc;
2575 
2576 	gres_cnt = _get_tot_gres_cnt(context_ptr->plugin_id, &topo_cnt,
2577 				     &config_type_cnt);
2578 	if ((gres_data->gres_cnt_config > gres_cnt) && !config_overrides) {
2579 		if (reason_down && (*reason_down == NULL)) {
2580 			xstrfmtcat(*reason_down,
2581 				   "%s count reported lower than configured "
2582 				   "(%"PRIu64" < %"PRIu64")",
2583 				   context_ptr->gres_type,
2584 				   gres_cnt, gres_data->gres_cnt_config);
2585 		}
2586 		rc = EINVAL;
2587 	}
2588 	if ((gres_cnt > gres_data->gres_cnt_config)) {
2589 		debug("%s: %s: Ignoring excess count on node %s (%"
2590 		      PRIu64" > %"PRIu64")",
2591 		      __func__, context_ptr->gres_type, node_name, gres_cnt,
2592 		      gres_data->gres_cnt_config);
2593 		gres_cnt = gres_data->gres_cnt_config;
2594 	}
2595 	if (gres_data->gres_cnt_found != gres_cnt) {
2596 		if (gres_data->gres_cnt_found != NO_VAL64) {
2597 			info("%s: %s: Count changed on node %s (%"PRIu64" != %"PRIu64")",
2598 			     __func__, context_ptr->gres_type, node_name,
2599 			     gres_data->gres_cnt_found, gres_cnt);
2600 		}
2601 		if ((gres_data->gres_cnt_found != NO_VAL64) &&
2602 		    (gres_data->gres_cnt_alloc != 0)) {
2603 			if (reason_down && (*reason_down == NULL)) {
2604 				xstrfmtcat(*reason_down,
2605 					   "%s count changed and jobs are using them "
2606 					   "(%"PRIu64" != %"PRIu64")",
2607 					   context_ptr->gres_type,
2608 					   gres_data->gres_cnt_found, gres_cnt);
2609 			}
2610 			rc = EINVAL;
2611 		} else {
2612 			gres_data->gres_cnt_found = gres_cnt;
2613 			updated_config = true;
2614 		}
2615 	}
2616 	if (!updated_config && gres_data->type_cnt) {
2617 		/*
2618 		 * This is needed to address the GRES specification in
2619 		 * gres.conf having a Type option, while the GRES specification
2620 		 * in slurm.conf does not.
2621 		 */
2622 		for (i = 0; i < gres_data->type_cnt; i++) {
2623 			if (gres_data->type_cnt_avail[i])
2624 				continue;
2625 			updated_config = true;
2626 			break;
2627 		}
2628 	}
2629 	if (!updated_config)
2630 		return rc;
2631 	if ((gres_cnt > gres_data->gres_cnt_config) && config_overrides) {
2632 		info("%s: %s: count on node %s inconsistent with slurmctld count (%"PRIu64" != %"PRIu64")",
2633 		     __func__, context_ptr->gres_type, node_name,
2634 		     gres_cnt, gres_data->gres_cnt_config);
2635 		gres_cnt = gres_data->gres_cnt_config;	/* Ignore excess GRES */
2636 	}
2637 	if ((topo_cnt == 0) && (topo_cnt != gres_data->topo_cnt)) {
2638 		/* Need to clear topology info */
2639 		_gres_node_state_delete_topo(gres_data);
2640 
2641 		gres_data->topo_cnt = topo_cnt;
2642 	}
2643 
2644 	has_file = context_ptr->config_flags & GRES_CONF_HAS_FILE;
2645 	has_type = context_ptr->config_flags & GRES_CONF_HAS_TYPE;
2646 	if (_shared_gres(context_ptr->plugin_id))
2647 		dev_cnt = topo_cnt;
2648 	else
2649 		dev_cnt = gres_cnt;
2650 	if (has_file && (topo_cnt != gres_data->topo_cnt) && (dev_cnt == 0)) {
2651 		/*
2652 		 * Clear any vestigial GRES node state info.
2653 		 */
2654 		_gres_node_state_delete_topo(gres_data);
2655 
2656 		xfree(gres_data->gres_bit_alloc);
2657 
2658 		gres_data->topo_cnt = 0;
2659 	} else if (has_file && (topo_cnt != gres_data->topo_cnt)) {
2660 		/*
2661 		 * Need to rebuild topology info.
2662 		 * Resize the data structures here.
2663 		 */
2664 		rebuild_topo = true;
2665 		gres_data->topo_gres_cnt_alloc =
2666 			xrealloc(gres_data->topo_gres_cnt_alloc,
2667 				 topo_cnt * sizeof(uint64_t));
2668 		gres_data->topo_gres_cnt_avail =
2669 			xrealloc(gres_data->topo_gres_cnt_avail,
2670 				 topo_cnt * sizeof(uint64_t));
2671 		for (i = 0; i < gres_data->topo_cnt; i++) {
2672 			if (gres_data->topo_gres_bitmap) {
2673 				FREE_NULL_BITMAP(gres_data->
2674 						 topo_gres_bitmap[i]);
2675 			}
2676 			if (gres_data->topo_core_bitmap) {
2677 				FREE_NULL_BITMAP(gres_data->
2678 						 topo_core_bitmap[i]);
2679 			}
2680 			xfree(gres_data->topo_type_name[i]);
2681 		}
2682 		gres_data->topo_gres_bitmap =
2683 			xrealloc(gres_data->topo_gres_bitmap,
2684 				 topo_cnt * sizeof(bitstr_t *));
2685 		gres_data->topo_core_bitmap =
2686 			xrealloc(gres_data->topo_core_bitmap,
2687 				 topo_cnt * sizeof(bitstr_t *));
2688 		gres_data->topo_type_id = xrealloc(gres_data->topo_type_id,
2689 						   topo_cnt * sizeof(uint32_t));
2690 		gres_data->topo_type_name = xrealloc(gres_data->topo_type_name,
2691 						     topo_cnt * sizeof(char *));
2692 		if (gres_data->gres_bit_alloc)
2693 			gres_data->gres_bit_alloc = bit_realloc(
2694 				gres_data->gres_bit_alloc, dev_cnt);
2695 		gres_data->topo_cnt = topo_cnt;
2696 	} else if (_shared_gres(context_ptr->plugin_id) && gres_data->topo_cnt){
2697 		/*
2698 		 * Need to rebuild topology info to recover state after
2699 		 * slurmctld restart with running jobs.
2700 		 */
2701 		rebuild_topo = true;
2702 	}
2703 
2704 	if (rebuild_topo) {
2705 		iter = list_iterator_create(gres_conf_list);
2706 		gres_inx = i = 0;
2707 		while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2708 			list_next(iter))) {
2709 			if (gres_slurmd_conf->plugin_id !=
2710 			    context_ptr->plugin_id)
2711 				continue;
2712 			if ((gres_data->gres_bit_alloc) &&
2713 			    !_shared_gres(context_ptr->plugin_id))
2714 				gres_data->topo_gres_cnt_alloc[i] = 0;
2715 			gres_data->topo_gres_cnt_avail[i] =
2716 					gres_slurmd_conf->count;
2717 			if (gres_slurmd_conf->cpus) {
2718 				bitstr_t *tmp_bitmap;
2719 				tmp_bitmap =
2720 					bit_alloc(gres_slurmd_conf->cpu_cnt);
2721 				bit_unfmt(tmp_bitmap, gres_slurmd_conf->cpus);
2722 				if (gres_slurmd_conf->cpu_cnt == core_cnt) {
2723 					gres_data->topo_core_bitmap[i] =
2724 						tmp_bitmap;
2725 					tmp_bitmap = NULL; /* Nothing to free */
2726 				} else if (gres_slurmd_conf->cpu_cnt ==
2727 					   cpu_cnt) {
2728 					/* Translate CPU to core bitmap */
2729 					int cpus_per_core = cpu_cnt / core_cnt;
2730 					int j, core_inx;
2731 					gres_data->topo_core_bitmap[i] =
2732 						bit_alloc(core_cnt);
2733 					for (j = 0; j < cpu_cnt; j++) {
2734 						if (!bit_test(tmp_bitmap, j))
2735 							continue;
2736 						core_inx = j / cpus_per_core;
2737 						bit_set(gres_data->
2738 							topo_core_bitmap[i],
2739 							core_inx);
2740 					}
2741 				} else if (i == 0) {
2742 					error("%s: %s: invalid GRES cpu count (%u) on node %s",
2743 					      __func__, context_ptr->gres_type,
2744 					      gres_slurmd_conf->cpu_cnt,
2745 					      node_name);
2746 				}
2747 				FREE_NULL_BITMAP(tmp_bitmap);
2748 				cpus_config = core_cnt;
2749 			} else if (cpus_config && !cpu_config_err) {
2750 				cpu_config_err = true;
2751 				error("%s: %s: has CPUs configured for only some of the records on node %s",
2752 				      __func__, context_ptr->gres_type,
2753 				      node_name);
2754 			}
2755 
2756 			if (gres_slurmd_conf->links) {
2757 				if (gres_data->links_cnt &&
2758 				    (gres_data->link_len != gres_cnt)) {
2759 					/* Size changed, need to rebuild */
2760 					for (j = 0; j < gres_data->link_len;j++)
2761 						xfree(gres_data->links_cnt[j]);
2762 					xfree(gres_data->links_cnt);
2763 				}
2764 				if (!gres_data->links_cnt) {
2765 					gres_data->link_len = gres_cnt;
2766 					gres_data->links_cnt =
2767 						xcalloc(gres_cnt,
2768 							sizeof(int *));
2769 					for (j = 0; j < gres_cnt; j++) {
2770 						gres_data->links_cnt[j] =
2771 							xcalloc(gres_cnt,
2772 								sizeof(int));
2773 					}
2774 				}
2775 			}
2776 			if (_shared_gres(gres_slurmd_conf->plugin_id)) {
2777 				/* If running jobs recovered then already set */
2778 				if (!gres_data->topo_gres_bitmap[i]) {
2779 					gres_data->topo_gres_bitmap[i] =
2780 						bit_alloc(dev_cnt);
2781 					bit_set(gres_data->topo_gres_bitmap[i],
2782 						gres_inx);
2783 				}
2784 				gres_inx++;
2785 			} else if (dev_cnt == 0) {
2786 				/*
2787 				 * Slurmd found GRES, but slurmctld can't use
2788 				 * them. Avoid creating zero-size bitmaps.
2789 				 */
2790 				has_file = false;
2791 			} else {
2792 				gres_data->topo_gres_bitmap[i] =
2793 					bit_alloc(dev_cnt);
2794 				for (j = 0; j < gres_slurmd_conf->count; j++) {
2795 					if (gres_inx >= dev_cnt) {
2796 						/* Ignore excess GRES on node */
2797 						break;
2798 					}
2799 					bit_set(gres_data->topo_gres_bitmap[i],
2800 						gres_inx);
2801 					if (gres_data->gres_bit_alloc &&
2802 					    bit_test(gres_data->gres_bit_alloc,
2803 						     gres_inx)) {
2804 					    /* Set by recovered job */
2805 					    gres_data->topo_gres_cnt_alloc[i]++;
2806 					}
2807 					_links_str2array(
2808 							gres_slurmd_conf->links,
2809 							node_name, gres_data,
2810 							gres_inx, gres_cnt);
2811 					gres_inx++;
2812 				}
2813 			}
2814 			gres_data->topo_type_id[i] =
2815 				gres_plugin_build_id(gres_slurmd_conf->
2816 						     type_name);
2817 			gres_data->topo_type_name[i] =
2818 				xstrdup(gres_slurmd_conf->type_name);
2819 			i++;
2820 			if (i >= gres_data->topo_cnt)
2821 				break;
2822 		}
2823 		list_iterator_destroy(iter);
2824 		if (cpu_config_err) {
2825 			/*
2826 			 * Some GRES of this type have "CPUs" configured. Set
2827 			 * topo_core_bitmap for all others with all bits set.
2828 			 */
2829 			iter = list_iterator_create(gres_conf_list);
2830 			while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2831 				list_next(iter))) {
2832 				if (gres_slurmd_conf->plugin_id !=
2833 				    context_ptr->plugin_id)
2834 					continue;
2835 				for (j = 0; j < i; j++) {
2836 					if (gres_data->topo_core_bitmap[j])
2837 						continue;
2838 					gres_data->topo_core_bitmap[j] =
2839 						bit_alloc(cpus_config);
2840 					bit_set_all(gres_data->
2841 						    topo_core_bitmap[j]);
2842 				}
2843 			}
2844 			list_iterator_destroy(iter);
2845 		}
2846 	} else if (!has_file && has_type) {
2847 		/* Add GRES Type information as needed */
2848 		iter = list_iterator_create(gres_conf_list);
2849 		while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
2850 			list_next(iter))) {
2851 			if (gres_slurmd_conf->plugin_id !=
2852 			    context_ptr->plugin_id)
2853 				continue;
2854 			type_id = gres_plugin_build_id(
2855 					gres_slurmd_conf->type_name);
2856 			for (i = 0; i < gres_data->type_cnt; i++) {
2857 				if (type_id == gres_data->type_id[i])
2858 					break;
2859 			}
2860 			if (i < gres_data->type_cnt) {
2861 				/* Update count as needed */
2862 				gres_data->type_cnt_avail[i] =
2863 					gres_slurmd_conf->count;
2864 			} else {
2865 				_add_gres_type(gres_slurmd_conf->type_name,
2866 					       gres_data,
2867 					       gres_slurmd_conf->count);
2868 			}
2869 
2870 		}
2871 		list_iterator_destroy(iter);
2872 	}
2873 
2874 	if ((orig_config == NULL) || (orig_config[0] == '\0'))
2875 		gres_data->gres_cnt_config = 0;
2876 	else if (gres_data->gres_cnt_config == NO_VAL64) {
2877 		/* This should have been filled in by _node_config_init() */
2878 		_get_gres_cnt(gres_data, orig_config,
2879 			      context_ptr->gres_name,
2880 			      context_ptr->gres_name_colon,
2881 			      context_ptr->gres_name_colon_len);
2882 	}
2883 
2884 	gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
2885 
2886 	if (has_file) {
2887 		uint64_t gres_bits;
2888 		if (_shared_gres(context_ptr->plugin_id)) {
2889 			gres_bits = topo_cnt;
2890 		} else {
2891 			if (gres_data->gres_cnt_avail > MAX_GRES_BITMAP) {
2892 				error("%s: %s has \"File\" plus very large \"Count\" "
2893 				      "(%"PRIu64") for node %s, resetting value to %u",
2894 				      __func__, context_ptr->gres_type,
2895 				      gres_data->gres_cnt_avail, node_name,
2896 				      MAX_GRES_BITMAP);
2897 				gres_data->gres_cnt_avail = MAX_GRES_BITMAP;
2898 				gres_data->gres_cnt_found = MAX_GRES_BITMAP;
2899 			}
2900 			gres_bits = gres_data->gres_cnt_avail;
2901 		}
2902 
2903 		_gres_bit_alloc_resize(gres_data, gres_bits);
2904 	}
2905 
2906 	if ((config_type_cnt > 1) &&
2907 	    !_valid_gres_types(context_ptr->gres_type, gres_data, reason_down)){
2908 		rc = EINVAL;
2909 	} else if (!config_overrides &&
2910 		   (gres_data->gres_cnt_found < gres_data->gres_cnt_config)) {
2911 		if (reason_down && (*reason_down == NULL)) {
2912 			xstrfmtcat(*reason_down,
2913 				   "%s count too low (%"PRIu64" < %"PRIu64")",
2914 				   context_ptr->gres_type,
2915 				   gres_data->gres_cnt_found,
2916 				   gres_data->gres_cnt_config);
2917 		}
2918 		rc = EINVAL;
2919 	} else if (_valid_gres_type(context_ptr->gres_type, gres_data,
2920 				    config_overrides, reason_down)) {
2921 		rc = EINVAL;
2922 	} else if (config_overrides && gres_data->topo_cnt &&
2923 		   (gres_data->gres_cnt_found != gres_data->gres_cnt_config)) {
2924 		error("%s on node %s configured for %"PRIu64" resources but "
2925 		      "%"PRIu64" found, ignoring topology support",
2926 		      context_ptr->gres_type, node_name,
2927 		      gres_data->gres_cnt_config, gres_data->gres_cnt_found);
2928 		if (gres_data->topo_core_bitmap) {
2929 			for (i = 0; i < gres_data->topo_cnt; i++) {
2930 				if (gres_data->topo_core_bitmap) {
2931 					FREE_NULL_BITMAP(gres_data->
2932 							 topo_core_bitmap[i]);
2933 				}
2934 				if (gres_data->topo_gres_bitmap) {
2935 					FREE_NULL_BITMAP(gres_data->
2936 							 topo_gres_bitmap[i]);
2937 				}
2938 				xfree(gres_data->topo_type_name[i]);
2939 			}
2940 			xfree(gres_data->topo_core_bitmap);
2941 			xfree(gres_data->topo_gres_bitmap);
2942 			xfree(gres_data->topo_gres_cnt_alloc);
2943 			xfree(gres_data->topo_gres_cnt_avail);
2944 			xfree(gres_data->topo_type_id);
2945 			xfree(gres_data->topo_type_name);
2946 		}
2947 		gres_data->topo_cnt = 0;
2948 	}
2949 
2950 	return rc;
2951 }
2952 
2953 /*
2954  * Validate a node's configuration and put a gres record onto a list
2955  * Called immediately after gres_plugin_node_config_unpack().
2956  * IN node_name - name of the node for which the gres information applies
2957  * IN orig_config - Gres information supplied from merged slurm.conf/gres.conf
2958  * IN/OUT new_config - Updated gres info from slurm.conf
2959  * IN/OUT gres_list - List of Gres records for this node to track usage
2960  * IN threads_per_core - Count of CPUs (threads) per core on this node
2961  * IN cores_per_sock - Count of cores per socket on this node
2962  * IN sock_cnt - Count of sockets on this node
2963  * IN config_overrides - true: Don't validate hardware, use slurm.conf
2964  *                             configuration
2965  *		         false: Validate hardware config, but use slurm.conf
2966  *                              config
2967  * OUT reason_down - set to an explanation of failure, if any, don't set if NULL
2968  */
gres_plugin_node_config_validate(char * node_name,char * orig_config,char ** new_config,List * gres_list,int threads_per_core,int cores_per_sock,int sock_cnt,bool config_overrides,char ** reason_down)2969 extern int gres_plugin_node_config_validate(char *node_name,
2970 					    char *orig_config,
2971 					    char **new_config,
2972 					    List *gres_list,
2973 					    int threads_per_core,
2974 					    int cores_per_sock, int sock_cnt,
2975 					    bool config_overrides,
2976 					    char **reason_down)
2977 {
2978 	int i, rc, rc2;
2979 	gres_state_t *gres_ptr, *gres_gpu_ptr = NULL, *gres_mps_ptr = NULL;
2980 	int core_cnt = sock_cnt * cores_per_sock;
2981 	int cpu_cnt  = core_cnt * threads_per_core;
2982 
2983 	rc = gres_plugin_init();
2984 
2985 	slurm_mutex_lock(&gres_context_lock);
2986 	if ((gres_context_cnt > 0) && (*gres_list == NULL))
2987 		*gres_list = list_create(_gres_node_list_delete);
2988 	for (i = 0; i < gres_context_cnt; i++) {
2989 		/* Find or create gres_state entry on the list */
2990 		gres_ptr = list_find_first(*gres_list, _gres_find_id,
2991 		                           &gres_context[i].plugin_id);
2992 		if (gres_ptr == NULL) {
2993 			gres_ptr = xmalloc(sizeof(gres_state_t));
2994 			gres_ptr->plugin_id = gres_context[i].plugin_id;
2995 			list_append(*gres_list, gres_ptr);
2996 		}
2997 		rc2 = _node_config_validate(node_name, orig_config,
2998 					    gres_ptr, cpu_cnt, core_cnt,
2999 					    sock_cnt, config_overrides,
3000 					    reason_down, &gres_context[i]);
3001 		rc = MAX(rc, rc2);
3002 		if (gres_ptr->plugin_id == gpu_plugin_id)
3003 			gres_gpu_ptr = gres_ptr;
3004 		else if (gres_ptr->plugin_id == mps_plugin_id)
3005 			gres_mps_ptr = gres_ptr;
3006 	}
3007 	_sync_node_mps_to_gpu(gres_mps_ptr, gres_gpu_ptr);
3008 	_build_node_gres_str(gres_list, new_config, cores_per_sock, sock_cnt);
3009 	slurm_mutex_unlock(&gres_context_lock);
3010 
3011 	return rc;
3012 }
3013 
/*
 * Convert number to new value with suffix (e.g. 2048 -> 2K).
 *
 * The value is divided by 1024 for as long as it is a non-zero exact multiple
 * of 1024, at most four times, so the largest suffix produced is "T".
 *
 * IN gres_size - value to scale
 * OUT gres_scaled - value left after the divisions
 * OUT suffix - "", "K", "M", "G" or "T"; points at static storage, do not free
 */
static void _gres_scale_value(uint64_t gres_size, uint64_t *gres_scaled,
			      char **suffix)
{
	static char *unit_names[] = { "", "K", "M", "G", "T" };
	uint64_t scaled = gres_size;
	int unit = 0;

	while ((unit < 4) && (scaled != 0) && ((scaled % 1024) == 0)) {
		scaled /= 1024;
		unit++;
	}

	*gres_scaled = scaled;
	*suffix = unit_names[unit];
}
3041 
3042 /*
3043  * Add a GRES from node_feature plugin
3044  * IN node_name - name of the node for which the gres information applies
3045  * IN gres_name - name of the GRES being added or updated from the plugin
3046  * IN gres_size - count of this GRES on this node
3047  * IN/OUT new_config - Updated GRES info from slurm.conf
3048  * IN/OUT gres_list - List of GRES records for this node to track usage
3049  */
gres_plugin_node_feature(char * node_name,char * gres_name,uint64_t gres_size,char ** new_config,List * gres_list)3050 extern void gres_plugin_node_feature(char *node_name,
3051 				     char *gres_name, uint64_t gres_size,
3052 				     char **new_config, List *gres_list)
3053 {
3054 	char *new_gres = NULL, *tok, *save_ptr = NULL, *sep = "", *suffix = "";
3055 	gres_state_t *gres_ptr;
3056 	gres_node_state_t *gres_node_ptr;
3057 	uint32_t plugin_id;
3058 	uint64_t gres_scaled = 0;
3059 	int gres_name_len;
3060 
3061 	xassert(gres_name);
3062 	gres_name_len = strlen(gres_name);
3063 	plugin_id = gres_plugin_build_id(gres_name);
3064 	if (*new_config) {
3065 		tok = strtok_r(*new_config, ",", &save_ptr);
3066 		while (tok) {
3067 			if (!strncmp(tok, gres_name, gres_name_len) &&
3068 			    ((tok[gres_name_len] == ':') ||
3069 			     (tok[gres_name_len] == '\0'))) {
3070 				/* Skip this record */
3071 			} else {
3072 				xstrfmtcat(new_gres, "%s%s", sep, tok);
3073 				sep = ",";
3074 			}
3075 			tok = strtok_r(NULL, ",", &save_ptr);
3076 		}
3077 	}
3078 	_gres_scale_value(gres_size, &gres_scaled, &suffix);
3079 	xstrfmtcat(new_gres, "%s%s:%"PRIu64"%s",
3080 		   sep, gres_name, gres_scaled, suffix);
3081 	xfree(*new_config);
3082 	*new_config = new_gres;
3083 
3084 	slurm_mutex_lock(&gres_context_lock);
3085 	if (gres_context_cnt > 0) {
3086 		if (*gres_list == NULL)
3087 			*gres_list = list_create(_gres_node_list_delete);
3088 		gres_ptr = list_find_first(*gres_list, _gres_find_id,
3089 		                           &plugin_id);
3090 		if (gres_ptr == NULL) {
3091 			gres_ptr = xmalloc(sizeof(gres_state_t));
3092 			gres_ptr->plugin_id = plugin_id;
3093 			gres_ptr->gres_data = _build_gres_node_state();
3094 			list_append(*gres_list, gres_ptr);
3095 		}
3096 		gres_node_ptr = gres_ptr->gres_data;
3097 		if (gres_size >= gres_node_ptr->gres_cnt_alloc) {
3098 			gres_node_ptr->gres_cnt_avail = gres_size -
3099 						gres_node_ptr->gres_cnt_alloc;
3100 		} else {
3101 			error("%s: Changed size count of GRES %s from %"PRIu64
3102 			      " to %"PRIu64", resource over allocated",
3103 			      __func__, gres_name,
3104 			      gres_node_ptr->gres_cnt_avail, gres_size);
3105 			gres_node_ptr->gres_cnt_avail = 0;
3106 		}
3107 		gres_node_ptr->gres_cnt_config = gres_size;
3108 		gres_node_ptr->gres_cnt_found = gres_size;
3109 		gres_node_ptr->node_feature = true;
3110 	}
3111 	slurm_mutex_unlock(&gres_context_lock);
3112 }
3113 
3114 /*
3115  * Check validity of a GRES change. Specifically if a GRES type has "Files"
3116  * configured then the only valid new counts are the current count or zero
3117  *
3118  * RET true of the requested change is valid
3119  */
_node_reconfig_test(char * node_name,char * new_gres,gres_state_t * gres_ptr,slurm_gres_context_t * context_ptr)3120 static int _node_reconfig_test(char *node_name, char *new_gres,
3121 			       gres_state_t *gres_ptr,
3122 			       slurm_gres_context_t *context_ptr)
3123 {
3124 	gres_node_state_t *orig_gres_data, *new_gres_data;
3125 	int rc = SLURM_SUCCESS;
3126 
3127 	xassert(gres_ptr);
3128 	if (!(context_ptr->config_flags & GRES_CONF_HAS_FILE))
3129 		return SLURM_SUCCESS;
3130 
3131 	orig_gres_data = gres_ptr->gres_data;
3132 	new_gres_data = _build_gres_node_state();
3133 	_get_gres_cnt(new_gres_data, new_gres,
3134 		      context_ptr->gres_name,
3135 		      context_ptr->gres_name_colon,
3136 		      context_ptr->gres_name_colon_len);
3137 	if ((new_gres_data->gres_cnt_config != 0) &&
3138 	    (new_gres_data->gres_cnt_config !=
3139 	     orig_gres_data->gres_cnt_config)) {
3140 		error("Attempt to change gres/%s Count on node %s from %"
3141 		      PRIu64" to %"PRIu64" invalid with File configuration",
3142 		      context_ptr->gres_name, node_name,
3143 		      orig_gres_data->gres_cnt_config,
3144 		      new_gres_data->gres_cnt_config);
3145 		rc = ESLURM_INVALID_GRES;
3146 	}
3147 	_gres_node_state_delete(new_gres_data);
3148 
3149 	return rc;
3150 }
3151 
/*
 * Apply a new GRES specification to one GRES record of a node.
 *
 * The configured count is re-read from new_gres. If it changed, the plugin's
 * total count and the node's available count are updated and the allocation
 * bitmaps are resized to match.
 *
 * IN node_name - name of the node, used only for logging
 * IN new_gres - updated GRES specification
 * IN/OUT gres_str - not referenced by this function
 * IN/OUT gres_ptr - node's record for this GRES, its gres_data is built here
 *                   if NULL
 * IN config_overrides - not referenced by this function
 * IN/OUT context_ptr - plugin context, total_cnt is adjusted
 * OUT updated_gpu_cnt - set to true only if the allocation bitmap of a GRES
 *                       without GRES_CONF_HAS_FILE, for which _sharing_gres()
 *                       is true, was resized; false otherwise
 * RET SLURM_SUCCESS in all cases
 */
static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
			  gres_state_t *gres_ptr, bool config_overrides,
			  slurm_gres_context_t *context_ptr,
			  bool *updated_gpu_cnt)
{
	gres_node_state_t *node_state;
	uint64_t prev_cnt, bit_cnt;
	int t;

	xassert(gres_ptr);
	xassert(updated_gpu_cnt);
	*updated_gpu_cnt = false;
	if (!gres_ptr->gres_data)
		gres_ptr->gres_data = _build_gres_node_state();
	node_state = gres_ptr->gres_data;
	prev_cnt = node_state->gres_cnt_config;

	_get_gres_cnt(node_state, new_gres,
		      context_ptr->gres_name,
		      context_ptr->gres_name_colon,
		      context_ptr->gres_name_colon_len);

	if (node_state->gres_cnt_config == prev_cnt)
		return SLURM_SUCCESS;	/* No change in count */

	/* Replace this node's old contribution to the plugin total */
	context_ptr->total_cnt -= prev_cnt;
	context_ptr->total_cnt += node_state->gres_cnt_config;

	if (node_state->gres_cnt_config == 0)
		node_state->gres_cnt_avail = 0;
	else if (node_state->gres_cnt_found != NO_VAL64)
		node_state->gres_cnt_avail = node_state->gres_cnt_found;
	else if (node_state->gres_cnt_avail == NO_VAL64)
		node_state->gres_cnt_avail = 0;

	if (context_ptr->config_flags & GRES_CONF_HAS_FILE) {
		bit_cnt = _shared_gres(context_ptr->plugin_id) ?
			  node_state->topo_cnt : node_state->gres_cnt_avail;
		_gres_bit_alloc_resize(node_state, bit_cnt);
		return SLURM_SUCCESS;
	}

	if (!node_state->gres_bit_alloc ||
	    _shared_gres(context_ptr->plugin_id))
		return SLURM_SUCCESS;

	/*
	 * If GRES count changed in configuration between reboots,
	 * update bitmap sizes as needed.
	 */
	bit_cnt = node_state->gres_cnt_avail;
	if (bit_cnt == bit_size(node_state->gres_bit_alloc))
		return SLURM_SUCCESS;

	info("gres/%s count changed on node %s to %"PRIu64,
	     context_ptr->gres_name, node_name, bit_cnt);
	if (_sharing_gres(context_ptr->plugin_id))
		*updated_gpu_cnt = true;
	node_state->gres_bit_alloc = bit_realloc(node_state->gres_bit_alloc,
						 bit_cnt);
	for (t = 0; t < node_state->topo_cnt; t++) {
		if (!node_state->topo_gres_bitmap ||
		    !node_state->topo_gres_bitmap[t] ||
		    (bit_cnt == bit_size(node_state->topo_gres_bitmap[t])))
			continue;
		node_state->topo_gres_bitmap[t] =
			bit_realloc(node_state->topo_gres_bitmap[t], bit_cnt);
	}

	return SLURM_SUCCESS;
}
3226 
3227 /* The GPU count on a node changed. Update MPS data structures to match */
_sync_node_mps_to_gpu(gres_state_t * mps_gres_ptr,gres_state_t * gpu_gres_ptr)3228 static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
3229 				  gres_state_t *gpu_gres_ptr)
3230 {
3231 	gres_node_state_t *gpu_gres_data, *mps_gres_data;
3232 	uint64_t gpu_cnt, mps_alloc = 0, mps_rem;
3233 	int i;
3234 
3235 	if (!gpu_gres_ptr || !mps_gres_ptr)
3236 		return;
3237 
3238 	gpu_gres_data = gpu_gres_ptr->gres_data;
3239 	mps_gres_data = mps_gres_ptr->gres_data;
3240 
3241 	gpu_cnt = gpu_gres_data->gres_cnt_avail;
3242 	if (mps_gres_data->gres_bit_alloc) {
3243 		if (gpu_cnt == bit_size(mps_gres_data->gres_bit_alloc))
3244 			return;		/* No change for gres/mps */
3245 	}
3246 
3247 	if (gpu_cnt == 0)
3248 		return;			/* Still no GPUs */
3249 
3250 	/* Free any excess gres/mps topo records */
3251 	for (i = gpu_cnt; i < mps_gres_data->topo_cnt; i++) {
3252 		if (mps_gres_data->topo_core_bitmap)
3253 			FREE_NULL_BITMAP(mps_gres_data->topo_core_bitmap[i]);
3254 		if (mps_gres_data->topo_gres_bitmap)
3255 			FREE_NULL_BITMAP(mps_gres_data->topo_gres_bitmap[i]);
3256 		xfree(mps_gres_data->topo_type_name[i]);
3257 	}
3258 
3259 	if (mps_gres_data->gres_cnt_avail == 0) {
3260 		/* No gres/mps on this node */
3261 		mps_gres_data->topo_cnt = 0;
3262 		return;
3263 	}
3264 
3265 	if (!mps_gres_data->gres_bit_alloc) {
3266 		mps_gres_data->gres_bit_alloc = bit_alloc(gpu_cnt);
3267 	} else {
3268 		mps_gres_data->gres_bit_alloc =
3269 				bit_realloc(mps_gres_data->gres_bit_alloc,
3270 					    gpu_cnt);
3271 	}
3272 
3273 	/* Add any additional required gres/mps topo records */
3274 	if (mps_gres_data->topo_cnt) {
3275 		mps_gres_data->topo_core_bitmap =
3276 			xrealloc(mps_gres_data->topo_core_bitmap,
3277 				 sizeof(bitstr_t *) * gpu_cnt);
3278 		mps_gres_data->topo_gres_bitmap =
3279 			xrealloc(mps_gres_data->topo_gres_bitmap,
3280 				 sizeof(bitstr_t *) * gpu_cnt);
3281 		mps_gres_data->topo_gres_cnt_alloc =
3282 			xrealloc(mps_gres_data->topo_gres_cnt_alloc,
3283 				 sizeof(uint64_t) * gpu_cnt);
3284 		mps_gres_data->topo_gres_cnt_avail =
3285 			xrealloc(mps_gres_data->topo_gres_cnt_avail,
3286 				 sizeof(uint64_t) * gpu_cnt);
3287 		mps_gres_data->topo_type_id =
3288 			xrealloc(mps_gres_data->topo_type_id,
3289 				 sizeof(uint32_t) * gpu_cnt);
3290 		mps_gres_data->topo_type_name =
3291 			xrealloc(mps_gres_data->topo_type_name,
3292 				 sizeof(char *) * gpu_cnt);
3293 	} else {
3294 		mps_gres_data->topo_core_bitmap =
3295 			xcalloc(gpu_cnt, sizeof(bitstr_t *));
3296 		mps_gres_data->topo_gres_bitmap =
3297 			xcalloc(gpu_cnt, sizeof(bitstr_t *));
3298 		mps_gres_data->topo_gres_cnt_alloc =
3299 			xcalloc(gpu_cnt, sizeof(uint64_t));
3300 		mps_gres_data->topo_gres_cnt_avail =
3301 			xcalloc(gpu_cnt, sizeof(uint64_t));
3302 		mps_gres_data->topo_type_id =
3303 			xcalloc(gpu_cnt, sizeof(uint32_t));
3304 		mps_gres_data->topo_type_name =
3305 			xcalloc(gpu_cnt, sizeof(char *));
3306 	}
3307 
3308 	/*
3309 	 * Evenly distribute any remaining MPS counts.
3310 	 * Counts get reset as needed when the node registers.
3311 	 */
3312 	for (i = 0; i < mps_gres_data->topo_cnt; i++)
3313 		mps_alloc += mps_gres_data->topo_gres_cnt_avail[i];
3314 	if (mps_alloc >= mps_gres_data->gres_cnt_avail)
3315 		mps_rem = 0;
3316 	else
3317 		mps_rem = mps_gres_data->gres_cnt_avail - mps_alloc;
3318 	for (i = mps_gres_data->topo_cnt; i < gpu_cnt; i++) {
3319 		mps_gres_data->topo_gres_bitmap[i] = bit_alloc(gpu_cnt);
3320 		bit_set(mps_gres_data->topo_gres_bitmap[i], i);
3321 		mps_alloc = mps_rem / (gpu_cnt - i);
3322 		mps_gres_data->topo_gres_cnt_avail[i] = mps_alloc;
3323 		mps_rem -= mps_alloc;
3324 	}
3325 	mps_gres_data->topo_cnt = gpu_cnt;
3326 
3327 	for (i = 0; i < mps_gres_data->topo_cnt; i++) {
3328 		if (mps_gres_data->topo_gres_bitmap &&
3329 		    mps_gres_data->topo_gres_bitmap[i] &&
3330 		    (gpu_cnt != bit_size(mps_gres_data->topo_gres_bitmap[i]))) {
3331 			mps_gres_data->topo_gres_bitmap[i] =
3332 				bit_realloc(mps_gres_data->topo_gres_bitmap[i],
3333 					    gpu_cnt);
3334 		}
3335 	}
3336 }
3337 
3338 /* Convert core bitmap into socket string, xfree return value */
_core_bitmap2str(bitstr_t * core_map,int cores_per_sock,int sock_per_node)3339 static char *_core_bitmap2str(bitstr_t *core_map, int cores_per_sock,
3340 			      int sock_per_node)
3341 {
3342 	char *sock_info = NULL, tmp[256];
3343 	bitstr_t *sock_map;
3344 	int c, s, core_offset, max_core;
3345 	bool any_set = false;
3346 
3347 	xassert(core_map);
3348 	max_core = bit_size(core_map) - 1;
3349 	sock_map = bit_alloc(sock_per_node);
3350 	for (s = 0; s < sock_per_node; s++) {
3351 		core_offset = s * cores_per_sock;
3352 		for (c = 0; c < cores_per_sock; c++) {
3353 			if (core_offset > max_core) {
3354 				error("%s: bad core offset (%d >= %d)",
3355 				      __func__, core_offset, max_core);
3356 				break;
3357 			}
3358 			if (bit_test(core_map, core_offset++)) {
3359 				bit_set(sock_map, s);
3360 				any_set = true;
3361 				break;
3362 			}
3363 		}
3364 	}
3365 	if (any_set) {
3366 		bit_fmt(tmp, sizeof(tmp), sock_map);
3367 		xstrfmtcat(sock_info, "(S:%s)", tmp);
3368 	} else {
3369 		/* We have a core bitmap with no bits set */
3370 		sock_info = xstrdup("");
3371 	}
3372 	bit_free(sock_map);
3373 
3374 	return sock_info;
3375 }
3376 
/*
 * Given a count, modify it as needed and return suffix (e.g. "M" for mega)
 *
 * The count is divided by the largest power of 1024, up to 1024^5, of which
 * it is an exact multiple. A count of zero is left untouched.
 *
 * IN/OUT count - value to scale in place
 * RET "P", "T", "G", "M", "K" or "" matching the division applied; the string
 *     has static storage and must not be freed
 */
static char *_get_suffix(uint64_t *count)
{
	static char *labels[] = { "P", "T", "G", "M", "K" };
	uint64_t divisor = (uint64_t)1 << 50;	/* 1024^5 */
	int i;

	if (*count == 0)
		return "";

	/* Try the largest unit first, dropping one power of 1024 per step */
	for (i = 0; i < 5; i++, divisor >>= 10) {
		if ((*count % divisor) == 0) {
			*count /= divisor;
			return labels[i];
		}
	}

	return "";
}
3401 
3402 /* Build node's GRES string based upon data in that node's GRES list */
_build_node_gres_str(List * gres_list,char ** gres_str,int cores_per_sock,int sock_per_node)3403 static void _build_node_gres_str(List *gres_list, char **gres_str,
3404 				 int cores_per_sock, int sock_per_node)
3405 {
3406 	gres_state_t *gres_ptr;
3407 	gres_node_state_t *gres_node_state;
3408 	bitstr_t *done_topo, *core_map;
3409 	uint64_t gres_sum;
3410 	char *sep = "", *suffix, *sock_info = NULL, *sock_str;
3411 	int c, i, j;
3412 
3413 	xassert(gres_str);
3414 	xfree(*gres_str);
3415 	for (c = 0; c < gres_context_cnt; c++) {
3416 		/* Find gres_state entry on the list */
3417 		gres_ptr = list_find_first(*gres_list, _gres_find_id,
3418 		                           &gres_context[c].plugin_id);
3419 		if (gres_ptr == NULL)
3420 			continue;	/* Node has none of this GRES */
3421 
3422 		gres_node_state = (gres_node_state_t *) gres_ptr->gres_data;
3423 		if (gres_node_state->topo_cnt &&
3424 		    gres_node_state->gres_cnt_avail) {
3425 			done_topo = bit_alloc(gres_node_state->topo_cnt);
3426 			for (i = 0; i < gres_node_state->topo_cnt; i++) {
3427 				if (bit_test(done_topo, i))
3428 					continue;
3429 				bit_set(done_topo, i);
3430 				gres_sum = gres_node_state->
3431 					   topo_gres_cnt_avail[i];
3432 				if (gres_node_state->topo_core_bitmap[i]) {
3433 					core_map = bit_copy(
3434 							gres_node_state->
3435 							topo_core_bitmap[i]);
3436 				} else
3437 					core_map = NULL;
3438 				for (j = 0; j < gres_node_state->topo_cnt; j++){
3439 					if (gres_node_state->topo_type_id[i] !=
3440 					    gres_node_state->topo_type_id[j])
3441 						continue;
3442 					if (bit_test(done_topo, j))
3443 						continue;
3444 					bit_set(done_topo, j);
3445 					gres_sum += gres_node_state->
3446 						    topo_gres_cnt_avail[j];
3447 					if (core_map &&
3448 					    gres_node_state->
3449 					    topo_core_bitmap[j]) {
3450 						bit_or(core_map,
3451 						       gres_node_state->
3452 						       topo_core_bitmap[j]);
3453 					} else if (gres_node_state->
3454 						   topo_core_bitmap[j]) {
3455 						core_map = bit_copy(
3456 							   gres_node_state->
3457 							   topo_core_bitmap[j]);
3458 					}
3459 				}
3460 				if (core_map) {
3461 					sock_info = _core_bitmap2str(core_map,
3462 							cores_per_sock,
3463 							sock_per_node);
3464 					bit_free(core_map);
3465 					sock_str = sock_info;
3466 				} else
3467 					sock_str = "";
3468 				suffix = _get_suffix(&gres_sum);
3469 				if (gres_node_state->topo_type_name[i]) {
3470 					xstrfmtcat(*gres_str,
3471 						   "%s%s:%s:%"PRIu64"%s%s", sep,
3472 						   gres_context[c].gres_name,
3473 						   gres_node_state->
3474 						   topo_type_name[i],
3475 						   gres_sum, suffix, sock_str);
3476 				} else {
3477 					xstrfmtcat(*gres_str,
3478 						   "%s%s:%"PRIu64"%s%s", sep,
3479 						   gres_context[c].gres_name,
3480 						   gres_sum, suffix, sock_str);
3481 				}
3482 				xfree(sock_info);
3483 				sep = ",";
3484 			}
3485 			bit_free(done_topo);
3486 		} else if (gres_node_state->type_cnt &&
3487 			   gres_node_state->gres_cnt_avail) {
3488 			for (i = 0; i < gres_node_state->type_cnt; i++) {
3489 				gres_sum = gres_node_state->type_cnt_avail[i];
3490 				suffix = _get_suffix(&gres_sum);
3491 				xstrfmtcat(*gres_str, "%s%s:%s:%"PRIu64"%s",
3492 					   sep, gres_context[c].gres_name,
3493 					   gres_node_state->type_name[i],
3494 					   gres_sum, suffix);
3495 				sep = ",";
3496 			}
3497 		} else if (gres_node_state->gres_cnt_avail) {
3498 			gres_sum = gres_node_state->gres_cnt_avail;
3499 			suffix = _get_suffix(&gres_sum);
3500 			xstrfmtcat(*gres_str, "%s%s:%"PRIu64"%s",
3501 				   sep, gres_context[c].gres_name,
3502 				   gres_sum, suffix);
3503 			sep = ",";
3504 		}
3505 	}
3506 }
3507 
3508 /*
3509  * Note that a node's configuration has been modified (e.g. "scontrol update ..")
3510  * IN node_name - name of the node for which the gres information applies
3511  * IN new_gres - Updated GRES information supplied from slurm.conf or scontrol
3512  * IN/OUT gres_str - Node's current GRES string, updated as needed
3513  * IN/OUT gres_list - List of Gres records for this node to track usage
3514  * IN config_overrides - true: Don't validate hardware, use slurm.conf
3515  *                             configuration
3516  *		         false: Validate hardware config, but use slurm.conf
3517  *                              config
3518  * IN cores_per_sock - Number of cores per socket on this node
3519  * IN sock_per_node - Total count of sockets on this node (on any board)
3520  */
gres_plugin_node_reconfig(char * node_name,char * new_gres,char ** gres_str,List * gres_list,bool config_overrides,int cores_per_sock,int sock_per_node)3521 extern int gres_plugin_node_reconfig(char *node_name,
3522 				     char *new_gres,
3523 				     char **gres_str,
3524 				     List *gres_list,
3525 				     bool config_overrides,
3526 				     int cores_per_sock,
3527 				     int sock_per_node)
3528 {
3529 	int i, rc;
3530 	ListIterator gres_iter;
3531 	gres_state_t *gres_ptr = NULL, **gres_ptr_array;
3532 	gres_state_t *gpu_gres_ptr = NULL, *mps_gres_ptr;
3533 
3534 	rc = gres_plugin_init();
3535 	slurm_mutex_lock(&gres_context_lock);
3536 	gres_ptr_array = xcalloc(gres_context_cnt, sizeof(gres_state_t *));
3537 	if ((gres_context_cnt > 0) && (*gres_list == NULL))
3538 		*gres_list = list_create(_gres_node_list_delete);
3539 
3540 	/* First validate all of the requested GRES changes */
3541 	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
3542 		/* Find gres_state entry on the list */
3543 		gres_ptr = list_find_first(*gres_list, _gres_find_id,
3544 		                           &gres_context[i].plugin_id);
3545 		if (gres_ptr == NULL)
3546 			continue;
3547 		gres_ptr_array[i] = gres_ptr;
3548 		rc = _node_reconfig_test(node_name, new_gres, gres_ptr,
3549 					 &gres_context[i]);
3550 	}
3551 
3552 	/* Now update the GRES counts */
3553 	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
3554 		bool updated_gpu_cnt = false;
3555 		if (gres_ptr_array[i] == NULL)
3556 			continue;
3557 		rc = _node_reconfig(node_name, new_gres, gres_str,
3558 				    gres_ptr_array[i], config_overrides,
3559 				    &gres_context[i], &updated_gpu_cnt);
3560 		if (updated_gpu_cnt)
3561 			gpu_gres_ptr = gres_ptr;
3562 	}
3563 
3564 	/* Now synchronize gres/gpu and gres/mps state */
3565 	if (gpu_gres_ptr && have_mps) {
3566 		/* Update gres/mps counts and bitmaps to match gres/gpu */
3567 		gres_iter = list_iterator_create(*gres_list);
3568 		while ((mps_gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3569 			if (_shared_gres(mps_gres_ptr->plugin_id))
3570 				break;
3571 		}
3572 		list_iterator_destroy(gres_iter);
3573 		_sync_node_mps_to_gpu(mps_gres_ptr, gpu_gres_ptr);
3574 	}
3575 
3576 	/* Build new per-node gres_str */
3577 	_build_node_gres_str(gres_list, gres_str, cores_per_sock,sock_per_node);
3578 	slurm_mutex_unlock(&gres_context_lock);
3579 	xfree(gres_ptr_array);
3580 
3581 	return rc;
3582 }
3583 
3584 /*
3585  * Pack a node's current gres status, called from slurmctld for save/restore
3586  * IN gres_list - generated by gres_plugin_node_config_validate()
3587  * IN/OUT buffer - location to write state to
3588  * IN node_name - name of the node for which the gres information applies
3589  */
gres_plugin_node_state_pack(List gres_list,Buf buffer,char * node_name)3590 extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
3591 				       char *node_name)
3592 {
3593 	int rc = SLURM_SUCCESS;
3594 	uint32_t top_offset, tail_offset;
3595 	uint32_t magic = GRES_MAGIC;
3596 	uint16_t gres_bitmap_size, rec_cnt = 0;
3597 	ListIterator gres_iter;
3598 	gres_state_t *gres_ptr;
3599 	gres_node_state_t *gres_node_ptr;
3600 
3601 	if (gres_list == NULL) {
3602 		pack16(rec_cnt, buffer);
3603 		return rc;
3604 	}
3605 
3606 	top_offset = get_buf_offset(buffer);
3607 	pack16(rec_cnt, buffer);	/* placeholder if data */
3608 
3609 	(void) gres_plugin_init();
3610 
3611 	slurm_mutex_lock(&gres_context_lock);
3612 	gres_iter = list_iterator_create(gres_list);
3613 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3614 		gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
3615 		pack32(magic, buffer);
3616 		pack32(gres_ptr->plugin_id, buffer);
3617 		pack64(gres_node_ptr->gres_cnt_avail, buffer);
3618 		/*
3619 		 * Just note if gres_bit_alloc exists.
3620 		 * Rebuild it based upon the state of recovered jobs
3621 		 */
3622 		if (gres_node_ptr->gres_bit_alloc)
3623 			gres_bitmap_size = bit_size(gres_node_ptr->gres_bit_alloc);
3624 		else
3625 			gres_bitmap_size = 0;
3626 		pack16(gres_bitmap_size, buffer);
3627 		rec_cnt++;
3628 	}
3629 	list_iterator_destroy(gres_iter);
3630 	slurm_mutex_unlock(&gres_context_lock);
3631 
3632 	tail_offset = get_buf_offset(buffer);
3633 	set_buf_offset(buffer, top_offset);
3634 	pack16(rec_cnt, buffer);
3635 	set_buf_offset(buffer, tail_offset);
3636 
3637 	return rc;
3638 }
3639 
3640 /*
3641  * Unpack a node's current gres status, called from slurmctld for save/restore
3642  * OUT gres_list - restored state stored by gres_plugin_node_state_pack()
3643  * IN/OUT buffer - location to read state from
3644  * IN node_name - name of the node for which the gres information applies
3645  */
gres_plugin_node_state_unpack(List * gres_list,Buf buffer,char * node_name,uint16_t protocol_version)3646 extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
3647 					 char *node_name,
3648 					 uint16_t protocol_version)
3649 {
3650 	int i, rc;
3651 	uint32_t magic = 0, plugin_id = 0;
3652 	uint64_t gres_cnt_avail = 0;
3653 	uint16_t gres_bitmap_size = 0, rec_cnt = 0;
3654 	uint8_t  has_bitmap = 0;
3655 	gres_state_t *gres_ptr;
3656 	gres_node_state_t *gres_node_ptr;
3657 
3658 	safe_unpack16(&rec_cnt, buffer);
3659 	if (rec_cnt == 0)
3660 		return SLURM_SUCCESS;
3661 
3662 	rc = gres_plugin_init();
3663 
3664 	slurm_mutex_lock(&gres_context_lock);
3665 	if ((gres_context_cnt > 0) && (*gres_list == NULL))
3666 		*gres_list = list_create(_gres_node_list_delete);
3667 
3668 	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
3669 		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
3670 			break;
3671 		rec_cnt--;
3672 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
3673 			safe_unpack32(&magic, buffer);
3674 			if (magic != GRES_MAGIC)
3675 				goto unpack_error;
3676 			safe_unpack32(&plugin_id, buffer);
3677 			safe_unpack64(&gres_cnt_avail, buffer);
3678 			safe_unpack16(&gres_bitmap_size, buffer);
3679 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
3680 			safe_unpack32(&magic, buffer);
3681 			if (magic != GRES_MAGIC)
3682 				goto unpack_error;
3683 			safe_unpack32(&plugin_id, buffer);
3684 			safe_unpack64(&gres_cnt_avail, buffer);
3685 			safe_unpack8(&has_bitmap, buffer);
3686 			if (has_bitmap)
3687 				gres_bitmap_size = gres_cnt_avail;
3688 			else
3689 				gres_bitmap_size = 0;
3690 		} else {
3691 			error("%s: protocol_version %hu not supported",
3692 			      __func__, protocol_version);
3693 			goto unpack_error;
3694 		}
3695 		for (i = 0; i < gres_context_cnt; i++) {
3696 			if (gres_context[i].plugin_id == plugin_id)
3697 				break;
3698 		}
3699 		if (i >= gres_context_cnt) {
3700 			error("%s: no plugin configured to unpack data type %u from node %s",
3701 			      __func__, plugin_id, node_name);
3702 			/*
3703 			 * A likely sign that GresPlugins has changed.
3704 			 * Not a fatal error, skip over the data.
3705 			 */
3706 			continue;
3707 		}
3708 		gres_node_ptr = _build_gres_node_state();
3709 		gres_node_ptr->gres_cnt_avail = gres_cnt_avail;
3710 		if (gres_bitmap_size) {
3711 			gres_node_ptr->gres_bit_alloc =
3712 				bit_alloc(gres_bitmap_size);
3713 		}
3714 		gres_ptr = xmalloc(sizeof(gres_state_t));
3715 		gres_ptr->plugin_id = gres_context[i].plugin_id;
3716 		gres_ptr->gres_data = gres_node_ptr;
3717 		list_append(*gres_list, gres_ptr);
3718 	}
3719 	slurm_mutex_unlock(&gres_context_lock);
3720 	return rc;
3721 
3722 unpack_error:
3723 	error("%s: unpack error from node %s", __func__, node_name);
3724 	slurm_mutex_unlock(&gres_context_lock);
3725 	return SLURM_ERROR;
3726 }
3727 
_node_state_dup(void * gres_data)3728 static void *_node_state_dup(void *gres_data)
3729 {
3730 	int i, j;
3731 	gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data;
3732 	gres_node_state_t *new_gres;
3733 
3734 	if (gres_ptr == NULL)
3735 		return NULL;
3736 
3737 	new_gres = xmalloc(sizeof(gres_node_state_t));
3738 	new_gres->gres_cnt_found  = gres_ptr->gres_cnt_found;
3739 	new_gres->gres_cnt_config = gres_ptr->gres_cnt_config;
3740 	new_gres->gres_cnt_avail  = gres_ptr->gres_cnt_avail;
3741 	new_gres->gres_cnt_alloc  = gres_ptr->gres_cnt_alloc;
3742 	new_gres->no_consume      = gres_ptr->no_consume;
3743 	if (gres_ptr->gres_bit_alloc)
3744 		new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc);
3745 
3746 	if (gres_ptr->links_cnt && gres_ptr->link_len) {
3747 		new_gres->links_cnt = xcalloc(gres_ptr->link_len,
3748 					      sizeof(int *));
3749 		j = sizeof(int) * gres_ptr->link_len;
3750 		for (i = 0; i < gres_ptr->link_len; i++) {
3751 			new_gres->links_cnt[i] = xmalloc(j);
3752 			memcpy(new_gres->links_cnt[i],gres_ptr->links_cnt[i],j);
3753 		}
3754 		new_gres->link_len = gres_ptr->link_len;
3755 	}
3756 
3757 	if (gres_ptr->topo_cnt) {
3758 		new_gres->topo_cnt         = gres_ptr->topo_cnt;
3759 		new_gres->topo_core_bitmap = xcalloc(gres_ptr->topo_cnt,
3760 						     sizeof(bitstr_t *));
3761 		new_gres->topo_gres_bitmap = xcalloc(gres_ptr->topo_cnt,
3762 						     sizeof(bitstr_t *));
3763 		new_gres->topo_gres_cnt_alloc = xcalloc(gres_ptr->topo_cnt,
3764 							sizeof(uint64_t));
3765 		new_gres->topo_gres_cnt_avail = xcalloc(gres_ptr->topo_cnt,
3766 							sizeof(uint64_t));
3767 		new_gres->topo_type_id = xcalloc(gres_ptr->topo_cnt,
3768 						 sizeof(uint32_t));
3769 		new_gres->topo_type_name = xcalloc(gres_ptr->topo_cnt,
3770 						   sizeof(char *));
3771 		for (i = 0; i < gres_ptr->topo_cnt; i++) {
3772 			if (gres_ptr->topo_core_bitmap[i]) {
3773 				new_gres->topo_core_bitmap[i] =
3774 					bit_copy(gres_ptr->topo_core_bitmap[i]);
3775 			}
3776 			new_gres->topo_gres_bitmap[i] =
3777 				bit_copy(gres_ptr->topo_gres_bitmap[i]);
3778 			new_gres->topo_gres_cnt_alloc[i] =
3779 				gres_ptr->topo_gres_cnt_alloc[i];
3780 			new_gres->topo_gres_cnt_avail[i] =
3781 				gres_ptr->topo_gres_cnt_avail[i];
3782 			new_gres->topo_type_id[i] = gres_ptr->topo_type_id[i];
3783 			new_gres->topo_type_name[i] =
3784 				xstrdup(gres_ptr->topo_type_name[i]);
3785 		}
3786 	}
3787 
3788 	if (gres_ptr->type_cnt) {
3789 		new_gres->type_cnt       = gres_ptr->type_cnt;
3790 		new_gres->type_cnt_alloc = xcalloc(gres_ptr->type_cnt,
3791 						   sizeof(uint64_t));
3792 		new_gres->type_cnt_avail = xcalloc(gres_ptr->type_cnt,
3793 						   sizeof(uint64_t));
3794 		new_gres->type_id = xcalloc(gres_ptr->type_cnt,
3795 					    sizeof(uint32_t));
3796 		new_gres->type_name = xcalloc(gres_ptr->type_cnt,
3797 					      sizeof(char *));
3798 		for (i = 0; i < gres_ptr->type_cnt; i++) {
3799 			new_gres->type_cnt_alloc[i] =
3800 				gres_ptr->type_cnt_alloc[i];
3801 			new_gres->type_cnt_avail[i] =
3802 				gres_ptr->type_cnt_avail[i];
3803 			new_gres->type_id[i] = gres_ptr->type_id[i];
3804 			new_gres->type_name[i] =
3805 				xstrdup(gres_ptr->type_name[i]);
3806 		}
3807 	}
3808 
3809 	return new_gres;
3810 }
3811 
3812 /*
3813  * Duplicate a node gres status (used for will-run logic)
3814  * IN gres_list - node gres state information
3815  * RET a copy of gres_list or NULL on failure
3816  */
gres_plugin_node_state_dup(List gres_list)3817 extern List gres_plugin_node_state_dup(List gres_list)
3818 {
3819 	int i;
3820 	List new_list = NULL;
3821 	ListIterator gres_iter;
3822 	gres_state_t *gres_ptr, *new_gres;
3823 	void *gres_data;
3824 
3825 	if (gres_list == NULL)
3826 		return new_list;
3827 
3828 	(void) gres_plugin_init();
3829 
3830 	slurm_mutex_lock(&gres_context_lock);
3831 	if ((gres_context_cnt > 0)) {
3832 		new_list = list_create(_gres_node_list_delete);
3833 	}
3834 	gres_iter = list_iterator_create(gres_list);
3835 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3836 		for (i=0; i<gres_context_cnt; i++) {
3837 			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
3838 				continue;
3839 			gres_data = _node_state_dup(gres_ptr->gres_data);
3840 			if (gres_data) {
3841 				new_gres = xmalloc(sizeof(gres_state_t));
3842 				new_gres->plugin_id = gres_ptr->plugin_id;
3843 				new_gres->gres_data = gres_data;
3844 				list_append(new_list, new_gres);
3845 			}
3846 			break;
3847 		}
3848 		if (i >= gres_context_cnt) {
3849 			error("Could not find plugin id %u to dup node record",
3850 			      gres_ptr->plugin_id);
3851 		}
3852 	}
3853 	list_iterator_destroy(gres_iter);
3854 	slurm_mutex_unlock(&gres_context_lock);
3855 
3856 	return new_list;
3857 }
3858 
/*
 * Reset every allocation counter and bitmap of one node GRES record.
 * IN gres_ptr - list element whose gres_data is a gres_node_state_t
 */
static void _node_state_dealloc(gres_state_t *gres_ptr)
{
	gres_node_state_t *node_state =
		(gres_node_state_t *) gres_ptr->gres_data;
	int inx;

	node_state->gres_cnt_alloc = 0;
	if (node_state->gres_bit_alloc) {
		int last_bit = bit_size(node_state->gres_bit_alloc) - 1;
		if (last_bit >= 0)
			bit_nclear(node_state->gres_bit_alloc, 0, last_bit);
	}

	if (!node_state->topo_cnt) {
		/*
		 * This array can be set at startup if a job has been allocated
		 * specific GRES and the node has not registered with the
		 * details needed to track individual GRES (rather than only
		 * a GRES count).
		 */
		xfree(node_state->topo_gres_cnt_alloc);
	} else if (node_state->topo_gres_cnt_alloc) {
		for (inx = 0; inx < node_state->topo_cnt; inx++)
			node_state->topo_gres_cnt_alloc[inx] = 0;
	} else {
		/* Topology without counters: report it, naming the GRES */
		char *gres_name = NULL;

		for (inx = 0; inx < gres_context_cnt; inx++) {
			if (gres_context[inx].plugin_id == gres_ptr->plugin_id) {
				gres_name = gres_context[inx].gres_name;
				break;
			}
		}
		error("gres_plugin_node_state_dealloc_all: gres/%s topo_cnt!=0 "
		      "and topo_gres_cnt_alloc is NULL", gres_name);
	}

	for (inx = 0; inx < node_state->type_cnt; inx++)
		node_state->type_cnt_alloc[inx] = 0;
}
3900 
3901 /*
3902  * Deallocate all resources on this node previous allocated to any jobs.
3903  *	This function isused to synchronize state after slurmctld restarts or
3904  *	is reconfigured.
3905  * IN gres_list - node gres state information
3906  */
gres_plugin_node_state_dealloc_all(List gres_list)3907 extern void gres_plugin_node_state_dealloc_all(List gres_list)
3908 {
3909 	ListIterator gres_iter;
3910 	gres_state_t *gres_ptr;
3911 
3912 	if (gres_list == NULL)
3913 		return;
3914 
3915 	(void) gres_plugin_init();
3916 
3917 	slurm_mutex_lock(&gres_context_lock);
3918 	gres_iter = list_iterator_create(gres_list);
3919 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
3920 		_node_state_dealloc(gres_ptr);
3921 	}
3922 	list_iterator_destroy(gres_iter);
3923 	slurm_mutex_unlock(&gres_context_lock);
3924 }
3925 
/*
 * Build (or reuse) the "used GRES" description string of one GRES on a node.
 * IN gres_data - gres_node_state_t record of the GRES, must not be NULL
 * IN gres_name - GRES name, used as the prefix of every entry in the string
 * RET the record's own gres_used field (not a copy)
 *
 * With topology data on a consumable GRES the string is rebuilt on every call,
 * one "name:type:count(IDX:...)" entry per distinct topo type id. Otherwise an
 * existing gres_used value is returned unchanged, and is only built when it is
 * still NULL.
 */
static char *_node_gres_used(void *gres_data, char *gres_name)
{
	gres_node_state_t *gres_node_ptr;
	char *sep = "";
	int i, j;

	xassert(gres_data);
	gres_node_ptr = (gres_node_state_t *) gres_data;

	if ((gres_node_ptr->topo_cnt != 0) &&
	    (gres_node_ptr->no_consume == false)) {
		/* topo_printed marks topo records already covered by an entry */
		bitstr_t *topo_printed = bit_alloc(gres_node_ptr->topo_cnt);
		xfree(gres_node_ptr->gres_used);    /* Free any cached value */
		for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
			bitstr_t *topo_gres_bitmap = NULL;
			uint64_t gres_alloc_cnt = 0;
			char *gres_alloc_idx, tmp_str[64];
			if (bit_test(topo_printed, i))
				continue;
			bit_set(topo_printed, i);
			if (gres_node_ptr->topo_gres_bitmap[i]) {
				topo_gres_bitmap =
					bit_copy(gres_node_ptr->
						 topo_gres_bitmap[i]);
			}
			/*
			 * Fold every later topo record with the same type id
			 * into this entry by OR-ing its GRES bitmap in.
			 */
			for (j = i + 1; j < gres_node_ptr->topo_cnt; j++) {
				if (bit_test(topo_printed, j))
					continue;
				if (gres_node_ptr->topo_type_id[i] !=
				    gres_node_ptr->topo_type_id[j])
					continue;
				bit_set(topo_printed, j);
				if (gres_node_ptr->topo_gres_bitmap[j]) {
					if (!topo_gres_bitmap) {
						topo_gres_bitmap =
							bit_copy(gres_node_ptr->
								 topo_gres_bitmap[j]);
					} else if (bit_size(topo_gres_bitmap) ==
						   bit_size(gres_node_ptr->
							    topo_gres_bitmap[j])){
						/* Bitmaps of another size are ignored */
						bit_or(topo_gres_bitmap,
						       gres_node_ptr->
						       topo_gres_bitmap[j]);
					}
				}
			}
			/*
			 * Keep only the bits that are also set in
			 * gres_bit_alloc; the count stays 0 if the sizes differ
			 * or either bitmap is missing.
			 */
			if (gres_node_ptr->gres_bit_alloc && topo_gres_bitmap &&
			    (bit_size(topo_gres_bitmap) ==
			     bit_size(gres_node_ptr->gres_bit_alloc))) {
				bit_and(topo_gres_bitmap,
					gres_node_ptr->gres_bit_alloc);
				gres_alloc_cnt = bit_set_count(topo_gres_bitmap);
			}
			if (gres_alloc_cnt > 0) {
				bit_fmt(tmp_str, sizeof(tmp_str),
					topo_gres_bitmap);
				gres_alloc_idx = tmp_str;
			} else {
				gres_alloc_idx = "N/A";
			}
			xstrfmtcat(gres_node_ptr->gres_used,
				   "%s%s:%s:%"PRIu64"(IDX:%s)", sep, gres_name,
				   gres_node_ptr->topo_type_name[i],
				   gres_alloc_cnt, gres_alloc_idx);
			sep = ",";
			FREE_NULL_BITMAP(topo_gres_bitmap);
		}
		FREE_NULL_BITMAP(topo_printed);
	} else if (gres_node_ptr->gres_used) {
		;	/* Used cached value */
	} else if (gres_node_ptr->type_cnt == 0) {
		/* Untyped GRES: single "name:count" entry, 0 if no_consume */
		if (gres_node_ptr->no_consume) {
			xstrfmtcat(gres_node_ptr->gres_used, "%s:0", gres_name);
		} else {
			xstrfmtcat(gres_node_ptr->gres_used, "%s:%"PRIu64,
				   gres_name, gres_node_ptr->gres_cnt_alloc);
		}
	} else {
		/* Typed GRES without topology: one "name:type:count" per type */
		for (i = 0; i < gres_node_ptr->type_cnt; i++) {
			if (gres_node_ptr->no_consume) {
				xstrfmtcat(gres_node_ptr->gres_used,
					   "%s%s:%s:0", sep, gres_name,
					   gres_node_ptr->type_name[i]);
			} else {
				xstrfmtcat(gres_node_ptr->gres_used,
					   "%s%s:%s:%"PRIu64, sep, gres_name,
					   gres_node_ptr->type_name[i],
					   gres_node_ptr->type_cnt_alloc[i]);
			}
			sep = ",";
		}
	}

	return gres_node_ptr->gres_used;
}
4021 
/*
 * Write one node GRES state record to the log with info().
 * IN gres_data - gres_node_state_t record to log, must not be NULL
 * IN node_name - name of the node, used in the heading line
 * IN gres_name - name of the GRES, used in the heading line
 *
 * Logged in order: counts, allocation bitmap, gres_used string, link matrix,
 * every topo record and every type record.
 */
static void _node_state_log(void *gres_data, char *node_name, char *gres_name)
{
	gres_node_state_t *gres_node_ptr;
	int i, j;
	char *buf = NULL, *sep, tmp_str[128];

	xassert(gres_data);
	gres_node_ptr = (gres_node_state_t *) gres_data;

	info("gres/%s: state for %s", gres_name, node_name);
	/* NO_VAL64 means the found count is not known yet: print "TBD" */
	if (gres_node_ptr->gres_cnt_found == NO_VAL64) {
		snprintf(tmp_str, sizeof(tmp_str), "TBD");
	} else {
		snprintf(tmp_str, sizeof(tmp_str), "%"PRIu64,
			 gres_node_ptr->gres_cnt_found);
	}

	/* The alloc count is replaced by "no_consume" for such GRES */
	if (gres_node_ptr->no_consume) {
		info("  gres_cnt found:%s configured:%"PRIu64" "
		     "avail:%"PRIu64" no_consume",
		     tmp_str, gres_node_ptr->gres_cnt_config,
		     gres_node_ptr->gres_cnt_avail);
	} else {
		info("  gres_cnt found:%s configured:%"PRIu64" "
		     "avail:%"PRIu64" alloc:%"PRIu64"",
		     tmp_str, gres_node_ptr->gres_cnt_config,
		     gres_node_ptr->gres_cnt_avail,
		     gres_node_ptr->gres_cnt_alloc);
	}

	if (gres_node_ptr->gres_bit_alloc) {
		bit_fmt(tmp_str, sizeof(tmp_str),gres_node_ptr->gres_bit_alloc);
		info("  gres_bit_alloc:%s of %d",
		     tmp_str, (int) bit_size(gres_node_ptr->gres_bit_alloc));
	} else {
		info("  gres_bit_alloc:NULL");
	}

	/* NOTE(review): gres_used is passed to %s unchecked - confirm NULL is ok */
	info("  gres_used:%s", gres_node_ptr->gres_used);

	/* One log line per row of the link_len x link_len links_cnt matrix */
	if (gres_node_ptr->links_cnt && gres_node_ptr->link_len) {
		for (i = 0; i < gres_node_ptr->link_len; i++) {
			sep = "";
			for (j = 0; j < gres_node_ptr->link_len; j++) {
				xstrfmtcat(buf, "%s%d", sep,
					   gres_node_ptr->links_cnt[i][j]);
				sep = ", ";
			}
			info("  links[%d]:%s", i, buf);
			xfree(buf);
		}
	}

	for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
		info("  topo[%d]:%s(%u)", i, gres_node_ptr->topo_type_name[i],
		     gres_node_ptr->topo_type_id[i]);
		if (gres_node_ptr->topo_core_bitmap[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_node_ptr->topo_core_bitmap[i]);
			info("   topo_core_bitmap[%d]:%s of %d", i, tmp_str,
			     (int)bit_size(gres_node_ptr->topo_core_bitmap[i]));
		} else
			info("   topo_core_bitmap[%d]:NULL", i);
		if (gres_node_ptr->topo_gres_bitmap[i]) {
			bit_fmt(tmp_str, sizeof(tmp_str),
				gres_node_ptr->topo_gres_bitmap[i]);
			info("   topo_gres_bitmap[%d]:%s of %d", i, tmp_str,
			     (int)bit_size(gres_node_ptr->topo_gres_bitmap[i]));
		} else
			info("   topo_gres_bitmap[%d]:NULL", i);
		info("   topo_gres_cnt_alloc[%d]:%"PRIu64"", i,
		     gres_node_ptr->topo_gres_cnt_alloc[i]);
		info("   topo_gres_cnt_avail[%d]:%"PRIu64"", i,
		     gres_node_ptr->topo_gres_cnt_avail[i]);
	}

	for (i = 0; i < gres_node_ptr->type_cnt; i++) {
		info("  type[%d]:%s(%u)", i, gres_node_ptr->type_name[i],
		     gres_node_ptr->type_id[i]);
		info("   type_cnt_alloc[%d]:%"PRIu64, i,
		     gres_node_ptr->type_cnt_alloc[i]);
		info("   type_cnt_avail[%d]:%"PRIu64, i,
		     gres_node_ptr->type_cnt_avail[i]);
	}
}
4107 
4108 /*
4109  * Log a node's current gres state
4110  * IN gres_list - generated by gres_plugin_node_config_validate()
4111  * IN node_name - name of the node for which the gres information applies
4112  */
gres_plugin_node_state_log(List gres_list,char * node_name)4113 extern void gres_plugin_node_state_log(List gres_list, char *node_name)
4114 {
4115 	int i;
4116 	ListIterator gres_iter;
4117 	gres_state_t *gres_ptr;
4118 
4119 	if (!gres_debug || (gres_list == NULL))
4120 		return;
4121 
4122 	(void) gres_plugin_init();
4123 
4124 	slurm_mutex_lock(&gres_context_lock);
4125 	gres_iter = list_iterator_create(gres_list);
4126 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
4127 		for (i = 0; i < gres_context_cnt; i++) {
4128 			if (gres_ptr->plugin_id !=
4129 			    gres_context[i].plugin_id)
4130 				continue;
4131 			_node_state_log(gres_ptr->gres_data, node_name,
4132 					gres_context[i].gres_name);
4133 			break;
4134 		}
4135 	}
4136 	list_iterator_destroy(gres_iter);
4137 	slurm_mutex_unlock(&gres_context_lock);
4138 }
4139 
4140 /*
4141  * Build a string indicating a node's drained GRES
4142  * IN gres_list - generated by gres_plugin_node_config_validate()
4143  * RET - string, must be xfreed by caller
4144  */
gres_get_node_drain(List gres_list)4145 extern char *gres_get_node_drain(List gres_list)
4146 {
4147 	char *node_drain = xstrdup("N/A");
4148 
4149 	return node_drain;
4150 }
4151 
4152 /*
4153  * Build a string indicating a node's used GRES
4154  * IN gres_list - generated by gres_plugin_node_config_validate()
4155  * RET - string, must be xfreed by caller
4156  */
gres_get_node_used(List gres_list)4157 extern char *gres_get_node_used(List gres_list)
4158 {
4159 	int i;
4160 	ListIterator gres_iter;
4161 	gres_state_t *gres_ptr;
4162 	char *gres_used = NULL, *tmp;
4163 
4164 	if (!gres_list)
4165 		return gres_used;
4166 
4167 	(void) gres_plugin_init();
4168 
4169 	slurm_mutex_lock(&gres_context_lock);
4170 	gres_iter = list_iterator_create(gres_list);
4171 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
4172 		for (i = 0; i < gres_context_cnt; i++) {
4173 			if (gres_ptr->plugin_id !=
4174 			    gres_context[i].plugin_id)
4175 				continue;
4176 			tmp = _node_gres_used(gres_ptr->gres_data,
4177 					      gres_context[i].gres_name);
4178 			if (!tmp)
4179 				continue;
4180 			if (gres_used)
4181 				xstrcat(gres_used, ",");
4182 			xstrcat(gres_used, tmp);
4183 			break;
4184 		}
4185 	}
4186 	list_iterator_destroy(gres_iter);
4187 	slurm_mutex_unlock(&gres_context_lock);
4188 
4189 	return gres_used;
4190 }
4191 
4192 /*
4193  * Give the total system count of a given GRES
4194  * Returns NO_VAL64 if name not found
4195  */
gres_get_system_cnt(char * name)4196 extern uint64_t gres_get_system_cnt(char *name)
4197 {
4198 	uint64_t count = NO_VAL64;
4199 	int i;
4200 
4201 	if (!name)
4202 		return NO_VAL64;
4203 
4204 	(void) gres_plugin_init();
4205 
4206 	slurm_mutex_lock(&gres_context_lock);
4207 	for (i = 0; i < gres_context_cnt; i++) {
4208 		if (!xstrcmp(gres_context[i].gres_name, name)) {
4209 			count = gres_context[i].total_cnt;
4210 			break;
4211 		}
4212 	}
4213 	slurm_mutex_unlock(&gres_context_lock);
4214 	return count;
4215 }
4216 
4217 
4218 /*
4219  * Get the count of a node's GRES
4220  * IN gres_list - List of Gres records for this node to track usage
4221  * IN name - name of gres
4222  */
gres_plugin_node_config_cnt(List gres_list,char * name)4223 extern uint64_t gres_plugin_node_config_cnt(List gres_list, char *name)
4224 {
4225 	int i;
4226 	gres_state_t *gres_ptr;
4227 	gres_node_state_t *data_ptr;
4228 	uint64_t count = 0;
4229 
4230 	if (!gres_list || !name || !list_count(gres_list))
4231 		return count;
4232 
4233 	(void) gres_plugin_init();
4234 
4235 	slurm_mutex_lock(&gres_context_lock);
4236 	for (i = 0; i < gres_context_cnt; i++) {
4237 		if (!xstrcmp(gres_context[i].gres_name, name)) {
4238 			/* Find or create gres_state entry on the list */
4239 			gres_ptr = list_find_first(gres_list, _gres_find_id,
4240 			                           &gres_context[i].plugin_id);
4241 
4242 			if (!gres_ptr || !gres_ptr->gres_data)
4243 				break;
4244 			data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
4245 			count = data_ptr->gres_cnt_config;
4246 			break;
4247 		} else if (!xstrncmp(name, gres_context[i].gres_name_colon,
4248 				     gres_context[i].gres_name_colon_len)) {
4249 			int type;
4250 			uint32_t type_id;
4251 			char *type_str = NULL;
4252 
4253 			if (!(type_str = strchr(name, ':'))) {
4254 				error("Invalid gres name '%s'", name);
4255 				break;
4256 			}
4257 			type_str++;
4258 
4259 			gres_ptr = list_find_first(gres_list, _gres_find_id,
4260 			                           &gres_context[i].plugin_id);
4261 
4262 			if (!gres_ptr || !gres_ptr->gres_data)
4263 				break;
4264 			data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
4265 			type_id = gres_plugin_build_id(type_str);
4266 			for (type = 0; type < data_ptr->type_cnt; type++) {
4267 				if (data_ptr->type_id[type] == type_id) {
4268 					count = data_ptr->type_cnt_avail[type];
4269 					break;
4270 				}
4271 			}
4272 			break;
4273 		}
4274 	}
4275 	slurm_mutex_unlock(&gres_context_lock);
4276 
4277 	return count;
4278 }
4279 
_job_state_delete(void * gres_data)4280 static void _job_state_delete(void *gres_data)
4281 {
4282 	int i;
4283 	gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
4284 
4285 	if (gres_ptr == NULL)
4286 		return;
4287 
4288 	for (i = 0; i < gres_ptr->node_cnt; i++) {
4289 		if (gres_ptr->gres_bit_alloc)
4290 			FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
4291 		if (gres_ptr->gres_bit_step_alloc)
4292 			FREE_NULL_BITMAP(gres_ptr->gres_bit_step_alloc[i]);
4293 	}
4294 	xfree(gres_ptr->gres_bit_alloc);
4295 	xfree(gres_ptr->gres_cnt_node_alloc);
4296 	xfree(gres_ptr->gres_bit_step_alloc);
4297 	xfree(gres_ptr->gres_cnt_step_alloc);
4298 	if (gres_ptr->gres_bit_select) {
4299 		for (i = 0; i < gres_ptr->total_node_cnt; i++)
4300 			FREE_NULL_BITMAP(gres_ptr->gres_bit_select[i]);
4301 		xfree(gres_ptr->gres_bit_select);
4302 	}
4303 	xfree(gres_ptr->gres_cnt_node_alloc);
4304 	xfree(gres_ptr->gres_cnt_node_select);
4305 	xfree(gres_ptr->gres_name);
4306 	xfree(gres_ptr->type_name);
4307 	xfree(gres_ptr);
4308 }
4309 
_gres_job_list_delete(void * list_element)4310 static void _gres_job_list_delete(void *list_element)
4311 {
4312 	gres_state_t *gres_ptr;
4313 
4314 	if (gres_plugin_init() != SLURM_SUCCESS)
4315 		return;
4316 
4317 	gres_ptr = (gres_state_t *) list_element;
4318 	slurm_mutex_lock(&gres_context_lock);
4319 	_job_state_delete(gres_ptr->gres_data);
4320 	xfree(gres_ptr);
4321 	slurm_mutex_unlock(&gres_context_lock);
4322 }
4323 
_clear_cpus_per_gres(void * x,void * arg)4324 static int _clear_cpus_per_gres(void *x, void *arg)
4325 {
4326 	gres_state_t *gres_ptr = (gres_state_t *) x;
4327 	gres_job_state_t *job_gres_data;
4328 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4329 	job_gres_data->cpus_per_gres = 0;
4330 	return 0;
4331 }
_clear_gres_per_job(void * x,void * arg)4332 static int _clear_gres_per_job(void *x, void *arg)
4333 {
4334 	gres_state_t *gres_ptr = (gres_state_t *) x;
4335 	gres_job_state_t *job_gres_data;
4336 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4337 	job_gres_data->gres_per_job = 0;
4338 	return 0;
4339 }
_clear_gres_per_node(void * x,void * arg)4340 static int _clear_gres_per_node(void *x, void *arg)
4341 {
4342 	gres_state_t *gres_ptr = (gres_state_t *) x;
4343 	gres_job_state_t *job_gres_data;
4344 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4345 	job_gres_data->gres_per_node = 0;
4346 	return 0;
4347 }
_clear_gres_per_socket(void * x,void * arg)4348 static int _clear_gres_per_socket(void *x, void *arg)
4349 {
4350 	gres_state_t *gres_ptr = (gres_state_t *) x;
4351 	gres_job_state_t *job_gres_data;
4352 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4353 	job_gres_data->gres_per_socket = 0;
4354 	return 0;
4355 }
_clear_gres_per_task(void * x,void * arg)4356 static int _clear_gres_per_task(void *x, void *arg)
4357 {
4358 	gres_state_t *gres_ptr = (gres_state_t *) x;
4359 	gres_job_state_t *job_gres_data;
4360 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4361 	job_gres_data->gres_per_task = 0;
4362 	return 0;
4363 }
_clear_mem_per_gres(void * x,void * arg)4364 static int _clear_mem_per_gres(void *x, void *arg)
4365 {
4366 	gres_state_t *gres_ptr = (gres_state_t *) x;
4367 	gres_job_state_t *job_gres_data;
4368 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4369 	job_gres_data->mem_per_gres = 0;
4370 	return 0;
4371 }
_clear_total_gres(void * x,void * arg)4372 static int _clear_total_gres(void *x, void *arg)
4373 {
4374 	gres_state_t *gres_ptr = (gres_state_t *) x;
4375 	gres_job_state_t *job_gres_data;
4376 	job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
4377 	job_gres_data->total_gres = 0;
4378 	return 0;
4379 }
4380 
/*
 * Ensure consistency of gres_per_* options
 * Modify task and node count as needed for consistency with GRES options
 * RET -1 on failure, 0 on success
 */
_test_gres_cnt(gres_job_state_t * job_gres_data,uint32_t * num_tasks,uint32_t * min_nodes,uint32_t * max_nodes,uint16_t * ntasks_per_node,uint16_t * ntasks_per_socket,uint16_t * sockets_per_node,uint16_t * cpus_per_task)4386 static int _test_gres_cnt(gres_job_state_t *job_gres_data,
4387 			  uint32_t *num_tasks,
4388 			  uint32_t *min_nodes, uint32_t *max_nodes,
4389 			  uint16_t *ntasks_per_node,
4390 			  uint16_t *ntasks_per_socket,
4391 			  uint16_t *sockets_per_node,
4392 			  uint16_t *cpus_per_task)
4393 {
4394 	int req_nodes, req_tasks, req_tasks_per_node, req_tasks_per_socket;
4395 	int req_sockets, req_cpus_per_task;
4396 	uint16_t cpus_per_gres;
4397 
4398 	/* Ensure gres_per_job >= gres_per_node >= gres_per_socket */
4399 	if (job_gres_data->gres_per_job &&
4400 	    ((job_gres_data->gres_per_node &&
4401 	      (job_gres_data->gres_per_node > job_gres_data->gres_per_job)) ||
4402 	     (job_gres_data->gres_per_task &&
4403 	      (job_gres_data->gres_per_task > job_gres_data->gres_per_job)) ||
4404 	     (job_gres_data->gres_per_socket &&
4405 	      (job_gres_data->gres_per_socket > job_gres_data->gres_per_job))))
4406 		return -1;
4407 
4408 	/* Ensure gres_per_job >= gres_per_task */
4409 	if (job_gres_data->gres_per_node &&
4410 	    ((job_gres_data->gres_per_task &&
4411 	      (job_gres_data->gres_per_task > job_gres_data->gres_per_node)) ||
4412 	     (job_gres_data->gres_per_socket &&
4413 	      (job_gres_data->gres_per_socket > job_gres_data->gres_per_node))))
4414 		return -1;
4415 
4416 	/* gres_per_socket requires sockets-per-node count specification */
4417 	if (job_gres_data->gres_per_socket) {
4418 		if (*sockets_per_node == NO_VAL16)
4419 			return -1;
4420 	}
4421 
4422 	/*
4423 	 * Ensure gres_per_job is multiple of gres_per_node
4424 	 * Ensure node count is consistent with GRES parameters
4425 	 */
4426 	if (job_gres_data->gres_per_job && job_gres_data->gres_per_node) {
4427 		if (job_gres_data->gres_per_job % job_gres_data->gres_per_node){
4428 			/* gres_per_job not multiple of gres_per_node */
4429 			return -1;
4430 		}
4431 		req_nodes = job_gres_data->gres_per_job /
4432 			    job_gres_data->gres_per_node;
4433 		if ((req_nodes < *min_nodes) || (req_nodes > *max_nodes))
4434 			return -1;
4435 		*min_nodes = *max_nodes = req_nodes;
4436 	}
4437 
4438 	/*
4439 	 * Ensure gres_per_node is multiple of gres_per_socket
4440 	 * Ensure task count is consistent with GRES parameters
4441 	 */
4442 	if (job_gres_data->gres_per_node && job_gres_data->gres_per_socket) {
4443 		if (job_gres_data->gres_per_node %
4444 		    job_gres_data->gres_per_socket) {
4445 			/* gres_per_node not multiple of gres_per_socket */
4446 			return -1;
4447 		}
4448 		req_sockets = job_gres_data->gres_per_node /
4449 			      job_gres_data->gres_per_socket;
4450 		if (*sockets_per_node == NO_VAL16)
4451 			*sockets_per_node = req_sockets;
4452 		else if (*sockets_per_node != req_sockets)
4453 			return -1;
4454 	}
4455 	/*
4456 	 * Ensure gres_per_job is multiple of gres_per_task
4457 	 * Ensure task count is consistent with GRES parameters
4458 	 */
4459 	if (job_gres_data->gres_per_task) {
4460 		if(job_gres_data->gres_per_job) {
4461 			if (job_gres_data->gres_per_job %
4462 			    job_gres_data->gres_per_task) {
4463 				/* gres_per_job not multiple of gres_per_task */
4464 				return -1;
4465 			}
4466 			req_tasks = job_gres_data->gres_per_job /
4467 				    job_gres_data->gres_per_task;
4468 			if (*num_tasks == NO_VAL)
4469 				*num_tasks = req_tasks;
4470 			else if (*num_tasks != req_tasks)
4471 				return -1;
4472 		} else if (*num_tasks != NO_VAL) {
4473 			job_gres_data->gres_per_job = *num_tasks *
4474 						job_gres_data->gres_per_task;
4475 		} else {
4476 			return -1;
4477 		}
4478 	}
4479 
4480 	/*
4481 	 * Ensure gres_per_node is multiple of gres_per_task
4482 	 * Ensure tasks_per_node is consistent with GRES parameters
4483 	 */
4484 	if (job_gres_data->gres_per_node && job_gres_data->gres_per_task) {
4485 		if (job_gres_data->gres_per_node %
4486 		    job_gres_data->gres_per_task) {
4487 			/* gres_per_node not multiple of gres_per_task */
4488 			return -1;
4489 		}
4490 		req_tasks_per_node = job_gres_data->gres_per_node /
4491 				     job_gres_data->gres_per_task;
4492 		if ((*ntasks_per_node == NO_VAL16) ||
4493 		    (*ntasks_per_node == 0))
4494 			*ntasks_per_node = req_tasks_per_node;
4495 		else if (*ntasks_per_node != req_tasks_per_node)
4496 			return -1;
4497 	}
4498 
4499 	/*
4500 	 * Ensure gres_per_socket is multiple of gres_per_task
4501 	 * Ensure ntasks_per_socket is consistent with GRES parameters
4502 	 */
4503 	if (job_gres_data->gres_per_socket && job_gres_data->gres_per_task) {
4504 		if (job_gres_data->gres_per_socket %
4505 		    job_gres_data->gres_per_task) {
4506 			/* gres_per_socket not multiple of gres_per_task */
4507 			return -1;
4508 		}
4509 		req_tasks_per_socket = job_gres_data->gres_per_socket /
4510 				       job_gres_data->gres_per_task;
4511 		if ((*ntasks_per_socket == NO_VAL16) ||
4512 		    (*ntasks_per_socket == 0))
4513 			*ntasks_per_socket = req_tasks_per_socket;
4514 		else if (*ntasks_per_socket != req_tasks_per_socket)
4515 			return -1;
4516 	}
4517 
4518 	/* Ensure that cpus_per_gres * gres_per_task == cpus_per_task */
4519 	if (job_gres_data->cpus_per_gres)
4520 		cpus_per_gres = job_gres_data->cpus_per_gres;
4521 	else
4522 		cpus_per_gres = job_gres_data->def_cpus_per_gres;
4523 	if (cpus_per_gres && job_gres_data->gres_per_task) {
4524 		req_cpus_per_task = cpus_per_gres *job_gres_data->gres_per_task;
4525 		if ((*cpus_per_task == NO_VAL16) ||
4526 		    (*cpus_per_task == 0))
4527 			*cpus_per_task = req_cpus_per_task;
4528 		else if (*cpus_per_task != req_cpus_per_task)
4529 			return -1;
4530 	}
4531 
4532 	/* Ensure tres_per_job >= node count */
4533 	if (job_gres_data->gres_per_job) {
4534 		if (job_gres_data->gres_per_job < *min_nodes)
4535 			return -1;
4536 		if (job_gres_data->gres_per_job < *max_nodes)
4537 			*max_nodes = job_gres_data->gres_per_job;
4538 	}
4539 
4540 	return 0;
4541 }
4542 
4543 /*
4544  * Translate a string, with optional suffix, into its equivalent numeric value
4545  * tok IN - the string to translate
4546  * value IN - numeric value
4547  * RET true if "tok" is a valid number
4548  */
_is_valid_number(char * tok,unsigned long long int * value)4549 static bool _is_valid_number(char *tok, unsigned long long int *value)
4550 {
4551 	unsigned long long int tmp_val;
4552 	uint64_t mult;
4553 	char *end_ptr = NULL;
4554 
4555 	tmp_val = strtoull(tok, &end_ptr, 10);
4556 	if (tmp_val == ULLONG_MAX)
4557 		return false;
4558 	if ((mult = suffix_mult(end_ptr)) == NO_VAL64)
4559 		return false;
4560 	tmp_val *= mult;
4561 	*value = tmp_val;
4562 	return true;
4563 }
4564 
/*
 * Reentrant TRES specification parse logic
 * Extracts the next comma separated token of the form "name",
 * "name:count", "name:type" or "name:type:count" and looks the name up in
 * gres_context[]. Tokens that are empty or have a count of zero are skipped.
 *
 * in_val IN - initial input string; only used as the starting point when
 *	       *save_ptr is NULL, and in the error log message
 * type_ptr OUT - GRES type of the token or NULL if none; must be xfreed by
 *		  caller. Set to NULL on error.
 * context_inx_ptr OUT - index into gres_context[] of the matching GRES; only
 *			 written when a token was found (left unchanged at
 *			 end of input or on error)
 * cnt OUT - count of values; only written when SLURM_SUCCESS is returned
 * flags OUT - user flags; always cleared to 0 by this function
 * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call.
 *		     Set to NULL at end of input or on error.
 * RET rc - error code (SLURM_SUCCESS or ESLURM_INVALID_GRES)
 */
static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr,
			  uint64_t *cnt, uint16_t *flags, char **save_ptr)
{
	char *comma, *sep, *sep2, *name = NULL, *type = NULL;
	int i, rc = SLURM_SUCCESS;
	unsigned long long int value = 0;

	xassert(cnt);
	xassert(flags);
	xassert(save_ptr);
	*flags = 0;

	/* Nothing to parse: no new string and no saved position */
	if (!in_val && (*save_ptr == NULL)) {
		return rc;
	}

	/* Initial call: start at the beginning of the input string */
	if (*save_ptr == NULL) {
		*save_ptr = in_val;
	}

	/*
	 * NOTE: "*save_ptr[0]" parses as "*(save_ptr[0])", which is the first
	 * character of the remaining input.
	 */
next:	if (*save_ptr[0] == '\0') {	/* Empty input token */
		*save_ptr = NULL;
		goto fini;
	}

	/*
	 * Work on a private copy of the remaining input so that separators
	 * can be overwritten, and advance *save_ptr past the current token.
	 */
	name = xstrdup(*save_ptr);
	comma = strchr(name, ',');
	if (comma) {
		*save_ptr += (comma - name + 1);
		comma[0] = '\0';
	} else {
		*save_ptr += strlen(name);
	}

	if (name[0] == '\0') {
		/* Nothing but a comma */
		xfree(name);
		goto next;
	}

	/* Split the token at up to two colons: name[:sep[:sep2]] */
	sep = strchr(name, ':');
	if (sep) {
		sep[0] = '\0';
		sep++;
		sep2 = strchr(sep, ':');
		if (sep2) {
			sep2[0] = '\0';
			sep2++;
		}
	} else {
		sep2 = NULL;
	}

	if (sep2) {		/* Two colons */
		/* We have both type and count */
		if ((sep[0] == '\0') || (sep2[0] == '\0')) {
			/* Bad format (e.g. "gpu:tesla:" or "gpu::1") */
			rc = ESLURM_INVALID_GRES;
			goto fini;
		}
		type = xstrdup(sep);
		if (!_is_valid_number(sep2, &value)) {
			debug("%s: Invalid count value GRES %s:%s:%s", __func__,
			      name, type, sep2);
			rc = ESLURM_INVALID_GRES;
			goto fini;
		}
	} else if (sep) {	/* One colon */
		if (sep[0] == '\0') {
			/* Bad format (e.g. "gpu:") */
			rc = ESLURM_INVALID_GRES;
			goto fini;
		} else if (_is_valid_number(sep, &value)) {
			/* We have count, but no type */
			type = NULL;
		} else {
			/* We have type with implicit count of 1 */
			type = xstrdup(sep);
			value = 1;
		}
	} else {		/* No colon */
		/* We have no type and implicit count of 1 */
		type = NULL;
		value = 1;
	}
	if (value == 0) {
		/* A zero count requests nothing, skip to the next token */
		xfree(name);
		xfree(type);
		goto next;
	}

	/*
	 * Locate the GRES by name: either an exact match on gres_name or a
	 * match on the first gres_name_colon_len characters of
	 * gres_name_colon.
	 */
	for (i = 0; i < gres_context_cnt; i++) {
		if (!xstrcmp(name, gres_context[i].gres_name) ||
		    !xstrncmp(name, gres_context[i].gres_name_colon,
			      gres_context[i].gres_name_colon_len))
			break;	/* GRES name match found */
	}
	if (i >= gres_context_cnt) {
		debug("%s: Failed to locate GRES %s", __func__, name);
		rc = ESLURM_INVALID_GRES;
		goto fini;
	}
	*context_inx_ptr = i;

	/* Common exit: publish results on success, clean up on failure */
fini:	if (rc != SLURM_SUCCESS) {
		*save_ptr = NULL;
		if (rc == ESLURM_INVALID_GRES) {
			/*
			 * NOTE(review): in_val can be NULL here when this is
			 * not the initial call (see the test at the top of
			 * the function), so the logged string may be missing.
			 */
			info("%s: Invalid GRES job specification %s", __func__,
			     in_val);
		}
		xfree(type);
		*type_ptr = NULL;
	} else {
		*cnt = value;
		*type_ptr = type;	/* Ownership passes to the caller */
	}
	xfree(name);

	return rc;
}
4694 
4695 /*
4696  * TRES specification parse logic
4697  * in_val IN - initial input string
4698  * cnt OUT - count of values
4699  * gres_list IN/OUT - where to search for (or add) new job TRES record
4700  * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
4701  * rc OUT - unchanged or an error code
4702  * RET gres - job record to set value in, found or created by this function
4703  */
_get_next_job_gres(char * in_val,uint64_t * cnt,List gres_list,char ** save_ptr,int * rc)4704 static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
4705 					    List gres_list, char **save_ptr,
4706 					    int *rc)
4707 {
4708 	static char *prev_save_ptr = NULL;
4709 	int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
4710 	gres_job_state_t *job_gres_data = NULL;
4711 	gres_state_t *gres_ptr;
4712 	gres_key_t job_search_key;
4713 	char *type = NULL, *name = NULL;
4714 	uint16_t flags = 0;
4715 
4716 	xassert(save_ptr);
4717 	if (!in_val && (*save_ptr == NULL)) {
4718 		return NULL;
4719 	}
4720 
4721 	if (*save_ptr == NULL) {
4722 		prev_save_ptr = in_val;
4723 	} else if (*save_ptr != prev_save_ptr) {
4724 		error("%s: parsing error", __func__);
4725 		my_rc = SLURM_ERROR;
4726 		goto fini;
4727 	}
4728 
4729 	if (prev_save_ptr[0] == '\0') {	/* Empty input token */
4730 		*save_ptr = NULL;
4731 		return NULL;
4732 	}
4733 
4734 	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
4735 				    cnt, &flags, &prev_save_ptr)) ||
4736 	    (context_inx == NO_VAL)) {
4737 		prev_save_ptr = NULL;
4738 		goto fini;
4739 	}
4740 
4741 	/* Find the job GRES record */
4742 	job_search_key.plugin_id = gres_context[context_inx].plugin_id;
4743 	job_search_key.type_id = gres_plugin_build_id(type);
4744 	gres_ptr = list_find_first(gres_list, _gres_find_job_by_key,
4745 				   &job_search_key);
4746 
4747 	if (gres_ptr) {
4748 		job_gres_data = gres_ptr->gres_data;
4749 	} else {
4750 		job_gres_data = xmalloc(sizeof(gres_job_state_t));
4751 		job_gres_data->gres_name =
4752 			xstrdup(gres_context[context_inx].gres_name);
4753 		job_gres_data->type_id = gres_plugin_build_id(type);
4754 		job_gres_data->type_name = type;
4755 		type = NULL;	/* String moved above */
4756 		gres_ptr = xmalloc(sizeof(gres_state_t));
4757 		gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
4758 		gres_ptr->gres_data = job_gres_data;
4759 		list_append(gres_list, gres_ptr);
4760 	}
4761 	job_gres_data->flags = flags;
4762 
4763 fini:	xfree(name);
4764 	xfree(type);
4765 	if (my_rc != SLURM_SUCCESS) {
4766 		prev_save_ptr = NULL;
4767 		if (my_rc == ESLURM_INVALID_GRES) {
4768 			info("%s: Invalid GRES job specification %s", __func__,
4769 			     in_val);
4770 		}
4771 		*rc = my_rc;
4772 	}
4773 	*save_ptr = prev_save_ptr;
4774 	return job_gres_data;
4775 }
4776 
4777 /* Return true if job specification only includes cpus_per_gres or mem_per_gres
4778  * Return false if any other field set
4779  */
_generic_job_state(gres_job_state_t * job_state)4780 static bool _generic_job_state(gres_job_state_t *job_state)
4781 {
4782 	if (job_state->gres_per_job ||
4783 	    job_state->gres_per_node ||
4784 	    job_state->gres_per_socket ||
4785 	    job_state->gres_per_task)
4786 		return false;
4787 	return true;
4788 }
4789 
4790 /*
4791  * Given a job's requested GRES configuration, validate it and build a GRES list
4792  * Note: This function can be used for a new request with gres_list==NULL or
4793  *	 used to update an existing job, in which case gres_list is a copy
4794  *	 of the job's original value (so we can clear fields as needed)
4795  * IN *tres* - job requested gres input string
4796  * IN/OUT num_tasks - requested task count, may be reset to provide
4797  *		      consistent gres_per_node/task values
4798  * IN/OUT min_nodes - requested minimum node count, may be reset to provide
4799  *		      consistent gres_per_node/task values
4800  * IN/OUT max_nodes - requested maximum node count, may be reset to provide
4801  *		      consistent gres_per_node/task values
4802  * IN/OUT ntasks_per_node - requested tasks_per_node count, may be reset to
4803  *		      provide consistent gres_per_node/task values
4804  * IN/OUT ntasks_per_socket - requested ntasks_per_socket count, may be reset to
4805  *		      provide consistent gres_per_node/task values
4806  * IN/OUT sockets_per_node - requested sockets_per_node count, may be reset to
4807  *		      provide consistent gres_per_socket/node values
4808  * IN/OUT cpus_per_task - requested cpus_per_task count, may be reset to
4809  *		      provide consistent gres_per_task/cpus_per_gres values
4810  * OUT gres_list - List of GRES records for this job to track usage
4811  * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
4812  */
gres_plugin_job_state_validate(char * cpus_per_tres,char * tres_freq,char * tres_per_job,char * tres_per_node,char * tres_per_socket,char * tres_per_task,char * mem_per_tres,uint32_t * num_tasks,uint32_t * min_nodes,uint32_t * max_nodes,uint16_t * ntasks_per_node,uint16_t * ntasks_per_socket,uint16_t * sockets_per_node,uint16_t * cpus_per_task,List * gres_list)4813 extern int gres_plugin_job_state_validate(char *cpus_per_tres,
4814 					  char *tres_freq,
4815 					  char *tres_per_job,
4816 					  char *tres_per_node,
4817 					  char *tres_per_socket,
4818 					  char *tres_per_task,
4819 					  char *mem_per_tres,
4820 					  uint32_t *num_tasks,
4821 					  uint32_t *min_nodes,
4822 					  uint32_t *max_nodes,
4823 					  uint16_t *ntasks_per_node,
4824 					  uint16_t *ntasks_per_socket,
4825 					  uint16_t *sockets_per_node,
4826 					  uint16_t *cpus_per_task,
4827 					  List *gres_list)
4828 {
4829 	typedef struct overlap_check {
4830 		gres_job_state_t *without_model_state;
4831 		uint32_t plugin_id;
4832 		bool with_model;
4833 		bool without_model;
4834 	} overlap_check_t;
4835 	overlap_check_t *over_list;
4836 	int i, over_count = 0, rc = SLURM_SUCCESS, size;
4837 	bool have_gres_gpu = false, have_gres_mps = false;
4838 	bool overlap_merge = false;
4839 	gres_state_t *gres_state;
4840 	gres_job_state_t *job_gres_data;
4841 	uint64_t cnt = 0;
4842 	ListIterator iter;
4843 
4844 	if (!cpus_per_tres && !tres_per_job && !tres_per_node &&
4845 	    !tres_per_socket && !tres_per_task && !mem_per_tres)
4846 		return SLURM_SUCCESS;
4847 
4848 	if (tres_per_task && (*num_tasks == NO_VAL) &&
4849 	    (*min_nodes != NO_VAL) && (*min_nodes == *max_nodes)) {
4850 		/* Implicitly set task count */
4851 		if (*ntasks_per_node != NO_VAL16)
4852 			*num_tasks = *min_nodes * *ntasks_per_node;
4853 		else if (*cpus_per_task == NO_VAL16)
4854 			*num_tasks = *min_nodes;
4855 	}
4856 
4857 	if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
4858 		return rc;
4859 
4860 	if ((select_plugin_type != SELECT_TYPE_CONS_TRES) &&
4861 	    (cpus_per_tres || tres_per_job || tres_per_socket ||
4862 	     tres_per_task || mem_per_tres))
4863 		return ESLURM_UNSUPPORTED_GRES;
4864 
4865 	/*
4866 	 * Clear fields as requested by job update (i.e. input value is "")
4867 	 */
4868 	if (*gres_list)
4869 		(void) list_for_each(*gres_list, _clear_total_gres, NULL);
4870 	if (*gres_list && cpus_per_tres && (cpus_per_tres[0] == '\0')) {
4871 		(void) list_for_each(*gres_list, _clear_cpus_per_gres, NULL);
4872 		cpus_per_tres = NULL;
4873 	}
4874 	if (*gres_list && tres_per_job && (tres_per_job[0] == '\0')) {
4875 		(void) list_for_each(*gres_list, _clear_gres_per_job, NULL);
4876 		tres_per_job = NULL;
4877 	}
4878 	if (*gres_list && tres_per_node && (tres_per_node[0] == '\0')) {
4879 		(void) list_for_each(*gres_list, _clear_gres_per_node, NULL);
4880 		tres_per_node = NULL;
4881 	}
4882 	if (*gres_list && tres_per_socket && (tres_per_socket[0] == '\0')) {
4883 		(void) list_for_each(*gres_list, _clear_gres_per_socket, NULL);
4884 		tres_per_socket = NULL;
4885 	}
4886 	if (*gres_list && tres_per_task && (tres_per_task[0] == '\0')) {
4887 		(void) list_for_each(*gres_list, _clear_gres_per_task, NULL);
4888 		tres_per_task = NULL;
4889 	}
4890 	if (*gres_list && mem_per_tres && (mem_per_tres[0] == '\0')) {
4891 		(void) list_for_each(*gres_list, _clear_mem_per_gres, NULL);
4892 		mem_per_tres = NULL;
4893 	}
4894 
4895 	/*
4896 	 * Set new values as requested
4897 	 */
4898 	if (*gres_list == NULL)
4899 		*gres_list = list_create(_gres_job_list_delete);
4900 	slurm_mutex_lock(&gres_context_lock);
4901 	if (cpus_per_tres) {
4902 		char *in_val = cpus_per_tres, *save_ptr = NULL;
4903 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4904 							   *gres_list,
4905 							   &save_ptr, &rc))) {
4906 			job_gres_data->cpus_per_gres = cnt;
4907 			in_val = NULL;
4908 		}
4909 	}
4910 	if (tres_per_job) {
4911 		char *in_val = tres_per_job, *save_ptr = NULL;
4912 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4913 							   *gres_list,
4914 							   &save_ptr, &rc))) {
4915 			job_gres_data->gres_per_job = cnt;
4916 			in_val = NULL;
4917 			job_gres_data->total_gres =
4918 				MAX(job_gres_data->total_gres, cnt);
4919 		}
4920 	}
4921 	if (tres_per_node) {
4922 		char *in_val = tres_per_node, *save_ptr = NULL;
4923 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4924 							   *gres_list,
4925 							   &save_ptr, &rc))) {
4926 			job_gres_data->gres_per_node = cnt;
4927 			in_val = NULL;
4928 			if (*min_nodes != NO_VAL)
4929 				cnt *= *min_nodes;
4930 			job_gres_data->total_gres =
4931 				MAX(job_gres_data->total_gres, cnt);
4932 		}
4933 	}
4934 	if (tres_per_socket) {
4935 		char *in_val = tres_per_socket, *save_ptr = NULL;
4936 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4937 							   *gres_list,
4938 							   &save_ptr, &rc))) {
4939 			job_gres_data->gres_per_socket = cnt;
4940 			in_val = NULL;
4941 			if ((*min_nodes != NO_VAL) &&
4942 			    (*sockets_per_node != NO_VAL16)) {
4943 				cnt *= (*min_nodes * *sockets_per_node);
4944 			} else if ((*num_tasks != NO_VAL) &&
4945 				   (*ntasks_per_socket != NO_VAL16)) {
4946 				cnt *= ((*num_tasks + *ntasks_per_socket - 1) /
4947 				        *ntasks_per_socket);
4948 			}
4949 		}
4950 	}
4951 	if (tres_per_task) {
4952 		char *in_val = tres_per_task, *save_ptr = NULL;
4953 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4954 							   *gres_list,
4955 							   &save_ptr, &rc))) {
4956 			job_gres_data->gres_per_task = cnt;
4957 			in_val = NULL;
4958 			if (*num_tasks != NO_VAL)
4959 				cnt *= *num_tasks;
4960 			job_gres_data->total_gres =
4961 				MAX(job_gres_data->total_gres, cnt);
4962 		}
4963 	}
4964 	if (mem_per_tres) {
4965 		char *in_val = mem_per_tres, *save_ptr = NULL;
4966 		while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
4967 							   *gres_list,
4968 							   &save_ptr, &rc))) {
4969 			job_gres_data->mem_per_gres = cnt;
4970 			in_val = NULL;
4971 		}
4972 	}
4973 	slurm_mutex_unlock(&gres_context_lock);
4974 
4975 	if (rc != SLURM_SUCCESS)
4976 		return rc;
4977 	size = list_count(*gres_list);
4978 	if (size == 0) {
4979 		FREE_NULL_LIST(*gres_list);
4980 		return rc;
4981 	}
4982 
4983 	/*
4984 	 * Check for record overlap (e.g. "gpu:2,gpu:tesla:1")
4985 	 * Ensure tres_per_job >= tres_per_node >= tres_per_socket
4986 	 */
4987 	over_list = xcalloc(size, sizeof(overlap_check_t));
4988 	iter = list_iterator_create(*gres_list);
4989 	while ((gres_state = (gres_state_t *) list_next(iter))) {
4990 		job_gres_data = (gres_job_state_t *) gres_state->gres_data;
4991 		if (_test_gres_cnt(job_gres_data, num_tasks, min_nodes,
4992 				   max_nodes, ntasks_per_node,
4993 				   ntasks_per_socket, sockets_per_node,
4994 				   cpus_per_task) != 0) {
4995 			rc = ESLURM_INVALID_GRES;
4996 			break;
4997 		}
4998 		if (!have_gres_gpu && !xstrcmp(job_gres_data->gres_name, "gpu"))
4999 			have_gres_gpu = true;
5000 		if (!xstrcmp(job_gres_data->gres_name, "mps")) {
5001 			have_gres_mps = true;
5002 			/*
5003 			 * gres/mps only supports a per-node count,
5004 			 * set either explicitly or implicitly.
5005 			 */
5006 			if (job_gres_data->gres_per_job &&
5007 			    (*max_nodes != 1)) {
5008 				rc = ESLURM_INVALID_GRES;
5009 				break;
5010 			}
5011 			if (job_gres_data->gres_per_socket &&
5012 			    (*sockets_per_node != 1)) {
5013 				rc = ESLURM_INVALID_GRES;
5014 				break;
5015 			}
5016 			if (job_gres_data->gres_per_task && (*num_tasks != 1)) {
5017 				rc = ESLURM_INVALID_GRES;
5018 				break;
5019 			}
5020 		}
5021 		if (have_gres_gpu && have_gres_mps) {
5022 			rc = ESLURM_INVALID_GRES;
5023 			break;
5024 		}
5025 
5026 		for (i = 0; i < over_count; i++) {
5027 			if (over_list[i].plugin_id == gres_state->plugin_id)
5028 				break;
5029 		}
5030 		if (i >= over_count) {
5031 			over_list[over_count++].plugin_id =
5032 				gres_state->plugin_id;
5033 			if (job_gres_data->type_name) {
5034 				over_list[i].with_model = true;
5035 			} else {
5036 				over_list[i].without_model = true;
5037 				over_list[i].without_model_state =
5038 					job_gres_data;
5039 			}
5040 		} else if (job_gres_data->type_name) {
5041 			over_list[i].with_model = true;
5042 			if (over_list[i].without_model)
5043 				overlap_merge = true;
5044 		} else {
5045 			over_list[i].without_model = true;
5046 			over_list[i].without_model_state = job_gres_data;
5047 			if (over_list[i].with_model)
5048 				overlap_merge = true;
5049 		}
5050 	}
5051 	if (have_gres_mps && (rc == SLURM_SUCCESS) && tres_freq &&
5052 	    strstr(tres_freq, "gpu")) {
5053 		rc = ESLURM_INVALID_GRES;
5054 	}
5055 
5056 	if (overlap_merge) {	/* Merge generic data if possible */
5057 		uint16_t cpus_per_gres;
5058 		uint64_t mem_per_gres;
5059 		for (i = 0; i < over_count; i++) {
5060 			if (!over_list[i].with_model ||
5061 			    !over_list[i].without_model_state)
5062 				continue;
5063 			if (!_generic_job_state(
5064 					over_list[i].without_model_state)) {
5065 				rc = ESLURM_INVALID_GRES_TYPE;
5066 				break;
5067 			}
5068 			/* Propagate generic parameters */
5069 			cpus_per_gres =
5070 				over_list[i].without_model_state->cpus_per_gres;
5071 			mem_per_gres =
5072 				over_list[i].without_model_state->mem_per_gres;
5073 			list_iterator_reset(iter);
5074 			while ((gres_state = (gres_state_t *)list_next(iter))) {
5075 				job_gres_data = (gres_job_state_t *)
5076 					gres_state->gres_data;
5077 				if (over_list[i].plugin_id !=
5078 				    gres_state->plugin_id)
5079 					continue;
5080 				if (job_gres_data ==
5081 				    over_list[i].without_model_state) {
5082 					list_remove(iter);
5083 					continue;
5084 				}
5085 				if (job_gres_data->cpus_per_gres == 0) {
5086 					job_gres_data->cpus_per_gres =
5087 						cpus_per_gres;
5088 				}
5089 				if (job_gres_data->mem_per_gres == 0) {
5090 					job_gres_data->mem_per_gres =
5091 						mem_per_gres;
5092 				}
5093 			}
5094 		}
5095 	}
5096 	list_iterator_destroy(iter);
5097 	xfree(over_list);
5098 
5099 	return rc;
5100 }
5101 
5102 /*
5103  * Determine if a job's specified GRES can be supported. This is designed to
5104  * prevent the running of a job using the GRES options only supported by the
5105  * select/cons_tres plugin when switching (on slurmctld restart) from the
5106  * cons_tres plugin to any other select plugin.
5107  *
5108  * IN gres_list - List of GRES records for this job to track usage
5109  * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
5110  */
gres_plugin_job_revalidate(List gres_list)5111 extern int gres_plugin_job_revalidate(List gres_list)
5112 {
5113 	gres_state_t *gres_state;
5114 	gres_job_state_t *job_gres_data;
5115 	ListIterator iter;
5116 	int rc = SLURM_SUCCESS;
5117 
5118 	if (!gres_list || (select_plugin_type == SELECT_TYPE_CONS_TRES))
5119 		return SLURM_SUCCESS;
5120 
5121 	iter = list_iterator_create(gres_list);
5122 	while ((gres_state = (gres_state_t *) list_next(iter))) {
5123 		job_gres_data = (gres_job_state_t *) gres_state->gres_data;
5124 		if (job_gres_data->gres_per_job ||
5125 		    job_gres_data->gres_per_socket ||
5126 		    job_gres_data->gres_per_task) {
5127 			rc = ESLURM_UNSUPPORTED_GRES;
5128 			break;
5129 		}
5130 	}
5131 	list_iterator_destroy(iter);
5132 
5133 	return rc;
5134 }
5135 
5136 /*
5137  * Return TRUE if any of this job's GRES has a populated gres_bit_alloc element.
5138  * This indicates the allocated GRES has a File configuration parameter and is
5139  * tracking individual file assignments.
5140  */
_job_has_gres_bits(List job_gres_list)5141 static bool _job_has_gres_bits(List job_gres_list)
5142 {
5143 	ListIterator job_gres_iter;
5144 	gres_state_t *gres_ptr;
5145 	gres_job_state_t *job_gres_ptr;
5146 	bool rc = false;
5147 	int i;
5148 
5149 	if (!job_gres_list)
5150 		return false;
5151 
5152 	job_gres_iter = list_iterator_create(job_gres_list);
5153 	while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
5154 		job_gres_ptr = gres_ptr->gres_data;
5155 		if (!job_gres_ptr)
5156 			continue;
5157 		for (i = 0; i < job_gres_ptr->node_cnt; i++) {
5158 			if (job_gres_ptr->gres_bit_alloc &&
5159 			    job_gres_ptr->gres_bit_alloc[i]) {
5160 				rc = true;
5161 				break;
5162 			}
5163 		}
5164 		if (rc)
5165 			break;
5166 	}
5167 	list_iterator_destroy(job_gres_iter);
5168 
5169 	return rc;
5170 }
5171 
5172 /*
5173  * Return count of configured GRES.
5174  * NOTE: For gres/mps return count of gres/gpu
5175  */
_get_node_gres_cnt(List node_gres_list,uint32_t plugin_id)5176 static int _get_node_gres_cnt(List node_gres_list, uint32_t plugin_id)
5177 {
5178 	ListIterator node_gres_iter;
5179 	gres_node_state_t *gres_node_ptr;
5180 	gres_state_t *gres_ptr;
5181 	int gres_cnt = 0;
5182 
5183 	if (!node_gres_list)
5184 		return 0;
5185 
5186 	if (plugin_id == mps_plugin_id)
5187 		plugin_id = gpu_plugin_id;
5188 	node_gres_iter = list_iterator_create(node_gres_list);
5189         while ((gres_ptr = (gres_state_t *) list_next(node_gres_iter))) {
5190 		if (gres_ptr->plugin_id != plugin_id)
5191 			continue;
5192 		gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
5193 		gres_cnt = (int) gres_node_ptr->gres_cnt_config;
5194 		break;
5195 	}
5196 	list_iterator_destroy(node_gres_iter);
5197 
5198 	return gres_cnt;
5199 }
5200 
5201 /*
5202  * Return TRUE if the identified node in the job allocation can satisfy the
5203  * job's GRES specification without change in its bitmaps. In other words,
5204  * return FALSE if the job allocation identifies specific GRES devices and the
5205  * count of those devices on this node has changed.
5206  *
5207  * IN job_gres_list - List of GRES records for this job to track usage
5208  * IN node_inx - zero-origin index into this job's node allocation
5209  * IN node_gres_list - List of GRES records for this node
5210  */
_validate_node_gres_cnt(uint32_t job_id,List job_gres_list,int node_inx,List node_gres_list,char * node_name)5211 static bool _validate_node_gres_cnt(uint32_t job_id, List job_gres_list,
5212 				    int node_inx, List node_gres_list,
5213 				    char *node_name)
5214 {
5215 	ListIterator job_gres_iter;
5216 	gres_state_t *gres_ptr;
5217 	gres_job_state_t *job_gres_ptr;
5218 	bool rc = true;
5219 	int job_gres_cnt, node_gres_cnt;
5220 
5221 	if (!job_gres_list)
5222 		return true;
5223 
5224 	(void) gres_plugin_init();
5225 
5226 	job_gres_iter = list_iterator_create(job_gres_list);
5227 	while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
5228 		job_gres_ptr = gres_ptr->gres_data;
5229 		if (!job_gres_ptr || !job_gres_ptr->gres_bit_alloc)
5230 			continue;
5231 		if ((node_inx >= job_gres_ptr->node_cnt) ||
5232 		    !job_gres_ptr->gres_bit_alloc[node_inx])
5233 			continue;
5234 		job_gres_cnt = bit_size(job_gres_ptr->gres_bit_alloc[node_inx]);
5235 		node_gres_cnt = _get_node_gres_cnt(node_gres_list,
5236 						   gres_ptr->plugin_id);
5237 		if (job_gres_cnt != node_gres_cnt) {
5238 			error("%s: Killing job %u: gres/%s count mismatch on node "
5239 			      "%s (%d != %d)",
5240 			      __func__, job_id, job_gres_ptr->gres_name,
5241 			      node_name, job_gres_cnt, node_gres_cnt);
5242 			rc = false;
5243 			break;
5244 		}
5245 	}
5246 	list_iterator_destroy(job_gres_iter);
5247 
5248 	return rc;
5249 }
5250 
5251 /*
5252  * Determine if a job's specified GRES are currently valid. This is designed to
5253  * manage jobs allocated GRES which are either no longer supported or a GRES
5254  * configured with the "File" option in gres.conf where the count has changed,
5255  * in which case we don't know how to map the job's old GRES bitmap onto the
5256  * current GRES bitmaps.
5257  *
5258  * IN job_id - ID of job being validated (used for logging)
5259  * IN job_gres_list - List of GRES records for this job to track usage
5260  * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
5261  */
gres_plugin_job_revalidate2(uint32_t job_id,List job_gres_list,bitstr_t * node_bitmap)5262 extern int gres_plugin_job_revalidate2(uint32_t job_id, List job_gres_list,
5263 				       bitstr_t *node_bitmap)
5264 {
5265 	node_record_t *node_ptr;
5266 	int rc = SLURM_SUCCESS;
5267 	int i_first, i_last, i;
5268 	int node_inx = -1;
5269 
5270 	if (!job_gres_list || !node_bitmap ||
5271 	    !_job_has_gres_bits(job_gres_list))
5272 		return SLURM_SUCCESS;
5273 
5274 	i_first = bit_ffs(node_bitmap);
5275 	if (i_first >= 0)
5276 		i_last = bit_fls(node_bitmap);
5277 	else
5278 		i_last = -2;
5279 	for (i = i_first; i <= i_last; i++) {
5280 		if (!bit_test(node_bitmap, i))
5281 			continue;
5282 		node_ptr = node_record_table_ptr + i;
5283 		node_inx++;
5284 		if (!_validate_node_gres_cnt(job_id, job_gres_list, node_inx,
5285 					     node_ptr->gres_list,
5286 					     node_ptr->name)) {
5287 			rc = ESLURM_INVALID_GRES;
5288 			break;
5289 		}
5290 	}
5291 
5292 	return rc;
5293 }
5294 
5295 /*
5296  * Find a sock_gres_t record in a list by matching the plugin_id and type_id
5297  *	from a gres_state_t job record
5298  * IN x - a sock_gres_t record to test
5299  * IN key - the gres_state_t record (from a job) we want to match
5300  * RET 1 on match, otherwise 0
5301  */
_find_sock_by_job_gres(void * x,void * key)5302 static int _find_sock_by_job_gres(void *x, void *key)
5303 {
5304 	sock_gres_t *sock_data = (sock_gres_t *) x;
5305 	gres_state_t *job_gres_state = (gres_state_t *) key;
5306 	gres_job_state_t *job_data;
5307 
5308 	job_data = (gres_job_state_t *) job_gres_state->gres_data;
5309 	if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
5310 	    (sock_data->type_id   == job_data->type_id))
5311 		return 1;
5312 	return 0;
5313 }
5314 
5315 /*
5316  * Find a gres_state_t job record in a list by matching the plugin_id and
5317  *	type_id from a sock_gres_t record
5318  * IN x - a gres_state_t record (from a job) to test
5319  * IN key - the sock_gres_t record we want to match
5320  * RET 1 on match, otherwise 0
5321  */
_find_job_by_sock_gres(void * x,void * key)5322 static int _find_job_by_sock_gres(void *x, void *key)
5323 {
5324 	gres_state_t *job_gres_state = (gres_state_t *) x;
5325 	gres_job_state_t *job_data;
5326 	sock_gres_t *sock_data = (sock_gres_t *) key;
5327 
5328 	job_data = (gres_job_state_t *) job_gres_state->gres_data;
5329 	if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
5330 	    (sock_data->type_id   == job_data->type_id))
5331 		return 1;
5332 	return 0;
5333 }
5334 
5335 /*
5336  * Clear GRES allocation info for all job GRES at start of scheduling cycle
5337  * Return TRUE if any gres_per_job constraints to satisfy
5338  */
gres_plugin_job_sched_init(List job_gres_list)5339 extern bool gres_plugin_job_sched_init(List job_gres_list)
5340 {
5341 	ListIterator iter;
5342 	gres_state_t *job_gres_state;
5343 	gres_job_state_t *job_data;
5344 	bool rc = false;
5345 
5346 	if (!job_gres_list)
5347 		return rc;
5348 
5349 	iter = list_iterator_create(job_gres_list);
5350 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5351 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5352 		if (!job_data->gres_per_job)
5353 			continue;
5354 		job_data->total_gres = 0;
5355 		rc = true;
5356 	}
5357 	list_iterator_destroy(iter);
5358 
5359 	return rc;
5360 }
5361 
5362 /*
5363  * Return TRUE if all gres_per_job specifications are satisfied
5364  */
gres_plugin_job_sched_test(List job_gres_list,uint32_t job_id)5365 extern bool gres_plugin_job_sched_test(List job_gres_list, uint32_t job_id)
5366 {
5367 	ListIterator iter;
5368 	gres_state_t *job_gres_state;
5369 	gres_job_state_t *job_data;
5370 	bool rc = true;
5371 
5372 	if (!job_gres_list)
5373 		return rc;
5374 
5375 	iter = list_iterator_create(job_gres_list);
5376 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5377 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5378 		if (job_data->gres_per_job &&
5379 		    (job_data->gres_per_job > job_data->total_gres)) {
5380 			rc = false;
5381 			break;
5382 		}
5383 	}
5384 	list_iterator_destroy(iter);
5385 
5386 	return rc;
5387 }
5388 
5389 /*
5390  * Return TRUE if all gres_per_job specifications will be satisfied with
5391  *	the addtitional resources provided by a single node
5392  * IN job_gres_list - List of job's GRES requirements (job_gres_state_t)
5393  * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5394  * IN job_id - The job being tested
5395  */
gres_plugin_job_sched_test2(List job_gres_list,List sock_gres_list,uint32_t job_id)5396 extern bool gres_plugin_job_sched_test2(List job_gres_list, List sock_gres_list,
5397 					uint32_t job_id)
5398 {
5399 	ListIterator iter;
5400 	gres_state_t *job_gres_state;
5401 	gres_job_state_t *job_data;
5402 	sock_gres_t *sock_data;
5403 	bool rc = true;
5404 
5405 	if (!job_gres_list)
5406 		return rc;
5407 
5408 	iter = list_iterator_create(job_gres_list);
5409 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5410 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5411 		if ((job_data->gres_per_job == 0) ||
5412 		    (job_data->gres_per_job < job_data->total_gres))
5413 			continue;
5414 		sock_data = list_find_first(sock_gres_list,
5415 					    _find_sock_by_job_gres,
5416 					    job_gres_state);
5417 		if (!sock_data ||
5418 		    (job_data->gres_per_job >
5419 		     (job_data->total_gres + sock_data->total_cnt))) {
5420 			rc = false;
5421 			break;
5422 		}
5423 	}
5424 	list_iterator_destroy(iter);
5425 
5426 	return rc;
5427 }
5428 
5429 /*
5430  * Update a job's total_gres counter as we add a node to potential allocation
5431  * IN job_gres_list - List of job's GRES requirements (job_gres_state_t)
5432  * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5433  * IN avail_cpus - CPUs currently available on this node
5434  */
gres_plugin_job_sched_add(List job_gres_list,List sock_gres_list,uint16_t avail_cpus)5435 extern void gres_plugin_job_sched_add(List job_gres_list, List sock_gres_list,
5436 				      uint16_t avail_cpus)
5437 {
5438 	ListIterator iter;
5439 	gres_state_t *job_gres_state;
5440 	gres_job_state_t *job_data;
5441 	sock_gres_t *sock_data;
5442 	uint64_t gres_limit;
5443 
5444 	if (!job_gres_list)
5445 		return;
5446 
5447 	iter = list_iterator_create(job_gres_list);
5448 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5449 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5450 		if (!job_data->gres_per_job)	/* Don't care about totals */
5451 			continue;
5452 		sock_data = list_find_first(sock_gres_list,
5453 					    _find_sock_by_job_gres,
5454 					    job_gres_state);
5455 		if (!sock_data)		/* None of this GRES available */
5456 			continue;
5457 		if (job_data->cpus_per_gres) {
5458 			gres_limit = avail_cpus / job_data->cpus_per_gres;
5459 			gres_limit = MIN(gres_limit, sock_data->total_cnt);
5460 		} else
5461 			gres_limit = sock_data->total_cnt;
5462 		job_data->total_gres += gres_limit;
5463 	}
5464 	list_iterator_destroy(iter);
5465 }
5466 
5467 /*
5468  * Create/update List GRES that can be made available on the specified node
5469  * IN/OUT consec_gres - List of sock_gres_t that can be made available on
5470  *			a set of nodes
5471  * IN job_gres_list - List of job's GRES requirements (gres_job_state_t)
5472  * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
5473  */
gres_plugin_job_sched_consec(List * consec_gres,List job_gres_list,List sock_gres_list)5474 extern void gres_plugin_job_sched_consec(List *consec_gres, List job_gres_list,
5475 					 List sock_gres_list)
5476 {
5477 	ListIterator iter;
5478 	gres_state_t *job_gres_state;
5479 	gres_job_state_t *job_data;
5480 	sock_gres_t *sock_data, *consec_data;
5481 
5482 	if (!job_gres_list)
5483 		return;
5484 
5485 	iter = list_iterator_create(job_gres_list);
5486 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5487 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5488 		if (!job_data->gres_per_job)	/* Don't care about totals */
5489 			continue;
5490 		sock_data = list_find_first(sock_gres_list,
5491 					    _find_sock_by_job_gres,
5492 					    job_gres_state);
5493 		if (!sock_data)		/* None of this GRES available */
5494 			continue;
5495 		if (*consec_gres == NULL)
5496 			*consec_gres = list_create(_sock_gres_del);
5497 		consec_data = list_find_first(*consec_gres,
5498 					      _find_sock_by_job_gres,
5499 					      job_gres_state);
5500 		if (!consec_data) {
5501 			consec_data = xmalloc(sizeof(sock_gres_t));
5502 			consec_data->plugin_id = sock_data->plugin_id;
5503 			consec_data->type_id   = sock_data->type_id;
5504 			list_append(*consec_gres, consec_data);
5505 		}
5506 		consec_data->total_cnt += sock_data->total_cnt;
5507 	}
5508 	list_iterator_destroy(iter);
5509 }
5510 
5511 /*
5512  * Determine if the additional sock_gres_list resources will result in
5513  * satisfying the job's gres_per_job constraints
5514  * IN job_gres_list - job's GRES requirements
5515  * IN sock_gres_list - available GRES in a set of nodes, data structure built
5516  *		       by gres_plugin_job_sched_consec()
5517  */
gres_plugin_job_sched_sufficient(List job_gres_list,List sock_gres_list)5518 extern bool gres_plugin_job_sched_sufficient(List job_gres_list,
5519 					     List sock_gres_list)
5520 {
5521 	ListIterator iter;
5522 	gres_state_t *job_gres_state;
5523 	gres_job_state_t *job_data;
5524 	sock_gres_t *sock_data;
5525 	bool rc = true;
5526 
5527 	if (!job_gres_list)
5528 		return true;
5529 	if (!sock_gres_list)
5530 		return false;
5531 
5532 	iter = list_iterator_create(job_gres_list);
5533 	while ((job_gres_state = (gres_state_t *) list_next(iter))) {
5534 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5535 		if (!job_data->gres_per_job)	/* Don't care about totals */
5536 			continue;
5537 		if (job_data->total_gres >= job_data->gres_per_job)
5538 			continue;
5539 		sock_data = list_find_first(sock_gres_list,
5540 					    _find_sock_by_job_gres,
5541 					    job_gres_state);
5542 		if (!sock_data)	{	/* None of this GRES available */
5543 			rc = false;
5544 			break;
5545 		}
5546 		if ((job_data->total_gres + sock_data->total_cnt) <
5547 		    job_data->gres_per_job) {
5548 			rc = false;
5549 			break;
5550 		}
5551 	}
5552 	list_iterator_destroy(iter);
5553 
5554 	return rc;
5555 }
5556 
5557 /*
5558  * Given a List of sock_gres_t entries, return a string identifying the
5559  * count of each GRES available on this set of nodes
5560  * IN sock_gres_list - count of GRES available in this group of nodes
5561  * IN job_gres_list - job GRES specification, used only to get GRES name/type
5562  * RET xfree the returned string
5563  */
gres_plugin_job_sched_str(List sock_gres_list,List job_gres_list)5564 extern char *gres_plugin_job_sched_str(List sock_gres_list, List job_gres_list)
5565 {
5566 	ListIterator iter;
5567 	sock_gres_t *sock_data;
5568 	gres_state_t *job_gres_state;
5569 	gres_job_state_t *job_data;
5570 	char *out_str = NULL, *sep;
5571 
5572 	if (!sock_gres_list)
5573 		return NULL;
5574 
5575 	iter = list_iterator_create(sock_gres_list);
5576 	while ((sock_data = (sock_gres_t *) list_next(iter))) {
5577 		job_gres_state = list_find_first(job_gres_list,
5578 					   _find_job_by_sock_gres, sock_data);
5579 		if (!job_gres_state) {	/* Should never happen */
5580 			error("%s: Could not find job GRES for type %u:%u",
5581 			      __func__, sock_data->plugin_id,
5582 			      sock_data->type_id);
5583 			continue;
5584 		}
5585 		job_data = (gres_job_state_t *) job_gres_state->gres_data;
5586 		if (out_str)
5587 			sep = ",";
5588 		else
5589 			sep = "GRES:";
5590 		if (job_data->type_name) {
5591 			xstrfmtcat(out_str, "%s%s:%s:%"PRIu64, sep,
5592 				   job_data->gres_name, job_data->type_name,
5593 				   sock_data->total_cnt);
5594 		} else {
5595 			xstrfmtcat(out_str, "%s%s:%"PRIu64, sep,
5596 				   job_data->gres_name, sock_data->total_cnt);
5597 		}
5598 	}
5599 	list_iterator_destroy(iter);
5600 
5601 	return out_str;
5602 }
5603 
5604 /*
5605  * Create a (partial) copy of a job's gres state for job binding
5606  * IN gres_list - List of Gres records for this job to track usage
5607  * RET The copy or NULL on failure
5608  * NOTE: Only job details are copied, NOT the job step details
5609  */
gres_plugin_job_state_dup(List gres_list)5610 extern List gres_plugin_job_state_dup(List gres_list)
5611 {
5612 	return gres_plugin_job_state_extract(gres_list, -1);
5613 }
5614 
5615 /* Copy gres_job_state_t record for ALL nodes */
_job_state_dup(void * gres_data)5616 static void *_job_state_dup(void *gres_data)
5617 {
5618 
5619 	int i;
5620 	gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
5621 	gres_job_state_t *new_gres_ptr;
5622 
5623 	if (gres_ptr == NULL)
5624 		return NULL;
5625 
5626 	new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
5627 	new_gres_ptr->cpus_per_gres	= gres_ptr->cpus_per_gres;
5628 	new_gres_ptr->gres_name		= xstrdup(gres_ptr->gres_name);
5629 	new_gres_ptr->gres_per_job	= gres_ptr->gres_per_job;
5630 	new_gres_ptr->gres_per_node	= gres_ptr->gres_per_node;
5631 	new_gres_ptr->gres_per_socket	= gres_ptr->gres_per_socket;
5632 	new_gres_ptr->gres_per_task	= gres_ptr->gres_per_task;
5633 	new_gres_ptr->mem_per_gres	= gres_ptr->mem_per_gres;
5634 	new_gres_ptr->node_cnt		= gres_ptr->node_cnt;
5635 	new_gres_ptr->total_gres	= gres_ptr->total_gres;
5636 	new_gres_ptr->type_id		= gres_ptr->type_id;
5637 	new_gres_ptr->type_name		= xstrdup(gres_ptr->type_name);
5638 
5639 	if (gres_ptr->gres_cnt_node_alloc) {
5640 		i = sizeof(uint64_t) * gres_ptr->node_cnt;
5641 		new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
5642 		memcpy(new_gres_ptr->gres_cnt_node_alloc,
5643 		       gres_ptr->gres_cnt_node_alloc, i);
5644 	}
5645 	if (gres_ptr->gres_bit_alloc) {
5646 		new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
5647 						       sizeof(bitstr_t *));
5648 		for (i = 0; i < gres_ptr->node_cnt; i++) {
5649 			if (gres_ptr->gres_bit_alloc[i] == NULL)
5650 				continue;
5651 			new_gres_ptr->gres_bit_alloc[i] =
5652 				bit_copy(gres_ptr->gres_bit_alloc[i]);
5653 		}
5654 	}
5655 	return new_gres_ptr;
5656 }
5657 
5658 /* Copy gres_job_state_t record for one specific node */
_job_state_dup2(void * gres_data,int node_index)5659 static void *_job_state_dup2(void *gres_data, int node_index)
5660 {
5661 
5662 	gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
5663 	gres_job_state_t *new_gres_ptr;
5664 
5665 	if (gres_ptr == NULL)
5666 		return NULL;
5667 
5668 	new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
5669 	new_gres_ptr->cpus_per_gres	= gres_ptr->cpus_per_gres;
5670 	new_gres_ptr->gres_name		= xstrdup(gres_ptr->gres_name);
5671 	new_gres_ptr->gres_per_job	= gres_ptr->gres_per_job;
5672 	new_gres_ptr->gres_per_node	= gres_ptr->gres_per_node;
5673 	new_gres_ptr->gres_per_socket	= gres_ptr->gres_per_socket;
5674 	new_gres_ptr->gres_per_task	= gres_ptr->gres_per_task;
5675 	new_gres_ptr->mem_per_gres	= gres_ptr->mem_per_gres;
5676 	new_gres_ptr->node_cnt		= 1;
5677 	new_gres_ptr->total_gres	= gres_ptr->total_gres;
5678 	new_gres_ptr->type_id		= gres_ptr->type_id;
5679 	new_gres_ptr->type_name		= xstrdup(gres_ptr->type_name);
5680 
5681 	if (gres_ptr->gres_cnt_node_alloc) {
5682 		new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
5683 		new_gres_ptr->gres_cnt_node_alloc[0] =
5684 		       gres_ptr->gres_cnt_node_alloc[node_index];
5685 	}
5686 	if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[node_index]) {
5687 		new_gres_ptr->gres_bit_alloc	= xmalloc(sizeof(bitstr_t *));
5688 		new_gres_ptr->gres_bit_alloc[0] =
5689 				bit_copy(gres_ptr->gres_bit_alloc[node_index]);
5690 	}
5691 	return new_gres_ptr;
5692 }
5693 
5694 /*
5695  * Create a (partial) copy of a job's gres state for a particular node index
5696  * IN gres_list - List of Gres records for this job to track usage
5697  * IN node_index - zero-origin index to the node
5698  * RET The copy or NULL on failure
5699  */
gres_plugin_job_state_extract(List gres_list,int node_index)5700 extern List gres_plugin_job_state_extract(List gres_list, int node_index)
5701 {
5702 	ListIterator gres_iter;
5703 	gres_state_t *gres_ptr, *new_gres_state;
5704 	List new_gres_list = NULL;
5705 	void *new_gres_data;
5706 
5707 	if (gres_list == NULL)
5708 		return new_gres_list;
5709 
5710 	(void) gres_plugin_init();
5711 
5712 	slurm_mutex_lock(&gres_context_lock);
5713 	gres_iter = list_iterator_create(gres_list);
5714 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
5715 		if (node_index == -1)
5716 			new_gres_data = _job_state_dup(gres_ptr->gres_data);
5717 		else {
5718 			new_gres_data = _job_state_dup2(gres_ptr->gres_data,
5719 							node_index);
5720 		}
5721 		if (new_gres_data == NULL)
5722 			break;
5723 		if (new_gres_list == NULL) {
5724 			new_gres_list = list_create(_gres_job_list_delete);
5725 		}
5726 		new_gres_state = xmalloc(sizeof(gres_state_t));
5727 		new_gres_state->plugin_id = gres_ptr->plugin_id;
5728 		new_gres_state->gres_data = new_gres_data;
5729 		list_append(new_gres_list, new_gres_state);
5730 	}
5731 	list_iterator_destroy(gres_iter);
5732 	slurm_mutex_unlock(&gres_context_lock);
5733 
5734 	return new_gres_list;
5735 }
5736 
5737 /*
5738  * Pack a job's current gres status, called from slurmctld for save/restore
5739  * IN gres_list - generated by gres_plugin_job_config_validate()
5740  * IN/OUT buffer - location to write state to
5741  * IN job_id - job's ID
5742  * IN details - if set then pack job step allocation details (only needed to
5743  *	 	save/restore job state, not needed in job credential for
5744  *		slurmd task binding)
5745  *
5746  * NOTE: A job's allocation to steps is not recorded here, but recovered with
5747  *	 the job step state information upon slurmctld restart.
5748  */
gres_plugin_job_state_pack(List gres_list,Buf buffer,uint32_t job_id,bool details,uint16_t protocol_version)5749 extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
5750 				      uint32_t job_id, bool details,
5751 				      uint16_t protocol_version)
5752 {
5753 	int i, rc = SLURM_SUCCESS;
5754 	uint32_t top_offset, tail_offset;
5755 	uint32_t magic = GRES_MAGIC;
5756 	uint16_t rec_cnt = 0;
5757 	ListIterator gres_iter;
5758 	gres_state_t *gres_ptr;
5759 	gres_job_state_t *gres_job_ptr;
5760 
5761 	top_offset = get_buf_offset(buffer);
5762 	pack16(rec_cnt, buffer);	/* placeholder if data */
5763 
5764 	if (gres_list == NULL)
5765 		return rc;
5766 
5767 	(void) gres_plugin_init();
5768 
5769 	slurm_mutex_lock(&gres_context_lock);
5770 	gres_iter = list_iterator_create(gres_list);
5771 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
5772 		gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
5773 
5774 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
5775 			pack32(magic, buffer);
5776 			pack32(gres_ptr->plugin_id, buffer);
5777 			pack16(gres_job_ptr->cpus_per_gres, buffer);
5778 			pack16(gres_job_ptr->flags, buffer);
5779 			pack64(gres_job_ptr->gres_per_job, buffer);
5780 			pack64(gres_job_ptr->gres_per_node, buffer);
5781 			pack64(gres_job_ptr->gres_per_socket, buffer);
5782 			pack64(gres_job_ptr->gres_per_task, buffer);
5783 			pack64(gres_job_ptr->mem_per_gres, buffer);
5784 			pack64(gres_job_ptr->total_gres, buffer);
5785 			packstr(gres_job_ptr->type_name, buffer);
5786 			pack32(gres_job_ptr->node_cnt, buffer);
5787 
5788 			if (gres_job_ptr->gres_cnt_node_alloc) {
5789 				pack8((uint8_t) 1, buffer);
5790 				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
5791 					     gres_job_ptr->node_cnt, buffer);
5792 			} else {
5793 				pack8((uint8_t) 0, buffer);
5794 			}
5795 
5796 			if (gres_job_ptr->gres_bit_alloc) {
5797 				pack8((uint8_t) 1, buffer);
5798 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5799 					pack_bit_str_hex(gres_job_ptr->
5800 							 gres_bit_alloc[i],
5801 							 buffer);
5802 				}
5803 			} else {
5804 				pack8((uint8_t) 0, buffer);
5805 			}
5806 			if (details && gres_job_ptr->gres_bit_step_alloc) {
5807 				pack8((uint8_t) 1, buffer);
5808 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5809 					pack_bit_str_hex(gres_job_ptr->
5810 							 gres_bit_step_alloc[i],
5811 							 buffer);
5812 				}
5813 			} else {
5814 				pack8((uint8_t) 0, buffer);
5815 			}
5816 			if (details && gres_job_ptr->gres_cnt_step_alloc) {
5817 				pack8((uint8_t) 1, buffer);
5818 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5819 					pack64(gres_job_ptr->
5820 					       gres_cnt_step_alloc[i],
5821 					       buffer);
5822 				}
5823 			} else {
5824 				pack8((uint8_t) 0, buffer);
5825 			}
5826 			rec_cnt++;
5827 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
5828 			pack32(magic, buffer);
5829 			pack32(gres_ptr->plugin_id, buffer);
5830 			pack16(gres_job_ptr->cpus_per_gres, buffer);
5831 			pack64(gres_job_ptr->gres_per_job, buffer);
5832 			pack64(gres_job_ptr->gres_per_node, buffer);
5833 			pack64(gres_job_ptr->gres_per_socket, buffer);
5834 			pack64(gres_job_ptr->gres_per_task, buffer);
5835 			pack64(gres_job_ptr->mem_per_gres, buffer);
5836 			pack64(gres_job_ptr->total_gres, buffer);
5837 			packstr(gres_job_ptr->type_name, buffer);
5838 			pack32(gres_job_ptr->node_cnt, buffer);
5839 
5840 			if (gres_job_ptr->gres_cnt_node_alloc) {
5841 				pack8((uint8_t) 1, buffer);
5842 				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
5843 					     gres_job_ptr->node_cnt, buffer);
5844 			} else {
5845 				pack8((uint8_t) 0, buffer);
5846 			}
5847 
5848 			if (gres_job_ptr->gres_bit_alloc) {
5849 				pack8((uint8_t) 1, buffer);
5850 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5851 					pack_bit_str_hex(gres_job_ptr->
5852 							 gres_bit_alloc[i],
5853 							 buffer);
5854 				}
5855 			} else {
5856 				pack8((uint8_t) 0, buffer);
5857 			}
5858 			if (details && gres_job_ptr->gres_bit_step_alloc) {
5859 				pack8((uint8_t) 1, buffer);
5860 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5861 					pack_bit_str_hex(gres_job_ptr->
5862 							 gres_bit_step_alloc[i],
5863 							 buffer);
5864 				}
5865 			} else {
5866 				pack8((uint8_t) 0, buffer);
5867 			}
5868 			if (details && gres_job_ptr->gres_cnt_step_alloc) {
5869 				pack8((uint8_t) 1, buffer);
5870 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5871 					pack64(gres_job_ptr->
5872 					       gres_cnt_step_alloc[i],
5873 					       buffer);
5874 				}
5875 			} else {
5876 				pack8((uint8_t) 0, buffer);
5877 			}
5878 			rec_cnt++;
5879 		} else {
5880 			error("%s: protocol_version %hu not supported",
5881 			      __func__, protocol_version);
5882 			break;
5883 		}
5884 	}
5885 	list_iterator_destroy(gres_iter);
5886 	slurm_mutex_unlock(&gres_context_lock);
5887 
5888 	tail_offset = get_buf_offset(buffer);
5889 	set_buf_offset(buffer, top_offset);
5890 	pack16(rec_cnt, buffer);
5891 	set_buf_offset(buffer, tail_offset);
5892 
5893 	return rc;
5894 }
5895 
5896 /*
5897  * Unpack a job's current gres status, called from slurmctld for save/restore
5898  * OUT gres_list - restored state stored by gres_plugin_job_state_pack()
5899  * IN/OUT buffer - location to read state from
5900  * IN job_id - job's ID
5901  */
gres_plugin_job_state_unpack(List * gres_list,Buf buffer,uint32_t job_id,uint16_t protocol_version)5902 extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
5903 					uint32_t job_id,
5904 					uint16_t protocol_version)
5905 {
5906 	int i = 0, rc;
5907 	uint32_t magic = 0, plugin_id = 0, utmp32 = 0;
5908 	uint16_t rec_cnt = 0;
5909 	uint8_t  has_more = 0;
5910 	gres_state_t *gres_ptr;
5911 	gres_job_state_t *gres_job_ptr = NULL;
5912 
5913 	safe_unpack16(&rec_cnt, buffer);
5914 	if (rec_cnt == 0)
5915 		return SLURM_SUCCESS;
5916 
5917 	rc = gres_plugin_init();
5918 
5919 	slurm_mutex_lock(&gres_context_lock);
5920 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
5921 		*gres_list = list_create(_gres_job_list_delete);
5922 	}
5923 
5924 	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
5925 		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
5926 			break;
5927 		rec_cnt--;
5928 
5929 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
5930 			safe_unpack32(&magic, buffer);
5931 			if (magic != GRES_MAGIC)
5932 				goto unpack_error;
5933 			safe_unpack32(&plugin_id, buffer);
5934 			gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
5935 			safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
5936 			safe_unpack16(&gres_job_ptr->flags, buffer);
5937 			safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
5938 			safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
5939 			safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
5940 			safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
5941 			safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
5942 			safe_unpack64(&gres_job_ptr->total_gres, buffer);
5943 			safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
5944 					       &utmp32, buffer);
5945 			gres_job_ptr->type_id =
5946 				gres_plugin_build_id(gres_job_ptr->type_name);
5947 			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
5948 			if (gres_job_ptr->node_cnt > NO_VAL)
5949 				goto unpack_error;
5950 
5951 			safe_unpack8(&has_more, buffer);
5952 			if (has_more) {
5953 				safe_unpack64_array(
5954 					&gres_job_ptr->gres_cnt_node_alloc,
5955 					&utmp32, buffer);
5956 			}
5957 
5958 			safe_unpack8(&has_more, buffer);
5959 			if (has_more) {
5960 				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
5961 					     gres_job_ptr->node_cnt,
5962 					     sizeof(bitstr_t *));
5963 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5964 					unpack_bit_str_hex(&gres_job_ptr->
5965 							   gres_bit_alloc[i],
5966 							   buffer);
5967 				}
5968 			}
5969 			safe_unpack8(&has_more, buffer);
5970 			if (has_more) {
5971 				safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
5972 					     gres_job_ptr->node_cnt,
5973 					     sizeof(bitstr_t *));
5974 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5975 					unpack_bit_str_hex(&gres_job_ptr->
5976 							   gres_bit_step_alloc[i],
5977 							   buffer);
5978 				}
5979 			}
5980 			safe_unpack8(&has_more, buffer);
5981 			if (has_more) {
5982 				safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
5983 					     gres_job_ptr->node_cnt,
5984 					     sizeof(uint64_t));
5985 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
5986 					safe_unpack64(&gres_job_ptr->
5987 						      gres_cnt_step_alloc[i],
5988 						      buffer);
5989 				}
5990 			}
5991 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
5992 			safe_unpack32(&magic, buffer);
5993 			if (magic != GRES_MAGIC)
5994 				goto unpack_error;
5995 			safe_unpack32(&plugin_id, buffer);
5996 			gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
5997 			safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
5998 			safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
5999 			safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
6000 			safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
6001 			safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
6002 			safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
6003 			safe_unpack64(&gres_job_ptr->total_gres, buffer);
6004 			safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
6005 					       &utmp32, buffer);
6006 			gres_job_ptr->type_id =
6007 				gres_plugin_build_id(gres_job_ptr->type_name);
6008 			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
6009 			if (gres_job_ptr->node_cnt > NO_VAL)
6010 				goto unpack_error;
6011 
6012 			safe_unpack8(&has_more, buffer);
6013 			if (has_more) {
6014 				safe_unpack64_array(
6015 					&gres_job_ptr->gres_cnt_node_alloc,
6016 					&utmp32, buffer);
6017 			}
6018 
6019 			safe_unpack8(&has_more, buffer);
6020 			if (has_more) {
6021 				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
6022 					     gres_job_ptr->node_cnt,
6023 					     sizeof(bitstr_t *));
6024 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
6025 					unpack_bit_str_hex(&gres_job_ptr->
6026 							   gres_bit_alloc[i],
6027 							   buffer);
6028 				}
6029 			}
6030 			safe_unpack8(&has_more, buffer);
6031 			if (has_more) {
6032 				safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
6033 					     gres_job_ptr->node_cnt,
6034 					     sizeof(bitstr_t *));
6035 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
6036 					unpack_bit_str_hex(&gres_job_ptr->
6037 							   gres_bit_step_alloc[i],
6038 							   buffer);
6039 				}
6040 			}
6041 			safe_unpack8(&has_more, buffer);
6042 			if (has_more) {
6043 				safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
6044 					     gres_job_ptr->node_cnt,
6045 					     sizeof(uint64_t));
6046 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
6047 					safe_unpack64(&gres_job_ptr->
6048 						      gres_cnt_step_alloc[i],
6049 						      buffer);
6050 				}
6051 			}
6052 		} else {
6053 			error("%s: protocol_version %hu not supported",
6054 			      __func__, protocol_version);
6055 			goto unpack_error;
6056 		}
6057 
6058 		for (i = 0; i < gres_context_cnt; i++) {
6059 			if (gres_context[i].plugin_id == plugin_id)
6060 				break;
6061 		}
6062 		if (i >= gres_context_cnt) {
6063 			/*
6064 			 * A likely sign that GresPlugins has changed.
6065 			 * Not a fatal error, skip over the data.
6066 			 */
6067 			error("%s: no plugin configured to unpack data type %u from job %u",
6068 			      __func__, plugin_id, job_id);
6069 			_job_state_delete(gres_job_ptr);
6070 			continue;
6071 		}
6072 		gres_job_ptr->gres_name = xstrdup(gres_context[i].gres_name);
6073 		gres_ptr = xmalloc(sizeof(gres_state_t));
6074 		gres_ptr->plugin_id = gres_context[i].plugin_id;
6075 		gres_ptr->gres_data = gres_job_ptr;
6076 		gres_job_ptr = NULL;	/* nothing left to free on error */
6077 		list_append(*gres_list, gres_ptr);
6078 	}
6079 	slurm_mutex_unlock(&gres_context_lock);
6080 	return rc;
6081 
6082 unpack_error:
6083 	error("%s: unpack error from job %u", __func__, job_id);
6084 	if (gres_job_ptr)
6085 		_job_state_delete(gres_job_ptr);
6086 	slurm_mutex_unlock(&gres_context_lock);
6087 	return SLURM_ERROR;
6088 }
6089 
6090 /*
6091  * Pack a job's allocated gres information for use by prolog/epilog
6092  * IN gres_list - generated by gres_plugin_job_config_validate()
6093  * IN/OUT buffer - location to write state to
6094  */
gres_plugin_job_alloc_pack(List gres_list,Buf buffer,uint16_t protocol_version)6095 extern int gres_plugin_job_alloc_pack(List gres_list, Buf buffer,
6096 				      uint16_t protocol_version)
6097 {
6098 	int i, rc = SLURM_SUCCESS;
6099 	uint32_t top_offset, tail_offset;
6100 	uint32_t magic = GRES_MAGIC;
6101 	uint16_t rec_cnt = 0;
6102 	ListIterator gres_iter;
6103 	gres_epilog_info_t *gres_job_ptr;
6104 
6105 	top_offset = get_buf_offset(buffer);
6106 	pack16(rec_cnt, buffer);	/* placeholder if data */
6107 
6108 	if (gres_list == NULL)
6109 		return rc;
6110 
6111 	(void) gres_plugin_init();
6112 
6113 	slurm_mutex_lock(&gres_context_lock);
6114 	gres_iter = list_iterator_create(gres_list);
6115 	while ((gres_job_ptr = (gres_epilog_info_t *) list_next(gres_iter))) {
6116 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
6117 			pack32(magic, buffer);
6118 			pack32(gres_job_ptr->plugin_id, buffer);
6119 			pack32(gres_job_ptr->node_cnt, buffer);
6120 			if (gres_job_ptr->gres_cnt_node_alloc) {
6121 				pack8((uint8_t) 1, buffer);
6122 				pack64_array(gres_job_ptr->gres_cnt_node_alloc,
6123 					     gres_job_ptr->node_cnt, buffer);
6124 			} else {
6125 				pack8((uint8_t) 0, buffer);
6126 			}
6127 			if (gres_job_ptr->gres_bit_alloc) {
6128 				pack8((uint8_t) 1, buffer);
6129 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
6130 					pack_bit_str_hex(gres_job_ptr->
6131 							 gres_bit_alloc[i],
6132 							 buffer);
6133 				}
6134 			} else {
6135 				pack8((uint8_t) 0, buffer);
6136 			}
6137 			rec_cnt++;
6138 		} else {
6139 			error("%s: protocol_version %hu not supported",
6140 			      __func__, protocol_version);
6141 			break;
6142 		}
6143 	}
6144 	list_iterator_destroy(gres_iter);
6145 	slurm_mutex_unlock(&gres_context_lock);
6146 
6147 	tail_offset = get_buf_offset(buffer);
6148 	set_buf_offset(buffer, top_offset);
6149 	pack16(rec_cnt, buffer);
6150 	set_buf_offset(buffer, tail_offset);
6151 
6152 	return rc;
6153 }
6154 
_epilog_list_del(void * x)6155 static void _epilog_list_del(void *x)
6156 {
6157 	gres_epilog_info_t *epilog_info = (gres_epilog_info_t *) x;
6158 	int i;
6159 
6160 	if (!epilog_info)
6161 		return;
6162 
6163 	if (epilog_info->gres_bit_alloc) {
6164 		for (i = 0; i < epilog_info->node_cnt; i++)
6165 			FREE_NULL_BITMAP(epilog_info->gres_bit_alloc[i]);
6166 		xfree(epilog_info->gres_bit_alloc);
6167 	}
6168 	xfree(epilog_info->gres_cnt_node_alloc);
6169 	xfree(epilog_info->node_list);
6170 	xfree(epilog_info);
6171 }
6172 
6173 /*
6174  * Unpack a job's allocated gres information for use by prolog/epilog
6175  * OUT gres_list - restored state stored by gres_plugin_job_alloc_pack()
6176  * IN/OUT buffer - location to read state from
6177  */
gres_plugin_job_alloc_unpack(List * gres_list,Buf buffer,uint16_t protocol_version)6178 extern int gres_plugin_job_alloc_unpack(List *gres_list, Buf buffer,
6179 					uint16_t protocol_version)
6180 {
6181 	int i = 0, rc;
6182 	uint32_t magic = 0, utmp32 = 0;
6183 	uint16_t rec_cnt = 0;
6184 	uint8_t filled = 0;
6185 	gres_epilog_info_t *gres_job_ptr = NULL;
6186 
6187 	safe_unpack16(&rec_cnt, buffer);
6188 	if (rec_cnt == 0)
6189 		return SLURM_SUCCESS;
6190 
6191 	rc = gres_plugin_init();
6192 
6193 	slurm_mutex_lock(&gres_context_lock);
6194 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
6195 		*gres_list = list_create(_epilog_list_del);
6196 	}
6197 
6198 	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
6199 		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
6200 			break;
6201 		rec_cnt--;
6202 
6203 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
6204 			safe_unpack32(&magic, buffer);
6205 			if (magic != GRES_MAGIC)
6206 				goto unpack_error;
6207 			gres_job_ptr = xmalloc(sizeof(gres_epilog_info_t));
6208 			safe_unpack32(&gres_job_ptr->plugin_id, buffer);
6209 			safe_unpack32(&gres_job_ptr->node_cnt, buffer);
6210 			if (gres_job_ptr->node_cnt > NO_VAL)
6211 				goto unpack_error;
6212 			safe_unpack8(&filled, buffer);
6213 			if (filled) {
6214 				safe_unpack64_array(
6215 					&gres_job_ptr->gres_cnt_node_alloc,
6216 					&utmp32, buffer);
6217 			}
6218 			safe_unpack8(&filled, buffer);
6219 			if (filled) {
6220 				safe_xcalloc(gres_job_ptr->gres_bit_alloc,
6221 					     gres_job_ptr->node_cnt,
6222 					     sizeof(bitstr_t *));
6223 				for (i = 0; i < gres_job_ptr->node_cnt; i++) {
6224 					unpack_bit_str_hex(&gres_job_ptr->
6225 							   gres_bit_alloc[i],
6226 							   buffer);
6227 				}
6228 			}
6229 		} else {
6230 			error("%s: protocol_version %hu not supported",
6231 			      __func__, protocol_version);
6232 			goto unpack_error;
6233 		}
6234 
6235 		for (i = 0; i < gres_context_cnt; i++) {
6236 			if (gres_context[i].plugin_id ==
6237 			    gres_job_ptr->plugin_id)
6238 				break;
6239 		}
6240 		if (i >= gres_context_cnt) {
6241 			/*
6242 			 * A likely sign that GresPlugins has changed.
6243 			 * Not a fatal error, skip over the data.
6244 			 */
6245 			error("%s: no plugin configured to unpack data type %u",
6246 			      __func__, gres_job_ptr->plugin_id);
6247 			_epilog_list_del(gres_job_ptr);
6248 			continue;
6249 		}
6250 		list_append(*gres_list, gres_job_ptr);
6251 		gres_job_ptr = NULL;
6252 	}
6253 	slurm_mutex_unlock(&gres_context_lock);
6254 	return rc;
6255 
6256 unpack_error:
6257 	error("%s: unpack error", __func__);
6258 	if (gres_job_ptr)
6259 		_epilog_list_del(gres_job_ptr);
6260 	slurm_mutex_unlock(&gres_context_lock);
6261 	return SLURM_ERROR;
6262 }
6263 
6264 /*
6265  * Build List of information needed to set job's Prolog or Epilog environment
6266  * variables
6267  *
6268  * IN job_gres_list - job's GRES allocation info
6269  * IN hostlist - list of nodes associated with the job
6270  * RET information about the job's GRES allocation needed by Prolog or Epilog
6271  */
gres_plugin_epilog_build_env(List job_gres_list,char * node_list)6272 extern List gres_plugin_epilog_build_env(List job_gres_list, char *node_list)
6273 {
6274 	int i;
6275 	ListIterator gres_iter;
6276 	gres_state_t *gres_ptr = NULL;
6277 	gres_epilog_info_t *epilog_info;
6278 	List epilog_gres_list = NULL;
6279 
6280 	if (!job_gres_list)
6281 		return NULL;
6282 
6283 	(void) gres_plugin_init();
6284 
6285 	slurm_mutex_lock(&gres_context_lock);
6286 	gres_iter = list_iterator_create(job_gres_list);
6287 	while ((gres_ptr = list_next(gres_iter))) {
6288 		for (i = 0; i < gres_context_cnt; i++) {
6289 			if (gres_ptr->plugin_id == gres_context[i].plugin_id)
6290 				break;
6291 		}
6292 		if (i >= gres_context_cnt) {
6293 			error("%s: gres not found in context.  This should never happen",
6294 			      __func__);
6295 			continue;
6296 		}
6297 
6298 		if (!gres_context[i].ops.epilog_build_env)
6299 			continue;	/* No plugin to call */
6300 		epilog_info = (*(gres_context[i].ops.epilog_build_env))
6301 				(gres_ptr->gres_data);
6302 		if (!epilog_info)
6303 			continue;	/* No info to add for this plugin */
6304 		if (!epilog_gres_list)
6305 			epilog_gres_list = list_create(_epilog_list_del);
6306 		epilog_info->plugin_id = gres_context[i].plugin_id;
6307 		epilog_info->node_list = xstrdup(node_list);
6308 		list_append(epilog_gres_list, epilog_info);
6309 	}
6310 	list_iterator_destroy(gres_iter);
6311 	slurm_mutex_unlock(&gres_context_lock);
6312 
6313 	return epilog_gres_list;
6314 }
6315 
6316 /*
6317  * Set environment variables as appropriate for a job's prolog or epilog based
6318  * GRES allocated to the job.
6319  *
6320  * IN/OUT epilog_env_ptr - environment variable array
6321  * IN epilog_gres_list - generated by TBD
6322  * IN node_inx - zero origin node index
6323  */
gres_plugin_epilog_set_env(char *** epilog_env_ptr,List epilog_gres_list,int node_inx)6324 extern void gres_plugin_epilog_set_env(char ***epilog_env_ptr,
6325 				       List epilog_gres_list, int node_inx)
6326 {
6327 	int i;
6328 	ListIterator epilog_iter;
6329 	gres_epilog_info_t *epilog_info;
6330 
6331 	*epilog_env_ptr = NULL;
6332 	if (!epilog_gres_list)
6333 		return;
6334 
6335 	(void) gres_plugin_init();
6336 
6337 	slurm_mutex_lock(&gres_context_lock);
6338 	epilog_iter = list_iterator_create(epilog_gres_list);
6339 	while ((epilog_info = list_next(epilog_iter))) {
6340 		for (i = 0; i < gres_context_cnt; i++) {
6341 			if (epilog_info->plugin_id == gres_context[i].plugin_id)
6342 				break;
6343 		}
6344 		if (i >= gres_context_cnt) {
6345 			error("%s: GRES ID %u not found in context",
6346 			      __func__, epilog_info->plugin_id);
6347 			continue;
6348 		}
6349 
6350 		if (!gres_context[i].ops.epilog_set_env)
6351 			continue;	/* No plugin to call */
6352 		(*(gres_context[i].ops.epilog_set_env))
6353 			(epilog_env_ptr, epilog_info, node_inx);
6354 	}
6355 	list_iterator_destroy(epilog_iter);
6356 	slurm_mutex_unlock(&gres_context_lock);
6357 }
6358 
6359 /*
6360  * If core bitmap from slurmd differs in size from that in slurmctld,
6361  * then modify bitmap from slurmd so we can use bit_and, bit_or, etc.
6362  */
_core_bitmap_rebuild(bitstr_t * old_core_bitmap,int new_size)6363 static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size)
6364 {
6365 	int i, j, old_size, ratio;
6366 	bitstr_t *new_core_bitmap;
6367 
6368 	new_core_bitmap = bit_alloc(new_size);
6369 	old_size = bit_size(old_core_bitmap);
6370 	if (old_size > new_size) {
6371 		ratio = old_size / new_size;
6372 		for (i = 0; i < new_size; i++) {
6373 			for (j = 0; j < ratio; j++) {
6374 				if (bit_test(old_core_bitmap, i*ratio+j)) {
6375 					bit_set(new_core_bitmap, i);
6376 					break;
6377 				}
6378 			}
6379 		}
6380 	} else {
6381 		ratio = new_size / old_size;
6382 		for (i = 0; i < old_size; i++) {
6383 			if (!bit_test(old_core_bitmap, i))
6384 				continue;
6385 			for (j = 0; j < ratio; j++) {
6386 				bit_set(new_core_bitmap, i*ratio+j);
6387 			}
6388 		}
6389 	}
6390 
6391 	return new_core_bitmap;
6392 }
6393 
/*
 * Make every topology core bitmap of a node's GRES the size expected by
 * slurmctld. Bitmaps of a different size are replaced by the result of
 * _core_bitmap_rebuild(); NULL entries are left alone.
 *
 * IN/OUT node_gres_ptr - node GRES state, topo_core_bitmap[] may be replaced
 * IN cores_ctld - core count expected by slurmctld
 * IN node_name - name of the node (for logging)
 */
static void _validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
				      int cores_ctld, char *node_name)
{
	bool reported = false;	/* Log the size mismatch only once */
	int t;

	if (node_gres_ptr->topo_cnt == 0)
		return;

	if (!node_gres_ptr->topo_core_bitmap) {
		error("Gres topo_core_bitmap is NULL on node %s", node_name);
		return;
	}

	for (t = 0; t < node_gres_ptr->topo_cnt; t++) {
		bitstr_t *resized;
		int cores_slurmd;

		if (node_gres_ptr->topo_core_bitmap[t] == NULL)
			continue;
		cores_slurmd = bit_size(node_gres_ptr->topo_core_bitmap[t]);
		if (cores_slurmd == cores_ctld)
			continue;	/* Already the right size */

		if (!reported) {
			debug("Rebuilding node %s gres core bitmap (%d != %d)",
			      node_name, cores_slurmd, cores_ctld);
			reported = true;
		}
		resized = _core_bitmap_rebuild(
				node_gres_ptr->topo_core_bitmap[t],
				cores_ctld);
		FREE_NULL_BITMAP(node_gres_ptr->topo_core_bitmap[t]);
		node_gres_ptr->topo_core_bitmap[t] = resized;
	}
}
6428 
/*
 * Clear bits in core_bitmap (within this node's range) for cores that are
 * not associated with any topology entry able to supply the GRES requested
 * per node by the job.
 *
 * IN job_gres_data - job's GRES state (gres_job_state_t)
 * IN node_gres_data - node's GRES state (gres_node_state_t); its topology
 *		       core bitmaps may be resized to match slurmctld
 * IN use_total_gres - if set, ignore GRES already allocated to jobs
 * IN/OUT core_bitmap - available cores, nothing is done if NULL
 * IN core_start_bit - index into core_bitmap for this node's first core
 * IN core_end_bit - index into core_bitmap for this node's last core
 * IN gres_name - name of the GRES (unused here)
 * IN node_name - name of the node (for logging)
 * IN plugin_id - ID of the GRES plugin
 */
static void	_job_core_filter(void *job_gres_data, void *node_gres_data,
				 bool use_total_gres, bitstr_t *core_bitmap,
				 int core_start_bit, int core_end_bit,
				 char *gres_name, char *node_name,
				 uint32_t plugin_id)
{
	gres_job_state_t  *job_state  = job_gres_data;
	gres_node_state_t *node_state = node_gres_data;
	bitstr_t *usable_cores;
	bool use_busy_dev;
	int t, c, core_cnt;

	/* No topology info, no core restriction or no per-node job GRES */
	if (!node_state->topo_cnt || !core_bitmap ||
	    !job_state->gres_per_node)
		return;

	/* We must use the ONE already active GRES of this type */
	use_busy_dev = (!use_total_gres &&
			(plugin_id == mps_plugin_id) &&
			(node_state->gres_cnt_alloc != 0));

	/*
	 * Start from the caller's bitmap with this node's cores cleared,
	 * then add back the cores of each usable topology entry
	 */
	usable_cores = bit_copy(core_bitmap);
	bit_nclear(usable_cores, core_start_bit, core_end_bit);
	for (t = 0; t < node_state->topo_cnt; t++) {
		if (node_state->topo_gres_cnt_avail[t] == 0)
			continue;	/* Entry has no GRES */
		if (!use_total_gres &&
		    (node_state->topo_gres_cnt_alloc[t] >=
		     node_state->topo_gres_cnt_avail[t]))
			continue;	/* Entry fully allocated */
		if (use_busy_dev &&
		    (node_state->topo_gres_cnt_alloc[t] == 0))
			continue;	/* Not the active device */
		if (job_state->type_name &&
		    (!node_state->topo_type_name[t] ||
		     (job_state->type_id != node_state->topo_type_id[t])))
			continue;	/* Wrong GRES type */

		if (!node_state->topo_core_bitmap[t]) {
			/* Usable from any core: no filter to apply */
			FREE_NULL_BITMAP(usable_cores);
			return;
		}

		core_cnt = core_end_bit - core_start_bit + 1;
		_validate_gres_node_cores(node_state, core_cnt, node_name);
		core_cnt = bit_size(node_state->topo_core_bitmap[t]);
		for (c = 0; c < core_cnt; c++) {
			if (!bit_test(node_state->topo_core_bitmap[t], c))
				continue;
			bit_set(usable_cores, core_start_bit + c);
		}
	}
	bit_and(core_bitmap, usable_cores);
	FREE_NULL_BITMAP(usable_cores);
}
6485 
/*
 * Test if one GRES of a job can be satisfied by a node and, when the GRES has
 * topology (core binding) information, determine which cores can be used.
 *
 * IN job_gres_data - job's GRES state (gres_job_state_t)
 * IN node_gres_data - node's GRES state (gres_node_state_t); its topology
 *		       core bitmaps may be resized by
 *		       _validate_gres_node_cores()
 * IN use_total_gres - if set, GRES already allocated are treated as
 *		       available; forced on when the node GRES is no_consume
 * IN/OUT core_bitmap - available cores (NULL if no restriction); bits in the
 *			node's range are cleared for cores not selected by the
 *			topology based test
 * IN core_start_bit - index into core_bitmap for this node's first core
 * IN core_end_bit - index into core_bitmap for this node's last core
 * IN/OUT topo_set - if set on entry, only GRES counts are checked against
 *		     core_bitmap; set to true here once core_bitmap has been
 *		     restricted based upon topology
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * IN gres_name - name of the GRES (for logging)
 * IN plugin_id - ID of the GRES plugin
 * IN disable_binding - if set, skip the topology based core selection
 * RET 0 if the node has insufficient GRES, NO_VAL if the GRES are sufficient
 *     and this test places no limit on the core count, otherwise the count of
 *     usable cores found by the topology based selection
 */
static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
			  bool use_total_gres, bitstr_t *core_bitmap,
			  int core_start_bit, int core_end_bit, bool *topo_set,
			  uint32_t job_id, char *node_name, char *gres_name,
			  uint32_t plugin_id, bool disable_binding)
{
	int i, j, core_size, core_ctld, top_inx = -1;
	uint64_t gres_avail = 0, gres_max = 0, gres_total, gres_tmp;
	uint64_t min_gres_node = 0;
	gres_job_state_t  *job_gres_ptr  = (gres_job_state_t *)  job_gres_data;
	gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
	uint32_t *cores_addnt = NULL; /* Additional cores avail from this GRES */
	uint32_t *cores_avail = NULL; /* cores initially avail from this GRES */
	uint32_t core_cnt = 0;
	bitstr_t *alloc_core_bitmap = NULL;
	bitstr_t *avail_core_bitmap = NULL;
	bool shared_gres = _shared_gres(plugin_id);
	bool use_busy_dev = false;

	if (node_gres_ptr->no_consume)
		use_total_gres = true;

	if (!use_total_gres &&
	    (plugin_id == mps_plugin_id) &&
	    (node_gres_ptr->gres_cnt_alloc != 0)) {
		/* We must use the ONE already active GRES of this type */
		use_busy_dev = true;
	}

	/* Determine minimum GRES count needed on this node */
	if (job_gres_ptr->gres_per_job)
		min_gres_node = 1;
	min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_node);
	min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_socket);
	min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_task);

	/*
	 * Four cases follow:
	 * 1. topology info and cores already restricted (*topo_set): count GRES
	 * 2. topology info and binding enabled: select cores
	 * 3. no usable topology, job requests a GRES type: test per-type counts
	 * 4. otherwise: test the node's total GRES count
	 */
	if (min_gres_node && node_gres_ptr->topo_cnt && *topo_set) {
		/*
		 * Need to determine how many GRES available for these
		 * specific cores
		 */
		if (core_bitmap) {
			core_ctld = core_end_bit - core_start_bit + 1;
			if (core_ctld < 1) {
				error("gres/%s: job %u cores on node %s < 1",
				      gres_name, job_id, node_name);
				return (uint32_t) 0;
			}
			_validate_gres_node_cores(node_gres_ptr, core_ctld,
						  node_name);
		}
		for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
			if (job_gres_ptr->type_name &&
			    (!node_gres_ptr->topo_type_name[i] ||
			     (node_gres_ptr->topo_type_id[i] !=
			      job_gres_ptr->type_id)))
				continue;
			if (use_busy_dev &&
			    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
				continue;
			if (!node_gres_ptr->topo_core_bitmap[i]) {
				/* No core binding: GRES usable from any core */
				gres_avail += node_gres_ptr->
					      topo_gres_cnt_avail[i];
				if (!use_total_gres) {
					gres_avail -= node_gres_ptr->
						      topo_gres_cnt_alloc[i];
				}
				if (shared_gres)
					gres_max = MAX(gres_max, gres_avail);
				continue;
			}
			core_ctld = bit_size(node_gres_ptr->
					     topo_core_bitmap[i]);
			/*
			 * Count this entry's GRES once if any of its cores
			 * is available to the job
			 */
			for (j = 0; j < core_ctld; j++) {
				if (core_bitmap &&
				    !bit_test(core_bitmap, core_start_bit + j))
					continue;
				if (!bit_test(node_gres_ptr->
					      topo_core_bitmap[i], j))
					continue; /* not avail for this gres */
				gres_avail += node_gres_ptr->
					      topo_gres_cnt_avail[i];
				if (!use_total_gres) {
					gres_avail -= node_gres_ptr->
						      topo_gres_cnt_alloc[i];
				}
				if (shared_gres)
					gres_max = MAX(gres_max, gres_avail);
				break;
			}
		}
		/* For shared GRES use the maximum seen rather than the sum */
		if (shared_gres)
			gres_avail = gres_max;
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */
		return NO_VAL;
	} else if (min_gres_node && node_gres_ptr->topo_cnt &&
		   !disable_binding) {
		/* Need to determine which specific cores can be used */
		gres_avail = node_gres_ptr->gres_cnt_avail;
		if (!use_total_gres)
			gres_avail -= node_gres_ptr->gres_cnt_alloc;
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */

		core_ctld = core_end_bit - core_start_bit + 1;
		if (core_bitmap) {
			if (core_ctld < 1) {
				error("gres/%s: job %u cores on node %s < 1",
				      gres_name, job_id, node_name);
				return (uint32_t) 0;
			}
			_validate_gres_node_cores(node_gres_ptr, core_ctld,
						  node_name);
		} else {
			/*
			 * No core restriction: take the core count from the
			 * first topology entry with a core bitmap
			 */
			for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
				if (!node_gres_ptr->topo_core_bitmap[i])
					continue;
				core_ctld = bit_size(node_gres_ptr->
						     topo_core_bitmap[i]);
				break;
			}
		}

		/* Node-local (zero origin) copy of the cores available */
		alloc_core_bitmap = bit_alloc(core_ctld);
		if (core_bitmap) {
			for (j = 0; j < core_ctld; j++) {
				if (bit_test(core_bitmap, core_start_bit + j))
					bit_set(alloc_core_bitmap, j);
			}
		} else {
			bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
		}

		avail_core_bitmap = bit_copy(alloc_core_bitmap);
		cores_addnt = xcalloc(node_gres_ptr->topo_cnt,
				      sizeof(uint32_t));
		cores_avail = xcalloc(node_gres_ptr->topo_cnt,
				      sizeof(uint32_t));
		/* Count available cores for each usable topology entry */
		for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
			if (node_gres_ptr->topo_gres_cnt_avail[i] == 0)
				continue;
			if (use_busy_dev &&
			    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
				continue;
			if (!use_total_gres &&
			    (node_gres_ptr->topo_gres_cnt_alloc[i] >=
			     node_gres_ptr->topo_gres_cnt_avail[i]))
				continue;
			if (job_gres_ptr->type_name &&
			    (!node_gres_ptr->topo_type_name[i] ||
			     (node_gres_ptr->topo_type_id[i] !=
			      job_gres_ptr->type_id)))
				continue;
			if (!node_gres_ptr->topo_core_bitmap[i]) {
				cores_avail[i] = core_end_bit -
						 core_start_bit + 1;
				continue;
			}
			core_size = bit_size(node_gres_ptr->topo_core_bitmap[i]);
			for (j = 0; j < core_size; j++) {
				if (core_bitmap &&
				    !bit_test(core_bitmap, core_start_bit + j))
					continue;
				if (bit_test(node_gres_ptr->
					     topo_core_bitmap[i], j)) {
					cores_avail[i]++;
				}
			}
		}

		/* Pick the topology entries with the most cores available */
		gres_avail = 0;
		gres_total = 0;
		while (gres_avail < min_gres_node) {
			top_inx = -1;
			for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
				if ((gres_avail == 0) || (cores_avail[j] == 0) ||
				    !node_gres_ptr->topo_core_bitmap[j]) {
					cores_addnt[j] = cores_avail[j];
				} else {
					cores_addnt[j] = cores_avail[j] -
						bit_overlap(alloc_core_bitmap,
							    node_gres_ptr->
							    topo_core_bitmap[j]);
				}

				if (top_inx == -1) {
					if (cores_avail[j])
						top_inx = j;
				} else if (cores_addnt[j] > cores_addnt[top_inx])
					top_inx = j;
			}
			if ((top_inx < 0) || (cores_avail[top_inx] == 0)) {
				/* No more topology entries to select from */
				if (gres_total < min_gres_node)
					core_cnt = 0;
				break;
			}
			cores_avail[top_inx] = 0;	/* Flag as used */
			gres_tmp = node_gres_ptr->topo_gres_cnt_avail[top_inx];
			if (!use_total_gres &&
			    (gres_tmp >=
			     node_gres_ptr->topo_gres_cnt_alloc[top_inx])) {
				gres_tmp -= node_gres_ptr->
					    topo_gres_cnt_alloc[top_inx];
			} else if (!use_total_gres) {
				gres_tmp = 0;
			}
			if (gres_tmp == 0) {
				error("gres/%s: topology allocation error on node %s",
				      gres_name, node_name);
				break;
			}
			/* update counts of allocated cores and GRES */
			if (shared_gres) {
				/*
				 * Process outside of loop after specific
				 * device selected
				 */
			} else if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
				bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
			} else if (gres_avail) {
				bit_or(alloc_core_bitmap,
				       node_gres_ptr->
				       topo_core_bitmap[top_inx]);
				if (core_bitmap)
					bit_and(alloc_core_bitmap,
						avail_core_bitmap);
			} else {
				/* First entry picked: restrict to its cores */
				bit_and(alloc_core_bitmap,
					node_gres_ptr->
					topo_core_bitmap[top_inx]);
			}
			if (shared_gres) {
				gres_total = MAX(gres_total, gres_tmp);
				gres_avail = gres_total;
			} else {
				/*
				 * Available GRES count is up to gres_tmp,
				 * but take 1 per loop to maximize available
				 * core count
				 */
				gres_avail += 1;
				gres_total += gres_tmp;
				core_cnt = bit_set_count(alloc_core_bitmap);
			}
		}
		/* Shared GRES: add the cores of the last entry selected */
		if (shared_gres && (top_inx >= 0) &&
		    (gres_avail >= min_gres_node)) {
			if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
				bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
			} else {
				bit_or(alloc_core_bitmap,
				       node_gres_ptr->
				       topo_core_bitmap[top_inx]);
				if (core_bitmap)
					bit_and(alloc_core_bitmap,
						avail_core_bitmap);
			}
			core_cnt = bit_set_count(alloc_core_bitmap);
		}
		/* Remove the cores not selected from the caller's bitmap */
		if (core_bitmap && (core_cnt > 0)) {
			*topo_set = true;
			for (i = 0; i < core_ctld; i++) {
				if (!bit_test(alloc_core_bitmap, i)) {
					bit_clear(core_bitmap,
						  core_start_bit + i);
				}
			}
		}
		FREE_NULL_BITMAP(alloc_core_bitmap);
		FREE_NULL_BITMAP(avail_core_bitmap);
		xfree(cores_addnt);
		xfree(cores_avail);
		return core_cnt;
	} else if (job_gres_ptr->type_name) {
		/* Find the node's record for the GRES type requested */
		for (i = 0; i < node_gres_ptr->type_cnt; i++) {
			if (node_gres_ptr->type_name[i] &&
			    (node_gres_ptr->type_id[i] ==
			     job_gres_ptr->type_id))
				break;
		}
		if (i >= node_gres_ptr->type_cnt)
			return (uint32_t) 0;	/* no such type */
		gres_avail = node_gres_ptr->type_cnt_avail[i];
		if (!use_total_gres)
			gres_avail -= node_gres_ptr->type_cnt_alloc[i];
		gres_tmp = node_gres_ptr->gres_cnt_avail;
		if (!use_total_gres)
			gres_tmp -= node_gres_ptr->gres_cnt_alloc;
		/* Limited by both the per-type and the total counts */
		gres_avail = MIN(gres_avail, gres_tmp);
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */
		return NO_VAL;
	} else {
		gres_avail = node_gres_ptr->gres_cnt_avail;
		if (!use_total_gres)
			gres_avail -= node_gres_ptr->gres_cnt_alloc;
		if (min_gres_node > gres_avail)
			return (uint32_t) 0;	/* insufficient GRES avail */
		return NO_VAL;
	}
}
6789 
6790 /*
6791  * Clear the core_bitmap for cores which are not usable by this job (i.e. for
6792  *	cores which are already bound to other jobs or lack GRES)
6793  * IN job_gres_list   - job's gres_list built by gres_plugin_job_state_validate()
6794  * IN node_gres_list  - node's gres_list built by
6795  *                      gres_plugin_node_config_validate()
6796  * IN use_total_gres  - if set then consider all GRES resources as available,
6797  *		        and none are commited to running jobs
6798  * IN/OUT core_bitmap - Identification of available cores (NULL if no restriction)
6799  * IN core_start_bit  - index into core_bitmap for this node's first cores
6800  * IN core_end_bit    - index into core_bitmap for this node's last cores
6801  */
gres_plugin_job_core_filter(List job_gres_list,List node_gres_list,bool use_total_gres,bitstr_t * core_bitmap,int core_start_bit,int core_end_bit,char * node_name)6802 extern void gres_plugin_job_core_filter(List job_gres_list, List node_gres_list,
6803 					bool use_total_gres,
6804 					bitstr_t *core_bitmap,
6805 					int core_start_bit, int core_end_bit,
6806 					char *node_name)
6807 {
6808 	int i;
6809 	ListIterator  job_gres_iter;
6810 	gres_state_t *job_gres_ptr, *node_gres_ptr;
6811 
6812 	if ((job_gres_list == NULL) || (core_bitmap == NULL))
6813 		return;
6814 	if (node_gres_list == NULL) {
6815 		bit_nclear(core_bitmap, core_start_bit, core_end_bit);
6816 		return;
6817 	}
6818 
6819 	(void) gres_plugin_init();
6820 
6821 	slurm_mutex_lock(&gres_context_lock);
6822 	job_gres_iter = list_iterator_create(job_gres_list);
6823 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
6824 		node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
6825 		                                &job_gres_ptr->plugin_id);
6826 		if (node_gres_ptr == NULL) {
6827 			/* node lack resources required by the job */
6828 			bit_nclear(core_bitmap, core_start_bit, core_end_bit);
6829 			break;
6830 		}
6831 
6832 		for (i = 0; i < gres_context_cnt; i++) {
6833 			if (job_gres_ptr->plugin_id !=
6834 			    gres_context[i].plugin_id)
6835 				continue;
6836 			_job_core_filter(job_gres_ptr->gres_data,
6837 					 node_gres_ptr->gres_data,
6838 					 use_total_gres, core_bitmap,
6839 					 core_start_bit, core_end_bit,
6840 					 gres_context[i].gres_name, node_name,
6841 					 job_gres_ptr->plugin_id);
6842 			break;
6843 		}
6844 	}
6845 	list_iterator_destroy(job_gres_iter);
6846 	slurm_mutex_unlock(&gres_context_lock);
6847 
6848 	return;
6849 }
6850 
6851 /*
6852  * Determine how many cores on the node can be used by this job
6853  * IN job_gres_list  - job's gres_list built by gres_plugin_job_state_validate()
6854  * IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate()
6855  * IN use_total_gres - if set then consider all gres resources as available,
6856  *		       and none are commited to running jobs
6857  * IN core_bitmap    - Identification of available cores (NULL if no restriction)
6858  * IN core_start_bit - index into core_bitmap for this node's first core
6859  * IN core_end_bit   - index into core_bitmap for this node's last core
6860  * IN job_id         - job's ID (for logging)
6861  * IN node_name      - name of the node (for logging)
6862  * IN disable binding- --gres-flags=disable-binding
6863  * RET: NO_VAL    - All cores on node are available
6864  *      otherwise - Count of available cores
6865  */
gres_plugin_job_test(List job_gres_list,List node_gres_list,bool use_total_gres,bitstr_t * core_bitmap,int core_start_bit,int core_end_bit,uint32_t job_id,char * node_name,bool disable_binding)6866 extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list,
6867 				     bool use_total_gres, bitstr_t *core_bitmap,
6868 				     int core_start_bit, int core_end_bit,
6869 				     uint32_t job_id, char *node_name,
6870 				     bool disable_binding)
6871 {
6872 	int i;
6873 	uint32_t core_cnt, tmp_cnt;
6874 	ListIterator job_gres_iter;
6875 	gres_state_t *job_gres_ptr, *node_gres_ptr;
6876 	bool topo_set = false;
6877 
6878 	if (job_gres_list == NULL)
6879 		return NO_VAL;
6880 	if (node_gres_list == NULL)
6881 		return 0;
6882 
6883 	core_cnt = NO_VAL;
6884 	(void) gres_plugin_init();
6885 
6886 	slurm_mutex_lock(&gres_context_lock);
6887 	job_gres_iter = list_iterator_create(job_gres_list);
6888 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
6889 		node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
6890 		                                &job_gres_ptr->plugin_id);
6891 		if (node_gres_ptr == NULL) {
6892 			/* node lack resources required by the job */
6893 			core_cnt = 0;
6894 			break;
6895 		}
6896 
6897 		for (i = 0; i < gres_context_cnt; i++) {
6898 			if (job_gres_ptr->plugin_id !=
6899 			    gres_context[i].plugin_id)
6900 				continue;
6901 			tmp_cnt = _job_test(job_gres_ptr->gres_data,
6902 					    node_gres_ptr->gres_data,
6903 					    use_total_gres, core_bitmap,
6904 					    core_start_bit, core_end_bit,
6905 					    &topo_set, job_id, node_name,
6906 					    gres_context[i].gres_name,
6907 					    gres_context[i].plugin_id,
6908 					    disable_binding);
6909 			if (tmp_cnt != NO_VAL) {
6910 				if (core_cnt == NO_VAL)
6911 					core_cnt = tmp_cnt;
6912 				else
6913 					core_cnt = MIN(tmp_cnt, core_cnt);
6914 			}
6915 			break;
6916 		}
6917 		if (core_cnt == 0)
6918 			break;
6919 	}
6920 	list_iterator_destroy(job_gres_iter);
6921 	slurm_mutex_unlock(&gres_context_lock);
6922 
6923 	return core_cnt;
6924 }
6925 
_sock_gres_del(void * x)6926 static void _sock_gres_del(void *x)
6927 {
6928 	sock_gres_t *sock_gres = (sock_gres_t *) x;
6929 	int s;
6930 
6931 	if (sock_gres) {
6932 		FREE_NULL_BITMAP(sock_gres->bits_any_sock);
6933 		if (sock_gres->bits_by_sock) {
6934 			for (s = 0; s < sock_gres->sock_cnt; s++)
6935 				FREE_NULL_BITMAP(sock_gres->bits_by_sock[s]);
6936 			xfree(sock_gres->bits_by_sock);
6937 		}
6938 		xfree(sock_gres->cnt_by_sock);
6939 		xfree(sock_gres->gres_name);
6940 		/* NOTE: sock_gres->job_specs is just a pointer, do not free */
6941 		xfree(sock_gres->type_name);
6942 		xfree(sock_gres);
6943 	}
6944 }
6945 
6946 /*
6947  * Build a string containing the GRES details for a given node and socket
6948  * sock_gres_list IN - List of sock_gres_t entries
6949  * sock_inx IN - zero-origin socket for which information is to be returned
6950  *		 if value < 0, then report GRES unconstrained by core
6951  * RET string, must call xfree() to release memory
6952  */
gres_plugin_sock_str(List sock_gres_list,int sock_inx)6953 extern char *gres_plugin_sock_str(List sock_gres_list, int sock_inx)
6954 {
6955 	ListIterator iter;
6956 	sock_gres_t *sock_gres;
6957 	char *gres_str = NULL, *sep = "";
6958 
6959 	if (!sock_gres_list)
6960 		return NULL;
6961 
6962 	iter = list_iterator_create(sock_gres_list);
6963 	while ((sock_gres = (sock_gres_t *) list_next(iter))) {
6964 		if (sock_inx < 0) {
6965 			if (sock_gres->cnt_any_sock) {
6966 				if (sock_gres->type_name) {
6967 					xstrfmtcat(gres_str, "%s%s:%s:%"PRIu64,
6968 						   sep, sock_gres->gres_name,
6969 						   sock_gres->type_name,
6970 						   sock_gres->cnt_any_sock);
6971 				} else {
6972 					xstrfmtcat(gres_str, "%s%s:%"PRIu64,
6973 						   sep, sock_gres->gres_name,
6974 						   sock_gres->cnt_any_sock);
6975 				}
6976 				sep = " ";
6977 			}
6978 			continue;
6979 		}
6980 		if (!sock_gres->cnt_by_sock ||
6981 		    (sock_gres->cnt_by_sock[sock_inx] == 0))
6982 			continue;
6983 		if (sock_gres->type_name) {
6984 			xstrfmtcat(gres_str, "%s%s:%s:%"PRIu64, sep,
6985 				   sock_gres->gres_name, sock_gres->type_name,
6986 				   sock_gres->cnt_by_sock[sock_inx]);
6987 		} else {
6988 			xstrfmtcat(gres_str, "%s%s:%"PRIu64, sep,
6989 				   sock_gres->gres_name,
6990 				   sock_gres->cnt_by_sock[sock_inx]);
6991 		}
6992 		sep = " ";
6993 	}
6994 	list_iterator_destroy(iter);
6995 	return gres_str;
6996 }
6997 
6998 /*
6999  * Determine how many GRES of a given type can be used by this job on a
7000  * given node and return a structure with the details. Note that multiple
7001  * GRES of a given type model can be distributed over multiple topo structures,
7002  * so we need to OR the core_bitmap over all of them.
7003  */
_build_sock_gres_by_topo(gres_job_state_t * job_gres_ptr,gres_node_state_t * node_gres_ptr,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name,bool enforce_binding,uint32_t s_p_n,bitstr_t ** req_sock_map,uint32_t main_plugin_id,uint32_t alt_plugin_id,gres_node_state_t * alt_node_gres_ptr,uint32_t user_id,const uint32_t node_inx)7004 static sock_gres_t *_build_sock_gres_by_topo(gres_job_state_t *job_gres_ptr,
7005 				gres_node_state_t *node_gres_ptr,
7006 				bool use_total_gres, bitstr_t *core_bitmap,
7007 				uint16_t sockets, uint16_t cores_per_sock,
7008 				uint32_t job_id, char *node_name,
7009 				bool enforce_binding, uint32_t s_p_n,
7010 				bitstr_t **req_sock_map,
7011 				uint32_t main_plugin_id, uint32_t alt_plugin_id,
7012 				gres_node_state_t *alt_node_gres_ptr,
7013 				uint32_t user_id, const uint32_t node_inx)
7014 {
7015 	int i, j, s, c, tot_cores;
7016 	sock_gres_t *sock_gres;
7017 	int64_t add_gres;
7018 	uint64_t avail_gres, min_gres = 1;
7019 	bool match = false;
7020 	bool use_busy_dev = false;
7021 
7022 	if (node_gres_ptr->gres_cnt_avail == 0)
7023 		return NULL;
7024 
7025 	if (!use_total_gres &&
7026 	    (main_plugin_id == mps_plugin_id) &&
7027 	    (node_gres_ptr->gres_cnt_alloc != 0)) {
7028 		/* We must use the ONE already active GRES of this type */
7029 		use_busy_dev = true;
7030 	}
7031 
7032 	sock_gres = xmalloc(sizeof(sock_gres_t));
7033 	sock_gres->sock_cnt = sockets;
7034 	sock_gres->bits_by_sock = xcalloc(sockets, sizeof(bitstr_t *));
7035 	sock_gres->cnt_by_sock = xcalloc(sockets, sizeof(uint64_t));
7036 	for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
7037 		bool use_all_sockets = false;
7038 		if (job_gres_ptr->type_name &&
7039 		    (job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i]))
7040 			continue;	/* Wrong type_model */
7041 		if (use_busy_dev &&
7042 		    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
7043 			continue;
7044 		if (!use_total_gres && !node_gres_ptr->no_consume &&
7045 		    (node_gres_ptr->topo_gres_cnt_alloc[i] >=
7046 		     node_gres_ptr->topo_gres_cnt_avail[i])) {
7047 			continue;	/* No GRES remaining */
7048 		}
7049 
7050 		if (!use_total_gres && !node_gres_ptr->no_consume) {
7051 			avail_gres = node_gres_ptr->topo_gres_cnt_avail[i] -
7052 				     node_gres_ptr->topo_gres_cnt_alloc[i];
7053 		} else {
7054 			avail_gres = node_gres_ptr->topo_gres_cnt_avail[i];
7055 		}
7056 		if (avail_gres == 0)
7057 			continue;
7058 
7059 		/*
7060 		 * Job requested GPUs or MPS. Filter out resources already
7061 		 * allocated to the other GRES type.
7062 		 */
7063 		if (alt_node_gres_ptr && alt_node_gres_ptr->gres_bit_alloc &&
7064 		    node_gres_ptr->topo_gres_bitmap[i]) {
7065 			c = bit_overlap(node_gres_ptr->topo_gres_bitmap[i],
7066 					alt_node_gres_ptr->gres_bit_alloc);
7067 			if ((alt_plugin_id == gpu_plugin_id) && (c > 0))
7068 				continue;
7069 			if ((alt_plugin_id == mps_plugin_id) && (c > 0)) {
7070 				avail_gres -= c;
7071 				if (avail_gres == 0)
7072 					continue;
7073 			}
7074 		}
7075 
7076 		/* gres/mps can only use one GPU per node */
7077 		if ((main_plugin_id == mps_plugin_id) &&
7078 		    (avail_gres > sock_gres->max_node_gres))
7079 			sock_gres->max_node_gres = avail_gres;
7080 
7081 		/*
7082 		 * If some GRES is available on every socket,
7083 		 * treat like no topo_core_bitmap is specified
7084 		 */
7085 		tot_cores = sockets * cores_per_sock;
7086 		if (node_gres_ptr->topo_core_bitmap &&
7087 		    node_gres_ptr->topo_core_bitmap[i]) {
7088 			use_all_sockets = true;
7089 			for (s = 0; s < sockets; s++) {
7090 				bool use_this_socket = false;
7091 				for (c = 0; c < cores_per_sock; c++) {
7092 					j = (s * cores_per_sock) + c;
7093 					if (bit_test(node_gres_ptr->
7094 						     topo_core_bitmap[i], j)) {
7095 						use_this_socket = true;
7096 						break;
7097 					}
7098 				}
7099 				if (!use_this_socket) {
7100 					use_all_sockets = false;
7101 					break;
7102 				}
7103 			}
7104 		}
7105 
7106 		if (!node_gres_ptr->topo_core_bitmap ||
7107 		    !node_gres_ptr->topo_core_bitmap[i] ||
7108 		    use_all_sockets) {
7109 			/*
7110 			 * Not constrained by core, but only specific
7111 			 * GRES may be available (save their bitmap)
7112 			 */
7113 			sock_gres->cnt_any_sock += avail_gres;
7114 			sock_gres->total_cnt += avail_gres;
7115 			if (!sock_gres->bits_any_sock) {
7116 				sock_gres->bits_any_sock =
7117 					bit_copy(node_gres_ptr->
7118 						 topo_gres_bitmap[i]);
7119 			} else {
7120 				bit_or(sock_gres->bits_any_sock,
7121 				       node_gres_ptr->topo_gres_bitmap[i]);
7122 			}
7123 			match = true;
7124 			continue;
7125 		}
7126 
7127 		/* Constrained by core */
7128 		if (core_bitmap)
7129 			tot_cores = MIN(tot_cores, bit_size(core_bitmap));
7130 		if (node_gres_ptr->topo_core_bitmap[i]) {
7131 			tot_cores = MIN(tot_cores,
7132 					bit_size(node_gres_ptr->
7133 						 topo_core_bitmap[i]));
7134 		}
7135 		for (s = 0; ((s < sockets) && avail_gres); s++) {
7136 			if (enforce_binding && core_bitmap) {
7137 				for (c = 0; c < cores_per_sock; c++) {
7138 					j = (s * cores_per_sock) + c;
7139 					if (bit_test(core_bitmap, j))
7140 						break;
7141 				}
7142 				if (c >= cores_per_sock) {
7143 					/* No available cores on this socket */
7144 					continue;
7145 				}
7146 			}
7147 			for (c = 0; c < cores_per_sock; c++) {
7148 				j = (s * cores_per_sock) + c;
7149 				if (j >= tot_cores)
7150 					break;	/* Off end of core bitmap */
7151 				if (node_gres_ptr->topo_core_bitmap[i] &&
7152 				    !bit_test(node_gres_ptr->topo_core_bitmap[i],
7153 					      j))
7154 					continue;
7155 				if (!node_gres_ptr->topo_gres_bitmap[i]) {
7156 					error("%s: topo_gres_bitmap NULL on node %s",
7157 					      __func__, node_name);
7158 					continue;
7159 				}
7160 				if (!sock_gres->bits_by_sock[s]) {
7161 					sock_gres->bits_by_sock[s] =
7162 						bit_copy(node_gres_ptr->
7163 							 topo_gres_bitmap[i]);
7164 				} else {
7165 					bit_or(sock_gres->bits_by_sock[s],
7166 					       node_gres_ptr->topo_gres_bitmap[i]);
7167 				}
7168 				sock_gres->cnt_by_sock[s] += avail_gres;
7169 				sock_gres->total_cnt += avail_gres;
7170 				avail_gres = 0;
7171 				match = true;
7172 				break;
7173 			}
7174 		}
7175 	}
7176 
7177 	/* Process per-GRES limits */
7178 	if (match && job_gres_ptr->gres_per_socket) {
7179 		/*
7180 		 * Clear core bitmap on sockets with insufficient GRES
7181 		 * and disable excess GRES per socket
7182 		 */
7183 		for (s = 0; s < sockets; s++) {
7184 			if (sock_gres->cnt_by_sock[s] <
7185 			    job_gres_ptr->gres_per_socket) {
7186 				/* Insufficient GRES, clear count */
7187 				sock_gres->total_cnt -=
7188 					sock_gres->cnt_by_sock[s];
7189 				sock_gres->cnt_by_sock[s] = 0;
7190 				if (enforce_binding && core_bitmap) {
7191 					i = s * cores_per_sock;
7192 					bit_nclear(core_bitmap, i,
7193 						   i + cores_per_sock - 1);
7194 				}
7195 			} else if (sock_gres->cnt_by_sock[s] >
7196 				   job_gres_ptr->gres_per_socket) {
7197 				/* Excess GRES, reduce count */
7198 				i = sock_gres->cnt_by_sock[s] -
7199 				    job_gres_ptr->gres_per_socket;
7200 				sock_gres->cnt_by_sock[s] =
7201 					job_gres_ptr->gres_per_socket;
7202 				sock_gres->total_cnt -= i;
7203 			}
7204 		}
7205 	}
7206 
7207 	/*
7208 	 * Satisfy sockets-per-node (s_p_n) limit by selecting the sockets with
7209 	 * the most GRES. Sockets with low GRES counts have their core_bitmap
7210 	 * cleared so that _allocate_sc() in cons_tres/job_test.c does not
7211 	 * remove sockets needed to satisfy the job's GRES specification.
7212 	 */
7213 	if (match && enforce_binding && core_bitmap && (s_p_n < sockets)) {
7214 		int avail_sock = 0;
7215 		bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
7216 		for (s = 0; s < sockets; s++) {
7217 			if (sock_gres->cnt_by_sock[s] == 0)
7218 				continue;
7219 			for (c = 0; c < cores_per_sock; c++) {
7220 				i = (s * cores_per_sock) + c;
7221 				if (!bit_test(core_bitmap, i))
7222 					continue;
7223 				avail_sock++;
7224 				avail_sock_flag[s] = true;
7225 				break;
7226 			}
7227 		}
7228 		while (avail_sock > s_p_n) {
7229 			int low_gres_sock_inx = -1;
7230 			for (s = 0; s < sockets; s++) {
7231 				if (!avail_sock_flag[s])
7232 					continue;
7233 				if ((low_gres_sock_inx == -1) ||
7234 				    (sock_gres->cnt_by_sock[s] <
7235 				     sock_gres->cnt_by_sock[low_gres_sock_inx]))
7236 					low_gres_sock_inx = s;
7237 			}
7238 			if (low_gres_sock_inx == -1)
7239 				break;
7240 			s = low_gres_sock_inx;
7241 			i = s * cores_per_sock;
7242 			bit_nclear(core_bitmap, i, i + cores_per_sock - 1);
7243 			sock_gres->total_cnt -= sock_gres->cnt_by_sock[s];
7244 			sock_gres->cnt_by_sock[s] = 0;
7245 			avail_sock--;
7246 			avail_sock_flag[s] = false;
7247 		}
7248 		xfree(avail_sock_flag);
7249 	}
7250 
7251 	if (match) {
7252 		if (job_gres_ptr->gres_per_node)
7253 			min_gres = job_gres_ptr->gres_per_node;
7254 		if (job_gres_ptr->gres_per_task)
7255 			min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
7256 		if (sock_gres->total_cnt < min_gres)
7257 			match = false;
7258 	}
7259 
7260 
7261 	/*
7262 	 * If sockets-per-node (s_p_n) not specified then identify sockets
7263 	 * which are required to satisfy gres_per_node or task specification
7264 	 * so that allocated tasks can be distributed over multiple sockets
7265 	 * if necessary.
7266 	 */
7267 	add_gres = min_gres - sock_gres->cnt_any_sock;
7268 	if (match && core_bitmap && (s_p_n == NO_VAL) && (add_gres > 0) &&
7269 	    job_gres_ptr->gres_per_node) {
7270 		int avail_sock = 0, best_sock_inx = -1;
7271 		bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
7272 		for (s = 0; s < sockets; s++) {
7273 			if (sock_gres->cnt_by_sock[s] == 0)
7274 				continue;
7275 			for (c = 0; c < cores_per_sock; c++) {
7276 				i = (s * cores_per_sock) + c;
7277 				if (!bit_test(core_bitmap, i))
7278 					continue;
7279 				avail_sock++;
7280 				avail_sock_flag[s] = true;
7281 				if ((best_sock_inx == -1) ||
7282 				    (sock_gres->cnt_by_sock[s] >
7283 				     sock_gres->cnt_by_sock[best_sock_inx])) {
7284 					best_sock_inx = s;
7285 				}
7286 				break;
7287 			}
7288 		}
7289 		while ((best_sock_inx != -1) && (add_gres > 0)) {
7290 			if (*req_sock_map == NULL)
7291 				*req_sock_map = bit_alloc(sockets);
7292 			bit_set(*req_sock_map, best_sock_inx);
7293 			add_gres -= sock_gres->cnt_by_sock[best_sock_inx];
7294 			avail_sock_flag[best_sock_inx] = false;
7295 			if (add_gres <= 0)
7296 				break;
7297 			/* Find next best socket */
7298 			best_sock_inx = -1;
7299 			for (s = 0; s < sockets; s++) {
7300 				if ((sock_gres->cnt_by_sock[s] == 0) ||
7301 				    !avail_sock_flag[s])
7302 					continue;
7303 				if ((best_sock_inx == -1) ||
7304 				    (sock_gres->cnt_by_sock[s] >
7305 				     sock_gres->cnt_by_sock[best_sock_inx])) {
7306 					best_sock_inx = s;
7307 				}
7308 			}
7309 		}
7310 		xfree(avail_sock_flag);
7311 	}
7312 
7313 	if (match) {
7314 		sock_gres->type_id = job_gres_ptr->type_id;
7315 		sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
7316 	} else {
7317 		_sock_gres_del(sock_gres);
7318 		sock_gres = NULL;
7319 	}
7320 	return sock_gres;
7321 }
7322 
7323 /*
7324  * Determine how many GRES of a given type can be used by this job on a
7325  * given node and return a structure with the details. Note that multiple
7326  * GRES of a given type model can be configured, so pick the right one.
7327  */
_build_sock_gres_by_type(gres_job_state_t * job_gres_ptr,gres_node_state_t * node_gres_ptr,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name)7328 static sock_gres_t *_build_sock_gres_by_type(gres_job_state_t *job_gres_ptr,
7329 				gres_node_state_t *node_gres_ptr,
7330 				bool use_total_gres, bitstr_t *core_bitmap,
7331 				uint16_t sockets, uint16_t cores_per_sock,
7332 				uint32_t job_id, char *node_name)
7333 {
7334 	int i;
7335 	sock_gres_t *sock_gres;
7336 	uint64_t avail_gres, min_gres = 1, gres_tmp;
7337 	bool match = false;
7338 
7339 	if (job_gres_ptr->gres_per_node)
7340 		min_gres = job_gres_ptr-> gres_per_node;
7341 	if (job_gres_ptr->gres_per_socket)
7342 		min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
7343 	if (job_gres_ptr->gres_per_task)
7344 		min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
7345 	sock_gres = xmalloc(sizeof(sock_gres_t));
7346 	for (i = 0; i < node_gres_ptr->type_cnt; i++) {
7347 		if (job_gres_ptr->type_name &&
7348 		    (job_gres_ptr->type_id != node_gres_ptr->type_id[i]))
7349 			continue;	/* Wrong type_model */
7350 		if (!use_total_gres &&
7351 		    (node_gres_ptr->type_cnt_alloc[i] >=
7352 		     node_gres_ptr->type_cnt_avail[i])) {
7353 			continue;	/* No GRES remaining */
7354 		} else if (!use_total_gres) {
7355 			avail_gres = node_gres_ptr->type_cnt_avail[i] -
7356 				     node_gres_ptr->type_cnt_alloc[i];
7357 		} else {
7358 			avail_gres = node_gres_ptr->type_cnt_avail[i];
7359 		}
7360 		gres_tmp = node_gres_ptr->gres_cnt_avail;
7361 		if (!use_total_gres)
7362 			gres_tmp -= node_gres_ptr->gres_cnt_alloc;
7363 		avail_gres = MIN(avail_gres, gres_tmp);
7364 		if (avail_gres < min_gres)
7365 			continue;	/* Insufficient GRES remaining */
7366 		sock_gres->cnt_any_sock += avail_gres;
7367 		sock_gres->total_cnt += avail_gres;
7368 		match = true;
7369 	}
7370 	if (match) {
7371 		sock_gres->type_id = job_gres_ptr->type_id;
7372 		sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
7373 	} else
7374 		xfree(sock_gres);
7375 
7376 	return sock_gres;
7377 }
7378 
7379 /*
7380  * Determine how many GRES of a given type can be used by this job on a
7381  * given node and return a structure with the details. No GRES type.
7382  */
_build_sock_gres_basic(gres_job_state_t * job_gres_ptr,gres_node_state_t * node_gres_ptr,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name)7383 static sock_gres_t *_build_sock_gres_basic(gres_job_state_t *job_gres_ptr,
7384 				gres_node_state_t *node_gres_ptr,
7385 				bool use_total_gres, bitstr_t *core_bitmap,
7386 				uint16_t sockets, uint16_t cores_per_sock,
7387 				uint32_t job_id, char *node_name)
7388 {
7389 	sock_gres_t *sock_gres;
7390 	uint64_t avail_gres, min_gres = 1;
7391 
7392 	if (job_gres_ptr->type_name)
7393 		return NULL;
7394 	if (!use_total_gres &&
7395 	    (node_gres_ptr->gres_cnt_alloc >= node_gres_ptr->gres_cnt_avail))
7396 		return NULL;	/* No GRES remaining */
7397 
7398 	if (job_gres_ptr->gres_per_node)
7399 		min_gres = job_gres_ptr-> gres_per_node;
7400 	if (job_gres_ptr->gres_per_socket)
7401 		min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
7402 	if (job_gres_ptr->gres_per_task)
7403 		min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
7404 	if (!use_total_gres) {
7405 		avail_gres = node_gres_ptr->gres_cnt_avail -
7406 			     node_gres_ptr->gres_cnt_alloc;
7407 	} else
7408 		avail_gres = node_gres_ptr->gres_cnt_avail;
7409 	if (avail_gres < min_gres)
7410 		return NULL;	/* Insufficient GRES remaining */
7411 
7412 	sock_gres = xmalloc(sizeof(sock_gres_t));
7413 	sock_gres->cnt_any_sock += avail_gres;
7414 	sock_gres->total_cnt += avail_gres;
7415 
7416 	return sock_gres;
7417 }
7418 
/*
 * Log the contents of a sock_gres_t list at info level.
 * IN sock_gres_list - list of sock_gres_t entries; nothing is logged if NULL
 * IN node_name      - name of the node the list describes
 *
 * For each entry the totals are logged, then the "any socket" count and
 * bitmap, then one line per socket having a non-zero GRES count. Where no
 * bitmap exists the bits are logged as an empty string and the size as -1.
 */
static void _sock_gres_log(List sock_gres_list, char *node_name)
{
	sock_gres_t *sock_gres;
	ListIterator iter;
	int i, len;
	char tmp[32];

	if (!sock_gres_list)
		return;

	info("Sock_gres state for %s", node_name);
	iter = list_iterator_create(sock_gres_list);
	while ((sock_gres = (sock_gres_t *) list_next(iter))) {
		info("Gres:%s Type:%s TotalCnt:%"PRIu64" MaxNodeGres:%"PRIu64,
		     sock_gres->gres_name, sock_gres->type_name,
		     sock_gres->total_cnt, sock_gres->max_node_gres);
		/*
		 * Reset for every list entry, otherwise an entry without
		 * bits_any_sock would report the bitmap text and size left
		 * over from the previous entry.
		 */
		tmp[0] = '\0';
		len = -1;
		if (sock_gres->bits_any_sock) {
			bit_fmt(tmp, sizeof(tmp), sock_gres->bits_any_sock);
			len = bit_size(sock_gres->bits_any_sock);
		}
		info("  Sock[ANY]Cnt:%"PRIu64" Bits:%s of %d",
		     sock_gres->cnt_any_sock, tmp, len);

		for (i = 0; i < sock_gres->sock_cnt; i++) {
			if (sock_gres->cnt_by_sock[i] == 0)
				continue;
			tmp[0] = '\0';
			len = -1;
			if (sock_gres->bits_by_sock &&
			    sock_gres->bits_by_sock[i]) {
				bit_fmt(tmp, sizeof(tmp),
					sock_gres->bits_by_sock[i]);
				len = bit_size(sock_gres->bits_by_sock[i]);
			}
			info("  Sock[%d]Cnt:%"PRIu64" Bits:%s of %d", i,
			     sock_gres->cnt_by_sock[i], tmp, len);
		}
	}
	list_iterator_destroy(iter);
}
7459 
7460 /*
7461  * Determine how many cores on each socket of a node can be used by this job
7462  * IN job_gres_list   - job's gres_list built by gres_plugin_job_state_validate()
7463  * IN node_gres_list  - node's gres_list built by gres_plugin_node_config_validate()
7464  * IN use_total_gres  - if set then consider all gres resources as available,
7465  *		        and none are commited to running jobs
7466  * IN/OUT core_bitmap - Identification of available cores on this node
7467  * IN sockets         - Count of sockets on the node
7468  * IN cores_per_sock  - Count of cores per socket on this node
7469  * IN job_id          - job's ID (for logging)
7470  * IN node_name       - name of the node (for logging)
7471  * IN enforce_binding - if true then only use GRES with direct access to cores
7472  * IN s_p_n           - Expected sockets_per_node (NO_VAL if not limited)
7473  * OUT req_sock_map   - bitmap of specific requires sockets
7474  * IN user_id         - job's user ID
7475  * IN node_inx        - index of node to be evaluated
7476  * RET: List of sock_gres_t entries identifying what resources are available on
7477  *	each socket. Returns NULL if none available. Call FREE_NULL_LIST() to
7478  *	release memory.
7479  */
gres_plugin_job_test2(List job_gres_list,List node_gres_list,bool use_total_gres,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint32_t job_id,char * node_name,bool enforce_binding,uint32_t s_p_n,bitstr_t ** req_sock_map,uint32_t user_id,const uint32_t node_inx)7480 extern List gres_plugin_job_test2(List job_gres_list, List node_gres_list,
7481 				  bool use_total_gres, bitstr_t *core_bitmap,
7482 				  uint16_t sockets, uint16_t cores_per_sock,
7483 				  uint32_t job_id, char *node_name,
7484 				  bool enforce_binding, uint32_t s_p_n,
7485 				  bitstr_t **req_sock_map, uint32_t user_id,
7486 				  const uint32_t node_inx)
7487 {
7488 	List sock_gres_list = NULL;
7489 	ListIterator job_gres_iter;
7490 	gres_state_t *job_gres_ptr, *node_gres_ptr;
7491 	gres_job_state_t  *job_data_ptr;
7492 	gres_node_state_t *node_data_ptr;
7493 	uint32_t local_s_p_n;
7494 
7495 	if (!job_gres_list || (list_count(job_gres_list) == 0))
7496 		return sock_gres_list;
7497 	if (!node_gres_list)	/* Node lacks GRES to match */
7498 		return sock_gres_list;
7499 	(void) gres_plugin_init();
7500 
7501 	sock_gres_list = list_create(_sock_gres_del);
7502 	slurm_mutex_lock(&gres_context_lock);
7503 	job_gres_iter = list_iterator_create(job_gres_list);
7504 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
7505 		sock_gres_t *sock_gres = NULL;
7506 		node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
7507 		                                &job_gres_ptr->plugin_id);
7508 		if (node_gres_ptr == NULL) {
7509 			/* node lack GRES of type required by the job */
7510 			FREE_NULL_LIST(sock_gres_list);
7511 			break;
7512 		}
7513 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
7514 		node_data_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
7515 
7516 		if (job_data_ptr->gres_per_job &&
7517 		    !job_data_ptr->gres_per_socket)
7518 			local_s_p_n = s_p_n;	/* Maximize GRES per node */
7519 		else
7520 			local_s_p_n = NO_VAL;	/* No need to optimize socket */
7521 		if (core_bitmap && (bit_ffs(core_bitmap) == -1)) {
7522 			sock_gres = NULL;	/* No cores available */
7523 		} else if (node_data_ptr->topo_cnt) {
7524 			uint32_t alt_plugin_id = 0;
7525 			gres_node_state_t *alt_node_data_ptr = NULL;
7526 			if (!use_total_gres && have_gpu && have_mps) {
7527 				if (job_gres_ptr->plugin_id == gpu_plugin_id)
7528 					alt_plugin_id = mps_plugin_id;
7529 				if (job_gres_ptr->plugin_id == mps_plugin_id)
7530 					alt_plugin_id = gpu_plugin_id;
7531 			}
7532 			if (alt_plugin_id) {
7533 				node_gres_ptr = list_find_first(node_gres_list,
7534 				                                _gres_find_id,
7535 				                                &alt_plugin_id);
7536 			}
7537 			if (alt_plugin_id && node_gres_ptr) {
7538 				alt_node_data_ptr = (gres_node_state_t *)
7539 						    node_gres_ptr->gres_data;
7540 			} else {
7541 				/* GRES of interest not on this node */
7542 				alt_plugin_id = 0;
7543 			}
7544 			sock_gres = _build_sock_gres_by_topo(job_data_ptr,
7545 					node_data_ptr, use_total_gres,
7546 					core_bitmap, sockets, cores_per_sock,
7547 					job_id, node_name, enforce_binding,
7548 					local_s_p_n, req_sock_map,
7549 					job_gres_ptr->plugin_id,
7550 					alt_plugin_id, alt_node_data_ptr,
7551 					user_id, node_inx);
7552 		} else if (node_data_ptr->type_cnt) {
7553 			sock_gres = _build_sock_gres_by_type(job_data_ptr,
7554 					node_data_ptr, use_total_gres,
7555 					core_bitmap, sockets, cores_per_sock,
7556 					job_id, node_name);
7557 		} else {
7558 			sock_gres = _build_sock_gres_basic(job_data_ptr,
7559 					node_data_ptr, use_total_gres,
7560 					core_bitmap, sockets, cores_per_sock,
7561 					job_id, node_name);
7562 		}
7563 		if (!sock_gres) {
7564 			/* node lack available resources required by the job */
7565 			bit_clear_all(core_bitmap);
7566 			FREE_NULL_LIST(sock_gres_list);
7567 			break;
7568 		}
7569 		sock_gres->job_specs  = job_data_ptr;
7570 		sock_gres->gres_name  = xstrdup(job_data_ptr->gres_name);
7571 		sock_gres->node_specs = node_data_ptr;
7572 		sock_gres->plugin_id  = job_gres_ptr->plugin_id;
7573 		list_append(sock_gres_list, sock_gres);
7574 	}
7575 	list_iterator_destroy(job_gres_iter);
7576 	slurm_mutex_unlock(&gres_context_lock);
7577 
7578 	if (gres_debug)
7579 		_sock_gres_log(sock_gres_list, node_name);
7580 
7581 	return sock_gres_list;
7582 }
7583 
/*
 * Build an array with one flag per socket, set if at least one core of that
 * socket is set in core_bitmap.
 * IN core_bitmap    - cores available on the node
 * IN sockets        - count of sockets on the node
 * IN cores_per_sock - count of cores per socket on the node
 * RET array of "sockets" flags; the caller must xfree() it. Scanning stops
 *     at the end of core_bitmap should it be shorter than
 *     sockets * cores_per_sock bits, leaving the remaining flags false.
 */
static bool *_build_avail_cores_by_sock(bitstr_t *core_bitmap,
					uint16_t sockets,
					uint16_t cores_per_sock)
{
	bool *sock_has_core = xcalloc(sockets, sizeof(bool));
	int bitmap_bits = bit_size(core_bitmap);
	int sock, core, bit_inx;

	for (sock = 0; sock < sockets; sock++) {
		bit_inx = sock * cores_per_sock;
		for (core = 0; core < cores_per_sock; core++, bit_inx++) {
			if (bit_inx >= bitmap_bits)
				return sock_has_core;	/* should never happen */
			if (!bit_test(core_bitmap, bit_inx))
				continue;
			sock_has_core[sock] = true;
			break;
		}
	}

	return sock_has_core;
}
7606 
7607 /*
7608  * Determine which GRES can be used on this node given the available cores.
7609  *	Filter out unusable GRES.
7610  * IN sock_gres_list  - list of sock_gres_t entries built by gres_plugin_job_test2()
7611  * IN avail_mem       - memory available for the job
7612  * IN max_cpus        - maximum CPUs available on this node (limited by
7613  *                      specialized cores and partition CPUs-per-node)
7614  * IN enforce_binding - GRES must be co-allocated with cores
7615  * IN core_bitmap     - Identification of available cores on this node
7616  * IN sockets         - Count of sockets on the node
7617  * IN cores_per_sock  - Count of cores per socket on this node
7618  * IN cpus_per_core   - Count of CPUs per core on this node
7619  * IN sock_per_node   - sockets requested by job per node or NO_VAL
7620  * IN task_per_node   - tasks requested by job per node or NO_VAL16
7621  * IN whole_node      - we are requesting the whole node or not
7622  * OUT avail_gpus     - Count of available GPUs on this node
7623  * OUT near_gpus      - Count of GPUs available on sockets with available CPUs
7624  * RET - 0 if job can use this node, -1 otherwise (some GRES limit prevents use)
7625  */
gres_plugin_job_core_filter2(List sock_gres_list,uint64_t avail_mem,uint16_t max_cpus,bool enforce_binding,bitstr_t * core_bitmap,uint16_t sockets,uint16_t cores_per_sock,uint16_t cpus_per_core,uint32_t sock_per_node,uint16_t task_per_node,bool whole_node,uint16_t * avail_gpus,uint16_t * near_gpus)7626 extern int gres_plugin_job_core_filter2(List sock_gres_list, uint64_t avail_mem,
7627 					uint16_t max_cpus,
7628 					bool enforce_binding,
7629 					bitstr_t *core_bitmap,
7630 					uint16_t sockets,
7631 					uint16_t cores_per_sock,
7632 					uint16_t cpus_per_core,
7633 					uint32_t sock_per_node,
7634 					uint16_t task_per_node,
7635 					bool whole_node,
7636 					uint16_t *avail_gpus,
7637 					uint16_t *near_gpus)
7638 {
7639 	ListIterator sock_gres_iter;
7640 	sock_gres_t *sock_gres;
7641 	bool *avail_cores_by_sock = NULL;
7642 	uint64_t max_gres, mem_per_gres = 0, near_gres_cnt = 0;
7643 	uint16_t cpus_per_gres;
7644 	int s, rc = 0;
7645 
7646 	*avail_gpus = 0;
7647 	*near_gpus = 0;
7648 	if (!core_bitmap || !sock_gres_list ||
7649 	    (list_count(sock_gres_list) == 0))
7650 		return rc;
7651 
7652 	sock_gres_iter = list_iterator_create(sock_gres_list);
7653 	while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
7654 		uint64_t min_gres = 1, tmp_u64;
7655 		if (sock_gres->job_specs) {
7656 			gres_job_state_t *job_gres_ptr = sock_gres->job_specs;
7657 			if (whole_node)
7658 				min_gres = sock_gres->total_cnt;
7659 			else if (job_gres_ptr->gres_per_node)
7660 				min_gres = job_gres_ptr-> gres_per_node;
7661 			if (job_gres_ptr->gres_per_socket) {
7662 				tmp_u64 = job_gres_ptr->gres_per_socket;
7663 				if (sock_per_node != NO_VAL)
7664 					tmp_u64 *= sock_per_node;
7665 				min_gres = MAX(min_gres, tmp_u64);
7666 			}
7667 			if (job_gres_ptr->gres_per_task) {
7668 				tmp_u64 = job_gres_ptr->gres_per_task;
7669 				if (task_per_node != NO_VAL16)
7670 					tmp_u64 *= task_per_node;
7671 				min_gres = MAX(min_gres, tmp_u64);
7672 			}
7673 		}
7674 		if (!sock_gres->job_specs)
7675 			cpus_per_gres = 0;
7676 		else if (sock_gres->job_specs->cpus_per_gres)
7677 			cpus_per_gres = sock_gres->job_specs->cpus_per_gres;
7678 		else
7679 			cpus_per_gres = sock_gres->job_specs->def_cpus_per_gres;
7680 		if (cpus_per_gres) {
7681 			max_gres = max_cpus / cpus_per_gres;
7682 			if ((max_gres == 0) ||
7683 			    (sock_gres->job_specs->gres_per_node > max_gres) ||
7684 			    (sock_gres->job_specs->gres_per_task > max_gres) ||
7685 			    (sock_gres->job_specs->gres_per_socket > max_gres)){
7686 				/* Insufficient CPUs for any GRES */
7687 				rc = -1;
7688 				break;
7689 			}
7690 		}
7691 		if (!sock_gres->job_specs)
7692 			mem_per_gres = 0;
7693 		else if (sock_gres->job_specs->mem_per_gres)
7694 			mem_per_gres = sock_gres->job_specs->mem_per_gres;
7695 		else
7696 			mem_per_gres = sock_gres->job_specs->def_mem_per_gres;
7697 		if (mem_per_gres && avail_mem) {
7698 			if (mem_per_gres <= avail_mem) {
7699 				sock_gres->max_node_gres = avail_mem /
7700 							   mem_per_gres;
7701 			} else { /* Insufficient memory for any GRES */
7702 				rc = -1;
7703 				break;
7704 			}
7705 		}
7706 		if (sock_gres->cnt_by_sock || enforce_binding) {
7707 			if (!avail_cores_by_sock) {
7708 				avail_cores_by_sock =_build_avail_cores_by_sock(
7709 							core_bitmap, sockets,
7710 							cores_per_sock);
7711 			}
7712 		}
7713 		/*
7714 		 * NOTE: gres_per_socket enforcement is performed by
7715 		 * _build_sock_gres_by_topo(), called by gres_plugin_job_test2()
7716 		 */
7717 		if (sock_gres->cnt_by_sock && enforce_binding) {
7718 			for (s = 0; s < sockets; s++) {
7719 				if (avail_cores_by_sock[s] == 0) {
7720 					sock_gres->total_cnt -=
7721 						sock_gres->cnt_by_sock[s];
7722 					sock_gres->cnt_by_sock[s] = 0;
7723 				}
7724 			}
7725 			near_gres_cnt = sock_gres->total_cnt;
7726 		} else if (sock_gres->cnt_by_sock) { /* NO enforce_binding */
7727 			near_gres_cnt = sock_gres->total_cnt;
7728 			for (s = 0; s < sockets; s++) {
7729 				if (avail_cores_by_sock[s] == 0) {
7730 					near_gres_cnt -=
7731 						sock_gres->cnt_by_sock[s];
7732 				}
7733 			}
7734 		} else {
7735 			near_gres_cnt = sock_gres->total_cnt;
7736 		}
7737 		if (sock_gres->job_specs && !whole_node &&
7738 		    sock_gres->job_specs->gres_per_node) {
7739 			if ((sock_gres->max_node_gres == 0) ||
7740 			    (sock_gres->max_node_gres >
7741 			     sock_gres->job_specs->gres_per_node)) {
7742 				sock_gres->max_node_gres =
7743 					sock_gres->job_specs->gres_per_node;
7744 			}
7745 		}
7746 		if (cpus_per_gres) {
7747 			int cpu_cnt;
7748 			cpu_cnt = bit_set_count(core_bitmap);
7749 			cpu_cnt *= cpus_per_core;
7750 			max_gres = cpu_cnt / cpus_per_gres;
7751 			if (max_gres == 0) {
7752 				rc = -1;
7753 				break;
7754 			} else if ((sock_gres->max_node_gres == 0) ||
7755 				   (sock_gres->max_node_gres > max_gres)) {
7756 				sock_gres->max_node_gres = max_gres;
7757 			}
7758 		}
7759 		if (mem_per_gres) {
7760 			max_gres = avail_mem / mem_per_gres;
7761 			sock_gres->total_cnt = MIN(sock_gres->total_cnt,
7762 						   max_gres);
7763 		}
7764 		if ((sock_gres->total_cnt < min_gres) ||
7765 		    ((sock_gres->max_node_gres != 0) &&
7766 		     (sock_gres->max_node_gres < min_gres))) {
7767 			rc = -1;
7768 			break;
7769 		}
7770 
7771 		if (_sharing_gres(sock_gres->plugin_id)) {
7772 			 *avail_gpus += sock_gres->total_cnt;
7773 			if (sock_gres->max_node_gres &&
7774 			    (sock_gres->max_node_gres < near_gres_cnt))
7775 				near_gres_cnt = sock_gres->max_node_gres;
7776 			if (*near_gpus < 0xff)	/* avoid overflow */
7777 				*near_gpus += near_gres_cnt;
7778 		}
7779 	}
7780 	list_iterator_destroy(sock_gres_iter);
7781 	xfree(avail_cores_by_sock);
7782 
7783 	return rc;
7784 }
7785 
7786 /* Order GRES scheduling. Schedule GRES requiring specific sockets first */
_sock_gres_sort(void * x,void * y)7787 static int _sock_gres_sort(void *x, void *y)
7788 {
7789 	sock_gres_t *sock_gres1 = *(sock_gres_t **) x;
7790 	sock_gres_t *sock_gres2 = *(sock_gres_t **) y;
7791 	int weight1 = 0, weight2 = 0;
7792 
7793 	if (sock_gres1->node_specs && !sock_gres1->node_specs->topo_cnt)
7794 		weight1 += 0x02;
7795 	if (sock_gres1->job_specs && !sock_gres1->job_specs->gres_per_socket)
7796 		weight1 += 0x01;
7797 
7798 	if (sock_gres2->node_specs && !sock_gres2->node_specs->topo_cnt)
7799 		weight2 += 0x02;
7800 	if (sock_gres2->job_specs && !sock_gres2->job_specs->gres_per_socket)
7801 		weight2 += 0x01;
7802 
7803 	return weight1 - weight2;
7804 }
7805 
/*
 * Compare two socket indexes by their available core counts so that sockets
 * with more available cores sort first (descending order).
 * IN x, y               - pointers to int socket indexes
 * IN socket_avail_cores - uint16_t array of core counts, indexed by socket
 * RET core count of socket *y minus core count of socket *x
 * NOTE(review): the signature matches a qsort_r()-style comparator; confirm
 * against the caller.
 */
int _sort_sockets_by_avail_cores(const void *x, const void *y,
				 void *socket_avail_cores)
{
	const uint16_t *cores_by_sock = socket_avail_cores;
	const int sock_x = *(const int *) x;
	const int sock_y = *(const int *) y;

	return (int) cores_by_sock[sock_y] - (int) cores_by_sock[sock_x];
}
7812 
7813 /*
7814  * Determine how many tasks can be started on a given node and which
7815  *	sockets/cores are required
7816  * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero
7817  * IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2()
7818  * IN sockets - Count of sockets on the node
7819  * IN cores_per_socket - Count of cores per socket on the node
7820  * IN cpus_per_core - Count of CPUs per core on the node
7821  * IN avail_cpus - Count of available CPUs on the node, UPDATED
7822  * IN min_tasks_this_node - Minimum count of tasks that can be started on this
7823  *                          node, UPDATED
7824  * IN max_tasks_this_node - Maximum count of tasks that can be started on this
7825  *                          node or NO_VAL, UPDATED
7826  * IN rem_nodes - desired additional node count to allocate, including this node
7827  * IN enforce_binding - GRES must be co-allocated with cores
7828  * IN first_pass - set if first scheduling attempt for this job, use
7829  *		   co-located GRES and cores if possible
7830  * IN avail_core - cores available on this node, UPDATED
7831  */
extern void gres_plugin_job_core_filter3(gres_mc_data_t *mc_ptr,
					 List sock_gres_list,
					 uint16_t sockets,
					 uint16_t cores_per_socket,
					 uint16_t cpus_per_core,
					 uint16_t *avail_cpus,
					 uint32_t *min_tasks_this_node,
					 uint32_t *max_tasks_this_node,
					 int rem_nodes,
					 bool enforce_binding,
					 bool first_pass,
					 bitstr_t *avail_core)
{
	static uint16_t select_type_param = NO_VAL16;
	ListIterator sock_gres_iter;
	sock_gres_t *sock_gres;
	gres_job_state_t *job_specs;
	int i, j, c, s, sock_cnt = 0, req_cores, rem_sockets, full_socket;
	int tot_core_cnt = 0, min_core_cnt = 1;
	uint64_t cnt_avail_sock, cnt_avail_total, max_gres = 0, rem_gres = 0;
	uint64_t tot_gres_sock, max_tasks;
	uint32_t task_cnt_incr;
	bool *req_sock = NULL;	/* Required socket */
	int *socket_index = NULL; /* Socket indexes */
	uint16_t *avail_cores_per_sock, cpus_per_gres;
	uint16_t avail_cores_tot;

	/* Node already ruled out, nothing to filter */
	if (*max_tasks_this_node == 0)
		return;

	/* Count the available cores on each socket and on the whole node */
	xassert(avail_core);
	avail_cores_per_sock = xcalloc(sockets, sizeof(uint16_t));
	for (s = 0; s < sockets; s++) {
		for (c = 0; c < cores_per_socket; c++) {
			i = (s * cores_per_socket) + c;
			if (bit_test(avail_core, i))
				avail_cores_per_sock[s]++;
		}
		tot_core_cnt += avail_cores_per_sock[s];
	}

	/*
	 * The task counts are rounded down to a multiple of this increment
	 * further below. Keep it at least 1 so a zero minimum task count
	 * cannot trigger a division by zero.
	 */
	task_cnt_incr = MAX(*min_tasks_this_node, 1);
	req_sock = xcalloc(sockets, sizeof(bool));
	socket_index = xcalloc(sockets, sizeof(int));

	/*
	 * NOTE(review): req_sock[] and sock_cnt are never reset between list
	 * entries, so they accumulate across all GRES records processed by
	 * this loop - confirm this is intended.
	 */
	list_sort(sock_gres_list, _sock_gres_sort);
	sock_gres_iter = list_iterator_create(sock_gres_list);
	while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
		bool sufficient_gres;
		job_specs = sock_gres->job_specs;
		if (!job_specs)
			continue;
		/*
		 * NOTE(review): rem_gres and max_gres are only assigned when
		 * the conditions below hold; otherwise they keep the value
		 * left by a previous list entry - confirm this is intended.
		 */
		if (job_specs->gres_per_job &&
		    (job_specs->total_gres < job_specs->gres_per_job)) {
			rem_gres = job_specs->gres_per_job -
				   job_specs->total_gres;
		}

		/*
		 * gres_plugin_job_core_filter2() sets sock_gres->max_node_gres
		 * for mem_per_gres enforcement; use it to set GRES limit for
		 * this node (max_gres).
		 */
		if (sock_gres->max_node_gres) {
			if (rem_gres && (rem_gres < sock_gres->max_node_gres))
				max_gres = rem_gres;
			else
				max_gres = sock_gres->max_node_gres;
		}
		rem_nodes = MAX(rem_nodes, 1);
		rem_sockets = MAX(1, mc_ptr->sockets_per_node);
		if (max_gres &&
		    ((job_specs->gres_per_node > max_gres) ||
		     ((job_specs->gres_per_socket * rem_sockets) > max_gres))) {
			*max_tasks_this_node = 0;
			break;
		}
		if (job_specs->gres_per_node && job_specs->gres_per_task) {
			/*
			 * With both counts given, the number of tasks on this
			 * node is fixed by their ratio.
			 */
			max_tasks = job_specs->gres_per_node /
				    job_specs->gres_per_task;
			if ((max_tasks == 0) ||
			    (max_tasks > *max_tasks_this_node) ||
			    (max_tasks < *min_tasks_this_node)) {
				*max_tasks_this_node = 0;
				break;
			}
			/*
			 * Limit the task count to the computed task limit
			 * (not to max_gres, which is a GRES count and is
			 * zero unless max_node_gres is set).
			 */
			if ((*max_tasks_this_node == NO_VAL) ||
			    (*max_tasks_this_node > max_tasks))
				*max_tasks_this_node = max_tasks;
		}

		/* Minimum core count needed for the minimum task count */
		min_core_cnt = MAX(*min_tasks_this_node, 1) *
			       MAX(mc_ptr->cpus_per_task, 1);
		min_core_cnt = (min_core_cnt + cpus_per_core - 1) /
			       cpus_per_core;

		if (job_specs->cpus_per_gres)
			cpus_per_gres = job_specs->cpus_per_gres;
		else
			cpus_per_gres = job_specs->def_cpus_per_gres;

		/* Filter out unusable GRES by socket */
		avail_cores_tot = 0;
		cnt_avail_total = sock_gres->cnt_any_sock;
		sufficient_gres = false;
		for (s = 0; s < sockets; s++)
			socket_index[s] = s;
		qsort_r(socket_index, sockets, sizeof(int),
			_sort_sockets_by_avail_cores, avail_cores_per_sock);

		for (j = 0; j < sockets; j++) {
			/*
			 * Test for sufficient gres_per_socket
			 *
			 * Start with socket with most cores available,
			 * so we know that we have max number of cores on socket
			 * with allocated GRES.
			 */
			s = socket_index[j];

			if (sock_gres->cnt_by_sock) {
				cnt_avail_sock = sock_gres->cnt_by_sock[s];
			} else
				cnt_avail_sock = 0;

			/*
			 * If enforce binding number of gres allocated per
			 * socket has to be limited by cpus_per_gres
			 */
			if ((enforce_binding || first_pass) && cpus_per_gres) {
				int max_gres_socket = (avail_cores_per_sock[s] *
						       cpus_per_core) /
						      cpus_per_gres;
				cnt_avail_sock = MIN(cnt_avail_sock,
						     max_gres_socket);
			}

			tot_gres_sock = sock_gres->cnt_any_sock +
					cnt_avail_sock;
			if ((job_specs->gres_per_socket > tot_gres_sock) ||
			    (tot_gres_sock == 0)) {
				/*
				 * Insufficient GRES on this socket
				 * GRES removed here won't be used in 2nd pass
				 */
				if (((job_specs->gres_per_socket >
				      tot_gres_sock) ||
				     enforce_binding) &&
				    sock_gres->cnt_by_sock) {
					sock_gres->total_cnt -=
						sock_gres->cnt_by_sock[s];
					sock_gres->cnt_by_sock[s] = 0;
				}
				/*
				 * On the first pass also drop this socket's
				 * cores, but never go below min_core_cnt.
				 */
				if (first_pass &&
				    (tot_core_cnt > min_core_cnt)) {
					for (c = cores_per_socket - 1;
					     c >= 0; c--) {
						i = (s * cores_per_socket) + c;
						if (!bit_test(avail_core, i))
							continue;
						bit_clear(avail_core, i);

						avail_cores_per_sock[s]--;
						if (bit_set_count(avail_core) *
						    cpus_per_core <
						    *avail_cpus) {
							*avail_cpus -=
								cpus_per_core;
						}
						if (--tot_core_cnt <=
						    min_core_cnt)
							break;
					}
				}
			}

			avail_cores_tot += avail_cores_per_sock[s];
			/* Test for available cores on this socket */
			if ((enforce_binding || first_pass) &&
			    (avail_cores_per_sock[s] == 0))
				continue;

			cnt_avail_total += cnt_avail_sock;
			if (!sufficient_gres) {
				req_sock[s] = true;
				sock_cnt++;
			}

			if (job_specs->gres_per_node &&
			    (cnt_avail_total >= job_specs->gres_per_node) &&
			    !sock_gres->cnt_any_sock) {
				/*
				 * Sufficient gres will leave remaining CPUs as
				 * !req_sock. We do this only when we
				 * collected enough and all collected gres of
				 * considered type are bound to socket.
				 */
				sufficient_gres = true;
			}
		}

		/* Usable GRES count is also bounded by the available CPUs */
		if (cpus_per_gres) {
			max_gres = *avail_cpus / cpus_per_gres;
			cnt_avail_total = MIN(cnt_avail_total, max_gres);
		}
		if ((cnt_avail_total == 0) ||
		    (job_specs->gres_per_node > cnt_avail_total) ||
		    (job_specs->gres_per_task > cnt_avail_total)) {
			*max_tasks_this_node = 0;
		}
		if (job_specs->gres_per_task) {
			max_tasks = cnt_avail_total / job_specs->gres_per_task;
			*max_tasks_this_node = MIN(*max_tasks_this_node,
						   max_tasks);
		}

		/*
		 * min_tasks_this_node and max_tasks_this_node must be multiple
		 * of original min_tasks_this_node value. This is to support
		 * ntasks_per_* option and we just need to select a count of
		 * tasks, sockets, etc. Round the values down.
		 */
		*min_tasks_this_node = (*min_tasks_this_node / task_cnt_incr) *
				       task_cnt_incr;
		*max_tasks_this_node = (*max_tasks_this_node / task_cnt_incr) *
				       task_cnt_incr;

		if (*max_tasks_this_node == 0)
			break;

		/*
		 * Remove cores on not required sockets when enforce-binding,
		 * this has to happen also when max_tasks_this_node == NO_VAL
		 */
		if (enforce_binding || first_pass) {
			for (s = 0; s < sockets; s++) {
				if (req_sock[s])
					continue;
				for (c = cores_per_socket - 1; c >= 0; c--) {
					i = (s * cores_per_socket) + c;
					if (!bit_test(avail_core, i))
						continue;
					bit_clear(avail_core, i);
					if (bit_set_count(avail_core) *
					    cpus_per_core < *avail_cpus) {
						*avail_cpus -= cpus_per_core;
					}
					avail_cores_tot--;
					avail_cores_per_sock[s]--;
				}
			}
		}

		if (*max_tasks_this_node == NO_VAL) {
			if (cpus_per_gres) {
				i = *avail_cpus / cpus_per_gres;
				sock_gres->total_cnt =
					MIN(i, sock_gres->total_cnt);
			}
			log_flag(GRES, "%s: max_tasks_this_node is set to NO_VAL, won't clear non-needed cores",
				 __func__);
			continue;
		}
		if (*max_tasks_this_node < *min_tasks_this_node) {
			error("%s: min_tasks_this_node:%u > max_tasks_this_node:%u",
			      __func__,
			      *min_tasks_this_node,
			      *max_tasks_this_node);
		}

		/*
		 * Determine how many cores are needed for this job.
		 * Consider rounding errors if cpus_per_task not divisible
		 * by cpus_per_core
		 */
		req_cores = *max_tasks_this_node;
		if (mc_ptr->cpus_per_task) {
			int threads_per_core, removed_tasks = 0;

			if (mc_ptr->threads_per_core)
				threads_per_core =
					MIN(cpus_per_core,
					    mc_ptr->threads_per_core);
			else
				threads_per_core = cpus_per_core;

			req_cores *= mc_ptr->cpus_per_task;

			/*
			 * Drop tasks one at a time until the required cores
			 * fit in the available cores.
			 * NOTE(review): after a task is dropped req_cores is
			 * reset to the bare task count, without the
			 * cpus_per_task factor applied before the loop -
			 * confirm this is intended.
			 */
			while (*max_tasks_this_node >= *min_tasks_this_node) {
				/* round up by full threads per core */
				req_cores += threads_per_core - 1;
				req_cores /= threads_per_core;
				if (req_cores <= avail_cores_tot) {
					if (removed_tasks)
						log_flag(GRES, "%s: settings required_cores=%d by max_tasks_this_node=%u(reduced=%d) cpus_per_task=%d cpus_per_core=%d threads_per_core:%d",
							 __func__,
							 req_cores,
							 *max_tasks_this_node,
							 removed_tasks,
							 mc_ptr->cpus_per_task,
							 cpus_per_core,
							 mc_ptr->
							 threads_per_core);
					break;
				}
				removed_tasks++;
				(*max_tasks_this_node)--;
				req_cores = *max_tasks_this_node;
			}
		}
		if (cpus_per_gres) {
			/* Estimate the GRES count, then the cores it needs */
			if (job_specs->gres_per_node) {
				i = job_specs->gres_per_node;
				log_flag(GRES, "%s: estimating req_cores gres_per_node=%"PRIu64,
					 __func__, job_specs->gres_per_node);
			} else if (job_specs->gres_per_socket) {
				i = job_specs->gres_per_socket * sock_cnt;
				log_flag(GRES, "%s: estimating req_cores gres_per_socket=%"PRIu64,
					 __func__, job_specs->gres_per_socket);
			} else if (job_specs->gres_per_task) {
				i = job_specs->gres_per_task *
				    *max_tasks_this_node;
				log_flag(GRES, "%s: estimating req_cores max_tasks_this_node=%u gres_per_task=%"PRIu64,
					 __func__,
					 *max_tasks_this_node,
					 job_specs->gres_per_task);
			} else if (cnt_avail_total) {
				i = cnt_avail_total;
				log_flag(GRES, "%s: estimating req_cores cnt_avail_total=%"PRIu64,
					 __func__, cnt_avail_total);
			} else {
				i = 1;
				log_flag(GRES, "%s: estimating req_cores default to 1 task",
					 __func__);
			}
			i *= cpus_per_gres;
			i = (i + cpus_per_core - 1) / cpus_per_core;
			if (req_cores < i)
				log_flag(GRES, "%s: Increasing req_cores=%d from cpus_per_gres=%d cpus_per_core=%"PRIu16,
					 __func__, i, cpus_per_gres,
					 cpus_per_core);
			req_cores = MAX(req_cores, i);
		}

		if (req_cores > avail_cores_tot) {
			log_flag(GRES, "%s: Job cannot run on node req_cores:%d > aval_cores_tot:%d",
				 __func__, req_cores, avail_cores_tot);
			*max_tasks_this_node = 0;
			break;
		}

		/*
		 * Clear extra avail_core bits on sockets we don't need
		 * up to required number of cores based on max_tasks_this_node.
		 * In case of enforce-binding those are already cleared.
		 */
		if ((avail_cores_tot > req_cores) &&
		     !enforce_binding && !first_pass) {
			for (s = 0; s < sockets; s++) {
				if (avail_cores_tot == req_cores)
					break;
				if (req_sock[s])
					continue;
				for (c = cores_per_socket - 1; c >= 0; c--) {
					i = (s * cores_per_socket) + c;
					if (!bit_test(avail_core, i))
						continue;
					bit_clear(avail_core, i);
					if (bit_set_count(avail_core) *
					    cpus_per_core < *avail_cpus) {
						*avail_cpus -= cpus_per_core;
					}
					avail_cores_tot--;
					avail_cores_per_sock[s]--;
					if (avail_cores_tot == req_cores)
						break;
				}
			}
		}

		/*
		 * Clear extra avail_core bits on sockets we do need, but
		 * spread them out so that every socket has some cores
		 * available to use with the nearby GRES that we do need.
		 */
		while (avail_cores_tot > req_cores) {
			/* Pick the required socket with the most cores left */
			full_socket = -1;
			for (s = 0; s < sockets; s++) {
				if (avail_cores_tot == req_cores)
					break;
				if (!req_sock[s] ||
				    (avail_cores_per_sock[s] == 0))
					continue;
				if ((full_socket == -1) ||
				    (avail_cores_per_sock[full_socket] <
				     avail_cores_per_sock[s])) {
					full_socket = s;
				}
			}
			if (full_socket == -1)
				break;
			/* Remove a single core (highest index) from it */
			s = full_socket;
			for (c = cores_per_socket - 1; c >= 0; c--) {
				i = (s * cores_per_socket) + c;
				if (!bit_test(avail_core, i))
					continue;
				bit_clear(avail_core, i);
				if (bit_set_count(avail_core) * cpus_per_core <
				    *avail_cpus) {
					*avail_cpus -= cpus_per_core;
				}
				avail_cores_per_sock[s]--;
				avail_cores_tot--;
				break;
			}
		}
		/* Re-check the GRES count against the reduced CPU count */
		if (cpus_per_gres) {
			i = *avail_cpus / cpus_per_gres;
			sock_gres->total_cnt = MIN(i, sock_gres->total_cnt);
			if ((job_specs->gres_per_node > sock_gres->total_cnt) ||
			    (job_specs->gres_per_task > sock_gres->total_cnt)) {
				*max_tasks_this_node = 0;
			}
		}
	}
	list_iterator_destroy(sock_gres_iter);
	xfree(avail_cores_per_sock);
	xfree(req_sock);
	xfree(socket_index);


	/* SelectTypeParameters is looked up once and cached */
	if (select_type_param == NO_VAL16)
		select_type_param = slurm_get_select_type_param();
	if ((mc_ptr->cpus_per_task > 1) ||
	    ((select_type_param & CR_ONE_TASK_PER_CORE) == 0)) {
		/*
		 * Only adjust *avail_cpus for the maximum task count if
		 * cpus_per_task is explicitly set. There is currently no way
		 * to tell if cpus_per_task==1 is explicitly set by the job
		 * when SelectTypeParameters includes CR_ONE_TASK_PER_CORE.
		 */
		*avail_cpus = MIN(*avail_cpus,
				  *max_tasks_this_node * mc_ptr->cpus_per_task);
	}
}
8277 
8278 /*
8279  * Return the maximum number of tasks that can be started on a node with
8280  * sock_gres_list (per-socket GRES details for some node)
8281  */
gres_plugin_get_task_limit(List sock_gres_list)8282 extern uint32_t gres_plugin_get_task_limit(List sock_gres_list)
8283 {
8284 	ListIterator sock_gres_iter;
8285 	sock_gres_t *sock_gres;
8286 	uint32_t max_tasks = NO_VAL;
8287 	uint64_t task_limit;
8288 
8289 	sock_gres_iter = list_iterator_create(sock_gres_list);
8290 	while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
8291 		xassert(sock_gres->job_specs);
8292 		if (sock_gres->job_specs->gres_per_task == 0)
8293 			continue;
8294 		task_limit = sock_gres->total_cnt /
8295 			     sock_gres->job_specs->gres_per_task;
8296 		max_tasks = MIN(max_tasks, task_limit);
8297 	}
8298 	list_iterator_destroy(sock_gres_iter);
8299 
8300 	return max_tasks;
8301 }
8302 
8303 /*
8304  * Return count of sockets allocated to this job on this node
8305  * job_res IN - job resource allocation
8306  * node_inx IN - global node index
8307  * job_node_inx IN - node index for this job's allocation
8308  * RET socket count
8309  */
_get_sock_cnt(struct job_resources * job_res,int node_inx,int job_node_inx)8310 static int _get_sock_cnt(struct job_resources *job_res, int node_inx,
8311 			 int job_node_inx)
8312 {
8313 	int core_offset, used_sock_cnt = 0;
8314 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8315 	int c, i, rc, s;
8316 
8317 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8318 				   &cores_per_socket_cnt);
8319 	if (rc != SLURM_SUCCESS) {
8320 		error("%s: Invalid socket/core count", __func__);
8321 		return 1;
8322 	}
8323 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8324 	if (core_offset < 0) {
8325 		error("%s: Invalid core offset", __func__);
8326 		return 1;
8327 	}
8328 	for (s = 0; s < sock_cnt; s++) {
8329 		for (c = 0; c < cores_per_socket_cnt; c++) {
8330 			i = (s * cores_per_socket_cnt) + c;
8331 			if (bit_test(job_res->core_bitmap, (core_offset + i)))
8332 				used_sock_cnt++;
8333 		}
8334 	}
8335 	if (used_sock_cnt == 0) {
8336 		error("%s: No allocated cores found", __func__);
8337 		return 1;
8338 	}
8339 	return used_sock_cnt;
8340 }
8341 
8342 /*
8343  * Select specific GRES (set GRES bitmap) for this job on this node based upon
8344  *	per-job resource specification. Use only socket-local GRES
8345  * job_res IN - job resource allocation
8346  * node_inx IN - global node index
8347  * job_node_inx IN - node index for this job's allocation
8348  * rem_nodes IN - count of nodes remaining to place resources on
8349  * job_specs IN - job request specifications, UPDATED: set bits in
8350  *		  gres_bit_select
8351  * node_specs IN - node resource request specifications
8352  * job_id IN - job ID for logging
8353  * tres_mc_ptr IN - job's multi-core options
8354  * cpus_per_core IN - CPUs per core on this node
8355  * RET 0:more work, 1:fini
8356  */
_set_job_bits1(struct job_resources * job_res,int node_inx,int job_node_inx,int rem_nodes,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr,uint16_t cpus_per_core)8357 static int _set_job_bits1(struct job_resources *job_res, int node_inx,
8358 			  int job_node_inx, int rem_nodes,
8359 			  sock_gres_t *sock_gres, uint32_t job_id,
8360 			  gres_mc_data_t *tres_mc_ptr, uint16_t cpus_per_core)
8361 {
8362 	int core_offset, gres_cnt;
8363 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8364 	int c, i, g, rc, s;
8365 	gres_job_state_t *job_specs;
8366 	gres_node_state_t *node_specs;
8367 	int *cores_on_sock = NULL, alloc_gres_cnt = 0;
8368 	int max_gres, pick_gres, total_cores = 0;
8369 	int fini = 0;
8370 
8371 	job_specs = sock_gres->job_specs;
8372 	node_specs = sock_gres->node_specs;
8373 	if (job_specs->gres_per_job == job_specs->total_gres)
8374 		fini = 1;
8375 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8376 				   &cores_per_socket_cnt);
8377 	if (rc != SLURM_SUCCESS) {
8378 		error("%s: Invalid socket/core count for job %u on node %d",
8379 		      __func__, job_id, node_inx);
8380 		return rc;
8381 	}
8382 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8383 	if (core_offset < 0) {
8384 		error("%s: Invalid core offset for job %u on node %d",
8385 		      __func__, job_id, node_inx);
8386 		return rc;
8387 	}
8388 	i = sock_gres->sock_cnt;
8389 	if ((i != 0) && (i != sock_cnt)) {
8390 		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
8391 		      __func__, i, sock_cnt, job_id, node_inx);
8392 		sock_cnt = MIN(sock_cnt, i);
8393 	}
8394 	xassert(job_res->core_bitmap);
8395 	if (job_node_inx == 0)
8396 		job_specs->total_gres = 0;
8397 	max_gres = job_specs->gres_per_job - job_specs->total_gres -
8398 		   (rem_nodes - 1);
8399 	cores_on_sock = xcalloc(sock_cnt, sizeof(int));
8400 	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
8401 	for (s = 0; s < sock_cnt; s++) {
8402 		for (c = 0; c < cores_per_socket_cnt; c++) {
8403 			i = (s * cores_per_socket_cnt) + c;
8404 			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
8405 				cores_on_sock[s]++;
8406 				total_cores++;
8407 			}
8408 		}
8409 	}
8410 	if (job_specs->cpus_per_gres) {
8411 		max_gres = MIN(max_gres,
8412 			       ((total_cores * cpus_per_core) /
8413 				job_specs->cpus_per_gres));
8414 	}
8415 	if ((max_gres > 1) && (node_specs->link_len == gres_cnt))
8416 		pick_gres  = NO_VAL16;
8417 	else
8418 		pick_gres = max_gres;
8419 	/*
8420 	 * Now pick specific GRES for these sockets.
8421 	 * First select all GRES that we might possibly use, starting with
8422 	 * those not constrained by socket, then contrained by socket.
8423 	 * Then remove those which are not required and not "best".
8424 	 */
8425 	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
8426 	     ((s < sock_cnt) && (alloc_gres_cnt < pick_gres)); s++) {
8427 		if ((s >= 0) && !cores_on_sock[s])
8428 			continue;
8429 		for (g = 0; ((g < gres_cnt) && (alloc_gres_cnt < pick_gres));
8430 		     g++) {
8431 			if ((s == -1) &&
8432 			    (!sock_gres->bits_any_sock ||
8433 			     !bit_test(sock_gres->bits_any_sock, g)))
8434 				continue;   /* GRES not avail any socket */
8435 			if ((s >= 0) &&
8436 			    (!sock_gres->bits_by_sock ||
8437 			     !sock_gres->bits_by_sock[s] ||
8438 			     !bit_test(sock_gres->bits_by_sock[s], g)))
8439 				continue;   /* GRES not on this socket */
8440 			if (bit_test(node_specs->gres_bit_alloc, g) ||
8441 			    bit_test(job_specs->gres_bit_select[node_inx], g))
8442 				continue;   /* Already allocated GRES */
8443 			bit_set(job_specs->gres_bit_select[node_inx], g);
8444 			job_specs->gres_cnt_node_select[node_inx]++;
8445 			alloc_gres_cnt++;
8446 			job_specs->total_gres++;
8447 		}
8448 	}
8449 	if (alloc_gres_cnt == 0) {
8450 		for (s = 0; ((s < sock_cnt) && (alloc_gres_cnt == 0)); s++) {
8451 			if (cores_on_sock[s])
8452 				continue;
8453 			for (g = 0; g < gres_cnt; g++) {
8454 				if (!sock_gres->bits_by_sock ||
8455 				     !sock_gres->bits_by_sock[s] ||
8456 				     !bit_test(sock_gres->bits_by_sock[s], g))
8457 					continue;   /* GRES not on this socket */
8458 				if (bit_test(node_specs->gres_bit_alloc, g) ||
8459 				    bit_test(job_specs->
8460 					     gres_bit_select[node_inx], g))
8461 					continue;   /* Already allocated GRES */
8462 				bit_set(job_specs->gres_bit_select[node_inx],g);
8463 				job_specs->gres_cnt_node_select[node_inx]++;
8464 				alloc_gres_cnt++;
8465 				job_specs->total_gres++;
8466 				break;
8467 			}
8468 		}
8469 	}
8470 	if (alloc_gres_cnt == 0) {
8471 		error("%s: job %u failed to find any available GRES on node %d",
8472 		      __func__, job_id, node_inx);
8473 	}
8474 	/* Now pick the "best" max_gres GRES with respect to link counts. */
8475 	if (alloc_gres_cnt > max_gres) {
8476 		int best_link_cnt = -1, best_inx = -1;
8477 		for (s = 0; s < gres_cnt; s++) {
8478 			if (!bit_test(job_specs->gres_bit_select[node_inx], s))
8479 				continue;
8480 			for (g = s + 1; g < gres_cnt; g++) {
8481 				if (!bit_test(job_specs->
8482 					      gres_bit_select[node_inx], g))
8483 					continue;
8484 				if (node_specs->links_cnt[s][g] <=
8485 				    best_link_cnt)
8486 					continue;
8487 				best_link_cnt = node_specs->links_cnt[s][g];
8488 				best_inx = s;
8489 			}
8490 		}
8491 		while ((alloc_gres_cnt > max_gres) && (best_link_cnt != -1)) {
8492 			int worst_inx = -1, worst_link_cnt = NO_VAL16;
8493 			for (g = 0; g < gres_cnt; g++) {
8494 				if (g == best_inx)
8495 					continue;
8496 				if (!bit_test(job_specs->
8497 					      gres_bit_select[node_inx], g))
8498 					continue;
8499 				if (node_specs->links_cnt[best_inx][g] >=
8500 				    worst_link_cnt)
8501 					continue;
8502 				worst_link_cnt =
8503 					node_specs->links_cnt[best_inx][g];
8504 				worst_inx = g;
8505 			}
8506 			if (worst_inx == -1) {
8507 				error("%s: error managing links_cnt", __func__);
8508 				break;
8509 			}
8510 			bit_clear(job_specs->gres_bit_select[node_inx],
8511 				  worst_inx);
8512 			job_specs->gres_cnt_node_select[node_inx]--;
8513 			alloc_gres_cnt--;
8514 			job_specs->total_gres--;
8515 		}
8516 	}
8517 
8518 	xfree(cores_on_sock);
8519 	if (job_specs->total_gres >= job_specs->gres_per_job)
8520 		fini = 1;
8521 	return fini;
8522 }
8523 
8524 /*
8525  * Select specific GRES (set GRES bitmap) for this job on this node based upon
8526  *	per-job resource specification. Use any GRES on the node
8527  * job_res IN - job resource allocation
8528  * node_inx IN - global node index
8529  * job_node_inx IN - node index for this job's allocation
8530  * job_specs IN - job request specifications, UPDATED: set bits in
8531  *		  gres_bit_select
8532  * node_specs IN - node resource request specifications
8533  * job_id IN - job ID for logging
8534  * tres_mc_ptr IN - job's multi-core options
8535  * RET 0:more work, 1:fini
8536  */
_set_job_bits2(struct job_resources * job_res,int node_inx,int job_node_inx,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr)8537 static int _set_job_bits2(struct job_resources *job_res, int node_inx,
8538 			  int job_node_inx, sock_gres_t *sock_gres,
8539 			  uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
8540 {
8541 	int core_offset, gres_cnt;
8542 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8543 	int i, g, l, rc, s;
8544 	gres_job_state_t *job_specs;
8545 	gres_node_state_t *node_specs;
8546 	int fini = 0;
8547 	int best_link_cnt = 0, best_inx = -1;
8548 
8549 	job_specs = sock_gres->job_specs;
8550 	node_specs = sock_gres->node_specs;
8551 	if (job_specs->gres_per_job == job_specs->total_gres) {
8552 		fini = 1;
8553 		return fini;
8554 	}
8555 	if (!job_specs->gres_bit_select ||
8556 	    !job_specs->gres_bit_select[node_inx]) {
8557 		error("%s: gres_bit_select NULL for job %u on node %d",
8558 		      __func__, job_id, node_inx);
8559 		return SLURM_ERROR;
8560 	}
8561 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8562 				   &cores_per_socket_cnt);
8563 	if (rc != SLURM_SUCCESS) {
8564 		error("%s: Invalid socket/core count for job %u on node %d",
8565 		      __func__, job_id, node_inx);
8566 		return rc;
8567 	}
8568 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8569 	if (core_offset < 0) {
8570 		error("%s: Invalid core offset for job %u on node %d",
8571 		      __func__, job_id, node_inx);
8572 		return rc;
8573 	}
8574 	i = sock_gres->sock_cnt;
8575 	if ((i != 0) && (i != sock_cnt)) {
8576 		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
8577 		      __func__, i, sock_cnt, job_id, node_inx);
8578 		sock_cnt = MIN(sock_cnt, i);
8579 	}
8580 
8581 	/*
8582 	 * Identify the GRES (if any) that we want to use as a basis for
8583 	 * maximizing link count (connectivity of the GRES).
8584 	 */
8585 	xassert(job_res->core_bitmap);
8586 	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
8587 	if ((job_specs->gres_per_job > job_specs->total_gres) &&
8588 	    (node_specs->link_len == gres_cnt)) {
8589 		for (g = 0; g < gres_cnt; g++) {
8590 			if (!bit_test(job_specs->gres_bit_select[node_inx], g))
8591 				continue;
8592 			best_inx = g;
8593 			for (s = 0; s < gres_cnt; s++) {
8594 				best_link_cnt = MAX(node_specs->links_cnt[s][g],
8595 						    best_link_cnt);
8596 			}
8597 			break;
8598 		}
8599 	}
8600 
8601 	/*
8602 	 * Now pick specific GRES for these sockets.
8603 	 * Start with GRES available from any socket, then specific sockets
8604 	 */
8605 	for (l = best_link_cnt;
8606 	     ((l >= 0) && (job_specs->gres_per_job > job_specs->total_gres));
8607 	     l--) {
8608 		for (s = -1;   /* Socket == - 1 if GRES avail from any socket */
8609 		     ((s < sock_cnt) &&
8610 		      (job_specs->gres_per_job > job_specs->total_gres)); s++) {
8611 			for (g = 0;
8612 			     ((g < gres_cnt) &&
8613 			      (job_specs->gres_per_job >job_specs->total_gres));
8614 			     g++) {
8615 				if ((l > 0) &&
8616 				    (node_specs->links_cnt[best_inx][g] < l))
8617 					continue;   /* Want better link count */
8618 				if ((s == -1) &&
8619 				    (!sock_gres->bits_any_sock ||
8620 				     !bit_test(sock_gres->bits_any_sock, g)))
8621 					continue;  /* GRES not avail any sock */
8622 				if ((s >= 0) &&
8623 				    (!sock_gres->bits_by_sock ||
8624 				     !sock_gres->bits_by_sock[s] ||
8625 				     !bit_test(sock_gres->bits_by_sock[s], g)))
8626 					continue;  /* GRES not on this socket */
8627 				if (bit_test(node_specs->gres_bit_alloc, g) ||
8628 				    bit_test(job_specs->gres_bit_select[node_inx],
8629 					     g))
8630 					continue;   /* Already allocated GRES */
8631 				bit_set(job_specs->gres_bit_select[node_inx],g);
8632 				job_specs->gres_cnt_node_select[node_inx]++;
8633 				job_specs->total_gres++;
8634 			}
8635 		}
8636 	}
8637 	if (job_specs->gres_per_job == job_specs->total_gres)
8638 		fini = 1;
8639 	return fini;
8640 }
8641 
8642 /*
8643  * Select specific GRES (set GRES bitmap) for this job on this node based upon
8644  *	per-node resource specification
8645  * job_res IN - job resource allocation
8646  * node_inx IN - global node index
8647  * job_node_inx IN - node index for this job's allocation
8648  * job_specs IN - job request specifications, UPDATED: set bits in
8649  *		  gres_bit_select
8650  * node_specs IN - node resource request specifications
8651  * job_id IN - job ID for logging
8652  * tres_mc_ptr IN - job's multi-core options
8653  */
_set_node_bits(struct job_resources * job_res,int node_inx,int job_node_inx,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr)8654 static void _set_node_bits(struct job_resources *job_res, int node_inx,
8655 			   int job_node_inx, sock_gres_t *sock_gres,
8656 			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
8657 {
8658 	int core_offset, gres_cnt;
8659 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8660 	int c, i, g, l, rc, s;
8661 	gres_job_state_t *job_specs;
8662 	gres_node_state_t *node_specs;
8663 	int *used_sock = NULL, alloc_gres_cnt = 0;
8664 	int *links_cnt = NULL, best_link_cnt = 0;
8665 	uint64_t gres_per_bit = 1;
8666 
8667 	job_specs = sock_gres->job_specs;
8668 	node_specs = sock_gres->node_specs;
8669 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8670 				   &cores_per_socket_cnt);
8671 	if (rc != SLURM_SUCCESS) {
8672 		error("%s: Invalid socket/core count for job %u on node %d",
8673 		      __func__, job_id, node_inx);
8674 		return;
8675 	}
8676 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8677 	if (core_offset < 0) {
8678 		error("%s: Invalid core offset for job %u on node %d",
8679 		      __func__, job_id, node_inx);
8680 		return;
8681 	}
8682 	i = sock_gres->sock_cnt;
8683 	if ((i != 0) && (i != sock_cnt)) {
8684 		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
8685 		      __func__, i, sock_cnt, job_id, node_inx);
8686 		sock_cnt = MIN(sock_cnt, i);
8687 	}
8688 
8689 	xassert(job_res->core_bitmap);
8690 	used_sock = xcalloc(sock_cnt, sizeof(int));
8691 	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
8692 	for (s = 0; s < sock_cnt; s++) {
8693 		for (c = 0; c < cores_per_socket_cnt; c++) {
8694 			i = (s * cores_per_socket_cnt) + c;
8695 			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
8696 				used_sock[s]++;
8697 				break;
8698 			}
8699 		}
8700 	}
8701 
8702 	/*
8703 	 * Now pick specific GRES for these sockets.
8704 	 * First: Try to place one GRES per socket in this job's allocation.
8705 	 * Second: Try to place additional GRES on allocated sockets.
8706 	 * Third: Use any additional available GRES.
8707 	 */
8708 	if (node_specs->link_len == gres_cnt)
8709 		links_cnt = xcalloc(gres_cnt, sizeof(int));
8710 	if (_shared_gres(sock_gres->plugin_id))
8711 		gres_per_bit = job_specs->gres_per_node;
8712 	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
8713 	     ((s < sock_cnt) && (alloc_gres_cnt < job_specs->gres_per_node));
8714 	     s++) {
8715 		if ((s >= 0) && !used_sock[s])
8716 			continue;
8717 		for (g = 0; g < gres_cnt; g++) {
8718 			if ((s == -1) &&
8719 			    (!sock_gres->bits_any_sock ||
8720 			     !bit_test(sock_gres->bits_any_sock, g)))
8721 				continue;   /* GRES not avail any socket */
8722 			if ((s >= 0) &&
8723 			    (!sock_gres->bits_by_sock ||
8724 			     !sock_gres->bits_by_sock[s] ||
8725 			     !bit_test(sock_gres->bits_by_sock[s], g)))
8726 				continue;   /* GRES not on this socket */
8727 			if (bit_test(job_specs->gres_bit_select[node_inx], g) ||
8728 			    ((gres_per_bit == 1) &&
8729 			     bit_test(node_specs->gres_bit_alloc, g)))
8730 				continue;   /* Already allocated GRES */
8731 			bit_set(job_specs->gres_bit_select[node_inx], g);
8732 			job_specs->gres_cnt_node_select[node_inx] +=
8733 								gres_per_bit;
8734 			alloc_gres_cnt += gres_per_bit;
8735 			for (l = 0; links_cnt && (l < gres_cnt); l++) {
8736 				if ((l == g) ||
8737 				    bit_test(node_specs->gres_bit_alloc, l))
8738 					continue;
8739 				links_cnt[l] += node_specs->links_cnt[g][l];
8740 			}
8741 			break;
8742 		}
8743 	}
8744 
8745 	if (links_cnt) {
8746 		for (l = 0; l < gres_cnt; l++)
8747 			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
8748 		if (best_link_cnt > 4) {
8749 			/* Scale down to reasonable iteration count (<= 4) */
8750 			g = (best_link_cnt + 3) / 4;
8751 			best_link_cnt = 0;
8752 			for (l = 0; l < gres_cnt; l++) {
8753 				links_cnt[l] /= g;
8754 				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
8755 			}
8756 		}
8757 	}
8758 
8759 	/*
8760 	 * Try to place additional GRES on allocated sockets. Favor use of
8761 	 * GRES which are best linked to GRES which have already been selected.
8762 	 */
8763 	for (l = best_link_cnt;
8764 	     ((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
8765 		for (s = -1;   /* Socket == - 1 if GRES avail from any socket */
8766 		     ((s < sock_cnt) &&
8767 		      (alloc_gres_cnt < job_specs->gres_per_node)); s++) {
8768 			if ((s >= 0) && !used_sock[s])
8769 				continue;
8770 			for (g = 0; g < gres_cnt; g++) {
8771 				if (links_cnt && (links_cnt[g] < l))
8772 					continue;
8773 				if ((s == -1) &&
8774 				    (!sock_gres->bits_any_sock ||
8775 				     !bit_test(sock_gres->bits_any_sock, g)))
8776 					continue;/* GRES not avail any socket */
8777 				if ((s >= 0) &&
8778 				    (!sock_gres->bits_by_sock ||
8779 				     !sock_gres->bits_by_sock[s] ||
8780 				     !bit_test(sock_gres->bits_by_sock[s], g)))
8781 					continue;  /* GRES not on this socket */
8782 				if (bit_test(job_specs->gres_bit_select[node_inx],
8783 					     g) ||
8784 				    ((gres_per_bit == 1) &&
8785 				     bit_test(node_specs->gres_bit_alloc, g)))
8786 					continue;   /* Already allocated GRES */
8787 				bit_set(job_specs->gres_bit_select[node_inx],g);
8788 				job_specs->gres_cnt_node_select[node_inx] +=
8789 								gres_per_bit;
8790 				alloc_gres_cnt += gres_per_bit;
8791 				if (alloc_gres_cnt >= job_specs->gres_per_node)
8792 					break;
8793 			}
8794 		}
8795 	}
8796 
8797 	/*
8798 	 * Use any additional available GRES. Again, favor use of GRES
8799 	 * which are best linked to GRES which have already been selected.
8800 	 */
8801 	for (l = best_link_cnt;
8802 	     ((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
8803 		for (s = 0;
8804 		     ((s < sock_cnt) &&
8805 		      (alloc_gres_cnt < job_specs->gres_per_node)); s++) {
8806 			if (used_sock[s])
8807 				continue;
8808 			for (g = 0; g < gres_cnt; g++) {
8809 				if (links_cnt && (links_cnt[g] < l))
8810 					continue;
8811 				if (!sock_gres->bits_by_sock ||
8812 				     !sock_gres->bits_by_sock[s] ||
8813 				     !bit_test(sock_gres->bits_by_sock[s], g))
8814 					continue;  /* GRES not on this socket */
8815 				if (bit_test(job_specs->gres_bit_select[node_inx],
8816 					     g) ||
8817 				    ((gres_per_bit == 1) &&
8818 				     bit_test(node_specs->gres_bit_alloc, g)))
8819 					continue;   /* Already allocated GRES */
8820 				bit_set(job_specs->gres_bit_select[node_inx],g);
8821 				job_specs->gres_cnt_node_select[node_inx] +=
8822 								gres_per_bit;
8823 				alloc_gres_cnt += gres_per_bit;
8824 				if (alloc_gres_cnt >= job_specs->gres_per_node)
8825 					break;
8826 			}
8827 		}
8828 	}
8829 
8830 	xfree(links_cnt);
8831 	xfree(used_sock);
8832 }
8833 
8834 /*
8835  * Select one specific GRES topo entry (set GRES bitmap) for this job on this
8836  *	node based upon per-node resource specification
8837  * job_res IN - job resource allocation
8838  * node_inx IN - global node index
8839  * job_node_inx IN - node index for this job's allocation
8840  * job_specs IN - job request specifications, UPDATED: set bits in
8841  *		  gres_bit_select
8842  * node_specs IN - node resource request specifications
8843  * job_id IN - job ID for logging
8844  * tres_mc_ptr IN - job's multi-core options
8845  */
_pick_specific_topo(struct job_resources * job_res,int node_inx,int job_node_inx,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr)8846 static void _pick_specific_topo(struct job_resources *job_res, int node_inx,
8847 				int job_node_inx, sock_gres_t *sock_gres,
8848 				uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
8849 {
8850 	int core_offset;
8851 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8852 	int c, i, rc, s, t;
8853 	gres_job_state_t *job_specs;
8854 	gres_node_state_t *node_specs;
8855 	int *used_sock = NULL, alloc_gres_cnt = 0;
8856 	uint64_t gres_per_bit;
8857 	bool use_busy_dev = false;
8858 
8859 	job_specs = sock_gres->job_specs;
8860 	gres_per_bit = job_specs->gres_per_node;
8861 	node_specs = sock_gres->node_specs;
8862 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8863 				   &cores_per_socket_cnt);
8864 	if (rc != SLURM_SUCCESS) {
8865 		error("%s: Invalid socket/core count for job %u on node %d",
8866 		      __func__, job_id, node_inx);
8867 		return;
8868 	}
8869 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
8870 	if (core_offset < 0) {
8871 		error("%s: Invalid core offset for job %u on node %d",
8872 		      __func__, job_id, node_inx);
8873 		return;
8874 	}
8875 	i = sock_gres->sock_cnt;
8876 	if ((i != 0) && (i != sock_cnt)) {
8877 		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
8878 		      __func__, i, sock_cnt, job_id, node_inx);
8879 		sock_cnt = MIN(sock_cnt, i);
8880 	}
8881 
8882 	xassert(job_res->core_bitmap);
8883 	used_sock = xcalloc(sock_cnt, sizeof(int));
8884 	for (s = 0; s < sock_cnt; s++) {
8885 		for (c = 0; c < cores_per_socket_cnt; c++) {
8886 			i = (s * cores_per_socket_cnt) + c;
8887 			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
8888 				used_sock[s]++;
8889 				break;
8890 			}
8891 		}
8892 	}
8893 
8894 	if ((sock_gres->plugin_id == mps_plugin_id) &&
8895 	    (node_specs->gres_cnt_alloc != 0)) {
8896 		/* We must use the ONE already active GRES of this type */
8897 		use_busy_dev = true;
8898 	}
8899 
8900 	/*
8901 	 * Now pick specific GRES for these sockets.
8902 	 * First: Try to select a GRES local to allocated socket with
8903 	 *	sufficient resources.
8904 	 * Second: Use available GRES with sufficient resources.
8905 	 * Third: Use any available GRES.
8906 	 */
8907 	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
8908 	     (s < sock_cnt) && (alloc_gres_cnt == 0); s++) {
8909 		if ((s >= 0) && !used_sock[s])
8910 			continue;
8911 		for (t = 0; t < node_specs->topo_cnt; t++) {
8912 			if (use_busy_dev &&
8913 			    (node_specs->topo_gres_cnt_alloc[t] == 0))
8914 				continue;
8915 			if (node_specs->topo_gres_cnt_alloc    &&
8916 			    node_specs->topo_gres_cnt_avail    &&
8917 			    ((node_specs->topo_gres_cnt_avail[t] -
8918 			      node_specs->topo_gres_cnt_alloc[t]) <
8919 			     gres_per_bit))
8920 				continue;	/* Insufficient resources */
8921 			if ((s == -1) &&
8922 			    (!sock_gres->bits_any_sock ||
8923 			     !bit_test(sock_gres->bits_any_sock, t)))
8924 				continue;  /* GRES not avail any socket */
8925 			if ((s >= 0) &&
8926 			    (!sock_gres->bits_by_sock ||
8927 			     !sock_gres->bits_by_sock[s] ||
8928 			     !bit_test(sock_gres->bits_by_sock[s], t)))
8929 				continue;   /* GRES not on this socket */
8930 			bit_set(job_specs->gres_bit_select[node_inx], t);
8931 			job_specs->gres_cnt_node_select[node_inx] +=
8932 								gres_per_bit;
8933 			alloc_gres_cnt += gres_per_bit;
8934 			break;
8935 		}
8936 	}
8937 
8938 	/* Select available GRES with sufficient resources */
8939 	for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
8940 		if (use_busy_dev &&
8941 		    (node_specs->topo_gres_cnt_alloc[t] == 0))
8942 			continue;
8943 		if (node_specs->topo_gres_cnt_alloc    &&
8944 		    node_specs->topo_gres_cnt_avail    &&
8945 		    node_specs->topo_gres_cnt_avail[t] &&
8946 		    ((node_specs->topo_gres_cnt_avail[t] -
8947 		      node_specs->topo_gres_cnt_alloc[t]) < gres_per_bit))
8948 			continue;	/* Insufficient resources */
8949 		bit_set(job_specs->gres_bit_select[node_inx], t);
8950 		job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
8951 		alloc_gres_cnt += gres_per_bit;
8952 		break;
8953 	}
8954 
8955 	/* Select available GRES with any resources */
8956 	for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
8957 		if (node_specs->topo_gres_cnt_alloc    &&
8958 		    node_specs->topo_gres_cnt_avail    &&
8959 		    node_specs->topo_gres_cnt_avail[t])
8960 			continue;	/* No resources */
8961 		bit_set(job_specs->gres_bit_select[node_inx], t);
8962 		job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
8963 		alloc_gres_cnt += gres_per_bit;
8964 	}
8965 
8966 	xfree(used_sock);
8967 }
8968 
8969 /*
8970  * Select specific GRES (set GRES bitmap) for this job on this node based upon
8971  *	per-socket resource specification
8972  * job_res IN - job resource allocation
8973  * node_inx IN - global node index
8974  * job_node_inx IN - node index for this job's allocation
8975  * job_specs IN - job request specifications, UPDATED: set bits in
8976  *		  gres_bit_select
8977  * node_specs IN - node resource request specifications
8978  * job_id IN - job ID for logging
8979  * tres_mc_ptr IN - job's multi-core options
8980  */
_set_sock_bits(struct job_resources * job_res,int node_inx,int job_node_inx,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr)8981 static void _set_sock_bits(struct job_resources *job_res, int node_inx,
8982 			   int job_node_inx, sock_gres_t *sock_gres,
8983 			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
8984 {
8985 	int core_offset, gres_cnt;
8986 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
8987 	int c, i, g, l, rc, s;
8988 	gres_job_state_t *job_specs;
8989 	gres_node_state_t *node_specs;
8990 	int *used_sock = NULL, used_sock_cnt = 0;
8991 	int *links_cnt = NULL, best_link_cnt = 0;
8992 
8993 	job_specs = sock_gres->job_specs;
8994 	node_specs = sock_gres->node_specs;
8995 	rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
8996 				   &cores_per_socket_cnt);
8997 	if (rc != SLURM_SUCCESS) {
8998 		error("%s: Invalid socket/core count for job %u on node %d",
8999 		      __func__, job_id, node_inx);
9000 		return;
9001 	}
9002 	core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
9003 	if (core_offset < 0) {
9004 		error("%s: Invalid core offset for job %u on node %d",
9005 		      __func__, job_id, node_inx);
9006 		return;
9007 	}
9008 	i = sock_gres->sock_cnt;
9009 	if ((i != 0) && (i != sock_cnt)) {
9010 		error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
9011 		      __func__, i, sock_cnt, job_id, node_inx);
9012 		sock_cnt = MIN(sock_cnt, i);
9013 	}
9014 
9015 	xassert(job_res->core_bitmap);
9016 	used_sock = xcalloc(sock_cnt, sizeof(int));
9017 	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
9018 	for (s = 0; s < sock_cnt; s++) {
9019 		for (c = 0; c < cores_per_socket_cnt; c++) {
9020 			i = (s * cores_per_socket_cnt) + c;
9021 			if (bit_test(job_res->core_bitmap, (core_offset + i))) {
9022 				used_sock[s]++;
9023 				used_sock_cnt++;
9024 				break;
9025 			}
9026 		}
9027 	}
9028 	if (tres_mc_ptr && tres_mc_ptr->sockets_per_node     &&
9029 	    (tres_mc_ptr->sockets_per_node != used_sock_cnt) &&
9030 	    node_specs->gres_bit_alloc && sock_gres->bits_by_sock) {
9031 		if (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
9032 			/* Somehow we have too few sockets in job allocation */
9033 			error("%s: Inconsistent requested/allocated socket count "
9034 			      "(%d > %d) for job %u on node %d",
9035 			      __func__, tres_mc_ptr->sockets_per_node,
9036 			      used_sock_cnt, job_id, node_inx);
9037 			for (s = 0; s < sock_cnt; s++) {
9038 				if (used_sock[s] || !sock_gres->bits_by_sock[s])
9039 					continue;
9040 				/* Determine currently free GRES by socket */
9041 				used_sock[s] = bit_set_count(
9042 						sock_gres->bits_by_sock[s]) -
9043 					       bit_overlap(
9044 						sock_gres->bits_by_sock[s],
9045 						node_specs->gres_bit_alloc);
9046 				if ((used_sock[s] == 0) ||
9047 				    (used_sock[s] < job_specs->gres_per_socket)){
9048 					used_sock[s] = 0;
9049 				} else if (++used_sock_cnt ==
9050 					   tres_mc_ptr->sockets_per_node) {
9051 					break;
9052 				}
9053 			}
9054 		} else {
9055 			/* May have needed extra CPUs, exceeding socket count */
9056 			debug("%s: Inconsistent requested/allocated socket count "
9057 			      "(%d < %d) for job %u on node %d",
9058 			      __func__, tres_mc_ptr->sockets_per_node,
9059 			      used_sock_cnt, job_id, node_inx);
9060 			for (s = 0; s < sock_cnt; s++) {
9061 				if (!used_sock[s] ||
9062 				    !sock_gres->bits_by_sock[s])
9063 					continue;
9064 				/* Determine currently free GRES by socket */
9065 				used_sock[s] = bit_set_count(
9066 						sock_gres->bits_by_sock[s]) -
9067 					       bit_overlap(
9068 						sock_gres->bits_by_sock[s],
9069 						node_specs->gres_bit_alloc);
9070 				if (used_sock[s] == 0)
9071 					used_sock_cnt--;
9072 			}
9073 			/* Exclude sockets with low GRES counts */
9074 			while (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
9075 				int low_sock_inx = -1;
9076 				for (s = sock_cnt - 1; s >= 0; s--) {
9077 					if (used_sock[s] == 0)
9078 						continue;
9079 					if ((low_sock_inx == -1) ||
9080 					    (used_sock[s] <
9081 					     used_sock[low_sock_inx]))
9082 						low_sock_inx = s;
9083 				}
9084 				if (low_sock_inx == -1)
9085 					break;
9086 				used_sock[low_sock_inx] = 0;
9087 				used_sock_cnt--;
9088 			}
9089 		}
9090 	}
9091 
9092 	/*
9093 	 * Identify the available GRES with best connectivity
9094 	 * (i.e. higher link_cnt)
9095 	 */
9096 	if (node_specs->link_len == gres_cnt) {
9097 		links_cnt = xcalloc(gres_cnt, sizeof(int));
9098 		for (g = 0; g < gres_cnt; g++) {
9099 			if (bit_test(node_specs->gres_bit_alloc, g))
9100 				continue;
9101 			for (l = 0; l < gres_cnt; l++) {
9102 				if ((l == g) ||
9103 				    bit_test(node_specs->gres_bit_alloc, l))
9104 					continue;
9105 				links_cnt[l] += node_specs->links_cnt[g][l];
9106 			}
9107 		}
9108 		for (l = 0; l < gres_cnt; l++)
9109 			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
9110 		if (best_link_cnt > 4) {
9111 			/* Scale down to reasonable iteration count (<= 4) */
9112 			g = (best_link_cnt + 3) / 4;
9113 			best_link_cnt = 0;
9114 			for (l = 0; l < gres_cnt; l++) {
9115 				links_cnt[l] /= g;
9116 				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
9117 			}
9118 		}
9119 	}
9120 
9121 	/*
9122 	 * Now pick specific GRES for these sockets.
9123 	 * Try to use GRES with best connectivity (higher link_cnt values)
9124 	 */
9125 	for (s = 0; s < sock_cnt; s++) {
9126 		if (!used_sock[s])
9127 			continue;
9128 		i = 0;
9129 		for (l = best_link_cnt;
9130 		     ((l >= 0) && (i < job_specs->gres_per_socket)); l--) {
9131 			for (g = 0; g < gres_cnt; g++) {
9132 				if (!sock_gres->bits_by_sock ||
9133 				     !sock_gres->bits_by_sock[s] ||
9134 				     !bit_test(sock_gres->bits_by_sock[s], g))
9135 					continue;  /* GRES not on this socket */
9136 				if (node_specs->gres_bit_alloc &&
9137 				    bit_test(node_specs->gres_bit_alloc, g))
9138 					continue;   /* Already allocated GRES */
9139 				if (job_specs->gres_bit_select[node_inx] &&
9140 				    bit_test(job_specs->gres_bit_select[node_inx],
9141 					     g))
9142 					continue;   /* Already allocated GRES */
9143 				bit_set(job_specs->gres_bit_select[node_inx],g);
9144 				job_specs->gres_cnt_node_select[node_inx]++;
9145 				if (++i == job_specs->gres_per_socket)
9146 					break;
9147 			}
9148 		}
9149 		if ((i < job_specs->gres_per_socket) &&
9150 		    sock_gres->bits_any_sock) {
9151 			/* Add GRES unconstrained by socket as needed */
9152 			for (g = 0; g < gres_cnt; g++) {
9153 				if (!sock_gres->bits_any_sock ||
9154 				    !bit_test(sock_gres->bits_any_sock, g))
9155 					continue;  /* GRES not on this socket */
9156 				if (node_specs->gres_bit_alloc &&
9157 				    bit_test(node_specs->gres_bit_alloc, g))
9158 					continue;   /* Already allocated GRES */
9159 				if (job_specs->gres_bit_select[node_inx] &&
9160 				    bit_test(job_specs->gres_bit_select[node_inx],
9161 					     g))
9162 					continue;   /* Already allocated GRES */
9163 				bit_set(job_specs->gres_bit_select[node_inx],g);
9164 				job_specs->gres_cnt_node_select[node_inx]++;
9165 				if (++i == job_specs->gres_per_socket)
9166 					break;
9167 			}
9168 		}
9169 	}
9170 	xfree(links_cnt);
9171 	xfree(used_sock);
9172 }
9173 
9174 /*
9175  * Select specific GRES (set GRES bitmap) for this job on this node based upon
9176  *	per-task resource specification
9177  * job_res IN - job resource allocation
9178  * node_inx IN - global node index
9179  * job_node_inx IN - node index for this job's allocation
9180  * job_specs IN - job request specifications, UPDATED: set bits in
9181  *		  gres_bit_select
9182  * node_specs IN - node resource request specifications
9183  * job_id IN - job ID for logging
9184  * tres_mc_ptr IN - job's multi-core options
9185  */
_set_task_bits(struct job_resources * job_res,int node_inx,int job_node_inx,sock_gres_t * sock_gres,uint32_t job_id,gres_mc_data_t * tres_mc_ptr,uint32_t ** tasks_per_node_socket)9186 static void _set_task_bits(struct job_resources *job_res, int node_inx,
9187 			   int job_node_inx, sock_gres_t *sock_gres,
9188 			   uint32_t job_id, gres_mc_data_t *tres_mc_ptr,
9189 			   uint32_t **tasks_per_node_socket)
9190 {
9191 	uint16_t sock_cnt = 0;
9192 	int gres_cnt, g, l, s;
9193 	gres_job_state_t *job_specs;
9194 	gres_node_state_t *node_specs;
9195 	uint32_t total_tasks = 0;
9196 	uint64_t total_gres_cnt = 0, total_gres_goal;
9197 	int *links_cnt = NULL, best_link_cnt = 0;
9198 
9199 	job_specs = sock_gres->job_specs;
9200 	node_specs = sock_gres->node_specs;
9201 	sock_cnt = sock_gres->sock_cnt;
9202 	gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
9203 	if (node_specs->link_len == gres_cnt)
9204 		links_cnt = xcalloc(gres_cnt, sizeof(int));
9205 
9206 	/* First pick GRES for acitve sockets */
9207 	for (s = -1;	/* Socket == - 1 if GRES avail from any socket */
9208 	     s < sock_cnt; s++) {
9209 		if ((s > 0) &&
9210 		    (!tasks_per_node_socket[node_inx] ||
9211 		     (tasks_per_node_socket[node_inx][s] == 0)))
9212 			continue;
9213 		total_tasks += tasks_per_node_socket[node_inx][s];
9214 		total_gres_goal = total_tasks * job_specs->gres_per_task;
9215 		for (g = 0; g < gres_cnt; g++) {
9216 			if (total_gres_cnt >= total_gres_goal)
9217 				break;
9218 			if ((s == -1) &&
9219 			    (!sock_gres->bits_any_sock ||
9220 			     !bit_test(sock_gres->bits_any_sock, g)))
9221 				continue;  /* GRES not avail any sock */
9222 			if ((s >= 0) &&
9223 			    (!sock_gres->bits_by_sock ||
9224 			     !sock_gres->bits_by_sock[s] ||
9225 			     !bit_test(sock_gres->bits_by_sock[s], g)))
9226 				continue;   /* GRES not on this socket */
9227 			if (bit_test(node_specs->gres_bit_alloc, g))
9228 				continue;   /* Already allocated GRES */
9229 			if (bit_test(node_specs->gres_bit_alloc, g) ||
9230 			    bit_test(job_specs->gres_bit_select[node_inx], g))
9231 				continue;   /* Already allocated GRES */
9232 			bit_set(job_specs->gres_bit_select[node_inx], g);
9233 			job_specs->gres_cnt_node_select[node_inx]++;
9234 			total_gres_cnt++;
9235 			for (l = 0; links_cnt && (l < gres_cnt); l++) {
9236 				if ((l == g) ||
9237 				    bit_test(node_specs->gres_bit_alloc, l))
9238 					continue;
9239 				links_cnt[l] += node_specs->links_cnt[g][l];
9240 			}
9241 		}
9242 	}
9243 
9244 	if (links_cnt) {
9245 		for (l = 0; l < gres_cnt; l++)
9246 			best_link_cnt = MAX(links_cnt[l], best_link_cnt);
9247 		if (best_link_cnt > 4) {
9248 			/* Scale down to reasonable iteration count (<= 4) */
9249 			g = (best_link_cnt + 3) / 4;
9250 			best_link_cnt = 0;
9251 			for (l = 0; l < gres_cnt; l++) {
9252 				links_cnt[l] /= g;
9253 				best_link_cnt = MAX(links_cnt[l],best_link_cnt);
9254 			}
9255 		}
9256 	}
9257 
9258 	/*
9259 	 * Next pick additional GRES as needed. Favor use of GRES which
9260 	 * are best linked to GRES which have already been selected.
9261 	 */
9262 	total_gres_goal = total_tasks * job_specs->gres_per_task;
9263 	for (l = best_link_cnt;
9264 	     ((l >= 0) && (total_gres_cnt < total_gres_goal)); l--) {
9265 		for (s = -1;   /* Socket == - 1 if GRES avail from any socket */
9266 		     ((s < sock_cnt) && (total_gres_cnt < total_gres_goal));
9267 		     s++) {
9268 			for (g = 0;
9269 			     ((g < gres_cnt) &&
9270 			      (total_gres_cnt < total_gres_goal)); g++) {
9271 				if (links_cnt && (links_cnt[g] < l))
9272 					continue;
9273 				if ((s == -1) &&
9274 				    (!sock_gres->bits_any_sock ||
9275 				     !bit_test(sock_gres->bits_any_sock, g)))
9276 					continue;  /* GRES not avail any sock */
9277 				if ((s >= 0) &&
9278 				    (!sock_gres->bits_by_sock ||
9279 				     !sock_gres->bits_by_sock[s] ||
9280 				     !bit_test(sock_gres->bits_by_sock[s], g)))
9281 					continue;  /* GRES not on this socket */
9282 				if (bit_test(node_specs->gres_bit_alloc, g) ||
9283 				    bit_test(job_specs->gres_bit_select[node_inx],
9284 					     g))
9285 					continue;   /* Already allocated GRES */
9286 				bit_set(job_specs->gres_bit_select[node_inx],g);
9287 				job_specs->gres_cnt_node_select[node_inx]++;
9288 				total_gres_cnt++;
9289 			}
9290 		}
9291 	}
9292 	xfree(links_cnt);
9293 
9294 	if (total_gres_cnt < total_gres_goal) {
9295 		/* Something bad happened on task layout for this GRES type */
9296 		error("%s: Insufficient gres/%s allocated for job %u on node_inx %u "
9297 		      "(%"PRIu64" < %"PRIu64")",  __func__,
9298 		      sock_gres->gres_name, job_id, node_inx,
9299 		      total_gres_cnt, total_gres_goal);
9300 	}
9301 }
9302 
9303 /* Build array to identify task count for each node-socket pair */
_build_tasks_per_node_sock(struct job_resources * job_res,uint8_t overcommit,gres_mc_data_t * tres_mc_ptr,node_record_t * node_table_ptr)9304 static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
9305 					     uint8_t overcommit,
9306 					     gres_mc_data_t *tres_mc_ptr,
9307 					     node_record_t *node_table_ptr)
9308 {
9309 	uint32_t **tasks_per_node_socket;
9310 	int i, i_first, i_last, j, node_cnt, job_node_inx = 0;
9311 	int c, s, core_offset;
9312 	int cpus_per_task = 1, cpus_per_node, cpus_per_core;
9313 	int task_per_node_limit = 0;
9314 	int32_t rem_tasks, excess_tasks;
9315 	uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
9316 
9317 	rem_tasks = tres_mc_ptr->ntasks_per_job;
9318 	node_cnt = bit_size(job_res->node_bitmap);
9319 	tasks_per_node_socket = xcalloc(node_cnt, sizeof(uint32_t *));
9320 	i_first = bit_ffs(job_res->node_bitmap);
9321 	if (i_first != -1)
9322 		i_last  = bit_fls(job_res->node_bitmap);
9323 	else
9324 		i_last = -2;
9325 	for (i = i_first; i <= i_last; i++) {
9326 		int tasks_per_node = 0;
9327 		if (!bit_test(job_res->node_bitmap, i))
9328 			continue;
9329 		if (get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
9330 					  &cores_per_socket_cnt)) {
9331 			error("%s: failed to get socket/core count", __func__);
9332 			/* Set default of 1 task on socket 0 */
9333 			tasks_per_node_socket[i] = xmalloc(sizeof(uint32_t));
9334 			tasks_per_node_socket[i][0] = 1;
9335 			rem_tasks--;
9336 			continue;
9337 		}
9338 		tasks_per_node_socket[i] = xcalloc(sock_cnt, sizeof(uint32_t));
9339 		if (tres_mc_ptr->ntasks_per_node) {
9340 			task_per_node_limit = tres_mc_ptr->ntasks_per_node;
9341 		} else if (job_res->tasks_per_node &&
9342 			   job_res->tasks_per_node[job_node_inx]) {
9343 			task_per_node_limit =
9344 				job_res->tasks_per_node[job_node_inx];
9345 		} else {
9346 			/*
9347 			 * NOTE: We should never get here.
9348 			 * cpus_per_node reports CPUs actually used by this
9349 			 * job on this node. Divide by cpus_per_task to yield
9350 			 * valid task count on this node. This can be bad on
9351 			 * cores with more than one thread and job fails to
9352 			 * use all threads.
9353 			 */
9354 			error("%s: tasks_per_node not set", __func__);
9355 			cpus_per_node = get_job_resources_cpus(job_res,
9356 							       job_node_inx);
9357 			if (cpus_per_node < 1) {
9358 				error("%s: failed to get cpus_per_node count",
9359 				      __func__);
9360 				/* Set default of 1 task on socket 0 */
9361 				tasks_per_node_socket[i][0] = 1;
9362 				rem_tasks--;
9363 				continue;
9364 			}
9365 			if (tres_mc_ptr->cpus_per_task)
9366 				cpus_per_task = tres_mc_ptr->cpus_per_task;
9367 			else
9368 				cpus_per_task = 1;
9369 			task_per_node_limit = cpus_per_node / cpus_per_task;
9370 		}
9371 		core_offset = get_job_resources_offset(job_res, job_node_inx++,
9372 						       0, 0);
9373 		if (node_table_ptr[i].cores) {
9374 			cpus_per_core = node_table_ptr[i].cpus /
9375 					node_table_ptr[i].cores;
9376 		} else
9377 			cpus_per_core = 1;
9378 		for (s = 0; s < sock_cnt; s++) {
9379 			int tasks_per_socket = 0, tpc, skip_cores = 0;
9380 			for (c = 0; c < cores_per_socket_cnt; c++) {
9381 				j = (s * cores_per_socket_cnt) + c;
9382 				j += core_offset;
9383 				if (!bit_test(job_res->core_bitmap, j))
9384 					continue;
9385 				if (skip_cores > 0) {
9386 					skip_cores--;
9387 					continue;
9388 				}
9389 				if (tres_mc_ptr->ntasks_per_core) {
9390 					tpc = tres_mc_ptr->ntasks_per_core;
9391 				} else {
9392 					tpc = cpus_per_core / cpus_per_task;
9393 					if (tpc < 1) {
9394 						tpc = 1;
9395 						skip_cores = cpus_per_task /
9396 							     cpus_per_core;
9397 						skip_cores--;	/* This core */
9398 					}
9399 					/* Start with 1 task per core */
9400 				}
9401 				tasks_per_node_socket[i][s] += tpc;
9402 				tasks_per_node += tpc;
9403 				tasks_per_socket += tpc;
9404 				rem_tasks -= tpc;
9405 				if (task_per_node_limit) {
9406 					if (tasks_per_node >
9407 					    task_per_node_limit) {
9408 						excess_tasks = tasks_per_node -
9409 							task_per_node_limit;
9410 						tasks_per_node_socket[i][s] -=
9411 							excess_tasks;
9412 						rem_tasks += excess_tasks;
9413 					}
9414 					if (tasks_per_node >=
9415 					    task_per_node_limit) {
9416 						s = sock_cnt;
9417 						break;
9418 					}
9419 				}
9420 				/* NOTE: No support for ntasks_per_board */
9421 				if (tres_mc_ptr->ntasks_per_socket) {
9422 					if (tasks_per_socket >
9423 					    tres_mc_ptr->ntasks_per_socket) {
9424 						excess_tasks = tasks_per_socket-
9425 						 tres_mc_ptr->ntasks_per_socket;
9426 						tasks_per_node_socket[i][s] -=
9427 							excess_tasks;
9428 						rem_tasks += excess_tasks;
9429 					}
9430 					if (tasks_per_socket >=
9431 					    tres_mc_ptr->ntasks_per_socket) {
9432 						break;
9433 					}
9434 				}
9435 			}
9436 		}
9437 	}
9438 	while ((rem_tasks > 0) && overcommit) {
9439 		for (i = i_first; (rem_tasks > 0) && (i <= i_last); i++) {
9440 			if (!bit_test(job_res->node_bitmap, i))
9441 				continue;
9442 			for (s = 0; (rem_tasks > 0) && (s < sock_cnt); s++) {
9443 				for (c = 0; c < cores_per_socket_cnt; c++) {
9444 					j = (s * cores_per_socket_cnt) + c;
9445 					if (!bit_test(job_res->core_bitmap, j))
9446 						continue;
9447 					tasks_per_node_socket[i][s]++;
9448 					rem_tasks--;
9449 					break;
9450 				}
9451 			}
9452 		}
9453 	}
9454 	if (rem_tasks > 0)	/* This should never happen */
9455 		error("%s: rem_tasks not zero (%d > 0)", __func__, rem_tasks);
9456 
9457 	return tasks_per_node_socket;
9458 }
9459 
/*
 * Release a node-socket task count array
 * tasks_per_node_socket IN - array to release, may be NULL
 * node_cnt IN - number of per-node entries in the array
 */
static void _free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
				      int node_cnt)
{
	int inx = 0;

	if (tasks_per_node_socket) {
		/* Release each node's per-socket counts, then the array */
		while (inx < node_cnt) {
			xfree(tasks_per_node_socket[inx]);
			inx++;
		}
		xfree(tasks_per_node_socket);
	}
}
9472 
9473 /* Return the count of tasks for a job on a given node */
_get_task_cnt_node(uint32_t ** tasks_per_node_socket,int node_inx,int sock_cnt)9474 static uint32_t _get_task_cnt_node(uint32_t **tasks_per_node_socket,
9475 				   int node_inx, int sock_cnt)
9476 {
9477 	uint32_t task_cnt = 0;
9478 	int s;
9479 
9480 	if (!tasks_per_node_socket || !tasks_per_node_socket[node_inx]) {
9481 		error("%s: tasks_per_node_socket is NULL", __func__);
9482 		return 1;	/* Best guess if no data structure */
9483 	}
9484 
9485 	for (s = 0; s < sock_cnt; s++)
9486 		task_cnt += tasks_per_node_socket[node_inx][s];
9487 
9488 	return task_cnt;
9489 }
9490 
9491 /* Determine maximum GRES allocation count on this node; no topology */
_get_job_cnt(sock_gres_t * sock_gres,gres_node_state_t * node_specs,int rem_node_cnt)9492 static uint64_t _get_job_cnt(sock_gres_t *sock_gres,
9493 			     gres_node_state_t *node_specs, int rem_node_cnt)
9494 {
9495 	uint64_t avail_gres, max_gres;
9496 	gres_job_state_t *job_specs = sock_gres->job_specs;
9497 
9498 	avail_gres = node_specs->gres_cnt_avail - node_specs->gres_cnt_alloc;
9499 	/* Ensure at least one GRES per node on remaining nodes */
9500 	max_gres = job_specs->gres_per_job - job_specs->total_gres -
9501 		   (rem_node_cnt - 1);
9502 	max_gres = MIN(avail_gres, max_gres);
9503 
9504 	return max_gres;
9505 }
9506 
9507 /* Return count of GRES on this node */
_get_gres_node_cnt(gres_node_state_t * node_specs,int node_inx)9508 static int _get_gres_node_cnt(gres_node_state_t *node_specs, int node_inx)
9509 {
9510 	int i, gres_cnt = 0;
9511 
9512 	if (node_specs->gres_bit_alloc) {
9513 		gres_cnt = bit_size(node_specs->gres_bit_alloc);
9514 		return gres_cnt;
9515 	}
9516 
9517 	/* This logic should be redundant */
9518 	if (node_specs->topo_gres_bitmap && node_specs->topo_gres_bitmap[0]) {
9519 		gres_cnt = bit_size(node_specs->topo_gres_bitmap[0]);
9520 		return gres_cnt;
9521 	}
9522 
9523 	/* This logic should also be redundant */
9524 	gres_cnt = 0;
9525 	for (i = 0; i < node_specs->topo_cnt; i++)
9526 		gres_cnt += node_specs->topo_gres_cnt_avail[i];
9527 	return gres_cnt;
9528 }
9529 
/*
 * Make final GRES selection for the job
 * sock_gres_list IN - per-socket GRES details, one record per allocated node
 * job_id IN - job ID for logging
 * job_res IN - job resource allocation
 * overcommit IN - job's ability to overcommit resources
 * tres_mc_ptr IN - job's multi-core options
 * node_table_ptr IN - slurmctld's node records
 * RET SLURM_SUCCESS or error code
 *
 * Returns SLURM_ERROR if job_res or job_res->node_bitmap is NULL and
 * ESLURM_NODE_NOT_AVAIL if the second selection pass still reports an
 * unsatisfied gres-per-job count.
 *
 * Indexing: sock_gres_list[] is addressed by the job-relative node offset
 * (node_inx, incremented once per bit set in job_res->node_bitmap), while
 * the per-job arrays gres_cnt_node_select[] and gres_bit_select[] are sized
 * by bit_size(job_res->node_bitmap) and addressed by the bit position (i).
 */
extern int gres_plugin_job_core_filter4(List *sock_gres_list, uint32_t job_id,
					struct job_resources *job_res,
					uint8_t overcommit,
					gres_mc_data_t *tres_mc_ptr,
					node_record_t *node_table_ptr)
{
	ListIterator sock_gres_iter;
	sock_gres_t *sock_gres;
	gres_job_state_t *job_specs;
	gres_node_state_t *node_specs;
	int i, i_first, i_last, node_inx = -1, gres_cnt;
	int node_cnt, rem_node_cnt;
	int job_fini = -1;	/* -1: not applicable, 0: more work, 1: fini */
	uint32_t **tasks_per_node_socket = NULL;
	int rc = SLURM_SUCCESS;

	if (!job_res || !job_res->node_bitmap)
		return SLURM_ERROR;

	/*
	 * node_cnt is the full bitmap width (used to size per-job arrays),
	 * rem_node_cnt is the number of allocated nodes not yet processed.
	 */
	node_cnt = bit_size(job_res->node_bitmap);
	rem_node_cnt = bit_set_count(job_res->node_bitmap);
	i_first = bit_ffs(job_res->node_bitmap);
	if (i_first != -1)
		i_last  = bit_fls(job_res->node_bitmap);
	else
		i_last = -2;	/* Empty bitmap: loops below never execute */
	/* Pass 1: visit every allocated node and each of its GRES records */
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_res->node_bitmap, i))
			continue;
		sock_gres_iter =
			list_iterator_create(sock_gres_list[++node_inx]);
		while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))){
			job_specs = sock_gres->job_specs;
			node_specs = sock_gres->node_specs;
			if (!job_specs || !node_specs)
				continue;
			/* Build the task layout once, on first need */
			if (job_specs->gres_per_task &&	/* Data needed */
			    !tasks_per_node_socket) {	/* Not built yet */
				tasks_per_node_socket =
					_build_tasks_per_node_sock(job_res,
								overcommit,
								tres_mc_ptr,
								node_table_ptr);
			}
			/* First time this job record is seen: init counters */
			if (job_specs->total_node_cnt == 0) {
				job_specs->total_node_cnt = node_cnt;
				job_specs->total_gres = 0;
			}
			if (!job_specs->gres_cnt_node_select) {
				job_specs->gres_cnt_node_select =
					xcalloc(node_cnt, sizeof(uint64_t));
			}
			if (i == i_first)	/* Reinitialize counter */
				job_specs->total_gres = 0;

			if (node_specs->topo_cnt == 0) {
				/* No topology, just set a count */
				if (job_specs->gres_per_node) {
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_node;
				} else if (job_specs->gres_per_socket) {
					/* Per-socket count times socket count */
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_socket;
					job_specs->gres_cnt_node_select[i] *=
						_get_sock_cnt(job_res, i,
							      node_inx);
				} else if (job_specs->gres_per_task) {
					/* Per-task count times tasks on node */
					job_specs->gres_cnt_node_select[i] =
						job_specs->gres_per_task;
					job_specs->gres_cnt_node_select[i] *=
						_get_task_cnt_node(
						tasks_per_node_socket, i,
						node_table_ptr[i].sockets);
				} else if (job_specs->gres_per_job) {
					job_specs->gres_cnt_node_select[i] =
						_get_job_cnt(sock_gres,
							     node_specs,
							     rem_node_cnt);
				}
				job_specs->total_gres +=
					job_specs->gres_cnt_node_select[i];
				continue;
			}

			/* Working with topology, need to pick specific GRES */
			if (!job_specs->gres_bit_select) {
				job_specs->gres_bit_select =
					xcalloc(node_cnt, sizeof(bitstr_t *));
			}
			/* Start from an empty selection bitmap for this node */
			gres_cnt = _get_gres_node_cnt(node_specs, node_inx);
			FREE_NULL_BITMAP(job_specs->gres_bit_select[i]);
			job_specs->gres_bit_select[i] = bit_alloc(gres_cnt);
			job_specs->gres_cnt_node_select[i] = 0;

			if (job_specs->gres_per_node &&
			    _shared_gres(sock_gres->plugin_id)) {
				/* gres/mps: select specific topo bit for job */
				_pick_specific_topo(job_res, i, node_inx,
						    sock_gres, job_id,
						    tres_mc_ptr);
			} else if (job_specs->gres_per_node) {
				_set_node_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr);
			} else if (job_specs->gres_per_socket) {
				_set_sock_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr);
			} else if (job_specs->gres_per_task) {
				_set_task_bits(job_res, i, node_inx,
					       sock_gres, job_id, tres_mc_ptr,
					       tasks_per_node_socket);
			} else if (job_specs->gres_per_job) {
				/*
				 * NOTE(review): assumes boards, sockets and
				 * cores are all non-zero in the node record,
				 * otherwise this divides by zero - confirm.
				 */
				uint16_t cpus_per_core;
				cpus_per_core = node_table_ptr[i].cpus /
						node_table_ptr[i].boards /
						node_table_ptr[i].sockets /
						node_table_ptr[i].cores;
				job_fini = _set_job_bits1(job_res, i, node_inx,
					       rem_node_cnt, sock_gres,
					       job_id, tres_mc_ptr,
						cpus_per_core);
			} else {
				error("%s job %u job_spec lacks GRES counter",
				      __func__, job_id);
			}
			/*
			 * NOTE(review): job_fini is only ever assigned from
			 * _set_job_bits1()/_set_job_bits2() and is not reset
			 * between records, so once it leaves -1 the
			 * accumulation below is skipped for every later
			 * record, including non gres-per-job ones - confirm
			 * this is intended.
			 */
			if (job_fini == -1) {
				/*
				 * _set_job_bits1() updates total_gres counter,
				 * this handle other cases.
				 */
				job_specs->total_gres +=
					job_specs->gres_cnt_node_select[i];
			}
		}
		rem_node_cnt--;
		list_iterator_destroy(sock_gres_iter);
	}

	if (job_fini == 0) {
		/*
		 * Need more GRES to satisfy gres-per-job option with bitmaps.
		 * This logic will make use of GRES that are not on allocated
		 * sockets and are thus generally less desirable to use.
		 */
		node_inx = -1;	/* Restart the job-relative node offset */
		for (i = i_first; i <= i_last; i++) {
			if (!bit_test(job_res->node_bitmap, i))
				continue;
			sock_gres_iter =
				list_iterator_create(sock_gres_list[++node_inx]);
			while ((sock_gres = (sock_gres_t *)
					    list_next(sock_gres_iter))) {
				job_specs = sock_gres->job_specs;
				node_specs = sock_gres->node_specs;
				if (!job_specs || !node_specs)
					continue;
				job_fini = _set_job_bits2(job_res, i, node_inx,
							  sock_gres, job_id,
							  tres_mc_ptr);
				if (job_fini == 1)
					break;
			}
			/* Release the iterator before leaving the node loop */
			list_iterator_destroy(sock_gres_iter);
			if (job_fini == 1)
				break;
		}
		if (job_fini == 0) {
			error("%s job %u failed to satisfy gres-per-job counter",
			      __func__, job_id);
			rc = ESLURM_NODE_NOT_AVAIL;
		}
	}
	/*
	 * Called unconditionally, so tasks_per_node_socket may still be NULL
	 * here; presumably the helper tolerates that - verify.
	 */
	_free_tasks_per_node_sock(tasks_per_node_socket, node_cnt);

	return rc;
}
9715 
9716 /*
9717  * Determine if job GRES specification includes a tres-per-task specification
9718  * RET TRUE if any GRES requested by the job include a tres-per-task option
9719  */
gres_plugin_job_tres_per_task(List job_gres_list)9720 extern bool gres_plugin_job_tres_per_task(List job_gres_list)
9721 {
9722 	ListIterator job_gres_iter;
9723 	gres_state_t *job_gres_ptr;
9724 	gres_job_state_t *job_data_ptr;
9725 	bool have_gres_per_task = false;
9726 
9727 	if (!job_gres_list)
9728 		return false;
9729 
9730 	job_gres_iter = list_iterator_create(job_gres_list);
9731 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9732 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9733 		if (job_data_ptr->gres_per_task == 0)
9734 			continue;
9735 		have_gres_per_task = true;
9736 		break;
9737 	}
9738 	list_iterator_destroy(job_gres_iter);
9739 
9740 	return have_gres_per_task;
9741 }
9742 
9743 /*
9744  * Determine if the job GRES specification includes a mem-per-tres specification
9745  * RET largest mem-per-tres specification found
9746  */
gres_plugin_job_mem_max(List job_gres_list)9747 extern uint64_t gres_plugin_job_mem_max(List job_gres_list)
9748 {
9749 	ListIterator job_gres_iter;
9750 	gres_state_t *job_gres_ptr;
9751 	gres_job_state_t *job_data_ptr;
9752 	uint64_t mem_max = 0, mem_per_gres;
9753 
9754 	if (!job_gres_list)
9755 		return 0;
9756 
9757 	job_gres_iter = list_iterator_create(job_gres_list);
9758 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9759 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9760 		if (job_data_ptr->mem_per_gres)
9761 			mem_per_gres = job_data_ptr->mem_per_gres;
9762 		else
9763 			mem_per_gres = job_data_ptr->def_mem_per_gres;
9764 		mem_max = MAX(mem_max, mem_per_gres);
9765 	}
9766 	list_iterator_destroy(job_gres_iter);
9767 
9768 	return mem_max;
9769 }
9770 
9771 /*
9772  * Set per-node memory limits based upon GRES assignments
9773  * RET TRUE if mem-per-tres specification used to set memory limits
9774  */
gres_plugin_job_mem_set(List job_gres_list,job_resources_t * job_res)9775 extern bool gres_plugin_job_mem_set(List job_gres_list,
9776 				    job_resources_t *job_res)
9777 {
9778 	ListIterator job_gres_iter;
9779 	gres_state_t *job_gres_ptr;
9780 	gres_job_state_t *job_data_ptr;
9781 	bool rc = false, first_set = true;
9782 	uint64_t gres_cnt, mem_size, mem_per_gres;
9783 	int i, i_first, i_last, node_off;
9784 
9785 	if (!job_gres_list)
9786 		return false;
9787 
9788 	i_first = bit_ffs(job_res->node_bitmap);
9789 	if (i_first < 0)
9790 		return false;
9791 	i_last = bit_fls(job_res->node_bitmap);
9792 
9793 	job_gres_iter = list_iterator_create(job_gres_list);
9794 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9795 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9796 		if (job_data_ptr->mem_per_gres)
9797 			mem_per_gres = job_data_ptr->mem_per_gres;
9798 		else
9799 			mem_per_gres = job_data_ptr->def_mem_per_gres;
9800 		/*
9801 		 * The logic below is correct because the only mem_per_gres
9802 		 * is --mem-per-gpu adding another option will require change
9803 		 * to take MAX of mem_per_gres for all types.
9804 		 */
9805 		if ((mem_per_gres == 0) || !job_data_ptr->gres_cnt_node_select)
9806 			continue;
9807 		rc = true;
9808 		node_off = -1;
9809 		for (i = i_first; i <= i_last; i++) {
9810 			if (!bit_test(job_res->node_bitmap, i))
9811 				continue;
9812 			node_off++;
9813 			if (job_res->whole_node == 1) {
9814 				gres_state_t *node_gres_ptr;
9815 				gres_node_state_t *node_state_ptr;
9816 
9817 				node_gres_ptr = list_find_first(
9818 					node_record_table_ptr[i].gres_list,
9819 					_gres_find_id,
9820 					&job_gres_ptr->plugin_id);
9821 				if (!node_gres_ptr)
9822 					continue;
9823 				node_state_ptr = node_gres_ptr->gres_data;
9824 				gres_cnt = node_state_ptr->gres_cnt_avail;
9825 			} else
9826 				gres_cnt =
9827 					job_data_ptr->gres_cnt_node_select[i];
9828 			mem_size = mem_per_gres * gres_cnt;
9829 			if (first_set)
9830 				job_res->memory_allocated[node_off] = mem_size;
9831 			else
9832 				job_res->memory_allocated[node_off] += mem_size;
9833 		}
9834 		first_set = false;
9835 	}
9836 	list_iterator_destroy(job_gres_iter);
9837 
9838 	return rc;
9839 }
9840 
9841 /*
9842  * Determine the minimum number of CPUs required to satify the job's GRES
9843  *	request (based upon total GRES times cpus_per_gres value)
9844  * node_count IN - count of nodes in job allocation
9845  * sockets_per_node IN - count of sockets per node in job allocation
9846  * task_count IN - count of tasks in job allocation
9847  * job_gres_list IN - job GRES specification
9848  * RET count of required CPUs for the job
9849  */
gres_plugin_job_min_cpus(uint32_t node_count,uint32_t sockets_per_node,uint32_t task_count,List job_gres_list)9850 extern int gres_plugin_job_min_cpus(uint32_t node_count,
9851 				    uint32_t sockets_per_node,
9852 				    uint32_t task_count,
9853 				    List job_gres_list)
9854 {
9855 	ListIterator job_gres_iter;
9856 	gres_state_t *job_gres_ptr;
9857 	gres_job_state_t  *job_data_ptr;
9858 	int tmp, min_cpus = 0;
9859 	uint16_t cpus_per_gres;
9860 
9861 	if (!job_gres_list || (list_count(job_gres_list) == 0))
9862 		return 0;
9863 
9864 	job_gres_iter = list_iterator_create(job_gres_list);
9865 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9866 		uint64_t total_gres = 0;
9867 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9868 		if (job_data_ptr->cpus_per_gres)
9869 			cpus_per_gres = job_data_ptr->cpus_per_gres;
9870 		else
9871 			cpus_per_gres = job_data_ptr->def_cpus_per_gres;
9872 		if (cpus_per_gres == 0)
9873 			continue;
9874 		if (job_data_ptr->gres_per_job) {
9875 			total_gres = job_data_ptr->gres_per_job;
9876 		} else if (job_data_ptr->gres_per_node) {
9877 			total_gres = job_data_ptr->gres_per_node *
9878 				     node_count;
9879 		} else if (job_data_ptr->gres_per_socket) {
9880 			total_gres = job_data_ptr->gres_per_socket *
9881 				     node_count * sockets_per_node;
9882 		} else if (job_data_ptr->gres_per_task) {
9883 			total_gres = job_data_ptr->gres_per_task * task_count;
9884 		} else
9885 			continue;
9886 		tmp = cpus_per_gres * total_gres;
9887 		min_cpus = MAX(min_cpus, tmp);
9888 	}
9889 	list_iterator_destroy(job_gres_iter);
9890 	return min_cpus;
9891 }
9892 
9893 /*
9894  * Determine the minimum number of CPUs required to satify the job's GRES
9895  *	request on one node
9896  * sockets_per_node IN - count of sockets per node in job allocation
9897  * tasks_per_node IN - count of tasks per node in job allocation
9898  * job_gres_list IN - job GRES specification
9899  * RET count of required CPUs for the job
9900  */
gres_plugin_job_min_cpu_node(uint32_t sockets_per_node,uint32_t tasks_per_node,List job_gres_list)9901 extern int gres_plugin_job_min_cpu_node(uint32_t sockets_per_node,
9902 					uint32_t tasks_per_node,
9903 					List job_gres_list)
9904 {
9905 	ListIterator job_gres_iter;
9906 	gres_state_t *job_gres_ptr;
9907 	gres_job_state_t  *job_data_ptr;
9908 	int tmp, min_cpus = 0;
9909 	uint16_t cpus_per_gres;
9910 
9911 	if (!job_gres_list || (list_count(job_gres_list) == 0))
9912 		return 0;
9913 
9914 	job_gres_iter = list_iterator_create(job_gres_list);
9915 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
9916 		uint64_t total_gres = 0;
9917 		job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
9918 		if (job_data_ptr->cpus_per_gres)
9919 			cpus_per_gres = job_data_ptr->cpus_per_gres;
9920 		else
9921 			cpus_per_gres = job_data_ptr->def_cpus_per_gres;
9922 		if (cpus_per_gres == 0)
9923 			continue;
9924 		if (job_data_ptr->gres_per_node) {
9925 			total_gres = job_data_ptr->gres_per_node;
9926 		} else if (job_data_ptr->gres_per_socket) {
9927 			total_gres = job_data_ptr->gres_per_socket *
9928 				     sockets_per_node;
9929 		} else if (job_data_ptr->gres_per_task) {
9930 			total_gres = job_data_ptr->gres_per_task *
9931 				     tasks_per_node;
9932 		} else
9933 			total_gres = 1;
9934 		tmp = cpus_per_gres * total_gres;
9935 		min_cpus = MAX(min_cpus, tmp);
9936 	}
9937 	return min_cpus;
9938 }
9939 
9940 /*
9941  * Determine if specific GRES index on node is available to a job's allocated
9942  *	cores
9943  * IN core_bitmap - bitmap of cores allocated to the job on this node
9944  * IN/OUT alloc_core_bitmap - cores already allocated, NULL if don't care,
9945  *		updated when the function returns true
9946  * IN node_gres_ptr - GRES data for this node
9947  * IN gres_inx - index of GRES being considered for use
9948  * IN job_gres_ptr - GRES data for this job
9949  * RET true if available to those core, false otherwise
9950  */
_cores_on_gres(bitstr_t * core_bitmap,bitstr_t * alloc_core_bitmap,gres_node_state_t * node_gres_ptr,int gres_inx,gres_job_state_t * job_gres_ptr)9951 static bool _cores_on_gres(bitstr_t *core_bitmap, bitstr_t *alloc_core_bitmap,
9952 			   gres_node_state_t *node_gres_ptr, int gres_inx,
9953 			   gres_job_state_t *job_gres_ptr)
9954 {
9955 	int i, avail_cores;
9956 
9957 	if ((core_bitmap == NULL) || (node_gres_ptr->topo_cnt == 0))
9958 		return true;
9959 
9960 	for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
9961 		if (!node_gres_ptr->topo_gres_bitmap[i])
9962 			continue;
9963 		if (bit_size(node_gres_ptr->topo_gres_bitmap[i]) < gres_inx)
9964 			continue;
9965 		if (!bit_test(node_gres_ptr->topo_gres_bitmap[i], gres_inx))
9966 			continue;
9967 		if (job_gres_ptr->type_name &&
9968 		    (!node_gres_ptr->topo_type_name[i] ||
9969 		     (job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i])))
9970 			continue;
9971 		if (!node_gres_ptr->topo_core_bitmap[i])
9972 			return true;
9973 		if (bit_size(node_gres_ptr->topo_core_bitmap[i]) !=
9974 		    bit_size(core_bitmap))
9975 			break;
9976 		avail_cores = bit_overlap(node_gres_ptr->topo_core_bitmap[i],
9977 					  core_bitmap);
9978 		if (avail_cores && alloc_core_bitmap) {
9979 			avail_cores -= bit_overlap(node_gres_ptr->
9980 						   topo_core_bitmap[i],
9981 						   alloc_core_bitmap);
9982 			if (avail_cores) {
9983 				bit_or(alloc_core_bitmap,
9984 				       node_gres_ptr->topo_core_bitmap[i]);
9985 			}
9986 		}
9987 		if (avail_cores)
9988 			return true;
9989 	}
9990 	return false;
9991 }
9992 
9993 /* Clear any vestigial job gres state. This may be needed on job requeue. */
gres_plugin_job_clear(List job_gres_list)9994 extern void gres_plugin_job_clear(List job_gres_list)
9995 {
9996 	int i;
9997 	ListIterator job_gres_iter;
9998 	gres_state_t *job_gres_ptr;
9999 	gres_job_state_t *job_state_ptr;
10000 
10001 	if (job_gres_list == NULL)
10002 		return;
10003 
10004 	(void) gres_plugin_init();
10005 	slurm_mutex_lock(&gres_context_lock);
10006 	job_gres_iter = list_iterator_create(job_gres_list);
10007 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
10008 		job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
10009 		for (i = 0; i < job_state_ptr->node_cnt; i++) {
10010 			if (job_state_ptr->gres_bit_alloc) {
10011 				FREE_NULL_BITMAP(job_state_ptr->
10012 						 gres_bit_alloc[i]);
10013 			}
10014 			if (job_state_ptr->gres_bit_step_alloc) {
10015 				FREE_NULL_BITMAP(job_state_ptr->
10016 						 gres_bit_step_alloc[i]);
10017 			}
10018 		}
10019 		xfree(job_state_ptr->gres_bit_alloc);
10020 		xfree(job_state_ptr->gres_bit_step_alloc);
10021 		xfree(job_state_ptr->gres_cnt_step_alloc);
10022 		xfree(job_state_ptr->gres_cnt_node_alloc);
10023 		job_state_ptr->node_cnt = 0;
10024 	}
10025 	list_iterator_destroy(job_gres_iter);
10026 	slurm_mutex_unlock(&gres_context_lock);
10027 }
10028 
_job_alloc(void * job_gres_data,void * node_gres_data,int node_cnt,int node_index,int node_offset,char * gres_name,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t plugin_id,uint32_t user_id)10029 static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt,
10030 		      int node_index, int node_offset, char *gres_name,
10031 		      uint32_t job_id, char *node_name,
10032 		      bitstr_t *core_bitmap, uint32_t plugin_id,
10033 		      uint32_t user_id)
10034 {
10035 	int j, sz1, sz2;
10036 	int64_t gres_cnt, i;
10037 	gres_job_state_t  *job_gres_ptr  = (gres_job_state_t *)  job_gres_data;
10038 	gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
10039 	bool type_array_updated = false;
10040 	bitstr_t *alloc_core_bitmap = NULL;
10041 	uint64_t gres_per_bit = 1;
10042 	bool log_cnt_err = true;
10043 	char *log_type;
10044 	bool shared_gres = false, use_busy_dev = false;
10045 
10046 	/*
10047 	 * Validate data structures. Either job_gres_data->node_cnt and
10048 	 * job_gres_data->gres_bit_alloc are both set or both zero/NULL.
10049 	 */
10050 	xassert(node_cnt);
10051 	xassert(node_offset >= 0);
10052 	xassert(job_gres_ptr);
10053 	xassert(node_gres_ptr);
10054 
10055 	if (node_gres_ptr->no_consume) {
10056 		job_gres_ptr->total_gres = NO_CONSUME_VAL64;
10057 		return SLURM_SUCCESS;
10058 	}
10059 
10060 	if (_shared_gres(plugin_id)) {
10061 		shared_gres = true;
10062 		gres_per_bit = job_gres_ptr->gres_per_node;
10063 	}
10064 	if ((plugin_id == mps_plugin_id) &&
10065 	    (node_gres_ptr->gres_cnt_alloc != 0)) {
10066 		/* We must use the ONE already active GRES of this type */
10067 		use_busy_dev = true;
10068 	}
10069 
10070 	if (job_gres_ptr->type_name && !job_gres_ptr->type_name[0])
10071 		xfree(job_gres_ptr->type_name);
10072 
10073 	xfree(node_gres_ptr->gres_used);	/* Clear cache */
10074 	if (job_gres_ptr->node_cnt == 0) {
10075 		job_gres_ptr->node_cnt = node_cnt;
10076 		if (job_gres_ptr->gres_bit_alloc) {
10077 			error("gres/%s: job %u node_cnt==0 and gres_bit_alloc is set",
10078 			      gres_name, job_id);
10079 			xfree(job_gres_ptr->gres_bit_alloc);
10080 		}
10081 	}
10082 	/*
10083 	 * These next 2 checks were added long before job resizing was allowed.
10084 	 * They are not errors as we need to keep the original size around for
10085 	 * any steps that might still be out there with the larger size.  If the
10086 	 * job was sized up the gres_plugin_job_merge() function handles the
10087 	 * resize so we are set there.
10088 	 */
10089 	else if (job_gres_ptr->node_cnt < node_cnt) {
10090 		debug2("gres/%s: job %u node_cnt is now larger than it was when allocated from %u to %d",
10091 		      gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
10092 		if (node_offset >= job_gres_ptr->node_cnt)
10093 			return SLURM_ERROR;
10094 	} else if (job_gres_ptr->node_cnt > node_cnt) {
10095 		debug2("gres/%s: job %u node_cnt is now smaller than it was when allocated %u to %d",
10096 		      gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
10097 	}
10098 
10099 	if (!job_gres_ptr->gres_bit_alloc) {
10100 		job_gres_ptr->gres_bit_alloc = xcalloc(node_cnt,
10101 						       sizeof(bitstr_t *));
10102 	}
10103 	if (!job_gres_ptr->gres_cnt_node_alloc) {
10104 		job_gres_ptr->gres_cnt_node_alloc = xcalloc(node_cnt,
10105 							    sizeof(uint64_t));
10106 	}
10107 
10108 	/*
10109 	 * select/cons_tres pre-selects the resources and we just need to update
10110 	 * the data structures to reflect the selected GRES.
10111 	 */
10112 	if (job_gres_ptr->total_node_cnt) {
10113 		/* Resuming job */
10114 		if (job_gres_ptr->gres_cnt_node_alloc[node_offset]) {
10115 			gres_cnt = job_gres_ptr->
10116 				   gres_cnt_node_alloc[node_offset];
10117 		} else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10118 			gres_cnt = bit_set_count(
10119 				    job_gres_ptr->gres_bit_alloc[node_offset]);
10120 			gres_cnt *= gres_per_bit;
10121 		/* Using pre-selected GRES */
10122 		} else if (job_gres_ptr->gres_cnt_node_select &&
10123 			   job_gres_ptr->gres_cnt_node_select[node_index]) {
10124 			gres_cnt = job_gres_ptr->
10125 				   gres_cnt_node_select[node_index];
10126 		} else if (job_gres_ptr->gres_bit_select &&
10127 			   job_gres_ptr->gres_bit_select[node_index]) {
10128 			gres_cnt = bit_set_count(
10129 				    job_gres_ptr->gres_bit_select[node_index]);
10130 			gres_cnt *= gres_per_bit;
10131 		} else {
10132 			error("gres/%s: job %u node %s no resources selected",
10133 			      gres_name, job_id, node_name);
10134 			return SLURM_ERROR;
10135 		}
10136 	} else {
10137 		gres_cnt = job_gres_ptr->gres_per_node;
10138 	}
10139 
10140 	/*
10141 	 * Check that sufficient resources exist on this node
10142 	 */
10143 	job_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_cnt;
10144 	i = node_gres_ptr->gres_cnt_alloc + gres_cnt;
10145 	if (i > node_gres_ptr->gres_cnt_avail) {
10146 		error("gres/%s: job %u node %s overallocated resources by %"
10147 		      PRIu64", (%"PRIu64" > %"PRIu64")",
10148 		      gres_name, job_id, node_name,
10149 		      i - node_gres_ptr->gres_cnt_avail,
10150 		      i, node_gres_ptr->gres_cnt_avail);
10151 		/* proceed with request, give job what is available */
10152 	}
10153 
10154 	if (!node_offset && job_gres_ptr->gres_cnt_step_alloc) {
10155 		uint64_t *tmp = xcalloc(job_gres_ptr->node_cnt,
10156 					sizeof(uint64_t));
10157 		memcpy(tmp, job_gres_ptr->gres_cnt_step_alloc,
10158 		       sizeof(uint64_t) * MIN(node_cnt,
10159 					      job_gres_ptr->node_cnt));
10160 		xfree(job_gres_ptr->gres_cnt_step_alloc);
10161 		job_gres_ptr->gres_cnt_step_alloc = tmp;
10162 	}
10163 	if (job_gres_ptr->gres_cnt_step_alloc == NULL) {
10164 		job_gres_ptr->gres_cnt_step_alloc =
10165 			xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
10166 	}
10167 
10168 	/*
10169 	 * Select and/or allocate specific resources for this job.
10170 	 */
10171 	if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10172 		/*
10173 		 * Restarted slurmctld with active job or resuming a suspended
10174 		 * job. In any case, the resources already selected.
10175 		 */
10176 		if (node_gres_ptr->gres_bit_alloc == NULL) {
10177 			node_gres_ptr->gres_bit_alloc =
10178 				bit_copy(job_gres_ptr->
10179 					 gres_bit_alloc[node_offset]);
10180 			node_gres_ptr->gres_cnt_alloc +=
10181 				bit_set_count(node_gres_ptr->gres_bit_alloc);
10182 			node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
10183 		} else if (node_gres_ptr->gres_bit_alloc) {
10184 			gres_cnt = (int64_t)MIN(
10185 				bit_size(node_gres_ptr->gres_bit_alloc),
10186 				bit_size(job_gres_ptr->
10187 					 gres_bit_alloc[node_offset]));
10188 			for (i = 0; i < gres_cnt; i++) {
10189 				if (bit_test(job_gres_ptr->
10190 					     gres_bit_alloc[node_offset], i) &&
10191 				    (shared_gres ||
10192 				     !bit_test(node_gres_ptr->gres_bit_alloc,
10193 					       i))) {
10194 					bit_set(node_gres_ptr->gres_bit_alloc,i);
10195 					node_gres_ptr->gres_cnt_alloc +=
10196 								gres_per_bit;
10197 				}
10198 			}
10199 		}
10200 	} else if (job_gres_ptr->total_node_cnt &&
10201 		   job_gres_ptr->gres_bit_select &&
10202 		   job_gres_ptr->gres_bit_select[node_index] &&
10203 		   job_gres_ptr->gres_cnt_node_select) {
10204 		/* Specific GRES already selected, update the node record */
10205 		bool job_mod = false;
10206 		sz1 = bit_size(job_gres_ptr->gres_bit_select[node_index]);
10207 		sz2 = bit_size(node_gres_ptr->gres_bit_alloc);
10208 		if (sz1 > sz2) {
10209 			error("gres/%s: job %u node %s gres bitmap size bad (%d > %d)",
10210 			      gres_name, job_id, node_name, sz1, sz2);
10211 			job_gres_ptr->gres_bit_select[node_index] =
10212 				bit_realloc(
10213 				job_gres_ptr->gres_bit_select[node_index], sz2);
10214 			job_mod = true;
10215 		} else if (sz1 < sz2) {
10216 			error("gres/%s: job %u node %s gres bitmap size bad (%d < %d)",
10217 			      gres_name, job_id, node_name, sz1, sz2);
10218 			job_gres_ptr->gres_bit_select[node_index] =
10219 				bit_realloc(
10220 				job_gres_ptr->gres_bit_select[node_index], sz2);
10221 		}
10222 
10223 		if (!shared_gres &&
10224 		    bit_overlap_any(job_gres_ptr->gres_bit_select[node_index],
10225 				    node_gres_ptr->gres_bit_alloc)) {
10226 			error("gres/%s: job %u node %s gres bitmap overlap",
10227 			      gres_name, job_id, node_name);
10228 			bit_and_not(job_gres_ptr->gres_bit_select[node_index],
10229 				    node_gres_ptr->gres_bit_alloc);
10230 		}
10231 		job_gres_ptr->gres_bit_alloc[node_offset] =
10232 			bit_copy(job_gres_ptr->gres_bit_select[node_index]);
10233 		job_gres_ptr->gres_cnt_node_alloc[node_offset] =
10234 			job_gres_ptr->gres_cnt_node_select[node_index];
10235 		if (!node_gres_ptr->gres_bit_alloc) {
10236 			node_gres_ptr->gres_bit_alloc =
10237 				bit_copy(job_gres_ptr->
10238 					 gres_bit_alloc[node_offset]);
10239 		} else {
10240 			bit_or(node_gres_ptr->gres_bit_alloc,
10241 			       job_gres_ptr->gres_bit_alloc[node_offset]);
10242 		}
10243 		if (job_mod) {
10244 			node_gres_ptr->gres_cnt_alloc =
10245 				bit_set_count(node_gres_ptr->gres_bit_alloc);
10246 			node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
10247 		} else {
10248 			node_gres_ptr->gres_cnt_alloc += gres_cnt;
10249 		}
10250 	} else if (node_gres_ptr->gres_bit_alloc) {
10251 		int64_t gres_avail = node_gres_ptr->gres_cnt_avail;
10252 
10253 		i = bit_size(node_gres_ptr->gres_bit_alloc);
10254 		if (plugin_id == mps_plugin_id)
10255 			gres_avail = i;
10256 		else if (i < gres_avail) {
10257 			error("gres/%s: node %s gres bitmap size bad (%"PRIi64" < %"PRIi64")",
10258 			      gres_name, node_name,
10259 			      i, gres_avail);
10260 			node_gres_ptr->gres_bit_alloc =
10261 				bit_realloc(node_gres_ptr->gres_bit_alloc,
10262 					    gres_avail);
10263 		}
10264 
10265 		job_gres_ptr->gres_bit_alloc[node_offset] =
10266 			bit_alloc(gres_avail);
10267 
10268 		if (core_bitmap)
10269 			alloc_core_bitmap = bit_alloc(bit_size(core_bitmap));
10270 		/* Pass 1: Allocate GRES overlapping all allocated cores */
10271 		for (i=0; i<gres_avail && gres_cnt>0; i++) {
10272 			if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10273 				continue;
10274 			if (!_cores_on_gres(core_bitmap, alloc_core_bitmap,
10275 					    node_gres_ptr, i, job_gres_ptr))
10276 				continue;
10277 			bit_set(node_gres_ptr->gres_bit_alloc, i);
10278 			bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10279 			node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10280 			gres_cnt -= gres_per_bit;
10281 		}
10282 		FREE_NULL_BITMAP(alloc_core_bitmap);
10283 		/* Pass 2: Allocate GRES overlapping any allocated cores */
10284 		for (i=0; i<gres_avail && gres_cnt>0; i++) {
10285 			if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10286 				continue;
10287 			if (!_cores_on_gres(core_bitmap, NULL, node_gres_ptr, i,
10288 					    job_gres_ptr))
10289 				continue;
10290 			bit_set(node_gres_ptr->gres_bit_alloc, i);
10291 			bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10292 			node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10293 			gres_cnt -= gres_per_bit;
10294 		}
10295 		if (gres_cnt) {
10296 			verbose("gres/%s topology sub-optimal for job %u",
10297 				gres_name, job_id);
10298 		}
10299 		/* Pass 3: Allocate any available GRES */
10300 		for (i=0; i<gres_avail && gres_cnt>0; i++) {
10301 			if (bit_test(node_gres_ptr->gres_bit_alloc, i))
10302 				continue;
10303 			bit_set(node_gres_ptr->gres_bit_alloc, i);
10304 			bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
10305 			node_gres_ptr->gres_cnt_alloc += gres_per_bit;
10306 			gres_cnt -= gres_per_bit;
10307 		}
10308 	} else {
10309 		node_gres_ptr->gres_cnt_alloc += gres_cnt;
10310 	}
10311 
10312 	if (job_gres_ptr->gres_bit_alloc[node_offset] &&
10313 	    node_gres_ptr->topo_gres_bitmap &&
10314 	    node_gres_ptr->topo_gres_cnt_alloc) {
10315 		for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
10316 			if (job_gres_ptr->type_name &&
10317 			    (!node_gres_ptr->topo_type_name[i] ||
10318 			     (job_gres_ptr->type_id !=
10319 			      node_gres_ptr->topo_type_id[i])))
10320 				continue;
10321 			if (use_busy_dev &&
10322 			    (node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
10323 				continue;
10324 			sz1 = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10325 			sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
10326 
10327 			if ((sz1 != sz2) && log_cnt_err) {
10328 				if (_shared_gres(plugin_id))
10329 					log_type = "File";
10330 				else
10331 					log_type = "Count";
10332 				/* Avoid abort on bit_overlap below */
10333 				error("gres/%s %s mismatch for node %s (%d != %d)",
10334 				      gres_name, log_type, node_name, sz1, sz2);
10335 				log_cnt_err = false;
10336 			}
10337 			if (sz1 != sz2)
10338 				continue;	/* See error above */
10339 			gres_cnt = bit_overlap(job_gres_ptr->
10340 					       gres_bit_alloc[node_offset],
10341 					       node_gres_ptr->
10342 					       topo_gres_bitmap[i]);
10343 			gres_cnt *= gres_per_bit;
10344 			node_gres_ptr->topo_gres_cnt_alloc[i] += gres_cnt;
10345 			if ((node_gres_ptr->type_cnt == 0) ||
10346 			    (node_gres_ptr->topo_type_name == NULL) ||
10347 			    (node_gres_ptr->topo_type_name[i] == NULL))
10348 				continue;
10349 			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10350 				if (!node_gres_ptr->type_name[j] ||
10351 				    (node_gres_ptr->topo_type_id[i] !=
10352 				     node_gres_ptr->type_id[j]))
10353 					continue;
10354 				node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
10355 				break;
10356 			}
10357 		}
10358 		type_array_updated = true;
10359 	} else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
10360 		int len;	/* length of the gres bitmap on this node */
10361 		len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10362 		if (!node_gres_ptr->topo_gres_cnt_alloc) {
10363 			node_gres_ptr->topo_gres_cnt_alloc =
10364 				xcalloc(len, sizeof(uint64_t));
10365 		} else {
10366 			len = MIN(len, node_gres_ptr->gres_cnt_config);
10367 		}
10368 
10369 		if ((node_gres_ptr->topo_cnt == 0) && shared_gres) {
10370 			/*
10371 			 * Need to add node topo arrays for slurmctld restart
10372 			 * and job state recovery (with GRES counts per topo)
10373 			 */
10374 			node_gres_ptr->topo_cnt =
10375 			    bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
10376 			node_gres_ptr->topo_core_bitmap =
10377 				xcalloc(node_gres_ptr->topo_cnt,
10378 					sizeof(bitstr_t *));
10379 			node_gres_ptr->topo_gres_bitmap =
10380 				xcalloc(node_gres_ptr->topo_cnt,
10381 					sizeof(bitstr_t *));
10382 			node_gres_ptr->topo_gres_cnt_alloc =
10383 				xcalloc(node_gres_ptr->topo_cnt,
10384 					sizeof(uint64_t));
10385 			node_gres_ptr->topo_gres_cnt_avail =
10386 				xcalloc(node_gres_ptr->topo_cnt,
10387 					sizeof(uint64_t));
10388 			node_gres_ptr->topo_type_id =
10389 				xcalloc(node_gres_ptr->topo_cnt,
10390 					sizeof(uint32_t));
10391 			node_gres_ptr->topo_type_name =
10392 				xcalloc(node_gres_ptr->topo_cnt,
10393 					sizeof(char *));
10394 			for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
10395 				node_gres_ptr->topo_gres_bitmap[i] =
10396 					bit_alloc(node_gres_ptr->topo_cnt);
10397 				bit_set(node_gres_ptr->topo_gres_bitmap[i], i);
10398 			}
10399 		}
10400 
10401 		for (i = 0; i < len; i++) {
10402 			gres_cnt = 0;
10403 			if (!bit_test(job_gres_ptr->
10404 				      gres_bit_alloc[node_offset], i))
10405 				continue;
10406 			/*
10407 			 * NOTE: Immediately after slurmctld restart and before
10408 			 * the node's registration, the GRES type and topology
10409 			 * information will not be available and we will be
10410 			 * unable to update topo_gres_cnt_alloc or
10411 			 * type_cnt_alloc. This results in some incorrect
10412 			 * internal bookkeeping, but does not cause failures
10413 			 * in terms of allocating GRES to jobs.
10414 			 */
10415 			for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
10416 				if (use_busy_dev &&
10417 				    (node_gres_ptr->topo_gres_cnt_alloc[j] == 0))
10418 					continue;
10419 				if (node_gres_ptr->topo_gres_bitmap &&
10420 				    node_gres_ptr->topo_gres_bitmap[j] &&
10421 				    bit_test(node_gres_ptr->topo_gres_bitmap[j],
10422 					     i)) {
10423 					node_gres_ptr->topo_gres_cnt_alloc[i] +=
10424 								gres_per_bit;
10425 					gres_cnt += gres_per_bit;
10426 				}
10427 			}
10428 			if ((node_gres_ptr->type_cnt == 0) ||
10429 			    (node_gres_ptr->topo_type_name == NULL) ||
10430 			    (node_gres_ptr->topo_type_name[i] == NULL))
10431 				continue;
10432 			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10433 				if (!node_gres_ptr->type_name[j] ||
10434 				    (node_gres_ptr->topo_type_id[i] !=
10435 				     node_gres_ptr->type_id[j]))
10436 					continue;
10437 				node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
10438 				break;
10439 			}
10440 		}
10441 		type_array_updated = true;
10442 		if (job_gres_ptr->type_name && job_gres_ptr->type_name[0]) {
10443 			/*
10444 			 * We may not know how many GRES of this type will be
10445 			 * available on this node, but need to track how many
10446 			 * are allocated to this job from here to avoid
10447 			 * underflows when this job is deallocated
10448 			 */
10449 			_add_gres_type(job_gres_ptr->type_name, node_gres_ptr,
10450 				       0);
10451 			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10452 				if (job_gres_ptr->type_id !=
10453 				    node_gres_ptr->type_id[j])
10454 					continue;
10455 				node_gres_ptr->type_cnt_alloc[j] +=
10456 					job_gres_ptr->gres_per_node;
10457 				break;
10458 			}
10459 		}
10460 	}
10461 
10462 	if (!type_array_updated && job_gres_ptr->type_name) {
10463 		gres_cnt = job_gres_ptr->gres_per_node;
10464 		for (j = 0; j < node_gres_ptr->type_cnt; j++) {
10465 			int64_t k;
10466 			if (job_gres_ptr->type_id !=
10467 			    node_gres_ptr->type_id[j])
10468 				continue;
10469 			k = node_gres_ptr->type_cnt_avail[j] -
10470 			    node_gres_ptr->type_cnt_alloc[j];
10471 			k = MIN(gres_cnt, k);
10472 			node_gres_ptr->type_cnt_alloc[j] += k;
10473 			gres_cnt -= k;
10474 			if (gres_cnt == 0)
10475 				break;
10476 		}
10477 	}
10478 
10479 	return SLURM_SUCCESS;
10480 }
10481 
/*
 * Locate the job GRES record matching job_search_key in job_gres_list
 * (creating and appending one if none exists) and add this node's GRES to
 * the record's total_gres.
 * IN job_search_key - key (plugin_id and type_id) identifying the record
 * IN node_state_ptr - GRES state of the node being selected
 * IN type_inx       - index into the node's type arrays, or -1 when the
 *		       node's untyped total is to be used
 * IN context_inx    - index into gres_context[] for this GRES
 * IN/OUT job_gres_list - job's GRES list; a new record is appended when no
 *			  existing record matches job_search_key
 */
static void _job_select_whole_node_internal(
	gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
	int type_inx, int context_inx, List job_gres_list)
{
	gres_job_state_t *js;
	gres_state_t *rec = list_find_first(job_gres_list,
					    _gres_find_job_by_key,
					    job_search_key);

	if (rec) {
		js = rec->gres_data;
	} else {
		/* First time this GRES (and type) is seen: build a record */
		js = xmalloc(sizeof(gres_job_state_t));
		rec = xmalloc(sizeof(gres_state_t));

		js->gres_name = xstrdup(gres_context[context_inx].gres_name);
		js->type_id = job_search_key->type_id;
		if (type_inx != -1) {
			js->type_name =
				xstrdup(node_state_ptr->type_name[type_inx]);
		}

		rec->plugin_id = job_search_key->plugin_id;
		rec->gres_data = js;
		list_append(job_gres_list, rec);
	}

	/*
	 * Only total_gres is maintained here; the per-node counts are
	 * filled in later, when the allocation is actually made.
	 */
	if (node_state_ptr->no_consume) {
		js->total_gres = NO_CONSUME_VAL64;
		return;
	}

	if (type_inx == -1)
		js->total_gres += node_state_ptr->gres_cnt_avail;
	else
		js->total_gres += node_state_ptr->type_cnt_avail[type_inx];
}
10520 
/*
 * Allocate all of one GRES (or one GRES type) on a node to a job.
 * The job record matching job_search_key must already be on job_gres_list;
 * its gres_per_node is overwritten with this node's available count before
 * the allocation is delegated to _job_alloc().
 * IN job_search_key - key (plugin_id and type_id) identifying the record
 * IN node_state_ptr - GRES state of the node
 * IN job_gres_list  - job's GRES list
 * IN type_index     - index into the node's type arrays, or -1 when the
 *		       node's untyped total is to be used
 * Remaining arguments are passed through to _job_alloc() unchanged.
 * RET SLURM_ERROR if no matching job record exists, otherwise the return
 *     code of _job_alloc()
 */
static int _job_alloc_whole_node_internal(
	gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
	List job_gres_list, int node_cnt, int node_index, int node_offset,
	int type_index, uint32_t job_id, char *node_name,
	bitstr_t *core_bitmap, uint32_t user_id)
{
	gres_job_state_t *js;
	gres_state_t *rec = list_find_first(job_gres_list,
					    _gres_find_job_by_key,
					    job_search_key);

	if (rec == NULL) {
		error("%s: This should never happen, we couldn't find the gres %u:%u",
		      __func__,
		      job_search_key->plugin_id,
		      job_search_key->type_id);
		return SLURM_ERROR;
	}

	js = (gres_job_state_t *) rec->gres_data;

	/*
	 * Nodes may carry different amounts of this GRES, so gres_per_node
	 * is reset for every node to avoid heterogeneous node issues.
	 */
	if (type_index == -1) {
		js->gres_per_node = node_state_ptr->gres_cnt_avail;
	} else {
		js->gres_per_node =
			node_state_ptr->type_cnt_avail[type_index];
	}

	return _job_alloc(js, node_state_ptr, node_cnt, node_index,
			  node_offset, js->gres_name, job_id, node_name,
			  core_bitmap, rec->plugin_id, user_id);
}
10561 
10562 /*
10563  * Select and allocate GRES to a job and update node and job GRES information
10564  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
10565  * IN node_gres_list - node's gres_list built by
10566  *		       gres_plugin_node_config_validate()
10567  * IN node_cnt    - total number of nodes originally allocated to the job
10568  * IN node_index  - zero-origin global node index
10569  * IN node_offset - zero-origin index in job allocation to the node of interest
10570  * IN job_id      - job's ID (for logging)
10571  * IN node_name   - name of the node (for logging)
10572  * IN core_bitmap - cores allocated to this job on this node (NULL if not
10573  *                  available)
10574  * IN user_id     - job's user ID
10575  * RET SLURM_SUCCESS or error code
10576  */
gres_plugin_job_alloc(List job_gres_list,List node_gres_list,int node_cnt,int node_index,int node_offset,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t user_id)10577 extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list,
10578 				 int node_cnt, int node_index, int node_offset,
10579 				 uint32_t job_id, char *node_name,
10580 				 bitstr_t *core_bitmap, uint32_t user_id)
10581 {
10582 	int i, rc, rc2;
10583 	ListIterator job_gres_iter,  node_gres_iter;
10584 	gres_state_t *job_gres_ptr, *node_gres_ptr;
10585 
10586 	if (job_gres_list == NULL)
10587 		return SLURM_SUCCESS;
10588 	if (node_gres_list == NULL) {
10589 		error("%s: job %u has gres specification while node %s has none",
10590 		      __func__, job_id, node_name);
10591 		return SLURM_ERROR;
10592 	}
10593 
10594 	rc = gres_plugin_init();
10595 
10596 	slurm_mutex_lock(&gres_context_lock);
10597 	job_gres_iter = list_iterator_create(job_gres_list);
10598 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
10599 		for (i = 0; i < gres_context_cnt; i++) {
10600 			if (job_gres_ptr->plugin_id ==
10601 			    gres_context[i].plugin_id)
10602 				break;
10603 		}
10604 		if (i >= gres_context_cnt) {
10605 			error("%s: no plugin configured for data type %u for job %u and node %s",
10606 			      __func__, job_gres_ptr->plugin_id, job_id,
10607 			      node_name);
10608 			/* A likely sign that GresPlugins has changed */
10609 			continue;
10610 		}
10611 
10612 		node_gres_iter = list_iterator_create(node_gres_list);
10613 		while ((node_gres_ptr = (gres_state_t *)
10614 				list_next(node_gres_iter))) {
10615 			if (job_gres_ptr->plugin_id == node_gres_ptr->plugin_id)
10616 				break;
10617 		}
10618 		list_iterator_destroy(node_gres_iter);
10619 		if (node_gres_ptr == NULL) {
10620 			error("%s: job %u allocated gres/%s on node %s lacking that gres",
10621 			      __func__, job_id, gres_context[i].gres_name,
10622 			      node_name);
10623 			continue;
10624 		}
10625 
10626 		rc2 = _job_alloc(job_gres_ptr->gres_data,
10627 				 node_gres_ptr->gres_data, node_cnt, node_index,
10628 				 node_offset, gres_context[i].gres_name,
10629 				 job_id, node_name, core_bitmap,
10630 				 job_gres_ptr->plugin_id, user_id);
10631 		if (rc2 != SLURM_SUCCESS)
10632 			rc = rc2;
10633 	}
10634 	list_iterator_destroy(job_gres_iter);
10635 	slurm_mutex_unlock(&gres_context_lock);
10636 
10637 	return rc;
10638 }
10639 
10640 /*
10641  * Fill in job_gres_list with the total amount of GRES on a node.
10642  * OUT job_gres_list - This list will be destroyed and remade with all GRES on
10643  *                     node.
10644  * IN node_gres_list - node's gres_list built by
10645  *		       gres_plugin_node_config_validate()
10646  * IN job_id      - job's ID (for logging)
10647  * IN node_name   - name of the node (for logging)
10648  * RET SLURM_SUCCESS or error code
10649  */
gres_plugin_job_select_whole_node(List * job_gres_list,List node_gres_list,uint32_t job_id,char * node_name)10650 extern int gres_plugin_job_select_whole_node(
10651 	List *job_gres_list, List node_gres_list,
10652 	uint32_t job_id, char *node_name)
10653 {
10654 	int i;
10655 	ListIterator node_gres_iter;
10656 	gres_state_t *node_gres_ptr;
10657 	gres_node_state_t *node_state_ptr;
10658 
10659 	if (job_gres_list == NULL)
10660 		return SLURM_SUCCESS;
10661 	if (node_gres_list == NULL) {
10662 		error("%s: job %u has gres specification while node %s has none",
10663 		      __func__, job_id, node_name);
10664 		return SLURM_ERROR;
10665 	}
10666 
10667 	if (!*job_gres_list)
10668 		*job_gres_list = list_create(_gres_job_list_delete);
10669 
10670 	if (gres_plugin_init() != SLURM_SUCCESS)
10671 		return SLURM_ERROR;
10672 
10673 	slurm_mutex_lock(&gres_context_lock);
10674 	node_gres_iter = list_iterator_create(node_gres_list);
10675 	while ((node_gres_ptr = list_next(node_gres_iter))) {
10676 		gres_key_t job_search_key;
10677 		node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
10678 
10679 		/*
10680 		 * Don't check for no_consume here, we need them added here and
10681 		 * will filter them out in gres_plugin_job_alloc_whole_node()
10682 		 */
10683 		if (!node_state_ptr->gres_cnt_config)
10684 			continue;
10685 
10686 		for (i = 0; i < gres_context_cnt; i++) {
10687 			if (node_gres_ptr->plugin_id ==
10688 			    gres_context[i].plugin_id)
10689 				break;
10690 		}
10691 		if (i >= gres_context_cnt) {
10692 			error("%s: no plugin configured for data type %u for job %u and node %s",
10693 			      __func__, node_gres_ptr->plugin_id, job_id,
10694 			      node_name);
10695 			/* A likely sign that GresPlugins has changed */
10696 			continue;
10697 		}
10698 
10699 		job_search_key.plugin_id = node_gres_ptr->plugin_id;
10700 
10701 		if (!node_state_ptr->type_cnt) {
10702 			job_search_key.type_id = 0;
10703 			_job_select_whole_node_internal(
10704 				&job_search_key, node_state_ptr,
10705 				-1, i, *job_gres_list);
10706 		} else {
10707 			for (int j = 0; j < node_state_ptr->type_cnt; j++) {
10708 				job_search_key.type_id = gres_plugin_build_id(
10709 					node_state_ptr->type_name[j]);
10710 				_job_select_whole_node_internal(
10711 					&job_search_key, node_state_ptr,
10712 					j, i, *job_gres_list);
10713 			}
10714 		}
10715 	}
10716 	list_iterator_destroy(node_gres_iter);
10717 	slurm_mutex_unlock(&gres_context_lock);
10718 
10719 	return SLURM_SUCCESS;
10720 }
10721 
10722 /*
10723  * Select and allocate all GRES on a node to a job and update node and job GRES
10724  * information
10725  * IN job_gres_list - job's gres_list built by gres_plugin_job_whole_node().
10726  * IN node_gres_list - node's gres_list built by
10727  *		       gres_plugin_node_config_validate()
10728  * IN node_cnt    - total number of nodes originally allocated to the job
10729  * IN node_index  - zero-origin global node index
10730  * IN node_offset - zero-origin index in job allocation to the node of interest
10731  * IN job_id      - job's ID (for logging)
10732  * IN node_name   - name of the node (for logging)
10733  * IN core_bitmap - cores allocated to this job on this node (NULL if not
10734  *                  available)
10735  * IN user_id     - job's user ID
10736  * RET SLURM_SUCCESS or error code
10737  */
gres_plugin_job_alloc_whole_node(List job_gres_list,List node_gres_list,int node_cnt,int node_index,int node_offset,uint32_t job_id,char * node_name,bitstr_t * core_bitmap,uint32_t user_id)10738 extern int gres_plugin_job_alloc_whole_node(
10739 	List job_gres_list, List node_gres_list,
10740 	int node_cnt, int node_index, int node_offset,
10741 	uint32_t job_id, char *node_name,
10742 	bitstr_t *core_bitmap, uint32_t user_id)
10743 {
10744 	int i, rc, rc2;
10745 	ListIterator node_gres_iter;
10746 	gres_state_t *node_gres_ptr;
10747 	gres_node_state_t *node_state_ptr;
10748 
10749 	if (job_gres_list == NULL)
10750 		return SLURM_SUCCESS;
10751 	if (node_gres_list == NULL) {
10752 		error("%s: job %u has gres specification while node %s has none",
10753 		      __func__, job_id, node_name);
10754 		return SLURM_ERROR;
10755 	}
10756 
10757 	rc = gres_plugin_init();
10758 
10759 	slurm_mutex_lock(&gres_context_lock);
10760 	node_gres_iter = list_iterator_create(node_gres_list);
10761 	while ((node_gres_ptr = list_next(node_gres_iter))) {
10762 		gres_key_t job_search_key;
10763 		node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
10764 
10765 		if (node_state_ptr->no_consume ||
10766 		    !node_state_ptr->gres_cnt_config)
10767 			continue;
10768 
10769 		for (i = 0; i < gres_context_cnt; i++) {
10770 			if (node_gres_ptr->plugin_id ==
10771 			    gres_context[i].plugin_id)
10772 				break;
10773 		}
10774 		if (i >= gres_context_cnt) {
10775 			error("%s: no plugin configured for data type %u for job %u and node %s",
10776 			      __func__, node_gres_ptr->plugin_id, job_id,
10777 			      node_name);
10778 			/* A likely sign that GresPlugins has changed */
10779 			continue;
10780 		}
10781 
10782 		job_search_key.plugin_id = node_gres_ptr->plugin_id;
10783 
10784 		if (!node_state_ptr->type_cnt) {
10785 			job_search_key.type_id = 0;
10786 			rc2 = _job_alloc_whole_node_internal(
10787 				&job_search_key, node_state_ptr,
10788 				job_gres_list, node_cnt, node_index,
10789 				node_offset, -1, job_id, node_name,
10790 				core_bitmap, user_id);
10791 			if (rc2 != SLURM_SUCCESS)
10792 				rc = rc2;
10793 		} else {
10794 			for (int j = 0; j < node_state_ptr->type_cnt; j++) {
10795 				job_search_key.type_id = gres_plugin_build_id(
10796 					node_state_ptr->type_name[j]);
10797 				rc2 = _job_alloc_whole_node_internal(
10798 					&job_search_key, node_state_ptr,
10799 					job_gres_list, node_cnt, node_index,
10800 					node_offset, j, job_id, node_name,
10801 					core_bitmap, user_id);
10802 				if (rc2 != SLURM_SUCCESS)
10803 					rc = rc2;
10804 			}
10805 		}
10806 	}
10807 	list_iterator_destroy(node_gres_iter);
10808 	slurm_mutex_unlock(&gres_context_lock);
10809 
10810 	return rc;
10811 }
10812 
/*
 * Remove one job's allocation of a single GRES from a single node's
 * allocation bookkeeping (gres_bit_alloc, gres_cnt_alloc,
 * topo_gres_cnt_alloc and type_cnt_alloc). The job's own record is only
 * read here, never modified.
 * IN job_gres_data  - job's GRES state (gres_job_state_t *)
 * IN/OUT node_gres_data - node's GRES state (gres_node_state_t *)
 * IN node_offset - zero-origin index of the node within the job's allocation
 * IN gres_name   - GRES name (for logging)
 * IN job_id      - job's ID (for logging)
 * IN node_name   - name of the node (for logging)
 * IN old_job     - if true, underflows of topo_gres_cnt_alloc and
 *		    type_cnt_alloc in the topology-bitmap path are reset to
 *		    zero without logging an error
 * IN plugin_id   - GRES plugin ID, passed to _shared_gres() to decide how
 *		    many GRES each allocated bit stands for
 * IN user_id     - not referenced in this function
 * IN job_fini    - not referenced in this function
 * RET SLURM_SUCCESS, or SLURM_ERROR if node_offset is not below the job
 *     record's node_cnt
 */
static int _job_dealloc(void *job_gres_data, void *node_gres_data,
			int node_offset, char *gres_name, uint32_t job_id,
			char *node_name, bool old_job, uint32_t plugin_id,
			uint32_t user_id, bool job_fini)
{
	int i, j, len, sz1, sz2;
	gres_job_state_t  *job_gres_ptr  = (gres_job_state_t *)  job_gres_data;
	gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
	bool type_array_updated = false;
	uint64_t gres_cnt = 0, k;
	uint64_t gres_per_bit = 1;

	/*
	 * Validate data structures. Either job_gres_data->node_cnt and
	 * job_gres_data->gres_bit_alloc are both set or both zero/NULL.
	 */
	xassert(node_offset >= 0);
	xassert(job_gres_ptr);
	xassert(node_gres_ptr);

	/* Nothing was counted against a no_consume GRES, nothing to release */
	if (node_gres_ptr->no_consume)
		return SLURM_SUCCESS;

	if (job_gres_ptr->node_cnt <= node_offset) {
		error("gres/%s: job %u dealloc of node %s bad node_offset %d "
		      "count is %u", gres_name, job_id, node_name, node_offset,
		      job_gres_ptr->node_cnt);
		return SLURM_ERROR;
	}

	/*
	 * When _shared_gres() is true for this plugin, each allocated bit
	 * accounts for gres_per_node GRES rather than one.
	 */
	if (_shared_gres(plugin_id))
		gres_per_bit = job_gres_ptr->gres_per_node;

	xfree(node_gres_ptr->gres_used);	/* Clear cache */

	/*
	 * Part 1: node-wide counters. If both job and node have bitmaps,
	 * clear the job's bits from the node and decrement gres_cnt_alloc once
	 * per bit; gres_cnt stays zero in that case so the count-based
	 * decrement below is skipped. Otherwise release by count only.
	 */
	if (node_gres_ptr->gres_bit_alloc && job_gres_ptr->gres_bit_alloc &&
	    job_gres_ptr->gres_bit_alloc[node_offset]) {
		len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
		i   = bit_size(node_gres_ptr->gres_bit_alloc);
		if (i != len) {
			error("gres/%s: job %u and node %s bitmap sizes differ "
			      "(%d != %d)", gres_name, job_id, node_name, len,
			       i);
			len = MIN(len, i);
			/* proceed with request, make best effort */
		}
		for (i = 0; i < len; i++) {
			if (!bit_test(job_gres_ptr->gres_bit_alloc[node_offset],
				      i)) {
				continue;
			}
			bit_clear(node_gres_ptr->gres_bit_alloc, i);

			/*
			 * NOTE: Do not clear bit from
			 * job_gres_ptr->gres_bit_alloc[node_offset]
			 * since this may only be an emulated deallocate
			 */
			if (node_gres_ptr->gres_cnt_alloc >= gres_per_bit) {
				node_gres_ptr->gres_cnt_alloc -= gres_per_bit;
			} else {
				error("gres/%s: job %u dealloc node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
				      gres_name, job_id, node_name,
				      node_gres_ptr->gres_cnt_alloc,
				      gres_per_bit);
				node_gres_ptr->gres_cnt_alloc = 0;
			}
		}
	} else if (job_gres_ptr->gres_cnt_node_alloc) {
		/* No usable bitmaps: use the per-node count recorded on job */
		gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
	} else {
		/* No per-node count either: fall back to the requested count */
		gres_cnt = job_gres_ptr->gres_per_node;
	}
	/* Count-based release; clamp at zero rather than wrap on underflow */
	if (gres_cnt && (node_gres_ptr->gres_cnt_alloc >= gres_cnt))
		node_gres_ptr->gres_cnt_alloc -= gres_cnt;
	else if (gres_cnt) {
		error("gres/%s: job %u node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
		      gres_name, job_id, node_name,
		      node_gres_ptr->gres_cnt_alloc, gres_cnt);
		node_gres_ptr->gres_cnt_alloc = 0;
	}

	/*
	 * Part 2: per-topology and per-type counters. Three mutually
	 * exclusive paths follow; gres_cnt is reused as scratch from here on.
	 */
	if (job_gres_ptr->gres_bit_alloc &&
	    job_gres_ptr->gres_bit_alloc[node_offset] &&
	    node_gres_ptr->topo_gres_bitmap &&
	    node_gres_ptr->topo_gres_cnt_alloc) {
		/*
		 * Path A: node has topology bitmaps. For every topology
		 * record, release the GRES the job holds in that record
		 * (bitmap overlap scaled by gres_per_bit).
		 */
		for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
			sz1 = bit_size(
				job_gres_ptr->gres_bit_alloc[node_offset]);
			sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
			/* Bitmaps of different sizes are not compared */
			if (sz1 != sz2)
				continue;
			gres_cnt = (uint64_t)bit_overlap(
				job_gres_ptr->gres_bit_alloc[node_offset],
				node_gres_ptr->topo_gres_bitmap[i]);
			gres_cnt *= gres_per_bit;
			if (node_gres_ptr->topo_gres_cnt_alloc[i] >= gres_cnt) {
				node_gres_ptr->topo_gres_cnt_alloc[i] -=
					gres_cnt;
			} else if (old_job) {
				/* Underflow tolerated silently for old jobs */
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			} else {
				error("gres/%s: job %u dealloc node %s topo gres count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name,
				      node_gres_ptr->topo_gres_cnt_alloc[i],
				      gres_cnt);
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			}
			/* Skip type accounting if this record has no type */
			if ((node_gres_ptr->type_cnt == 0) ||
			    (node_gres_ptr->topo_type_name == NULL) ||
			    (node_gres_ptr->topo_type_name[i] == NULL))
				continue;
			/*
			 * NOTE(review): this scan does not stop at the first
			 * matching type, whereas the matching loops in
			 * _job_alloc() break after one match. Confirm type_id
			 * values are unique within a node record.
			 */
			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
				if (!node_gres_ptr->type_name[j] ||
				    (node_gres_ptr->topo_type_id[i] !=
				     node_gres_ptr->type_id[j]))
					continue;
				if (node_gres_ptr->type_cnt_alloc[j] >=
				    gres_cnt) {
					node_gres_ptr->type_cnt_alloc[j] -=
						gres_cnt;
				} else if (old_job) {
					node_gres_ptr->type_cnt_alloc[j] = 0;
				} else {
					error("gres/%s: job %u dealloc node %s type %s gres count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      node_gres_ptr->type_name[j],
					      node_gres_ptr->type_cnt_alloc[j],
					      gres_cnt);
					node_gres_ptr->type_cnt_alloc[j] = 0;
				}
			}
		}
		type_array_updated = true;
	} else if (job_gres_ptr->gres_bit_alloc &&
		   job_gres_ptr->gres_bit_alloc[node_offset] &&
		   node_gres_ptr->topo_gres_cnt_alloc) {
		/*
		 * Path B: node has topo_gres_cnt_alloc but no topology
		 * bitmaps. The topo arrays are indexed directly by GRES bit
		 * number and each allocated bit releases gres_per_bit.
		 * NOTE(review): unlike path A, underflow here is always
		 * logged, regardless of old_job.
		 */
		/* Avoid crash if configuration inconsistent */
		len = MIN(node_gres_ptr->gres_cnt_config,
			  bit_size(job_gres_ptr->
				   gres_bit_alloc[node_offset]));
		for (i = 0; i < len; i++) {
			/* Skip bits not held by the job or already at zero */
			if (!bit_test(job_gres_ptr->
				      gres_bit_alloc[node_offset], i) ||
			    !node_gres_ptr->topo_gres_cnt_alloc[i])
				continue;
			if (node_gres_ptr->topo_gres_cnt_alloc[i] >=
			    gres_per_bit) {
				node_gres_ptr->topo_gres_cnt_alloc[i] -=
								gres_per_bit;
			} else {
				error("gres/%s: job %u dealloc node %s "
				      "topo_gres_cnt_alloc[%d] count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name, i,
				      node_gres_ptr->topo_gres_cnt_alloc[i],
				      gres_per_bit);
				node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
			}
			if ((node_gres_ptr->type_cnt == 0) ||
			    (node_gres_ptr->topo_type_name == NULL) ||
			    (node_gres_ptr->topo_type_name[i] == NULL))
				continue;
			for (j = 0; j < node_gres_ptr->type_cnt; j++) {
				if (!node_gres_ptr->type_name[j] ||
				    (node_gres_ptr->topo_type_id[i] !=
				     node_gres_ptr->type_id[j]))
					continue;
				if (node_gres_ptr->type_cnt_alloc[j] >=
				    gres_per_bit) {
					node_gres_ptr->type_cnt_alloc[j] -=
								gres_per_bit;
				} else {
					error("gres/%s: job %u dealloc node %s "
					      "type %s type_cnt_alloc count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      node_gres_ptr->type_name[j],
					      node_gres_ptr->type_cnt_alloc[j],
					      gres_per_bit);
					node_gres_ptr->type_cnt_alloc[j] = 0;
				}
 			}
		}
		type_array_updated = true;
	}

	/*
	 * Path C: neither topology path ran and the job names a GRES type.
	 * Release gres_per_node from the type entries whose ID matches the
	 * job's type, never taking more from an entry than it currently has
	 * allocated (so no underflow and no error is logged here).
	 */
	if (!type_array_updated && job_gres_ptr->type_name) {
		gres_cnt = job_gres_ptr->gres_per_node;
		for (j = 0; j < node_gres_ptr->type_cnt; j++) {
			if (job_gres_ptr->type_id !=
			    node_gres_ptr->type_id[j])
				continue;
			k = MIN(gres_cnt, node_gres_ptr->type_cnt_alloc[j]);
			node_gres_ptr->type_cnt_alloc[j] -= k;
			gres_cnt -= k;
			if (gres_cnt == 0)
				break;
		}
 	}

	return SLURM_SUCCESS;
}
11017 
11018 /*
11019  * Deallocate resource from a job and update node and job gres information
11020  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
11021  * IN node_gres_list - node's gres_list built by
11022  *		gres_plugin_node_config_validate()
11023  * IN node_offset - zero-origin index to the node of interest
11024  * IN job_id      - job's ID (for logging)
11025  * IN node_name   - name of the node (for logging)
11026  * IN old_job     - true if job started before last slurmctld reboot.
11027  *		    Immediately after slurmctld restart and before the node's
11028  *		    registration, the GRES type and topology. This results in
11029  *		    some incorrect internal bookkeeping, but does not cause
11030  *		    failures in terms of allocating GRES to jobs.
11031  * IN user_id     - job's user ID
11032  * IN: job_fini   - job fully terminating on this node (not just a test)
11033  * RET SLURM_SUCCESS or error code
11034  */
gres_plugin_job_dealloc(List job_gres_list,List node_gres_list,int node_offset,uint32_t job_id,char * node_name,bool old_job,uint32_t user_id,bool job_fini)11035 extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
11036 				   int node_offset, uint32_t job_id,
11037 				   char *node_name, bool old_job,
11038 				   uint32_t user_id, bool job_fini)
11039 {
11040 	int i, rc, rc2;
11041 	ListIterator job_gres_iter;
11042 	gres_state_t *job_gres_ptr, *node_gres_ptr;
11043 	char *gres_name = NULL;
11044 
11045 	if (job_gres_list == NULL)
11046 		return SLURM_SUCCESS;
11047 	if (node_gres_list == NULL) {
11048 		error("%s: job %u has gres specification while node %s has none",
11049 		      __func__, job_id, node_name);
11050 		return SLURM_ERROR;
11051 	}
11052 
11053 	rc = gres_plugin_init();
11054 
11055 	slurm_mutex_lock(&gres_context_lock);
11056 	job_gres_iter = list_iterator_create(job_gres_list);
11057 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
11058 		for (i = 0; i < gres_context_cnt; i++) {
11059 			if (job_gres_ptr->plugin_id ==
11060 			    gres_context[i].plugin_id)
11061 				break;
11062 		}
11063 		if (i >= gres_context_cnt) {
11064 			error("%s: no plugin configured for data type %u for job %u and node %s",
11065 			      __func__, job_gres_ptr->plugin_id, job_id,
11066 			      node_name);
11067 			/* A likely sign that GresPlugins has changed */
11068 			gres_name = "UNKNOWN";
11069 		} else
11070 			gres_name = gres_context[i].gres_name;
11071 
11072 		node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
11073 						&job_gres_ptr->plugin_id);
11074 
11075 		if (node_gres_ptr == NULL) {
11076 			error("%s: node %s lacks gres/%s for job %u", __func__,
11077 			      node_name, gres_name , job_id);
11078 			continue;
11079 		}
11080 
11081 		rc2 = _job_dealloc(job_gres_ptr->gres_data,
11082 				   node_gres_ptr->gres_data, node_offset,
11083 				   gres_name, job_id, node_name, old_job,
11084 				   job_gres_ptr->plugin_id, user_id, job_fini);
11085 		if (rc2 != SLURM_SUCCESS)
11086 			rc = rc2;
11087 	}
11088 	list_iterator_destroy(job_gres_iter);
11089 	slurm_mutex_unlock(&gres_context_lock);
11090 
11091 	return rc;
11092 }
11093 
11094 /*
11095  * Merge one job's gres allocation into another job's gres allocation.
11096  * IN from_job_gres_list - List of gres records for the job being merged
11097  *			into another job
11098  * IN from_job_node_bitmap - bitmap of nodes for the job being merged into
11099  *			another job
11100  * IN/OUT to_job_gres_list - List of gres records for the job being merged
11101  *			into job
11102  * IN to_job_node_bitmap - bitmap of nodes for the job being merged into
11103  */
gres_plugin_job_merge(List from_job_gres_list,bitstr_t * from_job_node_bitmap,List to_job_gres_list,bitstr_t * to_job_node_bitmap)11104 extern void gres_plugin_job_merge(List from_job_gres_list,
11105 				  bitstr_t *from_job_node_bitmap,
11106 				  List to_job_gres_list,
11107 				  bitstr_t *to_job_node_bitmap)
11108 {
11109 	static int select_hetero = -1;
11110 	ListIterator gres_iter;
11111 	gres_state_t *gres_ptr, *gres_ptr2;
11112 	gres_job_state_t *gres_job_ptr, *gres_job_ptr2;
11113 	int new_node_cnt;
11114 	int i_first, i_last, i;
11115 	int from_inx, to_inx, new_inx;
11116 	bitstr_t **new_gres_bit_alloc, **new_gres_bit_step_alloc;
11117 	uint64_t *new_gres_cnt_step_alloc, *new_gres_cnt_node_alloc;
11118 
11119 	if (select_hetero == -1) {
11120 		/*
11121 		 * Determine if the select plugin supports heterogeneous
11122 		 * GRES allocations (count differ by node): 1=yes, 0=no
11123 		 */
11124 		char *select_type = slurm_get_select_type();
11125 		if (select_type &&
11126 		    (strstr(select_type, "cons_tres") ||
11127 		     (strstr(select_type, "cray_aries") &&
11128 		      (slurm_get_select_type_param() & CR_OTHER_CONS_TRES)))) {
11129 			select_hetero = 1;
11130 		} else
11131 			select_hetero = 0;
11132 		xfree(select_type);
11133 	}
11134 
11135 	(void) gres_plugin_init();
11136 	new_node_cnt = bit_set_count(from_job_node_bitmap) +
11137 		       bit_set_count(to_job_node_bitmap) -
11138 		       bit_overlap(from_job_node_bitmap, to_job_node_bitmap);
11139 	i_first = MIN(bit_ffs(from_job_node_bitmap),
11140 		      bit_ffs(to_job_node_bitmap));
11141 	i_first = MAX(i_first, 0);
11142 	i_last  = MAX(bit_fls(from_job_node_bitmap),
11143 		      bit_fls(to_job_node_bitmap));
11144 	if (i_last == -1) {
11145 		error("%s: node_bitmaps are empty", __func__);
11146 		return;
11147 	}
11148 
11149 	slurm_mutex_lock(&gres_context_lock);
11150 
11151 	/* Step one - Expand the gres data structures in "to" job */
11152 	if (!to_job_gres_list)
11153 		goto step2;
11154 	gres_iter = list_iterator_create(to_job_gres_list);
11155 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11156 		gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
11157 		new_gres_bit_alloc = xcalloc(new_node_cnt, sizeof(bitstr_t *));
11158 		new_gres_cnt_node_alloc = xcalloc(new_node_cnt,
11159 						  sizeof(uint64_t));
11160 		new_gres_bit_step_alloc = xcalloc(new_node_cnt,
11161 						  sizeof(bitstr_t *));
11162 		new_gres_cnt_step_alloc = xcalloc(new_node_cnt,
11163 						  sizeof(uint64_t));
11164 
11165 		from_inx = to_inx = new_inx = -1;
11166 		for (i = i_first; i <= i_last; i++) {
11167 			bool from_match = false, to_match = false;
11168 			if (bit_test(to_job_node_bitmap, i)) {
11169 				to_match = true;
11170 				to_inx++;
11171 			}
11172 			if (bit_test(from_job_node_bitmap, i)) {
11173 				from_match = true;
11174 				from_inx++;
11175 			}
11176 			if (from_match || to_match)
11177 				new_inx++;
11178 			if (to_match) {
11179 				if (gres_job_ptr->gres_bit_alloc) {
11180 					new_gres_bit_alloc[new_inx] =
11181 						gres_job_ptr->
11182 						gres_bit_alloc[to_inx];
11183 				}
11184 				if (gres_job_ptr->gres_cnt_node_alloc) {
11185 					new_gres_cnt_node_alloc[new_inx] =
11186 						gres_job_ptr->
11187 						gres_cnt_node_alloc[to_inx];
11188 				}
11189 				if (gres_job_ptr->gres_bit_step_alloc) {
11190 					new_gres_bit_step_alloc[new_inx] =
11191 						gres_job_ptr->
11192 						gres_bit_step_alloc[to_inx];
11193 				}
11194 				if (gres_job_ptr->gres_cnt_step_alloc) {
11195 					new_gres_cnt_step_alloc[new_inx] =
11196 						gres_job_ptr->
11197 						gres_cnt_step_alloc[to_inx];
11198 				}
11199 			}
11200 		}
11201 		gres_job_ptr->node_cnt = new_node_cnt;
11202 		xfree(gres_job_ptr->gres_bit_alloc);
11203 		gres_job_ptr->gres_bit_alloc = new_gres_bit_alloc;
11204 		xfree(gres_job_ptr->gres_cnt_node_alloc);
11205 		gres_job_ptr->gres_cnt_node_alloc = new_gres_cnt_node_alloc;
11206 		xfree(gres_job_ptr->gres_bit_step_alloc);
11207 		gres_job_ptr->gres_bit_step_alloc = new_gres_bit_step_alloc;
11208 		xfree(gres_job_ptr->gres_cnt_step_alloc);
11209 		gres_job_ptr->gres_cnt_step_alloc = new_gres_cnt_step_alloc;
11210 	}
11211 	list_iterator_destroy(gres_iter);
11212 
11213 	/*
11214 	 * Step two - Merge the gres information from the "from" job into the
11215 	 * existing gres information for the "to" job
11216 	 */
11217 step2:	if (!from_job_gres_list)
11218 		goto step3;
11219 	if (!to_job_gres_list) {
11220 		to_job_gres_list = list_create(_gres_job_list_delete);
11221 	}
11222 	gres_iter = list_iterator_create(from_job_gres_list);
11223 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11224 		gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
11225 		gres_ptr2 = list_find_first(to_job_gres_list, _gres_find_id,
11226 					    &gres_ptr->plugin_id);
11227 		if (gres_ptr2) {
11228 			gres_job_ptr2 = gres_ptr2->gres_data;
11229 		} else {
11230 			gres_ptr2 = xmalloc(sizeof(gres_state_t));
11231 			gres_job_ptr2 = xmalloc(sizeof(gres_job_state_t));
11232 			gres_ptr2->plugin_id = gres_ptr->plugin_id;
11233 			gres_ptr2->gres_data = gres_job_ptr2;
11234 			gres_job_ptr2->gres_name =
11235 					xstrdup(gres_job_ptr->gres_name);
11236 			gres_job_ptr2->cpus_per_gres =
11237 					gres_job_ptr->cpus_per_gres;
11238 			gres_job_ptr2->gres_per_job =
11239 					gres_job_ptr->gres_per_job;
11240 			gres_job_ptr2->gres_per_job =
11241 					gres_job_ptr->gres_per_job;
11242 			gres_job_ptr2->gres_per_socket =
11243 					gres_job_ptr->gres_per_socket;
11244 			gres_job_ptr2->gres_per_task =
11245 					gres_job_ptr->gres_per_task;
11246 			gres_job_ptr2->mem_per_gres =
11247 					gres_job_ptr->mem_per_gres;
11248 			gres_job_ptr2->node_cnt = new_node_cnt;
11249 			gres_job_ptr2->gres_bit_alloc =
11250 				xcalloc(new_node_cnt, sizeof(bitstr_t *));
11251 			gres_job_ptr2->gres_cnt_node_alloc =
11252 				xcalloc(new_node_cnt, sizeof(uint64_t));
11253 			gres_job_ptr2->gres_bit_step_alloc =
11254 				xcalloc(new_node_cnt, sizeof(bitstr_t *));
11255 			gres_job_ptr2->gres_cnt_step_alloc =
11256 				xcalloc(new_node_cnt, sizeof(uint64_t));
11257 			list_append(to_job_gres_list, gres_ptr2);
11258 		}
11259 		from_inx = to_inx = new_inx = -1;
11260 		for (i = i_first; i <= i_last; i++) {
11261 			bool from_match = false, to_match = false;
11262 			if (bit_test(to_job_node_bitmap, i)) {
11263 				to_match = true;
11264 				to_inx++;
11265 			}
11266 			if (bit_test(from_job_node_bitmap, i)) {
11267 				from_match = true;
11268 				from_inx++;
11269 			}
11270 			if (from_match || to_match)
11271 				new_inx++;
11272 			if (from_match) {
11273 				if (!gres_job_ptr->gres_bit_alloc) {
11274 					;
11275 				} else if (select_hetero &&
11276 					   gres_job_ptr2->
11277 					   gres_bit_alloc[new_inx] &&
11278 					   gres_job_ptr->gres_bit_alloc &&
11279 					   gres_job_ptr->
11280 					   gres_bit_alloc[new_inx]) {
11281 					/* Merge job's GRES bitmaps */
11282 					bit_or(gres_job_ptr2->
11283 					       gres_bit_alloc[new_inx],
11284 					       gres_job_ptr->
11285 					       gres_bit_alloc[from_inx]);
11286 				} else if (gres_job_ptr2->
11287 					   gres_bit_alloc[new_inx]) {
11288 					/* Keep original job's GRES bitmap */
11289 				} else {
11290 					gres_job_ptr2->gres_bit_alloc[new_inx] =
11291 						gres_job_ptr->
11292 						gres_bit_alloc[from_inx];
11293 					gres_job_ptr->
11294 						gres_bit_alloc
11295 						[from_inx] = NULL;
11296 				}
11297 				if (!gres_job_ptr->gres_bit_alloc) {
11298 					;
11299 				} else if (select_hetero &&
11300 					   gres_job_ptr2->
11301 					   gres_cnt_node_alloc[new_inx] &&
11302 					   gres_job_ptr->gres_cnt_node_alloc &&
11303 					   gres_job_ptr->
11304 					   gres_cnt_node_alloc[new_inx]) {
11305 					gres_job_ptr2->
11306 						gres_cnt_node_alloc[new_inx] +=
11307 						gres_job_ptr->
11308 						gres_cnt_node_alloc[from_inx];
11309 				} else if (gres_job_ptr2->
11310 					   gres_cnt_node_alloc[new_inx]) {
11311 					/* Keep original job's GRES bitmap */
11312 				} else {
11313 					gres_job_ptr2->
11314 						gres_cnt_node_alloc[new_inx] =
11315 						gres_job_ptr->
11316 						gres_cnt_node_alloc[from_inx];
11317 					gres_job_ptr->
11318 						gres_cnt_node_alloc[from_inx]=0;
11319 				}
11320 				if (gres_job_ptr->gres_cnt_step_alloc &&
11321 				    gres_job_ptr->
11322 				    gres_cnt_step_alloc[from_inx]) {
11323 					error("Attempt to merge gres, from "
11324 					      "job has active steps");
11325 				}
11326 			}
11327 		}
11328 	}
11329 	list_iterator_destroy(gres_iter);
11330 
11331 step3:	slurm_mutex_unlock(&gres_context_lock);
11332 	return;
11333 }
11334 
11335 /*
11336  * Set environment variables as required for a batch job
11337  * IN/OUT job_env_ptr - environment variable array
11338  * IN gres_list - generated by gres_plugin_job_alloc()
11339  * IN node_inx - zero origin node index
11340  */
gres_plugin_job_set_env(char *** job_env_ptr,List job_gres_list,int node_inx)11341 extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list,
11342 				    int node_inx)
11343 {
11344 	int i;
11345 	ListIterator gres_iter;
11346 	gres_state_t *gres_ptr = NULL;
11347 	bool found;
11348 
11349 	(void) gres_plugin_init();
11350 
11351 	slurm_mutex_lock(&gres_context_lock);
11352 	for (i=0; i<gres_context_cnt; i++) {
11353 		if (gres_context[i].ops.job_set_env == NULL)
11354 			continue;	/* No plugin to call */
11355 		found = false;
11356 		if (job_gres_list) {
11357 			gres_iter = list_iterator_create(job_gres_list);
11358 			while ((gres_ptr = (gres_state_t *)
11359 				list_next(gres_iter))) {
11360 				if (gres_ptr->plugin_id !=
11361 				    gres_context[i].plugin_id)
11362 					continue;
11363 				(*(gres_context[i].ops.job_set_env))
11364 					(job_env_ptr, gres_ptr->gres_data,
11365 					 node_inx);
11366 				found = true;
11367 			}
11368 			list_iterator_destroy(gres_iter);
11369 		}
11370 		/*
11371 		 * We call the job_set_env of the gres even if this one is not
11372 		 * requested in the job. This may be convenient on certain
11373 		 * plugins, i.e. setting an env variable to say the GRES is not
11374 		 * available.
11375 		 */
11376 		if (!found) {
11377 			(*(gres_context[i].ops.job_set_env))
11378 				(job_env_ptr, NULL, node_inx);
11379 		}
11380 	}
11381 	slurm_mutex_unlock(&gres_context_lock);
11382 }
11383 
11384 /*
11385  * Set job default parameters in a given element of a list
11386  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
11387  * IN gres_name - name of gres, apply defaults to all elements (e.g. updates to
11388  *		  gres_name="gpu" would apply to "gpu:tesla", "gpu:volta", etc.)
11389  * IN cpu_per_gpu - value to set as default
11390  * IN mem_per_gpu - value to set as default
11391  */
gres_plugin_job_set_defs(List job_gres_list,char * gres_name,uint64_t cpu_per_gpu,uint64_t mem_per_gpu)11392 extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name,
11393 				     uint64_t cpu_per_gpu,
11394 				     uint64_t mem_per_gpu)
11395 {
11396 	uint32_t plugin_id;
11397 	ListIterator gres_iter;
11398 	gres_state_t *gres_ptr = NULL;
11399 	gres_job_state_t *job_gres_data;
11400 
11401 	if (!job_gres_list)
11402 		return;
11403 
11404 	plugin_id = gres_plugin_build_id(gres_name);
11405 	gres_iter = list_iterator_create(job_gres_list);
11406 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11407 		if (gres_ptr->plugin_id != plugin_id)
11408 			continue;
11409 		job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
11410 		if (!job_gres_data)
11411 			continue;
11412 		job_gres_data->def_cpus_per_gres = cpu_per_gpu;
11413 		job_gres_data->def_mem_per_gres = mem_per_gpu;
11414 	}
11415 	list_iterator_destroy(gres_iter);
11416 }
11417 
11418 /*
11419  * Translate GRES flag to string.
11420  * NOT reentrant
11421  */
_gres_flags_str(uint16_t flags)11422 static char *_gres_flags_str(uint16_t flags)
11423 {
11424 	if (flags & GRES_NO_CONSUME)
11425 		return "no_consume";
11426 	return "";
11427 }
11428 
_job_state_log(void * gres_data,uint32_t job_id,uint32_t plugin_id)11429 static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id)
11430 {
11431 	gres_job_state_t *gres_ptr;
11432 	char *sparse_msg = "", tmp_str[128];
11433 	int i;
11434 
11435 	xassert(gres_data);
11436 	gres_ptr = (gres_job_state_t *) gres_data;
11437 	info("gres:%s(%u) type:%s(%u) job:%u flags:%s state",
11438 	      gres_ptr->gres_name, plugin_id, gres_ptr->type_name,
11439 	      gres_ptr->type_id, job_id, _gres_flags_str(gres_ptr->flags));
11440 	if (gres_ptr->cpus_per_gres)
11441 		info("  cpus_per_gres:%u", gres_ptr->cpus_per_gres);
11442 	else if (gres_ptr->def_cpus_per_gres)
11443 		info("  def_cpus_per_gres:%u", gres_ptr->def_cpus_per_gres);
11444 	if (gres_ptr->gres_per_job)
11445 		info("  gres_per_job:%"PRIu64, gres_ptr->gres_per_job);
11446 	if (gres_ptr->gres_per_node) {
11447 		info("  gres_per_node:%"PRIu64" node_cnt:%u",
11448 		     gres_ptr->gres_per_node, gres_ptr->node_cnt);
11449 	}
11450 	if (gres_ptr->gres_per_socket)
11451 		info("  gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
11452 	if (gres_ptr->gres_per_task)
11453 		info("  gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
11454 	if (gres_ptr->mem_per_gres)
11455 		info("  mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
11456 	else if (gres_ptr->def_mem_per_gres)
11457 		info("  def_mem_per_gres:%"PRIu64, gres_ptr->def_mem_per_gres);
11458 
11459 	if (gres_ptr->node_cnt == 0)
11460 		return;
11461 	if (gres_ptr->gres_bit_alloc == NULL)
11462 		info("  gres_bit_alloc:NULL");
11463 	if (gres_ptr->gres_cnt_node_alloc == NULL)
11464 		info("  gres_cnt_node_alloc:NULL");
11465 	if (gres_ptr->gres_bit_step_alloc == NULL)
11466 		info("  gres_bit_step_alloc:NULL");
11467 	if (gres_ptr->gres_cnt_step_alloc == NULL)
11468 		info("  gres_cnt_step_alloc:NULL");
11469 	if (gres_ptr->gres_bit_select == NULL)
11470 		info("  gres_bit_select:NULL");
11471 	if (gres_ptr->gres_cnt_node_select == NULL)
11472 		info("  gres_cnt_node_select:NULL");
11473 
11474 	for (i = 0; i < gres_ptr->node_cnt; i++) {
11475 		if (gres_ptr->gres_cnt_node_alloc &&
11476 		    gres_ptr->gres_cnt_node_alloc[i]) {
11477 			info("  gres_cnt_node_alloc[%d]:%"PRIu64,
11478 			     i, gres_ptr->gres_cnt_node_alloc[i]);
11479 		} else if (gres_ptr->gres_cnt_node_alloc)
11480 			info("  gres_cnt_node_alloc[%d]:NULL", i);
11481 
11482 		if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[i]) {
11483 			bit_fmt(tmp_str, sizeof(tmp_str),
11484 				gres_ptr->gres_bit_alloc[i]);
11485 			info("  gres_bit_alloc[%d]:%s of %d", i, tmp_str,
11486 			     (int) bit_size(gres_ptr->gres_bit_alloc[i]));
11487 		} else if (gres_ptr->gres_bit_alloc)
11488 			info("  gres_bit_alloc[%d]:NULL", i);
11489 
11490 		if (gres_ptr->gres_bit_step_alloc &&
11491 		    gres_ptr->gres_bit_step_alloc[i]) {
11492 			bit_fmt(tmp_str, sizeof(tmp_str),
11493 				gres_ptr->gres_bit_step_alloc[i]);
11494 			info("  gres_bit_step_alloc[%d]:%s of %d", i, tmp_str,
11495 			     (int) bit_size(gres_ptr->gres_bit_step_alloc[i]));
11496 		} else if (gres_ptr->gres_bit_step_alloc)
11497 			info("  gres_bit_step_alloc[%d]:NULL", i);
11498 
11499 		if (gres_ptr->gres_cnt_step_alloc) {
11500 			info("  gres_cnt_step_alloc[%d]:%"PRIu64"", i,
11501 			     gres_ptr->gres_cnt_step_alloc[i]);
11502 		}
11503 	}
11504 
11505 	/*
11506 	 * These arrays are only used for resource selection and may include
11507 	 * data for many nodes not used in the resources eventually allocated
11508 	 * to this job.
11509 	 */
11510 	if (gres_ptr->total_node_cnt)
11511 		sparse_msg = " (sparsely populated for resource selection)";
11512 	info("  total_node_cnt:%u%s", gres_ptr->total_node_cnt, sparse_msg);
11513 	for (i = 0; i < gres_ptr->total_node_cnt; i++) {
11514 		if (gres_ptr->gres_cnt_node_select &&
11515 		    gres_ptr->gres_cnt_node_select[i]) {
11516 			info("  gres_cnt_node_select[%d]:%"PRIu64,
11517 			     i, gres_ptr->gres_cnt_node_select[i]);
11518 		}
11519 		if (gres_ptr->gres_bit_select &&
11520 		    gres_ptr->gres_bit_select[i]) {
11521 			bit_fmt(tmp_str, sizeof(tmp_str),
11522 				gres_ptr->gres_bit_select[i]);
11523 			info("  gres_bit_select[%d]:%s of %d", i, tmp_str,
11524 			     (int) bit_size(gres_ptr->gres_bit_select[i]));
11525 		}
11526 	}
11527 }
11528 
11529 /*
11530  * Extract from the job record's gres_list the count of allocated resources of
11531  * 	the named gres type.
11532  * IN job_gres_list  - job record's gres_list.
11533  * IN gres_name_type - the name of the gres type to retrieve the associated
11534  *	value from.
11535  * RET The value associated with the gres type or NO_VAL if not found.
11536  */
gres_plugin_get_job_value_by_type(List job_gres_list,char * gres_name_type)11537 extern uint64_t gres_plugin_get_job_value_by_type(List job_gres_list,
11538 						  char *gres_name_type)
11539 {
11540 	uint64_t gres_val;
11541 	uint32_t gres_name_type_id;
11542 	ListIterator  job_gres_iter;
11543 	gres_state_t *job_gres_ptr;
11544 
11545 	if (job_gres_list == NULL)
11546 		return NO_VAL64;
11547 
11548 	slurm_mutex_lock(&gres_context_lock);
11549 	gres_name_type_id = gres_plugin_build_id(gres_name_type);
11550 	gres_val = NO_VAL64;
11551 
11552 	job_gres_iter = list_iterator_create(job_gres_list);
11553 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
11554 		if (job_gres_ptr->plugin_id == gres_name_type_id) {
11555 			gres_val = ((gres_job_state_t *)
11556 				   (job_gres_ptr->gres_data))->gres_per_node;
11557 			break;
11558 		}
11559 	}
11560 	list_iterator_destroy(job_gres_iter);
11561 
11562 	slurm_mutex_unlock(&gres_context_lock);
11563 
11564 	return gres_val;
11565 }
11566 
11567 /*
11568  * Log a job's current gres state
11569  * IN gres_list - generated by gres_plugin_job_state_validate()
11570  * IN job_id - job's ID
11571  */
gres_plugin_job_state_log(List gres_list,uint32_t job_id)11572 extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id)
11573 {
11574 	ListIterator gres_iter;
11575 	gres_state_t *gres_ptr;
11576 
11577 	if (!gres_debug || (gres_list == NULL))
11578 		return;
11579 
11580 	(void) gres_plugin_init();
11581 
11582 	slurm_mutex_lock(&gres_context_lock);
11583 	gres_iter = list_iterator_create(gres_list);
11584 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
11585 		_job_state_log(gres_ptr->gres_data, job_id,
11586 			       gres_ptr->plugin_id);
11587 	}
11588 	list_iterator_destroy(gres_iter);
11589 	slurm_mutex_unlock(&gres_context_lock);
11590 }
11591 
_find_device(void * x,void * key)11592 static int _find_device(void *x, void *key)
11593 {
11594 	gres_device_t *device_x = (gres_device_t *)x;
11595 	gres_device_t *device_key = (gres_device_t *)key;
11596 
11597 	if (!xstrcmp(device_x->path, device_key->path))
11598 		return 1;
11599 
11600 	return 0;
11601 }
11602 
/*
 * Build a list of the GRES device records known to the loaded plugins and mark
 * those allocated to a job or step.
 *
 * Every device reported by a plugin's get_devices hook first has alloc cleared
 * and is added to the returned list unless a record with the same path is
 * already there. Then, for each record of gres_list that covers exactly one
 * node and has an allocation bitmap for it, the devices whose bit is set get
 * alloc = 1 (on the plugin's record and on the same-path record of the
 * returned list).
 *
 * IN gres_list - job or step GRES list, may be NULL (no device is then marked)
 * IN is_job - true if gres_list holds job GRES records (gres_job_state_t),
 *	       false if it holds step GRES records (gres_step_state_t)
 * RET list of gres_device_t records, or NULL if no plugin reported a device.
 *     The list is created without a delete function and its elements are the
 *     records obtained from the plugins, not copies.
 */
extern List gres_plugin_get_allocated_devices(List gres_list, bool is_job)
{
	int ctx_inx, dev_inx;
	ListIterator state_iter, dev_iter;
	gres_state_t *state_ptr;
	bitstr_t **bit_alloc = NULL;
	uint32_t node_cnt;
	gres_device_t *dev, *uniq_dev;
	List plugin_devs;
	List device_list = NULL;

	(void) gres_plugin_init();

	/*
	 * Create a unique device list of all possible GRES device files.
	 * Initialize each device to deny.
	 */
	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		if (!gres_context[ctx_inx].ops.get_devices)
			continue;
		plugin_devs = (*(gres_context[ctx_inx].ops.get_devices))();
		if (!plugin_devs || !list_count(plugin_devs))
			continue;
		dev_iter = list_iterator_create(plugin_devs);
		while ((dev = list_next(dev_iter))) {
			if (!device_list)
				device_list = list_create(NULL);
			dev->alloc = 0;
			/*
			 * Two plugins may describe the same device file (as
			 * with MPS and GPU), so only add a path once.
			 */
			if (list_find_first(device_list, _find_device, dev))
				continue;
			list_append(device_list, dev);
		}
		list_iterator_destroy(dev_iter);
	}

	if (!gres_list)
		return device_list;

	slurm_mutex_lock(&gres_context_lock);
	state_iter = list_iterator_create(gres_list);
	while ((state_ptr = list_next(state_iter))) {
		/* Locate the plugin context this record belongs to */
		ctx_inx = 0;
		while ((ctx_inx < gres_context_cnt) &&
		       (state_ptr->plugin_id !=
			gres_context[ctx_inx].plugin_id))
			ctx_inx++;

		if (ctx_inx >= gres_context_cnt) {
			error("We were unable to find the gres in the context!!!  This should never happen");
			continue;
		}

		if (!state_ptr->gres_data)
			continue;

		if (is_job) {
			gres_job_state_t *job_data =
				(gres_job_state_t *) state_ptr->gres_data;
			bit_alloc = job_data->gres_bit_alloc;
			node_cnt = job_data->node_cnt;
		} else {
			gres_step_state_t *step_data =
				(gres_step_state_t *) state_ptr->gres_data;
			bit_alloc = step_data->gres_bit_alloc;
			node_cnt = step_data->node_cnt;
		}

		/* Only single node records with a bitmap can be mapped */
		if ((node_cnt != 1) ||
		    !bit_alloc ||
		    !bit_alloc[0] ||
		    !gres_context[ctx_inx].ops.get_devices)
			continue;

		plugin_devs = (*(gres_context[ctx_inx].ops.get_devices))();
		if (!plugin_devs) {
			error("We should had got gres_devices, but for some reason none were set in the plugin.");
			continue;
		}
		if ((int) bit_size(bit_alloc[0]) != list_count(plugin_devs)) {
			error("We got %d gres devices when we were only told about %d.  This should never happen.",
			      list_count(plugin_devs),
			      (int) bit_size(bit_alloc[0]));
			continue;
		}

		/* Bit N of the bitmap maps to the Nth device of the plugin */
		dev_iter = list_iterator_create(plugin_devs);
		for (dev_inx = 0; (dev = list_next(dev_iter)); dev_inx++) {
			if (!bit_test(bit_alloc[0], dev_inx))
				continue;
			/*
			 * Search for the device among the unique devices list
			 * (since two plugins could have device records that
			 * point to the same file, like with GPU and MPS).
			 */
			uniq_dev = list_find_first(device_list, _find_device,
						   dev);
			/* Set both, in case they point to different records */
			dev->alloc = 1;
			if (uniq_dev)
				uniq_dev->alloc = 1;
		}
		list_iterator_destroy(dev_iter);
	}
	list_iterator_destroy(state_iter);
	slurm_mutex_unlock(&gres_context_lock);

	return device_list;
}
11724 
_step_state_delete(void * gres_data)11725 static void _step_state_delete(void *gres_data)
11726 {
11727 	int i;
11728 	gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
11729 
11730 	if (gres_ptr == NULL)
11731 		return;
11732 
11733 	FREE_NULL_BITMAP(gres_ptr->node_in_use);
11734 	if (gres_ptr->gres_bit_alloc) {
11735 		for (i = 0; i < gres_ptr->node_cnt; i++)
11736 			FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
11737 		xfree(gres_ptr->gres_bit_alloc);
11738 	}
11739 	xfree(gres_ptr->gres_cnt_node_alloc);
11740 	xfree(gres_ptr->type_name);
11741 	xfree(gres_ptr);
11742 }
11743 
_gres_step_list_delete(void * list_element)11744 static void _gres_step_list_delete(void *list_element)
11745 {
11746 	gres_state_t *gres_ptr = (gres_state_t *) list_element;
11747 
11748 	_step_state_delete(gres_ptr->gres_data);
11749 	xfree(gres_ptr);
11750 }
11751 
_step_test(void * step_gres_data,void * job_gres_data,int node_offset,bool first_step_node,uint16_t cpus_per_task,int max_rem_nodes,bool ignore_alloc,uint32_t job_id,uint32_t step_id,uint32_t plugin_id)11752 static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
11753 			   int node_offset, bool first_step_node,
11754 			   uint16_t cpus_per_task, int max_rem_nodes,
11755 			   bool ignore_alloc,
11756 			   uint32_t job_id, uint32_t step_id,
11757 			   uint32_t plugin_id)
11758 {
11759 	gres_job_state_t  *job_gres_ptr  = (gres_job_state_t *)  job_gres_data;
11760 	gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
11761 	uint64_t core_cnt, gres_cnt, min_gres = 1, task_cnt;
11762 
11763 	xassert(job_gres_ptr);
11764 	xassert(step_gres_ptr);
11765 
11766 	if ((node_offset >= job_gres_ptr->node_cnt) &&
11767 	    (job_gres_ptr->node_cnt != 0)) {	/* GRES is type no_consume */
11768 		error("gres/%s: %s %u.%u node offset invalid (%d >= %u)",
11769 		      job_gres_ptr->gres_name, __func__, job_id,
11770 		      step_id, node_offset,
11771 		      job_gres_ptr->node_cnt);
11772 		return 0;
11773 	}
11774 
11775 	if (first_step_node) {
11776 		if (ignore_alloc)
11777 			step_gres_ptr->gross_gres = 0;
11778 		else
11779 			step_gres_ptr->total_gres = 0;
11780 	}
11781 	if (step_gres_ptr->gres_per_node)
11782 		min_gres = step_gres_ptr-> gres_per_node;
11783 	if (step_gres_ptr->gres_per_socket)
11784 		min_gres = MAX(min_gres, step_gres_ptr->gres_per_socket);
11785 	if (step_gres_ptr->gres_per_task)
11786 		min_gres = MAX(min_gres, step_gres_ptr->gres_per_task);
11787 	if (step_gres_ptr->gres_per_step &&
11788 	    (step_gres_ptr->gres_per_step > step_gres_ptr->total_gres) &&
11789 	    (max_rem_nodes == 1)) {
11790 		gres_cnt = step_gres_ptr->gres_per_step;
11791 		if (ignore_alloc)
11792 			   gres_cnt -= step_gres_ptr->gross_gres;
11793 		else
11794 			   gres_cnt -= step_gres_ptr->total_gres;
11795 		min_gres = MAX(min_gres, gres_cnt);
11796 	}
11797 
11798 	if (!_shared_gres(plugin_id) &&
11799 	    job_gres_ptr->gres_bit_alloc &&
11800 	    job_gres_ptr->gres_bit_alloc[node_offset]) {
11801 		gres_cnt = bit_set_count(job_gres_ptr->
11802 					 gres_bit_alloc[node_offset]);
11803 		if (!ignore_alloc &&
11804 		    job_gres_ptr->gres_bit_step_alloc &&
11805 		    job_gres_ptr->gres_bit_step_alloc[node_offset]) {
11806 			gres_cnt -= bit_set_count(job_gres_ptr->
11807 						  gres_bit_step_alloc
11808 						  [node_offset]);
11809 		}
11810 		if (min_gres > gres_cnt) {
11811 			core_cnt = 0;
11812 		} else if (step_gres_ptr->gres_per_task) {
11813 			task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
11814 				   / step_gres_ptr->gres_per_task;
11815 			core_cnt = task_cnt * cpus_per_task;
11816 		} else
11817 			core_cnt = NO_VAL64;
11818 	} else if (job_gres_ptr->gres_cnt_node_alloc &&
11819 		   job_gres_ptr->gres_cnt_step_alloc) {
11820 		gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
11821 		if (!ignore_alloc) {
11822 			gres_cnt -= job_gres_ptr->
11823 				    gres_cnt_step_alloc[node_offset];
11824 		}
11825 		if (min_gres > gres_cnt) {
11826 			core_cnt = 0;
11827 		} else if (step_gres_ptr->gres_per_task) {
11828 			task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
11829 				   / step_gres_ptr->gres_per_task;
11830 			core_cnt = task_cnt * cpus_per_task;
11831 		} else
11832 			core_cnt = NO_VAL64;
11833 	} else {
11834 		debug3("gres/%s: %s %u.%u gres_bit_alloc and gres_cnt_node_alloc are NULL",
11835 		       job_gres_ptr->gres_name, __func__, job_id, step_id);
11836 		gres_cnt = 0;
11837 		core_cnt = NO_VAL64;
11838 	}
11839 	if (core_cnt != 0) {
11840 		if (ignore_alloc)
11841 			step_gres_ptr->gross_gres += gres_cnt;
11842 		else
11843 			step_gres_ptr->total_gres += gres_cnt;
11844 	}
11845 
11846 	return core_cnt;
11847 }
11848 
11849 /*
11850  * TRES specification parse logic
11851  * in_val IN - initial input string
11852  * cnt OUT - count of values
11853  * gres_list IN/OUT - where to search for (or add) new step TRES record
11854  * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
11855  * rc OUT - unchanged or an error code
11856  * RET gres - step record to set value in, found or created by this function
11857  */
_get_next_step_gres(char * in_val,uint64_t * cnt,List gres_list,char ** save_ptr,int * rc)11858 static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
11859 					      List gres_list, char **save_ptr,
11860 					      int *rc)
11861 {
11862 	static char *prev_save_ptr = NULL;
11863 	int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
11864 	gres_step_state_t *step_gres_data = NULL;
11865 	gres_state_t *gres_ptr;
11866 	gres_key_t step_search_key;
11867 	char *type = NULL, *name = NULL;
11868 	uint16_t flags = 0;
11869 
11870 	xassert(save_ptr);
11871 	if (!in_val && (*save_ptr == NULL)) {
11872 		return NULL;
11873 	}
11874 
11875 	if (*save_ptr == NULL) {
11876 		prev_save_ptr = in_val;
11877 	} else if (*save_ptr != prev_save_ptr) {
11878 		error("%s: parsing error", __func__);
11879 		my_rc = SLURM_ERROR;
11880 		goto fini;
11881 	}
11882 
11883 	if (prev_save_ptr[0] == '\0') {	/* Empty input token */
11884 		*save_ptr = NULL;
11885 		return NULL;
11886 	}
11887 
11888 	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
11889 				    cnt, &flags, &prev_save_ptr)) ||
11890 	    (context_inx == NO_VAL)) {
11891 		prev_save_ptr = NULL;
11892 		goto fini;
11893 	}
11894 
11895 	/* Find the step GRES record */
11896 	step_search_key.plugin_id = gres_context[context_inx].plugin_id;
11897 	step_search_key.type_id = gres_plugin_build_id(type);
11898 	gres_ptr = list_find_first(gres_list, _gres_find_step_by_key,
11899 				   &step_search_key);
11900 
11901 	if (gres_ptr) {
11902 		step_gres_data = gres_ptr->gres_data;
11903 	} else {
11904 		step_gres_data = xmalloc(sizeof(gres_step_state_t));
11905 		step_gres_data->type_id = gres_plugin_build_id(type);
11906 		step_gres_data->type_name = type;
11907 		type = NULL;	/* String moved above */
11908 		gres_ptr = xmalloc(sizeof(gres_state_t));
11909 		gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
11910 		gres_ptr->gres_data = step_gres_data;
11911 		list_append(gres_list, gres_ptr);
11912 	}
11913 	step_gres_data->flags = flags;
11914 
11915 fini:	xfree(name);
11916 	xfree(type);
11917 	if (my_rc != SLURM_SUCCESS) {
11918 		prev_save_ptr = NULL;
11919 		if (my_rc == ESLURM_INVALID_GRES)
11920 			info("Invalid GRES job specification %s", in_val);
11921 		*rc = my_rc;
11922 	}
11923 	*save_ptr = prev_save_ptr;
11924 	return step_gres_data;
11925 }
11926 
11927 /* Test that the step does not request more GRES than the job contains */
_validate_step_counts(List step_gres_list,List job_gres_list,int * rc)11928 static void _validate_step_counts(List step_gres_list, List job_gres_list,
11929 				  int *rc)
11930 {
11931 	ListIterator iter;
11932 	gres_state_t *job_gres_ptr, *step_gres_ptr;
11933 	gres_job_state_t *job_gres_data;
11934 	gres_step_state_t *step_gres_data;
11935 	gres_key_t job_search_key;
11936 	uint16_t cpus_per_gres;
11937 	uint64_t mem_per_gres;
11938 
11939 	if (!step_gres_list || (list_count(step_gres_list) == 0))
11940 		return;
11941 	if (!job_gres_list  || (list_count(job_gres_list)  == 0)) {
11942 		*rc = ESLURM_INVALID_GRES;
11943 		return;
11944 	}
11945 
11946 	iter = list_iterator_create(step_gres_list);
11947 	while ((step_gres_ptr = (gres_state_t *) list_next(iter))) {
11948 		step_gres_data = (gres_step_state_t *) step_gres_ptr->gres_data;
11949 		job_search_key.plugin_id = step_gres_ptr->plugin_id;
11950 		if (step_gres_data->type_id == 0)
11951 			job_search_key.type_id = NO_VAL;
11952 		else
11953 			job_search_key.type_id = step_gres_data->type_id;
11954 		job_gres_ptr = list_find_first(job_gres_list,
11955 					       _gres_find_job_by_key,
11956 					       &job_search_key);
11957 		if (!job_gres_ptr || !job_gres_ptr->gres_data) {
11958 			*rc = ESLURM_INVALID_GRES;
11959 			break;
11960 		}
11961 		job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
11962 		if (job_gres_data->cpus_per_gres)
11963 			cpus_per_gres = job_gres_data->cpus_per_gres;
11964 		else
11965 			cpus_per_gres = job_gres_data->def_cpus_per_gres;
11966 		if (cpus_per_gres && step_gres_data->cpus_per_gres &&
11967 		    (cpus_per_gres < step_gres_data->cpus_per_gres)) {
11968 			*rc = ESLURM_INVALID_GRES;
11969 			break;
11970 		}
11971 		if (job_gres_data->gres_per_job &&
11972 		    step_gres_data->gres_per_step &&
11973 		    (job_gres_data->gres_per_job <
11974 		     step_gres_data->gres_per_step)) {
11975 			*rc = ESLURM_INVALID_GRES;
11976 			break;
11977 		}
11978 		if (job_gres_data->gres_per_node &&
11979 		    step_gres_data->gres_per_node &&
11980 		    (job_gres_data->gres_per_node <
11981 		     step_gres_data->gres_per_node)) {
11982 			*rc = ESLURM_INVALID_GRES;
11983 			break;
11984 		}
11985 		if (job_gres_data->gres_per_socket &&
11986 		    step_gres_data->gres_per_socket &&
11987 		    (job_gres_data->gres_per_socket <
11988 		     step_gres_data->gres_per_socket)) {
11989 			*rc = ESLURM_INVALID_GRES;
11990 			break;
11991 		}
11992 		if (job_gres_data->gres_per_task &&
11993 		    step_gres_data->gres_per_task &&
11994 		    (job_gres_data->gres_per_task <
11995 		     step_gres_data->gres_per_task)) {
11996 			*rc = ESLURM_INVALID_GRES;
11997 			break;
11998 		}
11999 		if (job_gres_data->mem_per_gres)
12000 			mem_per_gres = job_gres_data->mem_per_gres;
12001 		else
12002 			mem_per_gres = job_gres_data->def_mem_per_gres;
12003 		if (mem_per_gres && step_gres_data->mem_per_gres &&
12004 		    (mem_per_gres < step_gres_data->mem_per_gres)) {
12005 			*rc = ESLURM_INVALID_GRES;
12006 			break;
12007 		}
12008 
12009 	}
12010 	list_iterator_destroy(iter);
12011 }
12012 
12013 /*
12014  * Given a step's requested gres configuration, validate it and build gres list
12015  * IN *tres* - step's requested gres input string
12016  * OUT step_gres_list - List of Gres records for this step to track usage
12017  * IN job_gres_list - List of Gres records for this job
12018  * IN job_id, step_id - ID of the step being allocated.
12019  * RET SLURM_SUCCESS or ESLURM_INVALID_GRES
12020  */
gres_plugin_step_state_validate(char * cpus_per_tres,char * tres_per_step,char * tres_per_node,char * tres_per_socket,char * tres_per_task,char * mem_per_tres,List * step_gres_list,List job_gres_list,uint32_t job_id,uint32_t step_id)12021 extern int gres_plugin_step_state_validate(char *cpus_per_tres,
12022 					   char *tres_per_step,
12023 					   char *tres_per_node,
12024 					   char *tres_per_socket,
12025 					   char *tres_per_task,
12026 					   char *mem_per_tres,
12027 					   List *step_gres_list,
12028 					   List job_gres_list, uint32_t job_id,
12029 					   uint32_t step_id)
12030 {
12031 	int rc;
12032 	gres_step_state_t *step_gres_data;
12033 	List new_step_list;
12034 	uint64_t cnt = 0;
12035 
12036 	*step_gres_list = NULL;
12037 	if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
12038 		return rc;
12039 
12040 	slurm_mutex_lock(&gres_context_lock);
12041 	new_step_list = list_create(_gres_step_list_delete);
12042 	if (cpus_per_tres) {
12043 		char *in_val = cpus_per_tres, *save_ptr = NULL;
12044 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12045 							    new_step_list,
12046 							    &save_ptr, &rc))) {
12047 			step_gres_data->cpus_per_gres = cnt;
12048 			in_val = NULL;
12049 		}
12050 	}
12051 	if (tres_per_step) {
12052 		char *in_val = tres_per_step, *save_ptr = NULL;
12053 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12054 							    new_step_list,
12055 							    &save_ptr, &rc))) {
12056 			step_gres_data->gres_per_step = cnt;
12057 			in_val = NULL;
12058 		}
12059 	}
12060 	if (tres_per_node) {
12061 		char *in_val = tres_per_node, *save_ptr = NULL;
12062 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12063 							    new_step_list,
12064 							    &save_ptr, &rc))) {
12065 			step_gres_data->gres_per_node = cnt;
12066 			in_val = NULL;
12067 		}
12068 	}
12069 	if (tres_per_socket) {
12070 		char *in_val = tres_per_socket, *save_ptr = NULL;
12071 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12072 							    new_step_list,
12073 							    &save_ptr, &rc))) {
12074 			step_gres_data->gres_per_socket = cnt;
12075 			in_val = NULL;
12076 		}
12077 	}
12078 	if (tres_per_task) {
12079 		char *in_val = tres_per_task, *save_ptr = NULL;
12080 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12081 							    new_step_list,
12082 							    &save_ptr, &rc))) {
12083 			step_gres_data->gres_per_task = cnt;
12084 			in_val = NULL;
12085 		}
12086 	}
12087 	if (mem_per_tres) {
12088 		char *in_val = mem_per_tres, *save_ptr = NULL;
12089 		while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
12090 							    new_step_list,
12091 							    &save_ptr, &rc))) {
12092 			step_gres_data->mem_per_gres = cnt;
12093 			in_val = NULL;
12094 		}
12095 	}
12096 	if (list_count(new_step_list) == 0) {
12097 		FREE_NULL_LIST(new_step_list);
12098 	} else {
12099 		if (rc == SLURM_SUCCESS)
12100 			_validate_step_counts(new_step_list, job_gres_list,
12101 					      &rc);
12102 		if (rc == SLURM_SUCCESS)
12103 			*step_gres_list = new_step_list;
12104 		else
12105 			FREE_NULL_LIST(new_step_list);
12106 	}
12107 	slurm_mutex_unlock(&gres_context_lock);
12108 	return rc;
12109 }
12110 
_step_state_dup(void * gres_data)12111 static void *_step_state_dup(void *gres_data)
12112 {
12113 
12114 	int i;
12115 	gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12116 	gres_step_state_t *new_gres_ptr;
12117 
12118 	xassert(gres_ptr);
12119 	new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
12120 	new_gres_ptr->cpus_per_gres	= gres_ptr->cpus_per_gres;
12121 	new_gres_ptr->gres_per_step	= gres_ptr->gres_per_step;
12122 	new_gres_ptr->gres_per_node	= gres_ptr->gres_per_node;
12123 	new_gres_ptr->gres_per_socket	= gres_ptr->gres_per_socket;
12124 	new_gres_ptr->gres_per_task	= gres_ptr->gres_per_task;
12125 	new_gres_ptr->mem_per_gres	= gres_ptr->mem_per_gres;
12126 	new_gres_ptr->node_cnt		= gres_ptr->node_cnt;
12127 	new_gres_ptr->total_gres	= gres_ptr->total_gres;
12128 
12129 	if (gres_ptr->node_in_use)
12130 		new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
12131 
12132 	if (gres_ptr->gres_cnt_node_alloc) {
12133 		i = sizeof(uint64_t) * gres_ptr->node_cnt;
12134 		new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
12135 		memcpy(new_gres_ptr->gres_cnt_node_alloc,
12136 		       gres_ptr->gres_cnt_node_alloc, i);
12137 	}
12138 	if (gres_ptr->gres_bit_alloc) {
12139 		new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
12140 						       sizeof(bitstr_t *));
12141 		for (i = 0; i < gres_ptr->node_cnt; i++) {
12142 			if (gres_ptr->gres_bit_alloc[i] == NULL)
12143 				continue;
12144 			new_gres_ptr->gres_bit_alloc[i] =
12145 				bit_copy(gres_ptr->gres_bit_alloc[i]);
12146 		}
12147 	}
12148 	return new_gres_ptr;
12149 }
12150 
	/*
	 * NOTE(review): this is a file-scope variable declaration, not a
	 * structure member. The surrounding functions only touch the
	 * gres_cnt_node_alloc member of gres_step_state_t, so this looks
	 * like a stray copy of that field's declaration - confirm it is
	 * unused before removing it.
	 */
	uint64_t *gres_cnt_node_alloc;	/* Per node GRES allocated, */
12152 
_step_state_dup2(void * gres_data,int node_index)12153 static void *_step_state_dup2(void *gres_data, int node_index)
12154 {
12155 
12156 	gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12157 	gres_step_state_t *new_gres_ptr;
12158 
12159 	xassert(gres_ptr);
12160 	new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
12161 	new_gres_ptr->cpus_per_gres	= gres_ptr->cpus_per_gres;
12162 	new_gres_ptr->gres_per_step	= gres_ptr->gres_per_step;
12163 	new_gres_ptr->gres_per_node	= gres_ptr->gres_per_node;
12164 	new_gres_ptr->gres_per_socket	= gres_ptr->gres_per_socket;
12165 	new_gres_ptr->gres_per_task	= gres_ptr->gres_per_task;
12166 	new_gres_ptr->mem_per_gres	= gres_ptr->mem_per_gres;
12167 	new_gres_ptr->node_cnt		= 1;
12168 	new_gres_ptr->total_gres	= gres_ptr->total_gres;
12169 
12170 	if (gres_ptr->node_in_use)
12171 		new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
12172 
12173 	if (gres_ptr->gres_cnt_node_alloc) {
12174 		new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
12175 		new_gres_ptr->gres_cnt_node_alloc[0] =
12176 		       gres_ptr->gres_cnt_node_alloc[node_index];
12177 	}
12178 
12179 	if ((node_index < gres_ptr->node_cnt) && gres_ptr->gres_bit_alloc &&
12180 	    gres_ptr->gres_bit_alloc[node_index]) {
12181 		new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
12182 		new_gres_ptr->gres_bit_alloc[0] =
12183 			bit_copy(gres_ptr->gres_bit_alloc[node_index]);
12184 	}
12185 	return new_gres_ptr;
12186 }
12187 
12188 /*
12189  * Create a copy of a step's gres state
12190  * IN gres_list - List of Gres records for this step to track usage
12191  * RET The copy or NULL on failure
12192  */
gres_plugin_step_state_dup(List gres_list)12193 List gres_plugin_step_state_dup(List gres_list)
12194 {
12195 	return gres_plugin_step_state_extract(gres_list, -1);
12196 }
12197 
12198 /*
12199  * Create a copy of a step's gres state for a particular node index
12200  * IN gres_list - List of Gres records for this step to track usage
12201  * IN node_index - zero-origin index to the node
12202  * RET The copy or NULL on failure
12203  */
gres_plugin_step_state_extract(List gres_list,int node_index)12204 List gres_plugin_step_state_extract(List gres_list, int node_index)
12205 {
12206 	ListIterator gres_iter;
12207 	gres_state_t *gres_ptr, *new_gres_state;
12208 	List new_gres_list = NULL;
12209 	void *new_gres_data;
12210 
12211 	if (gres_list == NULL)
12212 		return new_gres_list;
12213 
12214 	(void) gres_plugin_init();
12215 
12216 	slurm_mutex_lock(&gres_context_lock);
12217 	gres_iter = list_iterator_create(gres_list);
12218 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
12219 		if (node_index == -1)
12220 			new_gres_data = _step_state_dup(gres_ptr->gres_data);
12221 		else {
12222 			new_gres_data = _step_state_dup2(gres_ptr->gres_data,
12223 							 node_index);
12224 		}
12225 		if (new_gres_list == NULL) {
12226 			new_gres_list = list_create(_gres_step_list_delete);
12227 		}
12228 		new_gres_state = xmalloc(sizeof(gres_state_t));
12229 		new_gres_state->plugin_id = gres_ptr->plugin_id;
12230 		new_gres_state->gres_data = new_gres_data;
12231 		list_append(new_gres_list, new_gres_state);
12232 	}
12233 	list_iterator_destroy(gres_iter);
12234 	slurm_mutex_unlock(&gres_context_lock);
12235 
12236 	return new_gres_list;
12237 }
12238 
12239 /*
12240  * A job allocation size has changed. Update the job step gres information
12241  * bitmaps and other data structures.
12242  * IN gres_list - List of Gres records for this step to track usage
12243  * IN orig_job_node_bitmap - bitmap of nodes in the original job allocation
12244  * IN new_job_node_bitmap  - bitmap of nodes in the new job allocation
12245  */
gres_plugin_step_state_rebase(List gres_list,bitstr_t * orig_job_node_bitmap,bitstr_t * new_job_node_bitmap)12246 void gres_plugin_step_state_rebase(List gres_list,
12247 				   bitstr_t *orig_job_node_bitmap,
12248 				   bitstr_t *new_job_node_bitmap)
12249 {
12250 	ListIterator gres_iter;
12251 	gres_state_t *gres_ptr;
12252 	gres_step_state_t *gres_step_ptr;
12253 	int new_node_cnt;
12254 	int i_first, i_last, i;
12255 	int old_inx, new_inx;
12256 	bitstr_t *new_node_in_use;
12257 	bitstr_t **new_gres_bit_alloc = NULL;
12258 
12259 	if (gres_list == NULL)
12260 		return;
12261 
12262 	(void) gres_plugin_init();
12263 
12264 	slurm_mutex_lock(&gres_context_lock);
12265 	gres_iter = list_iterator_create(gres_list);
12266 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
12267 		gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
12268 		if (!gres_step_ptr)
12269 			continue;
12270 		if (!gres_step_ptr->node_in_use) {
12271 			error("gres_plugin_step_state_rebase: node_in_use is NULL");
12272 			continue;
12273 		}
12274 		new_node_cnt = bit_set_count(new_job_node_bitmap);
12275 		i_first = MIN(bit_ffs(orig_job_node_bitmap),
12276 			      bit_ffs(new_job_node_bitmap));
12277 		i_first = MAX(i_first, 0);
12278 		i_last  = MAX(bit_fls(orig_job_node_bitmap),
12279 			      bit_fls(new_job_node_bitmap));
12280 		if (i_last == -1) {
12281 			error("gres_plugin_step_state_rebase: node_bitmaps "
12282 			      "are empty");
12283 			continue;
12284 		}
12285 		new_node_in_use = bit_alloc(new_node_cnt);
12286 
12287 		old_inx = new_inx = -1;
12288 		for (i = i_first; i <= i_last; i++) {
12289 			bool old_match = false, new_match = false;
12290 			if (bit_test(orig_job_node_bitmap, i)) {
12291 				old_match = true;
12292 				old_inx++;
12293 			}
12294 			if (bit_test(new_job_node_bitmap, i)) {
12295 				new_match = true;
12296 				new_inx++;
12297 			}
12298 			if (old_match && new_match) {
12299 				bit_set(new_node_in_use, new_inx);
12300 				if (gres_step_ptr->gres_bit_alloc) {
12301 					if (!new_gres_bit_alloc) {
12302 						new_gres_bit_alloc =
12303 							xcalloc(new_node_cnt,
12304 								sizeof(bitstr_t *));
12305 					}
12306 					new_gres_bit_alloc[new_inx] =
12307 						gres_step_ptr->gres_bit_alloc[old_inx];
12308 				}
12309 			} else if (old_match &&
12310 				   gres_step_ptr->gres_bit_alloc &&
12311 				   gres_step_ptr->gres_bit_alloc[old_inx]) {
12312 				/* Node removed from job allocation,
12313 				 * release step's resources */
12314 				bit_free(gres_step_ptr->
12315 					 gres_bit_alloc[old_inx]);
12316 			}
12317 		}
12318 
12319 		gres_step_ptr->node_cnt = new_node_cnt;
12320 		bit_free(gres_step_ptr->node_in_use);
12321 		gres_step_ptr->node_in_use = new_node_in_use;
12322 		xfree(gres_step_ptr->gres_bit_alloc);
12323 		gres_step_ptr->gres_bit_alloc = new_gres_bit_alloc;
12324 	}
12325 	list_iterator_destroy(gres_iter);
12326 	slurm_mutex_unlock(&gres_context_lock);
12327 
12328 	return;
12329 }
12330 
12331 /*
12332  * Pack a step's current gres status, called from slurmctld for save/restore
12333  * IN gres_list - generated by gres_plugin_step_alloc()
12334  * IN/OUT buffer - location to write state to
12335  * IN job_id, step_id - job and step ID for logging
12336  */
gres_plugin_step_state_pack(List gres_list,Buf buffer,uint32_t job_id,uint32_t step_id,uint16_t protocol_version)12337 extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
12338 				       uint32_t job_id, uint32_t step_id,
12339 				       uint16_t protocol_version)
12340 {
12341 	int i, rc = SLURM_SUCCESS;
12342 	uint32_t top_offset, tail_offset, magic = GRES_MAGIC;
12343 	uint16_t rec_cnt = 0;
12344 	ListIterator gres_iter;
12345 	gres_state_t *gres_ptr;
12346 	gres_step_state_t *gres_step_ptr;
12347 
12348 	top_offset = get_buf_offset(buffer);
12349 	pack16(rec_cnt, buffer);	/* placeholder if data */
12350 
12351 	if (gres_list == NULL)
12352 		return rc;
12353 
12354 	(void) gres_plugin_init();
12355 
12356 	slurm_mutex_lock(&gres_context_lock);
12357 	gres_iter = list_iterator_create(gres_list);
12358 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
12359 		gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
12360 
12361 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
12362 			pack32(magic, buffer);
12363 			pack32(gres_ptr->plugin_id, buffer);
12364 			pack16(gres_step_ptr->cpus_per_gres, buffer);
12365 			pack16(gres_step_ptr->flags, buffer);
12366 			pack64(gres_step_ptr->gres_per_step, buffer);
12367 			pack64(gres_step_ptr->gres_per_node, buffer);
12368 			pack64(gres_step_ptr->gres_per_socket, buffer);
12369 			pack64(gres_step_ptr->gres_per_task, buffer);
12370 			pack64(gres_step_ptr->mem_per_gres, buffer);
12371 			pack64(gres_step_ptr->total_gres, buffer);
12372 			pack32(gres_step_ptr->node_cnt, buffer);
12373 			pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
12374 			if (gres_step_ptr->gres_cnt_node_alloc) {
12375 				pack8((uint8_t) 1, buffer);
12376 				pack64_array(gres_step_ptr->gres_cnt_node_alloc,
12377 					     gres_step_ptr->node_cnt, buffer);
12378 			} else {
12379 				pack8((uint8_t) 0, buffer);
12380 			}
12381 			if (gres_step_ptr->gres_bit_alloc) {
12382 				pack8((uint8_t) 1, buffer);
12383 				for (i = 0; i < gres_step_ptr->node_cnt; i++)
12384 					pack_bit_str_hex(gres_step_ptr->
12385 							 gres_bit_alloc[i],
12386 							 buffer);
12387 			} else {
12388 				pack8((uint8_t) 0, buffer);
12389 			}
12390 			rec_cnt++;
12391 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
12392 			pack32(magic, buffer);
12393 			pack32(gres_ptr->plugin_id, buffer);
12394 			pack16(gres_step_ptr->cpus_per_gres, buffer);
12395 			pack64(gres_step_ptr->gres_per_step, buffer);
12396 			pack64(gres_step_ptr->gres_per_node, buffer);
12397 			pack64(gres_step_ptr->gres_per_socket, buffer);
12398 			pack64(gres_step_ptr->gres_per_task, buffer);
12399 			pack64(gres_step_ptr->mem_per_gres, buffer);
12400 			pack64(gres_step_ptr->total_gres, buffer);
12401 			pack32(gres_step_ptr->node_cnt, buffer);
12402 			pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
12403 			if (gres_step_ptr->gres_cnt_node_alloc) {
12404 				pack8((uint8_t) 1, buffer);
12405 				pack64_array(gres_step_ptr->gres_cnt_node_alloc,
12406 					     gres_step_ptr->node_cnt, buffer);
12407 			} else {
12408 				pack8((uint8_t) 0, buffer);
12409 			}
12410 			if (gres_step_ptr->gres_bit_alloc) {
12411 				pack8((uint8_t) 1, buffer);
12412 				for (i = 0; i < gres_step_ptr->node_cnt; i++)
12413 					pack_bit_str_hex(gres_step_ptr->
12414 							 gres_bit_alloc[i],
12415 							 buffer);
12416 			} else {
12417 				pack8((uint8_t) 0, buffer);
12418 			}
12419 			rec_cnt++;
12420 		} else {
12421 			error("%s: protocol_version %hu not supported",
12422 			      __func__, protocol_version);
12423 			break;
12424 		}
12425 	}
12426 	list_iterator_destroy(gres_iter);
12427 	slurm_mutex_unlock(&gres_context_lock);
12428 
12429 	tail_offset = get_buf_offset(buffer);
12430 	set_buf_offset(buffer, top_offset);
12431 	pack16(rec_cnt, buffer);
12432 	set_buf_offset(buffer, tail_offset);
12433 
12434 	return rc;
12435 }
12436 
12437 /*
12438  * Unpack a step's current gres status, called from slurmctld for save/restore
12439  * OUT gres_list - restored state stored by gres_plugin_step_state_pack()
12440  * IN/OUT buffer - location to read state from
12441  * IN job_id, step_id - job and step ID for logging
12442  */
gres_plugin_step_state_unpack(List * gres_list,Buf buffer,uint32_t job_id,uint32_t step_id,uint16_t protocol_version)12443 extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
12444 					 uint32_t job_id, uint32_t step_id,
12445 					 uint16_t protocol_version)
12446 {
12447 	int i, rc;
12448 	uint32_t magic = 0, plugin_id = 0, uint32_tmp = 0;
12449 	uint16_t rec_cnt = 0;
12450 	uint8_t data_flag = 0;
12451 	gres_state_t *gres_ptr;
12452 	gres_step_state_t *gres_step_ptr = NULL;
12453 
12454 	safe_unpack16(&rec_cnt, buffer);
12455 	if (rec_cnt == 0)
12456 		return SLURM_SUCCESS;
12457 
12458 	rc = gres_plugin_init();
12459 
12460 	slurm_mutex_lock(&gres_context_lock);
12461 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
12462 		*gres_list = list_create(_gres_step_list_delete);
12463 	}
12464 
12465 	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
12466 		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
12467 			break;
12468 		rec_cnt--;
12469 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
12470 			safe_unpack32(&magic, buffer);
12471 			if (magic != GRES_MAGIC)
12472 				goto unpack_error;
12473 			safe_unpack32(&plugin_id, buffer);
12474 			gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
12475 			safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
12476 			safe_unpack16(&gres_step_ptr->flags, buffer);
12477 			safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
12478 			safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
12479 			safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
12480 			safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
12481 			safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
12482 			safe_unpack64(&gres_step_ptr->total_gres, buffer);
12483 			safe_unpack32(&gres_step_ptr->node_cnt, buffer);
12484 			if (gres_step_ptr->node_cnt > NO_VAL)
12485 				goto unpack_error;
12486 			unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
12487 			safe_unpack8(&data_flag, buffer);
12488 			if (data_flag) {
12489 				safe_unpack64_array(
12490 					&gres_step_ptr->gres_cnt_node_alloc,
12491 					&uint32_tmp, buffer);
12492 			}
12493 			safe_unpack8(&data_flag, buffer);
12494 			if (data_flag) {
12495 				gres_step_ptr->gres_bit_alloc =
12496 					xcalloc(gres_step_ptr->node_cnt,
12497 						sizeof(bitstr_t *));
12498 				for (i = 0; i < gres_step_ptr->node_cnt; i++) {
12499 					unpack_bit_str_hex(&gres_step_ptr->
12500 							   gres_bit_alloc[i],
12501 							   buffer);
12502 				}
12503 			}
12504 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
12505 			safe_unpack32(&magic, buffer);
12506 			if (magic != GRES_MAGIC)
12507 				goto unpack_error;
12508 			safe_unpack32(&plugin_id, buffer);
12509 			gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
12510 			safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
12511 			safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
12512 			safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
12513 			safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
12514 			safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
12515 			safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
12516 			safe_unpack64(&gres_step_ptr->total_gres, buffer);
12517 			safe_unpack32(&gres_step_ptr->node_cnt, buffer);
12518 			if (gres_step_ptr->node_cnt > NO_VAL)
12519 				goto unpack_error;
12520 			unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
12521 			safe_unpack8(&data_flag, buffer);
12522 			if (data_flag) {
12523 				safe_unpack64_array(
12524 					&gres_step_ptr->gres_cnt_node_alloc,
12525 					&uint32_tmp, buffer);
12526 			}
12527 			safe_unpack8(&data_flag, buffer);
12528 			if (data_flag) {
12529 				gres_step_ptr->gres_bit_alloc =
12530 					xcalloc(gres_step_ptr->node_cnt,
12531 						sizeof(bitstr_t *));
12532 				for (i = 0; i < gres_step_ptr->node_cnt; i++) {
12533 					unpack_bit_str_hex(&gres_step_ptr->
12534 							   gres_bit_alloc[i],
12535 							   buffer);
12536 				}
12537 			}
12538 		} else {
12539 			error("%s: protocol_version %hu not supported",
12540 			      __func__, protocol_version);
12541 			goto unpack_error;
12542 		}
12543 
12544 		for (i = 0; i < gres_context_cnt; i++) {
12545 			if (gres_context[i].plugin_id == plugin_id)
12546 				break;
12547 		}
12548 		if (i >= gres_context_cnt) {
12549 			/*
12550 			 * A likely sign that GresPlugins has changed.
12551 			 * Not a fatal error, skip over the data.
12552 			 */
12553 			info("%s: no plugin configured to unpack data type %u from step %u.%u",
12554 			      __func__, plugin_id, job_id, step_id);
12555 			_step_state_delete(gres_step_ptr);
12556 			gres_step_ptr = NULL;
12557 			continue;
12558 		}
12559 		gres_ptr = xmalloc(sizeof(gres_state_t));
12560 		gres_ptr->plugin_id = gres_context[i].plugin_id;
12561 		gres_ptr->gres_data = gres_step_ptr;
12562 		gres_step_ptr = NULL;
12563 		list_append(*gres_list, gres_ptr);
12564 	}
12565 	slurm_mutex_unlock(&gres_context_lock);
12566 	return rc;
12567 
12568 unpack_error:
12569 	error("%s: unpack error from step %u.%u", __func__, job_id, step_id);
12570 	if (gres_step_ptr)
12571 		_step_state_delete(gres_step_ptr);
12572 	slurm_mutex_unlock(&gres_context_lock);
12573 	return SLURM_ERROR;
12574 }
12575 
12576 /* Return the count of GRES of a specific name on this machine
12577  * IN step_gres_list - generated by gres_plugin_step_alloc()
12578  * IN gres_name - name of the GRES to match
12579  * RET count of GRES of this specific name available to the job or NO_VAL64
12580  */
gres_plugin_step_count(List step_gres_list,char * gres_name)12581 extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name)
12582 {
12583 	uint64_t gres_cnt = NO_VAL64;
12584 	gres_state_t *gres_ptr = NULL;
12585 	gres_step_state_t *gres_step_ptr = NULL;
12586 	ListIterator gres_iter;
12587 	int i;
12588 
12589 	if (!step_gres_list)
12590 		return gres_cnt;
12591 
12592 	slurm_mutex_lock(&gres_context_lock);
12593 	for (i = 0; i < gres_context_cnt; i++) {
12594 		if (xstrcmp(gres_context[i].gres_name, gres_name))
12595 			continue;
12596 		gres_iter = list_iterator_create(step_gres_list);
12597 		while ((gres_ptr = (gres_state_t *)list_next(gres_iter))) {
12598 			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
12599 				continue;
12600 			gres_step_ptr = (gres_step_state_t*)gres_ptr->gres_data;
12601 			if (gres_cnt == NO_VAL64)
12602 				gres_cnt = gres_step_ptr->gres_per_node;
12603 			else
12604 				gres_cnt += gres_step_ptr->gres_per_node;
12605 		}
12606 		list_iterator_destroy(gres_iter);
12607 		break;
12608 	}
12609 	slurm_mutex_unlock(&gres_context_lock);
12610 
12611 	return gres_cnt;
12612 }
12613 
/*
 * Given a GRES context index, return a bitmap representing those GRES
 * which are available from the CPUs current allocated to this process.
 * This function only works with task/cgroup and constrained devices or
 * if the job step has access to the entire node's resources.
 *
 * IN context_inx - index into gres_context[] of the GRES plugin of interest
 * RET bitmap of MAX_GRES_BITMAP bits, or NULL on __APPLE__ builds, when
 *     gres_conf_list is NULL, or when the CPU affinity query fails
 *
 * Every gres_conf_list entry belonging to the plugin occupies "count"
 * consecutive bit positions. An entry's bits are all set when it has no
 * cpus_bitmap, or when at least one CPU in its cpus_bitmap is also in this
 * process's affinity mask.
 *
 * NOTE(review): in the __NetBSD__ build "mask" is a pointer obtained from
 * cpuset_create(), yet &mask is what gets passed on below, and the early
 * returns do not call cpuset_destroy() - confirm against the NetBSD API.
 */
static bitstr_t * _get_usable_gres(int context_inx)
{
#if defined(__APPLE__)
	return NULL;
#else
#ifdef __NetBSD__
	// On NetBSD, cpuset_t is an opaque data type
	cpuset_t *mask = cpuset_create();
#else
	cpu_set_t mask;
#endif
	bitstr_t *usable_gres = NULL;
	int i, i_last, rc;
	ListIterator iter;
	gres_slurmd_conf_t *gres_slurmd_conf;
	int gres_inx = 0;	/* first bit position of the current entry */

	if (!gres_conf_list) {
		error("gres_conf_list is null!");
		return NULL;
	}

	/* Fetch the set of CPUs this process is allowed to run on */
	CPU_ZERO(&mask);
#ifdef __FreeBSD__
	rc = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				sizeof(mask), &mask);
#else
	rc = sched_getaffinity(0, sizeof(mask), &mask);
#endif
	if (rc) {
		error("sched_getaffinity error: %m");
		return usable_gres;	/* still NULL at this point */
	}

	usable_gres = bit_alloc(MAX_GRES_BITMAP);
	iter = list_iterator_create(gres_conf_list);
	while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
		/* Skip configuration entries of other GRES plugins */
		if (gres_slurmd_conf->plugin_id !=
		    gres_context[context_inx].plugin_id)
			continue;
		/*
		 * An entry that would not fit in the bitmap is reported and
		 * skipped; gres_inx is not advanced for it.
		 */
		if ((gres_inx + gres_slurmd_conf->count) >= MAX_GRES_BITMAP) {
			error("GRES %s bitmap overflow ((%d + %"PRIu64") >= %d)",
			      gres_slurmd_conf->name, gres_inx,
			      gres_slurmd_conf->count, MAX_GRES_BITMAP);
			continue;
		}
		if (!gres_slurmd_conf->cpus_bitmap) {
			/* No CPU binding configured: usable from any CPU */
			bit_nset(usable_gres, gres_inx,
				 gres_inx + gres_slurmd_conf->count - 1);
		} else {
			/* Usable if any bound CPU is in our affinity mask */
			i_last = bit_fls(gres_slurmd_conf->cpus_bitmap);
			for (i = 0; i <= i_last; i++) {
				if (!bit_test(gres_slurmd_conf->cpus_bitmap, i))
					continue;
				if (!CPU_ISSET(i, &mask))
					continue;
				bit_nset(usable_gres, gres_inx,
					 gres_inx + gres_slurmd_conf->count -1);
				break;
			}
		}
		gres_inx += gres_slurmd_conf->count;
	}
	list_iterator_destroy(iter);

#ifdef __NetBSD__
	cpuset_destroy(mask);
#endif

	return usable_gres;
#endif
}
12692 
12693 /*
12694  * Configure the GRES hardware allocated to the current step while privileged
12695  *
12696  * IN step_gres_list - Step's GRES specification
12697  * IN node_id        - relative position of this node in step
12698  * IN settings       - string containing configuration settings for the hardware
12699  */
gres_plugin_step_hardware_init(List step_gres_list,uint32_t node_id,char * settings)12700 extern void gres_plugin_step_hardware_init(List step_gres_list,
12701 					   uint32_t node_id, char *settings)
12702 {
12703 	int i;
12704 	ListIterator iter;
12705 	gres_state_t *gres_ptr;
12706 	gres_step_state_t *gres_step_ptr;
12707 	bitstr_t *devices;
12708 
12709 	if (!step_gres_list)
12710 		return;
12711 
12712 	(void) gres_plugin_init();
12713 	slurm_mutex_lock(&gres_context_lock);
12714 	for (i = 0; i < gres_context_cnt; i++) {
12715 		if (gres_context[i].ops.step_hardware_init == NULL)
12716 			continue;
12717 
12718 		iter = list_iterator_create(step_gres_list);
12719 		while ((gres_ptr = list_next(iter))) {
12720 			if (gres_ptr->plugin_id == gres_context[i].plugin_id)
12721 				break;
12722 		}
12723 		list_iterator_destroy(iter);
12724 		if (!gres_ptr || !gres_ptr->gres_data)
12725 			continue;
12726 		gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
12727 		if ((gres_step_ptr->node_cnt != 1) ||
12728 		    !gres_step_ptr->gres_bit_alloc ||
12729 		    !gres_step_ptr->gres_bit_alloc[0])
12730 			continue;
12731 
12732 		devices = gres_step_ptr->gres_bit_alloc[0];
12733 		if (settings)
12734 			debug2("settings: %s", settings);
12735 		if (devices) {
12736 			char *dev_str = bit_fmt_full(devices);
12737 			info("devices: %s", dev_str);
12738 			xfree(dev_str);
12739 		}
12740 		(*(gres_context[i].ops.step_hardware_init))(devices, settings);
12741 	}
12742 	slurm_mutex_unlock(&gres_context_lock);
12743 }
12744 
12745 /*
12746  * Optionally undo GRES hardware configuration while privileged
12747  */
gres_plugin_step_hardware_fini(void)12748 extern void gres_plugin_step_hardware_fini(void)
12749 {
12750 	int i;
12751 	(void) gres_plugin_init();
12752 	slurm_mutex_lock(&gres_context_lock);
12753 	for (i = 0; i < gres_context_cnt; i++) {
12754 		if (gres_context[i].ops.step_hardware_fini == NULL) {
12755 			continue;
12756 		}
12757 		(*(gres_context[i].ops.step_hardware_fini)) ();
12758 	}
12759 	slurm_mutex_unlock(&gres_context_lock);
12760 }
12761 
12762 /*
12763  * Given a set GRES maps and the local process ID, return the bitmap of
12764  * GRES that should be available to this task.
12765  */
_get_gres_map(char * map_gres,int local_proc_id)12766 static bitstr_t *_get_gres_map(char *map_gres, int local_proc_id)
12767 {
12768 	bitstr_t *usable_gres = NULL;
12769 	char *tmp, *tok, *save_ptr = NULL, *mult;
12770 	int task_offset = 0, task_mult;
12771 	int map_value;
12772 
12773 	if (!map_gres || !map_gres[0])
12774 		return NULL;
12775 
12776 	while (usable_gres == NULL) {
12777 		tmp = xstrdup(map_gres);
12778 		tok = strtok_r(tmp, ",", &save_ptr);
12779 		while (tok) {
12780 			if ((mult = strchr(tok, '*'))) {
12781 				mult[0] = '\0';
12782 				task_mult = atoi(mult + 1);
12783 			} else
12784 				task_mult = 1;
12785 			if (task_mult == 0)
12786 				task_mult = 1;
12787 			if ((local_proc_id >= task_offset) &&
12788 			    (local_proc_id <= (task_offset + task_mult - 1))) {
12789 				map_value = strtol(tok, NULL, 0);
12790 				if ((map_value < 0) ||
12791 				    (map_value >= MAX_GRES_BITMAP)) {
12792 					xfree(tmp);
12793 					goto end;	/* Bad value */
12794 				}
12795 				usable_gres = bit_alloc(MAX_GRES_BITMAP);
12796 				bit_set(usable_gres, map_value);
12797 				break;	/* All done */
12798 			} else {
12799 				task_offset += task_mult;
12800 			}
12801 			tok = strtok_r(NULL, ",", &save_ptr);
12802 		}
12803 		xfree(tmp);
12804 	}
12805 end:
12806 
12807 	return usable_gres;
12808 }
12809 
12810 /*
12811  * Given a set GRES masks and the local process ID, return the bitmap of
12812  * GRES that should be available to this task.
12813  */
_get_gres_mask(char * mask_gres,int local_proc_id)12814 static bitstr_t * _get_gres_mask(char *mask_gres, int local_proc_id)
12815 {
12816 	bitstr_t *usable_gres = NULL;
12817 	char *tmp, *tok, *save_ptr = NULL, *mult;
12818 	int i, task_offset = 0, task_mult;
12819 	uint64_t mask_value;
12820 
12821 	if (!mask_gres || !mask_gres[0])
12822 		return NULL;
12823 
12824 	tmp = xstrdup(mask_gres);
12825 	tok = strtok_r(tmp, ",", &save_ptr);
12826 	while (tok) {
12827 		if ((mult = strchr(tok, '*')))
12828 			task_mult = atoi(mult + 1);
12829 		else
12830 			task_mult = 1;
12831 		if ((local_proc_id >= task_offset) &&
12832 		    (local_proc_id <= (task_offset + task_mult - 1))) {
12833 			mask_value = strtol(tok, NULL, 0);
12834 			if ((mask_value <= 0) || (mask_value >= 0xffffffff))
12835 				break;	/* Bad value */
12836 			usable_gres = bit_alloc(MAX_GRES_BITMAP);
12837 			for (i = 0; i < 64; i++) {
12838 				if ((mask_value >> i) & 0x1)
12839 					bit_set(usable_gres, i);
12840 			}
12841 			break;	/* All done */
12842 		} else {
12843 			task_offset += task_mult;
12844 		}
12845 		tok = strtok_r(NULL, ",", &save_ptr);
12846 	}
12847 	xfree(tmp);
12848 
12849 	return usable_gres;
12850 }
12851 
12852 /*
12853  * Set environment as required for all tasks of a job step
12854  * IN/OUT job_env_ptr - environment variable array
12855  * IN step_gres_list - generated by gres_plugin_step_alloc()
12856  * IN accel_bind_type - GRES binding options (old format, a bitmap)
12857  * IN tres_bind - TRES binding directives (new format, a string)
12858  * IN local_proc_id - task rank, local to this compute node only
12859  */
extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list,
				     uint16_t accel_bind_type, char *tres_bind,
				     int local_proc_id)
{
	/*
	 * For each loaded GRES plugin that exports step_set_env, hand the
	 * step's GRES record(s) for that plugin to the plugin so it can update
	 * the environment. When either accel_bind_type or tres_bind is given,
	 * the plugin's step_reset_env hook is used instead and additionally
	 * receives the bitmap of GRES usable by this task (may be NULL).
	 */
	int ctx_inx;
	ListIterator step_iter;
	gres_state_t *step_state = NULL;
	bool want_gpu = accel_bind_type & ACCEL_BIND_CLOSEST_GPU;
	bool want_nic = accel_bind_type & ACCEL_BIND_CLOSEST_NIC;
	bool want_mic = accel_bind_type & ACCEL_BIND_CLOSEST_MIC;
	char *gpu_spec, *gpu_map = NULL, *gpu_mask = NULL;
	bitstr_t *usable_gres = NULL;
	bool matched;
	/* Loop invariant: selects step_reset_env over step_set_env */
	bool use_reset = (accel_bind_type || tres_bind);

	/*
	 * tres_bind is only parsed when closest-GPU binding was not already
	 * requested through accel_bind_type. Recognized forms after "gpu:"
	 * are "closest", "map_gpu:<list>" and "mask_gpu:<list>".
	 */
	if (!want_gpu && tres_bind) {
		gpu_spec = strstr(tres_bind, "gpu:");
		if (gpu_spec) {
			gpu_spec += 4;
			if (strncasecmp(gpu_spec, "closest", 7) == 0)
				want_gpu = true;
			else if (strncasecmp(gpu_spec, "map_gpu:", 8) == 0)
				gpu_map = gpu_spec + 8;
			else if (strncasecmp(gpu_spec, "mask_gpu:", 9) == 0)
				gpu_mask = gpu_spec + 9;
		}
	}

	(void) gres_plugin_init();
	slurm_mutex_lock(&gres_context_lock);
	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) {
		if (gres_context[ctx_inx].ops.step_set_env == NULL)
			continue;	/* No plugin to call */

		/*
		 * When any binding was requested, only the gpu/mic/nic plugins
		 * with a matching request are processed; all others are
		 * skipped entirely.
		 */
		if (want_gpu || want_mic || want_nic || gpu_map || gpu_mask) {
			char *name = gres_context[ctx_inx].gres_name;

			if (xstrcmp(name, "gpu") == 0) {
				if (gpu_map) {
					usable_gres = _get_gres_map(
							gpu_map, local_proc_id);
				} else if (gpu_mask) {
					usable_gres = _get_gres_mask(
							gpu_mask, local_proc_id);
				} else if (want_gpu) {
					usable_gres = _get_usable_gres(ctx_inx);
				} else {
					continue;
				}
			} else if (xstrcmp(name, "mic") == 0) {
				if (!want_mic)
					continue;
				usable_gres = _get_usable_gres(ctx_inx);
			} else if (xstrcmp(name, "nic") == 0) {
				if (!want_nic)
					continue;
				usable_gres = _get_usable_gres(ctx_inx);
			} else {
				continue;
			}
		}

		matched = false;
		if (step_gres_list) {
			step_iter = list_iterator_create(step_gres_list);
			while ((step_state = (gres_state_t *)
						list_next(step_iter))) {
				if (step_state->plugin_id ==
				    gres_context[ctx_inx].plugin_id) {
					if (use_reset) {
						gres_context[ctx_inx].ops.
							step_reset_env(
							job_env_ptr,
							step_state->gres_data,
							usable_gres);
					} else {
						gres_context[ctx_inx].ops.
							step_set_env(
							job_env_ptr,
							step_state->gres_data);
					}
					matched = true;
				}
			}
			list_iterator_destroy(step_iter);
		}

		if (!matched) {	/* No data found: call the hook with NULL */
			if (use_reset) {
				gres_context[ctx_inx].ops.step_reset_env(
					job_env_ptr, NULL, NULL);
			} else {
				gres_context[ctx_inx].ops.step_set_env(
					job_env_ptr, NULL);
			}
		}
		FREE_NULL_BITMAP(usable_gres);
	}
	slurm_mutex_unlock(&gres_context_lock);
	FREE_NULL_BITMAP(usable_gres);
}
12953 
_step_state_log(void * gres_data,uint32_t job_id,uint32_t step_id,char * gres_name)12954 static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
12955 			    char *gres_name)
12956 {
12957 	gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
12958 	char tmp_str[128];
12959 	int i;
12960 
12961 	xassert(gres_ptr);
12962 	info("gres:%s type:%s(%u) step:%u.%u flags:%s state", gres_name,
12963 	     gres_ptr->type_name, gres_ptr->type_id, job_id, step_id,
12964 	     _gres_flags_str(gres_ptr->flags));
12965 	if (gres_ptr->cpus_per_gres)
12966 		info("  cpus_per_gres:%u", gres_ptr->cpus_per_gres);
12967 	if (gres_ptr->gres_per_step)
12968 		info("  gres_per_step:%"PRIu64, gres_ptr->gres_per_step);
12969 	if (gres_ptr->gres_per_node) {
12970 		info("  gres_per_node:%"PRIu64" node_cnt:%u",
12971 		     gres_ptr->gres_per_node, gres_ptr->node_cnt);
12972 	}
12973 	if (gres_ptr->gres_per_socket)
12974 		info("  gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
12975 	if (gres_ptr->gres_per_task)
12976 		info("  gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
12977 	if (gres_ptr->mem_per_gres)
12978 		info("  mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
12979 
12980 	if (gres_ptr->node_in_use == NULL)
12981 		info("  node_in_use:NULL");
12982 	else if (gres_ptr->gres_bit_alloc == NULL)
12983 		info("  gres_bit_alloc:NULL");
12984 	else {
12985 		for (i = 0; i < gres_ptr->node_cnt; i++) {
12986 			if (!bit_test(gres_ptr->node_in_use, i))
12987 				continue;
12988 			if (gres_ptr->gres_bit_alloc[i]) {
12989 				bit_fmt(tmp_str, sizeof(tmp_str),
12990 					gres_ptr->gres_bit_alloc[i]);
12991 				info("  gres_bit_alloc[%d]:%s of %d", i,
12992 				     tmp_str,
12993 				     (int)bit_size(gres_ptr->gres_bit_alloc[i]));
12994 			} else
12995 				info("  gres_bit_alloc[%d]:NULL", i);
12996 		}
12997 	}
12998 }
12999 
13000 /*
13001  * Log a step's current gres state
13002  * IN gres_list - generated by gres_plugin_step_alloc()
13003  * IN job_id - job's ID
13004  */
gres_plugin_step_state_log(List gres_list,uint32_t job_id,uint32_t step_id)13005 extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
13006 				       uint32_t step_id)
13007 {
13008 	int i;
13009 	ListIterator gres_iter;
13010 	gres_state_t *gres_ptr;
13011 
13012 	if (!gres_debug || (gres_list == NULL))
13013 		return;
13014 
13015 	(void) gres_plugin_init();
13016 
13017 	slurm_mutex_lock(&gres_context_lock);
13018 	gres_iter = list_iterator_create(gres_list);
13019 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
13020 		for (i = 0; i < gres_context_cnt; i++) {
13021 			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
13022 				continue;
13023 			_step_state_log(gres_ptr->gres_data, job_id, step_id,
13024 					gres_context[i].gres_name);
13025 			break;
13026 		}
13027 	}
13028 	list_iterator_destroy(gres_iter);
13029 	slurm_mutex_unlock(&gres_context_lock);
13030 }
13031 
13032 /*
13033  * Determine how many cores of a job's allocation can be allocated to a step
13034  *	on a specific node
13035  * IN job_gres_list - a running job's gres info
13036  * IN/OUT step_gres_list - a pending job step's gres requirements
13037  * IN node_offset - index into the job's node allocation
13038  * IN first_step_node - true if this is node zero of the step (do initialization)
13039  * IN cpus_per_task - number of CPUs required per task
13040  * IN max_rem_nodes - maximum nodes remaining for step (including this one)
13041  * IN ignore_alloc - if set ignore resources already allocated to running steps
13042  * IN job_id, step_id - ID of the step being allocated.
13043  * RET Count of available cores on this node (sort of):
13044  *     NO_VAL64 if no limit or 0 if node is not usable
13045  */
gres_plugin_step_test(List step_gres_list,List job_gres_list,int node_offset,bool first_step_node,uint16_t cpus_per_task,int max_rem_nodes,bool ignore_alloc,uint32_t job_id,uint32_t step_id)13046 extern uint64_t gres_plugin_step_test(List step_gres_list, List job_gres_list,
13047 				      int node_offset, bool first_step_node,
13048 				      uint16_t cpus_per_task, int max_rem_nodes,
13049 				      bool ignore_alloc,
13050 				      uint32_t job_id, uint32_t step_id)
13051 {
13052 	uint64_t core_cnt, tmp_cnt;
13053 	ListIterator step_gres_iter;
13054 	gres_state_t *job_gres_ptr, *step_gres_ptr;
13055 	gres_step_state_t *step_data_ptr = NULL;
13056 
13057 	if (step_gres_list == NULL)
13058 		return NO_VAL64;
13059 	if (job_gres_list == NULL)
13060 		return 0;
13061 
13062 	if (cpus_per_task == 0)
13063 		cpus_per_task = 1;
13064 	core_cnt = NO_VAL64;
13065 	(void) gres_plugin_init();
13066 
13067 	slurm_mutex_lock(&gres_context_lock);
13068 	step_gres_iter = list_iterator_create(step_gres_list);
13069 	while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13070 		gres_key_t job_search_key;
13071 		step_data_ptr = (gres_step_state_t *)step_gres_ptr->gres_data;
13072 		job_search_key.plugin_id = step_gres_ptr->plugin_id;
13073 		if (step_data_ptr->type_name)
13074 			job_search_key.type_id = step_data_ptr->type_id;
13075 		else
13076 			job_search_key.type_id = NO_VAL;
13077 
13078 		job_search_key.node_offset = node_offset;
13079 		if (!(job_gres_ptr = list_find_first(
13080 			      job_gres_list,
13081 			      _gres_find_job_by_key_with_cnt,
13082 			      &job_search_key))) {
13083 			/* job lack resources required by the step */
13084 			core_cnt = 0;
13085 			break;
13086 		}
13087 
13088 		tmp_cnt = _step_test(step_data_ptr,
13089 				     job_gres_ptr->gres_data,
13090 				     node_offset, first_step_node,
13091 				     cpus_per_task, max_rem_nodes,
13092 				     ignore_alloc,
13093 				     job_id, step_id,
13094 				     step_gres_ptr->plugin_id);
13095 		if ((tmp_cnt != NO_VAL64) && (tmp_cnt < core_cnt))
13096 			core_cnt = tmp_cnt;
13097 
13098 		if (core_cnt == 0)
13099 			break;
13100 	}
13101 	list_iterator_destroy(step_gres_iter);
13102 	slurm_mutex_unlock(&gres_context_lock);
13103 
13104 	return core_cnt;
13105 }
13106 
13107 /*
13108  * Return TRUE if this plugin ID consumes GRES count > 1 for a single device
13109  * file (e.g. MPS)
13110  */
_shared_gres(uint32_t plugin_id)13111 static bool _shared_gres(uint32_t plugin_id)
13112 {
13113 	if (plugin_id == mps_plugin_id)
13114 		return true;
13115 	return false;
13116 }
13117 /*
13118  * Return TRUE if this plugin ID shares resources with another GRES that
13119  * consumes subsets of its resources (e.g. GPU)
13120  */
_sharing_gres(uint32_t plugin_id)13121 static bool _sharing_gres(uint32_t plugin_id)
13122 {
13123 	if (plugin_id == gpu_plugin_id)
13124 		return true;
13125 	return false;
13126 }
13127 
_step_alloc(void * step_gres_data,void * job_gres_data,uint32_t plugin_id,int node_offset,bool first_step_node,uint32_t job_id,uint32_t step_id,uint16_t tasks_on_node,uint32_t rem_nodes)13128 static int _step_alloc(void *step_gres_data, void *job_gres_data,
13129 		       uint32_t plugin_id, int node_offset,
13130 		       bool first_step_node,
13131 		       uint32_t job_id, uint32_t step_id,
13132 		       uint16_t tasks_on_node, uint32_t rem_nodes)
13133 {
13134 	gres_job_state_t  *job_gres_ptr  = (gres_job_state_t *)  job_gres_data;
13135 	gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
13136 	uint64_t gres_needed, gres_avail, max_gres = 0;
13137 	bitstr_t *gres_bit_alloc;
13138 	int i, len;
13139 
13140 	xassert(job_gres_ptr);
13141 	xassert(step_gres_ptr);
13142 
13143 	if (job_gres_ptr->node_cnt == 0)	/* no_consume */
13144 		return SLURM_SUCCESS;
13145 
13146 	if (node_offset >= job_gres_ptr->node_cnt) {
13147 		error("gres/%s: %s for %u.%u, node offset invalid (%d >= %u)",
13148 		      job_gres_ptr->gres_name, __func__, job_id,
13149 		      step_id, node_offset,
13150 		      job_gres_ptr->node_cnt);
13151 		return SLURM_ERROR;
13152 	}
13153 
13154 	if (first_step_node)
13155 		step_gres_ptr->total_gres = 0;
13156 	if (step_gres_ptr->gres_per_node) {
13157 		gres_needed = step_gres_ptr->gres_per_node;
13158 	} else if (step_gres_ptr->gres_per_task) {
13159 		gres_needed = step_gres_ptr->gres_per_task * tasks_on_node;
13160 	} else if (step_gres_ptr->gres_per_step && (rem_nodes == 1)) {
13161 		gres_needed = step_gres_ptr->gres_per_step -
13162 			      step_gres_ptr->total_gres;
13163 	} else if (step_gres_ptr->gres_per_step) {
13164 		/* Leave at least one GRES per remaining node */
13165 		max_gres = step_gres_ptr->gres_per_step -
13166 			   step_gres_ptr->total_gres - (rem_nodes - 1);
13167 		gres_needed = 1;
13168 	} else {
13169 		/*
13170 		 * No explicit step GRES specification.
13171 		 * Note that gres_per_socket is not supported for steps
13172 		 */
13173 		gres_needed = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13174 	}
13175 	if (step_gres_ptr->node_cnt == 0)
13176 		step_gres_ptr->node_cnt = job_gres_ptr->node_cnt;
13177 	if (!step_gres_ptr->gres_cnt_node_alloc) {
13178 		step_gres_ptr->gres_cnt_node_alloc =
13179 			xcalloc(step_gres_ptr->node_cnt, sizeof(uint64_t));
13180 	}
13181 
13182 	if (job_gres_ptr->gres_cnt_node_alloc &&
13183 	    job_gres_ptr->gres_cnt_node_alloc[node_offset])
13184 		gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13185 	else if (job_gres_ptr->gres_bit_select &&
13186 		 job_gres_ptr->gres_bit_select[node_offset])
13187 		gres_avail = bit_set_count(
13188 				job_gres_ptr->gres_bit_select[node_offset]);
13189 	else if (job_gres_ptr->gres_cnt_node_alloc)
13190 		gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
13191 	else
13192 		gres_avail = job_gres_ptr->gres_per_node;
13193 	if (gres_needed > gres_avail) {
13194 		error("gres/%s: %s for %u.%u, step's > job's "
13195 		      "for node %d (%"PRIu64" > %"PRIu64")",
13196 		      job_gres_ptr->gres_name, __func__, job_id,
13197 		      step_id, node_offset, gres_needed, gres_avail);
13198 		return SLURM_ERROR;
13199 	}
13200 
13201 	if (!job_gres_ptr->gres_cnt_step_alloc) {
13202 		job_gres_ptr->gres_cnt_step_alloc =
13203 			xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
13204 	}
13205 
13206 	if (gres_needed >
13207 	    (gres_avail - job_gres_ptr->gres_cnt_step_alloc[node_offset])) {
13208 		error("gres/%s: %s for %u.%u, step's > job's "
13209 		      "remaining for node %d (%"PRIu64" > "
13210 		      "(%"PRIu64" - %"PRIu64"))",
13211 		      job_gres_ptr->gres_name, __func__, job_id,
13212 		      step_id, node_offset, gres_needed, gres_avail,
13213 		      job_gres_ptr->gres_cnt_step_alloc[node_offset]);
13214 		return SLURM_ERROR;
13215 	}
13216 	gres_avail -= job_gres_ptr->gres_cnt_step_alloc[node_offset];
13217 	if (max_gres)
13218 		gres_needed = MIN(gres_avail, max_gres);
13219 
13220 	if (step_gres_ptr->gres_cnt_node_alloc &&
13221 	    (node_offset < step_gres_ptr->node_cnt))
13222 		step_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_needed;
13223 	step_gres_ptr->total_gres += gres_needed;
13224 
13225 	if (step_gres_ptr->node_in_use == NULL) {
13226 		step_gres_ptr->node_in_use = bit_alloc(job_gres_ptr->node_cnt);
13227 	}
13228 	bit_set(step_gres_ptr->node_in_use, node_offset);
13229 	job_gres_ptr->gres_cnt_step_alloc[node_offset] += gres_needed;
13230 
13231 	if ((job_gres_ptr->gres_bit_alloc == NULL) ||
13232 	    (job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) {
13233 		debug3("gres/%s: %s gres_bit_alloc for %u.%u is NULL",
13234 		       job_gres_ptr->gres_name, __func__, job_id, step_id);
13235 		return SLURM_SUCCESS;
13236 	}
13237 
13238 	gres_bit_alloc = bit_copy(job_gres_ptr->gres_bit_alloc[node_offset]);
13239 	len = bit_size(gres_bit_alloc);
13240 	if (_shared_gres(plugin_id)) {
13241 		for (i = 0; i < len; i++) {
13242 			if (gres_needed > 0) {
13243 				if (bit_test(gres_bit_alloc, i))
13244 					gres_needed = 0;
13245 			} else {
13246 				bit_clear(gres_bit_alloc, i);
13247 			}
13248 		}
13249 	} else {
13250 		if (job_gres_ptr->gres_bit_step_alloc &&
13251 		    job_gres_ptr->gres_bit_step_alloc[node_offset]) {
13252 			bit_and_not(gres_bit_alloc,
13253 				job_gres_ptr->gres_bit_step_alloc[node_offset]);
13254 		}
13255 		for (i = 0; i < len; i++) {
13256 			if (gres_needed > 0) {
13257 				if (bit_test(gres_bit_alloc, i))
13258 					gres_needed--;
13259 			} else {
13260 				bit_clear(gres_bit_alloc, i);
13261 			}
13262 		}
13263 	}
13264 	if (gres_needed) {
13265 		error("gres/%s: %s step %u.%u oversubscribed resources on node %d",
13266 		      job_gres_ptr->gres_name, __func__,
13267 		      job_id, step_id, node_offset);
13268 	}
13269 
13270 	if (job_gres_ptr->gres_bit_step_alloc == NULL) {
13271 		job_gres_ptr->gres_bit_step_alloc =
13272 			xcalloc(job_gres_ptr->node_cnt, sizeof(bitstr_t *));
13273 	}
13274 	if (job_gres_ptr->gres_bit_step_alloc[node_offset]) {
13275 		bit_or(job_gres_ptr->gres_bit_step_alloc[node_offset],
13276 		       gres_bit_alloc);
13277 	} else {
13278 		job_gres_ptr->gres_bit_step_alloc[node_offset] =
13279 			bit_copy(gres_bit_alloc);
13280 	}
13281 	if (step_gres_ptr->gres_bit_alloc == NULL) {
13282 		step_gres_ptr->gres_bit_alloc = xcalloc(job_gres_ptr->node_cnt,
13283 							sizeof(bitstr_t *));
13284 	}
13285 	if (step_gres_ptr->gres_bit_alloc[node_offset]) {
13286 		error("gres/%s: %s step %u.%u bit_alloc already exists",
13287 		      job_gres_ptr->gres_name, __func__, job_id, step_id);
13288 		bit_or(step_gres_ptr->gres_bit_alloc[node_offset],
13289 		       gres_bit_alloc);
13290 		FREE_NULL_BITMAP(gres_bit_alloc);
13291 	} else {
13292 		step_gres_ptr->gres_bit_alloc[node_offset] = gres_bit_alloc;
13293 	}
13294 
13295 	return SLURM_SUCCESS;
13296 }
13297 
13298 /*
13299  * Allocate resource to a step and update job and step gres information
13300  * IN step_gres_list - step's gres_list built by
13301  *		gres_plugin_step_state_validate()
13302  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13303  * IN node_offset - job's zero-origin index to the node of interest
13304  * IN first_step_node - true if this is the first node in the step's allocation
13305  * IN tasks_on_node - number of tasks to be launched on this node
13306  * IN rem_nodes - desired additional node count to allocate, including this node
13307  * IN job_id, step_id - ID of the step being allocated.
13308  * RET SLURM_SUCCESS or error code
13309  */
gres_plugin_step_alloc(List step_gres_list,List job_gres_list,int node_offset,bool first_step_node,uint16_t tasks_on_node,uint32_t rem_nodes,uint32_t job_id,uint32_t step_id)13310 extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list,
13311 				  int node_offset, bool first_step_node,
13312 				  uint16_t tasks_on_node, uint32_t rem_nodes,
13313 				  uint32_t job_id, uint32_t step_id)
13314 {
13315 	int rc, rc2;
13316 	ListIterator step_gres_iter;
13317 	gres_state_t *step_gres_ptr, *job_gres_ptr;
13318 
13319 	if (step_gres_list == NULL)
13320 		return SLURM_SUCCESS;
13321 	if (job_gres_list == NULL) {
13322 		error("%s: step allocates GRES, but job %u has none",
13323 		      __func__, job_id);
13324 		return SLURM_ERROR;
13325 	}
13326 
13327 	rc = gres_plugin_init();
13328 
13329 	slurm_mutex_lock(&gres_context_lock);
13330 	step_gres_iter = list_iterator_create(step_gres_list);
13331 	while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13332 		gres_step_state_t *step_data_ptr =
13333 			(gres_step_state_t *) step_gres_ptr->gres_data;
13334 		gres_key_t job_search_key;
13335 		step_data_ptr = (gres_step_state_t *)step_gres_ptr->gres_data;
13336 		job_search_key.plugin_id = step_gres_ptr->plugin_id;
13337 		if (step_data_ptr->type_name)
13338 			job_search_key.type_id = step_data_ptr->type_id;
13339 		else
13340 			job_search_key.type_id = NO_VAL;
13341 
13342 		job_search_key.node_offset = node_offset;
13343 		if (!(job_gres_ptr = list_find_first(
13344 			      job_gres_list,
13345 			      _gres_find_job_by_key_with_cnt,
13346 			      &job_search_key))) {
13347 			/* job lack resources required by the step */
13348 			rc = ESLURM_INVALID_GRES;
13349 			break;
13350 		}
13351 
13352 		rc2 = _step_alloc(step_data_ptr,
13353 				  job_gres_ptr->gres_data,
13354 				  step_gres_ptr->plugin_id, node_offset,
13355 				  first_step_node,
13356 				  job_id, step_id, tasks_on_node, rem_nodes);
13357 		if (rc2 != SLURM_SUCCESS)
13358 			rc = rc2;
13359 	}
13360 	list_iterator_destroy(step_gres_iter);
13361 	slurm_mutex_unlock(&gres_context_lock);
13362 
13363 	return rc;
13364 }
13365 
13366 
/*
 * Return one step GRES record's resources to the owning job record(s)
 * IN/OUT step_gres_ptr - step's GRES list entry; its per-node bitmaps are
 *	freed as they are released
 * IN/OUT job_gres_list - job's GRES list, searched per node for the record
 *	matching the step's plugin (and type, if the step named one)
 * IN job_id, step_id - IDs used in log messages
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
			 uint32_t job_id, uint32_t step_id)
{
	gres_state_t *job_gres_ptr;
	gres_step_state_t *step_data_ptr =
		(gres_step_state_t *)step_gres_ptr->gres_data;
	gres_job_state_t *job_data_ptr;
	uint32_t i, j;
	uint64_t gres_cnt;
	int len_j, len_s;
	gres_key_t job_search_key;

	xassert(job_gres_list);
	xassert(step_data_ptr);

	job_search_key.plugin_id = step_gres_ptr->plugin_id;
	if (step_data_ptr->type_name)
		job_search_key.type_id = step_data_ptr->type_id;
	else
		job_search_key.type_id = NO_VAL;
	for (i = 0; i < step_data_ptr->node_cnt; i++) {
		job_search_key.node_offset = i;
		if (!(job_gres_ptr = list_find_first(
			      job_gres_list,
			      _gres_find_job_by_key_with_cnt,
			      &job_search_key)))
			continue;

		job_data_ptr = (gres_job_state_t *)job_gres_ptr->gres_data;
		if (job_data_ptr->node_cnt == 0) {	/* no_consume */
			xassert(!step_data_ptr->node_in_use);
			xassert(!step_data_ptr->gres_bit_alloc);
			return SLURM_SUCCESS;
		} else if (job_data_ptr->node_cnt <= i) {
			/*
			 * Index i is past the job's per-node arrays
			 * (valid indexes are 0 .. node_cnt - 1).
			 */
			return SLURM_SUCCESS;
		}

		if (!step_data_ptr->node_in_use) {
			error("gres/%s: %s step %u.%u dealloc, node_in_use is NULL",
			      job_data_ptr->gres_name, __func__,
			      job_id, step_id);
			return SLURM_ERROR;
		}

		if (!bit_test(step_data_ptr->node_in_use, i))
			continue;

		/* Give the step's count on this node back to the job */
		if (step_data_ptr->gres_cnt_node_alloc)
			gres_cnt = step_data_ptr->gres_cnt_node_alloc[i];
		else
			gres_cnt = step_data_ptr->gres_per_node;

		if (job_data_ptr->gres_cnt_step_alloc) {
			if (job_data_ptr->gres_cnt_step_alloc[i] >=
			    gres_cnt) {
				job_data_ptr->gres_cnt_step_alloc[i] -=
					gres_cnt;
			} else {
				error("gres/%s: %s step %u.%u dealloc count underflow",
				      job_data_ptr->gres_name, __func__,
				      job_id, step_id);
				job_data_ptr->gres_cnt_step_alloc[i] = 0;
			}
		}

		/* Nothing more to do when the step holds no bitmap here */
		if ((step_data_ptr->gres_bit_alloc == NULL) ||
		    (step_data_ptr->gres_bit_alloc[i] == NULL))
			continue;
		/* The job's bitmap array itself may be absent, not just [i] */
		if ((job_data_ptr->gres_bit_alloc == NULL) ||
		    (job_data_ptr->gres_bit_alloc[i] == NULL)) {
			error("gres/%s: %s job %u gres_bit_alloc[%u] is NULL",
			      job_data_ptr->gres_name, __func__, job_id, i);
			continue;
		}
		len_j = bit_size(job_data_ptr->gres_bit_alloc[i]);
		len_s = bit_size(step_data_ptr->gres_bit_alloc[i]);
		if (len_j != len_s) {
			error("gres/%s: %s step %u.%u dealloc, bit_alloc[%u] size mis-match (%d != %d)",
			      job_data_ptr->gres_name, __func__,
			      job_id, step_id, i, len_j, len_s);
			len_j = MIN(len_j, len_s);
		}

		/* Clear the step's bits from the job's per-step bitmap */
		if (job_data_ptr->gres_bit_step_alloc &&
		    job_data_ptr->gres_bit_step_alloc[i]) {
			for (j = 0; j < len_j; j++) {
				if (!bit_test(step_data_ptr->gres_bit_alloc[i],
					      j))
					continue;
				bit_clear(job_data_ptr->gres_bit_step_alloc[i],
					  j);
			}
		}
		FREE_NULL_BITMAP(step_data_ptr->gres_bit_alloc[i]);
	}

	return SLURM_SUCCESS;
}
13460 
13461 /*
13462  * Deallocate resource to a step and update job and step gres information
13463  * IN step_gres_list - step's gres_list built by
13464  *		gres_plugin_step_state_validate()
13465  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13466  * IN job_id, step_id - ID of the step being allocated.
13467  * RET SLURM_SUCCESS or error code
13468  */
gres_plugin_step_dealloc(List step_gres_list,List job_gres_list,uint32_t job_id,uint32_t step_id)13469 extern int gres_plugin_step_dealloc(List step_gres_list, List job_gres_list,
13470 				    uint32_t job_id, uint32_t step_id)
13471 {
13472 	int rc, rc2;
13473 	ListIterator step_gres_iter;
13474 	gres_state_t *step_gres_ptr;
13475 
13476 	if (step_gres_list == NULL)
13477 		return SLURM_SUCCESS;
13478 	if (job_gres_list == NULL) {
13479 		error("%s: step deallocates gres, but job %u has none",
13480 		      __func__, job_id);
13481 		return SLURM_ERROR;
13482 	}
13483 
13484 	rc = gres_plugin_init();
13485 
13486 	slurm_mutex_lock(&gres_context_lock);
13487 	step_gres_iter = list_iterator_create(step_gres_list);
13488 	while ((step_gres_ptr = list_next(step_gres_iter))) {
13489 		rc2 = _step_dealloc(step_gres_ptr,
13490 				    job_gres_list,
13491 				    job_id, step_id);
13492 		if (rc2 != SLURM_SUCCESS)
13493 			rc = rc2;
13494 	}
13495 	list_iterator_destroy(step_gres_iter);
13496 	slurm_mutex_unlock(&gres_context_lock);
13497 
13498 	return rc;
13499 }
13500 
13501 /*
13502  * Determine total count GRES of a given type are allocated to a job across
13503  * all nodes
13504  * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
13505  * IN gres_name - name of a GRES type
13506  * RET count of this GRES allocated to this job
13507  */
gres_get_value_by_type(List job_gres_list,char * gres_name)13508 extern uint64_t gres_get_value_by_type(List job_gres_list, char *gres_name)
13509 {
13510 	int i;
13511 	uint32_t plugin_id;
13512 	uint64_t gres_cnt = 0;
13513 	ListIterator job_gres_iter;
13514 	gres_state_t *job_gres_ptr;
13515 	gres_job_state_t *job_gres_data;
13516 
13517 	if (job_gres_list == NULL)
13518 		return NO_VAL64;
13519 
13520 	gres_cnt = NO_VAL64;
13521 	(void) gres_plugin_init();
13522 	plugin_id = gres_plugin_build_id(gres_name);
13523 
13524 	slurm_mutex_lock(&gres_context_lock);
13525 	job_gres_iter = list_iterator_create(job_gres_list);
13526 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13527 		for (i = 0; i < gres_context_cnt; i++) {
13528 			if (job_gres_ptr->plugin_id != plugin_id)
13529 				continue;
13530 			job_gres_data = (gres_job_state_t *)
13531 					job_gres_ptr->gres_data;
13532 			gres_cnt = job_gres_data->gres_per_node;
13533 			break;
13534 		}
13535 	}
13536 	list_iterator_destroy(job_gres_iter);
13537 	slurm_mutex_unlock(&gres_context_lock);
13538 
13539 	return gres_cnt;
13540 }
13541 
13542 /*
13543  * Fill in an array of GRES type IDs contained within the given job gres_list
13544  *		and an array of corresponding counts of those GRES types.
13545  * IN gres_list - a List of GRES types allocated to a job.
13546  * IN arr_len - Length of the arrays (the number of elements in the gres_list).
13547  * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found
13548  *	 	in the gres_list.
13549  * RET SLURM_SUCCESS or error code
13550  */
gres_plugin_job_count(List gres_list,int arr_len,uint32_t * gres_count_ids,uint64_t * gres_count_vals)13551 extern int gres_plugin_job_count(List gres_list, int arr_len,
13552 				 uint32_t *gres_count_ids,
13553 				 uint64_t *gres_count_vals)
13554 {
13555 	ListIterator  job_gres_iter;
13556 	gres_state_t *job_gres_ptr;
13557 	void         *job_gres_data;
13558 	int           rc, ix = 0;
13559 
13560 	rc = gres_plugin_init();
13561 	if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
13562 		rc = EINVAL;
13563 	if (rc != SLURM_SUCCESS)
13564 		return rc;
13565 
13566 	slurm_mutex_lock(&gres_context_lock);
13567 
13568 	job_gres_iter = list_iterator_create(gres_list);
13569 	while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
13570 		gres_job_state_t *job_gres_state_ptr;
13571 		job_gres_data = job_gres_ptr->gres_data;
13572 		job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
13573 		xassert(job_gres_state_ptr);
13574 
13575 		gres_count_ids[ix]  = job_gres_ptr->plugin_id;
13576 		if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
13577 			gres_count_vals[ix] = 0;
13578 		else
13579 			gres_count_vals[ix] = job_gres_state_ptr->total_gres;
13580 		if (++ix >= arr_len)
13581 			break;
13582 	}
13583 	list_iterator_destroy(job_gres_iter);
13584 
13585 	slurm_mutex_unlock(&gres_context_lock);
13586 
13587 	return rc;
13588 }
13589 
13590 /*
13591  * Build a string identifying total GRES counts of each type
13592  * IN gres_list - a List of GRES types allocated to a job.
13593  * RET string containing comma-separated list of gres type:model:count
13594  *     must release memory using xfree()
13595  */
gres_plugin_job_alloc_count(List gres_list)13596 extern char *gres_plugin_job_alloc_count(List gres_list)
13597 {
13598 	ListIterator  job_gres_iter;
13599 	gres_state_t *job_gres_ptr;
13600 	void         *job_gres_data;
13601 	char         *gres_alloc = NULL, *gres_name, *sep = "";
13602 	int           i;
13603 
13604 	(void) gres_plugin_init();
13605 	slurm_mutex_lock(&gres_context_lock);
13606 
13607 	job_gres_iter = list_iterator_create(gres_list);
13608 	while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
13609 		gres_job_state_t *job_gres_state_ptr;
13610 		uint64_t total_gres;
13611 
13612 		job_gres_data = job_gres_ptr->gres_data;
13613 		job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
13614 		if (!job_gres_state_ptr) {
13615 			error("%s: job gres_data is NULL", __func__);
13616 			continue;
13617 		}
13618 		gres_name = "UNKNOWN";
13619 		for (i = 0; i < gres_context_cnt; i++) {
13620 			if (gres_context[i].plugin_id !=
13621 			    job_gres_ptr->plugin_id)
13622 				continue;
13623 			gres_name = gres_context[i].gres_name;
13624 		}
13625 
13626 		if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
13627 			total_gres = 0;
13628 		else
13629 			total_gres = job_gres_state_ptr->total_gres;
13630 
13631 		if (job_gres_state_ptr->type_name) {
13632 			xstrfmtcat(gres_alloc, "%s%s:%s:%"PRIu64, sep,
13633 				   gres_name, job_gres_state_ptr->type_name,
13634 				   total_gres);
13635 		} else {
13636 			xstrfmtcat(gres_alloc, "%s%s:%"PRIu64, sep, gres_name,
13637 				   total_gres);
13638 		}
13639 		sep = ",";
13640 	}
13641 	list_iterator_destroy(job_gres_iter);
13642 
13643 	slurm_mutex_unlock(&gres_context_lock);
13644 
13645 	return gres_alloc;
13646 }
13647 /*
13648  * Fill in an array of GRES type ids contained within the given node gres_list
13649  *		and an array of corresponding counts of those GRES types.
13650  * IN gres_list - a List of GRES types found on a node.
13651  * IN arrlen - Length of the arrays (the number of elements in the gres_list).
13652  * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found
13653  *	 	in the gres_list.
13654  * IN val_type - Type of value desired, see GRES_VAL_TYPE_*
13655  * RET SLURM_SUCCESS or error code
13656  */
gres_plugin_node_count(List gres_list,int arr_len,uint32_t * gres_count_ids,uint64_t * gres_count_vals,int val_type)13657 extern int gres_plugin_node_count(List gres_list, int arr_len,
13658 				  uint32_t *gres_count_ids,
13659 				  uint64_t *gres_count_vals,
13660 				  int val_type)
13661 {
13662 	ListIterator  node_gres_iter;
13663 	gres_state_t* node_gres_ptr;
13664 	void*         node_gres_data;
13665 	uint64_t      val;
13666 	int           rc, ix = 0;
13667 
13668 	rc = gres_plugin_init();
13669 	if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
13670 		rc = EINVAL;
13671 	if (rc != SLURM_SUCCESS)
13672 		return rc;
13673 
13674 	slurm_mutex_lock(&gres_context_lock);
13675 
13676 	node_gres_iter = list_iterator_create(gres_list);
13677 	while ((node_gres_ptr = (gres_state_t*) list_next(node_gres_iter))) {
13678 		gres_node_state_t *node_gres_state_ptr;
13679 		val = 0;
13680 		node_gres_data = node_gres_ptr->gres_data;
13681 		node_gres_state_ptr = (gres_node_state_t *) node_gres_data;
13682 		xassert(node_gres_state_ptr);
13683 
13684 		switch (val_type) {
13685 		case (GRES_VAL_TYPE_FOUND):
13686 			val = node_gres_state_ptr->gres_cnt_found;
13687 			break;
13688 		case (GRES_VAL_TYPE_CONFIG):
13689 			val = node_gres_state_ptr->gres_cnt_config;
13690 			break;
13691 		case (GRES_VAL_TYPE_AVAIL):
13692 			val = node_gres_state_ptr->gres_cnt_avail;
13693 			break;
13694 		case (GRES_VAL_TYPE_ALLOC):
13695 			val = node_gres_state_ptr->gres_cnt_alloc;
13696 		}
13697 
13698 		gres_count_ids[ix]  = node_gres_ptr->plugin_id;
13699 		gres_count_vals[ix] = val;
13700 		if (++ix >= arr_len)
13701 			break;
13702 	}
13703 	list_iterator_destroy(node_gres_iter);
13704 
13705 	slurm_mutex_unlock(&gres_context_lock);
13706 
13707 	return rc;
13708 }
13709 
13710 /* Send GRES information to slurmstepd on the specified file descriptor */
gres_plugin_send_stepd(int fd)13711 extern void gres_plugin_send_stepd(int fd)
13712 {
13713 	int i;
13714 
13715 	(void) gres_plugin_init();
13716 
13717 	slurm_mutex_lock(&gres_context_lock);
13718 	for (i = 0; i < gres_context_cnt; i++) {
13719 		safe_write(fd, &gres_context[i].config_flags, sizeof(uint8_t));
13720 		if (gres_context[i].ops.send_stepd == NULL)
13721 			continue;	/* No plugin to call */
13722 		(*(gres_context[i].ops.send_stepd)) (fd);
13723 	}
13724 	slurm_mutex_unlock(&gres_context_lock);
13725 
13726 	return;
13727 rwfail:
13728 	error("%s: failed", __func__);
13729 	slurm_mutex_unlock(&gres_context_lock);
13730 }
13731 
13732 /* Receive GRES information from slurmd on the specified file descriptor */
gres_plugin_recv_stepd(int fd)13733 extern void gres_plugin_recv_stepd(int fd)
13734 {
13735 	int i;
13736 
13737 	(void) gres_plugin_init();
13738 
13739 	slurm_mutex_lock(&gres_context_lock);
13740 	for (i = 0; i < gres_context_cnt; i++) {
13741 		safe_read(fd, &gres_context[i].config_flags, sizeof(uint8_t));
13742 		(void)_load_gres_plugin(&gres_context[i]);
13743 
13744 		if (gres_context[i].ops.recv_stepd == NULL)
13745 			continue;	/* No plugin to call */
13746 		(*(gres_context[i].ops.recv_stepd)) (fd);
13747 	}
13748 	slurm_mutex_unlock(&gres_context_lock);
13749 
13750 	return;
13751 rwfail:
13752 	error("%s: failed", __func__);
13753 	slurm_mutex_unlock(&gres_context_lock);
13754 }
13755 
13756 /* Get generic GRES data types here. Call the plugin for others */
_get_job_info(int gres_inx,gres_job_state_t * job_gres_data,uint32_t node_inx,enum gres_job_data_type data_type,void * data)13757 static int _get_job_info(int gres_inx, gres_job_state_t *job_gres_data,
13758 			 uint32_t node_inx, enum gres_job_data_type data_type,
13759 			 void *data)
13760 {
13761 	uint64_t *u64_data = (uint64_t *) data;
13762 	bitstr_t **bit_data = (bitstr_t **) data;
13763 	int rc = SLURM_SUCCESS;
13764 
13765 	if (!job_gres_data || !data)
13766 		return EINVAL;
13767 	if (node_inx >= job_gres_data->node_cnt)
13768 		return ESLURM_INVALID_NODE_COUNT;
13769 	if (data_type == GRES_JOB_DATA_COUNT) {
13770 		*u64_data = job_gres_data->gres_per_node;
13771 	} else if (data_type == GRES_JOB_DATA_BITMAP) {
13772 		if (job_gres_data->gres_bit_alloc)
13773 			*bit_data = job_gres_data->gres_bit_alloc[node_inx];
13774 		else
13775 			*bit_data = NULL;
13776 	} else {
13777 		/* Support here for plugin-specific data types */
13778 		rc = (*(gres_context[gres_inx].ops.job_info))
13779 			(job_gres_data, node_inx, data_type, data);
13780 	}
13781 
13782 	return rc;
13783 }
13784 
13785 /*
13786  * get data from a job's GRES data structure
13787  * IN job_gres_list  - job's GRES data structure
13788  * IN gres_name - name of a GRES type
13789  * IN node_inx - zero-origin index of the node within the job's allocation
13790  *	for which data is desired
13791  * IN data_type - type of data to get from the job's data
13792  * OUT data - pointer to the data from job's GRES data structure
13793  *            DO NOT FREE: This is a pointer into the job's data structure
13794  * RET - SLURM_SUCCESS or error code
13795  */
gres_get_job_info(List job_gres_list,char * gres_name,uint32_t node_inx,enum gres_job_data_type data_type,void * data)13796 extern int gres_get_job_info(List job_gres_list, char *gres_name,
13797 			     uint32_t node_inx,
13798 			     enum gres_job_data_type data_type, void *data)
13799 {
13800 	int i, rc = ESLURM_INVALID_GRES;
13801 	uint32_t plugin_id;
13802 	ListIterator job_gres_iter;
13803 	gres_state_t *job_gres_ptr;
13804 	gres_job_state_t *job_gres_data;
13805 
13806 	if (data == NULL)
13807 		return EINVAL;
13808 	if (job_gres_list == NULL)	/* No GRES allocated */
13809 		return ESLURM_INVALID_GRES;
13810 
13811 	(void) gres_plugin_init();
13812 	plugin_id = gres_plugin_build_id(gres_name);
13813 
13814 	slurm_mutex_lock(&gres_context_lock);
13815 	job_gres_iter = list_iterator_create(job_gres_list);
13816 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13817 		for (i = 0; i < gres_context_cnt; i++) {
13818 			if (job_gres_ptr->plugin_id != plugin_id)
13819 				continue;
13820 			job_gres_data = (gres_job_state_t *)
13821 					job_gres_ptr->gres_data;
13822 			rc = _get_job_info(i, job_gres_data, node_inx,
13823 					   data_type, data);
13824 			break;
13825 		}
13826 	}
13827 	list_iterator_destroy(job_gres_iter);
13828 	slurm_mutex_unlock(&gres_context_lock);
13829 
13830 	return rc;
13831 }
13832 
13833 /* Given a job's GRES data structure, return the indecies for selected elements
13834  * IN job_gres_list  - job's GRES data structure
13835  * OUT gres_detail_cnt - Number of elements (nodes) in gres_detail_str
13836  * OUT gres_detail_str - Description of GRES on each node
13837  * OUT total_gres_str - String containing all gres in the job and counts.
13838  */
gres_build_job_details(List job_gres_list,uint32_t * gres_detail_cnt,char *** gres_detail_str,char ** total_gres_str)13839 extern void gres_build_job_details(List job_gres_list,
13840 				   uint32_t *gres_detail_cnt,
13841 				   char ***gres_detail_str,
13842 				   char **total_gres_str)
13843 {
13844 	int i, j;
13845 	ListIterator job_gres_iter;
13846 	gres_state_t *job_gres_ptr;
13847 	gres_job_state_t *job_gres_data;
13848 	char *sep1, *sep2, tmp_str[128], *type, **my_gres_details = NULL;
13849 	uint32_t my_gres_cnt = 0;
13850 	char *gres_name, *gres_str = NULL;
13851 	uint64_t gres_cnt;
13852 
13853 	/* Release any vestigial data (e.g. from job requeue) */
13854 	for (i = 0; i < *gres_detail_cnt; i++)
13855 		xfree(gres_detail_str[0][i]);
13856 	xfree(*gres_detail_str);
13857 	xfree(*total_gres_str);
13858 	*gres_detail_cnt = 0;
13859 
13860 	if (job_gres_list == NULL)	/* No GRES allocated */
13861 		return;
13862 
13863 	(void) gres_plugin_init();
13864 
13865 	job_gres_iter = list_iterator_create(job_gres_list);
13866 	while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
13867 		job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
13868 		if (job_gres_data->gres_bit_alloc == NULL)
13869 			continue;
13870 		if (my_gres_details == NULL) {
13871 			my_gres_cnt = job_gres_data->node_cnt;
13872 			my_gres_details = xcalloc(my_gres_cnt, sizeof(char *));
13873 		}
13874 
13875 		if (job_gres_data->type_name) {
13876 			sep2 = ":";
13877 			type = job_gres_data->type_name;
13878 		} else {
13879 			sep2 = "";
13880 			type = "";
13881 		}
13882 
13883 		gres_name = xstrdup_printf(
13884 			"%s%s%s",
13885 			job_gres_data->gres_name, sep2, type);
13886 		gres_cnt = 0;
13887 
13888 		for (j = 0; j < my_gres_cnt; j++) {
13889 			if (j >= job_gres_data->node_cnt)
13890 				break;	/* node count mismatch */
13891 			if (my_gres_details[j])
13892 				sep1 = ",";
13893 			else
13894 				sep1 = "";
13895 
13896 			gres_cnt += job_gres_data->gres_cnt_node_alloc[j];
13897 
13898 			if (job_gres_data->gres_bit_alloc[j]) {
13899 				bit_fmt(tmp_str, sizeof(tmp_str),
13900 					job_gres_data->gres_bit_alloc[j]);
13901 				xstrfmtcat(my_gres_details[j],
13902 					   "%s%s:%"PRIu64"(IDX:%s)",
13903 					   sep1, gres_name,
13904 					   job_gres_data->
13905 					   gres_cnt_node_alloc[j],
13906 					   tmp_str);
13907 			} else if (job_gres_data->gres_cnt_node_alloc[j]) {
13908 				xstrfmtcat(my_gres_details[j],
13909 					   "%s%s(CNT:%"PRIu64")",
13910 					   sep1, gres_name,
13911 					   job_gres_data->
13912 					   gres_cnt_node_alloc[j]);
13913 			}
13914 		}
13915 
13916 		xstrfmtcat(gres_str, "%s%s:%"PRIu64,
13917 			   gres_str ? "," : "", gres_name, gres_cnt);
13918 		xfree(gres_name);
13919 	}
13920 	list_iterator_destroy(job_gres_iter);
13921 	*gres_detail_cnt = my_gres_cnt;
13922 	*gres_detail_str = my_gres_details;
13923 	*total_gres_str = gres_str;
13924 }
13925 
13926 /* Get generic GRES data types here. Call the plugin for others */
_get_step_info(int gres_inx,gres_step_state_t * step_gres_data,uint32_t node_inx,enum gres_step_data_type data_type,void * data)13927 static int _get_step_info(int gres_inx, gres_step_state_t *step_gres_data,
13928 			  uint32_t node_inx, enum gres_step_data_type data_type,
13929 			  void *data)
13930 {
13931 	uint64_t *u64_data = (uint64_t *) data;
13932 	bitstr_t **bit_data = (bitstr_t **) data;
13933 	int rc = SLURM_SUCCESS;
13934 
13935 	if (!step_gres_data || !data)
13936 		return EINVAL;
13937 	if (node_inx >= step_gres_data->node_cnt)
13938 		return ESLURM_INVALID_NODE_COUNT;
13939 	if (data_type == GRES_STEP_DATA_COUNT) {
13940 		*u64_data = step_gres_data->gres_per_node;
13941 	} else if (data_type == GRES_STEP_DATA_BITMAP) {
13942 		if (step_gres_data->gres_bit_alloc)
13943 			*bit_data = step_gres_data->gres_bit_alloc[node_inx];
13944 		else
13945 			*bit_data = NULL;
13946 	} else {
13947 		/* Support here for plugin-specific data types */
13948 		rc = (*(gres_context[gres_inx].ops.step_info))
13949 			(step_gres_data, node_inx, data_type, data);
13950 	}
13951 
13952 	return rc;
13953 }
13954 
13955 /*
13956  * get data from a step's GRES data structure
13957  * IN step_gres_list  - step's GRES data structure
13958  * IN gres_name - name of a GRES type
13959  * IN node_inx - zero-origin index of the node within the job's allocation
13960  *	for which data is desired. Note this can differ from the step's
13961  *	node allocation index.
13962  * IN data_type - type of data to get from the step's data
13963  * OUT data - pointer to the data from step's GRES data structure
13964  *            DO NOT FREE: This is a pointer into the step's data structure
13965  * RET - SLURM_SUCCESS or error code
13966  */
gres_get_step_info(List step_gres_list,char * gres_name,uint32_t node_inx,enum gres_step_data_type data_type,void * data)13967 extern int gres_get_step_info(List step_gres_list, char *gres_name,
13968 			      uint32_t node_inx,
13969 			      enum gres_step_data_type data_type, void *data)
13970 {
13971 	int i, rc = ESLURM_INVALID_GRES;
13972 	uint32_t plugin_id;
13973 	ListIterator step_gres_iter;
13974 	gres_state_t *step_gres_ptr;
13975 	gres_step_state_t *step_gres_data;
13976 
13977 	if (data == NULL)
13978 		return EINVAL;
13979 	if (step_gres_list == NULL)	/* No GRES allocated */
13980 		return ESLURM_INVALID_GRES;
13981 
13982 	(void) gres_plugin_init();
13983 	plugin_id = gres_plugin_build_id(gres_name);
13984 
13985 	slurm_mutex_lock(&gres_context_lock);
13986 	step_gres_iter = list_iterator_create(step_gres_list);
13987 	while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
13988 		for (i = 0; i < gres_context_cnt; i++) {
13989 			if (step_gres_ptr->plugin_id != plugin_id)
13990 				continue;
13991 			step_gres_data = (gres_step_state_t *)
13992 					 step_gres_ptr->gres_data;
13993 			rc = _get_step_info(i, step_gres_data, node_inx,
13994 					    data_type, data);
13995 			break;
13996 		}
13997 	}
13998 	list_iterator_destroy(step_gres_iter);
13999 	slurm_mutex_unlock(&gres_context_lock);
14000 
14001 	return rc;
14002 }
14003 
gres_get_step_state(List gres_list,char * name)14004 extern gres_step_state_t *gres_get_step_state(List gres_list, char *name)
14005 {
14006 	gres_state_t *gres_state_ptr;
14007 
14008 	if (!gres_list || !name || !list_count(gres_list))
14009 		return NULL;
14010 
14011 	slurm_mutex_lock(&gres_context_lock);
14012 	gres_state_ptr = list_find_first(gres_list, _gres_step_find_name, name);
14013 	slurm_mutex_unlock(&gres_context_lock);
14014 
14015 	if (!gres_state_ptr)
14016 		return NULL;
14017 
14018 	return (gres_step_state_t *)gres_state_ptr->gres_data;
14019 }
14020 
gres_get_job_state(List gres_list,char * name)14021 extern gres_job_state_t *gres_get_job_state(List gres_list, char *name)
14022 {
14023 	gres_state_t *gres_state_ptr;
14024 
14025 	if (!gres_list || !name || !list_count(gres_list))
14026 		return NULL;
14027 
14028 	slurm_mutex_lock(&gres_context_lock);
14029 	gres_state_ptr = list_find_first(gres_list, _gres_job_find_name, name);
14030 	slurm_mutex_unlock(&gres_context_lock);
14031 
14032 	if (!gres_state_ptr)
14033 		return NULL;
14034 
14035 	return (gres_job_state_t *)gres_state_ptr->gres_data;
14036 }
14037 
gres_get_autodetect_types(void)14038 extern uint32_t gres_get_autodetect_types(void)
14039 {
14040 	return autodetect_types;
14041 }
14042 
gres_2_tres_str(List gres_list,bool is_job,bool locked)14043 extern char *gres_2_tres_str(List gres_list, bool is_job, bool locked)
14044 {
14045 	ListIterator itr;
14046 	slurmdb_tres_rec_t *tres_rec;
14047 	gres_state_t *gres_state_ptr;
14048 	int i;
14049 	uint64_t count;
14050 	char *col_name = NULL;
14051 	char *tres_str = NULL;
14052 	static bool first_run = 1;
14053 	static slurmdb_tres_rec_t tres_req;
14054 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
14055 
14056 	/* we only need to init this once */
14057 	if (first_run) {
14058 		first_run = 0;
14059 		memset(&tres_req, 0, sizeof(slurmdb_tres_rec_t));
14060 		tres_req.type = "gres";
14061 	}
14062 
14063 	if (!gres_list)
14064 		return NULL;
14065 
14066 	/* must be locked first before gres_contrex_lock!!! */
14067 	if (!locked)
14068 		assoc_mgr_lock(&locks);
14069 
14070 	slurm_mutex_lock(&gres_context_lock);
14071 	itr = list_iterator_create(gres_list);
14072 	while ((gres_state_ptr = list_next(itr))) {
14073 		if (is_job) {
14074 			gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
14075 				gres_state_ptr->gres_data;
14076 			col_name = gres_data_ptr->type_name;
14077 			count = gres_data_ptr->total_gres;
14078 		} else {
14079 			gres_step_state_t *gres_data_ptr = (gres_step_state_t *)
14080 				gres_state_ptr->gres_data;
14081 			col_name = gres_data_ptr->type_name;
14082 			count = gres_data_ptr->total_gres;
14083 		}
14084 
14085 		for (i = 0; i < gres_context_cnt; i++) {
14086 			if (gres_context[i].plugin_id ==
14087 			    gres_state_ptr->plugin_id) {
14088 				tres_req.name = gres_context[i].gres_name;
14089 				break;
14090 			}
14091 		}
14092 
14093 		if (!tres_req.name) {
14094 			debug("%s: couldn't find name", __func__);
14095 			continue;
14096 		}
14097 
14098 		/* If we are no_consume, print a 0 */
14099 		if (count == NO_CONSUME_VAL64)
14100 			count = 0;
14101 
14102 		tres_rec = assoc_mgr_find_tres_rec(&tres_req);
14103 
14104 		if (tres_rec &&
14105 		    slurmdb_find_tres_count_in_string(
14106 			    tres_str, tres_rec->id) == INFINITE64)
14107 			/* New gres */
14108 			xstrfmtcat(tres_str, "%s%u=%"PRIu64,
14109 				   tres_str ? "," : "",
14110 				   tres_rec->id, count);
14111 
14112 		if (i < gres_context_cnt) {
14113 			if (col_name) {
14114 				/*
14115 				 * Now let's put of the : name TRES if we are
14116 				 * tracking it as well.  This would be handy
14117 				 * for GRES like "gpu:tesla", where you might
14118 				 * want to track both as TRES.
14119 				 */
14120 				tres_req.name = xstrdup_printf(
14121 					"%s%s",
14122 					gres_context[i].gres_name_colon,
14123 					col_name);
14124 				tres_rec = assoc_mgr_find_tres_rec(&tres_req);
14125 				xfree(tres_req.name);
14126 				if (tres_rec &&
14127 				    slurmdb_find_tres_count_in_string(
14128 					    tres_str, tres_rec->id) == INFINITE64)
14129 					/* New GRES */
14130 					xstrfmtcat(tres_str, "%s%u=%"PRIu64,
14131 						   tres_str ? "," : "",
14132 						   tres_rec->id, count);
14133 			} else {
14134 				/*
14135 				 * Job allocated GRES without "type"
14136 				 * specification, but Slurm is only accounting
14137 				 * for this GRES by specific "type", so pick
14138 				 * some valid "type" to get some accounting.
14139 				 * Although the reported "type" may not be
14140 				 * accurate, it is better than nothing...
14141 				 */
14142 				tres_req.name = xstrdup_printf(
14143 					"%s", gres_context[i].gres_name);
14144 				tres_rec = assoc_mgr_find_tres_rec2(&tres_req);
14145 				xfree(tres_req.name);
14146 				if (tres_rec &&
14147 				    slurmdb_find_tres_count_in_string(
14148 					    tres_str, tres_rec->id) == INFINITE64)
14149 					/* New GRES */
14150 					xstrfmtcat(tres_str, "%s%u=%"PRIu64,
14151 						   tres_str ? "," : "",
14152 						   tres_rec->id, count);
14153 			}
14154 		}
14155 	}
14156 	list_iterator_destroy(itr);
14157 	slurm_mutex_unlock(&gres_context_lock);
14158 
14159 	if (!locked)
14160 		assoc_mgr_unlock(&locks);
14161 
14162 	return tres_str;
14163 }
14164 
14165 /* Fill in job/node TRES arrays with allocated GRES. */
_set_type_tres_cnt(gres_state_type_enum_t state_type,List gres_list,uint32_t node_cnt,uint64_t * tres_cnt,bool locked)14166 static void _set_type_tres_cnt(gres_state_type_enum_t state_type,
14167 			       List gres_list,
14168 			       uint32_t node_cnt,
14169 			       uint64_t *tres_cnt,
14170 			       bool locked)
14171 {
14172 	ListIterator itr;
14173 	gres_state_t *gres_state_ptr;
14174 	static bool first_run = 1;
14175 	static slurmdb_tres_rec_t tres_rec;
14176 	char *col_name = NULL;
14177 	uint64_t count;
14178 	int i, tres_pos;
14179 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
14180 
14181 	/* we only need to init this once */
14182 	if (first_run) {
14183 		first_run = 0;
14184 		memset(&tres_rec, 0, sizeof(slurmdb_tres_rec_t));
14185 		tres_rec.type = "gres";
14186 	}
14187 
14188 	if (!gres_list || !tres_cnt ||
14189 	    ((state_type == GRES_STATE_TYPE_JOB) &&
14190 	     (!node_cnt || (node_cnt == NO_VAL))))
14191 		return;
14192 
14193 	/* must be locked first before gres_contrex_lock!!! */
14194 	if (!locked)
14195 		assoc_mgr_lock(&locks);
14196 
14197 	slurm_mutex_lock(&gres_context_lock);
14198 	/* Initialize all GRES counters to zero. Increment them later. */
14199 	for (i = 0; i < gres_context_cnt; i++) {
14200 		tres_rec.name =	gres_context[i].gres_name;
14201 		if (tres_rec.name &&
14202 		    ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec,true)) !=-1))
14203 			tres_cnt[tres_pos] = 0;
14204 	}
14205 
14206 	itr = list_iterator_create(gres_list);
14207 	while ((gres_state_ptr = list_next(itr))) {
14208 		bool set_total = false;
14209 		for (i = 0; i < gres_context_cnt; i++) {
14210 			if (gres_context[i].plugin_id ==
14211 			    gres_state_ptr->plugin_id) {
14212 				tres_rec.name =	gres_context[i].gres_name;
14213 				break;
14214 			}
14215 		}
14216 		if (!tres_rec.name) {
14217 			debug("%s: couldn't find name", __func__);
14218 			continue;
14219 		}
14220 
14221 		/* Get alloc count for main GRES. */
14222 		switch (state_type) {
14223 		case GRES_STATE_TYPE_JOB:
14224 		{
14225 			gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
14226 				gres_state_ptr->gres_data;
14227 			count = gres_data_ptr->total_gres;
14228 			break;
14229 		}
14230 		case GRES_STATE_TYPE_NODE:
14231 		{
14232 			gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
14233 				gres_state_ptr->gres_data;
14234 			count = gres_data_ptr->gres_cnt_alloc;
14235 			break;
14236 		}
14237 		default:
14238 			error("%s: unsupported state type %d", __func__,
14239 			      state_type);
14240 			continue;
14241 		}
14242 		/*
14243 		 * Set main TRES's count (i.e. if no GRES "type" is being
14244 		 * accounted for). We need to increment counter since the job
14245 		 * may have been allocated multiple GRES types, but Slurm is
14246 		 * only configured to track the total count. For example, a job
14247 		 * allocated 1 GPU of type "tesla" and 1 GPU of type "volta",
14248 		 * but we want to record that the job was allocated a total of
14249 		 * 2 GPUs.
14250 		 */
14251 		if ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec,true)) != -1){
14252 			if (count == NO_CONSUME_VAL64)
14253 				tres_cnt[tres_pos] = NO_CONSUME_VAL64;
14254 			else
14255 				tres_cnt[tres_pos] += count;
14256 			set_total = true;
14257 		}
14258 
14259 		/*
14260 		 * Set TRES count for GRES model types. This would be handy for
14261 		 * GRES like "gpu:tesla", where you might want to track both as
14262 		 * TRES.
14263 		 */
14264 		switch (state_type) {
14265 		case GRES_STATE_TYPE_JOB:
14266 		{
14267 			gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
14268 				gres_state_ptr->gres_data;
14269 
14270 			col_name = gres_data_ptr->type_name;
14271 			if (col_name) {
14272 				tres_rec.name = xstrdup_printf(
14273 					"%s%s",
14274 					gres_context[i].gres_name_colon,
14275 					col_name);
14276 				if ((tres_pos = assoc_mgr_find_tres_pos(
14277 					     &tres_rec, true)) != -1)
14278 					tres_cnt[tres_pos] = count;
14279 				xfree(tres_rec.name);
14280 			} else if (!set_total) {
14281 				/*
14282 				 * Job allocated GRES without "type"
14283 				 * specification, but Slurm is only accounting
14284 				 * for this GRES by specific "type", so pick
14285 				 * some valid "type" to get some accounting.
14286 				 * Although the reported "type" may not be
14287 				 * accurate, it is better than nothing...
14288 				 */
14289 				tres_rec.name = xstrdup_printf(
14290 					"%s", gres_context[i].gres_name);
14291 				if ((tres_pos = assoc_mgr_find_tres_pos2(
14292 					     &tres_rec, true)) != -1)
14293 					tres_cnt[tres_pos] = count;
14294 				xfree(tres_rec.name);
14295 			}
14296 			break;
14297 		}
14298 		case GRES_STATE_TYPE_NODE:
14299 		{
14300 			int type;
14301 			gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
14302 				gres_state_ptr->gres_data;
14303 
14304 			for (type = 0; type < gres_data_ptr->type_cnt; type++) {
14305 				col_name = gres_data_ptr->type_name[type];
14306 				if (!col_name)
14307 					continue;
14308 
14309 				tres_rec.name = xstrdup_printf(
14310 						"%s%s",
14311 						gres_context[i].gres_name_colon,
14312 						col_name);
14313 
14314 				count = gres_data_ptr->type_cnt_alloc[type];
14315 
14316 				if ((tres_pos = assoc_mgr_find_tres_pos(
14317 							&tres_rec, true)) != -1)
14318 					tres_cnt[tres_pos] = count;
14319 				xfree(tres_rec.name);
14320 			}
14321 			break;
14322 		}
14323 		default:
14324 			error("%s: unsupported state type %d", __func__,
14325 			      state_type);
14326 			continue;
14327 		}
14328 	}
14329 	list_iterator_destroy(itr);
14330 	slurm_mutex_unlock(&gres_context_lock);
14331 
14332 	if (!locked)
14333 		assoc_mgr_unlock(&locks);
14334 
14335 	return;
14336 }
14337 
/*
 * Fill in a job's TRES count array from its GRES list.
 * Thin wrapper around _set_type_tres_cnt() using GRES_STATE_TYPE_JOB.
 * IN gres_list - job's GRES list
 * IN node_cnt - passed through as the job's node count
 * IN/OUT tres_cnt - TRES count array to update
 * IN locked - passed through; tells the callee whether the assoc_mgr TRES
 *	lock is already held
 */
extern void gres_set_job_tres_cnt(List gres_list,
				  uint32_t node_cnt,
				  uint64_t *tres_cnt,
				  bool locked)
{
	const gres_state_type_enum_t state_type = GRES_STATE_TYPE_JOB;

	_set_type_tres_cnt(state_type, gres_list, node_cnt, tres_cnt, locked);
}
14346 
/*
 * Fill in a node's TRES count array from its GRES list.
 * Thin wrapper around _set_type_tres_cnt() using GRES_STATE_TYPE_NODE.
 * IN gres_list - node's GRES list
 * IN/OUT tres_cnt - TRES count array to update
 * IN locked - passed through; tells the callee whether the assoc_mgr TRES
 *	lock is already held
 */
extern void gres_set_node_tres_cnt(List gres_list,
				   uint64_t *tres_cnt,
				   bool locked)
{
	const gres_state_type_enum_t state_type = GRES_STATE_TYPE_NODE;
	const uint32_t node_cnt = 0;	/* a node list has no node count */

	_set_type_tres_cnt(state_type, gres_list, node_cnt, tres_cnt, locked);
}
14354 
gres_device_major(char * dev_path)14355 extern char *gres_device_major(char *dev_path)
14356 {
14357 	int loc_major, loc_minor;
14358 	char *ret_major = NULL;
14359 	struct stat fs;
14360 
14361 	if (stat(dev_path, &fs) < 0) {
14362 		error("%s: stat(%s): %m", __func__, dev_path);
14363 		return NULL;
14364 	}
14365 	loc_major = (int)major(fs.st_rdev);
14366 	loc_minor = (int)minor(fs.st_rdev);
14367 	debug3("%s : %s major %d, minor %d",
14368 	       __func__, dev_path, loc_major, loc_minor);
14369 	if (S_ISBLK(fs.st_mode)) {
14370 		xstrfmtcat(ret_major, "b %d:", loc_major);
14371 		//info("device is block ");
14372 	}
14373 	if (S_ISCHR(fs.st_mode)) {
14374 		xstrfmtcat(ret_major, "c %d:", loc_major);
14375 		//info("device is character ");
14376 	}
14377 	xstrfmtcat(ret_major, "%d rwm", loc_minor);
14378 
14379 	return ret_major;
14380 }
14381 
14382 /* Free memory for gres_device_t record */
destroy_gres_device(void * gres_device_ptr)14383 extern void destroy_gres_device(void *gres_device_ptr)
14384 {
14385 	gres_device_t *gres_device = (gres_device_t *) gres_device_ptr;
14386 
14387 	if (!gres_device)
14388 		return;
14389 	xfree(gres_device->path);
14390 	xfree(gres_device->major);
14391 	xfree(gres_device);
14392 }
14393 
14394 /* Destroy a gres_slurmd_conf_t record, free it's memory */
destroy_gres_slurmd_conf(void * x)14395 extern void destroy_gres_slurmd_conf(void *x)
14396 {
14397 	gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;
14398 
14399 	xassert(p);
14400 	xfree(p->cpus);
14401 	FREE_NULL_BITMAP(p->cpus_bitmap);
14402 	xfree(p->file);		/* Only used by slurmd */
14403 	xfree(p->links);
14404 	xfree(p->name);
14405 	xfree(p->type_name);
14406 	xfree(p);
14407 }
14408 
14409 
14410 /*
14411  * Convert GRES config_flags to a string. The pointer returned references local
14412  * storage in this function, which is not re-entrant.
14413  */
gres_flags2str(uint8_t config_flags)14414 extern char *gres_flags2str(uint8_t config_flags)
14415 {
14416 	static char flag_str[128];
14417 	char *sep = "";
14418 
14419 	flag_str[0] = '\0';
14420 	if (config_flags & GRES_CONF_COUNT_ONLY) {
14421 		strcat(flag_str, sep);
14422 		strcat(flag_str, "CountOnly");
14423 		sep = ",";
14424 	}
14425 
14426 	if (config_flags & GRES_CONF_HAS_FILE) {
14427 		strcat(flag_str, sep);
14428 		strcat(flag_str, "HAS_FILE");
14429 		sep = ",";
14430 	}
14431 
14432 	if (config_flags & GRES_CONF_LOADED) {
14433 		strcat(flag_str, sep);
14434 		strcat(flag_str, "LOADED");
14435 		sep = ",";
14436 	}
14437 
14438 	if (config_flags & GRES_CONF_HAS_TYPE) {
14439 		strcat(flag_str, sep);
14440 		strcat(flag_str, "HAS_TYPE");
14441 		sep = ",";
14442 	}
14443 
14444 	return flag_str;
14445 }
14446 
14447 /*
14448  * Creates a gres_slurmd_conf_t record to add to a list of gres_slurmd_conf_t
14449  * records
14450  */
add_gres_to_list(List gres_list,char * name,uint64_t device_cnt,int cpu_cnt,char * cpu_aff_abs_range,bitstr_t * cpu_aff_mac_bitstr,char * device_file,char * type,char * links)14451 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt,
14452 			     int cpu_cnt, char *cpu_aff_abs_range,
14453 			     bitstr_t *cpu_aff_mac_bitstr, char *device_file,
14454 			     char *type, char *links)
14455 {
14456 	gres_slurmd_conf_t *gpu_record;
14457 	bool use_empty_first_record = false;
14458 	ListIterator itr = list_iterator_create(gres_list);
14459 
14460 	/*
14461 	 * If the first record already exists and has a count of 0 then
14462 	 * overwrite it.
14463 	 * This is a placeholder record created in _merge_config()
14464 	 */
14465 	gpu_record = list_next(itr);
14466 	if (gpu_record && (gpu_record->count == 0))
14467 		use_empty_first_record = true;
14468 	else
14469 		gpu_record = xmalloc(sizeof(gres_slurmd_conf_t));
14470 	gpu_record->cpu_cnt = cpu_cnt;
14471 	if (cpu_aff_mac_bitstr)
14472 		gpu_record->cpus_bitmap = bit_copy(cpu_aff_mac_bitstr);
14473 	if (device_file)
14474 		gpu_record->config_flags |= GRES_CONF_HAS_FILE;
14475 	if (type)
14476 		gpu_record->config_flags |= GRES_CONF_HAS_TYPE;
14477 	gpu_record->cpus = xstrdup(cpu_aff_abs_range);
14478 	gpu_record->type_name = xstrdup(type);
14479 	gpu_record->name = xstrdup(name);
14480 	gpu_record->file = xstrdup(device_file);
14481 	gpu_record->links = xstrdup(links);
14482 	gpu_record->count = device_cnt;
14483 	gpu_record->plugin_id = gres_plugin_build_id(name);
14484 	if (!use_empty_first_record)
14485 		list_append(gres_list, gpu_record);
14486 	list_iterator_destroy(itr);
14487 }
14488