1 /*****************************************************************************\ 2 * gres.h - driver for gres plugin 3 ***************************************************************************** 4 * Copyright (C) 2010 Lawrence Livermore National Security. 5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 6 * Written by Morris Jette <jette1@llnl.gov> 7 * CODE-OCEC-09-009. All rights reserved. 8 * 9 * This file is part of Slurm, a resource management program. 10 * For details, see <https://slurm.schedmd.com/>. 11 * Please also read the included file: DISCLAIMER. 12 * 13 * Slurm is free software; you can redistribute it and/or modify it under 14 * the terms of the GNU General Public License as published by the Free 15 * Software Foundation; either version 2 of the License, or (at your option) 16 * any later version. 17 * 18 * In addition, as a special exception, the copyright holders give permission 19 * to link the code of portions of this program with the OpenSSL library under 20 * certain conditions as described in each individual source file, and 21 * distribute linked combinations including the two. You must obey the GNU 22 * General Public License in all respects for all of the code used other than 23 * OpenSSL. If you modify file(s) with this exception, you may extend this 24 * exception to your version of the file(s), but you are not obligated to do 25 * so. If you do not wish to do so, delete this exception statement from your 26 * version. If you delete this exception statement from all source files in 27 * the program, then also delete it here. 28 * 29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 32 * details. 
33 * 34 * You should have received a copy of the GNU General Public License along 35 * with Slurm; if not, write to the Free Software Foundation, Inc., 36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 37 \*****************************************************************************/ 38 39 #ifndef _GRES_H 40 #define _GRES_H 41 42 #include "slurm/slurm.h" 43 #include "slurm/slurmdb.h" 44 #include "src/common/bitstring.h" 45 #include "src/common/job_resources.h" 46 #include "src/common/node_conf.h" 47 #include "src/common/pack.h" 48 49 #define GRES_MAGIC 0x438a34d4 50 #define GRES_MAX_LINK 1023 51 /* Selectors for the val_type argument of gres_plugin_node_count() */ 52 enum { 53 GRES_VAL_TYPE_FOUND = 0, 54 GRES_VAL_TYPE_CONFIG = 1, 55 GRES_VAL_TYPE_AVAIL = 2, 56 GRES_VAL_TYPE_ALLOC = 3 57 }; 58 /* One GRES device. gres_plugin_get_allocated_devices() returns a List of these with alloc set when the device is allocated to the job/step. */ 59 typedef struct { 60 int alloc; 61 int dev_num; 62 char *major; 63 char *path; 64 } gres_device_t; 65 /* Flag bits stored in gres_slurmd_conf_t.config_flags */ 66 #define GRES_CONF_HAS_FILE 0x02 /* File= is configured */ 67 #define GRES_CONF_HAS_TYPE 0x04 /* Type= is configured */ 68 #define GRES_CONF_COUNT_ONLY 0x08 /* GRES lacks plugin to load */ 69 #define GRES_CONF_LOADED 0x10 /* used to avoid loading a plugin 70 * multiple times */ 71 72 #define GRES_NO_CONSUME 0x0001 /* Request that resources not be consumed; see the flags field of gres_job_state_t and gres_step_state_t */ 73 74 /* GRES AutoDetect options */ 75 #define GRES_AUTODETECT_NONE 0x00000000 76 #define GRES_AUTODETECT_NVML 0x00000001 77 #define GRES_AUTODETECT_RSMI 0x00000002 78 79 /* Gres state information gathered by slurmd daemon */ 80 typedef struct gres_slurmd_conf { 81 uint8_t config_flags; /* See GRES_CONF_* values above */ 82 83 /* Count of gres available in this configuration record */ 84 uint64_t count; 85 86 /* Specific CPUs associated with this configuration record */ 87 uint32_t cpu_cnt; 88 char *cpus; 89 bitstr_t *cpus_bitmap; /* Using LOCAL mapping */ 90 91 /* Device file associated with this configuration record */ 92 char *file; 93 94 /* Comma-separated list of communication link IDs (numbers) */ 95 char *links; 96 97 /* Name of this gres */ 98 char *name; 99 100 /*
Type of this GRES (e.g. model name) */ 101 char *type_name; 102 103 /* GRES ID number */ 104 uint32_t plugin_id; 105 } gres_slurmd_conf_t; 106 107 108 /* Extra data and functions to be passed in to node_config_load() */ 109 typedef struct node_config_load { 110 /* Number of CPUs configured on the node */ 111 uint32_t cpu_cnt; 112 /* A pointer to the mac_to_abs function */ 113 int (*xcpuinfo_mac_to_abs) (char *mac, char **abs); 114 } node_config_load_t; 115 116 /* Current GRES state information managed by slurmctld daemon */ 117 typedef struct gres_node_state { 118 /* Actual hardware found */ 119 uint64_t gres_cnt_found; 120 121 /* Configured resources via "Gres" parameter */ 122 uint64_t gres_cnt_config; 123 124 /* Non-consumable: Do not track resources allocated to jobs */ 125 bool no_consume; 126 127 /* True if set by node_feature plugin, ignore info from compute node */ 128 bool node_feature; 129 130 /* 131 * Total resources available for allocation to jobs. 132 * gres_cnt_found or gres_cnt_config, depending upon config_overrides 133 */ 134 uint64_t gres_cnt_avail; 135 136 /* List of GRES in current use. Set to NULL if it needs to be rebuilt. */ 137 char *gres_used; 138 139 /* Resources currently allocated to jobs */ 140 uint64_t gres_cnt_alloc; 141 bitstr_t *gres_bit_alloc; /* If gres.conf contains File field */ 142 143 /* 144 * Topology specific information. In the case of gres/mps, there is one 145 * topo record per file (GPU) and the size of the GRES bitmaps (i.e. 146 * gres_bit_alloc and topo_gres_bitmap[#]) is equal to the number of 147 * GPUs on the node while the count is a site-configurable value. 148 */ 149 uint16_t topo_cnt; /* Size of topo_ arrays */ 150 int link_len; /* Size of links_cnt */ 151 int **links_cnt; /* Count of links between GRES */ 152 bitstr_t **topo_core_bitmap; 153 bitstr_t **topo_gres_bitmap; 154 uint64_t *topo_gres_cnt_alloc; 155 uint64_t *topo_gres_cnt_avail; 156 uint32_t *topo_type_id; /* GRES type (e.g.
model ID) */ 157 char **topo_type_name; /* GRES type (e.g. model name) */ 158 159 /* 160 * GRES type specific information (if gres.conf contains type option) 161 * 162 * NOTE: If a job requests GRES without a type specification, the 163 * type_cnt_alloc entries will not be incremented. Only the gres_cnt_alloc 164 * will be incremented. 165 */ 166 uint16_t type_cnt; /* Size of type_ arrays */ 167 uint64_t *type_cnt_alloc; 168 uint64_t *type_cnt_avail; 169 uint32_t *type_id; /* GRES type (e.g. model ID) */ 170 char **type_name; /* GRES type (e.g. model name) */ 171 } gres_node_state_t; 172 173 /* Gres job state as used by slurmctld daemon */ 174 typedef struct gres_job_state { 175 char *gres_name; /* GRES name (e.g. "gpu") */ 176 uint32_t type_id; /* GRES type (e.g. model ID) */ 177 char *type_name; /* GRES type (e.g. model name) */ 178 uint16_t flags; /* GRES_NO_CONSUME, etc. */ 179 180 /* Count of required GRES resources plus associated CPUs and memory */ 181 uint16_t cpus_per_gres; 182 uint64_t gres_per_job; 183 uint64_t gres_per_node; 184 uint64_t gres_per_socket; 185 uint64_t gres_per_task; 186 uint64_t mem_per_gres; 187 188 /* 189 * Default GRES configuration parameters. These values are subject to 190 * change depending upon which partition the job is currently being 191 * considered for scheduling in. 192 */ 193 uint16_t def_cpus_per_gres; 194 uint64_t def_mem_per_gres; 195 196 /* 197 * Selected resource details. One entry per node on the cluster. 198 * Used by select/cons_tres to identify which resources would be 199 * allocated on a node IF that node is included in the job allocation. 200 * Once specific nodes are selected for the job allocation, select 201 * portions of these arrays are copied to gres_bit_alloc and 202 * gres_cnt_node_alloc. The fields can then be cleared.
203 */ 204 uint32_t total_node_cnt; /* cluster total node count */ 205 bitstr_t **gres_bit_select; /* Per node GRES selected, 206 * Used with GRES files */ 207 uint64_t *gres_cnt_node_select; /* Per node GRES selected, 208 * Used without GRES files */ 209 210 /* Allocated resources details */ 211 uint64_t total_gres; /* Count of GRES allocated to job */ 212 uint32_t node_cnt; /* 0 if no_consume */ 213 bitstr_t **gres_bit_alloc; /* Per node GRES allocated, 214 * Used with GRES files */ 215 uint64_t *gres_cnt_node_alloc; /* Per node GRES allocated, 216 * Used with and without GRES files */ 217 218 /* 219 * Resources currently allocated to job steps on each node. 220 * This will be a subset of resources allocated to the job. 221 * gres_bit_step_alloc is a subset of gres_bit_alloc 222 */ 223 bitstr_t **gres_bit_step_alloc; 224 uint64_t *gres_cnt_step_alloc; 225 } gres_job_state_t; 226 227 /* Used to set Prolog and Epilog env var. Currently designed for gres/mps. */ 228 typedef struct gres_epilog_info { 229 uint32_t plugin_id; /* GRES ID number */ 230 uint32_t node_cnt; /* Count of all hosts allocated to job */ 231 char *node_list; /* List of all hosts allocated to job */ 232 bitstr_t **gres_bit_alloc; /* Per-node bitmap of allocated resources */ 233 uint64_t *gres_cnt_node_alloc; /* Per node GRES allocated, 234 * Used with and without GRES files */ 235 } gres_epilog_info_t; 236 237 /* Gres job step state as used by slurmctld daemon */ 238 typedef struct gres_step_state { 239 uint32_t type_id; /* GRES type (e.g. model ID) */ 240 char *type_name; /* GRES type (e.g. model name) */ 241 uint16_t flags; /* GRES_NO_CONSUME, etc.
*/ 242 243 /* Count of required GRES resources plus associated CPUs and memory */ 244 uint16_t cpus_per_gres; 245 uint64_t gres_per_step; 246 uint64_t gres_per_node; 247 uint64_t gres_per_socket; 248 uint64_t gres_per_task; 249 uint64_t mem_per_gres; 250 251 /* 252 * Allocated resources details 253 * 254 * NOTE: node_cnt and the size of node_in_use and gres_bit_alloc are 255 * identical to that of the job for simplicity. Bits in node_in_use 256 * are set for those nodes of the job that are used by this step and 257 * gres_bit_alloc are also set if the job's gres_bit_alloc is set 258 */ 259 uint64_t total_gres; /* allocated GRES for this step */ 260 uint64_t gross_gres; /* used during the scheduling phase, 261 * GRES that could be available for this 262 * step if no other steps active */ 263 uint64_t *gres_cnt_node_alloc; /* Per node GRES allocated, 264 * Used without GRES files */ 265 uint32_t node_cnt; 266 bitstr_t *node_in_use; 267 bitstr_t **gres_bit_alloc; /* Used with GRES files */ 268 } gres_step_state_t; 269 270 /* Per-socket GRES availability information for scheduling purposes */ 271 typedef struct sock_gres { /* GRES availability by socket */ 272 bitstr_t *bits_any_sock;/* GRES bitmap of this name & type for any socket (presumably unconstrained by cores, like cnt_any_sock - TODO confirm) */ 273 bitstr_t **bits_by_sock;/* Per-socket GRES bitmap of this name & type */ 274 uint64_t cnt_any_sock; /* GRES count unconstrained by cores */ 275 uint64_t *cnt_by_sock; /* Per-socket GRES count of this name & type */ 276 char *gres_name; /* GRES name */ 277 gres_job_state_t *job_specs; /* Pointer to job info, for limits */ 278 uint64_t max_node_gres; /* Maximum GRES permitted on this node */ 279 gres_node_state_t *node_specs; /* Pointer to node info, for state */ 280 uint32_t plugin_id; /* Plugin ID (for quick search) */ 281 int sock_cnt; /* Socket count, size of bits_by_sock and 282 * cnt_by_sock arrays */ 283 uint64_t total_cnt; /* Total GRES count of this name & type */ 284 uint32_t type_id; /* GRES type (e.g.
model ID) */ 285 char *type_name; /* GRES type (e.g. model name) */ 286 } sock_gres_t; 287 288 /* Similar to multi_core_data_t in slurm_protocol_defs.h */ 289 typedef struct gres_mc_data { 290 uint16_t boards_per_node; /* boards per node required by job */ 291 uint16_t sockets_per_board; /* sockets per board required by job */ 292 uint16_t sockets_per_node; /* sockets per node required by job */ 293 uint16_t cores_per_socket; /* cores per socket required by job */ 294 uint16_t threads_per_core; /* threads per core required by job */ 295 296 uint16_t cpus_per_task; /* Count of CPUs per task */ 297 uint32_t ntasks_per_job; /* number of tasks to invoke for job or NO_VAL */ 298 uint16_t ntasks_per_node; /* number of tasks to invoke on each node */ 299 uint16_t ntasks_per_board; /* number of tasks to invoke on each board */ 300 uint16_t ntasks_per_socket; /* number of tasks to invoke on each socket */ 301 uint16_t ntasks_per_core; /* number of tasks to invoke on each core */ 302 uint8_t overcommit; /* processors being over subscribed */ 303 uint16_t plane_size; /* plane size for SLURM_DIST_PLANE */ 304 uint32_t task_dist; /* task distribution directives */ 305 uint8_t whole_node; /* allocate entire node */ 306 } gres_mc_data_t; 307 308 typedef enum { 309 GRES_STATE_TYPE_NODE = 0, 310 GRES_STATE_TYPE_JOB, 311 GRES_STATE_TYPE_STEP 312 } gres_state_type_enum_t; 313 314 /* 315 * Initialize the GRES plugins. 316 * 317 * Returns a Slurm errno. 318 */ 319 extern int gres_plugin_init(void); 320 321 /* 322 * Terminate the GRES plugins. Free memory. 323 * 324 * Returns a Slurm errno.
325 */ 326 extern int gres_plugin_fini(void); 327 328 /* 329 ************************************************************************** 330 * P L U G I N C A L L S * 331 ************************************************************************** 332 */ 333 334 /* 335 * Perform reconfig, re-read any configuration files 336 * NOTE: takes no arguments; there is no did_change output parameter 337 */ 338 extern int gres_plugin_reconfig(void); 339 340 /* 341 * Return a plugin-specific help message for salloc, sbatch and srun 342 * Result must be xfree()'d 343 */ 344 extern char *gres_plugin_help_msg(void); 345 346 /* 347 * Convert a GRES name or model into a number for faster comparison operations 348 * IN name - GRES name or model 349 * RET - A uint32_t representing a custom hash of the name 350 */ 351 extern uint32_t gres_plugin_build_id(char *name); 352 353 /* 354 * Takes a GRES config line (typically from slurm.conf) and removes any 355 * records for GRES which are not defined in GresTypes. 356 * RET string of valid GRES, Release memory using xfree() 357 */ 358 extern char *gres_plugin_name_filter(char *orig_gres, char *nodes); 359 360 /* 361 ************************************************************************** 362 * PLUGIN CALLS FOR SLURMD DAEMON * 363 ************************************************************************** 364 */ 365 /* 366 * Load this node's configuration (how many resources it has, topology, etc.)
367 * IN cpu_cnt - Number of CPUs configured on this node 368 * IN node_name - Name of this node 369 * IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd 370 * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() function, if available 371 * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() function, if available 372 * NOTE: Called from slurmd and slurmstepd 373 */ 374 extern int gres_plugin_node_config_load(uint32_t cpu_cnt, char *node_name, 375 List gres_list, 376 void *xcpuinfo_abs_to_mac, 377 void *xcpuinfo_mac_to_abs); 378 379 /* 380 * Pack this node's gres configuration into a buffer 381 * IN/OUT buffer - message buffer to pack 382 */ 383 extern int gres_plugin_node_config_pack(Buf buffer); 384 385 /* 386 * Set GRES devices as allocated or not for a particular job 387 * IN gres_list - allocated gres devices 388 * IN is_job - if true, the function expects gres_job_state_t records, 389 * otherwise gres_step_state_t records 390 * RET - List of gres_device_t containing all devices from all GRES with alloc 391 * set correctly if the device is allocated to the job/step.
392 */ 393 extern List gres_plugin_get_allocated_devices(List gres_list, bool is_job); 394 395 /* Send GRES information to slurmstepd on the specified file descriptor */ 396 extern void gres_plugin_send_stepd(int fd); 397 398 /* Receive GRES information from slurmd on the specified file descriptor */ 399 extern void gres_plugin_recv_stepd(int fd); 400 401 /* 402 ************************************************************************** 403 * PLUGIN CALLS FOR SLURMCTLD DAEMON * 404 ************************************************************************** 405 */ 406 /* 407 * Build a node's gres record based only upon the slurm.conf contents 408 * IN node_name - name of the node for which the gres information applies 409 * IN orig_config - Gres information supplied from slurm.conf 410 * IN/OUT gres_list - List of Gres records for this node to track usage 411 */ 412 extern int gres_plugin_init_node_config(char *node_name, char *orig_config, 413 List *gres_list); 414 415 /* 416 * Return how many GRES names are on the system. 417 */ 418 extern int gres_plugin_get_gres_cnt(void); 419 420 /* Add a GRES record. This is used by the node_features plugin after the 421 * slurm.conf file is read and the initial GRES records are built by 422 * gres_plugin_init(). */ 423 extern void gres_plugin_add(char *gres_name); 424 425 /* 426 * Unpack this node's configuration from a buffer (built/packed by slurmd) 427 * IN/OUT buffer - message buffer to unpack 428 * IN node_name - name of node whose data is being unpacked 429 */ 430 extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name); 431 432 /* 433 * Validate a node's configuration and put a gres record onto a list 434 * Called immediately after gres_plugin_node_config_unpack().
435 * IN node_name - name of the node for which the gres information applies 436 * IN orig_config - Gres information supplied from merged slurm.conf/gres.conf 437 * IN/OUT new_config - Updated gres info from slurm.conf 438 * IN/OUT gres_list - List of Gres records for this node to track usage 439 * IN threads_per_core - Count of CPUs (threads) per core on this node 440 * IN cores_per_sock - Count of cores per socket on this node 441 * IN sock_cnt - Count of sockets on this node 442 * IN config_overrides - true: Don't validate hardware, use slurm.conf 443 * configuration 444 * false: Validate hardware config, but use slurm.conf 445 * config 446 * OUT reason_down - set to an explanation of failure, if any, don't set if NULL 447 */ 448 extern int gres_plugin_node_config_validate(char *node_name, 449 char *orig_config, 450 char **new_config, 451 List *gres_list, 452 int threads_per_core, 453 int cores_per_sock, int sock_cnt, 454 bool config_overrides, 455 char **reason_down); 456 457 /* 458 * Add a GRES from node_feature plugin 459 * IN node_name - name of the node for which the gres information applies 460 * IN gres_name - name of the GRES being added or updated from the plugin 461 * IN gres_size - count of this GRES on this node 462 * IN/OUT new_config - Updated GRES info from slurm.conf 463 * IN/OUT gres_list - List of GRES records for this node to track usage 464 */ 465 extern void gres_plugin_node_feature(char *node_name, 466 char *gres_name, uint64_t gres_size, 467 char **new_config, List *gres_list); 468 469 /* 470 * Note that a node's configuration has been modified (e.g.
"scontrol update ..") 471 * IN node_name - name of the node for which the gres information applies 472 * IN new_gres - Updated GRES information supplied from slurm.conf or scontrol 473 * IN/OUT gres_str - Node's current GRES string, updated as needed 474 * IN/OUT gres_list - List of Gres records for this node to track usage 475 * IN config_overrides - true: Don't validate hardware, use slurm.conf 476 * configuration 477 * false: Validate hardware config, but use slurm.conf 478 * config 479 * IN cores_per_sock - Number of cores per socket on this node 480 * IN sock_per_node - Total count of sockets on this node (on any board) 481 */ 482 extern int gres_plugin_node_reconfig(char *node_name, 483 char *new_gres, 484 char **gres_str, 485 List *gres_list, 486 bool config_overrides, 487 int cores_per_sock, 488 int sock_per_node); 489 490 /* 491 * Pack a node's current gres status, called from slurmctld for save/restore 492 * IN gres_list - generated by gres_plugin_node_config_validate() 493 * IN/OUT buffer - location to write state to 494 * IN node_name - name of the node for which the gres information applies 495 */ 496 extern int gres_plugin_node_state_pack(List gres_list, Buf buffer, 497 char *node_name); 498 /* 499 * Unpack a node's current gres status, called from slurmctld for save/restore 500 * OUT gres_list - restored state stored by gres_plugin_node_state_pack() 501 * IN/OUT buffer - location to read state from 502 * IN node_name - name of the node for which the gres information applies 503 */ 504 extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer, 505 char *node_name, 506 uint16_t protocol_version); 507 508 /* 509 * Duplicate a node gres status (used for will-run logic) 510 * IN gres_list - node gres state information 511 * RET a copy of gres_list or NULL on failure 512 */ 513 extern List gres_plugin_node_state_dup(List gres_list); 514 515 /* 516 * Deallocate all resources on this node previously allocated to any jobs.
517 * This function is used to synchronize state after slurmctld restarts or 518 * is reconfigured. 519 * IN gres_list - node gres state information 520 */ 521 extern void gres_plugin_node_state_dealloc_all(List gres_list); 522 523 /* 524 * Log a node's current gres state 525 * IN gres_list - generated by gres_plugin_node_config_validate() 526 * IN node_name - name of the node for which the gres information applies 527 */ 528 extern void gres_plugin_node_state_log(List gres_list, char *node_name); 529 530 /* 531 * Build a string indicating a node's drained GRES 532 * IN gres_list - generated by gres_plugin_node_config_validate() 533 * RET - string, must be xfreed by caller 534 */ 535 extern char *gres_get_node_drain(List gres_list); 536 537 /* 538 * Build a string indicating a node's used GRES 539 * IN gres_list - generated by gres_plugin_node_config_validate() 540 * RET - string, must be xfreed by caller 541 */ 542 extern char *gres_get_node_used(List gres_list); 543 544 /* 545 * Give the total system count of a given GRES 546 * Returns NO_VAL64 if name not found 547 */ 548 extern uint64_t gres_get_system_cnt(char *name); 549 550 /* 551 * Get the count of a node's GRES 552 * IN gres_list - List of Gres records for this node to track usage 553 * IN name - name of gres 554 */ 555 extern uint64_t gres_plugin_node_config_cnt(List gres_list, char *name); 556 557 /* 558 * Fill in an array of GRES type ids contained within the given node gres_list 559 * and an array of corresponding counts of those GRES types. 560 * IN gres_list - a List of GRES types found on a node. 561 * IN arr_len - Length of the arrays (the number of elements in the gres_list). 562 * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found 563 * in the gres_list.
564 * IN val_type - Type of value desired, see GRES_VAL_TYPE_* 565 * RET SLURM_SUCCESS or error code 566 */ 567 extern int gres_plugin_node_count(List gres_list, int arr_len, 568 uint32_t *gres_count_ids, 569 uint64_t *gres_count_vals, 570 int val_type); 571 572 /* 573 * Fill in an array of GRES type ids contained within the given job gres_list 574 * and an array of corresponding counts of those GRES types. 575 * IN gres_list - a List of GRES types allocated to a job. 576 * IN arr_len - Length of the arrays (the number of elements in the gres_list). 577 * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found 578 * in the gres_list. 579 * RET SLURM_SUCCESS or error code 580 */ 581 extern int gres_plugin_job_count(List gres_list, int arr_len, 582 uint32_t *gres_count_ids, 583 uint64_t *gres_count_vals); 584 585 /* 586 * Build a string identifying total GRES counts of each type 587 * IN gres_list - a List of GRES types allocated to a job. 588 * RET string containing comma-separated list of gres type:model:count 589 * must release memory using xfree() 590 */ 591 extern char *gres_plugin_job_alloc_count(List gres_list); 592 593 /* 594 * Pack a job's allocated gres information for use by prolog/epilog 595 * IN gres_list - generated by gres_plugin_job_config_validate() 596 * IN/OUT buffer - location to write state to 597 */ 598 extern int gres_plugin_job_alloc_pack(List gres_list, Buf buffer, 599 uint16_t protocol_version); 600 601 /* 602 * Unpack a job's allocated gres information for use by prolog/epilog 603 * OUT gres_list - restored state stored by gres_plugin_job_alloc_pack() 604 * IN/OUT buffer - location to read state from 605 */ 606 extern int gres_plugin_job_alloc_unpack(List *gres_list, Buf buffer, 607 uint16_t protocol_version); 608 609 /* 610 * Build List of information needed to set job's Prolog or Epilog environment 611 * variables 612 * 613 * IN job_gres_list - job's GRES allocation info 614 * IN node_list - list of nodes associated with
the job 615 * RET information about the job's GRES allocation needed by Prolog or Epilog 616 */ 617 extern List gres_plugin_epilog_build_env(List job_gres_list, char *node_list); 618 619 /* 620 * Set environment variables as appropriate for a job's prolog or epilog based 621 * upon GRES allocated to the job. 622 * 623 * IN/OUT epilog_env_ptr - environment variable array 624 * IN epilog_gres_list - presumably the List returned by gres_plugin_epilog_build_env() - TODO confirm 625 * IN node_inx - zero origin node index 626 */ 627 extern void gres_plugin_epilog_set_env(char ***epilog_env_ptr, 628 List epilog_gres_list, int node_inx); 629 630 631 /* 632 * Given a job's requested GRES configuration, validate it and build a GRES list 633 * Note: This function can be used for a new request with gres_list==NULL or 634 * used to update an existing job, in which case gres_list is a copy 635 * of the job's original value (so we can clear fields as needed) 636 * IN cpus_per_tres, tres_freq, tres_per_*, mem_per_tres - job requested gres input strings 637 * IN/OUT num_tasks - requested task count, may be reset to provide 638 * consistent gres_per_node/task values 639 * IN/OUT min_nodes - requested minimum node count, may be reset to provide 640 * consistent gres_per_node/task values 641 * IN/OUT max_nodes - requested maximum node count, may be reset to provide 642 * consistent gres_per_node/task values 643 * IN/OUT ntasks_per_node - requested tasks_per_node count, may be reset to 644 * provide consistent gres_per_node/task values 645 * IN/OUT ntasks_per_socket - requested ntasks_per_socket count, may be reset to 646 * provide consistent gres_per_node/task values 647 * IN/OUT sockets_per_node - requested sockets_per_node count, may be reset to 648 * provide consistent gres_per_socket/node values 649 * IN/OUT cpus_per_task - requested cpus_per_task count, may be reset to 650 * provide consistent gres_per_task/cpus_per_gres values 651 * OUT gres_list - List of GRES records for this job to track usage 652 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES 653 */ 654 extern int
gres_plugin_job_state_validate(char *cpus_per_tres, 655 char *tres_freq, 656 char *tres_per_job, 657 char *tres_per_node, 658 char *tres_per_socket, 659 char *tres_per_task, 660 char *mem_per_tres, 661 uint32_t *num_tasks, 662 uint32_t *min_nodes, 663 uint32_t *max_nodes, 664 uint16_t *ntasks_per_node, 665 uint16_t *ntasks_per_socket, 666 uint16_t *sockets_per_node, 667 uint16_t *cpus_per_task, 668 List *gres_list); 669 670 /* 671 * Determine if a job's specified GRES can be supported. This is designed to 672 * prevent the running of a job using the GRES options only supported by the 673 * select/cons_tres plugin when switching (on slurmctld restart) from the 674 * cons_tres plugin to any other select plugin. 675 * 676 * IN gres_list - List of GRES records for this job to track usage 677 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES 678 */ 679 extern int gres_plugin_job_revalidate(List gres_list); 680 681 /* 682 * Determine if a job's specified GRES are currently valid. This is designed to 683 * manage jobs allocated GRES which are either no longer supported or a GRES 684 * configured with the "File" option in gres.conf where the count has changed, 685 * in which case we don't know how to map the job's old GRES bitmap onto the 686 * current GRES bitmaps.
687 * 688 * IN job_id - ID of job being validated (used for logging) 689 * IN job_gres_list - List of GRES records for this job to track usage 690 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES 691 */ 692 extern int gres_plugin_job_revalidate2(uint32_t job_id, List job_gres_list, 693 bitstr_t *node_bitmap); 694 695 /* 696 * Clear GRES allocation info for all job GRES at start of scheduling cycle 697 * Return TRUE if there are any gres_per_job constraints to satisfy 698 */ 699 extern bool gres_plugin_job_sched_init(List job_gres_list); 700 701 /* 702 * Return TRUE if all gres_per_job specifications are satisfied 703 */ 704 extern bool gres_plugin_job_sched_test(List job_gres_list, uint32_t job_id); 705 706 /* 707 * Return TRUE if all gres_per_job specifications will be satisfied with 708 * the additional resources provided by a single node 709 * IN job_gres_list - List of job's GRES requirements (gres_job_state_t) 710 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t) 711 * IN job_id - The job being tested 712 */ 713 extern bool gres_plugin_job_sched_test2(List job_gres_list, List sock_gres_list, 714 uint32_t job_id); 715 716 /* 717 * Update a job's total_gres counter as we add a node to potential allocation 718 * IN job_gres_list - List of job's GRES requirements (gres_job_state_t) 719 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t) 720 * IN avail_cpus - CPUs currently available on this node 721 */ 722 extern void gres_plugin_job_sched_add(List job_gres_list, List sock_gres_list, 723 uint16_t avail_cpus); 724 725 /* 726 * Create/update List of GRES that can be made available on the specified node 727 * IN/OUT consec_gres - List of sock_gres_t that can be made available on 728 * a set of nodes 729 * IN job_gres_list - List of job's GRES requirements (gres_job_state_t) 730 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t) 731 */ 732 extern void gres_plugin_job_sched_consec(List *consec_gres, List
job_gres_list, 733 List sock_gres_list); 734 735 /* 736 * Determine if the additional sock_gres_list resources will result in 737 * satisfying the job's gres_per_job constraints 738 * IN job_gres_list - job's GRES requirements 739 * IN sock_gres_list - available GRES in a set of nodes, data structure built 740 * by gres_plugin_job_sched_consec() 741 */ 742 extern bool gres_plugin_job_sched_sufficient(List job_gres_list, 743 List sock_gres_list); 744 745 /* 746 * Given a List of sock_gres_t entries, return a string identifying the 747 * count of each GRES available on this set of nodes 748 * IN sock_gres_list - count of GRES available in this group of nodes 749 * IN job_gres_list - job GRES specification, used only to get GRES name/type 750 * RET xfree the returned string 751 */ 752 extern char *gres_plugin_job_sched_str(List sock_gres_list, List job_gres_list); 753 754 /* 755 * Create a (partial) copy of a job's gres state for job binding 756 * IN gres_list - List of Gres records for this job to track usage 757 * RET The copy or NULL on failure 758 * NOTE: Only gres_cnt_alloc, node_cnt and gres_bit_alloc are copied 759 * Job step details are NOT copied.
760 */ 761 extern List gres_plugin_job_state_dup(List gres_list); 762 763 /* 764 * Create a (partial) copy of a job's gres state for a particular node index 765 * IN gres_list - List of Gres records for this job to track usage 766 * IN node_index - zero-origin index to the node 767 * RET The copy or NULL on failure 768 */ 769 extern List gres_plugin_job_state_extract(List gres_list, int node_index); 770 771 /* 772 * Pack a job's current gres status, called from slurmctld for save/restore 773 * IN gres_list - generated by gres_plugin_job_config_validate() 774 * IN/OUT buffer - location to write state to 775 * IN job_id - job's ID 776 * IN details - if set then pack job step allocation details (only needed to 777 * save/restore job state, not needed in job credential for 778 * slurmd task binding) 779 * 780 * NOTE: A job's allocation to steps is not recorded here, but recovered with 781 * the job step state information upon slurmctld restart. 782 */ 783 extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, 784 uint32_t job_id, bool details, 785 uint16_t protocol_version); 786 787 /* 788 * Unpack a job's current gres status, called from slurmctld for save/restore 789 * OUT gres_list - restored state stored by gres_plugin_job_state_pack() 790 * IN/OUT buffer - location to read state from 791 * IN job_id - job's ID 792 */ 793 extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer, 794 uint32_t job_id, 795 uint16_t protocol_version); 796 797 /* 798 * Clear the core_bitmap for cores which are not usable by this job (i.e.
for 799 * cores which are already bound to other jobs or lack GRES) 800 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 801 * IN node_gres_list - node's gres_list built by 802 * gres_plugin_node_config_validate() 803 * IN use_total_gres - if set then consider all GRES resources as available, 804 * and none are committed to running jobs 805 * IN/OUT core_bitmap - Identification of available cores (NULL if no restriction) 806 * IN core_start_bit - index into core_bitmap for this node's first cores 807 * IN core_end_bit - index into core_bitmap for this node's last cores * IN node_name - name of the node (presumably for logging, as in gres_plugin_job_test() - TODO confirm) 808 */ 809 extern void gres_plugin_job_core_filter(List job_gres_list, List node_gres_list, 810 bool use_total_gres, 811 bitstr_t *core_bitmap, 812 int core_start_bit, int core_end_bit, 813 char *node_name); 814 815 /* 816 * Determine how many cores on the node can be used by this job 817 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 818 * IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate() 819 * IN use_total_gres - if set then consider all gres resources as available, 820 * and none are committed to running jobs 821 * IN core_bitmap - Identification of available cores (NULL if no restriction) 822 * IN core_start_bit - index into core_bitmap for this node's first core 823 * IN core_end_bit - index into core_bitmap for this node's last core 824 * IN job_id - job's ID (for logging) 825 * IN node_name - name of the node (for logging) 826 * IN disable_binding - --gres-flags=disable-binding 827 * RET: NO_VAL - All cores on node are available 828 * otherwise - Count of available cores 829 */ 830 extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, 831 bool use_total_gres, bitstr_t *core_bitmap, 832 int core_start_bit, int core_end_bit, 833 uint32_t job_id, char *node_name, 834 bool disable_binding); 835 836 /* 837 * Determine how many cores on each socket of a node can be used by this job 838 *
IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 839 * IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate() 840 * IN use_total_gres - if set then consider all gres resources as available, 841 * and none are commited to running jobs 842 * IN/OUT core_bitmap - Identification of available cores on this node 843 * IN sockets - Count of sockets on the node 844 * IN cores_per_sock - Count of cores per socket on this node 845 * IN job_id - job's ID (for logging) 846 * IN node_name - name of the node (for logging) 847 * IN enforce_binding - if true then only use GRES with direct access to cores 848 * IN s_p_n - Expected sockets_per_node (NO_VAL if not limited) 849 * OUT req_sock_map - bitmap of specific requires sockets 850 * IN user_id - job's user ID 851 * IN node_inx - index of node to be evaluated 852 * RET: List of sock_gres_t entries identifying what resources are available on 853 * each core. Returns NULL if none available. Call FREE_NULL_LIST() to 854 * release memory. 855 */ 856 extern List gres_plugin_job_test2(List job_gres_list, List node_gres_list, 857 bool use_total_gres, bitstr_t *core_bitmap, 858 uint16_t sockets, uint16_t cores_per_sock, 859 uint32_t job_id, char *node_name, 860 bool enforce_binding, uint32_t s_p_n, 861 bitstr_t **req_sock_map, uint32_t user_id, 862 const uint32_t node_inx); 863 864 /* 865 * Determine which GRES can be used on this node given the available cores. 866 * Filter out unusable GRES. 
867 * IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2() 868 * IN avail_mem - memory available for the job 869 * IN max_cpus - maximum CPUs available on this node (limited by 870 * specialized cores and partition CPUs-per-node) 871 * IN enforce_binding - GRES must be co-allocated with cores 872 * IN core_bitmap - Identification of available cores on this node 873 * IN sockets - Count of sockets on the node 874 * IN cores_per_sock - Count of cores per socket on this node 875 * IN cpus_per_core - Count of CPUs per core on this node 876 * IN sock_per_node - sockets requested by job per node or NO_VAL 877 * IN task_per_node - tasks requested by job per node or NO_VAL16 878 * IN whole_node - we are requesting the whole node or not 879 * OUT avail_gpus - Count of available GPUs on this node 880 * OUT near_gpus - Count of GPUs available on sockets with available CPUs 881 * RET - 0 if job can use this node, -1 otherwise (some GRES limit prevents use) 882 */ 883 extern int gres_plugin_job_core_filter2(List sock_gres_list, uint64_t avail_mem, 884 uint16_t max_cpus, 885 bool enforce_binding, 886 bitstr_t *core_bitmap, 887 uint16_t sockets, 888 uint16_t cores_per_sock, 889 uint16_t cpus_per_core, 890 uint32_t sock_per_node, 891 uint16_t task_per_node, 892 bool whole_node, 893 uint16_t *avail_gpus, 894 uint16_t *near_gpus); 895 896 /* 897 * Determine how many tasks can be started on a given node and which 898 * sockets/cores are required 899 * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero 900 * IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2() 901 * IN sockets - Count of sockets on the node 902 * IN cores_per_socket - Count of cores per socket on the node 903 * IN cpus_per_core - Count of CPUs per core on the node 904 * IN avail_cpus - Count of available CPUs on the node, UPDATED 905 * IN min_tasks_this_node - Minimum count of tasks that can be started on this 906 * node, UPDATED 907 * IN 
max_tasks_this_node - Maximum count of tasks that can be started on this 908 * node or NO_VAL, UPDATED 909 * IN rem_nodes - desired additional node count to allocate, including this node 910 * IN enforce_binding - GRES must be co-allocated with cores 911 * IN first_pass - set if first scheduling attempt for this job, use 912 * co-located GRES and cores if possible 913 * IN avail_cores - cores available on this node, UPDATED 914 */ 915 extern void gres_plugin_job_core_filter3(gres_mc_data_t *mc_ptr, 916 List sock_gres_list, 917 uint16_t sockets, 918 uint16_t cores_per_socket, 919 uint16_t cpus_per_core, 920 uint16_t *avail_cpus, 921 uint32_t *min_tasks_this_node, 922 uint32_t *max_tasks_this_node, 923 int rem_nodes, 924 bool enforce_binding, 925 bool first_pass, 926 bitstr_t *avail_core); 927 928 /* 929 * Return the maximum number of tasks that can be started on a node with 930 * sock_gres_list (per-socket GRES details for some node) 931 */ 932 extern uint32_t gres_plugin_get_task_limit(List sock_gres_list); 933 934 /* 935 * Make final GRES selection for the job 936 * sock_gres_list IN - per-socket GRES details, one record per allocated node 937 * job_id IN - job ID for logging 938 * job_res IN - job resource allocation 939 * overcommit IN - job's ability to overcommit resources 940 * tres_mc_ptr IN - job's multi-core options 941 * node_table_ptr IN - slurmctld's node records 942 * RET SLURM_SUCCESS or error code 943 */ 944 extern int gres_plugin_job_core_filter4(List *sock_gres_list, uint32_t job_id, 945 struct job_resources *job_res, 946 uint8_t overcommit, 947 gres_mc_data_t *tres_mc_ptr, 948 node_record_t *node_table_ptr); 949 950 /* 951 * Determine if job GRES specification includes a tres-per-task specification 952 * RET TRUE if any GRES requested by the job include a tres-per-task option 953 */ 954 extern bool gres_plugin_job_tres_per_task(List job_gres_list); 955 956 /* 957 * Determine if the job GRES specification includes a mem-per-tres specification 958 * 
RET largest mem-per-tres specification found 959 */ 960 extern uint64_t gres_plugin_job_mem_max(List job_gres_list); 961 962 /* 963 * Set per-node memory limits based upon GRES assignments 964 * RET TRUE if mem-per-tres specification used to set memory limits 965 */ 966 extern bool gres_plugin_job_mem_set(List job_gres_list, 967 job_resources_t *job_res); 968 969 /* 970 * Determine the minimum number of CPUs required to satify the job's GRES 971 * request (based upon total GRES times cpus_per_gres value) 972 * node_count IN - count of nodes in job allocation 973 * sockets_per_node IN - count of sockets per node in job allocation 974 * task_count IN - count of tasks in job allocation 975 * job_gres_list IN - job GRES specification 976 * RET count of required CPUs for the job 977 */ 978 extern int gres_plugin_job_min_cpus(uint32_t node_count, 979 uint32_t sockets_per_node, 980 uint32_t task_count, 981 List job_gres_list); 982 983 /* 984 * Determine the minimum number of CPUs required to satify the job's GRES 985 * request on one node 986 * sockets_per_node IN - count of sockets per node in job allocation 987 * tasks_per_node IN - count of tasks per node in job allocation 988 * job_gres_list IN - job GRES specification 989 * RET count of required CPUs for the job 990 */ 991 extern int gres_plugin_job_min_cpu_node(uint32_t sockets_per_node, 992 uint32_t tasks_per_node, 993 List job_gres_list); 994 995 /* 996 * Fill in job_gres_list with the total amount of GRES on a node. 997 * OUT job_gres_list - This list will be destroyed and remade with all GRES on 998 * node. 
999 * IN node_gres_list - node's gres_list built by 1000 * gres_plugin_node_config_validate() 1001 * IN job_id - job's ID (for logging) 1002 * IN node_name - name of the node (for logging) 1003 * RET SLURM_SUCCESS or error code 1004 */ 1005 extern int gres_plugin_job_select_whole_node( 1006 List *job_gres_list, List node_gres_list, 1007 uint32_t job_id, char *node_name); 1008 1009 /* 1010 * Select and allocate all GRES on a node to a job and update node and job GRES 1011 * information 1012 * IN job_gres_list - job's gres_list built by gres_plugin_job_whole_node(). 1013 * IN node_gres_list - node's gres_list built by 1014 * gres_plugin_node_config_validate() 1015 * IN node_cnt - total number of nodes originally allocated to the job 1016 * IN node_index - zero-origin global node index 1017 * IN node_offset - zero-origin index in job allocation to the node of interest 1018 * IN job_id - job's ID (for logging) 1019 * IN node_name - name of the node (for logging) 1020 * IN core_bitmap - cores allocated to this job on this node (NULL if not 1021 * available) 1022 * IN user_id - job's user ID 1023 * RET SLURM_SUCCESS or error code 1024 */ 1025 extern int gres_plugin_job_alloc_whole_node( 1026 List job_gres_list, List node_gres_list, 1027 int node_cnt, int node_index, int node_offset, 1028 uint32_t job_id, char *node_name, 1029 bitstr_t *core_bitmap, uint32_t user_id); 1030 1031 /* 1032 * Select and allocate GRES to a job and update node and job GRES information 1033 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1034 * IN node_gres_list - node's gres_list built by 1035 * gres_plugin_node_config_validate() 1036 * IN node_cnt - total number of nodes originally allocated to the job 1037 * IN node_index - zero-origin global node index 1038 * IN node_offset - zero-origin index in job allocation to the node of interest 1039 * IN job_id - job's ID (for logging) 1040 * IN node_name - name of the node (for logging) 1041 * IN core_bitmap - cores 
allocated to this job on this node (NULL if not 1042 * available) 1043 * IN user_id - job's user ID 1044 * RET SLURM_SUCCESS or error code 1045 */ 1046 extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list, 1047 int node_cnt, int node_index, int node_offset, 1048 uint32_t job_id, char *node_name, 1049 bitstr_t *core_bitmap, uint32_t user_id); 1050 1051 /* Clear any vestigial job gres state. This may be needed on job requeue. */ 1052 extern void gres_plugin_job_clear(List job_gres_list); 1053 1054 /* 1055 * Deallocate resource from a job and update node and job gres information 1056 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1057 * IN node_gres_list - node's gres_list built by 1058 * gres_plugin_node_config_validate() 1059 * IN node_offset - zero-origin index to the node of interest 1060 * IN job_id - job's ID (for logging) 1061 * IN node_name - name of the node (for logging) 1062 * IN old_job - true if job started before last slurmctld reboot. 1063 * Immediately after slurmctld restart and before the node's 1064 * registration, the GRES type and topology. This results in 1065 * some incorrect internal bookkeeping, but does not cause 1066 * failures in terms of allocating GRES to jobs. 1067 * IN user_id - job's user ID 1068 * IN: job_fini - job fully terminating on this node (not just a test) 1069 * RET SLURM_SUCCESS or error code 1070 */ 1071 extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list, 1072 int node_offset, uint32_t job_id, 1073 char *node_name, bool old_job, 1074 uint32_t user_id, bool job_fini); 1075 1076 /* 1077 * Merge one job's gres allocation into another job's gres allocation. 
1078 * IN from_job_gres_list - List of gres records for the job being merged 1079 * into another job 1080 * IN from_job_node_bitmap - bitmap of nodes for the job being merged into 1081 * another job 1082 * IN/OUT to_job_gres_list - List of gres records for the job being merged 1083 * into job 1084 * IN to_job_node_bitmap - bitmap of nodes for the job being merged into 1085 */ 1086 extern void gres_plugin_job_merge(List from_job_gres_list, 1087 bitstr_t *from_job_node_bitmap, 1088 List to_job_gres_list, 1089 bitstr_t *to_job_node_bitmap); 1090 1091 /* 1092 * Set environment variables as required for a batch job 1093 * IN/OUT job_env_ptr - environment variable array 1094 * IN gres_list - generated by gres_plugin_job_alloc() 1095 * IN node_inx - zero origin node index 1096 */ 1097 extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list, 1098 int node_inx); 1099 1100 /* 1101 * Set job default parameters in a given element of a list 1102 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1103 * IN gres_name - name of gres, apply defaults to all elements (e.g. updates to 1104 * gres_name="gpu" would apply to "gpu:tesla", "gpu:volta", etc.) 1105 * IN cpu_per_gpu - value to set as default 1106 * IN mem_per_gpu - value to set as default 1107 */ 1108 extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name, 1109 uint64_t cpu_per_gpu, 1110 uint64_t mem_per_gpu); 1111 1112 /* 1113 * Extract from the job record's gres_list the count of allocated resources of 1114 * the named gres type. 1115 * IN job_gres_list - job record's gres_list. 1116 * IN gres_name_type - the name of the gres type to retrieve the associated 1117 * value from. 1118 * RET The value associated with the gres type or NO_VAL if not found. 
1119 */ 1120 extern uint64_t gres_plugin_get_job_value_by_type(List job_gres_list, 1121 char *gres_name_type); 1122 1123 /* 1124 * Log a job's current gres state 1125 * IN gres_list - generated by gres_plugin_job_state_validate() 1126 * IN job_id - job's ID 1127 */ 1128 extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id); 1129 1130 /* 1131 * Given a step's requested gres configuration, validate it and build gres list 1132 * IN *tres* - step's request's gres input string 1133 * OUT step_gres_list - List of Gres records for this step to track usage 1134 * IN job_gres_list - List of Gres records for this job 1135 * IN job_id, step_id - ID of the step being allocated. 1136 * RET SLURM_SUCCESS or ESLURM_INVALID_GRES 1137 */ 1138 extern int gres_plugin_step_state_validate(char *cpus_per_tres, 1139 char *tres_per_step, 1140 char *tres_per_node, 1141 char *tres_per_socket, 1142 char *tres_per_task, 1143 char *mem_per_tres, 1144 List *step_gres_list, 1145 List job_gres_list, uint32_t job_id, 1146 uint32_t step_id); 1147 1148 /* 1149 * Create a copy of a step's gres state 1150 * IN gres_list - List of Gres records for this step to track usage 1151 * RET The copy or NULL on failure 1152 */ 1153 List gres_plugin_step_state_dup(List gres_list); 1154 1155 /* 1156 * Create a copy of a step's gres state for a particular node index 1157 * IN gres_list - List of Gres records for this step to track usage 1158 * IN node_index - zero-origin index to the node 1159 * RET The copy or NULL on failure 1160 */ 1161 List gres_plugin_step_state_extract(List gres_list, int node_index); 1162 1163 /* 1164 * A job allocation size has changed. Update the job step gres information 1165 * bitmaps and other data structures. 
1166 * IN gres_list - List of Gres records for this step to track usage 1167 * IN orig_job_node_bitmap - bitmap of nodes in the original job allocation 1168 * IN new_job_node_bitmap - bitmap of nodes in the new job allocation 1169 */ 1170 void gres_plugin_step_state_rebase(List gres_list, 1171 bitstr_t *orig_job_node_bitmap, 1172 bitstr_t *new_job_node_bitmap); 1173 1174 /* 1175 * Pack a step's current gres status, called from slurmctld for save/restore 1176 * IN gres_list - generated by gres_plugin_step_allocate() 1177 * IN/OUT buffer - location to write state to 1178 * IN job_id, step_id - job and step ID for logging 1179 */ 1180 extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, 1181 uint32_t job_id, uint32_t step_id, 1182 uint16_t protocol_version); 1183 1184 /* 1185 * Unpack a step's current gres status, called from slurmctld for save/restore 1186 * OUT gres_list - restored state stored by gres_plugin_step_state_pack() 1187 * IN/OUT buffer - location to read state from 1188 * IN job_id, step_id - job and step ID for logging 1189 */ 1190 extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, 1191 uint32_t job_id, uint32_t step_id, 1192 uint16_t protocol_version); 1193 1194 /* Return the count of GRES of a specific name on this machine 1195 * IN step_gres_list - generated by gres_plugin_step_allocate() 1196 * IN gres_name - name of the GRES to match 1197 * RET count of GRES of this specific name available to the job or NO_VAL64 1198 */ 1199 extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name); 1200 1201 /* 1202 * Configure the GRES hardware allocated to the current step while privileged 1203 * 1204 * IN step_gres_list - Step's GRES specification 1205 * IN node_id - relative position of this node in step 1206 * IN settings - string containing configuration settings for the hardware 1207 */ 1208 extern void gres_plugin_step_hardware_init(List step_gres_list, 1209 uint32_t node_id, char *settings); 1210 1211 /* 
1212 * Optionally undo GRES hardware configuration while privileged 1213 */ 1214 extern void gres_plugin_step_hardware_fini(void); 1215 1216 /* 1217 * Set environment as required for all tasks of a job step 1218 * IN/OUT job_env_ptr - environment variable array 1219 * IN step_gres_list - generated by gres_plugin_step_alloc() 1220 * IN accel_bind_type - GRES binding options (old format, a bitmap) 1221 * IN tres_bind - TRES binding directives (new format, a string) 1222 * IN local_proc_id - task rank, local to this compute node only 1223 */ 1224 extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list, 1225 uint16_t accel_bind_type, char *tres_bind, 1226 int local_proc_id); 1227 1228 /* 1229 * Log a step's current gres state 1230 * IN gres_list - generated by gres_plugin_step_allocate() 1231 * IN job_id - job's ID 1232 */ 1233 extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id, 1234 uint32_t step_id); 1235 1236 /* 1237 * Determine how many cores of a job's allocation can be allocated to a step 1238 * on a specific node 1239 * IN job_gres_list - a running job's gres info 1240 * IN/OUT step_gres_list - a pending job step's gres requirements 1241 * IN node_offset - index into the job's node allocation 1242 * IN first_step_node - true if this is node zero of the step (do initialization) 1243 * IN cpus_per_task - number of CPUs required per task 1244 * IN max_rem_nodes - maximum nodes remaining for step (including this one) 1245 * IN ignore_alloc - if set ignore resources already allocated to running steps 1246 * IN job_id, step_id - ID of the step being allocated. 
1247 * RET Count of available cores on this node (sort of): 1248 * NO_VAL64 if no limit or 0 if node is not usable 1249 */ 1250 extern uint64_t gres_plugin_step_test(List step_gres_list, List job_gres_list, 1251 int node_offset, bool first_step_node, 1252 uint16_t cpus_per_task, int max_rem_nodes, 1253 bool ignore_alloc, 1254 uint32_t job_id, uint32_t step_id); 1255 1256 /* 1257 * Allocate resource to a step and update job and step gres information 1258 * IN step_gres_list - step's gres_list built by 1259 * gres_plugin_step_state_validate() 1260 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1261 * IN node_offset - job's zero-origin index to the node of interest 1262 * IN first_step_node - true if this is node zero of the step (do initialization) 1263 * IN tasks_on_node - number of tasks to be launched on this node 1264 * IN rem_nodes - desired additional node count to allocate, including this node 1265 * IN job_id, step_id - ID of the step being allocated. 1266 * RET SLURM_SUCCESS or error code 1267 */ 1268 extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list, 1269 int node_offset, bool first_step_node, 1270 uint16_t tasks_on_node, uint32_t rem_nodes, 1271 uint32_t job_id, uint32_t step_id); 1272 1273 /* 1274 * Deallocate resource to a step and update job and step gres information 1275 * IN step_gres_list - step's gres_list built by 1276 * gres_plugin_step_state_validate() 1277 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1278 * IN job_id, step_id - ID of the step being allocated. 
1279 * RET SLURM_SUCCESS or error code 1280 */ 1281 extern int gres_plugin_step_dealloc(List step_gres_list, List job_gres_list, 1282 uint32_t job_id, uint32_t step_id); 1283 1284 /* 1285 * Build a string containing the GRES details for a given node and socket 1286 * sock_gres_list IN - List of sock_gres_t entries 1287 * sock_inx IN - zero-origin socket for which information is to be returned 1288 * RET string, must call xfree() to release memory 1289 */ 1290 extern char *gres_plugin_sock_str(List sock_gres_list, int sock_inx); 1291 1292 /* 1293 * Map a given GRES type ID back to a GRES type name. 1294 * gres_id IN - GRES type ID to search for. 1295 * gres_name IN - Pre-allocated string in which to store the GRES type name. 1296 * gres_name_len - Size of gres_name in bytes 1297 * RET - error code (currently not used--always return SLURM_SUCCESS) 1298 */ 1299 extern int gres_gresid_to_gresname(uint32_t gres_id, char* gres_name, 1300 int gres_name_len); 1301 1302 /* 1303 * Determine total count GRES of a given type are allocated to a job across 1304 * all nodes 1305 * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() 1306 * IN gres_name - name of a GRES type 1307 * RET count of this GRES allocated to this job 1308 */ 1309 extern uint64_t gres_get_value_by_type(List job_gres_list, char *gres_name); 1310 1311 enum gres_job_data_type { 1312 GRES_JOB_DATA_COUNT, /* data-> uint64_t */ 1313 GRES_JOB_DATA_BITMAP, /* data-> bitstr_t* */ 1314 }; 1315 1316 /* 1317 * get data from a job's GRES data structure 1318 * IN job_gres_list - job's GRES data structure 1319 * IN gres_name - name of a GRES type 1320 * IN node_inx - zero-origin index of the node within the job's allocation 1321 * for which data is desired 1322 * IN data_type - type of data to get from the job's data 1323 * OUT data - pointer to the data from job's GRES data structure 1324 * DO NOT FREE: This is a pointer into the job's data structure 1325 * RET - SLURM_SUCCESS or error code 1326 
*/ 1327 extern int gres_get_job_info(List job_gres_list, char *gres_name, 1328 uint32_t node_inx, 1329 enum gres_job_data_type data_type, void *data); 1330 1331 /* Given a job's GRES data structure, return the indecies for selected elements 1332 * IN job_gres_list - job's GRES data structure 1333 * OUT gres_detail_cnt - Number of elements (nodes) in gres_detail_str 1334 * OUT gres_detail_str - Description of GRES on each node 1335 * OUT total_gres_str - String containing all gres in the job and counts. 1336 */ 1337 extern void gres_build_job_details(List job_gres_list, 1338 uint32_t *gres_detail_cnt, 1339 char ***gres_detail_str, 1340 char **total_gres_str); 1341 1342 enum gres_step_data_type { 1343 GRES_STEP_DATA_COUNT, /* data-> uint64_t */ 1344 GRES_STEP_DATA_BITMAP, /* data-> bitstr_t* */ 1345 }; 1346 1347 /* 1348 * get data from a step's GRES data structure 1349 * IN step_gres_list - step's GRES data structure 1350 * IN gres_name - name of a GRES type 1351 * IN node_inx - zero-origin index of the node within the job's allocation 1352 * for which data is desired. Note this can differ from the step's 1353 * node allocation index. 
1354 * IN data_type - type of data to get from the step's data 1355 * OUT data - pointer to the data from step's GRES data structure 1356 * DO NOT FREE: This is a pointer into the step's data structure 1357 * RET - SLURM_SUCCESS or error code 1358 */ 1359 extern int gres_get_step_info(List step_gres_list, char *gres_name, 1360 uint32_t node_inx, 1361 enum gres_step_data_type data_type, void *data); 1362 1363 extern gres_job_state_t *gres_get_job_state(List gres_list, char *name); 1364 extern gres_step_state_t *gres_get_step_state(List gres_list, char *name); 1365 1366 extern uint32_t gres_get_autodetect_types(void); 1367 1368 /* 1369 * Translate a gres_list into a tres_str 1370 * IN gres_list - filled in with gres_job_state_t or gres_step_state_t's 1371 * IN is_job - if is job function expects gres_job_state_t's else 1372 * gres_step_state_t's 1373 * IN locked - if the assoc_mgr tres read locked is locked or not 1374 * RET char * in a simple TRES format 1375 */ 1376 extern char *gres_2_tres_str(List gres_list, bool is_job, bool locked); 1377 1378 /* Fill in the job allocated tres_cnt based off the gres_list and node_cnt 1379 * IN gres_list - filled in with gres_job_state_t's 1380 * IN node_cnt - number of nodes in the job 1381 * OUT tres_cnt - gres spots filled in with total number of TRES 1382 * requested for job that are requested in gres_list 1383 * IN locked - if the assoc_mgr tres read locked is locked or not 1384 */ 1385 extern void gres_set_job_tres_cnt(List gres_list, 1386 uint32_t node_cnt, 1387 uint64_t *tres_cnt, 1388 bool locked); 1389 1390 /* Fill in the node allocated tres_cnt based off the gres_list 1391 * IN gres_list - filled in with gres_node_state_t's gres_alloc_cnt 1392 * OUT tres_cnt - gres spots filled in with total number of TRES 1393 * allocated on node 1394 * IN locked - if the assoc_mgr tres read locked is locked or not 1395 */ 1396 extern void gres_set_node_tres_cnt(List gres_list, uint64_t *tres_cnt, 1397 bool locked); 1398 1399 /* 
return the major info from a given path of a device */ 1400 extern char *gres_device_major(char *dev_path); 1401 1402 /* Free memory for gres_device_t record */ 1403 extern void destroy_gres_device(void *gres_device_ptr); 1404 1405 /* Destroy a gres_slurmd_conf_t record, free it's memory */ 1406 extern void destroy_gres_slurmd_conf(void *x); 1407 1408 /* 1409 * Convert GRES config_flags to a string. The pointer returned references local 1410 * storage in this function, which is not re-entrant. 1411 */ 1412 extern char *gres_flags2str(uint8_t config_flags); 1413 1414 /* 1415 * Creates a gres_slurmd_conf_t record to add to a list of gres_slurmd_conf_t 1416 * records 1417 */ 1418 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, 1419 int cpu_cnt, char *cpu_aff_abs_range, 1420 bitstr_t *cpu_aff_mac_bitstr, char *device_file, 1421 char *type, char *links); 1422 1423 #endif /* !_GRES_H */ 1424