/*****************************************************************************\
 *  slurmctld.h - definitions of functions and structures for slurmctld use
 *****************************************************************************
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Portions Copyright (C) 2010-2014 SchedMD <https://www.schedmd.com>.
 *  Produced at Lawrence Livermore National Laboratory (cf. DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov> et al.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#ifndef _HAVE_SLURMCTLD_H
#define _HAVE_SLURMCTLD_H

#include "config.h"

#include <inttypes.h>
#include <pthread.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

#include "slurm/slurm.h"

#include "src/common/bitstring.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/node_conf.h"
#include "src/common/pack.h"
#include "src/common/read_config.h" /* location of slurmctld_conf */
#include "src/common/job_resources.h"
#include "src/common/slurm_cred.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/switch.h"
#include "src/common/timers.h"
#include "src/common/xmalloc.h"

/*****************************************************************************\
 *  GENERAL CONFIGURATION parameters and data structures
\*****************************************************************************/
/* Maximum parallel threads to service incoming RPCs.
 * Also maximum parallel threads to service outgoing RPCs (separate counter).
 * Since some systems schedule pthreads on a First-In-Last-Out basis,
 * increasing this value is strongly discouraged. */
#ifndef MAX_SERVER_THREADS
#define MAX_SERVER_THREADS 256
#endif

/* Maximum number of threads to service emails (see MailProg) */
#ifndef MAX_MAIL_THREADS
#define MAX_MAIL_THREADS 64
#endif

/* Save slurmctld's full state every PERIODIC_CHECKPOINT seconds */
#ifndef PERIODIC_CHECKPOINT
#define PERIODIC_CHECKPOINT 300
#endif

/* Retry an incomplete RPC agent request every RPC_RETRY_INTERVAL seconds */
#ifndef RPC_RETRY_INTERVAL
#define RPC_RETRY_INTERVAL 60
#endif

/* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */
#ifndef PERIODIC_TIMEOUT
#define PERIODIC_TIMEOUT 30
#endif

/* Attempt to purge defunct job records and resend job kill requests
 * every PURGE_JOB_INTERVAL seconds */
#ifndef PURGE_JOB_INTERVAL
#define PURGE_JOB_INTERVAL 60
#endif

/* Process pending trigger events every TRIGGER_INTERVAL seconds */
#ifndef TRIGGER_INTERVAL
#define TRIGGER_INTERVAL 15
#endif

/* Report current node accounting state every PERIODIC_NODE_ACCT seconds */
#ifndef PERIODIC_NODE_ACCT
#define PERIODIC_NODE_ACCT 300
#endif

/* Pathname of group file record for checking update times */
#ifndef GROUP_FILE
#define GROUP_FILE "/etc/group"
#endif

/* Seconds to wait for backup controller response to REQUEST_CONTROL RPC */
#ifndef CONTROL_TIMEOUT
#define CONTROL_TIMEOUT 30 /* seconds */
#endif

/* Maximum number of requeue attempts before the job is placed in
 * JOB_REQUEUE_HOLD state with reason JobHeldUser.
 */
#ifndef MAX_BATCH_REQUEUE
#define MAX_BATCH_REQUEUE 5
#endif

/*****************************************************************************\
 *  General configuration parameters and data structures
\*****************************************************************************/

typedef struct slurmctld_config {
    char *auth_info;
    pthread_cond_t backup_finish_cond; /* use thread_count_lock */
    time_t boot_time;
    int daemonize;
    char node_name_long[MAX_SLURM_NAME];
    char node_name_short[MAX_SLURM_NAME];
    bool resume_backup;
    bool scheduling_disabled;
    int server_thread_count;
    time_t shutdown_time;
    bool submissions_disabled;

    slurm_cred_ctx_t cred_ctx;
    pthread_cond_t thread_count_cond;
    pthread_mutex_t thread_count_lock;
    pthread_t thread_id_main;
    pthread_t thread_id_save;
    pthread_t thread_id_sig;
    pthread_t thread_id_power;
    pthread_t thread_id_purge_files;
    pthread_t thread_id_rpc;
} slurmctld_config_t;

/* Job scheduling statistics */
typedef struct diag_stats {
    int proc_req_threads;
    int proc_req_raw;

    uint32_t schedule_cycle_max;
    uint32_t schedule_cycle_last;
    uint32_t schedule_cycle_sum;
    uint32_t schedule_cycle_counter;
    uint32_t schedule_cycle_depth;
    uint32_t schedule_queue_len;

    uint32_t jobs_submitted;
    uint32_t jobs_started;
    uint32_t jobs_completed;
    uint32_t jobs_canceled;
    uint32_t jobs_failed;

    uint32_t job_states_ts;
    uint32_t jobs_pending;
    uint32_t jobs_running;

    uint32_t backfilled_jobs;
    uint32_t last_backfilled_jobs;
    uint32_t backfilled_het_jobs;
    uint32_t bf_active;
    uint32_t bf_cycle_counter;
    uint32_t bf_cycle_last;
    uint32_t bf_cycle_max;
    uint64_t bf_cycle_sum;
    uint32_t bf_depth_sum;
    uint32_t bf_depth_try_sum;
    uint32_t bf_last_depth;
    uint32_t bf_last_depth_try;
    uint32_t bf_queue_len;
    uint32_t bf_queue_len_sum;
    uint32_t bf_table_size;
    uint32_t bf_table_size_sum;
    time_t bf_when_last_cycle;

    uint32_t latency;
} diag_stats_t;

/* This is used to point out constants that exist in the
 * curr_tres_array in tres_info_t. This should be in the same order as
 * the tres_types_t enum that is defined in src/common/slurmdb_defs.h
 */
enum {
    TRES_ARRAY_CPU = 0,
    TRES_ARRAY_MEM,
    TRES_ARRAY_ENERGY,
    TRES_ARRAY_NODE,
    TRES_ARRAY_BILLING,
    TRES_ARRAY_FS_DISK,
    TRES_ARRAY_VMEM,
    TRES_ARRAY_PAGES,
    TRES_ARRAY_TOTAL_CNT
};

extern bool preempt_send_user_signal;
extern time_t last_proc_req_start;
extern diag_stats_t slurmctld_diag_stats;
extern slurmctld_config_t slurmctld_config;
extern void *acct_db_conn;
extern uint16_t accounting_enforce;
extern int association_based_accounting;
extern int backup_inx;          /* BackupController# index */
extern int batch_sched_delay;
extern time_t control_time;     /* Time when became primary controller */
extern uint32_t cluster_cpus;
extern bool disable_remote_singleton;
extern int max_depend_depth;
extern bool node_features_updated;
extern pthread_cond_t purge_thread_cond;
extern pthread_mutex_t purge_thread_lock;
extern pthread_mutex_t check_bf_running_lock;
extern int sched_interval;
extern bool slurmctld_init_db;
extern int slurmctld_primary;
extern int slurmctld_tres_cnt;
extern slurmdb_cluster_rec_t *response_cluster_rec;
extern bool test_config;
extern int test_config_rc;
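/*
 * Illustrative sketch (not part of the slurmctld API): derive the mean main
 * scheduler cycle time from the diag_stats_t counters above, much as sdiag
 * reports it. Assumes the caller samples slurmctld_diag_stats under
 * appropriate locking and that schedule_cycle_sum accumulates the same units
 * (microseconds) as schedule_cycle_last; the helper name is hypothetical.
 */
static inline uint32_t example_mean_sched_cycle(const diag_stats_t *stats)
{
    if (stats->schedule_cycle_counter == 0)
        return 0;   /* no scheduling cycles recorded yet */
    return (uint32_t) (stats->schedule_cycle_sum /
                       stats->schedule_cycle_counter);
}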
/*****************************************************************************\
 *  NODE parameters and data structures, mostly in src/common/node_conf.h
\*****************************************************************************/
extern bool ping_nodes_now;     /* if set, ping nodes immediately */
extern bool want_nodes_reboot;  /* if set, check for idle nodes */
extern bool ignore_state_errors;

typedef struct node_features {
    uint32_t magic;             /* magic cookie to test data integrity */
    char *name;                 /* name of a feature */
    bitstr_t *node_bitmap;      /* bitmap of nodes with this feature */
} node_feature_t;

extern List active_feature_list; /* list of currently active node features */
extern List avail_feature_list;  /* list of available node features */

/*****************************************************************************\
 *  NODE states and bitmaps
 *
 *  avail_node_bitmap    Set if node's state is not DOWN, DRAINING/DRAINED,
 *                       FAILING or NO_RESPOND (i.e. available to run a job)
 *  booting_node_bitmap  Set if node in process of booting
 *  cg_node_bitmap       Set if node in completing state
 *  future_node_bitmap   Set if node in FUTURE state
 *  idle_node_bitmap     Set if node has no jobs allocated to it
 *  power_node_bitmap    Set for nodes which are powered down
 *  share_node_bitmap    Set if no jobs allocated exclusive access to
 *                       resources on that node (cleared if --exclusive
 *                       option specified by job or Shared=NO configured for
 *                       the job's partition)
 *  up_node_bitmap       Set if the node's state is not DOWN
\*****************************************************************************/
extern bitstr_t *avail_node_bitmap;     /* bitmap of available nodes,
                                         * state not DOWN, DRAIN or FAILING */
extern bitstr_t *bf_ignore_node_bitmap; /* bitmap of nodes made available
                                         * during backfill cycle */
extern bitstr_t *booting_node_bitmap;   /* bitmap of booting nodes */
extern bitstr_t *cg_node_bitmap;        /* bitmap of completing nodes */
extern bitstr_t *future_node_bitmap;    /* bitmap of FUTURE nodes */
extern bitstr_t *idle_node_bitmap;      /* bitmap of idle nodes */
extern bitstr_t *power_node_bitmap;     /* bitmap of powered down nodes */
extern bitstr_t *share_node_bitmap;     /* bitmap of sharable nodes */
extern bitstr_t *up_node_bitmap;        /* bitmap of up nodes, not DOWN */
extern bitstr_t *rs_node_bitmap;        /* next_state=resume nodes */
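/*
 * Illustrative sketch (hypothetical helper, not part of the API): combine two
 * of the node bitmaps above with the src/common/bitstring.h primitives to
 * count nodes that are both idle and available for scheduling. Assumes the
 * caller holds the node read lock so the bitmaps cannot change underneath it.
 */
static inline int example_idle_avail_node_count(void)
{
    bitstr_t *tmp = bit_copy(idle_node_bitmap); /* start from the idle set */
    int cnt;

    bit_and(tmp, avail_node_bitmap);    /* keep only available nodes */
    cnt = bit_set_count(tmp);
    FREE_NULL_BITMAP(tmp);
    return cnt;
}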
/*****************************************************************************\
 *  FRONT_END parameters and data structures
\*****************************************************************************/
#define FRONT_END_MAGIC 0xfe9b82fe

typedef struct {
    uint32_t magic;             /* magic cookie to test data integrity */
    /* DO NOT ALPHABETIZE */
    gid_t *allow_gids;          /* zero terminated list of allowed groups */
    char *allow_groups;         /* allowed group string */
    uid_t *allow_uids;          /* zero terminated list of allowed users */
    char *allow_users;          /* allowed user string */
    time_t boot_time;           /* Time of node boot,
                                 * computed from up_time */
    char *comm_name;            /* communications path name to node */
    gid_t *deny_gids;           /* zero terminated list of denied groups */
    char *deny_groups;          /* denied group string */
    uid_t *deny_uids;           /* zero terminated list of denied users */
    char *deny_users;           /* denied user string */
    uint32_t job_cnt_comp;      /* count of completing jobs on node */
    uint16_t job_cnt_run;       /* count of running or suspended jobs */
    time_t last_response;       /* Time of last communication */
    char *name;                 /* frontend node name */
    uint32_t node_state;        /* enum node_states, ORed with
                                 * NODE_STATE_NO_RESPOND if not
                                 * responding */
    bool not_responding;        /* set if fails to respond,
                                 * clear after logging this */
    slurm_addr_t slurm_addr;    /* network address */
    uint16_t port;              /* frontend specific port */
    uint16_t protocol_version;  /* Slurm version number */
    char *reason;               /* reason for down frontend node */
    time_t reason_time;         /* Time stamp when reason was set,
                                 * ignore if no reason is set. */
    uint32_t reason_uid;        /* User that set the reason, ignore if
                                 * no reason is set. */
    time_t slurmd_start_time;   /* Time of slurmd startup */
    char *version;              /* Slurm version */
} front_end_record_t;

extern front_end_record_t *front_end_nodes;
extern uint16_t front_end_node_cnt;
extern time_t last_front_end_update;    /* time of last front_end update */
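/*
 * Illustrative sketch: front_end_nodes is a plain array of
 * front_end_node_cnt records, so lookup by name is a linear scan. This
 * hypothetical helper assumes the caller serializes access with the usual
 * controller locks.
 */
static inline front_end_record_t *example_find_front_end(const char *name)
{
    for (int i = 0; i < front_end_node_cnt; i++) {
        if (front_end_nodes[i].name &&
            (strcmp(front_end_nodes[i].name, name) == 0))
            return &front_end_nodes[i];
    }
    return NULL;    /* no such front-end node */
}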
/*****************************************************************************\
 *  PARTITION parameters and data structures
\*****************************************************************************/
#define PART_MAGIC 0xaefe8495

typedef struct {
    slurmdb_bf_usage_t *job_usage;
    slurmdb_bf_usage_t *resv_usage;
    xhash_t *user_usage;
} bf_part_data_t;

typedef struct {
    uint32_t magic;             /* magic cookie to test data integrity */
    /* DO NOT ALPHABETIZE */
    char *allow_accounts;       /* comma delimited list of accounts,
                                 * NULL indicates all */
    char **allow_account_array; /* NULL terminated list of allowed
                                 * accounts */
    char *allow_alloc_nodes;    /* comma delimited list of allowed
                                 * allocating nodes,
                                 * NULL indicates all */
    char *allow_groups;         /* comma delimited list of groups,
                                 * NULL indicates all */
    uid_t *allow_uids;          /* zero terminated list of allowed user IDs */
    char *allow_qos;            /* comma delimited list of qos,
                                 * NULL indicates all */
    bitstr_t *allow_qos_bitstr; /* (DON'T PACK) associated with
                                 * char *allow_qos but used internally */
    char *alternate;            /* name of alternate partition */
    double *billing_weights;    /* array of TRES billing weights */
    char *billing_weights_str;  /* per TRES billing weight string */
    uint32_t cpu_bind;          /* default CPU binding type */
    uint64_t def_mem_per_cpu;   /* default MB memory per allocated CPU */
    uint32_t default_time;      /* minutes, NO_VAL or INFINITE */
    char *deny_accounts;        /* comma delimited list of denied accounts */
    char **deny_account_array;  /* NULL terminated list of denied accounts */
    char *deny_qos;             /* comma delimited list of denied qos */
    bitstr_t *deny_qos_bitstr;  /* (DON'T PACK) associated with
                                 * char *deny_qos but used internally */
    uint16_t flags;             /* see PART_FLAG_* in slurm.h */
    uint32_t grace_time;        /* default preempt grace time in seconds */
    List job_defaults_list;     /* List of job_defaults_t elements */
    uint32_t max_cpus_per_node; /* maximum allocated CPUs per node */
    uint64_t max_mem_per_cpu;   /* maximum MB memory per allocated CPU */
    uint32_t max_nodes;         /* per job or INFINITE */
    uint32_t max_nodes_orig;    /* unscaled value (c-nodes on BlueGene) */
    uint16_t max_share;         /* number of jobs to gang schedule */
    uint32_t max_time;          /* minutes or INFINITE */
    uint32_t min_nodes;         /* per job */
    uint32_t min_nodes_orig;    /* unscaled value (c-nodes on BlueGene) */
    char *name;                 /* name of the partition */
    bitstr_t *node_bitmap;      /* bitmap of nodes in partition */
    char *nodes;                /* comma delimited list names of nodes */
    double norm_priority;       /* normalized scheduling priority for
                                 * jobs (DON'T PACK) */
    uint16_t over_time_limit;   /* job's time limit can be exceeded by this
                                 * number of minutes before cancellation */
    uint16_t preempt_mode;      /* See PREEMPT_MODE_* in slurm/slurm.h */
    uint16_t priority_job_factor; /* job priority weight factor */
    uint16_t priority_tier;     /* tier for scheduling and preemption */
    char *qos_char;             /* requested QOS from slurm.conf */
    slurmdb_qos_rec_t *qos_ptr; /* pointer to the quality of service
                                 * record attached to this partition;
                                 * confirm the value before use */
    uint16_t state_up;          /* See PARTITION_* states in slurm.h */
    uint32_t total_nodes;       /* total number of nodes in the partition */
    uint32_t total_cpus;        /* total number of cpus in the partition */
    uint32_t max_cpu_cnt;       /* max # of cpus on a node in the partition */
    uint32_t max_core_cnt;      /* max # of cores on a node in the partition */
    uint16_t cr_type;           /* Custom CR values for partition (if
                                 * supported by select plugin) */
    uint64_t *tres_cnt;         /* array of total TRES in partition. NO_PACK */
    char *tres_fmt_str;         /* str of configured TRES in partition */
    bf_part_data_t *bf_data;    /* backfill data, NO PACK */
} part_record_t;

extern List part_list;                  /* list of part_record entries */
extern time_t last_part_update;         /* time of last part_list update */
extern part_record_t default_part;      /* default configuration values */
extern char *default_part_name;         /* name of default partition */
extern part_record_t *default_part_loc; /* default partition ptr */

#define DEF_PART_MAX_PRIORITY 1
extern uint16_t part_max_priority;  /* max priority_job_factor in all parts */
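/*
 * Illustrative sketch: part_list is a List of part_record_t pointers, so the
 * usual iterator idiom from src/common/list.h applies. The helper is
 * hypothetical and assumes the caller holds the partition read lock;
 * PARTITION_SCHED is the scheduling bit within state_up (see slurm.h).
 */
static inline int example_count_sched_partitions(void)
{
    ListIterator iter;
    part_record_t *part_ptr;
    int cnt = 0;

    if (!part_list)
        return 0;
    iter = list_iterator_create(part_list);
    while ((part_ptr = list_next(iter))) {
        if (part_ptr->state_up & PARTITION_SCHED)
            cnt++;  /* jobs may be scheduled in this partition */
    }
    list_iterator_destroy(iter);
    return cnt;
}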
/*****************************************************************************\
 *  RESERVATION parameters and data structures
\*****************************************************************************/

typedef struct slurmctld_resv {
    uint16_t magic;             /* magic cookie, RESV_MAGIC */
    /* DO NOT ALPHABETIZE */
    char *accounts;             /* names of accounts permitted to use */
    int account_cnt;            /* count of accounts permitted to use */
    char **account_list;        /* list of accounts permitted to use */
    bool account_not;           /* account_list users NOT permitted to use */
    char *assoc_list;           /* list of associations */
    uint32_t boot_time;         /* time it would take to reboot a node */
    char *burst_buffer;         /* burst buffer resources */
    bitstr_t *core_bitmap;      /* bitmap of reserved cores */
    uint32_t core_cnt;          /* number of reserved cores */
    job_resources_t *core_resrcs; /* details of allocated cores */
    uint32_t duration;          /* time in seconds for this
                                 * reservation to last */
    time_t end_time;            /* end time of reservation */
    time_t idle_start_time;     /* first time when reservation had no jobs
                                 * running on it */
    char *features;             /* required node features */
    uint64_t flags;             /* see RESERVE_FLAG_* in slurm.h */
    bool full_nodes;            /* when reservation uses full nodes or not */
    uint32_t job_pend_cnt;      /* number of pending jobs */
    uint32_t job_run_cnt;       /* number of running jobs */
    List license_list;          /* structure with license info */
    char *licenses;             /* required system licenses */
    bool flags_set_node;        /* flags (i.e. NODE_STATE_MAINT |
                                 * NODE_STATE_RES) set for nodes */
    uint32_t max_start_delay;   /* Maximum delay in which jobs outside of the
                                 * reservation will be permitted to overlap
                                 * once any jobs are queued for the
                                 * reservation */
    char *name;                 /* name of reservation */
    bitstr_t *node_bitmap;      /* bitmap of reserved nodes */
    uint32_t node_cnt;          /* count of nodes required */
    char *node_list;            /* list of reserved nodes or ALL */
    char *partition;            /* name of partition to be used */
    part_record_t *part_ptr;    /* pointer to partition used */
    uint32_t purge_comp_time;   /* If the PURGE_COMP flag is set, the number
                                 * of minutes this reservation will sit idle
                                 * until it is revoked */
    uint32_t resv_id;           /* unique reservation ID, internal use */
    uint32_t resv_watts;        /* amount of power to reserve */
    bool run_epilog;            /* set if epilog has been executed */
    bool run_prolog;            /* set if prolog has been executed */
    time_t start_time;          /* start time of reservation */
    time_t start_time_first;    /* when the reservation first started */
    time_t start_time_prev;     /* If start time was changed this is
                                 * the previous start time. Needed
                                 * for accounting */
    char *tres_fmt_str;         /* formatted string of tres to deal with */
    char *tres_str;             /* simple string of tres to deal with */
    char *users;                /* names of users permitted to use */
    int user_cnt;               /* count of users permitted to use */
    uid_t *user_list;           /* array of users permitted to use */
    bool user_not;              /* user_list users NOT permitted to use */
} slurmctld_resv_t;

extern List resv_list;          /* list of slurmctld_resv entries */
extern time_t last_resv_update; /* time of last resv_list update */
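/*
 * Illustrative sketch (hypothetical helper): a reservation from resv_list is
 * in effect when the current time falls inside its [start_time, end_time)
 * window, per the fields above.
 */
static inline bool example_resv_active(const slurmctld_resv_t *resv_ptr,
                                       time_t now)
{
    return (resv_ptr->start_time <= now) && (now < resv_ptr->end_time);
}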
/*****************************************************************************\
 *  JOB parameters and data structures
\*****************************************************************************/
extern time_t last_job_update;  /* time of last update to job records */

#define DETAILS_MAGIC   0xdea84e7
#define JOB_MAGIC       0xf0b7392c

#define FEATURE_OP_OR   0
#define FEATURE_OP_AND  1
#define FEATURE_OP_XOR  2
#define FEATURE_OP_XAND 3
#define FEATURE_OP_END  4       /* last entry lacks separator */
typedef struct job_feature {
    char *name;                 /* name of feature */
    bool changeable;            /* return value of
                                 * node_features_g_changeable_feature */
    uint16_t count;             /* count of nodes with this feature */
    uint8_t op_code;            /* separator, see FEATURE_OP_ above */
    bitstr_t *node_bitmap_active; /* nodes with this feature active */
    bitstr_t *node_bitmap_avail;  /* nodes with this feature available */
    uint16_t paren;             /* count of enclosing parentheses */
} job_feature_t;

/*
 * These relate to the JOB_SHARED_ macros in slurm.h,
 * but with the logic for zero vs one inverted.
 */
#define WHOLE_NODE_REQUIRED 0x01
#define WHOLE_NODE_USER     0x02
#define WHOLE_NODE_MCS      0x03
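/*
 * Illustrative sketch: decode the WHOLE_NODE_* values above back to the
 * srun/sbatch option that produced them (see the whole_node field of struct
 * job_details below). The helper is hypothetical and shown only to document
 * the mapping.
 */
static inline const char *example_whole_node_str(uint8_t whole_node)
{
    switch (whole_node) {
    case WHOLE_NODE_REQUIRED:
        return "--exclusive";
    case WHOLE_NODE_USER:
        return "--exclusive=user";
    case WHOLE_NODE_MCS:
        return "--exclusive=mcs";
    default:
        return "(not exclusive)";
    }
}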
/* job_details - specification of a job's constraints,
 * can be purged after initiation */
struct job_details {
    uint32_t magic;             /* magic cookie for data integrity */
    /* DO NOT ALPHABETIZE */
    char *acctg_freq;           /* accounting polling interval */
    time_t accrue_time;         /* Time when we start accruing time for
                                 * priority */
    uint32_t argc;              /* count of argv elements */
    char **argv;                /* arguments for a batch job script */
    time_t begin_time;          /* start at this time (srun --begin),
                                 * resets to time first eligible
                                 * (all dependencies satisfied) */
    char *cluster_features;     /* required cluster_features */
    uint16_t contiguous;        /* set if requires contiguous nodes */
    uint16_t core_spec;         /* specialized core/thread count,
                                 * threads if CORE_SPEC_THREAD flag set */
    char *cpu_bind;             /* binding map for map/mask_cpu - This
                                 * currently does not matter to the
                                 * job allocation, setting this does
                                 * not do anything for steps. */
    uint16_t cpu_bind_type;     /* Default CPU bind type for steps,
                                 * see cpu_bind_type_t */
    uint32_t cpu_freq_min;      /* Minimum cpu frequency */
    uint32_t cpu_freq_max;      /* Maximum cpu frequency */
    uint32_t cpu_freq_gov;      /* cpu frequency governor */
    uint16_t cpus_per_task;     /* number of processors required for
                                 * each task */
    uint16_t orig_cpus_per_task; /* requested value of cpus_per_task */
    List depend_list;           /* list of job_ptr:state pairs */
    char *dependency;           /* wait for other jobs */
    char *orig_dependency;      /* original value (for archiving) */
    uint16_t env_cnt;           /* size of env_sup (see below) */
    char **env_sup;             /* supplemental environment variables */
    bitstr_t *exc_node_bitmap;  /* bitmap of excluded nodes */
    char *exc_nodes;            /* excluded nodes */
    uint32_t expanding_jobid;   /* ID of job to be expanded */
    char *extra;                /* extra field, unused */
    List feature_list;          /* required features with node counts */
    char *features;             /* required features */
    uint32_t max_cpus;          /* maximum number of cpus */
    uint32_t orig_max_cpus;     /* requested value of max_cpus */
    uint32_t max_nodes;         /* maximum number of nodes */
    multi_core_data_t *mc_ptr;  /* multi-core specific data */
    char *mem_bind;             /* binding map for map/mask_cpu */
    uint16_t mem_bind_type;     /* see mem_bind_type_t */
    uint32_t min_cpus;          /* minimum number of cpus */
    uint32_t orig_min_cpus;     /* requested value of min_cpus */
    int min_gres_cpu;           /* Minimum CPU count per node required
                                 * to satisfy GRES requirements,
                                 * not saved/restored, but rebuilt */
    uint32_t min_nodes;         /* minimum number of nodes */
    uint32_t nice;              /* requested priority change,
                                 * NICE_OFFSET == no change */
    uint16_t ntasks_per_node;   /* number of tasks on each node */
    uint32_t num_tasks;         /* number of tasks to start */
    uint8_t open_mode;          /* stdout/err append or truncate */
    uint8_t overcommit;         /* processors being over subscribed */
    uint16_t plane_size;        /* plane size when task_dist =
                                 * SLURM_DIST_PLANE */
    /* job constraints: */
    uint32_t pn_min_cpus;       /* minimum processors per node */
    uint32_t orig_pn_min_cpus;  /* requested value of pn_min_cpus */
    uint64_t pn_min_memory;     /* minimum memory per node (MB) OR
                                 * memory per allocated
                                 * CPU | MEM_PER_CPU */
    uint64_t orig_pn_min_memory; /* requested value of pn_min_memory */
    uint32_t pn_min_tmp_disk;   /* minimum tempdisk per node, MB */
    uint8_t prolog_running;     /* set while prolog_slurmctld is
                                 * running */
    uint32_t reserved_resources; /* CPU minutes of resources reserved
                                  * for this job while it was pending */
    bitstr_t *req_node_bitmap;  /* bitmap of required nodes */
    time_t preempt_start_time;  /* time that preemption began in order
                                 * to start this job */
    char *req_nodes;            /* required nodes */
    uint16_t requeue;           /* controls ability to requeue job */
    uint8_t share_res;          /* set if job can share resources with
                                 * other jobs */
    char *std_err;              /* pathname of job's stderr file */
    char *std_in;               /* pathname of job's stdin file */
    char *std_out;              /* pathname of job's stdout file */
    time_t submit_time;         /* time of submission */
    uint32_t task_dist;         /* task layout for this job. Only
                                 * useful when Consumable Resources
                                 * is enabled */
    uint32_t usable_nodes;      /* node count needed by preemption */
    uint8_t whole_node;         /* WHOLE_NODE_REQUIRED: 1: --exclusive
                                 * WHOLE_NODE_USER: 2: --exclusive=user
                                 * WHOLE_NODE_MCS: 3: --exclusive=mcs */
    char *work_dir;             /* pathname of working directory */
    uint16_t x11;               /* --x11 flags */
    char *x11_magic_cookie;     /* x11 magic cookie */
    char *x11_target;           /* target host, or socket if port == 0 */
    uint16_t x11_target_port;   /* target TCP port on alloc_node */
};

typedef struct job_array_struct {
    uint32_t task_cnt;          /* count of remaining task IDs */
    bitstr_t *task_id_bitmap;   /* bitmap of remaining task IDs */
    char *task_id_str;          /* string describing remaining task IDs,
                                 * needs to be recalculated if NULL */
    uint32_t array_flags;       /* Flags to control behavior (FUTURE) */
    uint32_t max_run_tasks;     /* Maximum number of running tasks */
    uint32_t tot_run_tasks;     /* Current running task count */
    uint32_t min_exit_code;     /* Minimum exit code from any task */
    uint32_t max_exit_code;     /* Maximum exit code from any task */
    uint32_t pend_run_tasks;    /* Number of tasks ready to run due to
                                 * preempting other jobs */
    uint32_t tot_comp_tasks;    /* Completed task count */
} job_array_struct_t;

#define ADMIN_SET_LIMIT 0xffff

typedef struct {
    uint16_t qos;
    uint16_t time;
    uint16_t *tres;
} acct_policy_limit_set_t;

typedef struct {
    uint32_t cluster_lock;      /* sibling that has lock on job */
    char *origin_str;           /* origin cluster name */
    uint64_t siblings_active;   /* bitmap of active sibling ids */
    char *siblings_active_str;  /* comma separated list of actual
                                 * sibling names */
    uint64_t siblings_viable;   /* bitmap of viable sibling ids */
    char *siblings_viable_str;  /* comma separated list of viable
                                 * sibling names */
} job_fed_details_t;

#define HETJOB_PRIO_MIN 0x0001  /* Sort by minimum component priority[tier] */
#define HETJOB_PRIO_MAX 0x0002  /* Sort by maximum component priority[tier] */
#define HETJOB_PRIO_AVG 0x0004  /* Sort by average component priority[tier] */

typedef struct {
    bool any_resv;              /* at least one component with resv */
    uint32_t priority_tier;     /* whole hetjob calculated tier */
    uint32_t priority;          /* whole hetjob calculated priority */
} het_job_details_t;
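/*
 * Illustrative sketch (hypothetical helper): job_array_struct_t tracks both
 * the tasks still outstanding (task_cnt) and the tasks already finished
 * (tot_comp_tasks), so a rough completion fraction can be derived from the
 * two, assuming task_cnt counts every task not yet completed.
 */
static inline double example_array_frac_complete(const job_array_struct_t *ar)
{
    uint32_t total = ar->task_cnt + ar->tot_comp_tasks;

    if (total == 0)
        return 1.0; /* nothing left to run */
    return (double) ar->tot_comp_tasks / (double) total;
}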
/*
 * NOTE: When adding fields to the job_record, or any underlying structures,
 * be sure to sync with job_array_split.
 */
typedef struct job_record job_record_t;
struct job_record {
    uint32_t magic;             /* magic cookie for data integrity */
    /* DO NOT ALPHABETIZE */
    char *account;              /* account number to charge */
    char *admin_comment;        /* administrator's arbitrary comment */
    char *alias_list;           /* node name to address aliases */
    char *alloc_node;           /* local node making resource alloc */
    uint16_t alloc_resp_port;   /* RESPONSE_RESOURCE_ALLOCATION port */
    uint32_t alloc_sid;         /* local sid making resource alloc */
    uint32_t array_job_id;      /* job_id of a job array or 0 if N/A */
    uint32_t array_task_id;     /* task_id of a job array */
    job_array_struct_t *array_recs; /* job array details,
                                     * only in meta-job record */
    uint32_t assoc_id;          /* used for accounting plugins */
    slurmdb_assoc_rec_t *assoc_ptr; /* job's assoc record ptr; confirm
                                     * the value before use */
    char *batch_features;       /* features required for batch script */
    uint16_t batch_flag;        /* 1 or 2 if batch job (with script),
                                 * 2 indicates retry mode (one retry) */
    char *batch_host;           /* host executing batch script */
    double billable_tres;       /* calculated billable tres for the
                                 * job, as defined by the partition's
                                 * billing weight. Recalculated upon job
                                 * resize. Cannot be calculated until
                                 * the job is allocated resources. */
    uint32_t bit_flags;         /* various job flags */
    char *burst_buffer;         /* burst buffer specification */
    char *burst_buffer_state;   /* burst buffer state */
    char *clusters;             /* clusters job is submitted to with -M
                                 * option */
    char *comment;              /* arbitrary comment */
    uint32_t cpu_cnt;           /* current count of CPUs held
                                 * by the job, decremented while job is
                                 * completing */
    char *cpus_per_tres;        /* semicolon delimited list of TRES=# values */
    uint16_t cr_enabled;        /* specify if Consumable Resources
                                 * is enabled. Needed since CR deals
                                 * with a finer granularity in its
                                 * node/cpu scheduling (available cpus
                                 * instead of available nodes) than the
                                 * linear plugin
                                 * 0 if cr is NOT enabled,
                                 * 1 if cr is enabled */
    uint32_t db_flags;          /* Flags to send to the database
                                 * record */
    uint64_t db_index;          /* used only for database plugins */
    time_t deadline;            /* deadline */
    uint32_t delay_boot;        /* Delay boot for desired node mode */
    uint32_t derived_ec;        /* highest exit code of all job steps */
    struct job_details *details; /* job details */
    uint16_t direct_set_prio;   /* Priority set directly; if set, the
                                 * system will not change the priority
                                 * any further */
    time_t end_time;            /* time execution ended, actual or
                                 * expected. if terminated from suspend
                                 * state, this is time suspend began */
    time_t end_time_exp;        /* when we believe the job is
                                 * going to end */
    bool epilog_running;        /* true if EpilogSlurmctld is running */
    uint32_t exit_code;         /* exit code for job (status from
                                 * wait call) */
    job_fed_details_t *fed_details; /* details for federated jobs */
    front_end_record_t *front_end_ptr; /* Pointer to front-end node
                                        * running this job */
    List gres_list;             /* generic resource allocation detail */
    char *gres_alloc;           /* Allocated GRES added over all nodes
                                 * to be passed to slurmdbd */
    uint32_t gres_detail_cnt;   /* Count of gres_detail_str records,
                                 * one per allocated node */
    char **gres_detail_str;     /* Details of GRES index alloc per node */
    char *gres_req;             /* Requested GRES added over all nodes
                                 * to be passed to slurmdbd */
    char *gres_used;            /* Actual GRES use added over all nodes
                                 * to be passed to slurmdbd */
    uint32_t group_id;          /* group submitted under */
    het_job_details_t *het_details; /* HetJob details */
    uint32_t het_job_id;        /* job ID of HetJob leader */
    char *het_job_id_set;       /* job IDs for all components */
    uint32_t het_job_offset;    /* HetJob component index */
    List het_job_list;          /* List of job pointers to all
                                 * components */
    uint32_t job_id;            /* job ID */
    job_record_t *job_next;     /* next entry with same hash index */
    job_record_t *job_array_next_j; /* job array linked list by job_id */
    job_record_t *job_array_next_t; /* job array linked list by task_id */
    job_record_t *job_preempt_comp; /* het job preempt component */
    job_resources_t *job_resrcs; /* details of allocated cores */
    uint32_t job_state;         /* state of the job */
    uint16_t kill_on_node_fail; /* 1 if job should be killed on
                                 * node failure */
    time_t last_sched_eval;     /* last time job was evaluated for
                                 * scheduling */
    char *licenses;             /* licenses required by the job */
    List license_list;          /* structure with license info */
    acct_policy_limit_set_t limit_set; /* flags to indicate an
                                        * associated limit was set from
                                        * a limit instead of from
                                        * the request, or if the
                                        * limit was set from admin */
    uint16_t mail_type;         /* see MAIL_JOB_* in slurm.h */
    char *mail_user;            /* user to get e-mail notification */
    char *mem_per_tres;         /* semicolon delimited list of TRES=# values */
    char *mcs_label;            /* mcs_label if mcs plugin in use */
    char *name;                 /* name of the job */
    char *network;              /* network/switch requirement spec */
    uint32_t next_step_id;      /* next step id to be used */
    char *nodes;                /* list of nodes allocated to job */
    slurm_addr_t *node_addr;    /* addresses of the nodes allocated to
                                 * job */
    bitstr_t *node_bitmap;      /* bitmap of nodes allocated to job */
    bitstr_t *node_bitmap_cg;   /* bitmap of nodes completing job */
    uint32_t node_cnt;          /* count of nodes currently
                                 * allocated to job */
    uint32_t node_cnt_wag;      /* count of nodes Slurm thinks
                                 * will be allocated when the
                                 * job is pending and node_cnt
                                 * wasn't given by the user.
                                 * This is packed in total_nodes
                                 * when dumping state. When
                                 * state is read in, check for
                                 * pending state and set this
                                 * instead of total_nodes */
    char *nodes_completing;     /* nodes still in completing state
                                 * for this job, used to ensure
                                 * epilog is not re-run for job */
    char *origin_cluster;       /* cluster name that the job was
                                 * submitted from */
    uint16_t other_port;        /* port for client communications */
    char *partition;            /* name of job partition(s) */
    List part_ptr_list;         /* list of pointers to partition recs */
    bool part_nodes_missing;    /* set if job's nodes removed from this
                                 * partition */
    part_record_t *part_ptr;    /* pointer to the partition record */
    uint8_t power_flags;        /* power management flags,
                                 * see SLURM_POWER_FLAGS_ */
    time_t pre_sus_time;        /* time job ran prior to last suspend */
    time_t preempt_time;        /* job preemption signal time */
    bool preempt_in_progress;   /* Preemption of other jobs in progress
                                 * in order to start this job
                                 * (Internal use only, don't save) */
    uint32_t prep_epilog_cnt;   /* count of epilog async tasks left */
    uint32_t prep_prolog_cnt;   /* count of prolog async tasks left */
    bool prep_prolog_failed;    /* any prolog_slurmctld failed */
    uint32_t priority;          /* relative priority of the job,
                                 * zero == held (don't initiate) */
    uint32_t *priority_array;   /* partition based priority */
    priority_factors_object_t *prio_factors; /* cached value used
                                              * by sprio command */
    uint32_t profile;           /* Acct_gather_profile option */
    uint32_t qos_id;            /* quality of service id */
    slurmdb_qos_rec_t *qos_ptr; /* pointer to the quality of
                                 * service record used for
                                 * this job, confirm the
                                 * value before use */
    void *qos_blocking_ptr;     /* internal use only, DON'T PACK */
    uint8_t reboot;             /* node reboot requested before start */
    uint16_t restart_cnt;       /* count of restarts */
    time_t resize_time;         /* time of latest size change */
    uint32_t resv_id;           /* reservation ID */
    char *resv_name;            /* reservation name */
    struct slurmctld_resv *resv_ptr; /* reservation structure pointer */
    uint32_t requid;            /* requester user ID */
    char *resp_host;            /* host for srun communications */
    char *sched_nodes;          /* list of nodes scheduled for job */
    dynamic_plugin_data_t *select_jobinfo; /* opaque data, BlueGene */
    uint32_t site_factor;       /* factor to consider in priority */
    char **spank_job_env;       /* environment variables for job prolog
                                 * and epilog scripts as set by SPANK
                                 * plugins */
    uint32_t spank_job_env_size; /* element count in spank_job_env */
    uint16_t start_protocol_ver; /* Slurm version job was
                                  * started with either the
                                  * creating message or the
                                  * lowest slurmd in the
                                  * allocation */
    time_t start_time;          /* time execution begins,
                                 * actual or expected */
    char *state_desc;           /* optional details for state_reason */
    uint32_t state_reason;      /* reason job still pending or failed
                                 * see slurm.h:enum job_state_reason */
    uint32_t state_reason_prev; /* Previous state_reason, needed to
                                 * return valid job information during
                                 * scheduling cycle (state_reason is
                                 * cleared at start of cycle) */
    uint32_t state_reason_prev_db; /* Previous state_reason that isn't
                                    * priority or resources, only stored
                                    * in the database */
    List step_list;             /* list of job's steps */
    time_t suspend_time;        /* time job last suspended or resumed */
    char *system_comment;       /* slurmctld's arbitrary comment */
    time_t time_last_active;    /* time of last job activity */
    uint32_t time_limit;        /* time_limit minutes or INFINITE,
                                 * NO_VAL implies partition max_time */
    uint32_t time_min;          /* minimum time_limit minutes or
                                 * INFINITE,
                                 * zero implies same as time_limit */
    time_t tot_sus_time;        /* total time in suspend state */
    uint32_t total_cpus;        /* number of allocated cpus,
                                 * for accounting */
    uint32_t total_nodes;       /* number of allocated nodes
                                 * for accounting */
    char *tres_bind;            /* Task to TRES binding directives */
    char *tres_freq;            /* TRES frequency directives */
    char *tres_per_job;         /* comma delimited list of TRES values */
    char *tres_per_node;        /* comma delimited list of TRES values */
    char *tres_per_socket;      /* comma delimited list of TRES values */
    char *tres_per_task;        /* comma delimited list of TRES values */
    uint64_t *tres_req_cnt;     /* array of tres counts requested
                                 * based off g_tres_count in
                                 * assoc_mgr */
    char *tres_req_str;         /* string format of
                                 * tres_req_cnt primarily
                                 * used for state */
    char *tres_fmt_req_str;     /* formatted req tres string for job */
    uint64_t *tres_alloc_cnt;   /* array of tres counts allocated
                                 * based off g_tres_count in
                                 * assoc_mgr */
    char *tres_alloc_str;       /* simple tres string for job */
    char *tres_fmt_alloc_str;   /* formatted tres string for job */
    uint32_t user_id;           /* user the job runs as */
    char *user_name;            /* string version of user */
    uint16_t wait_all_nodes;    /* if set, wait for all nodes to boot
                                 * before starting the job */
    uint16_t warn_flags;        /* flags for signal to send */
    uint16_t warn_signal;       /* signal to send before end_time */
    uint16_t warn_time;         /* when to send signal before
                                 * end_time (secs) */
    char *wckey;                /* optional wckey */

    /* Request number of switches support */
    uint32_t req_switch;        /* Minimum number of switches */
    uint32_t wait4switch;       /* Maximum time to wait for minimum switches */
    bool best_switch;           /* true=min number of switches met */
    time_t wait4switch_start;   /* Time started waiting for switch */
};
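/*
 * Illustrative sketch (hypothetical helper): for a running job the fields
 * above give the remaining wall time directly, since end_time is maintained
 * as the actual or expected completion time.
 */
static inline time_t example_job_secs_remaining(const job_record_t *job_ptr,
                                                time_t now)
{
    if (job_ptr->end_time <= now)
        return 0;   /* at or past its limit */
    return job_ptr->end_time - now;
}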
/* Job dependency specification, used in "depend_list" within job_record */
typedef enum {
    SLURM_DEPEND_AFTER = 1,     /* After job begins */
    SLURM_DEPEND_AFTER_ANY,     /* After job completes */
    SLURM_DEPEND_AFTER_NOT_OK,  /* After job fails */
    SLURM_DEPEND_AFTER_OK,      /* After job completes successfully */
    SLURM_DEPEND_SINGLETON,     /* Only one job for this
                                 * user/name at a time */
    SLURM_DEPEND_EXPAND,        /* Expand running job */
    SLURM_DEPEND_AFTER_CORRESPOND, /* After corresponding job array
                                    * elements complete */
    SLURM_DEPEND_BURST_BUFFER,  /* After job burst buffer
                                 * stage-out completes */
} slurm_depend_types_t;

#define SLURM_FLAGS_OR     0x0001   /* OR job dependencies */
#define SLURM_FLAGS_REMOTE 0x0002   /* Is a remote dependency */

/* Used as values for depend_state in depend_spec_t */
enum {
    DEPEND_NOT_FULFILLED = 0,
    DEPEND_FULFILLED,
    DEPEND_FAILED
};

typedef struct depend_spec {
    uint32_t array_task_id;     /* INFINITE for all array tasks */
    uint16_t depend_type;       /* SLURM_DEPEND_* type */
    uint16_t depend_flags;      /* SLURM_FLAGS_* type */
    uint32_t depend_state;      /* Status of the dependency */
    uint32_t depend_time;       /* time to wait (mins) */
    uint32_t job_id;            /* Slurm job_id */
    job_record_t *job_ptr;      /* pointer to this job */
    uint64_t singleton_bits;    /* which clusters have satisfied the
                                 * singleton dependency */
} depend_spec_t;
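/*
 * Illustrative sketch: depend_list entries are depend_spec_t records, so a
 * job's dependencies are satisfied when every entry has reached
 * DEPEND_FULFILLED. Hypothetical helper; assumes the caller holds the job
 * lock while iterating.
 */
static inline bool example_depends_fulfilled(List depend_list)
{
    ListIterator iter;
    depend_spec_t *dep;
    bool fulfilled = true;

    if (!depend_list)
        return true;    /* no dependencies at all */
    iter = list_iterator_create(depend_list);
    while ((dep = list_next(iter))) {
        if (dep->depend_state != DEPEND_FULFILLED) {
            fulfilled = false;
            break;
        }
    }
    list_iterator_destroy(iter);
    return fulfilled;
}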
#define STEP_FLAG  0xbbbb
#define STEP_MAGIC 0xcafecafe

typedef struct {
    uint32_t magic;             /* magic cookie to test data integrity */
    /* DO NOT ALPHABETIZE */
    uint16_t batch_step;        /* 1 if batch job step, 0 otherwise */
    bitstr_t *core_bitmap_job;  /* bitmap of cores allocated to this
                                 * step relative to job's nodes,
                                 * see src/common/job_resources.h */
    uint32_t cpu_count;         /* count of step's CPUs */
    uint32_t cpu_freq_min;      /* Minimum cpu frequency */
    uint32_t cpu_freq_max;      /* Maximum cpu frequency */
    uint32_t cpu_freq_gov;      /* cpu frequency governor */
    uint16_t cpus_per_task;     /* cpus per task initiated */
    char *cpus_per_tres;        /* semicolon delimited list of TRES=# values */
    uint16_t cyclic_alloc;      /* set for cyclic task allocation
                                 * across nodes */
    uint16_t exclusive;         /* dedicated resources for the step */
    uint32_t exit_code;         /* highest exit code from any task */
    bitstr_t *exit_node_bitmap; /* bitmap of exited nodes */
    ext_sensors_data_t *ext_sensors; /* external sensors plugin data */
    List gres_list;             /* generic resource allocation detail */
    char *host;                 /* host for srun communications */
    job_record_t *job_ptr;      /* ptr to the job that owns the step */
    jobacctinfo_t *jobacct;     /* keep track of process info in the
                                 * step */
    char *mem_per_tres;         /* semicolon delimited list of TRES=# values */
    char *name;                 /* name of job step */
    char *network;              /* step's network specification */
    uint8_t no_kill;            /* 1 if no kill on node failure */
    uint64_t pn_min_memory;     /* minimum real memory per node OR
                                 * real memory per CPU | MEM_PER_CPU,
                                 * default=0 (use job limit) */
    uint16_t port;              /* port for srun communications */
    time_t pre_sus_time;        /* time step ran prior to last suspend */
    uint16_t start_protocol_ver; /* Slurm version step was
                                  * started with either srun
                                  * or the lowest slurmd
                                  * version it is talking to */
    int *resv_port_array;       /* reserved port indexes */
    uint16_t resv_port_cnt;     /* count of ports reserved per node */
    char *resv_ports;           /* ports reserved for job */
    uint32_t requid;            /* requester user ID */
    time_t start_time;          /* step allocation start time */
    uint32_t time_limit;        /* step allocation time limit */
    dynamic_plugin_data_t *select_jobinfo; /* opaque data, BlueGene */
    uint32_t srun_pid;          /* PID of srun (also see host/port) */
    uint32_t state;             /* state of the step. See job_states */
    uint32_t step_id;           /* step number */
    slurm_step_layout_t *step_layout; /* info about how tasks are laid
                                       * out in the step */
    bitstr_t *step_node_bitmap; /* bitmap of nodes allocated to job
                                 * step */
/*  time_t suspend_time;         * time step last suspended or resumed
                                 * implicitly the same as suspend_time
                                 * in the job record */
    dynamic_plugin_data_t *switch_job; /* switch context, opaque */
    time_t time_last_active;    /* time step was last found on node */
    time_t tot_sus_time;        /* total time in suspended state */
    char *tres_alloc_str;       /* simple TRES string for step */
    char *tres_bind;            /* Task to TRES binding directives */
    char *tres_fmt_alloc_str;   /* formatted tres string for step */
    char *tres_freq;            /* TRES frequency directives */
    char *tres_per_step;        /* semicolon delimited list of TRES=# values */
    char *tres_per_node;        /* semicolon delimited list of TRES=# values */
    char *tres_per_socket;      /* semicolon delimited list of TRES=# values */
    char *tres_per_task;        /* semicolon delimited list of TRES=# values */
} step_record_t;

typedef struct {
    job_record_t *job_ptr;
    List job_queue;
    part_record_t *part_ptr;
    uint32_t prio;
    slurmctld_resv_t *resv_ptr;
} job_queue_req_t;

extern List job_list;           /* list of job_record entries */
extern List purge_files_list;   /* list of job ids to purge files of */
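/*
 * Illustrative sketch: job_list holds every job_record_t, so aggregate
 * queries are simple iterations. IS_JOB_PENDING() comes from
 * src/common/slurm_protocol_defs.h; the helper itself is hypothetical and
 * assumes the caller holds the job read lock.
 */
static inline int example_count_pending_jobs(void)
{
    ListIterator iter;
    job_record_t *job_ptr;
    int cnt = 0;

    if (!job_list)
        return 0;
    iter = list_iterator_create(job_list);
    while ((job_ptr = list_next(iter))) {
        if (IS_JOB_PENDING(job_ptr))
            cnt++;
    }
    list_iterator_destroy(iter);
    return cnt;
}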
/*****************************************************************************\
 *  Consumable Resources parameters and data structures
\*****************************************************************************/

/*
 * Define the type of update and of data retrieval that can happen
 * from the "select/cons_res" plugin. This information is needed to
 * support processors as consumable resources. This structure will be
 * useful when updating other types of consumable resources as well.
 */
enum select_plugindata_info {
    SELECT_CR_PLUGIN,       /* data-> uint32 See SELECT_TYPE_* below */
    SELECT_BITMAP,          /* Unused since version 2.0 */
    SELECT_ALLOC_CPUS,      /* data-> uint16 alloc cpus (CR support) */
    SELECT_ALLOC_LPS,       /* data-> uint32 alloc lps (CR support) */
    SELECT_AVAIL_MEMORY,    /* data-> uint64 avail mem (CR support) */
    SELECT_STATIC_PART,     /* data-> uint16, 1 if static partitioning
                             * BlueGene support */
    SELECT_CONFIG_INFO,     /* data-> List get .conf info from select
                             * plugin */
    SELECT_SINGLE_JOB_TEST  /* data-> uint16 1 if one select_g_job_test()
                             * call per job, node weights in node data
                             * structure, 0 otherwise, for cons_tres */
};
#define SELECT_TYPE_CONS_RES  1
#define SELECT_TYPE_CONS_TRES 2


/*****************************************************************************\
 *  Global assoc_cache variables
\*****************************************************************************/

/* flag to let us know if we are running on cache or from the actual
 * database */
extern uint16_t running_cache;
/* mutex and signal to let us know if associations have been reset so we need
 * to redo all the pointers to the associations */
extern pthread_mutex_t assoc_cache_mutex;   /* assoc cache mutex */
extern pthread_cond_t assoc_cache_cond;     /* assoc cache condition */
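/*
 * Illustrative sketch of the intended use of the assoc_cache variables
 * above: a thread that must not proceed while associations are still served
 * from cache blocks on assoc_cache_cond until the database connection clears
 * running_cache. Hypothetical helper; the real signaling logic lives in the
 * controller itself.
 */
static inline void example_wait_for_assoc_db(void)
{
    slurm_mutex_lock(&assoc_cache_mutex);
    while (running_cache)
        pthread_cond_wait(&assoc_cache_cond, &assoc_cache_mutex);
    slurm_mutex_unlock(&assoc_cache_mutex);
}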
/*****************************************************************************\
 *  Global slurmctld functions
\*****************************************************************************/

/*
 * abort_job_on_node - Kill the specific job_id on a specific node,
 *	the request is not processed immediately, but queued.
 *	This is to prevent a flood of pthreads if slurmctld restarts
 *	without saved state and slurmd daemons register with a
 *	multitude of running jobs. Slurmctld will not recognize
 *	these jobs and use this function to kill them - one
 *	agent request per node as they register.
 * IN job_id - id of the job to be killed
 * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
 * IN node_name - name of the node on which the job resides
 */
extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr,
                              char *node_name);

/*
 * abort_job_on_nodes - Kill the specific job on the specific nodes,
 *	the request is not processed immediately, but queued.
 *	This is to prevent a flood of pthreads if slurmctld restarts
 *	without saved state and slurmd daemons register with a
 *	multitude of running jobs. Slurmctld will not recognize
 *	these jobs and use this function to kill them - one
 *	agent request per node as they register.
 * IN job_ptr - pointer to terminating job
 * IN node_bitmap - bitmap of nodes on which the job resides
 */
extern void abort_job_on_nodes(job_record_t *job_ptr, bitstr_t *node_bitmap);

/*
 * If a job has a FAIL_ACCOUNT or FAIL_QOS start_reason, check and set
 * pointers if they are now valid.
 */
extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr);

/* Set the tres_req_str and tres_req_fmt_str for the job. assoc_mgr_locked
 * is set if the assoc_mgr read lock is already set.
 */
extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked);

/* Set the tres_alloc_str and tres_alloc_fmt_str for the job. assoc_mgr_locked
 * is set if the assoc_mgr read lock is already set.
 */
extern void set_job_tres_alloc_str(job_record_t *job_ptr,
                                   bool assoc_mgr_locked);

/* Note that the backup slurmctld has assumed primary control.
 * This function can be called multiple times. */
extern void backup_slurmctld_restart(void);

/* Complete a batch job requeue logic after all steps complete so that
 * subsequent jobs appear in a separate accounting record. */
extern void batch_requeue_fini(job_record_t *job_ptr);

/* Build a bitmap of nodes completing this job */
extern void build_cg_bitmap(job_record_t *job_ptr);

/* Build structure with job allocation details */
extern resource_allocation_response_msg_t *build_job_info_resp(
	job_record_t *job_ptr);

/*
 * create_part_record - create a partition record
 * IN name - name will be xstrdup()'d into the part_record
 * RET a pointer to the record or NULL if error
 * global: default_part - default partition parameters
 *	part_list - global partition list
 * NOTE: the record's values are initialized to those of default_part
 * NOTE: allocates memory that should be xfreed with delete_part_record
 */
extern part_record_t *create_part_record(const char *name);

/*
 * build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap
 *	for the specified partition; also reset the partition pointers in
 *	the node back to this partition.
 * IN part_ptr - pointer to the partition
 * RET 0 if no error, errno otherwise
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: this does not report nodes defined in more than one partition. this
 *	is checked only upon reading the configuration file, not on an update
 */
extern int build_part_bitmap(part_record_t *part_ptr);
/*
 * job_limits_check - check the limits specified for the job.
 * IN job_pptr - pointer to the job table entry pointer
 * IN check_min_time - if true test job's minimum time limit,
 *	otherwise test maximum time limit
 * RET WAIT_NO_REASON on success, fail status otherwise.
 */
extern int job_limits_check(job_record_t **job_pptr, bool check_min_time);

/*
 * delete_partition - delete the specified partition
 * IN part_desc_ptr - partition specification from RPC
 * RET 0 on success, errno otherwise
 */
extern int delete_partition(delete_part_msg_t *part_desc_ptr);

/*
 * delete_step_record - delete record for job step for specified job_ptr
 *	and step_id
 * IN job_ptr - pointer to job table entry to have step record removed
 * IN step_id - id of the desired job step
 * RET 0 on success, errno otherwise
 */
extern int delete_step_record(job_record_t *job_ptr, uint32_t step_id);

/*
 * delete_step_records - delete step records for specified job_ptr
 * IN job_ptr - pointer to job table entry to have step records removed
 */
extern void delete_step_records(job_record_t *job_ptr);

/*
 * Copy a job's dependency list
 * IN depend_list_src - a job's depend_list
 * RET copy of depend_list_src, must be freed by caller
 */
extern List depended_list_copy(List depend_list_src);

/*
 * drain_nodes - drain one or more nodes,
 *	no-op for nodes already drained or draining
 * IN nodes - nodes to drain
 * IN reason - reason to drain the nodes
 * IN reason_uid - who set the reason
 * RET SLURM_SUCCESS or error code
 * global: node_record_table_ptr - pointer to global node table
 */
extern int drain_nodes(char *nodes, char *reason, uint32_t reason_uid);

/* dump_all_job_state - save the state of all jobs to file
 * RET 0 or error code */
extern int dump_all_job_state(void);

/* dump_all_node_state - save the state of all nodes to file */
extern int dump_all_node_state(void);

/* dump_all_part_state - save the state of all partitions to file */
extern int dump_all_part_state(void);

/*
 * dump_job_desc - dump the incoming job submit request message
 * IN job_specs - job specification from RPC
 */
extern void dump_job_desc(job_desc_msg_t *job_specs);

/*
 * dump_job_step_state - dump the state of a specific job step to a buffer,
 *	load with load_step_state
 * IN step_ptr - pointer to job step for which information is to be dumped
 * IN/OUT buffer - location to store data, pointers automatically advanced
 */
extern int dump_job_step_state(void *x, void *arg);

/*
 * dump_step_desc - dump the incoming step initiate request message
 * IN step_spec - job step request specification from RPC
 */
extern void dump_step_desc(job_step_create_request_msg_t *step_spec);

/* Remove one node from a job's allocation */
extern void excise_node_from_job(job_record_t *job_ptr,
                                 node_record_t *node_ptr);

/* make_node_avail - flag specified node as available */
extern void make_node_avail(int node_inx);

/*
 * Copy a job's feature list
 * IN feature_list_src - a job's feature_list
 * RET copy of feature_list_src, must be freed by caller
 */
extern List feature_list_copy(List feature_list_src);
/*
 * find_job_array_rec - return a pointer to the job record with the given
 *	array_job_id/array_task_id
 * IN array_job_id - requested job's id
 * IN array_task_id - requested job's task id,
 *	NO_VAL if none specified (i.e. not a job array)
 *	INFINITE return any task for specified job id
 * RET pointer to the job's record, NULL on error
 */
extern job_record_t *find_job_array_rec(uint32_t array_job_id,
                                        uint32_t array_task_id);

/*
 * find_het_job_record - return a pointer to the job record with the given ID
 * IN job_id - requested job's ID
 * IN het_job_id - hetjob component ID
 * RET pointer to the job's record, NULL on error
 */
extern job_record_t *find_het_job_record(uint32_t job_id, uint32_t het_job_id);

/*
 * find_job_record - return a pointer to the job record with the given job_id
 * IN job_id - requested job's id
 * RET pointer to the job's record, NULL on error
 */
extern job_record_t *find_job_record(uint32_t job_id);

/*
 * find_first_node_record - find a record for first node in the bitmap
 * IN node_bitmap
 */
extern node_record_t *find_first_node_record(bitstr_t *node_bitmap);

/*
 * find_part_record - find a record for partition with specified name
 * IN name - name of the desired partition
 * RET pointer to partition or NULL if not found
 */
extern part_record_t *find_part_record(char *name);

/*
 * find_step_record - return a pointer to the step record with the given
 *	job_id and step_id
 * IN job_ptr - pointer to job table entry to have step record added
 * IN step_id - id of the desired job step
 * RET pointer to the job step's record, NULL on error
 */
extern step_record_t *find_step_record(job_record_t *job_ptr,
                                       uint32_t step_id);

/*
 * free_null_array_recs - free an xmalloc'd job_array_struct_t structure inside
 *	of a job_record_t and set job_ptr->array_recs to NULL.
 */
extern void free_null_array_recs(job_record_t *array_recs);

/*
 * get_job_env - return the environment variables and their count for a
 *	given job
 * IN job_ptr - pointer to job for which data is required
 * OUT env_size - number of elements to read
 * RET pointer to array of string pointers containing environment variables
 */
extern char **get_job_env(job_record_t *job_ptr, uint32_t *env_size);

/*
 * get_job_script - return the script for a given job
 * IN job_ptr - pointer to job for which data is required
 * RET Buf containing job script
 */
extern Buf get_job_script(const job_record_t *job_ptr);

/*
 * Return the next available job_id to be used.
 * IN test_only - if true, doesn't advance the job_id sequence, just returns
 *	what the next job id will be.
 * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
 */
extern uint32_t get_next_job_id(bool test_only);

/*
 * get_part_list - find record for named partition(s)
 * IN name - partition name(s) in a comma separated list
 * OUT err_part - The first invalid partition name.
 * RET List of pointers to the partitions or NULL if not found
 * NOTE: Caller must free the returned list
 * NOTE: Caller must free err_part
 */
extern List get_part_list(char *name, char **err_part);

/*
 * init_depend_policy()
 * Initialize variables from DependencyParameters
 */
extern void init_depend_policy(void);
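/*
 * Illustrative sketch of the lookup helpers declared above: resolve a
 * job_id/step_id pair to a step record. Both lookups return NULL on failure,
 * so the result must be checked. Assumes the caller holds the job read lock;
 * the wrapper name is hypothetical.
 */
static inline step_record_t *example_lookup_step(uint32_t job_id,
                                                 uint32_t step_id)
{
    job_record_t *job_ptr = find_job_record(job_id);

    if (!job_ptr)
        return NULL;    /* no such job */
    return find_step_record(job_ptr, step_id);
}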
/*
 * init_job_conf - initialize the job configuration tables and values.
 *	this should be called after creating node information, but
 *	before creating any job entries.
 * RET 0 if no error, otherwise an error code
 * global: last_job_update - time of last job table update
 *	job_list - pointer to global job list
 */
extern int init_job_conf (void);

/*
 * init_node_conf - initialize the node configuration tables and values.
 *	this should be called before creating any node or configuration
 *	entries.
 * RET 0 if no error, otherwise an error code
 * global: node_record_table_ptr - pointer to global node table
 *	default_node_record - default values for node records
 *	default_config_record - default values for configuration records
 *	hash_table - table of hash indexes
 *	last_node_update - time of last node table update
 */
extern int init_node_conf (void);

/*
 * init_part_conf - initialize the default partition configuration values
 *	and create a (global) partition list.
 *	this should be called before creating any partition entries.
 * RET 0 if no error, otherwise an error code
 * global: default_part - default partition values
 *	part_list - global partition list
 */
extern int init_part_conf (void);

/* init_requeue_policy()
 * Initialize the requeue exit/hold bitmaps.
 */
extern void init_requeue_policy(void);

/*
 * is_node_down - determine if the specified node's state is DOWN
 * IN name - name of the node
 * RET true if node exists and is down, otherwise false
 */
extern bool is_node_down (char *name);

/*
 * is_node_resp - determine if the specified node's state is responding
 * IN name - name of the node
 * RET true if node exists and is responding, otherwise false
 */
extern bool is_node_resp (char *name);

/* Fail a job because the qos is no longer valid */
extern int job_fail_qos(job_record_t *job_ptr, const char *func_name);

/*
 * delete_job_desc_files - remove the state files and directory
 *	for a given job_id from SlurmStateSaveLocation
 */
extern void delete_job_desc_files(uint32_t job_id);

/*
 * job_alloc_info - get details about an existing job allocation
 * IN uid - uid of user making the request
 * IN job_id - ID of job for which info is requested
 * OUT job_pptr - set to pointer to job record
 * NOTE: See job_alloc_info_ptr() if job pointer is known
 */
extern int job_alloc_info(uint32_t uid, uint32_t job_id,
			  job_record_t **job_pptr);

/*
 * job_alloc_info_ptr - get details about an existing job allocation
 * IN uid - uid of user making the request
 * IN job_ptr - pointer to job record
 * NOTE: See job_alloc_info() if job pointer not known
 */
extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr);
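
/*
 * Example (illustrative sketch, not part of the API): the init_* comments
 * above imply an ordering constraint at controller startup: node tables
 * before job tables. One plausible outline, assuming fatal() from
 * src/common/log.h and 0 == SLURM_SUCCESS returns as documented:
 *
 *	if (init_node_conf() != SLURM_SUCCESS)
 *		fatal("init_node_conf failed");
 *	if (init_part_conf() != SLURM_SUCCESS)
 *		fatal("init_part_conf failed");
 *	if (init_job_conf() != SLURM_SUCCESS)	// only after node info exists
 *		fatal("init_job_conf failed");
 */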
/*
 * job_allocate - create job_records for the supplied job specification and
 *	allocate nodes for it.
 * IN job_specs - job specifications
 * IN immediate - if set then either initiate the job immediately or fail
 * IN will_run - don't initiate the job if set, just test if it could run
 *	now or later
 * OUT resp - will run response (includes start location, time, etc.)
 * IN allocate - resource allocation request only if set, batch job if zero
 * IN submit_uid - uid of user issuing the request
 * OUT job_pptr - set to pointer to job record
 * OUT err_msg - Custom error message to the user, caller to xfree results
 * IN protocol_version - version of the code the caller is using
 * RET 0 or an error code. If the job would only be able to execute with
 *	some change in partition configuration then
 *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
 * NOTE: If allocating nodes lx[0-7] to a job and those nodes have cpu counts
 *	of 4, 4, 4, 4, 8, 8, 4, 4 then num_cpu_groups=3, cpus_per_node={4,8,4}
 *	and cpu_count_reps={4,2,2}
 * globals: job_list - pointer to global job list
 *	list_part - global list of partition info
 *	default_part_loc - pointer to default partition
 * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part
 */
extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
			int will_run, will_run_response_msg_t **resp,
			int allocate, uid_t submit_uid,
			job_record_t **job_pptr,
			char **err_msg, uint16_t protocol_version);

/* If this is a job array meta-job, prepare it for being scheduled */
extern void job_array_pre_sched(job_record_t *job_ptr);

/* If this is a job array meta-job, clean up after scheduling attempt */
extern job_record_t *job_array_post_sched(job_record_t *job_ptr);

/* Create an exact copy of an existing job record for a job array.
 * IN job_ptr - META job record for a job array, which is to become an
 *	individual task of the job array.
 *	Set the job's array_task_id to the task to be split out.
 * RET - The new job record, which is the new META job record. */
extern job_record_t *job_array_split(job_record_t *job_ptr);

/* Record the start of one job array task */
extern void job_array_start(job_record_t *job_ptr);

/* Return true if a job array task can be started */
extern bool job_array_start_test(job_record_t *job_ptr);

/* Clear job's CONFIGURING flag and advance end time as needed */
extern void job_config_fini(job_record_t *job_ptr);

/* Reset a job's end_time based upon its start_time and time_limit.
 * NOTE: Do not reset the end_time if already being preempted */
extern void job_end_time_reset(job_record_t *job_ptr);

/*
 * job_hold_by_assoc_id - Hold all pending jobs with a given
 *	association ID. This happens when an association is deleted (e.g. when
 *	a user is removed from the association database).
 * RET count of held jobs
 */
extern int job_hold_by_assoc_id(uint32_t assoc_id);

/*
 * job_hold_by_qos_id - Hold all pending jobs with a given
 *	QOS ID. This happens when a QOS is deleted (e.g. when
 *	a QOS is removed from the association database).
 * RET count of held jobs
 */
extern int job_hold_by_qos_id(uint32_t qos_id);

/* log the completion of the specified job */
extern void job_completion_logger(job_record_t *job_ptr, bool requeue);
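
/*
 * Example (illustrative sketch, not part of the API): one plausible flow
 * for a scheduler pass over a job array meta-job, bracketing the attempt
 * with the pre/post hooks above and gating each start on the array's task
 * limit; the exact sequencing in the scheduler may differ:
 *
 *	if (!job_array_start_test(job_ptr))
 *		return;				// task limit reached for now
 *	job_array_pre_sched(job_ptr);		// split out a task to run
 *	...attempt the allocation...
 *	if (started)
 *		job_array_start(job_ptr);	// record one task started
 *	job_ptr = job_array_post_sched(job_ptr); // clean up the meta record
 */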
/*
 * Return total amount of memory allocated to a job. This can be based upon
 * a GRES specification with various GRES/memory allocations on each node.
 * If current allocation information is not available, estimate memory based
 * upon pn_min_memory and either CPU or node count.
 */
extern uint64_t job_get_tres_mem(struct job_resources *job_res,
				 uint64_t pn_min_memory, uint32_t cpu_cnt,
				 uint32_t node_cnt);

/*
 * job_epilog_complete - Note the completion of the epilog script for a
 *	given job
 * IN job_id - id of the job for which the epilog was executed
 * IN node_name - name of the node on which the epilog was executed
 * IN return_code - return code from epilog script
 * RET true if job is COMPLETED, otherwise false
 */
extern bool job_epilog_complete(uint32_t job_id, char *node_name,
				uint32_t return_code);

/*
 * job_end_time - Process JOB_END_TIME
 * IN time_req_msg - job end time request
 * OUT timeout_msg - job timeout response to be sent
 * RET SLURM_SUCCESS or an error code
 */
extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
			srun_timeout_msg_t *timeout_msg);

/* job_fini - free all memory associated with job records */
extern void job_fini (void);

/*
 * job_fail - terminate a job due to initiation failure
 * IN job_id - id of the job to be killed
 * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_fail(uint32_t job_id, uint32_t job_state);

/* job_hold_requeue()
 *
 * Requeue the job based upon its current state.
 * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state.
 * If JOB_REQUEUE_HOLD then requeue and hold.
 * If JOB_REQUEUE then requeue and let it run again.
 * The requeue can happen directly from job_requeue() or from
 * job_epilog_complete() after the last component has finished.
 */
extern bool job_hold_requeue(job_record_t *job_ptr);

/*
 * determine if job is ready to execute per the node select plugin
 * IN job_id - job to test
 * OUT ready - 1 if job is ready to execute, 0 otherwise
 * RET Slurm error code
 */
extern int job_node_ready(uint32_t job_id, int *ready);

/* Record accounting information for a job immediately before changing size */
extern void job_pre_resize_acctg(job_record_t *job_ptr);

/* Record accounting information for a job immediately after changing size */
extern void job_post_resize_acctg(job_record_t *job_ptr);

/*
 * job_signal - signal the specified job, access checks already done
 * IN job_ptr - job to be signaled
 * IN signal - signal to send, SIGKILL == cancel the job
 * IN flags - see KILL_JOB_* flags in slurm.h
 * IN uid - uid of requesting user
 * IN preempt - true if job being preempted
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_signal(job_record_t *job_ptr, uint16_t signal,
		      uint16_t flags, uid_t uid, bool preempt);

/*
 * job_signal_id - signal the specified job
 * IN job_id - id of the job to be signaled
 * IN signal - signal to send, SIGKILL == cancel the job
 * IN flags - see KILL_JOB_* flags in slurm.h
 * IN uid - uid of requesting user
 * IN preempt - true if job being preempted
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_signal_id(uint32_t job_id, uint16_t signal, uint16_t flags,
			 uid_t uid, bool preempt);
/*
 * het_job_signal - signal all components of a hetjob
 * IN het_job_leader - job record of the hetjob leader
 * IN signal - signal to send, SIGKILL == cancel the job
 * IN flags - see KILL_JOB_* flags in slurm.h
 * IN uid - uid of requesting user
 * IN preempt - true if job being preempted
 * RET 0 on success, otherwise ESLURM error code
 */
extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal,
			  uint16_t flags, uid_t uid, bool preempt);

/*
 * job_str_signal - signal the specified job
 * IN job_id_str - id of the job to be signaled, valid formats include "#"
 *	"#_#" and "#_[expr]"
 * IN signal - signal to send, SIGKILL == cancel the job
 * IN flags - see KILL_JOB_* flags in slurm.h
 * IN uid - uid of requesting user
 * IN preempt - true if job being preempted
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags,
			  uid_t uid, bool preempt);

/*
 * job_suspend/job_suspend2 - perform some suspend/resume operation
 * NB job_suspend  - Uses the job_id field and ignores job_id_str
 * NB job_suspend2 - Ignores the job_id field and uses job_id_str
 *
 * IN sus_ptr - suspend/resume request message
 * IN uid - user id of the user issuing the RPC
 * IN conn_fd - file descriptor on which to send reply,
 *	-1 if none
 * IN indf_susp - set if job is being suspended indefinitely by user or admin
 *	and we should clear its priority, otherwise suspended
 *	temporarily for gang scheduling
 * IN protocol_version - slurm protocol version of client
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
		       int conn_fd, bool indf_susp,
		       uint16_t protocol_version);
extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid,
			int conn_fd, bool indf_susp,
			uint16_t protocol_version);

/*
 * job_complete - note the normal termination of the specified job
 * IN job_id - id of the job which completed
 * IN uid - user id of user issuing the RPC
 * IN requeue - job should be run again if possible
 * IN node_fail - true if job terminated due to node failure
 * IN job_return_code - job's return code, if set then set state to JOB_FAILED
 * RET - 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
			bool node_fail, uint32_t job_return_code);

/*
 * job_independent - determine if this job has a dependent job pending
 *	or if the job's scheduled begin time is in the future
 * IN job_ptr - pointer to job being tested
 * RET - true if the job no longer must be deferred for another job
 */
extern bool job_independent(job_record_t *job_ptr);
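
/*
 * Example (illustrative sketch, not part of the API): which signal entry
 * point to use depends on what the caller holds. With a job pointer in
 * hand use job_signal(); with only a numeric id use job_signal_id(); with
 * a user-supplied string such as "1234_7" use job_str_signal(), which also
 * understands job array expressions:
 *
 *	rc = job_str_signal("1234_[1-3]", SIGKILL, 0, uid, false);
 *	if (rc != SLURM_SUCCESS)
 *		error("cancel failed: %s", slurm_strerror(rc));
 */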
/*
 * job_req_node_filter - job request node filter.
 *	clear from a bitmap the nodes which can not be used for a job
 *	test memory size, required features, processor count, etc.
 * NOTE: Does not support exclusive OR of features.
 *	It just matches first element of XOR and ignores count.
 * IN job_ptr - pointer to job to be scheduled
 * IN/OUT avail_bitmap - set of nodes being considered for use
 * RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features)
 */
extern int job_req_node_filter(job_record_t *job_ptr, bitstr_t *avail_bitmap,
			       bool test_only);

/*
 * job_requeue - Requeue a running or pending batch job
 * IN uid - user id of user issuing the RPC
 * IN job_id - id of the job to be requeued
 * IN msg - slurm_msg to send response back on
 * IN preempt - true if job being preempted
 * IN flags - JobExitRequeue | Hold | JobFailed | etc.
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg,
		       bool preempt, uint32_t flags);

/*
 * job_requeue2 - Requeue a running or pending batch job
 * IN uid - user id of user issuing the RPC
 * IN req_ptr - request including ID of the job to be requeued
 * IN msg - slurm_msg to send response back on
 * IN preempt - true if job being preempted
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg,
			bool preempt);

/*
 * job_set_top - Move the specified job to the top of the queue (at least
 *	for that user ID, partition, account, and QOS).
 *
 * IN top_ptr - user request
 * IN uid - user id of the user issuing the RPC
 * IN conn_fd - file descriptor on which to send reply,
 *	-1 if none
 * IN protocol_version - slurm protocol version of client
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_set_top(top_job_msg_t *top_ptr, uid_t uid, int conn_fd,
		       uint16_t protocol_version);

/*
 * job_step_complete - note normal completion of the specified job step
 * IN job_id - id of the job to be completed
 * IN job_step_id - id of the job step to be completed
 * IN uid - user id of user issuing the RPC
 * IN requeue - job should be run again if possible
 * IN job_return_code - job's return code, if set then set state to JOB_FAILED
 * RET 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
extern int job_step_complete (uint32_t job_id, uint32_t job_step_id,
			      uid_t uid, bool requeue,
			      uint32_t job_return_code);

/*
 * job_step_signal - signal the specified job step
 * IN job_id - id of the job to be cancelled
 * IN step_id - id of the job step to be cancelled
 * IN signal - signal to send
 * IN flags - RPC flags
 * IN uid - user id of user issuing the RPC
 * RET 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
int job_step_signal(uint32_t job_id, uint32_t step_id,
		    uint16_t signal, uint16_t flags, uid_t uid);

/*
 * job_time_limit - terminate jobs which have exceeded their time limit
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
extern void job_time_limit (void);
/* Builds the tres_req_cnt and tres_req_str of a job.
 * Only set when job is pending.
 * NOTE: job write lock must be locked before calling this */
extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked);

/*
 * job_set_alloc_tres - set the tres up when allocating the job.
 * Only set when job is running.
 * NOTE: job write lock must be locked before calling this */
extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked);

/*
 * job_update_tres_cnt - when job is completing remove allocated tres
 *	from count.
 * IN/OUT job_ptr - job structure to be updated
 * IN node_inx - node bit that is finished with job.
 * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow
 */
extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx);

/*
 * Modify a job's memory limit if allocated all memory on a node and that node
 * reboots, possibly with a different memory size (e.g. KNL MCDRAM mode changed)
 */
extern void job_validate_mem(job_record_t *job_ptr);

/*
 * check_job_step_time_limit - terminate job steps which have exceeded
 *	their time limit
 * IN job_ptr - pointer to job containing steps to check
 * IN now - current time to use for the limit check
 */
extern void check_job_step_time_limit(job_record_t *job_ptr, time_t now);

/*
 * Kill job or job step
 *
 * IN job_step_kill_msg - msg with specs on which job/step to cancel.
 * IN uid - uid of user requesting job/step cancel.
 */
extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid);

/*
 * kill_job_by_part_name - Given a partition name, deallocate resource for
 *	its jobs and kill them
 * IN part_name - name of a partition
 * RET number of killed jobs
 */
extern int kill_job_by_part_name(char *part_name);

/*
 * kill_job_on_node - Kill the specific job on a specific node.
 * IN job_ptr - pointer to terminating job
 * IN node_ptr - pointer to the node on which the job resides
 */
extern void kill_job_on_node(job_record_t *job_ptr, node_record_t *node_ptr);
/*
 * kill_job_by_front_end_name - Given a front end node name, deallocate
 *	resource for its jobs and kill them.
 * IN node_name - name of a front end node
 * RET number of jobs associated with this front end node
 */
extern int kill_job_by_front_end_name(char *node_name);

/*
 * kill_running_job_by_node_name - Given a node name, deallocate RUNNING
 *	or COMPLETING jobs from the node or kill them
 * IN node_name - name of a node
 * RET number of killed jobs
 */
extern int kill_running_job_by_node_name(char *node_name);

/*
 * kill_step_on_node - determine if the specified job has any job steps
 *	allocated to the specified node and kill them unless no_kill flag
 *	is set on the step
 * IN job_ptr - pointer to an active job record
 * IN node_ptr - pointer to a node record
 * IN node_fail - true if removed node has failed
 * RET count of killed job steps
 */
extern int kill_step_on_node(job_record_t *job_ptr, node_record_t *node_ptr,
			     bool node_fail);

/* list_compare_config - compare two entries from the config list based upon
 * weight, see common/list.h for documentation */
int list_compare_config (void *config_entry1, void *config_entry2);

/*
 * list_find_feature - find an entry in the feature list, see list.h for
 *	documentation
 * IN key - is feature name or NULL for all features
 * RET 1 if found, 0 otherwise
 */
extern int list_find_feature(void *feature_entry, void *key);

/*
 * list_find_part - find an entry in the partition list, see common/list.h
 *	for documentation
 * IN key - partition name or "universal_key" for all partitions
 * RET 1 if matches key, 0 otherwise
 * global- part_list - the global partition list
 */
extern int list_find_part (void *part_entry, void *key);

/*
 * load_all_job_state - load the job state from file, recover from last
 *	checkpoint. Execute this after loading the configuration file data.
 * RET 0 or error code
 */
extern int load_all_job_state ( void );

/*
 * load_all_node_state - Load the node state from file, recover on slurmctld
 *	restart. Execute this after loading the configuration file data.
 *	Data goes into common storage.
 * IN state_only - if true overwrite only node state, features, gres and reason
 * RET 0 or error code
 */
extern int load_all_node_state ( bool state_only );

/*
 * load_last_job_id - load only the last job ID from state save file.
 * RET 0 or error code
 */
extern int load_last_job_id( void );

/*
 * load_part_uid_allow_list - reload the allow_uid list of partitions
 *	if required (updated group file or force set)
 * IN force - if set then always reload the allow_uid list
 */
extern void load_part_uid_allow_list ( int force );

/*
 * load_all_part_state - load the partition state from file, recover from
 *	slurmctld restart. execute this after loading the configuration
 *	file data.
 */
extern int load_all_part_state ( void );
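
/*
 * Example (illustrative sketch, not part of the API): per the comments
 * above, state recovery on slurmctld restart happens only after the
 * configuration file data has been loaded, and node state must be in
 * place before job state can be rebuilt. A hedged outline:
 *
 *	...slurm.conf already parsed at this point...
 *	(void) load_all_node_state(false);	// full node state recovery
 *	(void) load_all_part_state();
 *	if (load_all_job_state() != SLURM_SUCCESS)
 *		error("unable to recover job state");
 */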
/*
 * Create a new job step from data in a buffer (as created by
 *	dump_job_step_state)
 * IN/OUT job_ptr - pointer to the job for which the step is to be loaded.
 * IN/OUT buffer - location from which to get data, pointers
 *	automatically advanced
 */
extern int load_step_state(job_record_t *job_ptr, Buf buffer,
			   uint16_t protocol_version);

/*
 * Log contents of avail_feature_list and active_feature_list
 */
extern void log_feature_lists(void);

/* make_node_alloc - flag specified node as allocated to a job
 * IN node_ptr - pointer to node being allocated
 * IN job_ptr - pointer to job that is starting
 */
extern void make_node_alloc(node_record_t *node_ptr, job_record_t *job_ptr);

/* make_node_comp - flag specified node as completing a job
 * IN node_ptr - pointer to node marked for completion of job
 * IN job_ptr - pointer to job that is completing
 * IN suspended - true if job was previously suspended
 */
extern void make_node_comp(node_record_t *node_ptr, job_record_t *job_ptr,
			   bool suspended);

/*
 * make_node_idle - flag specified node as having finished with a job
 * IN node_ptr - pointer to node reporting job completion
 * IN job_ptr - pointer to job that just completed or NULL if not applicable
 */
extern void make_node_idle(node_record_t *node_ptr, job_record_t *job_ptr);

/*
 * Determine if the specified job can execute right now or is currently
 * blocked by a partition state or limit. These job states should match the
 * reason values returned by job_limits_check().
 */
extern bool misc_policy_job_runnable_state(job_record_t *job_ptr);

/* msg_to_slurmd - send given msg_type to every slurmd, no args */
extern void msg_to_slurmd (slurm_msg_type_t msg_type);

/* request a "configless" RPC be sent to all slurmd nodes */
void push_reconfig_to_slurmd(void);

/* node_fini - free all memory associated with node records */
extern void node_fini (void);

/* node_did_resp - record that the specified node is responding
 * IN name - name of the node */
extern void node_did_resp (char *name);

/*
 * node_not_resp - record that the specified node is not responding
 * IN name - name of the node
 * IN msg_time - time message was sent
 * IN resp_type - what kind of response came back from the node
 */
extern void node_not_resp (char *name, time_t msg_time,
			   slurm_msg_type_t resp_type);

/* For every node with the "not_responding" flag set, clear the flag
 * and log that the node is not responding using a hostlist expression */
extern void node_no_resp_msg(void);

/* For a given job ID return the number of PENDING tasks which have their
 * own separate job_record (do not count tasks in pending META job record) */
extern int num_pending_job_array_tasks(uint32_t array_job_id);
/*
 * pack_all_jobs - dump all job information for all jobs in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - job filtering options
 * IN uid - uid of user making request (for partition filtering)
 * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
 * IN protocol_version - slurm protocol version of client
 * global: job_list - global list of job records
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
			  uint16_t show_flags, uid_t uid, uint32_t filter_uid,
			  uint16_t protocol_version);

/*
 * pack_spec_jobs - dump job information for specified jobs in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - job filtering options
 * IN job_ids - list of job_ids to pack
 * IN uid - uid of user making request (for partition filtering)
 * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
 * global: job_list - global list of job records
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern void pack_spec_jobs(char **buffer_ptr, int *buffer_size, List job_ids,
			   uint16_t show_flags, uid_t uid, uint32_t filter_uid,
			   uint16_t protocol_version);

/*
 * pack_all_node - dump all configuration and node information for all nodes
 *	in machine independent form (for network transmission)
 * OUT buffer_ptr - pointer to the stored data
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - node filtering options
 * IN uid - uid of user making request (for partition filtering)
 * IN protocol_version - slurm protocol version of client
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: the caller must xfree the buffer at *buffer_ptr
 * NOTE: change slurm_load_node() in api/node_info.c when data format changes
 * NOTE: READ lock_slurmctld config before entry
 */
extern void pack_all_node (char **buffer_ptr, int *buffer_size,
			   uint16_t show_flags, uid_t uid,
			   uint16_t protocol_version);

/* Pack all scheduling statistics */
extern void pack_all_stat(int resp, char **buffer_ptr, int *buffer_size,
			  uint16_t protocol_version);

/*
 * pack_ctld_job_step_info_response_msg - packs job step info
 * IN job_id - specific id or NO_VAL for all
 * IN step_id - specific id or NO_VAL for all
 * IN uid - user issuing request
 * IN show_flags - job step filtering options
 * OUT buffer - location to store data, pointers automatically advanced
 * IN protocol_version - slurm protocol version of client
 * RET - 0 or error code
 * NOTE: MUST free_buf buffer
 */
extern int pack_ctld_job_step_info_response_msg(
	uint32_t job_id, uint32_t step_id, uid_t uid,
	uint16_t show_flags, Buf buffer, uint16_t protocol_version);
/*
 * pack_all_part - dump all partition information for all partitions in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - partition filtering options
 * IN uid - uid of user making request (for partition filtering)
 * IN protocol_version - slurm protocol version of client
 * global: part_list - global list of partition records
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change slurm_load_part() in api/part_info.c if data format changes
 */
extern void pack_all_part(char **buffer_ptr, int *buffer_size,
			  uint16_t show_flags, uid_t uid,
			  uint16_t protocol_version);

/*
 * pack_job - dump all configuration information about a specific job in
 *	machine independent form (for network transmission)
 * IN dump_job_ptr - pointer to job for which information is requested
 * IN show_flags - job filtering options
 * IN/OUT buffer - buffer in which data is placed, pointers automatically
 *	updated
 * IN uid - user requesting the data
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags,
		     Buf buffer, uint16_t protocol_version, uid_t uid);

/*
 * pack_part - dump all configuration information about a specific partition
 *	in machine independent form (for network transmission)
 * IN part_ptr - pointer to partition for which information is requested
 * IN/OUT buffer - buffer in which data is placed, pointers automatically
 *	updated
 * global: default_part_loc - pointer to the default partition
 * NOTE: if you make any changes here be sure to make the corresponding
 *	changes to load_part_config in api/partition_info.c
 */
extern void pack_part(part_record_t *part_ptr, Buf buffer,
		      uint16_t protocol_version);
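
/*
 * Example (illustrative sketch, not part of the API): per the NOTEs above,
 * the pack_* functions hand back an xmalloc'd buffer that the caller owns.
 * A hedged RPC handler pattern:
 *
 *	char *buffer = NULL;
 *	int buffer_size = 0;
 *	pack_all_part(&buffer, &buffer_size, show_flags, uid,
 *		      msg->protocol_version);
 *	...send buffer_size bytes from buffer back to the client...
 *	xfree(buffer);	// caller owns the packed data
 */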
/*
 * pack_one_job - dump information for one job in
 *	machine independent form (for network transmission)
 * OUT buffer_ptr - the pointer is set to the allocated buffer.
 * OUT buffer_size - set to size of the buffer in bytes
 * IN job_id - ID of job that we want info for
 * IN show_flags - job filtering options
 * IN uid - uid of user making request (for partition filtering)
 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
 *	whenever the data format changes
 */
extern int pack_one_job(char **buffer_ptr, int *buffer_size,
			uint32_t job_id, uint16_t show_flags, uid_t uid,
			uint16_t protocol_version);

/*
 * pack_one_node - dump all configuration and node information for one node
 *	in machine independent form (for network transmission)
 * OUT buffer_ptr - pointer to the stored data
 * OUT buffer_size - set to size of the buffer in bytes
 * IN show_flags - node filtering options
 * IN uid - uid of user making request (for partition filtering)
 * IN node_name - name of node for which information is desired,
 *	use first node if name is NULL
 * IN protocol_version - slurm protocol version of client
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: the caller must xfree the buffer at *buffer_ptr
 * NOTE: change slurm_load_node() in api/node_info.c when data format changes
 * NOTE: READ lock_slurmctld config before entry
 */
extern void pack_one_node (char **buffer_ptr, int *buffer_size,
			   uint16_t show_flags, uid_t uid, char *node_name,
			   uint16_t protocol_version);

/* part_is_visible - should user be able to see this partition */
extern bool part_is_visible(part_record_t *part_ptr, uid_t uid);

/* part_fini - free all memory associated with partition records */
extern void part_fini (void);

/*
 * Create a copy of a job's part_list (partition list)
 * IN part_list_src - a job's part_list
 * RET copy of part_list_src, must be freed by caller
 */
extern List part_list_copy(List part_list_src);

/*
 * Determine if the specified job can execute right now or is currently
 * blocked by a partition state or limit. Execute job_limits_check() to
 * re-validate job state.
 */
extern bool part_policy_job_runnable_state(job_record_t *job_ptr);

/*
 * Validate a job's account against the partition's AllowAccounts or
 *	DenyAccounts parameters.
 * IN part_ptr - Partition pointer
 * IN acct - account name
 * IN job_ptr - Job pointer or NULL. If set and job can not run, then set the
 *	job's state_desc and state_reason fields
 * RET SLURM_SUCCESS or error code
 */
extern int part_policy_valid_acct(part_record_t *part_ptr, char *acct,
				  job_record_t *job_ptr);

/*
 * Validate a job's QOS against the partition's AllowQOS or DenyQOS parameters.
 * IN part_ptr - Partition pointer
 * IN qos_ptr - QOS pointer
 * IN job_ptr - Job pointer or NULL. If set and job can not run, then set the
 *	job's state_desc and state_reason fields
 * RET SLURM_SUCCESS or error code
 */
extern int part_policy_valid_qos(part_record_t *part_ptr,
				 slurmdb_qos_rec_t *qos_ptr,
				 job_record_t *job_ptr);
/*
 * partition_in_use - determine whether a partition is in use by a RUNNING
 *	PENDING or SUSPENDED job
 * IN part_name - name of a partition
 * RET true if the partition is in use, else false
 */
extern bool partition_in_use(char *part_name);

/*
 * Set "batch_host" for this job based upon its "batch_features" and
 * "node_bitmap". The selection is deferred in case a node's "active_features"
 * is changed by a reboot.
 * Return SLURM_SUCCESS or error code
 */
extern int pick_batch_host(job_record_t *job_ptr);

/*
 * prolog_complete - note the normal termination of the prolog
 * IN job_id - id of the job which completed
 * IN prolog_return_code - prolog's return code,
 *	if set then set job state to FAILED
 * RET - 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
extern int prolog_complete(uint32_t job_id, uint32_t prolog_return_code);

/*
 * If the job or slurm.conf requests to not kill on invalid dependency,
 * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the
 * job.
 */
extern void handle_invalid_dependency(job_record_t *job_ptr);

/*
 * purge_old_job - purge old job records.
 *	The jobs must have completed at least MIN_JOB_AGE minutes ago.
 *	Test job dependencies, handle after_ok, after_not_ok before
 *	purging any jobs.
 * NOTE: READ lock slurmctld config and WRITE lock jobs before entry
 */
void purge_old_job(void);

/* Convert a comma delimited list of QOS names into a bitmap */
extern void qos_list_build(char *qos, bitstr_t **qos_bits);

/* Request that the job scheduler execute soon (typically within seconds) */
extern void queue_job_scheduler(void);

/*
 * rehash_jobs - Create or rebuild the job hash table.
 * NOTE: run lock_slurmctld before entry: Read config, write job
 */
extern void rehash_jobs(void);

/*
 * Rebuild a job step's core_bitmap_job after a job has just changed size
 * job_ptr IN - job that was just re-sized
 * orig_job_node_bitmap IN - The job's original node bitmap
 */
extern void rebuild_step_bitmaps(job_record_t *job_ptr,
				 bitstr_t *orig_job_node_bitmap);

/*
 * After a job has fully completed run this to release the resources
 * and remove it from the system.
 */
extern int post_job_step(step_record_t *step_ptr);

/*
 * Create the extern step and add it to the job.
 */
extern step_record_t *build_extern_step(job_record_t *job_ptr);

/*
 * Create the batch step and add it to the job.
 */
extern step_record_t *build_batch_step(job_record_t *job_ptr_in);

/* update first assigned job id as needed on reconfigure */
extern void reset_first_job_id(void);
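
/*
 * Example (illustrative sketch, not part of the API): qos_list_build()
 * converts QOS names to a bitstr_t indexed by QOS id, which can then be
 * tested cheaply. Assumes bit_test() from src/common/bitstring.h and the
 * FREE_NULL_BITMAP macro from src/common/macros.h:
 *
 *	bitstr_t *qos_bits = NULL;
 *	qos_list_build("normal,high", &qos_bits);
 *	if (qos_bits && bit_test(qos_bits, qos_ptr->id))
 *		;	// this QOS is in the list
 *	FREE_NULL_BITMAP(qos_bits);
 */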
/*
 * reset_job_bitmaps - reestablish bitmaps for existing jobs.
 *	this should be called after rebuilding node information,
 *	but before using any job entries.
 * global: last_job_update - time of last job table update
 *	job_list - pointer to global job list
 */
extern void reset_job_bitmaps (void);

/* Reset a node's CPU load value */
extern void reset_node_load(char *node_name, uint32_t cpu_load);

/* Reset a node's free memory value */
extern void reset_node_free_mem(char *node_name, uint64_t free_mem);

/* Reset all scheduling statistics
 * level IN - clear backfilled_jobs count if set */
extern void reset_stats(int level);

/*
 * restore_node_features - Make node and config (from slurm.conf) fields
 *	consistent for Features, Gres and Weight
 * IN recover -
 *	0, 1 - use data from config record, built using slurm.conf
 *	2 = use data from node record, built from saved state
 */
extern void restore_node_features(int recover);

/* Update time stamps for job step resume */
extern void resume_job_step(job_record_t *job_ptr);

/* run_backup - this is the backup controller, it should run in standby
 *	mode, assuming control when the primary controller stops responding */
extern void run_backup(slurm_trigger_callbacks_t *callbacks);

/*
 * ping_controllers - ping other controllers in HA configuration.
 * IN active_controller - true if active controller, false if backup
 */
extern int ping_controllers(bool active_controller);

/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void);

/* save_all_state - save entire slurmctld state for later recovery */
extern void save_all_state(void);

/* make sure the assoc_mgr lists are up and running and state is
 * restored */
extern void ctld_assoc_mgr_init(slurm_trigger_callbacks_t *callbacks);

/* send all info for the controller to accounting */
extern void send_all_to_accounting(time_t event_time, int db_rc);

/* A slurmctld lock needs to at least have a node read lock set before
 * this is called */
extern void set_cluster_tres(bool assoc_mgr_locked);

/* sends all jobs in eligible state to accounting. Only needed at
 * first registration
 */
extern int send_jobs_to_accounting(void);
/* send all nodes in a DOWN-like state to accounting. Only needed at
 * first registration
 */
extern int send_nodes_to_accounting(time_t event_time);

/* Decrement slurmctld thread count (as applies to thread limit) */
extern void server_thread_decr(void);

/* Increment slurmctld thread count (as applies to thread limit) */
extern void server_thread_incr(void);

/* Set a job's alias_list string */
extern void set_job_alias_list(job_record_t *job_ptr);

/*
 * set_job_prio - set a default job priority
 * IN job_ptr - pointer to the job_record
 */
extern void set_job_prio(job_record_t *job_ptr);

/*
 * set_node_down - make the specified node's state DOWN if possible
 *	(not in a DRAIN state), kill jobs as needed
 * IN name - name of the node
 * IN reason - why the node is DOWN
 */
extern void set_node_down (char *name, char *reason);

/*
 * set_node_down_ptr - make the specified compute node's state DOWN and
 *	kill jobs as needed
 * IN node_ptr - node_ptr to the node
 * IN reason - why the node is DOWN
 */
void set_node_down_ptr(node_record_t *node_ptr, char *reason);

/*
 * set_slurmctld_state_loc - create state directory as needed and "cd" to it
 */
extern void set_slurmctld_state_loc(void);

/*
 * signal_step_tasks - send specific signal to specific job step
 * IN step_ptr - step record pointer
 * IN signal - signal to send
 * IN msg_type - message type to send
 */
void signal_step_tasks(step_record_t *step_ptr, uint16_t signal,
		       slurm_msg_type_t msg_type);

/*
 * signal_step_tasks_on_node - send specific signal to specific job step
 *	on a specific node.
 * IN node_name - name of node on which to signal tasks
 * IN step_ptr - step record pointer
 * IN signal - signal to send
 * IN msg_type - message type to send
 */
void signal_step_tasks_on_node(char* node_name, step_record_t *step_ptr,
			       uint16_t signal, slurm_msg_type_t msg_type);

/*
 * slurmctld_shutdown - wake up slurm_rpc_mgr thread via signal
 * RET 0 or error code
 */
extern int slurmctld_shutdown(void);

/* Update a job's record of allocated CPUs when a job step gets scheduled */
extern void step_alloc_lps(step_record_t *step_ptr);

/*
 * step_create - creates a step_record in step_specs->job_id, sets up the
 *	step according to the step_specs.
 * IN step_specs - job step specifications
 * OUT new_step_record - pointer to the new step_record (NULL on error)
 * IN protocol_version - slurm protocol version of client
 * RET - 0 or error code
 * NOTE: don't free the returned step_record because that is managed through
 *	the job.
 */
extern int step_create(job_step_create_request_msg_t *step_specs,
		       step_record_t **new_step_record,
		       uint16_t protocol_version);
/*
 * step_layout_create - creates a step_layout according to the inputs.
 * IN step_ptr - step having tasks laid out
 * IN step_node_list - node list of hosts in step
 * IN node_count - count of nodes in step allocation
 * IN num_tasks - number of tasks in step
 * IN cpus_per_task - number of cpus per task
 * IN task_dist - type of task distribution
 * IN plane_size - size of plane (only needed for the plane distribution)
 * RET - NULL or slurm_step_layout_t *
 * NOTE: you need to free the returned step_layout usually when the
 *	step is freed.
 */
extern slurm_step_layout_t *step_layout_create(step_record_t *step_ptr,
					       char *step_node_list,
					       uint32_t node_count,
					       uint32_t num_tasks,
					       uint16_t cpus_per_task,
					       uint32_t task_dist,
					       uint16_t plane_size);

/*
 * step_list_purge - Simple purge of a job's step list records.
 * IN job_ptr - pointer to job table entry to have step records removed
 */
extern void step_list_purge(job_record_t *job_ptr);

/*
 * step_epilog_complete - note completion of epilog on some node and
 *	release its switch windows if appropriate. can perform partition
 *	switch window releases.
 * IN job_ptr - pointer to job which has completed epilog
 * IN node_name - name of node which has completed epilog
 */
extern int step_epilog_complete(job_record_t *job_ptr, char *node_name);

/*
 * step_partial_comp - Note the completion of a job step on at least
 *	some of its nodes
 * IN req - step_completion_msg RPC from slurmstepd
 * IN uid - UID issuing the request
 * OUT rem - count of nodes for which responses are still pending
 * OUT max_rc - highest return code for any step thus far
 * RET 0 on success, otherwise ESLURM error code
 */
extern int step_partial_comp(step_complete_msg_t *req, uid_t uid,
			     int *rem, uint32_t *max_rc);

/*
 * step_set_alloc_tres - set the tres up when allocating the step.
 * Only set when job is running.
 * NOTE: job write lock must be locked before calling this */
extern void step_set_alloc_tres(step_record_t *step_ptr, uint32_t node_count,
				bool assoc_mgr_locked, bool make_formatted);

/* Update time stamps for job step suspend */
extern void suspend_job_step(job_record_t *job_ptr);

/* For the job array data structure, build the string representation of the
 * bitmap.
 * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */
extern void build_array_str(job_record_t *job_ptr);

/* Return true if ALL tasks of specific array job ID are complete */
extern bool test_job_array_complete(uint32_t array_job_id);

/* Return true if ALL tasks of specific array job ID are completed */
extern bool test_job_array_completed(uint32_t array_job_id);

/* Return true if ALL tasks of specific array job ID are finished */
extern bool test_job_array_finished(uint32_t array_job_id);

/* Return true if ANY tasks of specific array job ID are pending */
extern bool test_job_array_pending(uint32_t array_job_id);

/* Determine if the nodes are ready to run a job
 * RET true if ready */
extern bool test_job_nodes_ready(job_record_t *job_ptr);
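
/*
 * Example (illustrative sketch, not part of the API): step completion RPCs
 * arrive from groups of nodes and are aggregated by step_partial_comp();
 * the step is finished only once no node responses remain outstanding:
 *
 *	int rem = 0;
 *	uint32_t max_rc = 0;
 *	rc = step_partial_comp(req, uid, &rem, &max_rc);
 *	if (rc == SLURM_SUCCESS && rem == 0)
 *		;	// last node reported; step cleanup may proceed
 */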
/*
 * Synchronize the batch jobs in the system with their files.
 * All pending batch jobs must have script and environment files
 * No other jobs should have such files
 */
extern int sync_job_files(void);

/* After recovering job state, if using priority/basic then we increment the
 * priorities of all jobs to avoid decrementing the base down to zero */
extern void sync_job_priorities(void);

/* True if running jobs are allowed to expand, false otherwise. */
extern bool permit_job_expansion(void);

/* True if running jobs are allowed to shrink, false otherwise. */
extern bool permit_job_shrink(void);

/*
 * update_job - update a job's parameters per the supplied specifications
 * IN msg - RPC to update job, including change specification
 * IN uid - uid of user issuing RPC
 * IN send_msg - whether to send msg back or not
 * RET returns an error code from slurm_errno.h
 * global: job_list - global list of job entries
 *	last_job_update - time of last job table update
 */
extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg);

/*
 * update_job_str - update parameters for jobs identified by a job ID string
 *	(which may specify multiple jobs, e.g. a job array)
 * IN msg - RPC to update job, including change specification
 * IN uid - uid of user issuing RPC
 * RET returns an error code from slurm_errno.h
 * global: job_list - global list of job entries
 *	last_job_update - time of last job table update
 */
extern int update_job_str(slurm_msg_t *msg, uid_t uid);

/*
 * Modify the wckey associated with a pending job
 * IN module - where this is called from
 * IN job_ptr - pointer to job which should be modified
 * IN new_wckey - desired wckey name
 * RET SLURM_SUCCESS or error code
 */
extern int update_job_wckey(char *module, job_record_t *job_ptr,
			    char *new_wckey);

/* Reset nodes_completing field for all jobs */
extern void update_job_nodes_completing(void);

/* Reset slurmctld logging based upon configuration parameters
 * uses common slurmctld_conf data structure */
extern void update_logging(void);

/*
 * update_node - update the configuration data for one or more nodes
 * IN update_node_msg - update node request
 * RET 0 or error code
 * global: node_record_table_ptr - pointer to global node table
 */
extern int update_node ( update_node_msg_t * update_node_msg ) ;

/* Update nodes accounting usage data */
extern void update_nodes_acct_gather_data(void);

/*
 * update_node_record_acct_gather_data - update the energy data in the
 *	node_record
 * IN msg - node energy data message
 * RET 0 if no error, ENOENT if no such node
 */
extern int update_node_record_acct_gather_data(
	acct_gather_node_resp_msg_t *msg);
/*
 * Process string and set partition fields to appropriate values if valid
 *
 * IN billing_weights_str - suggested billing weights
 * IN part_ptr - pointer to partition
 * IN fail - whether the inner function should fatal if the string is invalid.
 * RET SLURM_ERROR on error, SLURM_SUCCESS otherwise.
 */
extern int set_partition_billing_weights(char *billing_weights_str,
					 part_record_t *part_ptr, bool fail);

/*
 * update_part - create or update a partition's configuration data
 * IN part_desc - description of partition changes
 * IN create_flag - create a new partition
 * RET 0 or an error code
 * global: part_list - list of partition entries
 *	last_part_update - update time of partition records
 */
extern int update_part (update_part_msg_t * part_desc, bool create_flag);

/* Process job step update request from specified user,
 * RET - 0 or error code */
extern int update_step(step_update_request_msg_t *req, uid_t uid);

/*
 * validate_alloc_node - validate that the allocating node
 *	is allowed to use this partition
 * IN part_ptr - pointer to a partition
 * IN alloc_node - allocating node of the request
 * RET 1 if permitted to run, 0 otherwise
 */
extern int validate_alloc_node(part_record_t *part_ptr, char *alloc_node);

/*
 * validate_group - validate that the submit uid is authorized to run in
 *	this partition
 * IN part_ptr - pointer to a partition
 * IN run_uid - user to run the job as
 * RET 1 if permitted to run, 0 otherwise
 */
extern int validate_group(part_record_t *part_ptr, uid_t run_uid);

/* Perform some size checks on strings we store to prevent
 * a malicious user from filling slurmctld's memory
 * IN job_desc - user job submit request
 * IN submit_uid - UID making job submit request
 * OUT err_msg - custom error message to return
 * RET 0 or error code */
extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid,
				   char **err_msg);

/*
 * validate_jobs_on_node - validate that any jobs that should be on the node
 *	are actually running, if not clean up the job records and/or node
 *	records, call this function after validate_node_specs() sets the node
 *	state properly
 * IN reg_msg - node registration message
 */
extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg);

/*
 * validate_node_specs - validate the node's specifications as valid,
 *	if not set state to down, in any case update last_response
 * IN reg_msg - node registration message
 * IN protocol_version - Version of Slurm on this node
 * OUT newly_up - set if node newly brought into service
 * RET 0 if no error, ENOENT if no such node, EINVAL if values too low
 * NOTE: READ lock_slurmctld config before entry
 */
extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg,
			       uint16_t protocol_version, bool *newly_up);
/*
 * validate_nodes_via_front_end - validate all nodes on a cluster as having
 *	a valid configuration as soon as the front-end registers. Individual
 *	nodes will not register with this configuration
 * IN reg_msg - node registration message
 * IN protocol_version - Version of Slurm on this node
 * OUT newly_up - set if node newly brought into service
 * RET 0 if no error, Slurm error code otherwise
 * NOTE: READ lock_slurmctld config before entry
 */
extern int validate_nodes_via_front_end(
	slurm_node_registration_status_msg_t *reg_msg,
	uint16_t protocol_version, bool *newly_up);

/*
 * validate_slurm_user - validate that the uid is authorized to see
 *	privileged data (either user root or SlurmUser)
 * IN uid - user to validate
 * RET true if permitted to run, false otherwise
 */
extern bool validate_slurm_user(uid_t uid);

/*
 * validate_super_user - validate that the uid is authorized at the
 *	root, SlurmUser, or SLURMDB_ADMIN_SUPER_USER level
 * IN uid - user to validate
 * RET true if permitted to run, false otherwise
 */
extern bool validate_super_user(uid_t uid);

/*
 * validate_operator - validate that the uid is authorized at the
 *	root, SlurmUser, or SLURMDB_ADMIN_OPERATOR level
 * IN uid - user to validate
 * RET true if permitted to run, false otherwise
 */
extern bool validate_operator(uid_t uid);

/* cleanup_completing()
 *
 * Clean up the JOB_COMPLETING flag and eventually
 * requeue the job if there is a pending request
 * for it. This function assumes the caller has the
 * appropriate locks on the job_record.
 * It is called when a job completes, after either
 * the slurmd epilog or the slurmctld epilog finishes,
 * whichever comes last.
 */
extern void cleanup_completing(job_record_t *job_ptr);

/* trace_job() - print the job details if
 * the DEBUG_FLAG_TRACE_JOBS is set
 */
extern void trace_job(job_record_t *job_ptr, const char *, const char *);

/*
 * Determine if slurmctld will respond to "configless" RPCs. If so,
 * load the internal cached config values to avoid regenerating on each
 * RPC.
 */
extern void configless_setup(void);
/* Free cached values to avoid memory leak. */
extern void configless_clear(void);

/* Wait with a timeout for the specified process to terminate */
int
waitpid_timeout(const char *, pid_t, int *, int);

/*
 * Calculate and populate the number of tres' for all partitions.
 */
extern void set_partition_tres();

/*
 * Update job's federated siblings strings.
 *
 * IN job_ptr - job_ptr to update
 */
extern void update_job_fed_details(job_record_t *job_ptr);

/*
 * purge_job_record - purge specific job record. No testing is performed to
 *	ensure the job record has no active references. Use only for job
 *	records that were never fully operational (e.g. WILL_RUN test, failed
 *	job load, failed job create, etc.).
 * IN job_id - job_id of job record to be purged
 * RET int - count of jobs purged
 * global: job_list - global job table
 */
extern int purge_job_record(uint32_t job_id);
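
/*
 * Example (illustrative sketch, not part of the API): the validate_*()
 * helpers above form an escalating privilege ladder and are typically used
 * to gate RPC handlers. A hedged pattern, where REQUEST_FOO is a placeholder
 * message name and ESLURM_ACCESS_DENIED comes from slurm/slurm_errno.h:
 *
 *	if (!validate_operator(uid)) {
 *		error("Security violation, REQUEST_FOO RPC from uid=%u", uid);
 *		return ESLURM_ACCESS_DENIED;
 *	}
 */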
/*
 * Remove job from job hashes so that it can't be found, but leave job in
 * job_table so that it can be deleted by _list_delete_job().
 *
 * IN job_ptr - job_ptr to be unlinked
 */
extern void unlink_job_record(job_record_t *job_ptr);

/*
 * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job.
 * IN job_ptr - the job record
 * RET the job_desc_msg_t, NULL on error
 */
extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr);

/*
 * Set the allocation response with the current cluster's information and the
 * job's allocated node's addr's if the allocation is being filled by a cluster
 * other than the cluster that submitted the job
 *
 * Note: make sure that the resp's working_cluster_rec is NULLed out before the
 * resp is freed since it points to global memory.
 *
 * IN resp - allocation response being sent back to client.
 * IN job_ptr - allocated job
 * IN req_cluster - the cluster requesting the allocation info.
 */
extern void
set_remote_working_response(resource_allocation_response_msg_t *resp,
			    job_record_t *job_ptr,
			    const char *req_cluster);

/*
 * Free job's fed_details ptr.
 */
extern void free_job_fed_details(job_fed_details_t **fed_details_pptr);

/*
 * Calculate billable TRES based on partition's defined BillingWeights. If none
 * is defined, return total_cpus. This is cached on job_ptr->billable_tres and
 * is updated if the job was resized since the last iteration.
 *
 * IN job_ptr - job to calc billable tres on
 * IN start_time - time the job has started or been resized
 * IN assoc_mgr_locked - whether the tres assoc lock is set or not
 */
extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time,
				     bool assoc_mgr_locked);

/*
 * Realloc and possibly update a job_ptr->limit_set->tres array.
 *
 * If a new TRES is added the TRES positions in the array could have been moved
 * around. The array either needs to be grown and/or the values need to be put
 * in their new position.
 *
 * IN: tres_limits - job_ptr->limit_set->tres array.
 */
extern void update_job_limit_set_tres(uint16_t **tres_limits);

/*
 * Validate TRES specification of the form:
 * "name=[type:]#[,[type:]#][;name=[type:]#]"
 * For example: "gpu:kepler:2,craynetwork=1"
 */
extern bool valid_tres_cnt(char *tres);

/*
 * Validate the named TRES is valid for scheduling parameters.
 * This is currently a subset of all defined TRES.
 */
extern bool valid_tres_name(char *name);

/*
 * Check for nodes that haven't rebooted yet.
 *
 * If the node hasn't booted by ResumeTimeout, mark the node as down.
 */
extern void check_reboot_nodes();

/*
 * Send warning signal to job before end time.
 *
 * IN job_ptr - job to send warn signal to.
 * IN ignore_time - If set, ignore the warn time and just send it.
 */
extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time);

/*
 * Check if we are still waiting for the node to boot.
 *
 * IN node_ptr - node to check if still waiting for boot.
 *
 * RET true if still expecting the node to boot, false otherwise.
 */
extern bool waiting_for_node_boot(struct node_record *node_ptr);
/*
 * Check if any part of job_ptr overlaps node_map.
 * IN node_map - bitstr of nodes set.
 * IN job_ptr - job (hetjob or not) to check.
 *
 * RET true if we overlap, false otherwise
 */
extern bool job_overlap_and_running(bitstr_t *node_map, job_record_t *job_ptr);

/*
 * Respond to request for backup slurmctld status
 */
extern void slurm_rpc_control_status(slurm_msg_t *msg, time_t control_time);

/*
 * Callbacks to let the PrEp plugins signal completion if running async.
 */
extern void prep_prolog_slurmctld_callback(int rc, uint32_t job_id);
extern void prep_epilog_slurmctld_callback(int rc, uint32_t job_id);

#endif /* !_HAVE_SLURMCTLD_H */