1 /*****************************************************************************\ 2 * job_scheduler.h - data structures and function definitions for scheduling 3 * of pending jobs in priority order 4 ***************************************************************************** 5 * Copyright (C) 2002-2007 The Regents of the University of California. 6 * Copyright (C) 2008-2010 Lawrence Livermore National Security. 7 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 8 * Written by Morris Jette <jette@llnl.gov>, et. al. 9 * Derived from dsh written by Jim Garlick <garlick1@llnl.gov> 10 * CODE-OCEC-09-009. All rights reserved. 11 * 12 * This file is part of Slurm, a resource management program. 13 * For details, see <https://slurm.schedmd.com/>. 14 * Please also read the included file: DISCLAIMER. 15 * 16 * Slurm is free software; you can redistribute it and/or modify it under 17 * the terms of the GNU General Public License as published by the Free 18 * Software Foundation; either version 2 of the License, or (at your option) 19 * any later version. 20 * 21 * In addition, as a special exception, the copyright holders give permission 22 * to link the code of portions of this program with the OpenSSL library under 23 * certain conditions as described in each individual source file, and 24 * distribute linked combinations including the two. You must obey the GNU 25 * General Public License in all respects for all of the code used other than 26 * OpenSSL. If you modify file(s) with this exception, you may extend this 27 * exception to your version of the file(s), but you are not obligated to do 28 * so. If you do not wish to do so, delete this exception statement from your 29 * version. If you delete this exception statement from all source files in 30 * the program, then also delete it here. 31 * 32 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 33 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 34 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 35 * details. 36 * 37 * You should have received a copy of the GNU General Public License along 38 * with Slurm; if not, write to the Free Software Foundation, Inc., 39 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 40 \*****************************************************************************/ 41 42 #ifndef _JOB_SCHEDULER_H 43 #define _JOB_SCHEDULER_H 44 45 #include "src/slurmctld/slurmctld.h" 46 47 typedef struct job_queue_rec { 48 uint32_t array_task_id; /* Job array, task ID */ 49 uint32_t job_id; /* Job ID */ 50 job_record_t *job_ptr; /* Pointer to job record */ 51 part_record_t *part_ptr; /* Pointer to partition record. Each 52 * job may have multiple partitions. */ 53 uint32_t priority; /* Job priority in THIS partition */ 54 slurmctld_resv_t *resv_ptr; /* If job didn't ask for a reservation, 55 * this reservation is one it can run 56 * in without requesting */ 57 } job_queue_rec_t; 58 59 /* Use as return values for test_job_dependency. */ 60 enum { 61 NO_DEPEND = 0, 62 LOCAL_DEPEND, 63 FAIL_DEPEND, 64 REMOTE_DEPEND 65 }; 66 67 /* 68 * build_feature_list - Translate a job's feature string into a feature_list 69 * IN details->features 70 * OUT details->feature_list 71 * RET error code 72 */ 73 extern int build_feature_list(job_record_t *job_ptr); 74 75 /* 76 * Set up job_queue_rec->job_ptr to use a promiscous reservation if the 77 * job_queue_rec has resv_name filled in. 78 */ 79 extern void job_queue_rec_prom_resv(job_queue_rec_t *job_queue_rec); 80 81 /* 82 * build_job_queue - build (non-priority ordered) list of pending jobs 83 * IN clear_start - if set then clear the start_time for pending jobs 84 * IN backfill - true if running backfill scheduler, enforce min time limit 85 * RET the job queue 86 * NOTE: the caller must call list_destroy() on RET value to free memory 87 */ 88 extern List build_job_queue(bool clear_start, bool backfill); 89 90 /* Given a scheduled job, return a pointer to it batch_job_launch_msg_t data */ 91 extern batch_job_launch_msg_t *build_launch_job_msg(job_record_t *job_ptr, 92 uint16_t protocol_version); 93 94 /* Determine if job's deadline specification is still valid, kill job if not 95 * job_ptr IN - Job to test 96 * func IN - function named used for logging, "sched" or "backfill" 97 * RET - true of valid, false if invalid and job cancelled 98 */ 99 extern bool deadline_ok(job_record_t *job_ptr, char *func); 100 101 /* 102 * epilog_slurmctld - execute the prolog_slurmctld for a job that has just 103 * terminated. 104 * IN job_ptr - pointer to job that has been terminated 105 */ 106 extern void epilog_slurmctld(job_record_t *job_ptr); 107 108 /* 109 * Delete a record from a job's feature_list 110 */ 111 extern void feature_list_delete(void *x); 112 113 /* 114 * Return a pointer to the dependency in job_ptr's dependency list that 115 * matches dep_ptr, or NULL if none is found. 116 * 117 * A dependency "matches" when the job_id and depend_type are the same. 118 */ 119 extern depend_spec_t *find_dependency(job_record_t *job_ptr, 120 depend_spec_t *dep_ptr); 121 122 /* 123 * Update a job's state_reason, state_desc, and dependency string based on the 124 * states of its dependencies. 125 * 126 * This is called by list_for_each() and thus has 2 void* parameters: 127 * object is a pointer to job_record_t. 128 * arg is unused. 129 */ 130 extern int handle_job_dependency_updates(void *object, void *arg); 131 132 /* 133 * job_is_completing - Determine if jobs are in the process of completing. 134 * IN/OUT eff_cg_bitmap - optional bitmap of all relevent completing nodes, 135 * relevenace determined by filtering via CompleteWait 136 * if NULL, function will terminate at first completing 137 * job 138 * RET - True of any job is in the process of completing AND 139 * CompleteWait is configured non-zero 140 * NOTE: This function can reduce resource fragmentation, which is a 141 * critical issue on Elan interconnect based systems. 142 */ 143 extern bool job_is_completing(bitstr_t *eff_cg_bitmap); 144 145 /* Determine if a pending job will run using only the specified nodes 146 * (in job_desc_msg->req_nodes), build response message and return 147 * SLURM_SUCCESS on success. Otherwise return an error code. Caller 148 * must free response message */ 149 extern int job_start_data(job_desc_msg_t *job_desc_msg, 150 will_run_response_msg_t **resp); 151 152 /* 153 * launch_job - send an RPC to a slurmd to initiate a batch job 154 * IN job_ptr - pointer to job that will be initiated 155 */ 156 extern void launch_job(job_record_t *job_ptr); 157 158 /* 159 * make_batch_job_cred - add a job credential to the batch_job_launch_msg 160 * IN/OUT launch_msg_ptr - batch_job_launch_msg in which job_id, step_id, 161 * uid and nodes have already been set 162 * IN job_ptr - pointer to job record 163 * RET 0 or error code 164 */ 165 extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr, 166 job_record_t *job_ptr, 167 uint16_t protocol_version); 168 169 /* 170 * Determine which nodes must be rebooted for a job 171 * IN job_ptr - pointer to job that will be initiated 172 * RET bitmap of nodes requiring a reboot for NodeFeaturesPlugin or NULL if none 173 */ 174 extern bitstr_t *node_features_reboot(job_record_t *job_ptr); 175 176 /* 177 * Determine if node boot required for this job 178 * IN job_ptr - pointer to job that will be initiated 179 * IN node_bitmap - nodes to be allocated 180 * RET - true if reboot required 181 */ 182 extern bool node_features_reboot_test(job_record_t *job_ptr, 183 bitstr_t *node_bitmap); 184 185 /* Print a job's dependency information based upon job_ptr->depend_list */ 186 extern void print_job_dependency(job_record_t *job_ptr, const char *func); 187 188 /* Decrement a job's prolog_running counter and launch the job if zero */ 189 extern void prolog_running_decr(job_record_t *job_ptr); 190 191 /* 192 * prolog_slurmctld - execute the prolog_slurmctld for a job that has just 193 * been allocated resources. 194 * IN job_ptr - pointer to job that will be initiated 195 */ 196 extern void prolog_slurmctld(job_record_t *job_ptr); 197 198 /* 199 * reboot_job_nodes - Reboot the compute nodes allocated to a job. 200 * IN job_ptr - pointer to job that will be initiated 201 * RET SLURM_SUCCESS(0) or error code 202 */ 203 extern int reboot_job_nodes(job_record_t *job_ptr); 204 205 /* If a job can run in multiple partitions, make sure that the one 206 * actually used is first in the string. Needed for job state save/restore */ 207 extern void rebuild_job_part_list(job_record_t *job_ptr); 208 209 /* 210 * schedule - attempt to schedule all pending jobs 211 * pending jobs for each partition will be scheduled in priority 212 * order until a request fails 213 * IN job_limit - maximum number of jobs to test now, avoid testing the full 214 * queue on every job submit (0 means to use the system default, 215 * SchedulerParameters for default_queue_depth) 216 * RET count of jobs scheduled 217 * Note: If the scheduler has executed recently, rather than executing again 218 * right away, a thread will be spawned to execute later in an effort 219 * to reduce system overhead. 220 * Note: We re-build the queue every time. Jobs can not only be added 221 * or removed from the queue, but have their priority or partition 222 * changed with the update_job RPC. In general nodes will be in priority 223 * order (by submit time), so the sorting should be pretty fast. 224 */ 225 extern int schedule(uint32_t job_limit); 226 227 /* 228 * set_job_elig_time - set the eligible time for pending jobs once their 229 * dependencies are lifted (in job->details->begin_time) 230 */ 231 extern void set_job_elig_time(void); 232 233 /* 234 * sort_job_queue - sort job_queue in decending priority order 235 * IN/OUT job_queue - sorted job queue previously made by build_job_queue() 236 */ 237 extern void sort_job_queue(List job_queue); 238 239 /* Note this differs from the ListCmpF typedef since we want jobs sorted 240 * in order of decreasing priority */ 241 extern int sort_job_queue2(void *x, void *y); 242 243 /* 244 * Determine if a job's dependencies are met 245 * Inputs: job_ptr 246 * Outputs: was_changed (optional) - 247 * If it exists, set it to true if at least 1 dependency changed 248 * state, otherwise false. 249 * RET: NO_DEPEND = no dependencies 250 * LOCAL_DEPEND = dependencies remain 251 * FAIL_DEPEND = failure (job completion code not per dependency), 252 * delete the job 253 */ 254 extern int test_job_dependency(job_record_t *job_ptr, bool *was_changed); 255 256 /* 257 * Parse a job dependency string and use it to establish a "depend_spec" 258 * list of dependencies. We accept both old format (a single job ID) and 259 * new format (e.g. "afterok:123:124,after:128"). 260 * IN job_ptr - job record to have dependency and depend_list updated 261 * IN new_depend - new dependency description 262 * RET returns an error code from slurm_errno.h 263 */ 264 extern int update_job_dependency(job_record_t *job_ptr, char *new_depend); 265 266 /* 267 * new_depend_list is a dependency list that came from a sibling cluster. It 268 * has updates to the job dependencies on that cluster. Use those changes to 269 * update the dependency list of job_ptr. 270 * Return true if a dependency was updated, false if not. 271 */ 272 extern bool update_job_dependency_list(job_record_t *job_ptr, 273 List new_depend_list); 274 275 /* 276 * When an array job is rejected for some reason, the remaining array tasks will 277 * get skipped by both the main scheduler and the backfill scheduler (it's an 278 * optimization). Hence, their reasons should match the reason of the first job. 279 * This function sets those reasons. 280 * 281 * job_ptr (IN) The current job being evaluated, after it has gone 282 * through the scheduling loop. 283 * reject_array_job (IN) A pointer to the first job (array task) in the most 284 * recently rejected array job. If job_ptr belongs to the 285 * same array as reject_array_job, then set job_ptr's 286 * reason to match reject_array_job. 287 */ 288 extern void fill_array_reasons(struct job_record *job_ptr, 289 struct job_record *reject_arr_job); 290 291 292 /* Add a job_queue_rec_t to job_queue */ 293 extern void job_queue_append_internal(job_queue_req_t *job_queue_req); 294 295 #endif /* !_JOB_SCHEDULER_H */ 296