1 /*****************************************************************************\
2  *  job_scheduler.h - data structures and function definitions for scheduling
3  *	of pending jobs in priority order
4  *****************************************************************************
5  *  Copyright (C) 2002-2007 The Regents of the University of California.
6  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
7  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8  *  Written by Morris Jette <jette@llnl.gov>, et. al.
9  *  Derived from dsh written by Jim Garlick <garlick1@llnl.gov>
10  *  CODE-OCEC-09-009. All rights reserved.
11  *
12  *  This file is part of Slurm, a resource management program.
13  *  For details, see <https://slurm.schedmd.com/>.
14  *  Please also read the included file: DISCLAIMER.
15  *
16  *  Slurm is free software; you can redistribute it and/or modify it under
17  *  the terms of the GNU General Public License as published by the Free
18  *  Software Foundation; either version 2 of the License, or (at your option)
19  *  any later version.
20  *
21  *  In addition, as a special exception, the copyright holders give permission
22  *  to link the code of portions of this program with the OpenSSL library under
23  *  certain conditions as described in each individual source file, and
24  *  distribute linked combinations including the two. You must obey the GNU
25  *  General Public License in all respects for all of the code used other than
26  *  OpenSSL. If you modify file(s) with this exception, you may extend this
27  *  exception to your version of the file(s), but you are not obligated to do
28  *  so. If you do not wish to do so, delete this exception statement from your
29  *  version.  If you delete this exception statement from all source files in
30  *  the program, then also delete it here.
31  *
32  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
33  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
34  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
35  *  details.
36  *
37  *  You should have received a copy of the GNU General Public License along
38  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
39  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
40 \*****************************************************************************/
41 
42 #ifndef _JOB_SCHEDULER_H
43 #define _JOB_SCHEDULER_H
44 
45 #include "src/slurmctld/slurmctld.h"
46 
47 typedef struct job_queue_rec {	/* Record describing one job queue entry */
48 	uint32_t array_task_id;		/* Job array task ID */
49 	uint32_t job_id;		/* Job ID */
50 	job_record_t *job_ptr;		/* Pointer to job record */
51 	part_record_t *part_ptr;	/* Pointer to partition record. Each
52 					 * job may have multiple partitions. */
53 	uint32_t priority;		/* Job priority in THIS partition */
54 	slurmctld_resv_t *resv_ptr;     /* If the job didn't ask for a
55 					 * reservation, this is a reservation
56 					 * it can run in without requesting */
57 } job_queue_rec_t;
58 
59 /* Used as return values for test_job_dependency(). */
60 enum {
61 	NO_DEPEND = 0,	/* no dependencies */
62 	LOCAL_DEPEND,	/* dependencies remain */
63 	FAIL_DEPEND,	/* failure, delete the job */
64 	REMOTE_DEPEND	/* NOTE(review): meaning not documented in this header */
65 };
66 
67 /*
68  * build_feature_list - Translate a job's feature string into a feature_list
69  * IN  job_ptr - job to process, input is read from details->features
70  * OUT job_ptr - result is stored in details->feature_list
71  * RET error code
72  */
73 extern int build_feature_list(job_record_t *job_ptr);
74 
75 /*
76  * Set up job_queue_rec->job_ptr to use a promiscuous reservation if the
77  * job_queue_rec has resv_ptr set (NOTE(review): was "resv_name"; confirm).
78  */
79 extern void job_queue_rec_prom_resv(job_queue_rec_t *job_queue_rec);
80 
81 /*
82  * build_job_queue - build a list of pending jobs (not ordered by priority)
83  * IN clear_start - if set then clear the start_time for pending jobs
84  * IN backfill - true if running backfill scheduler, enforce min time limit
85  * RET the job queue
86  * NOTE: the caller must call list_destroy() on the RET value to free memory
87  */
88 extern List build_job_queue(bool clear_start, bool backfill);
89 
90 /* Given a scheduled job, return a pointer to its batch_job_launch_msg_t data */
91 extern batch_job_launch_msg_t *build_launch_job_msg(job_record_t *job_ptr,
92 						    uint16_t protocol_version);
93 
94 /* Determine if job's deadline specification is still valid, kill job if not
95  * job_ptr IN - Job to test
96  * func IN - function name used for logging, "sched" or "backfill"
97  * RET - true if valid, false if invalid and job cancelled
98  */
99 extern bool deadline_ok(job_record_t *job_ptr, char *func);
100 
101 /*
102  * epilog_slurmctld - execute the epilog_slurmctld for a job that has just
103  *	terminated.
104  * IN job_ptr - pointer to job that has been terminated
105  */
106 extern void epilog_slurmctld(job_record_t *job_ptr);
107 
108 /*
109  * feature_list_delete - Delete a record (x) from a job's feature_list
110  */
111 extern void feature_list_delete(void *x);
112 
113 /*
114  * find_dependency - search job_ptr's dependency list for an entry that
115  * matches dep_ptr.
116  * A dependency "matches" when the job_id and depend_type are the same.
117  * RET pointer to the matching dependency, or NULL if none is found
118  */
119 extern depend_spec_t *find_dependency(job_record_t *job_ptr,
120 				      depend_spec_t *dep_ptr);
121 
122 /*
123  * Update a job's state_reason, state_desc, and dependency string based on the
124  * states of its dependencies.
125  * Called by list_for_each(), hence the two void * parameters:
126  * IN object - pointer to job_record_t
127  * IN arg - unused
128  * RET NOTE(review): return value is not documented here; confirm
129  */
130 extern int handle_job_dependency_updates(void *object, void *arg);
131 
132 /*
133  * job_is_completing - Determine if jobs are in the process of completing.
134  * IN/OUT  eff_cg_bitmap - optional bitmap of all relevant completing nodes,
135  *                         relevance determined by filtering via CompleteWait
136  *                         if NULL, function will terminate at first completing
137  *                         job
138  * RET - True if any job is in the process of completing AND
139  *	 CompleteWait is configured non-zero
140  * NOTE: This function can reduce resource fragmentation, which is a
141  * critical issue on Elan interconnect based systems.
142  */
143 extern bool job_is_completing(bitstr_t *eff_cg_bitmap);
144 
145 /* Determine if a pending job will run using only the specified nodes
146  * (in job_desc_msg->req_nodes) and build a response message.
147  * RET SLURM_SUCCESS on success, otherwise an error code.
148  * NOTE: the caller must free the response message */
149 extern int job_start_data(job_desc_msg_t *job_desc_msg,
150 			  will_run_response_msg_t **resp);
151 
152 /*
153  * launch_job - send an RPC to a slurmd to initiate a batch job
154  * IN job_ptr - pointer to the batch job that will be initiated
155  */
156 extern void launch_job(job_record_t *job_ptr);
157 
158 /*
159  * make_batch_job_cred - add a job credential to the batch_job_launch_msg
160  * IN/OUT launch_msg_ptr - message with job_id, step_id, uid, nodes already set
161  * IN job_ptr - pointer to job record
162  * IN protocol_version - NOTE(review): undocumented here; confirm its meaning
163  * RET 0 or error code
164  */
165 extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr,
166 			       job_record_t *job_ptr,
167 			       uint16_t protocol_version);
168 
169 /*
170  * node_features_reboot - Determine which nodes must be rebooted for a job
171  * IN job_ptr - pointer to job that will be initiated
172  * RET bitmap of nodes requiring a reboot for NodeFeaturesPlugin or NULL if none
173  */
174 extern bitstr_t *node_features_reboot(job_record_t *job_ptr);
175 
176 /*
177  * node_features_reboot_test - Determine if a node reboot is required for a job
178  * IN job_ptr - pointer to job that will be initiated
179  * IN node_bitmap - nodes to be allocated
180  * RET - true if reboot required
181  */
182 extern bool node_features_reboot_test(job_record_t *job_ptr,
183 				      bitstr_t *node_bitmap);
184 
185 /* Print a job's dependencies (job_ptr->depend_list); func: presumably caller */
186 extern void print_job_dependency(job_record_t *job_ptr, const char *func);
187 
188 /* Decrement a job's prolog_running counter; launch the job if it is then zero */
189 extern void prolog_running_decr(job_record_t *job_ptr);
190 
191 /*
192  * prolog_slurmctld - execute the prolog_slurmctld for a job that has just
193  *	been allocated resources.
194  * IN job_ptr - pointer to the job that will be initiated
195  */
196 extern void prolog_slurmctld(job_record_t *job_ptr);
197 
198 /*
199  * reboot_job_nodes - Reboot the compute nodes allocated to a job.
200  * IN job_ptr - pointer to the job that will be initiated
201  * RET SLURM_SUCCESS(0) or error code
202  */
203 extern int reboot_job_nodes(job_record_t *job_ptr);
204 
205 /* rebuild_job_part_list - If a job can run in multiple partitions, make sure
206  * the one actually used is first in the string (for job state save/restore) */
207 extern void rebuild_job_part_list(job_record_t *job_ptr);
208 
209 /*
210  * schedule - attempt to schedule all pending jobs
211  *	pending jobs for each partition will be scheduled in priority
212  *	order until a request fails
213  * IN job_limit - maximum number of jobs to test now, avoid testing the full
214  *		  queue on every job submit (0 means to use the system default,
215  *		  SchedulerParameters for default_queue_depth)
216  * RET count of jobs scheduled
217  * Note: If the scheduler has executed recently, rather than executing again
218  *	right away, a thread will be spawned to execute later in an effort
219  *	to reduce system overhead.
220  * Note: We re-build the queue every time. Jobs can not only be added
221  *	or removed from the queue, but have their priority or partition
222  *	changed with the update_job RPC. In general jobs will be in priority
223  *	order (by submit time), so the sorting should be pretty fast.
224  */
225 extern int schedule(uint32_t job_limit);
226 
227 /*
228  * set_job_elig_time - set the eligible time for pending jobs once their
229  *	dependencies are lifted (stored in job->details->begin_time)
230  */
231 extern void set_job_elig_time(void);
232 
233 /*
234  * sort_job_queue - sort job_queue in descending priority order
235  * IN/OUT job_queue - sorted job queue previously made by build_job_queue()
236  */
237 extern void sort_job_queue(List job_queue);
238 
239 /* Job queue comparison function. Note this differs from the ListCmpF typedef
240  *	since we want jobs sorted in order of decreasing priority */
241 extern int sort_job_queue2(void *x, void *y);
242 
243 /*
244  * Determine if a job's dependencies are met
245  * Inputs: job_ptr
246  * Outputs: was_changed (optional) - if provided, set to true if at least
247  *          1 dependency changed state, otherwise false.
248  * RET: NO_DEPEND = no dependencies
249  *      LOCAL_DEPEND = dependencies remain
250  *      FAIL_DEPEND = failure (job completion code not per dependency),
251  *                    delete the job
252  *      REMOTE_DEPEND = presumably only remote dependencies remain (confirm)
253  */
254 extern int test_job_dependency(job_record_t *job_ptr, bool *was_changed);
255 
256 /*
257  * Parse a job dependency string and use it to establish a "depend_spec"
258  * list of dependencies. We accept both old format (a single job ID) and
259  * new format (e.g. "afterok:123:124,after:128").
260  * IN job_ptr - job record to have dependency and depend_list updated
261  * IN new_depend - new dependency description
262  * RET an error code from slurm_errno.h
263  */
264 extern int update_job_dependency(job_record_t *job_ptr, char *new_depend);
265 
266 /*
267  * Apply dependency updates that came from a sibling cluster to job_ptr.
268  * IN job_ptr - job whose dependency list is updated
269  * IN new_depend_list - list with that cluster's changes to the dependencies
270  * RET true if a dependency was updated, false if not
271  */
272 extern bool update_job_dependency_list(job_record_t *job_ptr,
273 				       List new_depend_list);
274 
275 /*
276  * When an array job is rejected for some reason, the remaining array tasks will
277  * get skipped by both the main scheduler and the backfill scheduler (it's an
278  * optimization). Hence, their reasons should match the reason of the first job.
279  * This function sets those reasons.
280  *
281  * job_ptr		(IN) The current job being evaluated, after it has gone
282  * 			through the scheduling loop.
283  * reject_arr_job	(IN) A pointer to the first job (array task) in the most
284  * 			recently rejected array job. If job_ptr belongs to the
285  * 			same array as reject_arr_job, then set job_ptr's
286  * 			reason to match reject_arr_job.
287  */
288 extern void fill_array_reasons(struct job_record *job_ptr,
289 			       struct job_record *reject_arr_job);
290 
291 
292 /* Add a job_queue_rec_t to job_queue, as requested by job_queue_req */
293 extern void job_queue_append_internal(job_queue_req_t *job_queue_req);
294 
295 #endif /* !_JOB_SCHEDULER_H */
296