1 /*****************************************************************************\
2 * preempt.c - Job preemption plugin function setup.
3 *****************************************************************************
4 * Copyright (C) 2009-2010 Lawrence Livermore National Security.
5 * Portions Copyright (C) 2010 SchedMD <https://www.schedmd.com>.
6 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7 * Written by Morris Jette <jette1@llnl.gov>
8 * CODE-OCEC-09-009. All rights reserved.
9 *
10 * This file is part of Slurm, a resource management program.
11 * For details, see <https://slurm.schedmd.com/>.
12 * Please also read the included file: DISCLAIMER.
13 *
14 * Slurm is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 * In addition, as a special exception, the copyright holders give permission
20 * to link the code of portions of this program with the OpenSSL library under
21 * certain conditions as described in each individual source file, and
22 * distribute linked combinations including the two. You must obey the GNU
23 * General Public License in all respects for all of the code used other than
24 * OpenSSL. If you modify file(s) with this exception, you may extend this
25 * exception to your version of the file(s), but you are not obligated to do
26 * so. If you do not wish to do so, delete this exception statement from your
27 * version. If you delete this exception statement from all source files in
28 * the program, then also delete it here.
29 *
30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
33 * details.
34 *
35 * You should have received a copy of the GNU General Public License along
36 * with Slurm; if not, write to the Free Software Foundation, Inc.,
37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
38 \*****************************************************************************/
39
40 #include <pthread.h>
41 #include <signal.h>
42
43 #include "preempt.h"
44 #include "src/common/log.h"
45 #include "src/common/plugrack.h"
46 #include "src/common/slurm_protocol_api.h"
47 #include "src/common/xmalloc.h"
48 #include "src/common/xstring.h"
49 #include "src/slurmctld/reservation.h"
50 #include "src/slurmctld/slurmctld.h"
51 #include "src/slurmctld/job_scheduler.h"
52 #include "src/slurmctld/acct_policy.h"
53
/*
 * When true, slurm_find_preemptable_jobs() sorts its candidate list with
 * _sort_by_youngest() instead of _sort_by_prio(). Set by slurm_preempt_init()
 * when the string returned by slurm_get_sched_params() contains
 * "preempt_youngest_first".
 */
static bool youngest_order = false;
55
/*
 * Table of function pointers resolved from the preempt plugin. It is handed
 * to plugin_context_create() together with syms[], so the member order here
 * must match the symbol order in syms[].
 */
typedef struct slurm_preempt_ops {
	bool (*job_preempt_check) (job_queue_rec_t *preemptor,
				   job_queue_rec_t *preemptee);
	/*
	 * NOTE(review): the parameter names below read (preemptor, preemptee),
	 * but _is_job_preempt_exempt_internal() invokes this hook as
	 * (preemptee, preemptor). Confirm the real order against the plugin's
	 * preempt_p_preemptable() prototype.
	 */
	bool (*preemptable) (job_record_t *preemptor,
			     job_record_t *preemptee);
	int (*get_data) (job_record_t *job_ptr,
			 slurm_preempt_data_type_t data_type,
			 void *data);
} slurm_preempt_ops_t;
65
/*
 * Argument bundle handed to _add_preemptable_job() through list_for_each()
 * by slurm_find_preemptable_jobs().
 */
typedef struct {
	/* Job looking for resources to reclaim */
	job_record_t *preemptor;
	/* Candidates found so far; stays NULL until the first one is added */
	List preemptee_job_list;
} preempt_candidates_t;
70
/*
 * Plugin symbol names resolved by plugin_context_create() into "ops".
 * Must be synchronized with slurm_preempt_ops_t above: same entries, same
 * order.
 */
static const char *syms[] = {
	"preempt_p_job_preempt_check",
	"preempt_p_preemptable",
	"preempt_p_get_data",
};
79
/* Plugin entry points; filled in by slurm_preempt_init() */
static slurm_preempt_ops_t ops;
/* Plugin context; NULL until slurm_preempt_init() succeeds, reset by fini */
static plugin_context_t *g_context = NULL;
/* Held by slurm_preempt_init() while the plugin context is being created */
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/* Set once the context has been created; cleared by slurm_preempt_fini() */
static bool init_run = false;
84
_is_job_preempt_exempt_internal(void * x,void * key)85 static int _is_job_preempt_exempt_internal(void *x, void *key)
86 {
87 job_record_t *preemptee_ptr = (job_record_t *)x;
88 job_record_t *preemptor_ptr = (job_record_t *)key;
89
90 if (job_borrow_from_resv_check(preemptee_ptr, preemptor_ptr)) {
91 /*
92 * This job is on borrowed time from the reservation!
93 * Automatic preemption.
94 */
95 } else if (!(*(ops.preemptable))(preemptee_ptr, preemptor_ptr))
96 return 1;
97
98 if (preemptor_ptr->details &&
99 (preemptor_ptr->details->expanding_jobid == preemptee_ptr->job_id))
100 return 1;
101
102 if (acct_policy_is_job_preempt_exempt(preemptee_ptr))
103 return 1;
104
105 return 0;
106 }
107
/*
 * Determine whether a job, or any component of a hetjob, is exempt from
 * preemption by preemptor_ptr.
 *
 * IN preemptee_ptr - candidate job (hetjob components are taken from its
 *		      het_job_list when that list is set)
 * IN preemptor_ptr - job that wants the resources
 * RET true if the candidate must not be preempted
 */
static bool _is_job_preempt_exempt(job_record_t *preemptee_ptr,
				   job_record_t *preemptor_ptr)
{
	xassert(preemptee_ptr);
	xassert(preemptor_ptr);

	if (preemptee_ptr->het_job_list) {
		/*
		 * Every component must be preemptable: a single exempt
		 * component makes the whole hetjob exempt.
		 */
		if (list_find_first(preemptee_ptr->het_job_list,
				    _is_job_preempt_exempt_internal,
				    preemptor_ptr))
			return true;
		return false;
	}

	return _is_job_preempt_exempt_internal(preemptee_ptr, preemptor_ptr);
}
125
126 /*
127 * Return the PreemptMode which should apply to stop this job
128 */
_job_preempt_mode_internal(job_record_t * job_ptr)129 static uint16_t _job_preempt_mode_internal(job_record_t *job_ptr)
130 {
131 uint16_t data = (uint16_t)PREEMPT_MODE_OFF;
132
133 if ((slurm_preempt_init() < 0) ||
134 ((*(ops.get_data))(job_ptr, PREEMPT_DATA_MODE, &data) !=
135 SLURM_SUCCESS))
136 return data;
137
138 return data;
139 }
140
_find_job_by_preempt_mode(void * x,void * arg)141 static int _find_job_by_preempt_mode(void *x, void *arg)
142 {
143 job_record_t *job_ptr = (job_record_t *)x;
144 uint16_t preempt_mode = *(uint16_t *)arg;
145
146 if (_job_preempt_mode_internal(job_ptr) == preempt_mode)
147 return 1;
148
149 return 0;
150 }
151
_add_preemptable_job(void * x,void * arg)152 static int _add_preemptable_job(void *x, void *arg)
153 {
154 job_record_t *candidate = (job_record_t *) x;
155 preempt_candidates_t *candidates = (preempt_candidates_t *) arg;
156 job_record_t *preemptor = candidates->preemptor;
157
158 /*
159 * We only want to look at the master component of a hetjob. Since all
160 * components have to be preemptable it should be here at some point.
161 */
162 if (candidate->het_job_id && !candidate->het_job_list)
163 return 0;
164
165 /*
166 * We have to check the entire bitmap space here before we can check
167 * each part of a hetjob in _is_job_preempt_exempt()
168 */
169 if (!job_overlap_and_running(preemptor->part_ptr->node_bitmap,
170 candidate))
171 return 0;
172
173 if (_is_job_preempt_exempt(candidate, preemptor))
174 return 0;
175
176 /* This job is a preemption candidate */
177 if (!candidates->preemptee_job_list)
178 candidates->preemptee_job_list = list_create(NULL);
179
180 list_append(candidates->preemptee_job_list, candidate);
181
182 return 0;
183 }
184
_sort_by_prio(void * x,void * y)185 static int _sort_by_prio(void *x, void *y)
186 {
187 int rc;
188 uint32_t job_prio1, job_prio2;
189 job_record_t *j1 = *(job_record_t **)x;
190 job_record_t *j2 = *(job_record_t **)y;
191
192 (void)(*(ops.get_data))(j1, PREEMPT_DATA_PRIO, &job_prio1);
193 (void)(*(ops.get_data))(j2, PREEMPT_DATA_PRIO, &job_prio2);
194
195 if (job_prio1 > job_prio2)
196 rc = 1;
197 else if (job_prio1 < job_prio2)
198 rc = -1;
199 else
200 rc = 0;
201
202 return rc;
203 }
204
_sort_by_youngest(void * x,void * y)205 static int _sort_by_youngest(void *x, void *y)
206 {
207 int rc;
208 job_record_t *j1 = *(job_record_t **) x;
209 job_record_t *j2 = *(job_record_t **) y;
210
211 if (j1->start_time < j2->start_time)
212 rc = 1;
213 else if (j1->start_time > j2->start_time)
214 rc = -1;
215 else
216 rc = 0;
217
218 return rc;
219 }
220
slurm_preempt_init(void)221 extern int slurm_preempt_init(void)
222 {
223 int retval = SLURM_SUCCESS;
224 char *plugin_type = "preempt";
225 char *type = NULL;
226 char *sched_params;
227
228 /* This function is called frequently, so it should be as fast as
229 * possible. The test below will be true almost all of the time and
230 * is as fast as possible. */
231 if (init_run && g_context)
232 return retval;
233
234 slurm_mutex_lock(&g_context_lock);
235
236 if (g_context)
237 goto done;
238
239 type = slurm_get_preempt_type();
240 g_context = plugin_context_create(
241 plugin_type, type, (void **)&ops, syms, sizeof(syms));
242
243 if (!g_context) {
244 error("cannot create %s context for %s", plugin_type, type);
245 retval = SLURM_ERROR;
246 goto done;
247 }
248 init_run = true;
249
250 sched_params = slurm_get_sched_params();
251 if (xstrcasestr(sched_params, "preempt_youngest_first"))
252 youngest_order = true;
253 xfree(sched_params);
254
255 done:
256 slurm_mutex_unlock(&g_context_lock);
257 xfree(type);
258 return retval;
259 }
260
slurm_preempt_fini(void)261 extern int slurm_preempt_fini(void)
262 {
263 int rc;
264
265 if (!g_context)
266 return SLURM_SUCCESS;
267
268 init_run = false;
269 rc = plugin_context_destroy(g_context);
270 g_context = NULL;
271 return rc;
272 }
273
slurm_find_preemptable_jobs(job_record_t * job_ptr)274 extern List slurm_find_preemptable_jobs(job_record_t *job_ptr)
275 {
276 preempt_candidates_t candidates = { .preemptor = job_ptr };
277
278 /* Validate the preemptor job */
279 if (!job_ptr) {
280 error("%s: job_ptr is NULL", __func__);
281 return NULL;
282 }
283 if (!IS_JOB_PENDING(job_ptr)) {
284 error("%s: %pJ not pending", __func__, job_ptr);
285 return NULL;
286 }
287 if (!job_ptr->part_ptr) {
288 error("%s: %pJ has NULL partition ptr", __func__, job_ptr);
289 return NULL;
290 }
291 if (!job_ptr->part_ptr->node_bitmap) {
292 error("%s: partition %s node_bitmap=NULL",
293 __func__, job_ptr->part_ptr->name);
294 return NULL;
295 }
296
297 /* Build an array of pointers to preemption candidates */
298 list_for_each(job_list, _add_preemptable_job, &candidates);
299
300 if (candidates.preemptee_job_list && youngest_order)
301 list_sort(candidates.preemptee_job_list, _sort_by_youngest);
302 else if (candidates.preemptee_job_list)
303 list_sort(candidates.preemptee_job_list, _sort_by_prio);
304
305 return candidates.preemptee_job_list;
306 }
307
308 /*
309 * Return the PreemptMode which should apply to stop this job
310 */
slurm_job_preempt_mode(job_record_t * job_ptr)311 extern uint16_t slurm_job_preempt_mode(job_record_t *job_ptr)
312 {
313 uint16_t data;
314
315 if (job_ptr->het_job_list && !job_ptr->job_preempt_comp) {
316 /*
317 * Find the component job to use as the template for
318 * setting the preempt mode for all other components.
319 * The first component job found having a preempt mode
320 * in the hierarchy (ordered highest to lowest:
321 * SUSPEND->REQUEUE->CANCEL) will be used as
322 * the template.
323 *
324 * NOTE: CANCEL is not on the list below since it is handled
325 * as the default.
326 */
327 static const uint16_t preempt_modes[] = {
328 PREEMPT_MODE_SUSPEND,
329 PREEMPT_MODE_REQUEUE
330 };
331 static const int preempt_modes_cnt = sizeof(preempt_modes) /
332 sizeof(preempt_modes[0]);
333
334 for (int pm_index = 0; pm_index < preempt_modes_cnt;
335 pm_index++) {
336 data = preempt_modes[pm_index];
337 if ((job_ptr->job_preempt_comp = list_find_first(
338 job_ptr->het_job_list,
339 _find_job_by_preempt_mode,
340 &data)))
341 break;
342 }
343 /* if not found look up the mode (CANCEL expected) */
344 if (!job_ptr->job_preempt_comp)
345 data = _job_preempt_mode_internal(job_ptr);
346 } else
347 data = _job_preempt_mode_internal(job_ptr->job_preempt_comp ?
348 job_ptr->job_preempt_comp :
349 job_ptr);
350
351 return data;
352 }
353
354 /*
355 * Return true if any jobs can be preempted, otherwise false
356 */
slurm_preemption_enabled(void)357 extern bool slurm_preemption_enabled(void)
358 {
359 bool data = false;
360
361 if ((slurm_preempt_init() < 0) ||
362 ((*(ops.get_data))(NULL, PREEMPT_DATA_ENABLED, &data) !=
363 SLURM_SUCCESS))
364 return data;
365
366 return data;
367 }
368
369 /*
370 * Return the grace time for job
371 */
slurm_job_get_grace_time(job_record_t * job_ptr)372 extern uint32_t slurm_job_get_grace_time(job_record_t *job_ptr)
373 {
374 uint32_t data = 0;
375
376 if ((slurm_preempt_init() < 0) ||
377 ((*(ops.get_data))(job_ptr, PREEMPT_DATA_GRACE_TIME, &data) !=
378 SLURM_SUCCESS))
379 return data;
380
381 return data;
382 }
383
384 /*
385 * Check to see if a job is in a grace time.
386 * If no grace_time active then return 1.
387 * If grace_time is currently active then return -1.
388 */
_job_check_grace_internal(void * x,void * arg)389 static int _job_check_grace_internal(void *x, void *arg)
390 {
391 job_record_t *job_ptr = (job_record_t *)x;
392 job_record_t *preemptor_ptr = (job_record_t *)arg;
393
394 int rc = -1;
395 uint32_t grace_time = 0;
396
397 if (job_ptr->preempt_time) {
398 if (time(NULL) >= job_ptr->end_time)
399 rc = 1;
400 return rc;
401 }
402
403 xassert(preemptor_ptr);
404
405 /*
406 * If this job is running in parts of a reservation
407 */
408 if (job_borrow_from_resv_check(job_ptr, preemptor_ptr))
409 grace_time = job_ptr->warn_time;
410 else
411 grace_time = slurm_job_get_grace_time(job_ptr);
412
413 job_ptr->preempt_time = time(NULL);
414 job_ptr->end_time = MIN(job_ptr->end_time,
415 (job_ptr->preempt_time + (time_t)grace_time));
416 if (grace_time) {
417 debug("setting %u sec preemption grace time for %pJ to reclaim resources for %pJ",
418 grace_time, job_ptr, preemptor_ptr);
419 job_signal(job_ptr, SIGCONT, 0, 0, 0);
420 if (preempt_send_user_signal && job_ptr->warn_signal &&
421 !(job_ptr->warn_flags & WARN_SENT))
422 send_job_warn_signal(job_ptr, true);
423 else
424 job_signal(job_ptr, SIGTERM, 0, 0, 0);
425 } else
426 rc = 1;
427
428 return rc;
429 }
430
431 /*
432 * Check to see if a job (or hetjob) is in a grace time.
433 * If no grace_time active then return 0.
434 * If grace_time is currently active then return 1.
435 */
_job_check_grace(job_record_t * job_ptr,job_record_t * preemptor_ptr)436 static int _job_check_grace(job_record_t *job_ptr, job_record_t *preemptor_ptr)
437 {
438 if (job_ptr->het_job_list)
439 return list_for_each_nobreak(job_ptr->het_job_list,
440 _job_check_grace_internal,
441 preemptor_ptr) <= 0 ? 1 : 0;
442
443 return _job_check_grace_internal(job_ptr, preemptor_ptr) < 0 ? 1 : 0;
444 }
445
_job_warn_signal_wrapper(void * x,void * arg)446 static int _job_warn_signal_wrapper(void *x, void *arg)
447 {
448 job_record_t *job_ptr = (job_record_t *)x;
449 bool ignore_time = *(bool *)arg;
450
451 /* Ignore Time is always true */
452 send_job_warn_signal(job_ptr, ignore_time);
453
454 return 0;
455 }
456
/*
 * Send SIGKILL to a preempted job, using the hetjob variant when the job
 * carries a het_job_list.
 *
 * RET result of het_job_signal() or job_signal()
 */
static int _preempt_kill_job(job_record_t *job_ptr)
{
	if (job_ptr->het_job_list)
		return het_job_signal(job_ptr, SIGKILL, 0, 0, true);

	return job_signal(job_ptr, SIGKILL, 0, 0, true);
}

/*
 * Preempt a job using the given mode.
 *
 * CANCEL kills the job and REQUEUE requeues it. If the mode is anything
 * else, or the requested action fails, the job is killed as a fallback.
 * Nothing is done while the job is still in a grace period.
 *
 * IN/OUT job_ptr   - job to preempt; preempt_time is set on success
 * IN preemptor_ptr - job reclaiming the resources
 * IN mode          - PREEMPT_MODE_* action to apply
 * IN ignore_time   - forwarded to send_job_warn_signal()
 * RET SLURM_SUCCESS, SLURM_ERROR while a grace period is active, or the
 *     error returned by the final kill attempt
 */
extern uint32_t slurm_job_preempt(job_record_t *job_ptr,
				  job_record_t *preemptor_ptr,
				  uint16_t mode, bool ignore_time)
{
	int rc = SLURM_ERROR;

	/* If any job is in a grace period there is nothing to do yet */
	if (_job_check_grace(job_ptr, preemptor_ptr))
		return SLURM_ERROR;

	if (preempt_send_user_signal) {
		if (job_ptr->het_job_list)
			(void) list_for_each(job_ptr->het_job_list,
					     _job_warn_signal_wrapper,
					     &ignore_time);
		else
			send_job_warn_signal(job_ptr, ignore_time);
	}

	switch (mode) {
	case PREEMPT_MODE_CANCEL:
		rc = _preempt_kill_job(job_ptr);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been killed to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
		break;
	case PREEMPT_MODE_REQUEUE:
		/* job_requeue already handles het jobs */
		rc = job_requeue(0, job_ptr->job_id, NULL, true, 0);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been requeued to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
		break;
	default:
		/* rc stays SLURM_ERROR, so the fallback kill below runs */
		break;
	}

	/* Fallback: the requested action failed or the mode is not handled */
	if (rc != SLURM_SUCCESS) {
		rc = _preempt_kill_job(job_ptr);
		if (rc == SLURM_SUCCESS)
			info("%s: preempted %pJ had to be killed",
			     __func__, job_ptr);
		else
			info("%s: preempted %pJ kill failure %s",
			     __func__, job_ptr, slurm_strerror(rc));
	}

	if (rc == SLURM_SUCCESS)
		job_ptr->preempt_time = time(NULL);

	return rc;
}
513 /*
514 * Return true if the preemptor can preempt the preemptee, otherwise false
515 */
preempt_g_job_preempt_check(job_queue_rec_t * preemptor,job_queue_rec_t * preemptee)516 extern bool preempt_g_job_preempt_check(job_queue_rec_t *preemptor,
517 job_queue_rec_t *preemptee)
518 {
519 if (slurm_preempt_init() < 0)
520 return false;
521
522 return (*(ops.job_preempt_check))(preemptor, preemptee);
523 }
524
/*
 * Ask the plugin whether preemptee may be preempted by preemptor.
 *
 * The plugin hook is invoked as (preemptee, preemptor), the same argument
 * order _is_job_preempt_exempt_internal() uses for this hook.
 *
 * IN preemptee - job that would give up its resources
 * IN preemptor - job wanting the resources
 * RET the plugin's answer; false if the plugin could not be initialized
 */
extern bool preempt_g_preemptable(
	job_record_t *preemptee, job_record_t *preemptor)
{
	if (slurm_preempt_init() < 0)
		return false;

	return (*(ops.preemptable))(preemptee, preemptor);
}
533
/*
 * Forward a data query to the plugin.
 *
 * IN job_ptr   - job being queried
 * IN data_type - which item to fetch
 * OUT data     - location for the result; its type depends on data_type
 * RET the plugin's return code, or SLURM_ERROR if the plugin could not be
 *     initialized
 */
extern int preempt_g_get_data(job_record_t *job_ptr,
			      slurm_preempt_data_type_t data_type,
			      void *data)
{
	int rc = SLURM_ERROR;

	if (slurm_preempt_init() >= 0)
		rc = (*(ops.get_data))(job_ptr, data_type, data);

	return rc;
}
543