1 /*****************************************************************************\
2  *  preempt.c - Job preemption plugin function setup.
3  *****************************************************************************
4  *  Copyright (C) 2009-2010 Lawrence Livermore National Security.
5  *  Portions Copyright (C) 2010 SchedMD <https://www.schedmd.com>.
6  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7  *  Written by Morris Jette <jette1@llnl.gov>
8  *  CODE-OCEC-09-009. All rights reserved.
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 
40 #include <pthread.h>
41 #include <signal.h>
42 
43 #include "preempt.h"
44 #include "src/common/log.h"
45 #include "src/common/plugrack.h"
46 #include "src/common/slurm_protocol_api.h"
47 #include "src/common/xmalloc.h"
48 #include "src/common/xstring.h"
49 #include "src/slurmctld/reservation.h"
50 #include "src/slurmctld/slurmctld.h"
51 #include "src/slurmctld/job_scheduler.h"
52 #include "src/slurmctld/acct_policy.h"
53 
/*
 * Ordering flag for the preemption candidate list. Set to true by
 * slurm_preempt_init() when the string returned by slurm_get_sched_params()
 * contains "preempt_youngest_first"; slurm_find_preemptable_jobs() then sorts
 * with _sort_by_youngest() instead of _sort_by_prio().
 */
static bool youngest_order = false;
55 
/*
 * Table of function pointers resolved from the loaded preempt plugin.
 * It is filled in by plugin_context_create() in slurm_preempt_init(), using
 * the symbol names in syms[] (same order as the members below).
 *
 * NOTE(review): the parameter names declared for preemptable are
 * (preemptor, preemptee), but _is_job_preempt_exempt_internal() passes
 * (preemptee, preemptor) while preempt_g_preemptable() passes
 * (preemptor, preemptee). Confirm the real order against the plugin's
 * preempt_p_preemptable() definition.
 */
typedef struct slurm_preempt_ops {
	/* Queue-record level check: may preemptor preempt preemptee? */
	bool		(*job_preempt_check)  (job_queue_rec_t *preemptor,
					       job_queue_rec_t *preemptee);
	/* Job-record level check of whether one job may preempt another */
	bool		(*preemptable) (job_record_t *preemptor,
					job_record_t *preemptee);
	/*
	 * Generic query; the type written through "data" depends on data_type
	 * (this file passes uint16_t for PREEMPT_DATA_MODE, uint32_t for
	 * PREEMPT_DATA_PRIO and PREEMPT_DATA_GRACE_TIME, bool for
	 * PREEMPT_DATA_ENABLED).
	 */
	int		(*get_data) (job_record_t *job_ptr,
				     slurm_preempt_data_type_t data_type,
				     void *data);
} slurm_preempt_ops_t;
65 
/*
 * Argument bundle handed to _add_preemptable_job() while walking job_list
 * in slurm_find_preemptable_jobs().
 */
typedef struct {
	job_record_t *preemptor;	/* pending job looking for resources */
	/*
	 * Jobs that may be preempted for preemptor. Starts out NULL and is
	 * created on the first match, so it stays NULL when nothing qualifies.
	 */
	List preemptee_job_list;
} preempt_candidates_t;
70 
/*
 * Plugin symbol names passed to plugin_context_create() together with &ops.
 * The entries must stay in the same order as the members of
 * slurm_preempt_ops_t above.
 */
static const char *syms[] = {
	"preempt_p_job_preempt_check",
	"preempt_p_preemptable",
	"preempt_p_get_data",
};
79 
/* Function table of the currently loaded preempt plugin */
static slurm_preempt_ops_t ops;
/* Plugin context; NULL until slurm_preempt_init() succeeds and after fini */
static plugin_context_t *g_context = NULL;
/* Held by slurm_preempt_init() while the plugin context is being created */
static pthread_mutex_t	    g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/* Set once init has completed; cleared again by slurm_preempt_fini() */
static bool init_run = false;
84 
_is_job_preempt_exempt_internal(void * x,void * key)85 static int _is_job_preempt_exempt_internal(void *x, void *key)
86 {
87 	job_record_t *preemptee_ptr = (job_record_t *)x;
88 	job_record_t *preemptor_ptr = (job_record_t *)key;
89 
90 	if (job_borrow_from_resv_check(preemptee_ptr, preemptor_ptr)) {
91 		/*
92 		 * This job is on borrowed time from the reservation!
93 		 * Automatic preemption.
94 		 */
95 	} else if (!(*(ops.preemptable))(preemptee_ptr, preemptor_ptr))
96 		return 1;
97 
98 	if (preemptor_ptr->details &&
99 	    (preemptor_ptr->details->expanding_jobid == preemptee_ptr->job_id))
100 		return 1;
101 
102 	if (acct_policy_is_job_preempt_exempt(preemptee_ptr))
103 		return 1;
104 
105 	return 0;
106 }
107 
/*
 * Determine whether a job (or any component of a hetjob) is exempt from
 * preemption by preemptor_ptr.
 *
 * IN preemptee_ptr - job being considered for preemption
 * IN preemptor_ptr - job that wants the resources
 * RET true if the job must not be preempted
 */
static bool _is_job_preempt_exempt(job_record_t *preemptee_ptr,
				  job_record_t *preemptor_ptr)
{
	bool exempt;

	xassert(preemptee_ptr);
	xassert(preemptor_ptr);

	if (preemptee_ptr->het_job_list) {
		/*
		 * Every component must be preemptable: one exempt component
		 * makes the whole hetjob exempt.
		 */
		exempt = (list_find_first(preemptee_ptr->het_job_list,
					  _is_job_preempt_exempt_internal,
					  preemptor_ptr) != NULL);
	} else {
		exempt = _is_job_preempt_exempt_internal(preemptee_ptr,
							 preemptor_ptr);
	}

	return exempt;
}
125 
126 /*
127  * Return the PreemptMode which should apply to stop this job
128  */
/*
 * Ask the plugin for the preempt mode of one job record.
 *
 * IN job_ptr - job to query
 * RET PREEMPT_MODE_OFF if the plugin could not be initialized, otherwise
 *     whatever get_data() left in the result (its return code does not
 *     change what is returned)
 */
static uint16_t _job_preempt_mode_internal(job_record_t *job_ptr)
{
	uint16_t mode = (uint16_t)PREEMPT_MODE_OFF;

	if (slurm_preempt_init() >= 0)
		(void)(*(ops.get_data))(job_ptr, PREEMPT_DATA_MODE, &mode);

	return mode;
}
140 
_find_job_by_preempt_mode(void * x,void * arg)141 static int _find_job_by_preempt_mode(void *x, void *arg)
142 {
143 	job_record_t *job_ptr = (job_record_t *)x;
144 	uint16_t preempt_mode = *(uint16_t *)arg;
145 
146 	if (_job_preempt_mode_internal(job_ptr) == preempt_mode)
147 		return 1;
148 
149 	return 0;
150 }
151 
_add_preemptable_job(void * x,void * arg)152 static int _add_preemptable_job(void *x, void *arg)
153 {
154 	job_record_t *candidate = (job_record_t *) x;
155 	preempt_candidates_t *candidates = (preempt_candidates_t *) arg;
156 	job_record_t *preemptor = candidates->preemptor;
157 
158 	/*
159 	 * We only want to look at the master component of a hetjob.  Since all
160 	 * components have to be preemptable it should be here at some point.
161 	 */
162 	if (candidate->het_job_id && !candidate->het_job_list)
163 		return 0;
164 
165 	/*
166 	 * We have to check the entire bitmap space here before we can check
167 	 * each part of a hetjob in _is_job_preempt_exempt()
168 	 */
169 	if (!job_overlap_and_running(preemptor->part_ptr->node_bitmap,
170 				     candidate))
171 		return 0;
172 
173 	if (_is_job_preempt_exempt(candidate, preemptor))
174 		return 0;
175 
176 	/* This job is a preemption candidate */
177 	if (!candidates->preemptee_job_list)
178 		candidates->preemptee_job_list = list_create(NULL);
179 
180 	list_append(candidates->preemptee_job_list, candidate);
181 
182 	return 0;
183 }
184 
_sort_by_prio(void * x,void * y)185 static int _sort_by_prio(void *x, void *y)
186 {
187 	int rc;
188 	uint32_t job_prio1, job_prio2;
189 	job_record_t *j1 = *(job_record_t **)x;
190 	job_record_t *j2 = *(job_record_t **)y;
191 
192 	(void)(*(ops.get_data))(j1, PREEMPT_DATA_PRIO, &job_prio1);
193 	(void)(*(ops.get_data))(j2, PREEMPT_DATA_PRIO, &job_prio2);
194 
195 	if (job_prio1 > job_prio2)
196 		rc = 1;
197 	else if (job_prio1 < job_prio2)
198 		rc = -1;
199 	else
200 		rc = 0;
201 
202 	return rc;
203 }
204 
_sort_by_youngest(void * x,void * y)205 static int _sort_by_youngest(void *x, void *y)
206 {
207 	int rc;
208 	job_record_t *j1 = *(job_record_t **) x;
209 	job_record_t *j2 = *(job_record_t **) y;
210 
211 	if (j1->start_time < j2->start_time)
212 		rc = 1;
213 	else if (j1->start_time > j2->start_time)
214 		rc = -1;
215 	else
216 		rc = 0;
217 
218 	return rc;
219 }
220 
slurm_preempt_init(void)221 extern int slurm_preempt_init(void)
222 {
223 	int retval = SLURM_SUCCESS;
224 	char *plugin_type = "preempt";
225 	char *type = NULL;
226 	char *sched_params;
227 
228 	/* This function is called frequently, so it should be as fast as
229 	 * possible. The test below will be true almost all of the time and
230 	 * is as fast as possible. */
231 	if (init_run && g_context)
232 		return retval;
233 
234 	slurm_mutex_lock(&g_context_lock);
235 
236 	if (g_context)
237 		goto done;
238 
239 	type = slurm_get_preempt_type();
240 	g_context = plugin_context_create(
241 		plugin_type, type, (void **)&ops, syms, sizeof(syms));
242 
243 	if (!g_context) {
244 		error("cannot create %s context for %s", plugin_type, type);
245 		retval = SLURM_ERROR;
246 		goto done;
247 	}
248 	init_run = true;
249 
250 	sched_params = slurm_get_sched_params();
251 	if (xstrcasestr(sched_params, "preempt_youngest_first"))
252 		youngest_order = true;
253 	xfree(sched_params);
254 
255 done:
256 	slurm_mutex_unlock(&g_context_lock);
257 	xfree(type);
258 	return retval;
259 }
260 
slurm_preempt_fini(void)261 extern int slurm_preempt_fini(void)
262 {
263 	int rc;
264 
265 	if (!g_context)
266 		return SLURM_SUCCESS;
267 
268 	init_run = false;
269 	rc = plugin_context_destroy(g_context);
270 	g_context = NULL;
271 	return rc;
272 }
273 
slurm_find_preemptable_jobs(job_record_t * job_ptr)274 extern List slurm_find_preemptable_jobs(job_record_t *job_ptr)
275 {
276 	preempt_candidates_t candidates	= { .preemptor = job_ptr };
277 
278 	/* Validate the preemptor job */
279 	if (!job_ptr) {
280 		error("%s: job_ptr is NULL", __func__);
281 		return NULL;
282 	}
283 	if (!IS_JOB_PENDING(job_ptr)) {
284 		error("%s: %pJ not pending", __func__, job_ptr);
285 		return NULL;
286 	}
287 	if (!job_ptr->part_ptr) {
288 		error("%s: %pJ has NULL partition ptr", __func__, job_ptr);
289 		return NULL;
290 	}
291 	if (!job_ptr->part_ptr->node_bitmap) {
292 		error("%s: partition %s node_bitmap=NULL",
293 		      __func__, job_ptr->part_ptr->name);
294 		return NULL;
295 	}
296 
297 	/* Build an array of pointers to preemption candidates */
298 	list_for_each(job_list, _add_preemptable_job, &candidates);
299 
300 	if (candidates.preemptee_job_list && youngest_order)
301 		list_sort(candidates.preemptee_job_list, _sort_by_youngest);
302 	else if (candidates.preemptee_job_list)
303 		list_sort(candidates.preemptee_job_list, _sort_by_prio);
304 
305 	return candidates.preemptee_job_list;
306 }
307 
308 /*
309  * Return the PreemptMode which should apply to stop this job
310  */
/*
 * For a hetjob (het_job_list set) whose job_preempt_comp is not yet chosen,
 * this also picks and stores the component that serves as the template for
 * the preempt mode.
 *
 * IN/OUT job_ptr - job to query; job_preempt_comp may be written
 * RET the preempt mode to apply
 */
extern uint16_t slurm_job_preempt_mode(job_record_t *job_ptr)
{
	/*
	 * Template search order, highest to lowest: SUSPEND, then REQUEUE.
	 * CANCEL is not listed because it is what is expected when neither
	 * of these is found.
	 */
	static const uint16_t preempt_modes[] = {
		PREEMPT_MODE_SUSPEND,
		PREEMPT_MODE_REQUEUE
	};
	static const int preempt_modes_cnt = sizeof(preempt_modes) /
		sizeof(preempt_modes[0]);

	/* Regular job, or hetjob whose template component is already known */
	if (!job_ptr->het_job_list || job_ptr->job_preempt_comp)
		return _job_preempt_mode_internal(
			job_ptr->job_preempt_comp ?
			job_ptr->job_preempt_comp : job_ptr);

	/* First component matching a mode in the hierarchy is the template */
	for (int i = 0; i < preempt_modes_cnt; i++) {
		uint16_t wanted = preempt_modes[i];

		job_ptr->job_preempt_comp = list_find_first(
			job_ptr->het_job_list,
			_find_job_by_preempt_mode,
			&wanted);
		if (job_ptr->job_preempt_comp)
			return wanted;
	}

	/* No template found: look up the mode directly (CANCEL expected) */
	return _job_preempt_mode_internal(job_ptr);
}
353 
354 /*
355  * Return true if any jobs can be preempted, otherwise false
356  */
slurm_preemption_enabled(void)357 extern bool slurm_preemption_enabled(void)
358 {
359 	bool data = false;
360 
361 	if ((slurm_preempt_init() < 0) ||
362 	    ((*(ops.get_data))(NULL, PREEMPT_DATA_ENABLED, &data) !=
363 	     SLURM_SUCCESS))
364 		return data;
365 
366 	return data;
367 }
368 
369 /*
370  * Return the grace time for job
371  */
/*
 * Query the plugin with PREEMPT_DATA_GRACE_TIME for one job.
 *
 * IN job_ptr - job to query
 * RET 0 if the plugin could not be initialized, otherwise whatever
 *     get_data() left in the result
 */
extern uint32_t slurm_job_get_grace_time(job_record_t *job_ptr)
{
	uint32_t grace = 0;

	if (slurm_preempt_init() >= 0)
		(void)(*(ops.get_data))(job_ptr, PREEMPT_DATA_GRACE_TIME,
					&grace);

	return grace;
}
383 
384 /*
385  * Check to see if a job is in a grace time.
386  * If no grace_time active then return 1.
387  * If grace_time is currently active then return -1.
388  */
_job_check_grace_internal(void * x,void * arg)389 static int _job_check_grace_internal(void *x, void *arg)
390 {
391 	job_record_t *job_ptr = (job_record_t *)x;
392 	job_record_t *preemptor_ptr = (job_record_t *)arg;
393 
394 	int rc = -1;
395 	uint32_t grace_time = 0;
396 
397 	if (job_ptr->preempt_time) {
398 		if (time(NULL) >= job_ptr->end_time)
399 			rc = 1;
400 		return rc;
401 	}
402 
403 	xassert(preemptor_ptr);
404 
405 	/*
406 	 * If this job is running in parts of a reservation
407 	 */
408 	if (job_borrow_from_resv_check(job_ptr, preemptor_ptr))
409 		grace_time = job_ptr->warn_time;
410 	else
411 		grace_time = slurm_job_get_grace_time(job_ptr);
412 
413 	job_ptr->preempt_time = time(NULL);
414 	job_ptr->end_time = MIN(job_ptr->end_time,
415 				(job_ptr->preempt_time + (time_t)grace_time));
416 	if (grace_time) {
417 		debug("setting %u sec preemption grace time for %pJ to reclaim resources for %pJ",
418 		      grace_time, job_ptr, preemptor_ptr);
419 		job_signal(job_ptr, SIGCONT, 0, 0, 0);
420 		if (preempt_send_user_signal && job_ptr->warn_signal &&
421 		    !(job_ptr->warn_flags & WARN_SENT))
422 			send_job_warn_signal(job_ptr, true);
423 		else
424 			job_signal(job_ptr, SIGTERM, 0, 0, 0);
425 	} else
426 		rc = 1;
427 
428 	return rc;
429 }
430 
431 /*
432  * Check to see if a job (or hetjob) is in a grace time.
433  * If no grace_time active then return 0.
434  * If grace_time is currently active then return 1.
435  */
/*
 * Run the grace-time check for a job, or for every component of a hetjob.
 *
 * IN job_ptr       - job (or hetjob carrying het_job_list) to preempt
 * IN preemptor_ptr - job that wants the resources
 * RET 1 if a grace period is active, 0 otherwise
 */
static int _job_check_grace(job_record_t *job_ptr, job_record_t *preemptor_ptr)
{
	int in_grace;

	if (job_ptr->het_job_list) {
		/*
		 * Visit every component without stopping early. A result
		 * <= 0 is treated as "grace period active" (presumably
		 * list_for_each_nobreak() reports a callback failure that
		 * way - confirm against its definition).
		 */
		int visited = list_for_each_nobreak(job_ptr->het_job_list,
						    _job_check_grace_internal,
						    preemptor_ptr);
		in_grace = (visited <= 0);
	} else {
		in_grace = (_job_check_grace_internal(job_ptr,
						      preemptor_ptr) < 0);
	}

	return in_grace;
}
445 
_job_warn_signal_wrapper(void * x,void * arg)446 static int _job_warn_signal_wrapper(void *x, void *arg)
447 {
448 	job_record_t *job_ptr = (job_record_t *)x;
449 	bool ignore_time = *(bool *)arg;
450 
451 	/* Ignore Time is always true */
452 	send_job_warn_signal(job_ptr, ignore_time);
453 
454 	return 0;
455 }
456 
/*
 * Send SIGKILL to a preempted job, using the hetjob variant when the job
 * carries a het_job_list.
 */
static int _kill_preemptee(job_record_t *job_ptr)
{
	if (job_ptr->het_job_list)
		return het_job_signal(job_ptr, SIGKILL, 0, 0, true);

	return job_signal(job_ptr, SIGKILL, 0, 0, true);
}

/*
 * Preempt job_ptr on behalf of preemptor_ptr.
 *
 * IN/OUT job_ptr   - job to preempt; preempt_time is set on success
 * IN preemptor_ptr - job that wants the resources
 * IN mode          - PREEMPT_MODE_CANCEL or PREEMPT_MODE_REQUEUE are acted
 *                    on directly; any other mode, or a failure of the
 *                    requested action, falls through to killing the job
 * IN ignore_time   - forwarded to send_job_warn_signal()
 * RET SLURM_SUCCESS if the job was killed or requeued, SLURM_ERROR while a
 *     grace period is active, otherwise the error from the final kill
 */
extern uint32_t slurm_job_preempt(job_record_t *job_ptr,
				  job_record_t *preemptor_ptr,
				  uint16_t mode, bool ignore_time)
{
	int rc = SLURM_ERROR;

	/* Nothing more to do while a grace period is running */
	if (_job_check_grace(job_ptr, preemptor_ptr))
		return SLURM_ERROR;

	if (preempt_send_user_signal) {
		if (!job_ptr->het_job_list)
			send_job_warn_signal(job_ptr, ignore_time);
		else
			(void)list_for_each(job_ptr->het_job_list,
					    _job_warn_signal_wrapper,
					    &ignore_time);
	}

	if (mode == PREEMPT_MODE_CANCEL) {
		rc = _kill_preemptee(job_ptr);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been killed to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
	} else if (mode == PREEMPT_MODE_REQUEUE) {
		/* job_requeue already handles het jobs */
		rc = job_requeue(0, job_ptr->job_id, NULL, true, 0);
		if (rc == SLURM_SUCCESS)
			info("preempted %pJ has been requeued to reclaim resources for %pJ",
			     job_ptr, preemptor_ptr);
	}

	/* Last resort: the requested action failed or was not handled above */
	if (rc != SLURM_SUCCESS) {
		rc = _kill_preemptee(job_ptr);
		if (rc == SLURM_SUCCESS)
			info("%s: preempted %pJ had to be killed",
			     __func__, job_ptr);
		else
			info("%s: preempted %pJ kill failure %s",
			     __func__, job_ptr, slurm_strerror(rc));
	}

	if (rc == SLURM_SUCCESS)
		job_ptr->preempt_time = time(NULL);

	return rc;
}
513 /*
514  * Return true if the preemptor can preempt the preemptee, otherwise false
515  */
/*
 * Forward to the plugin's job_preempt_check operation.
 *
 * IN preemptor - queue record of the job wanting resources
 * IN preemptee - queue record of the job that might be preempted
 * RET the plugin's answer, or false if the plugin could not be initialized
 */
extern bool preempt_g_job_preempt_check(job_queue_rec_t *preemptor,
					job_queue_rec_t *preemptee)
{
	if (slurm_preempt_init() >= 0)
		return (*(ops.job_preempt_check))(preemptor, preemptee);

	return false;
}
524 
/*
 * Forward to the plugin's preemptable operation.
 *
 * NOTE(review): the arguments are handed to the plugin as
 * (preemptor, preemptee), i.e. swapped relative to this function's own
 * parameter order and relative to the direct call in
 * _is_job_preempt_exempt_internal(), which passes (preemptee, preemptor).
 * Confirm which order the plugin expects.
 *
 * RET the plugin's answer, or false if the plugin could not be initialized
 */
extern bool preempt_g_preemptable(
	job_record_t *preemptee, job_record_t *preemptor)
{
	if (slurm_preempt_init() >= 0)
		return (*(ops.preemptable))(preemptor, preemptee);

	return false;
}
533 
/*
 * Forward to the plugin's get_data operation.
 *
 * IN job_ptr   - job the query refers to (other callers in this file pass
 *                NULL for PREEMPT_DATA_ENABLED)
 * IN data_type - which value to fetch
 * OUT data     - receives the value; its type depends on data_type
 * RET the plugin's return code, or SLURM_ERROR if the plugin could not be
 *     initialized
 */
extern int preempt_g_get_data(job_record_t *job_ptr,
			      slurm_preempt_data_type_t data_type,
			      void *data)
{
	if (slurm_preempt_init() >= 0)
		return (*(ops.get_data))(job_ptr, data_type, data);

	return SLURM_ERROR;
}
543