1 /*****************************************************************************\
2  *  job_mgr.c - manage the job information of slurm
3  *	Note: there is a global job list (job_list), time stamp
4  *	(last_job_update), and hash table (job_hash)
5  *****************************************************************************
6  *  Copyright (C) 2002-2007 The Regents of the University of California.
7  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
8  *  Portions Copyright (C) 2010-2017 SchedMD <https://www.schedmd.com>.
9  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
10  *  Written by Morris Jette <jette1@llnl.gov>
11  *  CODE-OCEC-09-009. All rights reserved.
12  *
13  *  This file is part of Slurm, a resource management program.
14  *  For details, see <https://slurm.schedmd.com/>.
15  *  Please also read the included file: DISCLAIMER.
16  *
17  *  Slurm is free software; you can redistribute it and/or modify it under
18  *  the terms of the GNU General Public License as published by the Free
19  *  Software Foundation; either version 2 of the License, or (at your option)
20  *  any later version.
21  *
22  *  In addition, as a special exception, the copyright holders give permission
23  *  to link the code of portions of this program with the OpenSSL library under
24  *  certain conditions as described in each individual source file, and
25  *  distribute linked combinations including the two. You must obey the GNU
26  *  General Public License in all respects for all of the code used other than
27  *  OpenSSL. If you modify file(s) with this exception, you may extend this
28  *  exception to your version of the file(s), but you are not obligated to do
29  *  so. If you do not wish to do so, delete this exception statement from your
30  *  version.  If you delete this exception statement from all source files in
31  *  the program, then also delete it here.
32  *
33  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
34  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
35  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
36  *  details.
37  *
38  *  You should have received a copy of the GNU General Public License along
39  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
40  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
41 \*****************************************************************************/
42 
43 #include "config.h"
44 #define _GNU_SOURCE
45 
46 #include <ctype.h>
47 #include <dirent.h>
48 #include <errno.h>
49 #include <fcntl.h>
50 #include <libgen.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <sys/stat.h>
56 #include <sys/types.h>
57 #include <sys/param.h>
58 #include <unistd.h>
59 
60 #include "slurm/slurm_errno.h"
61 
62 #include "src/common/slurm_acct_gather.h"
63 #include "src/common/assoc_mgr.h"
64 #include "src/common/bitstring.h"
65 #include "src/common/cpu_frequency.h"
66 #include "src/common/fd.h"
67 #include "src/common/forward.h"
68 #include "src/common/gres.h"
69 #include "src/common/hostlist.h"
70 #include "src/common/node_features.h"
71 #include "src/common/node_select.h"
72 #include "src/common/parse_time.h"
73 #include "src/common/power.h"
74 #include "src/common/slurm_accounting_storage.h"
75 #include "src/common/slurm_auth.h"
76 #include "src/common/slurm_jobcomp.h"
77 #include "src/common/slurm_mcs.h"
78 #include "src/common/slurm_priority.h"
79 #include "src/common/slurm_protocol_pack.h"
80 #include "src/common/switch.h"
81 #include "src/common/timers.h"
82 #include "src/common/track_script.h"
83 #include "src/common/tres_bind.h"
84 #include "src/common/tres_frequency.h"
85 #include "src/common/uid.h"
86 #include "src/common/xassert.h"
87 #include "src/common/xstring.h"
88 
89 #include "src/slurmctld/acct_policy.h"
90 #include "src/slurmctld/agent.h"
91 #include "src/slurmctld/burst_buffer.h"
92 #include "src/slurmctld/fed_mgr.h"
93 #include "src/slurmctld/front_end.h"
94 #include "src/slurmctld/gang.h"
95 #include "src/slurmctld/job_scheduler.h"
96 #include "src/slurmctld/job_submit.h"
97 #include "src/slurmctld/licenses.h"
98 #include "src/slurmctld/locks.h"
99 #include "src/slurmctld/node_scheduler.h"
100 #include "src/slurmctld/preempt.h"
101 #include "src/slurmctld/proc_req.h"
102 #include "src/slurmctld/reservation.h"
103 #include "src/slurmctld/sched_plugin.h"
104 #include "src/slurmctld/slurmctld.h"
105 #include "src/slurmctld/slurmctld_plugstack.h"
106 #include "src/slurmctld/srun_comm.h"
107 #include "src/slurmctld/state_save.h"
108 #include "src/slurmctld/trigger_mgr.h"
109 
110 #define ARRAY_ID_BUF_SIZE 32
111 #define DETAILS_FLAG 0xdddd
112 #define MAX_EXIT_VAL 255	/* Maximum value returned by WIFEXITED() */
113 #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
114 #define TOP_PRIORITY 0xffff0000	/* large, but leave headroom for higher */
115 #define PURGE_OLD_JOB_IN_SEC 2592000 /* 30 days in seconds */
116 
117 #define JOB_HASH_INX(_job_id)	(_job_id % hash_table_size)
118 #define JOB_ARRAY_HASH_INX(_job_id, _task_id) \
119 	((_job_id + _task_id) % hash_table_size)
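/*
 * Illustrative example (values assumed, not taken from any slurm.conf): with
 * hash_table_size = 1000, JOB_HASH_INX(123456) selects bucket 456, and
 * JOB_ARRAY_HASH_INX(123456, 7) selects bucket (123456 + 7) % 1000 = 463.
 */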
120 
121 /* No need to change; we always pack SLURM_PROTOCOL_VERSION */
122 #define JOB_STATE_VERSION     "PROTOCOL_VERSION"
123 #define JOB_CKPT_VERSION      "PROTOCOL_VERSION"
124 
125 typedef enum {
126 	JOB_HASH_JOB,
127 	JOB_HASH_ARRAY_JOB,
128 	JOB_HASH_ARRAY_TASK,
129 } job_hash_type_t;
130 
131 typedef struct {
132 	int resp_array_cnt;
133 	int resp_array_size;
134 	uint32_t *resp_array_rc;
135 	bitstr_t **resp_array_task_id;
136 } resp_array_struct_t;
137 
138 typedef struct {
139 	Buf       buffer;
140 	uint32_t  filter_uid;
141 	uint32_t *jobs_packed;
142 	uint16_t  protocol_version;
143 	uint16_t  show_flags;
144 	uid_t     uid;
145 } _foreach_pack_job_info_t;
146 
147 typedef struct {
148 	bitstr_t *node_map;
149 	int rc;
150 } job_overlap_args_t;
151 
152 /* Global variables */
153 List   job_list = NULL;		/* job_record list */
154 time_t last_job_update;		/* time of last update to job records */
155 
156 List purge_files_list = NULL;	/* job files to delete */
157 
158 /* Local variables */
159 static int      bf_min_age_reserve = 0;
160 static uint32_t delay_boot = 0;
161 static uint32_t highest_prio = 0;
162 static uint32_t lowest_prio  = TOP_PRIORITY;
163 static int      hash_table_size = 0;
164 static int      job_count = 0;		/* jobs in the system */
165 static uint32_t job_id_sequence = 0;	/* first job_id to assign new job */
166 static struct   job_record **job_hash = NULL;
167 static struct   job_record **job_array_hash_j = NULL;
168 static struct   job_record **job_array_hash_t = NULL;
169 static bool     kill_invalid_dep;
170 static time_t   last_file_write_time = (time_t) 0;
171 static uint32_t max_array_size = NO_VAL;
172 static bitstr_t *requeue_exit = NULL;
173 static bitstr_t *requeue_exit_hold = NULL;
174 static bool     validate_cfgd_licenses = true;
175 
176 /* Local functions */
177 static void _add_job_hash(job_record_t *job_ptr);
178 static void _add_job_array_hash(job_record_t *job_ptr);
179 static void _clear_job_gres_details(job_record_t *job_ptr);
180 static int  _copy_job_desc_to_file(job_desc_msg_t * job_desc,
181 				   uint32_t job_id);
182 static int  _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
183 					 job_record_t **job_ptr,
184 					 bitstr_t ** exc_bitmap,
185 					 bitstr_t ** req_bitmap);
186 static char *_copy_nodelist_no_dup(char *node_list);
187 static job_record_t *_create_job_record(uint32_t num_jobs);
188 static void _delete_job_details(job_record_t *job_entry);
189 static slurmdb_qos_rec_t *_determine_and_validate_qos(
190 	char *resv_name, slurmdb_assoc_rec_t *assoc_ptr,
191 	bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code,
192 	bool locked, log_level_t log_lvl);
193 static void _dump_job_details(struct job_details *detail_ptr, Buf buffer);
194 static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer);
195 static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr,
196 				  Buf buffer);
197 static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src);
198 static void _get_batch_job_dir_ids(List batch_dirs);
199 static bool _get_whole_hetjob(void);
200 static void _job_array_comp(job_record_t *job_ptr, bool was_running,
201 			    bool requeue);
202 static int  _job_create(job_desc_msg_t * job_specs, int allocate, int will_run,
203 			job_record_t **job_rec_ptr, uid_t submit_uid,
204 			char **err_msg, uint16_t protocol_version);
205 static void _job_timed_out(job_record_t *job_ptr, bool preempted);
206 static void _kill_dependent(job_record_t *job_ptr);
207 static void _list_delete_job(void *job_entry);
208 static int  _list_find_job_old(void *job_entry, void *key);
209 static int  _load_job_details(job_record_t *job_ptr, Buf buffer,
210 			      uint16_t protocol_version);
211 static int  _load_job_fed_details(job_fed_details_t **fed_details_pptr,
212 				  Buf buffer, uint16_t protocol_version);
213 static int  _load_job_state(Buf buffer,	uint16_t protocol_version);
214 static bitstr_t *_make_requeue_array(char *conf_buf);
215 static uint32_t _max_switch_wait(uint32_t input_wait);
216 static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx,
217 				      time_t now, time_t node_boot_time);
218 static Buf  _open_job_state_file(char **state_file);
219 static time_t _get_last_job_state_write_time(void);
220 static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer,
221 				      uint16_t protocol_version);
222 static void _pack_pending_job_details(struct job_details *detail_ptr,
223 				      Buf buffer,
224 				      uint16_t protocol_version);
225 static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max);
226 static void _purge_missing_jobs(int node_inx, time_t now);
227 static int  _read_data_array_from_file(int fd, char *file_name, char ***data,
228 				       uint32_t *size, job_record_t *job_ptr);
229 static void _remove_defunct_batch_dirs(List batch_dirs);
230 static void _remove_job_hash(job_record_t *job_ptr, job_hash_type_t type);
231 static int  _reset_detail_bitmaps(job_record_t *job_ptr);
232 static void _reset_step_bitmaps(job_record_t *job_ptr);
233 static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
234 			    uint32_t rc);
235 static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
236 			       uint32_t task_id, uint32_t rc);
237 static void _resp_array_free(resp_array_struct_t *resp);
238 static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
239 					       uint32_t job_id);
240 static int  _resume_job_nodes(job_record_t *job_ptr, bool indf_susp);
241 static void _send_job_kill(job_record_t *job_ptr);
242 static int  _set_job_id(job_record_t *job_ptr);
243 static void _set_job_requeue_exit_value(job_record_t *job_ptr);
244 static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
245 			      uint16_t flags);
246 static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags);
247 static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp);
248 static int  _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp);
249 static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset);
250 static int  _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
251 			    bitstr_t *req_bitmap, part_record_t *part_ptr,
252 			    List part_ptr_list,
253 			    slurmdb_assoc_rec_t *assoc_ptr,
254 			    slurmdb_qos_rec_t *qos_ptr);
255 static int  _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
256 			       uid_t submit_uid, part_record_t *part_ptr,
257 			       List part_list);
258 static void _validate_job_files(List batch_dirs);
259 static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
260 					part_record_t *part_ptr,
261 					List part_list);
262 static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg,
263 			      part_record_t *part_ptr);
264 static int  _write_data_to_file(char *file_name, char *data);
265 static int  _write_data_array_to_file(char *file_name, char **data,
266 				      uint32_t size);
267 static void _xmit_new_end_time(job_record_t *job_ptr);
268 
269 
270 static char *_get_mail_user(const char *user_name, uid_t user_id)
271 {
272 	char *mail_user = NULL;
273 	if (!user_name || (user_name[0] == '\0')) {
274 		mail_user = uid_to_string(user_id);
275 		/* unqualified sender, append MailDomain if set */
276 		if (slurmctld_conf.mail_domain) {
277 			xstrfmtcat(mail_user, "@%s",
278 				   slurmctld_conf.mail_domain);
279 		}
280 	} else {
281 		mail_user = xstrdup(user_name);
282 	}
283 
284 	return mail_user;
285 }
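/*
 * Example (hypothetical uid/user/domain values): with MailDomain=example.com
 * set in slurm.conf, _get_mail_user(NULL, 1001) would return
 * "alice@example.com" if uid 1001 maps to user "alice", while
 * _get_mail_user("bob@other.org", 1001) simply returns a copy of the
 * caller-supplied string.
 */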
286 
287 static int _job_fail_account(job_record_t *job_ptr, const char *func_name)
288 {
289 	int rc = 0; // Return number of pending jobs held
290 
291 	if (IS_JOB_PENDING(job_ptr)) {
292 		info("%s: %pJ ineligible due to invalid association",
293 		     func_name, job_ptr);
294 
295 		xfree(job_ptr->state_desc);
296 		job_ptr->state_reason = FAIL_ACCOUNT;
297 
298 		if (job_ptr->details) {
299 			/* reset the job */
300 			job_ptr->details->accrue_time = 0;
301 			job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
302 			job_ptr->details->begin_time = 0;
303 			/* Update job with new begin_time. */
304 			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
305 		}
306 		rc = 1;
307 	}
308 
309 	/* This job is no longer eligible, so make it so. */
310 	if (job_ptr->assoc_ptr) {
311 		part_record_t *tmp_part = job_ptr->part_ptr;
312 		List tmp_part_list = job_ptr->part_ptr_list;
313 		slurmdb_qos_rec_t *tmp_qos = job_ptr->qos_ptr;
314 
315 		/*
316 		 * Force a start so the association doesn't get lost, since
317 		 * there could be some delay in the start of the job when
318 		 * running with the slurmdbd.
319 		 */
320 		if (!job_ptr->db_index)
321 			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
322 
323 		/*
324 		 * Don't call acct_policy_remove_accrue_time() here, the cnt on
325 		 * parent associations will be handled correctly by the removal
326 		 * of the association.
327 		 */
328 
329 		/*
330 		 * Clear ptrs so that only association usage is removed.
331 		 * Otherwise qos and partition limits will be double accounted
332 		 * for when this job finishes. Don't do this for accrual time;
333 		 * it has to be on both because the job is ineligible and can't
334 		 * accrue time.
335 		 */
336 		job_ptr->part_ptr = NULL;
337 		job_ptr->part_ptr_list = NULL;
338 		job_ptr->qos_ptr = NULL;
339 
340 		acct_policy_remove_job_submit(job_ptr);
341 
342 		job_ptr->part_ptr = tmp_part;
343 		job_ptr->part_ptr_list = tmp_part_list;
344 		job_ptr->qos_ptr = tmp_qos;
345 
346 		job_ptr->assoc_ptr = NULL;
347 	}
348 
349 	job_ptr->assoc_id = 0;
350 
351 	return rc;
352 }
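/*
 * Example (illustrative): a pending job whose account/association was removed
 * from the accounting database is held here with state reason FAIL_ACCOUNT
 * (reported by squeue as an invalid-account reason), and its accrue/begin
 * times are reset until a valid association is restored.
 */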
353 
354 extern int job_fail_qos(job_record_t *job_ptr, const char *func_name)
355 {
356 	int rc = 0; // Return number of pending jobs held
357 
358 	if (IS_JOB_PENDING(job_ptr)) {
359 		info("%s: %pJ ineligible due to invalid qos",
360 		     func_name, job_ptr);
361 
362 		xfree(job_ptr->state_desc);
363 		job_ptr->state_reason = FAIL_QOS;
364 
365 		if (job_ptr->details) {
366 			/* reset the job */
367 			job_ptr->details->accrue_time = 0;
368 			job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
369 			job_ptr->details->begin_time = 0;
370 			/* Update job with new begin_time. */
371 			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
372 		}
373 		rc = 1;
374 	}
375 
376 	/* This job is no longer eligible, so make it so. */
377 	if (job_ptr->qos_ptr) {
378 		slurmdb_assoc_rec_t *tmp_assoc = job_ptr->assoc_ptr;
379 
380 		/*
381 		 * Force a start so the qos doesn't get lost, since
382 		 * there could be some delay in the start of the job when
383 		 * running with the slurmdbd.
384 		 */
385 		if (!job_ptr->db_index)
386 			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
387 
388 		/*
389 		 * Don't call acct_policy_remove_accrue_time() here, the cnt on
390 		 * parent associations will be handled correctly by the removal
391 		 * of the association.
392 		 */
393 
394 		/*
395 		 * Clear ptrs so that only qos usage is removed. Otherwise
396 		 * association limits will be double accounted for when this
397 		 * job finishes. Don't do this for accrual time; it has to be
398 		 * on both because the job is ineligible and can't accrue time.
399 		 */
400 		job_ptr->assoc_ptr = NULL;
401 
402 		acct_policy_remove_job_submit(job_ptr);
403 
404 		job_ptr->assoc_ptr = tmp_assoc;
405 
406 		job_ptr->qos_ptr = NULL;
407 	}
408 
409 	return rc;
410 }
411 
412 /*
413  * Functions used to manage job array responses with a separate return code
414  * possible for each task ID
415  */
416 /* Add job record to resp_array_struct_t, free with _resp_array_free() */
417 static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
418 			    uint32_t rc)
419 {
420 	resp_array_struct_t *loc_resp;
421 	int array_size;
422 	int i;
423 
424 	if ((job_ptr->array_task_id == NO_VAL) &&
425 	    (job_ptr->array_recs == NULL)) {
426 		error("%s: called for non-job array %pJ",
427 		      __func__, job_ptr);
428 		return;
429 	}
430 
431 	if (max_array_size == NO_VAL) {
432 		max_array_size = slurmctld_conf.max_array_sz;
433 	}
434 
435 	xassert(resp);
436 	if (*resp == NULL) {
437 		/* Initialize the data structure */
438 		loc_resp = xmalloc(sizeof(resp_array_struct_t));
439 		loc_resp->resp_array_cnt  = 0;
440 		loc_resp->resp_array_size = 10;
441 		xrealloc(loc_resp->resp_array_rc,
442 			 (sizeof(uint32_t) * loc_resp->resp_array_size));
443 		xrealloc(loc_resp->resp_array_task_id,
444 			 (sizeof(bitstr_t *) * loc_resp->resp_array_size));
445 		*resp = loc_resp;
446 	} else {
447 		loc_resp = *resp;
448 	}
449 
450 	for (i = 0; i < loc_resp->resp_array_cnt; i++) {
451 		if (loc_resp->resp_array_rc[i] != rc)
452 			continue;
453 		/* Add to existing error code record */
454 		if (job_ptr->array_task_id != NO_VAL) {
455 			if (job_ptr->array_task_id <
456 			    bit_size(loc_resp->resp_array_task_id[i])) {
457 				bit_set(loc_resp->resp_array_task_id[i],
458 					job_ptr->array_task_id);
459 			} else {
460 				error("%s: found invalid task id %pJ",
461 				      __func__, job_ptr);
462 			}
463 		} else if (job_ptr->array_recs &&
464 			   job_ptr->array_recs->task_id_bitmap) {
465 			array_size = bit_size(job_ptr->array_recs->
466 					      task_id_bitmap);
467 			if (bit_size(loc_resp->resp_array_task_id[i]) !=
468 			    array_size) {
469 				loc_resp->resp_array_task_id[i] = bit_realloc(
470 					loc_resp->resp_array_task_id[i],
471 					array_size);
472 			}
473 			bit_or(loc_resp->resp_array_task_id[i],
474 			       job_ptr->array_recs->task_id_bitmap);
475 		} else {
476 			error("%s: found job %pJ without task ID or bitmap",
477 			      __func__, job_ptr);
478 		}
479 		return;
480 	}
481 
482 	/* Need to add a new record for this error code */
483 	if (loc_resp->resp_array_cnt >= loc_resp->resp_array_size) {
484 		/* Need to grow the table size */
485 		loc_resp->resp_array_size += 10;
486 		xrealloc(loc_resp->resp_array_rc,
487 			 (sizeof(uint32_t) * loc_resp->resp_array_size));
488 		xrealloc(loc_resp->resp_array_task_id,
489 			 (sizeof(bitstr_t *) * loc_resp->resp_array_size));
490 	}
491 
492 	loc_resp->resp_array_rc[loc_resp->resp_array_cnt] = rc;
493 	if (job_ptr->array_task_id != NO_VAL) {
494 		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
495 				bit_alloc(max_array_size);
496 		if (job_ptr->array_task_id <
497 		    bit_size(loc_resp->resp_array_task_id
498 			     [loc_resp->resp_array_cnt])) {
499 			bit_set(loc_resp->resp_array_task_id
500 				[loc_resp->resp_array_cnt],
501 				job_ptr->array_task_id);
502 		}
503 	} else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
504 		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
505 			bit_copy(job_ptr->array_recs->task_id_bitmap);
506 	} else {
507 		error("%s: found %pJ without task ID or bitmap",
508 		      __func__, job_ptr);
509 		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
510 				bit_alloc(max_array_size);
511 	}
512 	loc_resp->resp_array_cnt++;
513 	return;
514 }
515 /* Add record to resp_array_struct_t, free with _resp_array_free().
516  * This is a variant of _resp_array_add for the case where a job/task ID
517  * is not found, so we use a dummy job record based upon the input IDs. */
518 static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
519 			       uint32_t task_id, uint32_t rc)
520 {
521 	job_record_t job_ptr;
522 
523 	job_ptr.job_id = job_id;
524 	job_ptr.array_job_id = job_id;
525 	job_ptr.array_task_id = task_id;
526 	job_ptr.array_recs = NULL;
527 	_resp_array_add(resp, &job_ptr, rc);
528 }
529 
530 /* Free resp_array_struct_t built by _resp_array_add() */
531 static void _resp_array_free(resp_array_struct_t *resp)
532 {
533 	int i;
534 
535 	if (resp) {
536 		for (i = 0; i < resp->resp_array_cnt; i++)
537 			FREE_NULL_BITMAP(resp->resp_array_task_id[i]);
538 		xfree(resp->resp_array_task_id);
539 		xfree(resp->resp_array_rc);
540 		xfree(resp);
541 	}
542 }
543 
544 /* Translate internal job array data structure into a response message */
545 static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
546 					       uint32_t job_id)
547 {
548 	job_array_resp_msg_t *msg;
549 	char task_str[ARRAY_ID_BUF_SIZE];
550 	int *ffs = NULL;
551 	int i, j, low;
552 
553 	ffs = xcalloc(resp->resp_array_cnt, sizeof(int));
554 	for (i = 0; i < resp->resp_array_cnt; i++) {
555 		ffs[i] = bit_ffs(resp->resp_array_task_id[i]);
556 	}
557 
558 	msg = xmalloc(sizeof(job_array_resp_msg_t));
559 	msg->job_array_count = resp->resp_array_cnt;
560 	msg->job_array_id = xcalloc(resp->resp_array_cnt, sizeof(char *));
561 	msg->error_code = xcalloc(resp->resp_array_cnt, sizeof(uint32_t));
562 	for (i = 0; i < resp->resp_array_cnt; i++) {
563 		low = -1;
564 		for (j = 0; j < resp->resp_array_cnt; j++) {
565 			if ((ffs[j] != -1) &&
566 			    ((low == -1) || (ffs[j] < ffs[low])))
567 				low = j;
568 		}
569 		if (low == -1)
570 			break;
571 		ffs[low] = -1;
572 
573 		msg->error_code[i] = resp->resp_array_rc[low];
574 		bit_fmt(task_str, ARRAY_ID_BUF_SIZE,
575 			resp->resp_array_task_id[low]);
576 		if (strlen(task_str) >= ARRAY_ID_BUF_SIZE - 2) {
577 			/* Append "..." to the buffer on overflow */
578 			task_str[ARRAY_ID_BUF_SIZE - 4] = '.';
579 			task_str[ARRAY_ID_BUF_SIZE - 3] = '.';
580 			task_str[ARRAY_ID_BUF_SIZE - 2] = '.';
581 			task_str[ARRAY_ID_BUF_SIZE - 1] = '\0';
582 		}
583 		xstrfmtcat(msg->job_array_id[i], "%u_%s", job_id, task_str);
584 	}
585 
586 	xfree(ffs);
587 	return msg;
588 }
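/*
 * Example (illustrative): if tasks 1-3 of array job 1234 produced rc 0 and
 * task 7 produced a different return code, the translated message would hold
 * two entries, e.g. job_array_id[0] = "1234_1-3" with error_code[0] = 0 and
 * job_array_id[1] = "1234_7" with the other code (task_str uses bit_fmt()
 * range notation).
 */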
589 
590 /*
591  * _create_job_record - create an empty job_record including job_details.
592  *	load its values with defaults (zeros, nulls, and magic cookie)
593  * IN num_jobs - number of jobs this record should represent
594  *    = 0 - split out a job array record to its own job record
595  *    = 1 - simple job OR job array with one task
596  *    > 1 - job array create with the task count as num_jobs
597  * RET pointer to the record or NULL if error
598  * NOTE: allocates memory that should be xfreed with _list_delete_job
599  */
600 static job_record_t *_create_job_record(uint32_t num_jobs)
601 {
602 	job_record_t *job_ptr = xmalloc(sizeof(*job_ptr));
603 	struct job_details *detail_ptr = xmalloc(sizeof(*detail_ptr));
604 
605 	if ((job_count + num_jobs) >= slurmctld_conf.max_job_cnt) {
606 		error("%s: MaxJobCount limit from slurm.conf reached (%u)",
607 		      __func__, slurmctld_conf.max_job_cnt);
608 	}
609 
610 	job_count += num_jobs;
611 	last_job_update = time(NULL);
612 
613 	job_ptr->magic = JOB_MAGIC;
614 	job_ptr->array_task_id = NO_VAL;
615 	job_ptr->details = detail_ptr;
616 	job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t));
617 	job_ptr->site_factor = NICE_OFFSET;
618 	job_ptr->step_list = list_create(NULL);
619 
620 	xassert (detail_ptr->magic = DETAILS_MAGIC); /* set value */
621 	detail_ptr->submit_time = time(NULL);
622 	job_ptr->requid = -1; /* force to -1 for sacct to know this
623 			       * hasn't been set yet  */
624 	job_ptr->billable_tres = (double)NO_VAL;
625 	(void) list_append(job_list, job_ptr);
626 
627 	return job_ptr;
628 }
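/*
 * Usage sketch (assumed call sites): a plain submission creates one record
 * with _create_job_record(1); submitting a 100-task job array uses
 * _create_job_record(100) so the MaxJobCount check reflects every task;
 * splitting a single task out of an existing array record uses
 * _create_job_record(0) because those tasks were already counted.
 */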
629 
630 
631 /*
632  * _delete_job_details - delete a job's detail record and clear its pointer
633  * IN job_entry - pointer to job_record to clear the record of
634  */
635 static void _delete_job_details(job_record_t *job_entry)
636 {
637 	int i;
638 
639 	if (job_entry->details == NULL)
640 		return;
641 
642 	xassert (job_entry->details->magic == DETAILS_MAGIC);
643 
644 	/*
645 	 * Queue up job to have the batch script and environment deleted.
646 	 * This is handled by a separate thread to limit the amount of
647 	 * time purge_old_job needs to spend holding locks.
648 	 */
649 	if (IS_JOB_FINISHED(job_entry)) {
650 		uint32_t *job_id = xmalloc(sizeof(uint32_t));
651 		*job_id = job_entry->job_id;
652 		list_enqueue(purge_files_list, job_id);
653 	}
654 
655 	xfree(job_entry->details->acctg_freq);
656 	for (i=0; i<job_entry->details->argc; i++)
657 		xfree(job_entry->details->argv[i]);
658 	xfree(job_entry->details->argv);
659 	xfree(job_entry->details->cpu_bind);
660 	FREE_NULL_LIST(job_entry->details->depend_list);
661 	xfree(job_entry->details->dependency);
662 	xfree(job_entry->details->orig_dependency);
663 	for (i=0; i<job_entry->details->env_cnt; i++)
664 		xfree(job_entry->details->env_sup[i]);
665 	xfree(job_entry->details->env_sup);
666 	xfree(job_entry->details->std_err);
667 	FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap);
668 	xfree(job_entry->details->exc_nodes);
669 	xfree(job_entry->details->extra);
670 	FREE_NULL_LIST(job_entry->details->feature_list);
671 	xfree(job_entry->details->features);
672 	xfree(job_entry->details->cluster_features);
673 	xfree(job_entry->details->std_in);
674 	xfree(job_entry->details->mc_ptr);
675 	xfree(job_entry->details->mem_bind);
676 	xfree(job_entry->details->std_out);
677 	FREE_NULL_BITMAP(job_entry->details->req_node_bitmap);
678 	xfree(job_entry->details->req_nodes);
679 	xfree(job_entry->details->work_dir);
680 	xfree(job_entry->details->x11_magic_cookie);
681 	xfree(job_entry->details->x11_target);
682 	xfree(job_entry->details);	/* Must be last */
683 }
684 
685 /*
686  * delete_job_desc_files - delete job descriptor related files
687  *
688  * Note that this will be called on all individual job array tasks,
689  * even though (as of 17.11) individual directories are no longer created.
690  */
691 extern void delete_job_desc_files(uint32_t job_id)
692 {
693 	char *dir_name = NULL, *file_name = NULL;
694 	struct stat sbuf;
695 	int hash = job_id % 10;
696 	DIR *f_dir;
697 	struct dirent *dir_ent;
698 
699 	dir_name = xstrdup_printf("%s/hash.%d/job.%u",
700 				  slurmctld_conf.state_save_location,
701 				  hash, job_id);
702 	if (stat(dir_name, &sbuf)) {
703 		xfree(dir_name);
704 		return;
705 	}
706 
707 	f_dir = opendir(dir_name);
708 	if (f_dir) {
709 		while ((dir_ent = readdir(f_dir))) {
710 			if (!xstrcmp(dir_ent->d_name, ".") ||
711 			    !xstrcmp(dir_ent->d_name, ".."))
712 				continue;
713 			xstrfmtcat(file_name, "%s/%s", dir_name,
714 				   dir_ent->d_name);
715 			(void) unlink(file_name);
716 			xfree(file_name);
717 		}
718 		closedir(f_dir);
719 	} else {
720 		error("opendir(%s): %m", dir_name);
721 	}
722 
723 	(void) rmdir(dir_name);
724 	xfree(dir_name);
725 }
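/*
 * Example (StateSaveLocation value assumed): with
 * StateSaveLocation=/var/spool/slurmctld, job 1234 hashes to bucket
 * 1234 % 10 = 4, so the script/environment files removed here would live
 * under /var/spool/slurmctld/hash.4/job.1234.
 */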
726 
727 static uint32_t _max_switch_wait(uint32_t input_wait)
728 {
729 	static time_t sched_update = 0;
730 	static uint32_t max_wait = 300;	/* default max_switch_wait, seconds */
731 	char *sched_params, *tmp_ptr;
732 	int i;
733 
734 	if (sched_update != slurmctld_conf.last_update) {
735 		sched_update = slurmctld_conf.last_update;
736 		sched_params = slurm_get_sched_params();
737 		if ((tmp_ptr = xstrcasestr(sched_params, "max_switch_wait="))) {
738 		/*                                        0123456789012345 */
739 			i = atoi(tmp_ptr + 16);
740 			if (i < 0) {
741 				error("ignoring SchedulerParameters: "
742 				      "max_switch_wait of %d", i);
743 			} else {
744 				max_wait = i;
745 			}
746 		}
747 		xfree(sched_params);
748 	}
749 
750 	if (max_wait > input_wait)
751 		return input_wait;
752 	return max_wait;
753 }
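/*
 * Example (illustrative configuration): with
 *   SchedulerParameters=max_switch_wait=600
 * in slurm.conf, a job that requested a longer switch wait (e.g. via the
 * sbatch/salloc --switches=<count>@<max-time> option) has its wait capped
 * at 600 seconds by this function.
 */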
754 
755 static slurmdb_qos_rec_t *_determine_and_validate_qos(
756 	char *resv_name, slurmdb_assoc_rec_t *assoc_ptr,
757 	bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code,
758 	bool locked, log_level_t log_lvl)
759 {
760 	slurmdb_qos_rec_t *qos_ptr = NULL;
761 
762 	/* If enforcing associations, make sure this is a valid qos
763 	   with the association.  If not, just fill in the qos and
764 	   continue. */
765 
766 	xassert(qos_rec);
767 
768 	assoc_mgr_get_default_qos_info(assoc_ptr, qos_rec);
769 	if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce,
770 				  &qos_ptr, locked) != SLURM_SUCCESS) {
771 		log_var(log_lvl, "Invalid qos (%s)", qos_rec->name);
772 		*error_code = ESLURM_INVALID_QOS;
773 		return NULL;
774 	}
775 
776 	if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
777 	    && assoc_ptr
778 	    && !operator
779 	    && (!assoc_ptr->usage->valid_qos
780 		|| !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) {
781 		log_var(log_lvl, "This association %d(account='%s', user='%s', partition='%s') does not have access to qos %s",
782 		        assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
783 		        assoc_ptr->partition, qos_rec->name);
784 		*error_code = ESLURM_INVALID_QOS;
785 		return NULL;
786 	}
787 
788 	if (qos_ptr && (qos_ptr->flags & QOS_FLAG_REQ_RESV)
789 	    && (!resv_name || resv_name[0] == '\0')) {
790 		log_var(log_lvl, "qos %s can only be used in a reservation",
791 		        qos_rec->name);
792 		*error_code = ESLURM_INVALID_QOS;
793 		return NULL;
794 	}
795 
796 	*error_code = SLURM_SUCCESS;
797 	return qos_ptr;
798 }
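/*
 * Example (illustrative): a QOS modified with something like
 *   sacctmgr modify qos interactive set Flags=RequiresReservation
 * carries QOS_FLAG_REQ_RESV, so a job requesting that QOS without also
 * naming a reservation is rejected here with ESLURM_INVALID_QOS.
 */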
799 
800 /*
801  * dump_all_job_state - save the state of all jobs to file for checkpoint
802  *	Changes here should be reflected in load_last_job_id() and
803  *	load_all_job_state().
804  * RET 0 or error code
805  */
806 int dump_all_job_state(void)
807 {
808 	/* Save high-water mark to avoid buffer growth with copies */
809 	static int high_buffer_size = (1024 * 1024);
810 	int error_code = SLURM_SUCCESS, log_fd;
811 	char *old_file, *new_file, *reg_file;
812 	struct stat stat_buf;
813 	/* Locks: Read config and job */
814 	slurmctld_lock_t job_read_lock =
815 		{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
816 	ListIterator job_iterator;
817 	job_record_t *job_ptr;
818 	Buf buffer = init_buf(high_buffer_size);
819 	time_t now = time(NULL);
820 	time_t last_state_file_time;
821 	DEF_TIMERS;
822 
823 	START_TIMER;
824 	/*
825 	 * Check that last state file was written at expected time.
826 	 * This is a check for two slurmctld daemons running at the same
827 	 * time in primary mode (a split-brain problem).
828 	 */
829 	last_state_file_time = _get_last_job_state_write_time();
830 	if (last_file_write_time && last_state_file_time &&
831 	    (last_file_write_time != last_state_file_time)) {
832 		error("Bad job state save file time. We wrote it at time %u, "
833 		      "but the file contains a time stamp of %u.",
834 		      (uint32_t) last_file_write_time,
835 		      (uint32_t) last_state_file_time);
836 		if (slurmctld_primary == 0) {
837 			fatal("Two slurmctld daemons are running as primary. "
838 			      "Shutting down this daemon to avoid inconsistent "
839 			      "state due to split brain.");
840 		}
841 	}
842 
843 	/* write header: version, time */
844 	packstr(JOB_STATE_VERSION, buffer);
845 	pack16(SLURM_PROTOCOL_VERSION, buffer);
846 	pack_time(now, buffer);
847 
848 	/*
849 	 * write header: job id
850 	 * This is needed so that the job id remains persistent even after
851 	 * slurmctld is restarted.
852 	 */
853 	pack32( job_id_sequence, buffer);
854 
855 	debug3("Writing job id %u to header record of job_state file",
856 	       job_id_sequence);
857 
858 	/* write individual job records */
859 	lock_slurmctld(job_read_lock);
860 	job_iterator = list_iterator_create(job_list);
861 	while ((job_ptr = list_next(job_iterator))) {
862 		_dump_job_state(job_ptr, buffer);
863 	}
864 	list_iterator_destroy(job_iterator);
865 
866 
867 	/* write the buffer to file */
868 	old_file = xstrdup(slurmctld_conf.state_save_location);
869 	xstrcat(old_file, "/job_state.old");
870 	reg_file = xstrdup(slurmctld_conf.state_save_location);
871 	xstrcat(reg_file, "/job_state");
872 	new_file = xstrdup(slurmctld_conf.state_save_location);
873 	xstrcat(new_file, "/job_state.new");
874 	unlock_slurmctld(job_read_lock);
875 
876 	if (stat(reg_file, &stat_buf) == 0) {
877 		static time_t last_mtime = (time_t) 0;
878 		int delta_t = difftime(stat_buf.st_mtime, last_mtime);
879 		if (delta_t < -10) {
880 			error("The modification time of %s moved backwards "
881 			      "by %d seconds",
882 			      reg_file, (0-delta_t));
883 			error("The clock of the file system and this computer "
884 			      "appear to not be synchronized");
885 			/* It might be safest to exit here; we likely mounted
886 			 * a different file system with the state save files */
887 		}
888 		last_mtime = time(NULL);
889 	}
890 
891 	lock_state_files();
892 	log_fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600);
893 	if (log_fd < 0) {
894 		error("Can't save state, create file %s error %m",
895 		      new_file);
896 		error_code = errno;
897 	} else {
898 		int pos = 0, nwrite, amount, rc;
899 		char *data;
900 
901 		nwrite = get_buf_offset(buffer);
902 		data = (char *)get_buf_data(buffer);
903 		high_buffer_size = MAX(nwrite, high_buffer_size);
904 		while (nwrite > 0) {
905 			amount = write(log_fd, &data[pos], nwrite);
906 			if ((amount < 0) && (errno != EINTR)) {
907 				error("Error writing file %s, %m", new_file);
908 				error_code = errno;
909 				break;
910 			}
911 			nwrite -= amount;
912 			pos    += amount;
913 		}
914 
915 		rc = fsync_and_close(log_fd, "job");
916 		if (rc && !error_code)
917 			error_code = rc;
918 	}
919 	if (error_code)
920 		(void) unlink(new_file);
921 	else {			/* file shuffle */
922 		(void) unlink(old_file);
923 		if (link(reg_file, old_file))
924 			debug4("unable to create link for %s -> %s: %m",
925 			       reg_file, old_file);
926 		(void) unlink(reg_file);
927 		if (link(new_file, reg_file))
928 			debug4("unable to create link for %s -> %s: %m",
929 			       new_file, reg_file);
930 		(void) unlink(new_file);
931 		last_file_write_time = now;
932 	}
933 	xfree(old_file);
934 	xfree(reg_file);
935 	xfree(new_file);
936 	unlock_state_files();
937 
938 	free_buf(buffer);
939 	END_TIMER2("dump_all_job_state");
940 	return error_code;
941 }
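/*
 * File shuffle sketch (names built above): the buffer is first written to
 * job_state.new, the previous job_state is re-linked to job_state.old, and
 * job_state.new is then linked into place as job_state, so a crash at any
 * point leaves at least one complete state file on disk.
 */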
942 
943 static int _find_resv_part(void *x, void *key)
944 {
945 	slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x;
946 
947 	if (resv_ptr->part_ptr != (part_record_t *) key)
948 		return 0;
949 	else
950 		return 1;	/* match */
951 }
952 
953 /* Open the job state save file, or backup if necessary.
954  * state_file OUT - the name of the state save file actually used
955  * RET buffer containing the state file contents, or NULL on error
956  */
957 static Buf _open_job_state_file(char **state_file)
958 {
959 	Buf buf;
960 
961 	xassert(state_file);
962 	xassert(!*state_file);
963 
964 	*state_file = xstrdup_printf("%s/job_state",
965 				     slurmctld_conf.state_save_location);
966 
967 	if (!(buf = create_mmap_buf(*state_file)))
968 		error("Could not open job state file %s: %m", *state_file);
969 	else
970 		return buf;
971 
972 	error("NOTE: Trying backup state save file. Jobs may be lost!");
973 	xstrcat(*state_file, ".old");
974 	return create_mmap_buf(*state_file);
975 }
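/*
 * Lookup order sketch: this first tries <StateSaveLocation>/job_state and,
 * only if that file cannot be mmap'd, falls back to
 * <StateSaveLocation>/job_state.old left behind by the shuffle in
 * dump_all_job_state().
 */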
976 
977 extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr)
978 {
979 	if (!job_ptr->assoc_ptr && (job_ptr->state_reason == FAIL_ACCOUNT)) {
980 		slurmdb_assoc_rec_t assoc_rec;
981 		memset(&assoc_rec, 0, sizeof(assoc_rec));
982 		/*
983 		 * For speed and accuracy we will first see if we once had an
984 		 * association record.  If not, look for it by
985 		 * account, partition, and user_id.
986 		 */
987 		if (job_ptr->assoc_id)
988 			assoc_rec.id = job_ptr->assoc_id;
989 		else {
990 			assoc_rec.acct      = job_ptr->account;
991 			if (job_ptr->part_ptr)
992 				assoc_rec.partition = job_ptr->part_ptr->name;
993 			assoc_rec.uid       = job_ptr->user_id;
994 		}
995 
996 		if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
997 		                            accounting_enforce,
998 		                            &job_ptr->assoc_ptr, false) ==
999 		    SLURM_SUCCESS) {
1000 			job_ptr->assoc_id = assoc_rec.id;
1001 			debug("%s: Filling in assoc for %pJ Assoc=%u",
1002 			      __func__, job_ptr, job_ptr->assoc_id);
1003 
1004 			job_ptr->state_reason = WAIT_NO_REASON;
1005 			xfree(job_ptr->state_desc);
1006 			last_job_update = time(NULL);
1007 		}
1008 	}
1009 
1010 	if (!job_ptr->qos_ptr && (job_ptr->state_reason == FAIL_QOS)) {
1011 		int qos_error = SLURM_SUCCESS;
1012 		slurmdb_qos_rec_t qos_rec;
1013 		memset(&qos_rec, 0, sizeof(qos_rec));
1014 		qos_rec.id = job_ptr->qos_id;
1015 		job_ptr->qos_ptr = _determine_and_validate_qos(
1016 			job_ptr->resv_name, job_ptr->assoc_ptr,
1017 			job_ptr->limit_set.qos, &qos_rec,
1018 			&qos_error, false, LOG_LEVEL_DEBUG2);
1019 
1020 		if ((qos_error == SLURM_SUCCESS) && job_ptr->qos_ptr) {
1021 			debug("%s: Filling in QOS for %pJ QOS=%s(%u)",
1022 			      __func__, job_ptr, qos_rec.name, job_ptr->qos_id);
1023 			job_ptr->state_reason = WAIT_NO_REASON;
1024 			xfree(job_ptr->state_desc);
1025 			last_job_update = time(NULL);
1026 		}
1027 	}
1028 }
1029 
1030 extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked)
1031 {
1032 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
1033 	xassert(job_ptr);
1034 
1035 	if (!assoc_mgr_locked)
1036 		assoc_mgr_lock(&locks);
1037 
1038 	xfree(job_ptr->tres_req_str);
1039 	job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array(
1040 		job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true);
1041 
1042 	xfree(job_ptr->tres_fmt_req_str);
1043 	job_ptr->tres_fmt_req_str = assoc_mgr_make_tres_str_from_array(
1044 		job_ptr->tres_req_cnt, TRES_STR_CONVERT_UNITS, true);
1045 
1046 	if (!assoc_mgr_locked)
1047 		assoc_mgr_unlock(&locks);
1048 }
1049 
1050 extern void set_job_tres_alloc_str(job_record_t *job_ptr,
1051 				   bool assoc_mgr_locked)
1052 {
1053 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
1054 
1055 	xassert(job_ptr);
1056 
1057 	if (!assoc_mgr_locked)
1058 		assoc_mgr_lock(&locks);
1059 
1060 	xfree(job_ptr->tres_alloc_str);
1061 	job_ptr->tres_alloc_str = assoc_mgr_make_tres_str_from_array(
1062 		job_ptr->tres_alloc_cnt, TRES_STR_FLAG_SIMPLE, true);
1063 
1064 	xfree(job_ptr->tres_fmt_alloc_str);
1065 	job_ptr->tres_fmt_alloc_str = assoc_mgr_make_tres_str_from_array(
1066 		job_ptr->tres_alloc_cnt, TRES_STR_CONVERT_UNITS, true);
1067 
1068 	if (!assoc_mgr_locked)
1069 		assoc_mgr_unlock(&locks);
1070 }
1071 
1072 /* Note that the backup slurmctld has assumed primary control.
1073  * This function can be called multiple times. */
1074 extern void backup_slurmctld_restart(void)
1075 {
1076 	last_file_write_time = (time_t) 0;
1077 }
1078 
1079 /* Return the time stamp in the current job state save file, or 0 on
1080  * error */
1081 static time_t _get_last_job_state_write_time(void)
1082 {
1083 	int error_code = SLURM_SUCCESS;
1084 	char *state_file = NULL;
1085 	Buf buffer;
1086 	time_t buf_time = (time_t) 0;
1087 	char *ver_str = NULL;
1088 	uint32_t ver_str_len;
1089 	uint16_t protocol_version = NO_VAL16;
1090 
1091 	/* read the file */
1092 	if (!(buffer = _open_job_state_file(&state_file))) {
1093 		info("No job state file (%s) found", state_file);
1094 		error_code = ENOENT;
1095 	}
1096 	xfree(state_file);
1097 	if (error_code)
1098 		return buf_time;
1099 
1100 	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
1101 	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
1102 		safe_unpack16(&protocol_version, buffer);
1103 	safe_unpack_time(&buf_time, buffer);
1104 
1105 unpack_error:
1106 	xfree(ver_str);
1107 	free_buf(buffer);
1108 	return buf_time;
1109 }
1110 
1111 /*
1112  * load_all_job_state - load the job state from file, recover from last
1113  *	checkpoint. Execute this after loading the configuration file data.
1114  *	Changes here should be reflected in load_last_job_id().
1115  * RET 0 or error code
1116  */
1117 extern int load_all_job_state(void)
1118 {
1119 	int error_code = SLURM_SUCCESS;
1120 	int job_cnt = 0;
1121 	char *state_file = NULL;
1122 	Buf buffer;
1123 	time_t buf_time;
1124 	uint32_t saved_job_id;
1125 	char *ver_str = NULL;
1126 	uint32_t ver_str_len;
1127 	uint16_t protocol_version = NO_VAL16;
1128 
1129 	/* read the file */
1130 	lock_state_files();
1131 	if (!(buffer = _open_job_state_file(&state_file))) {
1132 		info("No job state file (%s) to recover", state_file);
1133 		xfree(state_file);
1134 		unlock_state_files();
1135 		return ENOENT;
1136 	}
1137 	xfree(state_file);
1138 	unlock_state_files();
1139 
1140 	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
1141 
1142 	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
1143 	debug3("Version string in job_state header is %s", ver_str);
1144 	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
1145 		safe_unpack16(&protocol_version, buffer);
1146 	xfree(ver_str);
1147 
1148 	if (protocol_version == NO_VAL16) {
1149 		if (!ignore_state_errors)
1150 			fatal("Can not recover job state, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
1151 		error("***********************************************");
1152 		error("Can not recover job state, incompatible version");
1153 		error("***********************************************");
1154 		free_buf(buffer);
1155 		return EFAULT;
1156 	}
1157 
1158 	safe_unpack_time(&buf_time, buffer);
1159 	safe_unpack32(&saved_job_id, buffer);
1160 	if (saved_job_id <= slurmctld_conf.max_job_id)
1161 		job_id_sequence = MAX(saved_job_id, job_id_sequence);
1162 	debug3("Job id in job_state header is %u", saved_job_id);
1163 
1164 	/*
1165 	 * Previously we took the tres read lock before this loop.  It turned
1166 	 * out that this created a double lock when steps were being loaded during
1167 	 * the calls to jobacctinfo_create() which also locks the read lock.
1168 	 * It ended up being much easier to move the locks for the assoc_mgr
1169 	 * into the _load_job_state function than any other option.
1170 	 */
1171 	while (remaining_buf(buffer) > 0) {
1172 		error_code = _load_job_state(buffer, protocol_version);
1173 		if (error_code != SLURM_SUCCESS)
1174 			goto unpack_error;
1175 		job_cnt++;
1176 	}
1177 	debug3("Set job_id_sequence to %u", job_id_sequence);
1178 
1179 	free_buf(buffer);
1180 	info("Recovered information about %d jobs", job_cnt);
1181 	return error_code;
1182 
1183 unpack_error:
1184 	if (!ignore_state_errors)
1185 		fatal("Incomplete job state save file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
1186 	error("Incomplete job state save file");
1187 	info("Recovered information about %d jobs", job_cnt);
1188 	free_buf(buffer);
1189 	return SLURM_ERROR;
1190 }
1191 
1192 /*
1193  * load_last_job_id - load only the last job ID from state save file.
1194  *	Changes here should be reflected in load_all_job_state().
1195  * RET 0 or error code
1196  */
1197 extern int load_last_job_id( void )
1198 {
1199 	char *state_file = NULL;
1200 	Buf buffer;
1201 	time_t buf_time;
1202 	char *ver_str = NULL;
1203 	uint32_t ver_str_len;
1204 	uint16_t protocol_version = NO_VAL16;
1205 
1206 	/* read the file */
1207 	lock_state_files();
1208 	if (!(buffer = _open_job_state_file(&state_file))) {
1209 		debug("No job state file (%s) to recover", state_file);
1210 		xfree(state_file);
1211 		unlock_state_files();
1212 		return ENOENT;
1213 	}
1214 	xfree(state_file);
1215 	unlock_state_files();
1216 
1217 	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
1218 	debug3("Version string in job_state header is %s", ver_str);
1219 	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
1220 		safe_unpack16(&protocol_version, buffer);
1221 	xfree(ver_str);
1222 
1223 	if (protocol_version == NO_VAL16) {
1224 		if (!ignore_state_errors)
1225 			fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
1226 		debug("*************************************************");
1227 		debug("Can not recover last job ID, incompatible version");
1228 		debug("*************************************************");
1229 		free_buf(buffer);
1230 		return EFAULT;
1231 	}
1232 
1233 	safe_unpack_time(&buf_time, buffer);
1234 	safe_unpack32( &job_id_sequence, buffer);
1235 	debug3("Job ID in job_state header is %u", job_id_sequence);
1236 
1237 	/* Ignore the state for individual jobs stored here */
1238 
1239 	xfree(ver_str);
1240 	free_buf(buffer);
1241 	return SLURM_SUCCESS;
1242 
1243 unpack_error:
1244 	if (!ignore_state_errors)
1245 		fatal("Invalid job data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
1246 	error("Invalid job data checkpoint file");
1247 	xfree(ver_str);
1248 	free_buf(buffer);
1249 	return SLURM_ERROR;
1250 }
1251 
1252 static void _pack_acct_policy_limit(acct_policy_limit_set_t *limit_set,
1253 				    Buf buffer, uint16_t protocol_version)
1254 {
1255 	xassert(limit_set);
1256 
1257 	pack16(limit_set->qos, buffer);
1258 	pack16(limit_set->time, buffer);
1259 	pack16_array(limit_set->tres, slurmctld_tres_cnt, buffer);
1260 }
1261 
1262 static int _unpack_acct_policy_limit_members(
1263 	acct_policy_limit_set_t *limit_set,
1264 	Buf buffer, uint16_t protocol_version)
1265 {
1266 	uint32_t tmp32;
1267 
1268 	xassert(limit_set);
1269 
1270 	safe_unpack16(&limit_set->qos, buffer);
1271 	safe_unpack16(&limit_set->time, buffer);
1272 	xfree(limit_set->tres);
1273 	safe_unpack16_array(&limit_set->tres, &tmp32, buffer);
1274 
1275 	/*
1276 	 * Because the tres array could have grown or the tres could have moved
1277 	 * positions, the array needs to be rebuilt and the old values need to
1278 	 * be copied into their new spots.
1279 	 */
1280 	if ((tmp32 < slurmctld_tres_cnt) || assoc_mgr_tres_pos_changed())
1281 		update_job_limit_set_tres(&limit_set->tres);
1282 
1283 	return SLURM_SUCCESS;
1284 
1285 unpack_error:
1286 	xfree(limit_set->tres);
1287 
1288 	return SLURM_ERROR;
1289 }
1290 
1291 /*
1292  * _dump_job_state - dump the state of a specific job, its details, and
1293  *	steps to a buffer
1294  * IN dump_job_ptr - pointer to job for which information is requested
1295  * IN/OUT buffer - location to store data, pointers automatically advanced
1296  */
1297 static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer)
1298 {
1299 	struct job_details *detail_ptr;
1300 	uint32_t tmp_32;
1301 
1302 	xassert(dump_job_ptr->magic == JOB_MAGIC);
1303 
1304 	/* Don't pack "unlinked" job. */
1305 	if (dump_job_ptr->job_id == NO_VAL)
1306 		return;
1307 
1308 	/* Dump basic job info */
1309 	pack32(dump_job_ptr->array_job_id, buffer);
1310 	pack32(dump_job_ptr->array_task_id, buffer);
1311 	if (dump_job_ptr->array_recs) {
1312 		build_array_str(dump_job_ptr);
1313 		if (dump_job_ptr->array_recs->task_id_bitmap) {
1314 			tmp_32 = bit_size(dump_job_ptr->array_recs->
1315 					  task_id_bitmap);
1316 		} else
1317 			tmp_32 = 0;
1318 		pack32(tmp_32, buffer);
1319 		if (tmp_32)
1320 			packstr(dump_job_ptr->array_recs->task_id_str, buffer);
1321 		pack32(dump_job_ptr->array_recs->array_flags,    buffer);
1322 		pack32(dump_job_ptr->array_recs->max_run_tasks,  buffer);
1323 		pack32(dump_job_ptr->array_recs->tot_run_tasks,  buffer);
1324 		pack32(dump_job_ptr->array_recs->min_exit_code,  buffer);
1325 		pack32(dump_job_ptr->array_recs->max_exit_code,  buffer);
1326 		pack32(dump_job_ptr->array_recs->tot_comp_tasks, buffer);
1327 	} else {
1328 		tmp_32 = NO_VAL;
1329 		pack32(tmp_32, buffer);
1330 	}
1331 
1332 	pack32(dump_job_ptr->assoc_id, buffer);
1333 	packstr(dump_job_ptr->batch_features, buffer);
1334 	pack32(dump_job_ptr->delay_boot, buffer);
1335 	pack32(dump_job_ptr->job_id, buffer);
1336 	pack32(dump_job_ptr->user_id, buffer);
1337 	pack32(dump_job_ptr->group_id, buffer);
1338 	pack32(dump_job_ptr->time_limit, buffer);
1339 	pack32(dump_job_ptr->time_min, buffer);
1340 	pack32(dump_job_ptr->priority, buffer);
1341 	pack32(dump_job_ptr->alloc_sid, buffer);
1342 	pack32(dump_job_ptr->total_cpus, buffer);
1343 	if (dump_job_ptr->total_nodes)
1344 		pack32(dump_job_ptr->total_nodes, buffer);
1345 	else
1346 		pack32(dump_job_ptr->node_cnt_wag, buffer);
1347 	pack32(dump_job_ptr->cpu_cnt, buffer);
1348 	pack32(dump_job_ptr->exit_code, buffer);
1349 	pack32(dump_job_ptr->derived_ec, buffer);
1350 	pack64(dump_job_ptr->db_index, buffer);
1351 	pack32(dump_job_ptr->resv_id, buffer);
1352 	pack32(dump_job_ptr->next_step_id, buffer);
1353 	pack32(dump_job_ptr->het_job_id, buffer);
1354 	packstr(dump_job_ptr->het_job_id_set, buffer);
1355 	pack32(dump_job_ptr->het_job_offset, buffer);
1356 	pack32(dump_job_ptr->qos_id, buffer);
1357 	pack32(dump_job_ptr->req_switch, buffer);
1358 	pack32(dump_job_ptr->wait4switch, buffer);
1359 	pack32(dump_job_ptr->profile, buffer);
1360 	pack32(dump_job_ptr->db_flags, buffer);
1361 
1362 	pack_time(dump_job_ptr->last_sched_eval, buffer);
1363 	pack_time(dump_job_ptr->preempt_time, buffer);
1364 	pack_time(dump_job_ptr->start_time, buffer);
1365 	pack_time(dump_job_ptr->end_time, buffer);
1366 	pack_time(dump_job_ptr->end_time_exp, buffer);
1367 	pack_time(dump_job_ptr->suspend_time, buffer);
1368 	pack_time(dump_job_ptr->pre_sus_time, buffer);
1369 	pack_time(dump_job_ptr->resize_time, buffer);
1370 	pack_time(dump_job_ptr->tot_sus_time, buffer);
1371 	pack_time(dump_job_ptr->deadline, buffer);
1372 
1373 	pack32(dump_job_ptr->site_factor, buffer);
1374 	pack16(dump_job_ptr->direct_set_prio, buffer);
1375 	pack32(dump_job_ptr->job_state, buffer);
1376 	pack16(dump_job_ptr->kill_on_node_fail, buffer);
1377 	pack16(dump_job_ptr->batch_flag, buffer);
1378 	pack16(dump_job_ptr->mail_type, buffer);
1379 	pack32(dump_job_ptr->state_reason, buffer);
1380 	pack32(dump_job_ptr->state_reason_prev_db, buffer);
1381 	pack8(dump_job_ptr->reboot, buffer);
1382 	pack16(dump_job_ptr->restart_cnt, buffer);
1383 	pack16(dump_job_ptr->wait_all_nodes, buffer);
1384 	pack16(dump_job_ptr->warn_flags, buffer);
1385 	pack16(dump_job_ptr->warn_signal, buffer);
1386 	pack16(dump_job_ptr->warn_time, buffer);
1387 
1388 	_pack_acct_policy_limit(&dump_job_ptr->limit_set, buffer,
1389 				SLURM_PROTOCOL_VERSION);
1390 
1391 	packstr(dump_job_ptr->state_desc, buffer);
1392 	packstr(dump_job_ptr->resp_host, buffer);
1393 
1394 	pack16(dump_job_ptr->alloc_resp_port, buffer);
1395 	pack16(dump_job_ptr->other_port, buffer);
1396 	pack8(dump_job_ptr->power_flags, buffer);
1397 	pack16(dump_job_ptr->start_protocol_ver, buffer);
1398 	packdouble(dump_job_ptr->billable_tres, buffer);
1399 
1400 	if (IS_JOB_COMPLETING(dump_job_ptr)) {
1401 		if (dump_job_ptr->nodes_completing == NULL) {
1402 			dump_job_ptr->nodes_completing =
1403 				bitmap2node_name(dump_job_ptr->node_bitmap);
1404 		}
1405 		packstr(dump_job_ptr->nodes_completing, buffer);
1406 	}
1407 	packstr(dump_job_ptr->nodes, buffer);
1408 	packstr(dump_job_ptr->partition, buffer);
1409 	packstr(dump_job_ptr->name, buffer);
1410 	packstr(dump_job_ptr->user_name, buffer);
1411 	packstr(dump_job_ptr->wckey, buffer);
1412 	packstr(dump_job_ptr->alloc_node, buffer);
1413 	packstr(dump_job_ptr->account, buffer);
1414 	packstr(dump_job_ptr->admin_comment, buffer);
1415 	packstr(dump_job_ptr->comment, buffer);
1416 	packstr(dump_job_ptr->gres_alloc, buffer);
1417 	packstr(dump_job_ptr->gres_req, buffer);
1418 	packstr(dump_job_ptr->gres_used, buffer);
1419 	packstr(dump_job_ptr->network, buffer);
1420 	packstr(dump_job_ptr->licenses, buffer);
1421 	packstr(dump_job_ptr->mail_user, buffer);
1422 	packstr(dump_job_ptr->mcs_label, buffer);
1423 	packstr(dump_job_ptr->resv_name, buffer);
1424 	packstr(dump_job_ptr->batch_host, buffer);
1425 	packstr(dump_job_ptr->burst_buffer, buffer);
1426 	packstr(dump_job_ptr->burst_buffer_state, buffer);
1427 	packstr(dump_job_ptr->system_comment, buffer);
1428 
1429 	select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
1430 				     buffer, SLURM_PROTOCOL_VERSION);
1431 	pack_job_resources(dump_job_ptr->job_resrcs, buffer,
1432 			   SLURM_PROTOCOL_VERSION);
1433 
1434 	packstr_array(dump_job_ptr->spank_job_env,
1435 		      dump_job_ptr->spank_job_env_size, buffer);
1436 
1437 	(void) gres_plugin_job_state_pack(dump_job_ptr->gres_list, buffer,
1438 					  dump_job_ptr->job_id, true,
1439 					  SLURM_PROTOCOL_VERSION);
1440 
1441 	/* Dump job details, if available */
1442 	detail_ptr = dump_job_ptr->details;
1443 	if (detail_ptr) {
1444 		xassert (detail_ptr->magic == DETAILS_MAGIC);
1445 		pack16((uint16_t) DETAILS_FLAG, buffer);
1446 		_dump_job_details(detail_ptr, buffer);
1447 	} else
1448 		pack16((uint16_t) 0, buffer);	/* no details flag */
1449 
1450 	/* Dump job steps */
1451 	list_for_each(dump_job_ptr->step_list, dump_job_step_state, buffer);
1452 
1453 	pack16((uint16_t) 0, buffer);	/* no step flag */
1454 	pack32(dump_job_ptr->bit_flags, buffer);
1455 	packstr(dump_job_ptr->tres_alloc_str, buffer);
1456 	packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
1457 	packstr(dump_job_ptr->tres_req_str, buffer);
1458 	packstr(dump_job_ptr->tres_fmt_req_str, buffer);
1459 
1460 	packstr(dump_job_ptr->clusters, buffer);
1461 	_dump_job_fed_details(dump_job_ptr->fed_details, buffer);
1462 
1463 	packstr(dump_job_ptr->origin_cluster, buffer);
1464 
1465 	packstr(dump_job_ptr->cpus_per_tres, buffer);
1466 	packstr(dump_job_ptr->mem_per_tres, buffer);
1467 	packstr(dump_job_ptr->tres_bind, buffer);
1468 	packstr(dump_job_ptr->tres_freq, buffer);
1469 	packstr(dump_job_ptr->tres_per_job, buffer);
1470 	packstr(dump_job_ptr->tres_per_node, buffer);
1471 	packstr(dump_job_ptr->tres_per_socket, buffer);
1472 	packstr(dump_job_ptr->tres_per_task, buffer);
1473 }
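/*
 * Note: every field packed above must be unpacked in the same order, for the
 * matching protocol version, by _load_job_state() below; the DETAILS_FLAG
 * value and the trailing "no step flag" pack16(0) delimit the optional
 * details and step sections when the stream is read back.
 */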
1474 
1475 /* Unpack a job's state information from a buffer */
1476 /* NOTE: assoc_mgr qos, tres and assoc read lock must be unlocked before
1477  * calling */
1478 static int _load_job_state(Buf buffer, uint16_t protocol_version)
1479 {
1480 	uint64_t db_index;
1481 	uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid;
1482 	uint32_t exit_code, assoc_id, name_len, time_min;
1483 	uint32_t next_step_id, total_cpus, total_nodes = 0, cpu_cnt;
1484 	uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0;
1485 	uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0;
1486 	uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET, db_flags = 0;
1487 	uint32_t job_state, delay_boot = 0, site_factor = NICE_OFFSET;
1488 	time_t start_time, end_time, end_time_exp, suspend_time,
1489 		pre_sus_time, tot_sus_time;
1490 	time_t preempt_time = 0, deadline = 0;
1491 	time_t last_sched_eval = 0;
1492 	time_t resize_time = 0, now = time(NULL);
1493 	uint8_t reboot = 0, power_flags = 0;
1494 	uint32_t array_task_id = NO_VAL, state_reason_prev_db = 0;
1495 	uint32_t array_flags = 0, max_run_tasks = 0, tot_run_tasks = 0;
1496 	uint32_t min_exit_code = 0, max_exit_code = 0, tot_comp_tasks = 0;
1497 	uint32_t het_job_id = 0, het_job_offset = 0, state_reason;
1498 	uint16_t details, batch_flag, step_flag;
1499 	uint16_t kill_on_node_fail, direct_set_prio;
1500 	uint16_t alloc_resp_port, other_port, mail_type, tmp16;
1501 	uint16_t restart_cnt;
1502 	uint16_t wait_all_nodes, warn_flags = 0, warn_signal, warn_time;
1503 	acct_policy_limit_set_t limit_set;
1504 	uint16_t start_protocol_ver = SLURM_MIN_PROTOCOL_VERSION;
1505 	char *nodes = NULL, *partition = NULL, *name = NULL, *resp_host = NULL;
1506 	char *account = NULL, *network = NULL, *mail_user = NULL;
1507 	char *comment = NULL, *nodes_completing = NULL, *alloc_node = NULL;
1508 	char *licenses = NULL, *state_desc = NULL, *wckey = NULL;
1509 	char *resv_name = NULL, *batch_host = NULL;
1510 	char *gres_alloc = NULL, *gres_req = NULL, *gres_used = NULL;
1511 	char *burst_buffer = NULL, *burst_buffer_state = NULL;
1512 	char *admin_comment = NULL, *task_id_str = NULL, *mcs_label = NULL;
1513 	char *clusters = NULL, *het_job_id_set = NULL, *user_name = NULL;
1514 	char *batch_features = NULL, *system_comment = NULL;
1515 	uint32_t task_id_size = NO_VAL;
1516 	char **spank_job_env = (char **) NULL;
1517 	List gres_list = NULL, part_ptr_list = NULL;
1518 	job_record_t *job_ptr = NULL;
1519 	part_record_t *part_ptr;
1520 	int error_code, i, qos_error, rc;
1521 	dynamic_plugin_data_t *select_jobinfo = NULL;
1522 	job_resources_t *job_resources = NULL;
1523 	slurmdb_assoc_rec_t assoc_rec;
1524 	slurmdb_qos_rec_t qos_rec;
1525 	bool job_finished = false;
1526 	double billable_tres = (double)NO_VAL;
1527 	char *tres_alloc_str = NULL, *tres_fmt_alloc_str = NULL,
1528 		*tres_req_str = NULL, *tres_fmt_req_str = NULL;
1529 	uint32_t pelog_env_size = 0;
1530 	char **pelog_env = (char **) NULL;
1531 	job_fed_details_t *job_fed_details = NULL;
1532 	assoc_mgr_lock_t locks = { .assoc = READ_LOCK,
1533 				   .qos = READ_LOCK,
1534 				   .tres = READ_LOCK,
1535 				   .user = READ_LOCK };
1536 
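	/*
	 * limit_set.tres is sized for the current TRES count; ownership is
	 * handed to job_ptr near the end of recovery, or it is xfree()'d on
	 * the error path.
	 */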
1537 	memset(&limit_set, 0, sizeof(limit_set));
1538 	limit_set.tres = xcalloc(slurmctld_tres_cnt, sizeof(uint16_t));
1539 
1540 	if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
1541 		safe_unpack32(&array_job_id, buffer);
1542 		safe_unpack32(&array_task_id, buffer);
1543 
1544 		/* Job Array record */
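		/*
		 * task_id_str holds the array task bitmap as a hex string;
		 * it is rebuilt into array_recs->task_id_bitmap once the
		 * protocol branches complete (see bit_unfmt_hexmask() below).
		 */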
1545 		safe_unpack32(&task_id_size, buffer);
1546 		if (task_id_size != NO_VAL) {
1547 			if (task_id_size) {
1548 				safe_unpackstr_xmalloc(&task_id_str, &name_len,
1549 						       buffer);
1550 			}
1551 			safe_unpack32(&array_flags,    buffer);
1552 			safe_unpack32(&max_run_tasks,  buffer);
1553 			safe_unpack32(&tot_run_tasks,  buffer);
1554 			safe_unpack32(&min_exit_code,  buffer);
1555 			safe_unpack32(&max_exit_code,  buffer);
1556 			safe_unpack32(&tot_comp_tasks, buffer);
1557 		}
1558 
1559 		safe_unpack32(&assoc_id, buffer);
1560 		safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
1561 		safe_unpack32(&delay_boot, buffer);
1562 		safe_unpack32(&job_id, buffer);
1563 
1564 		/* validity test as possible */
1565 		if (job_id == 0) {
1566 			verbose("Invalid job_id %u", job_id);
1567 			goto unpack_error;
1568 		}
1569 
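		/*
		 * Reuse an existing record if one is already hashed (e.g.
		 * after a reconfigure or a duplicate entry in the state
		 * file); its stale string fields are released and replaced
		 * by the recovery code below.
		 */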
1570 		job_ptr = find_job_record(job_id);
1571 		if (job_ptr == NULL) {
1572 			job_ptr = _create_job_record(1);
1573 			if (!job_ptr) {
1574 				error("Create job entry failed for JobId=%u",
1575 				      job_id);
1576 				goto unpack_error;
1577 			}
1578 			job_ptr->job_id = job_id;
1579 			job_ptr->array_job_id = array_job_id;
1580 			job_ptr->array_task_id = array_task_id;
1581 		}
1582 
1583 		safe_unpack32(&user_id, buffer);
1584 		safe_unpack32(&group_id, buffer);
1585 		safe_unpack32(&time_limit, buffer);
1586 		safe_unpack32(&time_min, buffer);
1587 		safe_unpack32(&priority, buffer);
1588 		safe_unpack32(&alloc_sid, buffer);
1589 		safe_unpack32(&total_cpus, buffer);
1590 		safe_unpack32(&total_nodes, buffer);
1591 		safe_unpack32(&cpu_cnt, buffer);
1592 		safe_unpack32(&exit_code, buffer);
1593 		safe_unpack32(&derived_ec, buffer);
1594 		safe_unpack64(&db_index, buffer);
1595 		safe_unpack32(&resv_id, buffer);
1596 		safe_unpack32(&next_step_id, buffer);
1597 		safe_unpack32(&het_job_id, buffer);
1598 		safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
1599 		safe_unpack32(&het_job_offset, buffer);
1600 		safe_unpack32(&qos_id, buffer);
1601 		safe_unpack32(&req_switch, buffer);
1602 		safe_unpack32(&wait4switch, buffer);
1603 		safe_unpack32(&profile, buffer);
1604 		safe_unpack32(&db_flags, buffer);
1605 
1606 		safe_unpack_time(&last_sched_eval, buffer);
1607 		safe_unpack_time(&preempt_time, buffer);
1608 		safe_unpack_time(&start_time, buffer);
1609 		safe_unpack_time(&end_time, buffer);
1610 		safe_unpack_time(&end_time_exp, buffer);
1611 		safe_unpack_time(&suspend_time, buffer);
1612 		safe_unpack_time(&pre_sus_time, buffer);
1613 		safe_unpack_time(&resize_time, buffer);
1614 		safe_unpack_time(&tot_sus_time, buffer);
1615 		safe_unpack_time(&deadline, buffer);
1616 
1617 		safe_unpack32(&site_factor, buffer);
1618 		safe_unpack16(&direct_set_prio, buffer);
1619 		safe_unpack32(&job_state, buffer);
1620 		safe_unpack16(&kill_on_node_fail, buffer);
1621 		safe_unpack16(&batch_flag, buffer);
1622 		safe_unpack16(&mail_type, buffer);
1623 		safe_unpack32(&state_reason, buffer);
1624 		safe_unpack32(&state_reason_prev_db, buffer);
1625 		safe_unpack8 (&reboot, buffer);
1626 		safe_unpack16(&restart_cnt, buffer);
1627 		safe_unpack16(&wait_all_nodes, buffer);
1628 		safe_unpack16(&warn_flags, buffer);
1629 		safe_unpack16(&warn_signal, buffer);
1630 		safe_unpack16(&warn_time, buffer);
1631 
1632 		_unpack_acct_policy_limit_members(&limit_set, buffer,
1633 						  protocol_version);
1634 
1635 		safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
1636 		safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
1637 
1638 		safe_unpack16(&alloc_resp_port, buffer);
1639 		safe_unpack16(&other_port, buffer);
1640 		safe_unpack8(&power_flags, buffer);
1641 		safe_unpack16(&start_protocol_ver, buffer);
1642 		safe_unpackdouble(&billable_tres, buffer);
1643 
1644 		if (job_state & JOB_COMPLETING) {
1645 			safe_unpackstr_xmalloc(&nodes_completing,
1646 					       &name_len, buffer);
1647 		}
1648 		safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
1649 		safe_unpackstr_xmalloc(&partition, &name_len, buffer);
1650 		if (partition == NULL) {
1651 			error("No partition for JobId=%u", job_id);
1652 			goto unpack_error;
1653 		}
1654 		part_ptr = find_part_record (partition);
1655 		if (part_ptr == NULL) {
1656 			char *err_part = NULL;
1657 			part_ptr_list = get_part_list(partition, &err_part);
1658 			if (part_ptr_list) {
1659 				part_ptr = list_peek(part_ptr_list);
1660 				if (list_count(part_ptr_list) == 1)
1661 					FREE_NULL_LIST(part_ptr_list);
1662 			} else {
1663 				verbose("Invalid partition (%s) for JobId=%u",
1664 					err_part, job_id);
1665 				xfree(err_part);
1666 				/* Not a fatal error; the partition could have
1667 				 * been removed and reset_job_bitmaps() will
1668 				 * clean up this job */
1669 			}
1670 		}
1671 
1672 		safe_unpackstr_xmalloc(&name, &name_len, buffer);
1673 		safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
1674 		safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
1675 		safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
1676 		safe_unpackstr_xmalloc(&account, &name_len, buffer);
1677 		safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
1678 		safe_unpackstr_xmalloc(&comment, &name_len, buffer);
1679 		safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
1680 		safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
1681 		safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
1682 		safe_unpackstr_xmalloc(&network, &name_len, buffer);
1683 		safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
1684 		safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
1685 		safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
1686 		safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
1687 		safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
1688 		safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
1689 		safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
1690 		safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
1691 
1692 		if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
1693 						   protocol_version))
1694 			goto unpack_error;
1695 		if (unpack_job_resources(&job_resources, buffer,
1696 					 protocol_version))
1697 			goto unpack_error;
1698 
1699 		safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
1700 				     buffer);
1701 
1702 		if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
1703 						 protocol_version) !=
1704 		    SLURM_SUCCESS)
1705 			goto unpack_error;
1706 		gres_plugin_job_state_log(gres_list, job_id);
1707 
1708 		safe_unpack16(&details, buffer);
1709 		if ((details == DETAILS_FLAG) &&
1710 		    (_load_job_details(job_ptr, buffer, protocol_version))) {
1711 			job_ptr->job_state = JOB_FAILED;
1712 			job_ptr->exit_code = 1;
1713 			job_ptr->state_reason = FAIL_SYSTEM;
1714 			xfree(job_ptr->state_desc);
1715 			job_ptr->end_time = now;
1716 			goto unpack_error;
1717 		}
1718 		safe_unpack16(&step_flag, buffer);
1719 
1720 		while (step_flag == STEP_FLAG) {
1721 			/*
1722 			 * No need to put these into accounting if they
1723 			 * haven't been already, since all information will
1724 			 * be added when the job is finished.
1725 			 */
1726 			if ((error_code = load_step_state(job_ptr, buffer,
1727 							  protocol_version)))
1728 				goto unpack_error;
1729 			safe_unpack16(&step_flag, buffer);
1730 		}
1731 		safe_unpack32(&job_ptr->bit_flags, buffer);
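		/*
		 * BACKFILL_TEST and BF_WHOLE_NODE_TEST only describe an
		 * in-progress scheduling pass, so they are cleared rather
		 * than restored.
		 */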
1732 		job_ptr->bit_flags &= ~BACKFILL_TEST;
1733 		job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
1734 		safe_unpackstr_xmalloc(&tres_alloc_str,
1735 				       &name_len, buffer);
1736 		safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
1737 				       &name_len, buffer);
1738 		safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
1739 		safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
1740 		safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
1741 		if ((error_code = _load_job_fed_details(&job_fed_details,
1742 							buffer,
1743 							protocol_version)))
1744 			goto unpack_error;
1745 
1746 		safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
1747 				       buffer);
1748 
1749 		safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
1750 				       buffer);
1751 		safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
1752 				       buffer);
1753 		safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
1754 				       buffer);
1755 		safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
1756 				       buffer);
1757 		safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
1758 				       buffer);
1759 		safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
1760 				       buffer);
1761 		safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
1762 				       buffer);
1763 		safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
1764 				       buffer);
1765 	} else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
1766 		uint16_t uint16_tmp;
1767 		safe_unpack32(&array_job_id, buffer);
1768 		safe_unpack32(&array_task_id, buffer);
1769 
1770 		/* Job Array record */
1771 		safe_unpack32(&task_id_size, buffer);
1772 		if (task_id_size != NO_VAL) {
1773 			if (task_id_size) {
1774 				safe_unpackstr_xmalloc(&task_id_str, &name_len,
1775 						       buffer);
1776 			}
1777 			safe_unpack32(&array_flags,    buffer);
1778 			safe_unpack32(&max_run_tasks,  buffer);
1779 			safe_unpack32(&tot_run_tasks,  buffer);
1780 			safe_unpack32(&min_exit_code,  buffer);
1781 			safe_unpack32(&max_exit_code,  buffer);
1782 			safe_unpack32(&tot_comp_tasks, buffer);
1783 		}
1784 
1785 		safe_unpack32(&assoc_id, buffer);
1786 		safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
1787 		safe_unpack32(&delay_boot, buffer);
1788 		safe_unpack32(&job_id, buffer);
1789 
1790 		/* validity test as possible */
1791 		if (job_id == 0) {
1792 			verbose("Invalid job_id %u", job_id);
1793 			goto unpack_error;
1794 		}
1795 
1796 		job_ptr = find_job_record(job_id);
1797 		if (job_ptr == NULL) {
1798 			job_ptr = _create_job_record(1);
1799 			if (!job_ptr) {
1800 				error("Create job entry failed for JobId=%u",
1801 				      job_id);
1802 				goto unpack_error;
1803 			}
1804 			job_ptr->job_id = job_id;
1805 			job_ptr->array_job_id = array_job_id;
1806 			job_ptr->array_task_id = array_task_id;
1807 		}
1808 
1809 		safe_unpack32(&user_id, buffer);
1810 		safe_unpack32(&group_id, buffer);
1811 		safe_unpack32(&time_limit, buffer);
1812 		safe_unpack32(&time_min, buffer);
1813 		safe_unpack32(&priority, buffer);
1814 		safe_unpack32(&alloc_sid, buffer);
1815 		safe_unpack32(&total_cpus, buffer);
1816 		safe_unpack32(&total_nodes, buffer);
1817 		safe_unpack32(&cpu_cnt, buffer);
1818 		safe_unpack32(&exit_code, buffer);
1819 		safe_unpack32(&derived_ec, buffer);
1820 		safe_unpack64(&db_index, buffer);
1821 		safe_unpack32(&resv_id, buffer);
1822 		safe_unpack32(&next_step_id, buffer);
1823 		safe_unpack32(&het_job_id, buffer);
1824 		safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
1825 		safe_unpack32(&het_job_offset, buffer);
1826 		safe_unpack32(&qos_id, buffer);
1827 		safe_unpack32(&req_switch, buffer);
1828 		safe_unpack32(&wait4switch, buffer);
1829 		safe_unpack32(&profile, buffer);
1830 		safe_unpack32(&db_flags, buffer);
1831 
1832 		safe_unpack_time(&last_sched_eval, buffer);
1833 		safe_unpack_time(&preempt_time, buffer);
1834 		safe_unpack_time(&start_time, buffer);
1835 		safe_unpack_time(&end_time, buffer);
1836 		safe_unpack_time(&end_time_exp, buffer);
1837 		safe_unpack_time(&suspend_time, buffer);
1838 		safe_unpack_time(&pre_sus_time, buffer);
1839 		safe_unpack_time(&resize_time, buffer);
1840 		safe_unpack_time(&tot_sus_time, buffer);
1841 		safe_unpack_time(&deadline, buffer);
1842 
1843 		safe_unpack32(&site_factor, buffer);
1844 		safe_unpack16(&direct_set_prio, buffer);
1845 		safe_unpack32(&job_state, buffer);
1846 		safe_unpack16(&kill_on_node_fail, buffer);
1847 		safe_unpack16(&batch_flag, buffer);
1848 		safe_unpack16(&mail_type, buffer);
1849 		safe_unpack32(&state_reason, buffer);
1850 		safe_unpack32(&state_reason_prev_db, buffer);
1851 		safe_unpack8 (&reboot, buffer);
1852 		safe_unpack16(&restart_cnt, buffer);
1853 		safe_unpack16(&wait_all_nodes, buffer);
1854 		safe_unpack16(&warn_flags, buffer);
1855 		safe_unpack16(&warn_signal, buffer);
1856 		safe_unpack16(&warn_time, buffer);
1857 
1858 		_unpack_acct_policy_limit_members(&limit_set, buffer,
1859 						  protocol_version);
1860 
1861 		safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
1862 		safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
1863 
1864 		safe_unpack16(&alloc_resp_port, buffer);
1865 		safe_unpack16(&other_port, buffer);
1866 		safe_unpack8(&power_flags, buffer);
1867 		safe_unpack16(&start_protocol_ver, buffer);
1868 		safe_unpackdouble(&billable_tres, buffer);
1869 
1870 		if (job_state & JOB_COMPLETING) {
1871 			safe_unpackstr_xmalloc(&nodes_completing,
1872 					       &name_len, buffer);
1873 		}
1874 		safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
1875 		safe_unpackstr_xmalloc(&partition, &name_len, buffer);
1876 		if (partition == NULL) {
1877 			error("No partition for JobId=%u", job_id);
1878 			goto unpack_error;
1879 		}
1880 		part_ptr = find_part_record (partition);
1881 		if (part_ptr == NULL) {
1882 			char *err_part = NULL;
1883 			part_ptr_list = get_part_list(partition, &err_part);
1884 			if (part_ptr_list) {
1885 				part_ptr = list_peek(part_ptr_list);
1886 				if (list_count(part_ptr_list) == 1)
1887 					FREE_NULL_LIST(part_ptr_list);
1888 			} else {
1889 				verbose("Invalid partition (%s) for JobId=%u",
1890 					err_part, job_id);
1891 				xfree(err_part);
1892 				/* Not a fatal error; the partition could have
1893 				 * been removed and reset_job_bitmaps() will
1894 				 * clean up this job */
1895 			}
1896 		}
1897 
1898 		safe_unpackstr_xmalloc(&name, &name_len, buffer);
1899 		safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
1900 		safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
1901 		safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
1902 		safe_unpackstr_xmalloc(&account, &name_len, buffer);
1903 		safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
1904 		safe_unpackstr_xmalloc(&comment, &name_len, buffer);
1905 		safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
1906 		safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
1907 		safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
1908 		safe_unpackstr_xmalloc(&network, &name_len, buffer);
1909 		safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
1910 		safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
1911 		safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
1912 		safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
1913 		safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
1914 		safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
1915 		safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
1916 		safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
1917 
1918 		if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
1919 						   protocol_version))
1920 			goto unpack_error;
1921 		if (unpack_job_resources(&job_resources, buffer,
1922 					 protocol_version))
1923 			goto unpack_error;
1924 
1925 		safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */
1926 		/* fake out the former checkpoint plugin */
1927 		{
1928 			uint16_t id;
1929 			uint32_t size;
1930 			safe_unpack16(&id, buffer);
1931 			safe_unpack32(&size, buffer);
1932 			/* skip past any checkpoint plugin info */
1933 			size += get_buf_offset(buffer);
1934 			set_buf_offset(buffer, size);
1935 		}
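		/*
		 * The former checkpoint plugin packed a 16-bit plugin id, a
		 * 32-bit length and then an opaque blob; read the header and
		 * advance the buffer offset past the blob so the rest of the
		 * record stays aligned.
		 */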
1936 
1937 		safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
1938 				     buffer);
1939 
1940 		if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
1941 						 protocol_version) !=
1942 		    SLURM_SUCCESS)
1943 			goto unpack_error;
1944 		gres_plugin_job_state_log(gres_list, job_id);
1945 
1946 		safe_unpack16(&details, buffer);
1947 		if ((details == DETAILS_FLAG) &&
1948 		    (_load_job_details(job_ptr, buffer, protocol_version))) {
1949 			job_ptr->job_state = JOB_FAILED;
1950 			job_ptr->exit_code = 1;
1951 			job_ptr->state_reason = FAIL_SYSTEM;
1952 			xfree(job_ptr->state_desc);
1953 			job_ptr->end_time = now;
1954 			goto unpack_error;
1955 		}
1956 		safe_unpack16(&step_flag, buffer);
1957 		/*
1958 		 * The batch_host is needed to create a step_layout for the
1959 		 * batch step since that wasn't packed until 20.02.
1960 		 */
1961 		job_ptr->batch_host = batch_host;
1962 		while (step_flag == STEP_FLAG) {
1963 			/*
1964 			 * No need to put these into accounting if they
1965 			 * haven't been already, since all information will
1966 			 * be added when the job is finished.
1967 			 */
1968 			if ((error_code = load_step_state(job_ptr, buffer,
1969 							  protocol_version)))
1970 				goto unpack_error;
1971 			safe_unpack16(&step_flag, buffer);
1972 		}
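		/*
		 * Drop the temporary reference; the common recovery code
		 * below xfree()s job_ptr->batch_host before taking ownership
		 * of batch_host, so leaving it set here would free the
		 * string prematurely.
		 */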
1973 		job_ptr->batch_host = NULL;
1974 		safe_unpack32(&job_ptr->bit_flags, buffer);
1975 		job_ptr->bit_flags &= ~BACKFILL_TEST;
1976 		job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
1977 		safe_unpackstr_xmalloc(&tres_alloc_str,
1978 				       &name_len, buffer);
1979 		safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
1980 				       &name_len, buffer);
1981 		safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
1982 		safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
1983 		safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
1984 		if ((error_code = _load_job_fed_details(&job_fed_details,
1985 							buffer,
1986 							protocol_version)))
1987 			goto unpack_error;
1988 
1989 		safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
1990 				       buffer);
1991 
1992 		safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
1993 				       buffer);
1994 		safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
1995 				       buffer);
1996 		safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
1997 				       buffer);
1998 		safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
1999 				       buffer);
2000 		safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
2001 				       buffer);
2002 		safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
2003 				       buffer);
2004 		safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
2005 				       buffer);
2006 		safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
2007 				       buffer);
2008 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
2009 		uint16_t uint16_tmp;
2010 		safe_unpack32(&array_job_id, buffer);
2011 		safe_unpack32(&array_task_id, buffer);
2012 
2013 		/* Job Array record */
2014 		safe_unpack32(&task_id_size, buffer);
2015 		if (task_id_size != NO_VAL) {
2016 			if (task_id_size) {
2017 				safe_unpackstr_xmalloc(&task_id_str, &name_len,
2018 						       buffer);
2019 			}
2020 			safe_unpack32(&array_flags,    buffer);
2021 			safe_unpack32(&max_run_tasks,  buffer);
2022 			safe_unpack32(&tot_run_tasks,  buffer);
2023 			safe_unpack32(&min_exit_code,  buffer);
2024 			safe_unpack32(&max_exit_code,  buffer);
2025 			safe_unpack32(&tot_comp_tasks, buffer);
2026 		}
2027 
2028 		safe_unpack32(&assoc_id, buffer);
2029 		safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
2030 		safe_unpack32(&delay_boot, buffer);
2031 		safe_unpack32(&job_id, buffer);
2032 
2033 		/* validity test as possible */
2034 		if (job_id == 0) {
2035 			verbose("Invalid job_id %u", job_id);
2036 			goto unpack_error;
2037 		}
2038 
2039 		job_ptr = find_job_record(job_id);
2040 		if (job_ptr == NULL) {
2041 			job_ptr = _create_job_record(1);
2042 			if (!job_ptr) {
2043 				error("Create job entry failed for JobId=%u",
2044 				      job_id);
2045 				goto unpack_error;
2046 			}
2047 			job_ptr->job_id = job_id;
2048 			job_ptr->array_job_id = array_job_id;
2049 			job_ptr->array_task_id = array_task_id;
2050 		}
2051 
2052 		safe_unpack32(&user_id, buffer);
2053 		safe_unpack32(&group_id, buffer);
2054 		safe_unpack32(&time_limit, buffer);
2055 		safe_unpack32(&time_min, buffer);
2056 		safe_unpack32(&priority, buffer);
2057 		safe_unpack32(&alloc_sid, buffer);
2058 		safe_unpack32(&total_cpus, buffer);
2059 		safe_unpack32(&total_nodes, buffer);
2060 		safe_unpack32(&cpu_cnt, buffer);
2061 		safe_unpack32(&exit_code, buffer);
2062 		safe_unpack32(&derived_ec, buffer);
2063 		safe_unpack64(&db_index, buffer);
2064 		safe_unpack32(&resv_id, buffer);
2065 		safe_unpack32(&next_step_id, buffer);
2066 		safe_unpack32(&het_job_id, buffer);
2067 		safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
2068 		safe_unpack32(&het_job_offset, buffer);
2069 		safe_unpack32(&qos_id, buffer);
2070 		safe_unpack32(&req_switch, buffer);
2071 		safe_unpack32(&wait4switch, buffer);
2072 		safe_unpack32(&profile, buffer);
2073 
2074 		safe_unpack_time(&last_sched_eval, buffer);
2075 		safe_unpack_time(&preempt_time, buffer);
2076 		safe_unpack_time(&start_time, buffer);
2077 		safe_unpack_time(&end_time, buffer);
2078 		safe_unpack_time(&end_time_exp, buffer);
2079 		safe_unpack_time(&suspend_time, buffer);
2080 		safe_unpack_time(&pre_sus_time, buffer);
2081 		safe_unpack_time(&resize_time, buffer);
2082 		safe_unpack_time(&tot_sus_time, buffer);
2083 		safe_unpack_time(&deadline, buffer);
2084 
2085 		safe_unpack16(&direct_set_prio, buffer);
2086 		safe_unpack32(&job_state, buffer);
2087 		safe_unpack16(&kill_on_node_fail, buffer);
2088 		safe_unpack16(&batch_flag, buffer);
2089 		safe_unpack16(&mail_type, buffer);
2090 		safe_unpack16(&tmp16, buffer);
2091 		state_reason = tmp16;
2092 		safe_unpack8 (&reboot, buffer);
2093 		safe_unpack16(&restart_cnt, buffer);
2094 		safe_unpack16(&wait_all_nodes, buffer);
2095 		safe_unpack16(&warn_flags, buffer);
2096 		safe_unpack16(&warn_signal, buffer);
2097 		safe_unpack16(&warn_time, buffer);
2098 
2099 		_unpack_acct_policy_limit_members(&limit_set, buffer,
2100 						  protocol_version);
2101 
2102 		safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
2103 		safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
2104 
2105 		safe_unpack16(&alloc_resp_port, buffer);
2106 		safe_unpack16(&other_port, buffer);
2107 		safe_unpack8(&power_flags, buffer);
2108 		safe_unpack16(&start_protocol_ver, buffer);
2109 		safe_unpackdouble(&billable_tres, buffer);
2110 
2111 		if (job_state & JOB_COMPLETING) {
2112 			safe_unpackstr_xmalloc(&nodes_completing,
2113 					       &name_len, buffer);
2114 		}
2115 		safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
2116 		safe_unpackstr_xmalloc(&partition, &name_len, buffer);
2117 		if (partition == NULL) {
2118 			error("No partition for JobId=%u", job_id);
2119 			goto unpack_error;
2120 		}
2121 		part_ptr = find_part_record (partition);
2122 		if (part_ptr == NULL) {
2123 			char *err_part = NULL;
2124 			part_ptr_list = get_part_list(partition, &err_part);
2125 			if (part_ptr_list) {
2126 				part_ptr = list_peek(part_ptr_list);
2127 				if (list_count(part_ptr_list) == 1)
2128 					FREE_NULL_LIST(part_ptr_list);
2129 			} else {
2130 				verbose("Invalid partition (%s) for JobId=%u",
2131 					err_part, job_id);
2132 				xfree(err_part);
2133 				/* Not a fatal error; the partition could have
2134 				 * been removed and reset_job_bitmaps() will
2135 				 * clean up this job */
2136 			}
2137 		}
2138 
2139 		safe_unpackstr_xmalloc(&name, &name_len, buffer);
2140 		safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
2141 		safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
2142 		safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
2143 		safe_unpackstr_xmalloc(&account, &name_len, buffer);
2144 		safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
2145 		safe_unpackstr_xmalloc(&comment, &name_len, buffer);
2146 		safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
2147 		safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
2148 		safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
2149 		safe_unpackstr_xmalloc(&network, &name_len, buffer);
2150 		safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
2151 		safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
2152 		safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
2153 		safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
2154 		safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
2155 		safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
2156 		safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
2157 		safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
2158 
2159 		if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
2160 						   protocol_version))
2161 			goto unpack_error;
2162 		if (unpack_job_resources(&job_resources, buffer,
2163 					 protocol_version))
2164 			goto unpack_error;
2165 
2166 		safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */
2167 		/* fake out the former checkpoint plugin */
2168 		{
2169 			uint16_t id;
2170 			uint32_t size;
2171 			safe_unpack16(&id, buffer);
2172 			safe_unpack32(&size, buffer);
2173 			/* skip past any checkpoint plugin info */
2174 			size += get_buf_offset(buffer);
2175 			set_buf_offset(buffer, size);
2176 		}
2177 
2178 		safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
2179 				     buffer);
2180 
2181 		if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
2182 						 protocol_version) !=
2183 		    SLURM_SUCCESS)
2184 			goto unpack_error;
2185 		gres_plugin_job_state_log(gres_list, job_id);
2186 
2187 		safe_unpack16(&details, buffer);
2188 		if ((details == DETAILS_FLAG) &&
2189 		    (_load_job_details(job_ptr, buffer, protocol_version))) {
2190 			job_ptr->job_state = JOB_FAILED;
2191 			job_ptr->exit_code = 1;
2192 			job_ptr->state_reason = FAIL_SYSTEM;
2193 			xfree(job_ptr->state_desc);
2194 			job_ptr->end_time = now;
2195 			goto unpack_error;
2196 		}
2197 		safe_unpack16(&step_flag, buffer);
2198 
2199 		while (step_flag == STEP_FLAG) {
2200 			/*
2201 			 * No need to put these into accounting if they
2202 			 * haven't been already, since all information will
2203 			 * be added when the job is finished.
2204 			 */
2205 			if ((error_code = load_step_state(job_ptr, buffer,
2206 							  protocol_version)))
2207 				goto unpack_error;
2208 			safe_unpack16(&step_flag, buffer);
2209 		}
2210 		safe_unpack32(&job_ptr->bit_flags, buffer);
2211 		job_ptr->bit_flags &= ~BACKFILL_TEST;
2212 		job_ptr->bit_flags |= JOB_MEM_SET;
2213 		safe_unpackstr_xmalloc(&tres_alloc_str,
2214 				       &name_len, buffer);
2215 		safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
2216 				       &name_len, buffer);
2217 		safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
2218 		safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
2219 		safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
2220 		if ((error_code = _load_job_fed_details(&job_fed_details,
2221 							buffer,
2222 							protocol_version)))
2223 			goto unpack_error;
2224 
2225 		safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
2226 				       buffer);
2227 
2228 		safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
2229 				       buffer);
2230 		safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
2231 				       buffer);
2232 		safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
2233 				       buffer);
2234 		safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
2235 				       buffer);
2236 		safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
2237 				       buffer);
2238 		safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
2239 				       buffer);
2240 		safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
2241 				       buffer);
2242 		safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
2243 				       buffer);
2244 	} else {
2245 		error("%s: protocol_version %hu not supported",
2246 		      __func__, protocol_version);
2247 		goto unpack_error;
2248 	}
2249 
2250 	/* Don't load "unlinked" job. */
2251 	if (job_ptr->job_id == NO_VAL) {
2252 		debug("skipping unlinked job");
2253 		rc = SLURM_SUCCESS;
2254 		goto free_it;
2255 	}
2256 
2257 	if (((job_state & JOB_STATE_BASE) >= JOB_END) ||
2258 	    (batch_flag > MAX_BATCH_REQUEUE)) {
2259 		error("Invalid data for JobId=%u: job_state=%u batch_flag=%u",
2260 		      job_id, job_state, batch_flag);
2261 		goto unpack_error;
2262 	}
2263 	if (kill_on_node_fail > 1) {
2264 		error("Invalid data for JobId=%u: kill_on_node_fail=%u",
2265 		      job_id, kill_on_node_fail);
2266 		goto unpack_error;
2267 	}
2268 
2269 	if ((priority > 1) && (direct_set_prio == 0)) {
2270 		highest_prio = MAX(highest_prio, priority);
2271 		lowest_prio  = MIN(lowest_prio,  priority);
2272 	}
2273 
2274 #if 0
2275 	/*
2276 	 * This is not necessary since the job_id_sequence is checkpointed and
2277 	 * the jobid will be checked if it's in use in get_next_job_id().
2278 	 */
2279 
2280 	/* Base job_id_sequence off of local job id but only if the job
2281 	 * originated from this cluster -- so that the local job id of a
2282 	 * different cluster isn't restored here. */
2283 	if (!job_fed_details ||
2284 	    !xstrcmp(job_fed_details->origin_str, slurmctld_conf.cluster_name))
2285 		local_job_id = fed_mgr_get_local_id(job_id);
2286 	if (job_id_sequence <= local_job_id)
2287 		job_id_sequence = local_job_id + 1;
2288 #endif
2289 
2290 	xfree(job_ptr->tres_alloc_str);
2291 	job_ptr->tres_alloc_str = tres_alloc_str;
2292 	tres_alloc_str = NULL;
2293 
2294 	xfree(job_ptr->tres_req_str);
2295 	job_ptr->tres_req_str = tres_req_str;
2296 	tres_req_str = NULL;
2297 
2298 	xfree(job_ptr->tres_fmt_alloc_str);
2299 	job_ptr->tres_fmt_alloc_str = tres_fmt_alloc_str;
2300 	tres_fmt_alloc_str = NULL;
2301 
2302 	xfree(job_ptr->tres_fmt_req_str);
2303 	job_ptr->tres_fmt_req_str = tres_fmt_req_str;
2304 	tres_fmt_req_str = NULL;
2305 
2306 	xfree(job_ptr->account);
2307 	job_ptr->account = account;
2308 	xstrtolower(job_ptr->account);
2309 	account          = NULL;  /* reused, nothing left to free */
2310 	xfree(job_ptr->alloc_node);
2311 	job_ptr->alloc_node   = alloc_node;
2312 	alloc_node             = NULL;	/* reused, nothing left to free */
2313 	job_ptr->alloc_resp_port = alloc_resp_port;
2314 	job_ptr->alloc_sid    = alloc_sid;
2315 	job_ptr->assoc_id     = assoc_id;
2316 	job_ptr->delay_boot   = delay_boot;
2317 	xfree(job_ptr->admin_comment);
2318 	job_ptr->admin_comment = admin_comment;
2319 	admin_comment          = NULL;  /* reused, nothing left to free */
2320 	xfree(job_ptr->system_comment);
2321 	job_ptr->system_comment = system_comment;
2322 	system_comment          = NULL;  /* reused, nothing left to free */
2323 	xfree(job_ptr->batch_features);
2324 	job_ptr->batch_features = batch_features;
2325 	batch_features          = NULL;  /* reused, nothing left to free */
2326 	job_ptr->batch_flag   = batch_flag;
2327 	xfree(job_ptr->batch_host);
2328 	job_ptr->batch_host   = batch_host;
2329 	batch_host            = NULL;  /* reused, nothing left to free */
2330 	xfree(job_ptr->burst_buffer);
2331 	job_ptr->burst_buffer = burst_buffer;
2332 	burst_buffer          = NULL;  /* reused, nothing left to free */
2333 	xfree(job_ptr->burst_buffer_state);
2334 	job_ptr->burst_buffer_state = burst_buffer_state;
2335 	burst_buffer_state    = NULL;  /* reused, nothing left to free */
2336 	xfree(job_ptr->comment);
2337 	job_ptr->comment      = comment;
2338 	comment               = NULL;  /* reused, nothing left to free */
2339 	job_ptr->billable_tres = billable_tres;
2340 	xfree(job_ptr->gres_alloc);
2341 	job_ptr->gres_alloc   = gres_alloc;
2342 	gres_alloc            = NULL;  /* reused, nothing left to free */
2343 	xfree(job_ptr->gres_req);
2344 	job_ptr->gres_req    = gres_req;
2345 	gres_req              = NULL;  /* reused, nothing left to free */
2346 	xfree(job_ptr->gres_used);
2347 	job_ptr->gres_used    = gres_used;
2348 	gres_used             = NULL;  /* reused, nothing left to free */
2349 	job_ptr->gres_list    = gres_list;
2350 	job_ptr->site_factor = site_factor;
2351 	job_ptr->direct_set_prio = direct_set_prio;
2352 	job_ptr->db_index     = db_index;
2353 	job_ptr->derived_ec   = derived_ec;
2354 	job_ptr->end_time_exp = end_time_exp;
2355 	job_ptr->end_time     = end_time;
2356 	job_ptr->exit_code    = exit_code;
2357 	job_ptr->group_id     = group_id;
2358 	job_ptr->job_state    = job_state;
2359 	job_ptr->kill_on_node_fail = kill_on_node_fail;
2360 	xfree(job_ptr->licenses);
2361 	job_ptr->licenses     = licenses;
2362 	licenses              = NULL;	/* reused, nothing left to free */
2363 	job_ptr->mail_type    = mail_type;
2364 	xfree(job_ptr->mail_user);
2365 	if (mail_user)
2366 		job_ptr->mail_user    = mail_user;
2367 	else
2368 		job_ptr->mail_user = _get_mail_user(NULL, user_id);
2369 	mail_user             = NULL;	/* reused, nothing left to free */
2370 	xfree(job_ptr->mcs_label);
2371 	job_ptr->mcs_label    = mcs_label;
2372 	mcs_label	      = NULL;   /* reused, nothing left to free */
2373 	xfree(job_ptr->name);		/* in case duplicate record */
2374 	job_ptr->name         = name;
2375 	name                  = NULL;	/* reused, nothing left to free */
2376 	xfree(job_ptr->user_name);
2377 	job_ptr->user_name    = user_name;
2378 	user_name             = NULL;   /* reused, nothing left to free */
2379 	xfree(job_ptr->wckey);		/* in case duplicate record */
2380 	job_ptr->wckey        = wckey;
2381 	xstrtolower(job_ptr->wckey);
2382 	wckey                 = NULL;	/* reused, nothing left to free */
2383 	xfree(job_ptr->network);
2384 	job_ptr->network      = network;
2385 	network               = NULL;  /* reused, nothing left to free */
2386 	job_ptr->next_step_id = next_step_id;
2387 	xfree(job_ptr->nodes);		/* in case duplicate record */
2388 	job_ptr->nodes        = nodes;
2389 	nodes                 = NULL;	/* reused, nothing left to free */
2390 	if (nodes_completing) {
2391 		xfree(job_ptr->nodes_completing);
2392 		job_ptr->nodes_completing = nodes_completing;
2393 		nodes_completing = NULL;  /* reused, nothing left to free */
2394 	}
2395 	job_ptr->other_port   = other_port;
2396 	job_ptr->power_flags  = power_flags;
2397 	job_ptr->het_job_id     = het_job_id;
2398 	xfree(job_ptr->het_job_id_set);
2399 	job_ptr->het_job_id_set = het_job_id_set;
2400 	het_job_id_set       = NULL;	/* reused, nothing left to free */
2401 	job_ptr->het_job_offset = het_job_offset;
2402 	xfree(job_ptr->partition);
2403 	job_ptr->partition    = partition;
2404 	partition             = NULL;	/* reused, nothing left to free */
2405 	job_ptr->part_ptr = part_ptr;
2406 	job_ptr->part_ptr_list = part_ptr_list;
2407 	job_ptr->pre_sus_time = pre_sus_time;
2408 	job_ptr->priority     = priority;
2409 	job_ptr->qos_id       = qos_id;
2410 	job_ptr->reboot       = reboot;
2411 	xfree(job_ptr->resp_host);
2412 	job_ptr->resp_host    = resp_host;
2413 	resp_host             = NULL;	/* reused, nothing left to free */
2414 	job_ptr->resize_time  = resize_time;
2415 	job_ptr->restart_cnt  = restart_cnt;
2416 	job_ptr->resv_id      = resv_id;
2417 	job_ptr->resv_name    = resv_name;
2418 	resv_name             = NULL;	/* reused, nothing left to free */
2419 	job_ptr->select_jobinfo = select_jobinfo;
2420 	job_ptr->job_resrcs   = job_resources;
2421 	job_ptr->spank_job_env = spank_job_env;
2422 	job_ptr->spank_job_env_size = spank_job_env_size;
2423 	job_ptr->start_time   = start_time;
2424 	job_ptr->state_reason = state_reason;
2425 	job_ptr->state_reason_prev_db = state_reason_prev_db;
2426 	job_ptr->state_desc   = state_desc;
2427 	state_desc            = NULL;	/* reused, nothing left to free */
2428 	job_ptr->suspend_time = suspend_time;
2429 	job_ptr->deadline     = deadline;
2430 	if (task_id_size != NO_VAL) {
2431 		if (!job_ptr->array_recs)
2432 			job_ptr->array_recs=xmalloc(sizeof(job_array_struct_t));
2433 		FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
2434 		xfree(job_ptr->array_recs->task_id_str);
2435 		if (task_id_size) {
2436 			job_ptr->array_recs->task_id_bitmap =
2437 				bit_alloc(task_id_size);
2438 			if (task_id_str) {
2439 				bit_unfmt_hexmask(
2440 					job_ptr->array_recs->task_id_bitmap,
2441 					task_id_str);
2442 				job_ptr->array_recs->task_id_str = task_id_str;
2443 				task_id_str = NULL;
2444 			}
2445 			job_ptr->array_recs->task_cnt =
2446 				bit_set_count(job_ptr->array_recs->
2447 					      task_id_bitmap);
2448 
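			/*
			 * The record itself presumably already counts as one
			 * job, so only the additional array tasks are added
			 * to the global job_count.
			 */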
2449 			if (job_ptr->array_recs->task_cnt > 1)
2450 				job_count += (job_ptr->array_recs->task_cnt-1);
2451 		} else
2452 			xfree(task_id_str);
2453 		job_ptr->array_recs->array_flags    = array_flags;
2454 		job_ptr->array_recs->max_run_tasks  = max_run_tasks;
2455 		job_ptr->array_recs->tot_run_tasks  = tot_run_tasks;
2456 		job_ptr->array_recs->min_exit_code  = min_exit_code;
2457 		job_ptr->array_recs->max_exit_code  = max_exit_code;
2458 		job_ptr->array_recs->tot_comp_tasks = tot_comp_tasks;
2459 	}
2460 	job_ptr->time_last_active = now;
2461 	job_ptr->time_limit   = time_limit;
2462 	job_ptr->time_min     = time_min;
2463 	job_ptr->total_cpus   = total_cpus;
2464 
2465 	if (IS_JOB_PENDING(job_ptr))
2466 		job_ptr->node_cnt_wag = total_nodes;
2467 	else
2468 		job_ptr->total_nodes  = total_nodes;
2469 
2470 	job_ptr->cpu_cnt      = cpu_cnt;
2471 	job_ptr->tot_sus_time = tot_sus_time;
2472 	job_ptr->last_sched_eval = last_sched_eval;
2473 	job_ptr->preempt_time = preempt_time;
2474 	job_ptr->user_id      = user_id;
2475 	job_ptr->wait_all_nodes = wait_all_nodes;
2476 	job_ptr->warn_flags   = warn_flags;
2477 	job_ptr->warn_signal  = warn_signal;
2478 	job_ptr->warn_time    = warn_time;
2479 
2480 	memcpy(&job_ptr->limit_set, &limit_set,
2481 	       sizeof(acct_policy_limit_set_t));
2482 	limit_set.tres = NULL;
2483 
2484 	job_ptr->req_switch      = req_switch;
2485 	job_ptr->wait4switch     = wait4switch;
2486 	job_ptr->profile         = profile;
2487 	job_ptr->db_flags        = db_flags;
2488 	/*
2489 	 * This always needs to be initialized to "true".  The select
2490 	 * plugin will handle it each time it goes through its logic
2491 	 * when req_switch or wait4switch are set.
2492 	 */
2493 	job_ptr->best_switch     = true;
2494 	job_ptr->start_protocol_ver = start_protocol_ver;
2495 
2496 	_add_job_hash(job_ptr);
2497 	_add_job_array_hash(job_ptr);
2498 
2499 	memset(&assoc_rec, 0, sizeof(assoc_rec));
2500 
2501 	/*
2502 	 * For speed and accuracy we will first see if we once had an
2503 	 * association record.  If not, look it up by
2504 	 * account, partition and user_id.
2505 	 */
2506 	if (job_ptr->assoc_id)
2507 		assoc_rec.id = job_ptr->assoc_id;
2508 	else {
2509 		assoc_rec.acct      = job_ptr->account;
2510 		if (job_ptr->part_ptr)
2511 			assoc_rec.partition = job_ptr->part_ptr->name;
2512 		assoc_rec.uid       = job_ptr->user_id;
2513 	}
2514 
2515 	assoc_mgr_lock(&locks);
2516 	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
2517 				    accounting_enforce,
2518 				    &job_ptr->assoc_ptr, true) &&
2519 	    (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
2520 	    && (!IS_JOB_FINISHED(job_ptr))) {
2521 		_job_fail_account(job_ptr, __func__);
2522 	} else {
2523 		job_ptr->assoc_id = assoc_rec.id;
2524 		info("Recovered %pJ Assoc=%u", job_ptr, job_ptr->assoc_id);
2525 
2526 		if (job_ptr->state_reason == FAIL_ACCOUNT) {
2527 			job_ptr->state_reason = WAIT_NO_REASON;
2528 			xfree(job_ptr->state_desc);
2529 		}
2530 
2531 		/* make sure we have started this job in accounting */
2532 		if (!job_ptr->db_index) {
2533 			debug("starting %pJ in accounting", job_ptr);
2534 			if (!with_slurmdbd)
2535 				jobacct_storage_g_job_start(
2536 					acct_db_conn, job_ptr);
2537 			if (slurmctld_init_db
2538 			    && IS_JOB_SUSPENDED(job_ptr)) {
2539 				jobacct_storage_g_job_suspend(acct_db_conn,
2540 							      job_ptr);
2541 			}
2542 		}
2543 		/* make sure we have this job completed in the database */
2544 		if (IS_JOB_FINISHED(job_ptr)) {
2545 			if (slurmctld_init_db &&
2546 			    !(job_ptr->bit_flags & TRES_STR_CALC) &&
2547 			    job_ptr->tres_alloc_cnt &&
2548 			    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
2549 				set_job_tres_alloc_str(job_ptr, false);
2550 			jobacct_storage_g_job_complete(
2551 				acct_db_conn, job_ptr);
2552 			job_finished = true;
2553 		}
2554 	}
2555 
2556 	if (!job_finished && job_ptr->qos_id &&
2557 	    (job_ptr->state_reason != FAIL_ACCOUNT)) {
2558 		memset(&qos_rec, 0, sizeof(qos_rec));
2559 		qos_rec.id = job_ptr->qos_id;
2560 		job_ptr->qos_ptr = _determine_and_validate_qos(
2561 			job_ptr->resv_name, job_ptr->assoc_ptr,
2562 			job_ptr->limit_set.qos, &qos_rec,
2563 			&qos_error, true, LOG_LEVEL_ERROR);
2564 		if ((qos_error != SLURM_SUCCESS) && !job_ptr->limit_set.qos) {
2565 			job_fail_qos(job_ptr, __func__);
2566 		} else {
2567 			job_ptr->qos_id = qos_rec.id;
2568 			if (job_ptr->state_reason == FAIL_QOS) {
2569 				job_ptr->state_reason = WAIT_NO_REASON;
2570 				xfree(job_ptr->state_desc);
2571 			}
2572 		}
2573 	}
2574 
2575 	/*
2576 	 * do this after the format string just in case for some
2577 	 * reason the tres_alloc_str is NULL but not the fmt_str
2578 	 */
2579 	if (job_ptr->tres_alloc_str)
2580 		assoc_mgr_set_tres_cnt_array(
2581 			&job_ptr->tres_alloc_cnt, job_ptr->tres_alloc_str,
2582 			0, true);
2583 	else
2584 		job_set_alloc_tres(job_ptr, true);
2585 
2586 	if (job_ptr->tres_req_str)
2587 		assoc_mgr_set_tres_cnt_array(
2588 			&job_ptr->tres_req_cnt, job_ptr->tres_req_str, 0, true);
2589 	else
2590 		job_set_req_tres(job_ptr, true);
2591 	assoc_mgr_unlock(&locks);
2592 
2593 	build_node_details(job_ptr, false);	/* set node_addr */
2594 	gres_build_job_details(job_ptr->gres_list,
2595 			       &job_ptr->gres_detail_cnt,
2596 			       &job_ptr->gres_detail_str,
2597 			       &job_ptr->gres_used);
2598 	job_ptr->clusters     = clusters;
2599 	job_ptr->fed_details  = job_fed_details;
2600 	return SLURM_SUCCESS;
2601 
2602 unpack_error:
2603 	error("Incomplete job record");
2604 	rc = SLURM_ERROR;
2605 
2606 free_it:
2607 	xfree(alloc_node);
2608 	xfree(account);
2609 	xfree(admin_comment);
2610 	xfree(batch_features);
2611 	xfree(batch_host);
2612 	xfree(burst_buffer);
2613 	xfree(clusters);
2614 	xfree(comment);
2615 	xfree(gres_alloc);
2616 	xfree(gres_req);
2617 	xfree(gres_used);
2618 	xfree(het_job_id_set);
2619 	free_job_fed_details(&job_fed_details);
2620 	free_job_resources(&job_resources);
2621 	xfree(resp_host);
2622 	xfree(licenses);
2623 	xfree(limit_set.tres);
2624 	xfree(mail_user);
2625 	xfree(mcs_label);
2626 	xfree(name);
2627 	xfree(nodes);
2628 	xfree(nodes_completing);
2629 	xfree(partition);
2630 	FREE_NULL_LIST(part_ptr_list);
2631 	xfree(resv_name);
2632 	for (i = 0; i < spank_job_env_size; i++)
2633 		xfree(spank_job_env[i]);
2634 	xfree(spank_job_env);
2635 	xfree(state_desc);
2636 	xfree(system_comment);
2637 	xfree(task_id_str);
2638 	xfree(tres_alloc_str);
2639 	xfree(tres_fmt_alloc_str);
2640 	xfree(tres_fmt_req_str);
2641 	xfree(tres_req_str);
2642 	xfree(user_name);
2643 	xfree(wckey);
2644 	select_g_select_jobinfo_free(select_jobinfo);
2645 	if (job_ptr) {
2646 		if (job_ptr->job_id == 0)
2647 			job_ptr->job_id = NO_VAL;
2648 		purge_job_record(job_ptr->job_id);
2649 	}
2650 	for (i = 0; i < pelog_env_size; i++)
2651 		xfree(pelog_env[i]);
2652 	xfree(pelog_env);
2653 
2654 	return rc;
2655 }
2656 
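/*
 * NOTE: _dump_job_details() and _load_job_details() below form the same
 * pack/unpack pair as the job state routines above; a field added to one
 * must also be added to the other for the same protocol version.
 */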
2657 /*
2658  * _dump_job_details - dump the state of a specific job's details to
2659  *	a buffer
2660  * IN detail_ptr - pointer to job details for which information is requested
2661  * IN/OUT buffer - location to store data, pointers automatically advanced
2662  */
2663 void _dump_job_details(struct job_details *detail_ptr, Buf buffer)
2664 {
2665 	/*
2666 	 * Some job fields can change in the course of scheduling, so we
2667 	 * report the original values supplied by the user rather than
2668 	 * an intermediate value that might be set by our scheduling
2669 	 * logic (e.g. to enforce a partition, association or QOS limit).
2670 	 *
2671 	 * Fields subject to change and their original values are as follows:
2672 	 * min_cpus		orig_min_cpus
2673 	 * max_cpus		orig_max_cpus
2674 	 * cpus_per_task 	orig_cpus_per_task
2675 	 * pn_min_cpus		orig_pn_min_cpus
2676 	 * pn_min_memory	orig_pn_min_memory
2677 	 * dependency		orig_dependency
2678 	 */
2679 	pack32(detail_ptr->orig_min_cpus, buffer);	/* subject to change */
2680 	pack32(detail_ptr->orig_max_cpus, buffer);	/* subject to change */
2681 	pack32(detail_ptr->min_nodes, buffer);
2682 	pack32(detail_ptr->max_nodes, buffer);
2683 	pack32(detail_ptr->num_tasks, buffer);
2684 
2685 	packstr(detail_ptr->acctg_freq, buffer);
2686 	pack16(detail_ptr->contiguous, buffer);
2687 	pack16(detail_ptr->core_spec, buffer);
2688 	pack16(detail_ptr->orig_cpus_per_task, buffer);	/* subject to change */
2689 	pack32(detail_ptr->nice, buffer);
2690 	pack16(detail_ptr->ntasks_per_node, buffer);
2691 	pack16(detail_ptr->requeue, buffer);
2692 	pack32(detail_ptr->task_dist, buffer);
2693 
2694 	pack8(detail_ptr->share_res, buffer);
2695 	pack8(detail_ptr->whole_node, buffer);
2696 
2697 	packstr(detail_ptr->cpu_bind,     buffer);
2698 	pack16(detail_ptr->cpu_bind_type, buffer);
2699 	packstr(detail_ptr->mem_bind,     buffer);
2700 	pack16(detail_ptr->mem_bind_type, buffer);
2701 	pack16(detail_ptr->plane_size, buffer);
2702 
2703 	pack8(detail_ptr->open_mode, buffer);
2704 	pack8(detail_ptr->overcommit, buffer);
2705 	pack8(detail_ptr->prolog_running, buffer);
2706 
2707 	pack32(detail_ptr->orig_pn_min_cpus, buffer);	/* subject to change */
2708 	pack64(detail_ptr->orig_pn_min_memory, buffer);	/* subject to change */
2709 	pack32(detail_ptr->pn_min_tmp_disk, buffer);
2710 	pack32(detail_ptr->cpu_freq_min, buffer);
2711 	pack32(detail_ptr->cpu_freq_max, buffer);
2712 	pack32(detail_ptr->cpu_freq_gov, buffer);
2713 	pack_time(detail_ptr->begin_time, buffer);
2714 	pack_time(detail_ptr->accrue_time, buffer);
2715 	pack_time(detail_ptr->submit_time, buffer);
2716 
2717 	packstr(detail_ptr->req_nodes,  buffer);
2718 	packstr(detail_ptr->exc_nodes,  buffer);
2719 	packstr(detail_ptr->features,   buffer);
2720 	packstr(detail_ptr->cluster_features, buffer);
2721 	pack_dep_list(detail_ptr->depend_list, buffer, SLURM_PROTOCOL_VERSION);
2722 	packstr(detail_ptr->dependency, buffer);
2723 	packstr(detail_ptr->orig_dependency, buffer);	/* subject to change */
2724 
2725 	packstr(detail_ptr->std_err,       buffer);
2726 	packstr(detail_ptr->std_in,        buffer);
2727 	packstr(detail_ptr->std_out,       buffer);
2728 	packstr(detail_ptr->work_dir,  buffer);
2729 
2730 	pack_multi_core_data(detail_ptr->mc_ptr, buffer,
2731 			     SLURM_PROTOCOL_VERSION);
2732 	packstr_array(detail_ptr->argv, detail_ptr->argc, buffer);
2733 	packstr_array(detail_ptr->env_sup, detail_ptr->env_cnt, buffer);
2734 }
2735 
2736 /* _load_job_details - unpack a job's details information from a buffer */
2737 static int _load_job_details(job_record_t *job_ptr, Buf buffer,
2738 			     uint16_t protocol_version)
2739 {
2740 	char *acctg_freq = NULL, *req_nodes = NULL, *exc_nodes = NULL;
2741 	char *features = NULL, *cpu_bind = NULL, *dependency = NULL;
2742 	char *orig_dependency = NULL, *mem_bind = NULL, *cluster_features = NULL;
2743 	char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL;
2744 	char **argv = (char **) NULL, **env_sup = (char **) NULL;
2745 	uint32_t min_nodes, max_nodes;
2746 	uint32_t min_cpus = 1, max_cpus = NO_VAL;
2747 	uint32_t pn_min_cpus, pn_min_tmp_disk;
2748 	uint64_t pn_min_memory;
2749 	uint32_t cpu_freq_min = NO_VAL;
2750 	uint32_t cpu_freq_max = NO_VAL;
2751 	uint32_t cpu_freq_gov = NO_VAL, nice = 0;
2752 	uint32_t num_tasks, name_len, argc = 0, env_cnt = 0, task_dist;
2753 	uint16_t contiguous, core_spec = NO_VAL16;
2754 	uint16_t ntasks_per_node, cpus_per_task, requeue;
2755 	uint16_t cpu_bind_type, mem_bind_type, plane_size;
2756 	uint8_t open_mode, overcommit, prolog_running;
2757 	uint8_t share_res, whole_node;
2758 	time_t begin_time, accrue_time = 0, submit_time;
2759 	int i;
2760 	List depend_list = NULL;
2761 	multi_core_data_t *mc_ptr;
2762 
2763 	/* unpack the job's details from the buffer */
2764 	if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
2765 		safe_unpack32(&min_cpus, buffer);
2766 		safe_unpack32(&max_cpus, buffer);
2767 		safe_unpack32(&min_nodes, buffer);
2768 		safe_unpack32(&max_nodes, buffer);
2769 		safe_unpack32(&num_tasks, buffer);
2770 
2771 		safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer);
2772 		safe_unpack16(&contiguous, buffer);
2773 		safe_unpack16(&core_spec, buffer);
2774 		safe_unpack16(&cpus_per_task, buffer);
2775 		safe_unpack32(&nice, buffer);
2776 		safe_unpack16(&ntasks_per_node, buffer);
2777 		safe_unpack16(&requeue, buffer);
2778 		safe_unpack32(&task_dist, buffer);
2779 
2780 		safe_unpack8(&share_res, buffer);
2781 		safe_unpack8(&whole_node, buffer);
2782 
2783 		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
2784 		safe_unpack16(&cpu_bind_type, buffer);
2785 		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
2786 		safe_unpack16(&mem_bind_type, buffer);
2787 		safe_unpack16(&plane_size, buffer);
2788 
2789 		safe_unpack8(&open_mode, buffer);
2790 		safe_unpack8(&overcommit, buffer);
2791 		safe_unpack8(&prolog_running, buffer);
2792 
2793 		safe_unpack32(&pn_min_cpus, buffer);
2794 		safe_unpack64(&pn_min_memory, buffer);
2795 		safe_unpack32(&pn_min_tmp_disk, buffer);
2796 		safe_unpack32(&cpu_freq_min, buffer);
2797 		safe_unpack32(&cpu_freq_max, buffer);
2798 		safe_unpack32(&cpu_freq_gov, buffer);
2799 		safe_unpack_time(&begin_time, buffer);
2800 		safe_unpack_time(&accrue_time, buffer);
2801 		safe_unpack_time(&submit_time, buffer);
2802 
2803 		safe_unpackstr_xmalloc(&req_nodes,  &name_len, buffer);
2804 		safe_unpackstr_xmalloc(&exc_nodes,  &name_len, buffer);
2805 		safe_unpackstr_xmalloc(&features,   &name_len, buffer);
2806 		safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer);
2807 		unpack_dep_list(&depend_list, buffer, protocol_version);
2808 		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
2809 		safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer);
2810 
2811 		safe_unpackstr_xmalloc(&err, &name_len, buffer);
2812 		safe_unpackstr_xmalloc(&in,  &name_len, buffer);
2813 		safe_unpackstr_xmalloc(&out, &name_len, buffer);
2814 		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
2815 
2816 		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
2817 			goto unpack_error;
2818 		safe_unpackstr_array(&argv, &argc, buffer);
2819 		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
2820 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
2821 		char *temp_str;
2822 
2823 		safe_unpack32(&min_cpus, buffer);
2824 		safe_unpack32(&max_cpus, buffer);
2825 		safe_unpack32(&min_nodes, buffer);
2826 		safe_unpack32(&max_nodes, buffer);
2827 		safe_unpack32(&num_tasks, buffer);
2828 
2829 		safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer);
2830 		safe_unpack16(&contiguous, buffer);
2831 		safe_unpack16(&core_spec, buffer);
2832 		safe_unpack16(&cpus_per_task, buffer);
2833 		safe_unpack32(&nice, buffer);
2834 		safe_unpack16(&ntasks_per_node, buffer);
2835 		safe_unpack16(&requeue, buffer);
2836 		safe_unpack32(&task_dist, buffer);
2837 
2838 		safe_unpack8(&share_res, buffer);
2839 		safe_unpack8(&whole_node, buffer);
2840 
2841 		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
2842 		safe_unpack16(&cpu_bind_type, buffer);
2843 		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
2844 		safe_unpack16(&mem_bind_type, buffer);
2845 		safe_unpack16(&plane_size, buffer);
2846 
2847 		safe_unpack8(&open_mode, buffer);
2848 		safe_unpack8(&overcommit, buffer);
2849 		safe_unpack8(&prolog_running, buffer);
2850 
2851 		safe_unpack32(&pn_min_cpus, buffer);
2852 		safe_unpack64(&pn_min_memory, buffer);
2853 		safe_unpack32(&pn_min_tmp_disk, buffer);
2854 		safe_unpack32(&cpu_freq_min, buffer);
2855 		safe_unpack32(&cpu_freq_max, buffer);
2856 		safe_unpack32(&cpu_freq_gov, buffer);
2857 		safe_unpack_time(&begin_time, buffer);
2858 		safe_unpack_time(&accrue_time, buffer);
2859 		safe_unpack_time(&submit_time, buffer);
2860 
2861 		safe_unpackstr_xmalloc(&req_nodes,  &name_len, buffer);
2862 		safe_unpackstr_xmalloc(&exc_nodes,  &name_len, buffer);
2863 		safe_unpackstr_xmalloc(&features,   &name_len, buffer);
2864 		safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer);
2865 		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
2866 		safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer);
2867 
2868 		safe_unpackstr_xmalloc(&err, &name_len, buffer);
2869 		safe_unpackstr_xmalloc(&in,  &name_len, buffer);
2870 		safe_unpackstr_xmalloc(&out, &name_len, buffer);
2871 		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
2872 		safe_unpackstr_xmalloc(&temp_str, &name_len, buffer);
2873 		xfree(temp_str); /* was ckpt_dir */
2874 		safe_unpackstr_xmalloc(&temp_str, &name_len, buffer);
2875 		xfree(temp_str); /* was restart_dir */
2876 
2877 		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
2878 			goto unpack_error;
2879 		safe_unpackstr_array(&argv, &argc, buffer);
2880 		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
2881 	} else {
2882 		error("_load_job_details: protocol_version "
2883 		      "%hu not supported", protocol_version);
2884 		goto unpack_error;
2885 	}
2886 
2887 	/* validity test as possible */
2888 	if (contiguous > 1) {
2889 		error("Invalid data for %pJ: contiguous=%u",
2890 		      job_ptr, contiguous);
2891 		goto unpack_error;
2892 	}
2893 	if ((requeue > 1) || (overcommit > 1)) {
2894 		error("Invalid data for %pJ: requeue=%u overcommit=%u",
2895 		      job_ptr, requeue, overcommit);
2896 		goto unpack_error;
2897 	}
2898 	if (prolog_running > 4) {
2899 		error("Invalid data for %pJ: prolog_running=%u",
2900 		      job_ptr, prolog_running);
2901 		goto unpack_error;
2902 	}
2903 
2904 	/* free any left-over detail data */
2905 	xfree(job_ptr->details->acctg_freq);
2906 	for (i=0; i<job_ptr->details->argc; i++)
2907 		xfree(job_ptr->details->argv[i]);
2908 	xfree(job_ptr->details->argv);
2909 	xfree(job_ptr->details->cpu_bind);
2910 	FREE_NULL_LIST(job_ptr->details->depend_list);
2911 	xfree(job_ptr->details->dependency);
2912 	xfree(job_ptr->details->orig_dependency);
2913 	xfree(job_ptr->details->std_err);
2914 	for (i=0; i<job_ptr->details->env_cnt; i++)
2915 		xfree(job_ptr->details->env_sup[i]);
2916 	xfree(job_ptr->details->env_sup);
2917 	xfree(job_ptr->details->exc_nodes);
2918 	xfree(job_ptr->details->features);
2919 	xfree(job_ptr->details->cluster_features);
2920 	xfree(job_ptr->details->std_in);
2921 	xfree(job_ptr->details->mem_bind);
2922 	xfree(job_ptr->details->std_out);
2923 	xfree(job_ptr->details->req_nodes);
2924 	xfree(job_ptr->details->work_dir);
2925 
2926 	/* now put the details into the job record */
2927 	job_ptr->details->acctg_freq = acctg_freq;
2928 	job_ptr->details->argc = argc;
2929 	job_ptr->details->argv = argv;
2930 	job_ptr->details->accrue_time = accrue_time;
2931 	job_ptr->details->begin_time = begin_time;
2932 	job_ptr->details->contiguous = contiguous;
2933 	job_ptr->details->core_spec = core_spec;
2934 	job_ptr->details->cpu_bind = cpu_bind;
2935 	job_ptr->details->cpu_bind_type = cpu_bind_type;
2936 	job_ptr->details->cpu_freq_min = cpu_freq_min;
2937 	job_ptr->details->cpu_freq_max = cpu_freq_max;
2938 	job_ptr->details->cpu_freq_gov = cpu_freq_gov;
2939 	if (cpus_per_task != NO_VAL16)
2940 		job_ptr->details->cpus_per_task = cpus_per_task;
2941 	else
2942 		job_ptr->details->cpus_per_task = 1;
2943 	job_ptr->details->orig_cpus_per_task = cpus_per_task;
2944 	job_ptr->details->depend_list = depend_list;
2945 	job_ptr->details->dependency = dependency;
2946 	job_ptr->details->orig_dependency = orig_dependency;
2947 	job_ptr->details->env_cnt = env_cnt;
2948 	job_ptr->details->env_sup = env_sup;
2949 	job_ptr->details->std_err = err;
2950 	job_ptr->details->exc_nodes = exc_nodes;
2951 	job_ptr->details->features = features;
2952 	job_ptr->details->cluster_features = cluster_features;
2953 	job_ptr->details->std_in = in;
2954 	job_ptr->details->pn_min_cpus = pn_min_cpus;
2955 	job_ptr->details->orig_pn_min_cpus = pn_min_cpus;
2956 	job_ptr->details->pn_min_memory = pn_min_memory;
2957 	job_ptr->details->orig_pn_min_memory = pn_min_memory;
2958 	job_ptr->details->pn_min_tmp_disk = pn_min_tmp_disk;
2959 	job_ptr->details->max_cpus = max_cpus;
2960 	job_ptr->details->orig_max_cpus = max_cpus;
2961 	job_ptr->details->max_nodes = max_nodes;
2962 	job_ptr->details->mc_ptr = mc_ptr;
2963 	job_ptr->details->mem_bind = mem_bind;
2964 	job_ptr->details->mem_bind_type = mem_bind_type;
2965 	job_ptr->details->min_cpus = min_cpus;
2966 	job_ptr->details->orig_min_cpus = min_cpus;
2967 	job_ptr->details->min_nodes = min_nodes;
2968 	job_ptr->details->nice = nice;
2969 	job_ptr->details->ntasks_per_node = ntasks_per_node;
2970 	job_ptr->details->num_tasks = num_tasks;
2971 	job_ptr->details->open_mode = open_mode;
2972 	job_ptr->details->std_out = out;
2973 	job_ptr->details->overcommit = overcommit;
2974 	job_ptr->details->plane_size = plane_size;
2975 	job_ptr->details->prolog_running = prolog_running;
2976 	job_ptr->details->req_nodes = req_nodes;
2977 	job_ptr->details->requeue = requeue;
2978 	job_ptr->details->share_res = share_res;
2979 	job_ptr->details->submit_time = submit_time;
2980 	job_ptr->details->task_dist = task_dist;
2981 	job_ptr->details->whole_node = whole_node;
2982 	job_ptr->details->work_dir = work_dir;
2983 
2984 	return SLURM_SUCCESS;
2985 
2986 unpack_error:
2987 
2988 /*	for (i=0; i<argc; i++)
2989 	xfree(argv[i]);  Don't trust this on unpack error */
2990 	xfree(acctg_freq);
2991 	xfree(argv);
2992 	xfree(cpu_bind);
2993 	xfree(dependency);
2994 	xfree(orig_dependency);
2995 /*	for (i=0; i<env_cnt; i++)
2996 	xfree(env_sup[i]);  Don't trust this on unpack error */
2997 	xfree(env_sup);
2998 	xfree(err);
2999 	xfree(exc_nodes);
3000 	xfree(features);
3001 	xfree(cluster_features);
3002 	xfree(in);
3003 	xfree(mem_bind);
3004 	xfree(out);
3005 	xfree(req_nodes);
3006 	xfree(work_dir);
3007 	return SLURM_ERROR;
3008 }
3009 
3010 /* _add_job_hash - add a job hash entry for given job record, job_id must
3011  *	already be set
3012  * IN job_ptr - pointer to job record
3013  * Globals: hash table updated
3014  */
3015 static void _add_job_hash(job_record_t *job_ptr)
3016 {
3017 	int inx;
3018 
3019 	inx = JOB_HASH_INX(job_ptr->job_id);
3020 	job_ptr->job_next = job_hash[inx];
3021 	job_hash[inx] = job_ptr;
3022 }
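
/*
 * For reference, a minimal sketch of how the chained table built above is
 * walked (this is what find_job_record() below does); it assumes the
 * JOB_HASH_INX() macro and the global job_hash[] table used in this file,
 * and some_job_id is just a placeholder:
 *
 *	job_record_t *jp = job_hash[JOB_HASH_INX(some_job_id)];
 *	while (jp && (jp->job_id != some_job_id))
 *		jp = jp->job_next;
 *	// jp now points at the matching record, or is NULL if none exists
 */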
3023 
3024 /* _remove_job_hash - remove a job hash entry for given job record, job_id must
3025  *	already be set
3026  * IN job_ptr - pointer to job record
3027  * IN type - which hash to work with
3028  * Globals: hash table updated
3029  */
3030 static void _remove_job_hash(job_record_t *job_entry, job_hash_type_t type)
3031 {
3032 	job_record_t *job_ptr, **job_pptr;
3033 
3034 	xassert(job_entry);
3035 
3036 	switch (type) {
3037 	case JOB_HASH_JOB:
3038 		job_pptr = &job_hash[JOB_HASH_INX(job_entry->job_id)];
3039 		break;
3040 	case JOB_HASH_ARRAY_JOB:
3041 		job_pptr = &job_array_hash_j[
3042 			JOB_HASH_INX(job_entry->array_job_id)];
3043 		break;
3044 	case JOB_HASH_ARRAY_TASK:
3045 		job_pptr = &job_array_hash_t[
3046 			JOB_ARRAY_HASH_INX(job_entry->array_job_id,
3047 					   job_entry->array_task_id)];
3048 		break;
3049 	default:
3050 		fatal("%s: unknown job_hash_type_t %d", __func__, type);
3051 		return;
3052 	}
3053 
3054 	while ((job_pptr != NULL) && (*job_pptr != NULL) &&
3055 	       ((job_ptr = *job_pptr) != job_entry)) {
3056 		xassert(job_ptr->magic == JOB_MAGIC);
3057 		switch (type) {
3058 		case JOB_HASH_JOB:
3059 			job_pptr = &job_ptr->job_next;
3060 			break;
3061 		case JOB_HASH_ARRAY_JOB:
3062 			job_pptr = &job_ptr->job_array_next_j;
3063 			break;
3064 		case JOB_HASH_ARRAY_TASK:
3065 			job_pptr = &job_ptr->job_array_next_t;
3066 			break;
3067 		}
3068 	}
3069 
3070 	if (job_pptr == NULL || *job_pptr == NULL) {
3071 		if (job_entry->job_id == NO_VAL)
3072 			return;
3073 
3074 		switch (type) {
3075 		case JOB_HASH_JOB:
3076 			error("%s: Could not find hash entry for JobId=%u",
3077 			      __func__, job_entry->job_id);
3078 			break;
3079 		case JOB_HASH_ARRAY_JOB:
3080 			error("%s: job array hash error %u", __func__,
3081 			      job_entry->array_job_id);
3082 			break;
3083 		case JOB_HASH_ARRAY_TASK:
3084 			error("%s: job array, task ID hash error %u_%u",
3085 			      __func__,
3086 			      job_entry->array_job_id,
3087 			      job_entry->array_task_id);
3088 			break;
3089 		}
3090 		return;
3091 	}
3092 
3093 	switch (type) {
3094 	case JOB_HASH_JOB:
3095 		*job_pptr = job_entry->job_next;
3096 		job_entry->job_next = NULL;
3097 		break;
3098 	case JOB_HASH_ARRAY_JOB:
3099 		*job_pptr = job_entry->job_array_next_j;
3100 		job_entry->job_array_next_j = NULL;
3101 		break;
3102 	case JOB_HASH_ARRAY_TASK:
3103 		*job_pptr = job_entry->job_array_next_t;
3104 		job_entry->job_array_next_t = NULL;
3105 		break;
3106 	}
3107 }
3108 
3109 /* _add_job_array_hash - add a job hash entry for given job record,
3110  *	array_job_id and array_task_id must already be set
3111  * IN job_ptr - pointer to job record
3112  * Globals: hash table updated
3113  */
3114 void _add_job_array_hash(job_record_t *job_ptr)
3115 {
3116 	int inx;
3117 
3118 	if (job_ptr->array_task_id == NO_VAL)
3119 		return;	/* Not a job array */
3120 
3121 	inx = JOB_HASH_INX(job_ptr->array_job_id);
3122 	job_ptr->job_array_next_j = job_array_hash_j[inx];
3123 	job_array_hash_j[inx] = job_ptr;
3124 
3125 	inx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,job_ptr->array_task_id);
3126 	job_ptr->job_array_next_t = job_array_hash_t[inx];
3127 	job_array_hash_t[inx] = job_ptr;
3128 }
3129 
3130 /* For the job array data structure, build the string representation of the
3131  * bitmap.
3132  * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */
3133 extern void build_array_str(job_record_t *job_ptr)
3134 {
3135 	job_array_struct_t *array_recs = job_ptr->array_recs;
3136 
3137 	if (!array_recs || array_recs->task_id_str ||
3138 	    !array_recs->task_id_bitmap ||
3139 	    (job_ptr->array_task_id != NO_VAL) ||
3140 	    (bit_ffs(job_ptr->array_recs->task_id_bitmap) == -1))
3141 		return;
3142 
3143 
3144 	array_recs->task_id_str = bit_fmt_hexmask(array_recs->task_id_bitmap);
3145 
3146 	/* While it is efficient to set the db_index to 0 here so the
3147 	 * database updates the record for the pending tasks, doing so
3148 	 * also creates a window in which, if the association id changes
3149 	 * (different account or partition), a new db_index would be
3150 	 * created instead of the previous one being returned (as
3151 	 * expected), leaving the old record orphaned.  Setting the
3152 	 * job_state instead preserves the db_index while the start
3153 	 * message is still sent to get the desired behavior. */
3154 
3155 	/* Set the JOB_UPDATE_DB flag so the job start is resent,
3156 	 * updating the array task string and the count of pending
3157 	 * tasks.  Batching the update this way is cheaper than
3158 	 * resending the start message each time, since this can happen
3159 	 * many times in quick succession (e.g. many array elements
3160 	 * starting at once) rather than just every so often.
3161 	 */
3162 
3163 	if (job_ptr->db_index)
3164 		job_ptr->job_state |= JOB_UPDATE_DB;
3165 }
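
/*
 * Example (illustrative): for a task_id_bitmap of size 16 with tasks 0-3
 * still pending, bit_fmt_hexmask() yields a compact string such as
 * "0x000F", which scales far better than the range strings produced by
 * bit_fmt() when an array has many, sparsely distributed tasks.
 */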
3166 
3167 /* Return true if ALL tasks of a specific array job ID are complete */
3168 extern bool test_job_array_complete(uint32_t array_job_id)
3169 {
3170 	job_record_t *job_ptr;
3171 	int inx;
3172 
3173 	job_ptr = find_job_record(array_job_id);
3174 	if (job_ptr) {
3175 		if (!IS_JOB_COMPLETE(job_ptr))
3176 			return false;
3177 		if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code)
3178 			return false;
3179 	}
3180 
3181 	/* Need to test individual job array records */
3182 	inx = JOB_HASH_INX(array_job_id);
3183 	job_ptr = job_array_hash_j[inx];
3184 	while (job_ptr) {
3185 		if (job_ptr->array_job_id == array_job_id) {
3186 			if (!IS_JOB_COMPLETE(job_ptr))
3187 				return false;
3188 		}
3189 		job_ptr = job_ptr->job_array_next_j;
3190 	}
3191 	return true;
3192 }
3193 
3194 /* Return true if ALL tasks of a specific array job ID are completed */
3195 extern bool test_job_array_completed(uint32_t array_job_id)
3196 {
3197 	job_record_t *job_ptr;
3198 	int inx;
3199 
3200 	job_ptr = find_job_record(array_job_id);
3201 	if (job_ptr) {
3202 		if (!IS_JOB_COMPLETED(job_ptr))
3203 			return false;
3204 	}
3205 
3206 	/* Need to test individual job array records */
3207 	inx = JOB_HASH_INX(array_job_id);
3208 	job_ptr = job_array_hash_j[inx];
3209 	while (job_ptr) {
3210 		if (job_ptr->array_job_id == array_job_id) {
3211 			if (!IS_JOB_COMPLETED(job_ptr))
3212 				return false;
3213 		}
3214 		job_ptr = job_ptr->job_array_next_j;
3215 	}
3216 	return true;
3217 }
3218 
3219 /*
3220  * Return true if ALL tasks of a specific array job ID are completed AND
3221  * all except for the head job have been purged.
3222  */
3223 extern bool _test_job_array_purged(uint32_t array_job_id)
3224 {
3225 	job_record_t *job_ptr, *head_job_ptr;
3226 	int inx;
3227 
3228 	head_job_ptr = find_job_record(array_job_id);
3229 	if (head_job_ptr) {
3230 		if (!IS_JOB_COMPLETED(head_job_ptr))
3231 			return false;
3232 	}
3233 
3234 	/* Need to test individual job array records */
3235 	inx = JOB_HASH_INX(array_job_id);
3236 	job_ptr = job_array_hash_j[inx];
3237 	while (job_ptr) {
3238 		if ((job_ptr->array_job_id == array_job_id) &&
3239 		    (job_ptr != head_job_ptr)) {
3240 			return false;
3241 		}
3242 		job_ptr = job_ptr->job_array_next_j;
3243 	}
3244 	return true;
3245 }
3246 
3247 /* Return true if ALL tasks of a specific array job ID are finished */
3248 extern bool test_job_array_finished(uint32_t array_job_id)
3249 {
3250 	job_record_t *job_ptr;
3251 	int inx;
3252 
3253 	job_ptr = find_job_record(array_job_id);
3254 	if (job_ptr) {
3255 		if (!IS_JOB_FINISHED(job_ptr))
3256 			return false;
3257 	}
3258 
3259 	/* Need to test individual job array records */
3260 	inx = JOB_HASH_INX(array_job_id);
3261 	job_ptr = job_array_hash_j[inx];
3262 	while (job_ptr) {
3263 		if (job_ptr->array_job_id == array_job_id) {
3264 			if (!IS_JOB_FINISHED(job_ptr))
3265 				return false;
3266 		}
3267 		job_ptr = job_ptr->job_array_next_j;
3268 	}
3269 
3270 	return true;
3271 }
3272 
3273 /* Return true if ANY task of a specific array job ID is pending */
3274 extern bool test_job_array_pending(uint32_t array_job_id)
3275 {
3276 	job_record_t *job_ptr;
3277 	int inx;
3278 
3279 	job_ptr = find_job_record(array_job_id);
3280 	if (job_ptr) {
3281 		if (IS_JOB_PENDING(job_ptr))
3282 			return true;
3283 		if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
3284 			return true;
3285 	}
3286 
3287 	/* Need to test individual job array records */
3288 	inx = JOB_HASH_INX(array_job_id);
3289 	job_ptr = job_array_hash_j[inx];
3290 	while (job_ptr) {
3291 		if (job_ptr->array_job_id == array_job_id) {
3292 			if (IS_JOB_PENDING(job_ptr))
3293 				return true;
3294 		}
3295 		job_ptr = job_ptr->job_array_next_j;
3296 	}
3297 	return false;
3298 }
3299 
3300 /* For a given job ID return the number of PENDING tasks which have their
3301  * own separate job_record (do not count tasks in pending META job record) */
3302 extern int num_pending_job_array_tasks(uint32_t array_job_id)
3303 {
3304 	job_record_t *job_ptr;
3305 	int count = 0, inx;
3306 
3307 	inx = JOB_HASH_INX(array_job_id);
3308 	job_ptr = job_array_hash_j[inx];
3309 	while (job_ptr) {
3310 		if ((job_ptr->array_job_id == array_job_id) &&
3311 		    IS_JOB_PENDING(job_ptr))
3312 			count++;
3313 		job_ptr = job_ptr->job_array_next_j;
3314 	}
3315 
3316 	return count;
3317 }
3318 
3319 /*
3320  * find_job_array_rec - return a pointer to the job record with the given
3321  *	array_job_id/array_task_id
3322  * IN job_id - requested job's id
3323  * IN array_task_id - requested job's task id,
3324  *		      NO_VAL if none specified (i.e. not a job array)
3325  *		      INFINITE return any task for specified job id
3326  * RET pointer to the job's record, NULL on error
3327  */
3328 extern job_record_t *find_job_array_rec(uint32_t array_job_id,
3329 					uint32_t array_task_id)
3330 {
3331 	job_record_t *job_ptr, *match_job_ptr = NULL;
3332 	int inx;
3333 
3334 	if (array_task_id == NO_VAL)
3335 		return find_job_record(array_job_id);
3336 
3337 	if (array_task_id == INFINITE) {	/* find by job ID */
3338 		/* Look for job record with all of the pending tasks */
3339 		job_ptr = find_job_record(array_job_id);
3340 		if (job_ptr && job_ptr->array_recs &&
3341 		    (job_ptr->array_job_id == array_job_id))
3342 			return job_ptr;
3343 
3344 		inx = JOB_HASH_INX(array_job_id);
3345 		job_ptr = job_array_hash_j[inx];
3346 		while (job_ptr) {
3347 			if (job_ptr->array_job_id == array_job_id) {
3348 				match_job_ptr = job_ptr;
3349 				if (!IS_JOB_FINISHED(job_ptr)) {
3350 					return job_ptr;
3351 				}
3352 			}
3353 			job_ptr = job_ptr->job_array_next_j;
3354 		}
3355 		return match_job_ptr;
3356 	} else {		/* Find specific task ID */
3357 		inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id);
3358 		job_ptr = job_array_hash_t[inx];
3359 		while (job_ptr) {
3360 			if ((job_ptr->array_job_id == array_job_id) &&
3361 			    (job_ptr->array_task_id == array_task_id)) {
3362 				return job_ptr;
3363 			}
3364 			job_ptr = job_ptr->job_array_next_t;
3365 		}
3366 		/* Look for job record with all of the pending tasks */
3367 		job_ptr = find_job_record(array_job_id);
3368 		if (job_ptr && job_ptr->array_recs &&
3369 		    job_ptr->array_recs->task_id_bitmap) {
3370 			inx = bit_size(job_ptr->array_recs->task_id_bitmap);
3371 			if ((array_task_id < inx) &&
3372 			    bit_test(job_ptr->array_recs->task_id_bitmap,
3373 				     array_task_id)) {
3374 				return job_ptr;
3375 			}
3376 		}
3377 		return NULL;	/* None found */
3378 	}
3379 }
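
/*
 * Illustrative usage sketch (not compiled), assuming JobId 123 is a job
 * array; the variable names are placeholders:
 *
 *	job_record_t *meta, *any, *task7;
 *	meta  = find_job_array_rec(123, NO_VAL);    // plain job ID lookup
 *	any   = find_job_array_rec(123, INFINITE);  // any task, preferring
 *						    // an unfinished one
 *	task7 = find_job_array_rec(123, 7);         // specific task 123_7
 */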
3380 
3381 /*
3382  * find_het_job_record - return a pointer to the job record with the given ID
3383  * IN job_id - requested job's ID
3384  * IN het_job_id - hetjob component ID
3385  * RET pointer to the job's record, NULL on error
3386  */
3387 extern job_record_t *find_het_job_record(uint32_t job_id, uint32_t het_job_id)
3388 {
3389 	job_record_t *het_job_leader, *het_job;
3390 	ListIterator iter;
3391 
3392 	het_job_leader = job_hash[JOB_HASH_INX(job_id)];
3393 	while (het_job_leader) {
3394 		if (het_job_leader->job_id == job_id)
3395 			break;
3396 		het_job_leader = het_job_leader->job_next;
3397 	}
3398 	if (!het_job_leader)
3399 		return NULL;
3400 	if (het_job_leader->het_job_offset == het_job_id)
3401 		return het_job_leader;
3402 
3403 	if (!het_job_leader->het_job_list)
3404 		return NULL;
3405 	iter = list_iterator_create(het_job_leader->het_job_list);
3406 	while ((het_job = list_next(iter))) {
3407 		if (het_job_leader->het_job_id != het_job->het_job_id) {
3408 			error("%s: Bad het_job_list for %pJ",
3409 			      __func__, het_job_leader);
3410 			continue;
3411 		}
3412 		if (het_job->het_job_offset == het_job_id)
3413 			break;
3414 	}
3415 	list_iterator_destroy(iter);
3416 
3417 	return het_job;
3418 }
3419 
3420 /*
3421  * find_job_record - return a pointer to the job record with the given job_id
3422  * IN job_id - requested job's id
3423  * RET pointer to the job's record, NULL on error
3424  */
3425 extern job_record_t *find_job_record(uint32_t job_id)
3426 {
3427 	job_record_t *job_ptr;
3428 
3429 	job_ptr = job_hash[JOB_HASH_INX(job_id)];
3430 	while (job_ptr) {
3431 		if (job_ptr->job_id == job_id)
3432 			return job_ptr;
3433 		job_ptr = job_ptr->job_next;
3434 	}
3435 
3436 	return NULL;
3437 }
3438 
3439 /* rebuild a job's partition name list based upon the contents of its
3440  *	part_ptr_list */
3441 static void _rebuild_part_name_list(job_record_t *job_ptr)
3442 {
3443 	bool job_active = false, job_pending = false;
3444 	part_record_t *part_ptr;
3445 	ListIterator part_iterator;
3446 
3447 	xfree(job_ptr->partition);
3448 
3449 	if (!job_ptr->part_ptr_list) {
3450 		job_ptr->partition = xstrdup(job_ptr->part_ptr->name);
3451 		last_job_update = time(NULL);
3452 		return;
3453 	}
3454 
3455 	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
3456 		job_active = true;
3457 		job_ptr->partition = xstrdup(job_ptr->part_ptr->name);
3458 	} else if (IS_JOB_PENDING(job_ptr))
3459 		job_pending = true;
3460 
3461 	part_iterator = list_iterator_create(job_ptr->part_ptr_list);
3462 	while ((part_ptr = list_next(part_iterator))) {
3463 		if (job_pending) {
3464 			/* Reset job's one partition to a valid one */
3465 			job_ptr->part_ptr = part_ptr;
3466 			job_pending = false;
3467 		}
3468 		if (job_active && (part_ptr == job_ptr->part_ptr))
3469 			continue;	/* already added */
3470 		if (job_ptr->partition)
3471 			xstrcat(job_ptr->partition, ",");
3472 		xstrcat(job_ptr->partition, part_ptr->name);
3473 	}
3474 	list_iterator_destroy(part_iterator);
3475 	last_job_update = time(NULL);
3476 }
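
/*
 * Example (illustrative): for a pending job whose part_ptr_list holds the
 * partitions "debug", "batch" and "long", the rebuilt string becomes
 * "debug,batch,long" and part_ptr is reset to the first entry; for a
 * running or suspended job the string always begins with the partition
 * the job is actually using.
 */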
3477 
3478 /*
3479  * Kill job or job step
3480  *
3481  * IN job_step_kill_msg - msg with specs on which job/step to cancel.
3482  * IN uid               - uid of user requesting job/step cancel.
3483  */
3484 static int _kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid)
3485 {
3486 	DEF_TIMERS;
3487 	/* Locks: Read config, write job, write node, read fed */
3488 	slurmctld_lock_t job_write_lock = {
3489 		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
3490 	job_record_t *job_ptr;
3491 	int error_code = SLURM_SUCCESS;
3492 
3493 	START_TIMER;
3494 	lock_slurmctld(job_write_lock);
3495 	job_ptr = find_job_record(job_step_kill_msg->job_id);
3496 	trace_job(job_ptr, __func__, "enter");
3497 
3498 	/* do RPC call */
3499 	if (job_step_kill_msg->job_step_id == SLURM_BATCH_SCRIPT) {
3500 		/* NOTE: SLURM_BATCH_SCRIPT == NO_VAL */
3501 		error_code = job_signal_id(job_step_kill_msg->job_id,
3502 					   job_step_kill_msg->signal,
3503 					   job_step_kill_msg->flags, uid,
3504 					   false);
3505 		unlock_slurmctld(job_write_lock);
3506 		END_TIMER2(__func__);
3507 
3508 		/* return result */
3509 		if (error_code) {
3510 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS)
3511 				info("Signal %u %pJ by UID=%u: %s",
3512 				     job_step_kill_msg->signal, job_ptr, uid,
3513 				     slurm_strerror(error_code));
3514 		} else {
3515 			if (job_step_kill_msg->signal == SIGKILL) {
3516 				if (slurmctld_conf.debug_flags &
3517 						DEBUG_FLAG_STEPS)
3518 					info("%s: Cancel of %pJ by UID=%u, %s",
3519 					     __func__, job_ptr, uid, TIME_STR);
3520 				slurmctld_diag_stats.jobs_canceled++;
3521 			} else {
3522 				if (slurmctld_conf.debug_flags &
3523 						DEBUG_FLAG_STEPS)
3524 					info("%s: Signal %u of %pJ by UID=%u, %s",
3525 					     __func__,
3526 					     job_step_kill_msg->signal,
3527 					     job_ptr, uid, TIME_STR);
3528 			}
3529 
3530 			/* Below function provides its own locking */
3531 			schedule_job_save();
3532 		}
3533 	} else {
3534 		error_code = job_step_signal(job_step_kill_msg->job_id,
3535 					     job_step_kill_msg->job_step_id,
3536 					     job_step_kill_msg->signal,
3537 					     job_step_kill_msg->flags,
3538 					     uid);
3539 		unlock_slurmctld(job_write_lock);
3540 		END_TIMER2(__func__);
3541 
3542 		/* return result */
3543 		if (error_code) {
3544 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS)
3545 				info("Signal %u of JobId=%u StepId=%u by UID=%u: %s",
3546 				     job_step_kill_msg->signal,
3547 				     job_step_kill_msg->job_id,
3548 				     job_step_kill_msg->job_step_id, uid,
3549 				     slurm_strerror(error_code));
3550 		} else {
3551 			if (job_step_kill_msg->signal == SIGKILL) {
3552 				if (slurmctld_conf.debug_flags &
3553 						DEBUG_FLAG_STEPS)
3554 					info("%s: Cancel of JobId=%u StepId=%u by UID=%u %s",
3555 					     __func__,
3556 					     job_step_kill_msg->job_id,
3557 					     job_step_kill_msg->job_step_id,
3558 					     uid, TIME_STR);
3559 			} else {
3560 				if (slurmctld_conf.debug_flags &
3561 						DEBUG_FLAG_STEPS)
3562 					info("%s: Signal %u of JobId=%u StepId=%u by UID=%u %s",
3563 					     __func__,
3564 					     job_step_kill_msg->signal,
3565 					     job_step_kill_msg->job_id,
3566 					     job_step_kill_msg->job_step_id,
3567 					     uid, TIME_STR);
3568 			}
3569 
3570 			/* Below function provides its own locking */
3571 			schedule_job_save();
3572 		}
3573 	}
3574 
3575 	trace_job(job_ptr, __func__, "return");
3576 	return error_code;
3577 }
3578 
3579 /*
3580  * Kill job or job step
3581  *
3582  * IN job_step_kill_msg - msg with specs on which job/step to cancel.
3583  * IN uid               - uid of user requesting job/step cancel.
3584  */
3585 extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid)
3586 {
3587 	/* Locks: Read job */
3588 	slurmctld_lock_t job_read_lock = {
3589 		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
3590 	job_record_t *job_ptr, *het_job_ptr;
3591 	uint32_t *het_job_ids = NULL;
3592 	int cnt = 0, i, rc;
3593 	int error_code = SLURM_SUCCESS;
3594 	ListIterator iter;
3595 
3596 	lock_slurmctld(job_read_lock);
3597 	job_ptr = find_job_record(job_step_kill_msg->job_id);
3598 	if (job_ptr && job_ptr->het_job_list &&
3599 	    (job_step_kill_msg->signal == SIGKILL) &&
3600 	    (job_step_kill_msg->job_step_id != SLURM_BATCH_SCRIPT)) {
3601 		cnt = list_count(job_ptr->het_job_list);
3602 		het_job_ids = xcalloc(cnt, sizeof(uint32_t));
3603 		i = 0;
3604 		iter = list_iterator_create(job_ptr->het_job_list);
3605 		while ((het_job_ptr = list_next(iter))) {
3606 			het_job_ids[i++] = het_job_ptr->job_id;
3607 		}
3608 		list_iterator_destroy(iter);
3609 	}
3610 	unlock_slurmctld(job_read_lock);
3611 
3612 	if (!job_ptr) {
3613 		info("%s: invalid JobId=%u",
3614 		      __func__, job_step_kill_msg->job_id);
3615 		error_code = ESLURM_INVALID_JOB_ID;
3616 	} else if (het_job_ids) {
3617 		for (i = 0; i < cnt; i++) {
3618 			job_step_kill_msg->job_id = het_job_ids[i];
3619 			rc = _kill_job_step(job_step_kill_msg, uid);
3620 			if (rc != SLURM_SUCCESS)
3621 				error_code = rc;
3622 		}
3623 		xfree(het_job_ids);
3624 	} else {
3625 		error_code = _kill_job_step(job_step_kill_msg, uid);
3626 	}
3627 
3628 	return error_code;
3629 }
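
/*
 * Illustrative sketch (not compiled) of cancelling an entire job through
 * this interface; request_uid is a placeholder and unset message fields
 * are simply left zeroed:
 *
 *	job_step_kill_msg_t msg = { 0 };
 *	msg.job_id      = 1234;
 *	msg.job_step_id = SLURM_BATCH_SCRIPT;	// whole job, not one step
 *	msg.signal      = SIGKILL;
 *	msg.flags       = 0;
 *	(void) kill_job_step(&msg, request_uid);
 */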
3630 
3631 /*
3632  * kill_job_by_part_name - Given a partition name, deallocate resources for
3633  *	its jobs and kill them. All jobs associated with this partition
3634  *	will have their partition pointer cleared.
3635  * IN part_name - name of a partition
3636  * RET number of jobs associated with this partition
3637  */
3638 extern int kill_job_by_part_name(char *part_name)
3639 {
3640 	ListIterator job_iterator, part_iterator;
3641 	job_record_t *job_ptr;
3642 	part_record_t *part_ptr, *part2_ptr;
3643 	int kill_job_cnt = 0;
3644 	time_t now = time(NULL);
3645 
3646 	part_ptr = find_part_record (part_name);
3647 	if (part_ptr == NULL)	/* No such partition */
3648 		return 0;
3649 
3650 	job_iterator = list_iterator_create(job_list);
3651 	while ((job_ptr = list_next(job_iterator))) {
3652 		bool pending = false, suspended = false;
3653 
3654 		pending = IS_JOB_PENDING(job_ptr);
3655 		if (job_ptr->part_ptr_list) {
3656 			/* Remove this partition from the job's candidate list */
3657 			bool rebuild_name_list = false;
3658 			part_iterator = list_iterator_create(job_ptr->
3659 							     part_ptr_list);
3660 			while ((part2_ptr = list_next(part_iterator))) {
3661 				if (part2_ptr != part_ptr)
3662 					continue;
3663 				list_remove(part_iterator);
3664 				rebuild_name_list = true;
3665 			}
3666 			list_iterator_destroy(part_iterator);
3667 			if (rebuild_name_list) {
3668 				if (list_count(job_ptr->part_ptr_list) > 0) {
3669 					_rebuild_part_name_list(job_ptr);
3670 					job_ptr->part_ptr =
3671 						list_peek(job_ptr->
3672 							  part_ptr_list);
3673 				} else {
3674 					FREE_NULL_LIST(job_ptr->part_ptr_list);
3675 				}
3676 			}
3677 		}
3678 
3679 		if (job_ptr->part_ptr != part_ptr)
3680 			continue;
3681 
3682 		if (IS_JOB_SUSPENDED(job_ptr)) {
3683 			uint32_t suspend_job_state = job_ptr->job_state;
3684 			/* we can't have it as suspended when we call the
3685 			 * accounting stuff.
3686 			 */
3687 			job_ptr->job_state = JOB_CANCELLED;
3688 			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
3689 			job_ptr->job_state = suspend_job_state;
3690 			suspended = true;
3691 		}
3692 		if (IS_JOB_RUNNING(job_ptr) || suspended) {
3693 			kill_job_cnt++;
3694 			info("Killing %pJ on defunct partition %s",
3695 			     job_ptr, part_name);
3696 			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
3697 			build_cg_bitmap(job_ptr);
3698 			job_ptr->state_reason = FAIL_DOWN_PARTITION;
3699 			xfree(job_ptr->state_desc);
3700 			if (suspended) {
3701 				job_ptr->end_time = job_ptr->suspend_time;
3702 				job_ptr->tot_sus_time +=
3703 					difftime(now, job_ptr->suspend_time);
3704 			} else
3705 				job_ptr->end_time = now;
3706 			job_completion_logger(job_ptr, false);
3707 			if (!pending)
3708 				deallocate_nodes(job_ptr, false, suspended,
3709 						 false);
3710 		} else if (pending) {
3711 			kill_job_cnt++;
3712 			info("Killing %pJ on defunct partition %s",
3713 			     job_ptr, part_name);
3714 			job_ptr->job_state	= JOB_CANCELLED;
3715 			job_ptr->start_time	= now;
3716 			job_ptr->end_time	= now;
3717 			job_ptr->exit_code	= 1;
3718 			job_completion_logger(job_ptr, false);
3719 			fed_mgr_job_complete(job_ptr, 0, now);
3720 		}
3721 		job_ptr->part_ptr = NULL;
3722 		FREE_NULL_LIST(job_ptr->part_ptr_list);
3723 	}
3724 	list_iterator_destroy(job_iterator);
3725 
3726 	if (kill_job_cnt)
3727 		last_job_update = now;
3728 	return kill_job_cnt;
3729 }
3730 
3731 /*
3732  * kill_job_by_front_end_name - Given a front end node name, deallocate
3733  *	resources for its jobs and kill them.
3734  * IN node_name - name of a front end node
3735  * RET number of jobs associated with this front end node
3736  * NOTE: Patterned after kill_running_job_by_node_name()
3737  */
3738 extern int kill_job_by_front_end_name(char *node_name)
3739 {
3740 #ifdef HAVE_FRONT_END
3741 	ListIterator job_iterator;
3742 	job_record_t *job_ptr, *het_job_leader;
3743 	node_record_t *node_ptr;
3744 	time_t now = time(NULL);
3745 	int i, kill_job_cnt = 0;
3746 
3747 	if (node_name == NULL)
3748 		fatal("kill_job_by_front_end_name: node_name is NULL");
3749 
3750 	job_iterator = list_iterator_create(job_list);
3751 	while ((job_ptr = list_next(job_iterator))) {
3752 		bool suspended = false;
3753 
3754 		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) &&
3755 		    !IS_JOB_COMPLETING(job_ptr))
3756 			continue;
3757 		het_job_leader = NULL;
3758 		if (job_ptr->het_job_id)
3759 			het_job_leader = find_job_record(job_ptr->het_job_id);
3760 		if (!het_job_leader)
3761 			het_job_leader = job_ptr;
3762 		if ((het_job_leader->batch_host == NULL) ||
3763 		    xstrcmp(het_job_leader->batch_host, node_name))
3764 			continue;	/* no match on node name */
3765 
3766 		if (IS_JOB_SUSPENDED(job_ptr)) {
3767 			uint32_t suspend_job_state = job_ptr->job_state;
3768 			/*
3769 			 * we can't have it as suspended when we call the
3770 			 * accounting stuff.
3771 			 */
3772 			job_ptr->job_state = JOB_CANCELLED;
3773 			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
3774 			job_ptr->job_state = suspend_job_state;
3775 			suspended = true;
3776 		}
3777 		if (IS_JOB_COMPLETING(job_ptr)) {
3778 			kill_job_cnt++;
3779 			while ((i = bit_ffs(job_ptr->node_bitmap_cg)) >= 0) {
3780 				bit_clear(job_ptr->node_bitmap_cg, i);
3781 				if (job_ptr->node_cnt)
3782 					(job_ptr->node_cnt)--;
3783 				else {
3784 					error("node_cnt underflow on %pJ",
3785 					      job_ptr);
3786 				}
3787 				job_update_tres_cnt(job_ptr, i);
3788 				if (job_ptr->node_cnt == 0) {
3789 					cleanup_completing(job_ptr);
3790 				}
3791 				node_ptr = &node_record_table_ptr[i];
3792 				if (node_ptr->comp_job_cnt)
3793 					(node_ptr->comp_job_cnt)--;
3794 				else {
3795 					error("Node %s comp_job_cnt underflow, %pJ",
3796 					      node_ptr->name, job_ptr);
3797 				}
3798 			}
3799 		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
3800 			kill_job_cnt++;
3801 			if (job_ptr->batch_flag && job_ptr->details &&
3802 			    slurmctld_conf.job_requeue &&
3803 			    (job_ptr->details->requeue > 0)) {
3804 				char requeue_msg[128];
3805 
3806 				srun_node_fail(job_ptr, node_name);
3807 				info("requeue %pJ due to failure of node %s",
3808 				     job_ptr, node_name);
3809 				set_job_prio(job_ptr);
3810 				snprintf(requeue_msg, sizeof(requeue_msg),
3811 					 "Job requeued due to failure "
3812 					 "of node %s",
3813 					 node_name);
3814 				job_ptr->time_last_active  = now;
3815 				if (suspended) {
3816 					job_ptr->end_time =
3817 						job_ptr->suspend_time;
3818 					job_ptr->tot_sus_time +=
3819 						difftime(now,
3820 							 job_ptr->
3821 							 suspend_time);
3822 				} else
3823 					job_ptr->end_time = now;
3824 
3825 				/*
3826 				 * We want this job to look like it
3827 				 * was terminated in the accounting logs.
3828 				 * Set a new submit time so the restarted
3829 				 * job looks like a new job.
3830 				 */
3831 				job_ptr->job_state  = JOB_NODE_FAIL;
3832 				build_cg_bitmap(job_ptr);
3833 				job_completion_logger(job_ptr, true);
3834 				deallocate_nodes(job_ptr, false, suspended,
3835 						 false);
3836 
3837 				/* do this after the epilog complete,
3838 				 * setting it here is too early */
3839 				//job_ptr->db_index = 0;
3840 				//job_ptr->details->submit_time = now;
3841 
3842 				job_ptr->job_state = JOB_PENDING;
3843 				if (job_ptr->node_cnt)
3844 					job_ptr->job_state |= JOB_COMPLETING;
3845 
3846 				job_ptr->restart_cnt++;
3847 
3848 				/* clear signal sent flag on requeue */
3849 				job_ptr->warn_flags &= ~WARN_SENT;
3850 
3851 				/* Since the job completion logger
3852 				 * removes the job's submit accounting,
3853 				 * we need to add it back again. */
3854 				acct_policy_add_job_submit(job_ptr);
3855 
3856 				if (!job_ptr->node_bitmap_cg ||
3857 				    bit_set_count(job_ptr->node_bitmap_cg) == 0)
3858 					batch_requeue_fini(job_ptr);
3859 			} else {
3860 				info("Killing %pJ on failed node %s",
3861 				     job_ptr, node_name);
3862 				srun_node_fail(job_ptr, node_name);
3863 				job_ptr->job_state = JOB_NODE_FAIL |
3864 						     JOB_COMPLETING;
3865 				build_cg_bitmap(job_ptr);
3866 				job_ptr->state_reason = FAIL_DOWN_NODE;
3867 				xfree(job_ptr->state_desc);
3868 				if (suspended) {
3869 					job_ptr->end_time =
3870 						job_ptr->suspend_time;
3871 					job_ptr->tot_sus_time +=
3872 						difftime(now,
3873 							 job_ptr->suspend_time);
3874 				} else
3875 					job_ptr->end_time = now;
3876 				job_completion_logger(job_ptr, false);
3877 				deallocate_nodes(job_ptr, false, suspended,
3878 						 false);
3879 			}
3880 		}
3881 	}
3882 	list_iterator_destroy(job_iterator);
3883 
3884 	if (kill_job_cnt)
3885 		last_job_update = now;
3886 	return kill_job_cnt;
3887 #else
3888 	return 0;
3889 #endif
3890 }
3891 
3892 /*
3893  * partition_in_use - determine whether a partition is in use by a RUNNING,
3894  *	PENDING, or SUSPENDED job or by a reservation
3895  * IN part_name - name of a partition
3896  * RET true if the partition is in use, else false
3897  */
3898 extern bool partition_in_use(char *part_name)
3899 {
3900 	ListIterator job_iterator;
3901 	job_record_t *job_ptr;
3902 	part_record_t *part_ptr;
3903 
3904 	part_ptr = find_part_record (part_name);
3905 	if (part_ptr == NULL)	/* No such partition */
3906 		return false;
3907 
3908 	/* check jobs */
3909 	job_iterator = list_iterator_create(job_list);
3910 	while ((job_ptr = list_next(job_iterator))) {
3911 		if (job_ptr->part_ptr == part_ptr) {
3912 			if (!IS_JOB_FINISHED(job_ptr)) {
3913 				list_iterator_destroy(job_iterator);
3914 				return true;
3915 			}
3916 		}
3917 	}
3918 	list_iterator_destroy(job_iterator);
3919 
3920 	/* check reservations */
3921 	if (list_find_first(resv_list, _find_resv_part, part_ptr))
3922 		return true;
3923 
3924 	return false;
3925 }
3926 
3927 /* Clear a job's GRES details per node strings, rebuilt later on demand */
3928 static void _clear_job_gres_details(job_record_t *job_ptr)
3929 {
3930 	int i;
3931 
3932 	xfree(job_ptr->gres_used);
3933 	for (i = 0; i < job_ptr->gres_detail_cnt; i++)
3934 		xfree(job_ptr->gres_detail_str[i]);
3935 	xfree(job_ptr->gres_detail_str);
3936 	job_ptr->gres_detail_cnt = 0;
3937 }
3938 
3939 
3940 static bool _job_node_test(job_record_t *job_ptr, int node_inx)
3941 {
3942 	if (job_ptr->node_bitmap &&
3943 	    bit_test(job_ptr->node_bitmap, node_inx))
3944 		return true;
3945 	return false;
3946 }
3947 
3948 static bool _het_job_on_node(job_record_t *job_ptr, int node_inx)
3949 {
3950 	job_record_t *het_job_leader, *het_job;
3951 	ListIterator iter;
3952 	bool result = false;	/* local; must not persist across calls */
3953 
3954 	if (!job_ptr->het_job_id)
3955 		return _job_node_test(job_ptr, node_inx);
3956 
3957 	het_job_leader = find_job_record(job_ptr->het_job_id);
3958 	if (!het_job_leader) {
3959 		error("%s: Hetjob leader %pJ not found",
3960 		      __func__, job_ptr);
3961 		return _job_node_test(job_ptr, node_inx);
3962 	}
3963 	if (!het_job_leader->het_job_list) {
3964 		error("%s: Hetjob leader %pJ job list is NULL",
3965 		      __func__, job_ptr);
3966 		return _job_node_test(job_ptr, node_inx);
3967 	}
3968 
3969 	iter = list_iterator_create(het_job_leader->het_job_list);
3970 	while ((het_job = list_next(iter))) {
3971 		if ((result = _job_node_test(het_job, node_inx)))
3972 			break;
3973 		/*
3974 		 * After a DOWN node is removed from another job component,
3975 		 * we have no way to identify other hetjob components with
3976 		 * the same node, so assume if one component is in NODE_FAILED
3977 		 * state, they all should be.
3978 		 */
3979 		if (IS_JOB_NODE_FAILED(het_job)) {
3980 			result = true;
3981 			break;
3982 		}
3983 	}
3984 	list_iterator_destroy(iter);
3985 
3986 	return result;
3987 }
3988 
3989 /*
3990  * kill_running_job_by_node_name - Given a node name, deallocate RUNNING
3991  *	or COMPLETING jobs from the node or kill them
3992  * IN node_name - name of a node
3993  * RET number of killed jobs
3994  */
3995 extern int kill_running_job_by_node_name(char *node_name)
3996 {
3997 	ListIterator job_iterator;
3998 	job_record_t *job_ptr;
3999 	node_record_t *node_ptr;
4000 	int node_inx;
4001 	int kill_job_cnt = 0;
4002 	time_t now = time(NULL);
4003 
4004 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
4005 	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
4006 
4007 	node_ptr = find_node_record(node_name);
4008 	if (node_ptr == NULL)	/* No such node */
4009 		return 0;
4010 	node_inx = node_ptr - node_record_table_ptr;
4011 
4012 	job_iterator = list_iterator_create(job_list);
4013 	while ((job_ptr = list_next(job_iterator))) {
4014 		bool suspended = false;
4015 		if (!_het_job_on_node(job_ptr, node_inx))
4016 			continue;	/* job not on this node */
4017 		if (nonstop_ops.node_fail)
4018 			(nonstop_ops.node_fail)(job_ptr, node_ptr);
4019 		if (IS_JOB_SUSPENDED(job_ptr)) {
4020 			uint32_t suspend_job_state = job_ptr->job_state;
4021 			/*
4022 			 * we can't have it as suspended when we call the
4023 			 * accounting stuff.
4024 			 */
4025 			job_ptr->job_state = JOB_CANCELLED;
4026 			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
4027 			job_ptr->job_state = suspend_job_state;
4028 			suspended = true;
4029 		}
4030 
4031 		if (IS_JOB_COMPLETING(job_ptr)) {
4032 			if (!bit_test(job_ptr->node_bitmap_cg, node_inx))
4033 				continue;
4034 			kill_job_cnt++;
4035 			bit_clear(job_ptr->node_bitmap_cg, node_inx);
4036 			job_update_tres_cnt(job_ptr, node_inx);
4037 			if (job_ptr->node_cnt)
4038 				(job_ptr->node_cnt)--;
4039 			else {
4040 				error("node_cnt underflow on %pJ", job_ptr);
4041 			}
4042 			if (job_ptr->node_cnt == 0)
4043 				cleanup_completing(job_ptr);
4044 
4045 			if (node_ptr->comp_job_cnt)
4046 				(node_ptr->comp_job_cnt)--;
4047 			else {
4048 				error("Node %s comp_job_cnt underflow, %pJ",
4049 				      node_ptr->name, job_ptr);
4050 			}
4051 		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
4052 			kill_job_cnt++;
4053 			if ((job_ptr->details) &&
4054 			    (job_ptr->kill_on_node_fail == 0) &&
4055 			    (job_ptr->node_cnt > 1) &&
4056 			    !IS_JOB_CONFIGURING(job_ptr)) {
4057 				/* keep job running on remaining nodes */
4058 				srun_node_fail(job_ptr, node_name);
4059 				error("Removing failed node %s from %pJ",
4060 				      node_name, job_ptr);
4061 				job_pre_resize_acctg(job_ptr);
4062 				kill_step_on_node(job_ptr, node_ptr, true);
4063 				excise_node_from_job(job_ptr, node_ptr);
4064 				(void) gs_job_start(job_ptr);
4065 				gres_build_job_details(job_ptr->gres_list,
4066 						       &job_ptr->gres_detail_cnt,
4067 						       &job_ptr->gres_detail_str,
4068 						       &job_ptr->gres_used);
4069 				job_post_resize_acctg(job_ptr);
4070 			} else if (job_ptr->batch_flag && job_ptr->details &&
4071 				   job_ptr->details->requeue) {
4072 				char requeue_msg[128];
4073 
4074 				srun_node_fail(job_ptr, node_name);
4075 				info("requeue job %pJ due to failure of node %s",
4076 				     job_ptr, node_name);
4077 				snprintf(requeue_msg, sizeof(requeue_msg),
4078 					 "Job requeued due to failure "
4079 					 "of node %s",
4080 					 node_name);
4081 				job_ptr->time_last_active  = now;
4082 				if (suspended) {
4083 					job_ptr->end_time =
4084 						job_ptr->suspend_time;
4085 					job_ptr->tot_sus_time +=
4086 						difftime(now,
4087 							 job_ptr->
4088 							 suspend_time);
4089 				} else
4090 					job_ptr->end_time = now;
4091 
4092 				/*
4093 				 * We want this job to look like it
4094 				 * was terminated in the accounting logs.
4095 				 * Set a new submit time so the restarted
4096 				 * job looks like a new job.
4097 				 */
4098 				job_ptr->job_state = JOB_NODE_FAIL;
4099 				build_cg_bitmap(job_ptr);
4100 				job_completion_logger(job_ptr, true);
4101 				deallocate_nodes(job_ptr, false, suspended,
4102 						 false);
4103 
4104 				/* do this after the epilog complete,
4105 				 * setting it here is too early */
4106 				//job_ptr->db_index = 0;
4107 				//job_ptr->details->submit_time = now;
4108 
4109 				job_ptr->job_state = JOB_PENDING;
4110 				if (job_ptr->node_cnt)
4111 					job_ptr->job_state |= JOB_COMPLETING;
4112 
4113 				job_ptr->restart_cnt++;
4114 
4115 				/* clear signal sent flag on requeue */
4116 				job_ptr->warn_flags &= ~WARN_SENT;
4117 
4118 				/*
4119 				 * Since the job completion logger
4120 				 * removes the job's submit accounting,
4121 				 * we need to add it back again.
4122 				 */
4123 				acct_policy_add_job_submit(job_ptr);
4124 
4125 				if (!job_ptr->node_bitmap_cg ||
4126 				    bit_set_count(job_ptr->node_bitmap_cg) == 0)
4127 					batch_requeue_fini(job_ptr);
4128 			} else {
4129 				info("Killing %pJ on failed node %s",
4130 				     job_ptr, node_name);
4131 				srun_node_fail(job_ptr, node_name);
4132 				job_ptr->job_state = JOB_NODE_FAIL |
4133 						     JOB_COMPLETING;
4134 				build_cg_bitmap(job_ptr);
4135 				job_ptr->state_reason = FAIL_DOWN_NODE;
4136 				xfree(job_ptr->state_desc);
4137 				if (suspended) {
4138 					job_ptr->end_time =
4139 						job_ptr->suspend_time;
4140 					job_ptr->tot_sus_time +=
4141 						difftime(now,
4142 							 job_ptr->suspend_time);
4143 				} else
4144 					job_ptr->end_time = now;
4145 				job_completion_logger(job_ptr, false);
4146 				deallocate_nodes(job_ptr, false, suspended,
4147 						 false);
4148 			}
4149 		}
4150 
4151 	}
4152 	list_iterator_destroy(job_iterator);
4153 	if (kill_job_cnt)
4154 		last_job_update = now;
4155 
4156 	return kill_job_cnt;
4157 }
4158 
4159 /* Remove one node from a job's allocation */
4160 extern void excise_node_from_job(job_record_t *job_ptr,
4161 				 node_record_t *node_ptr)
4162 {
4163 	int i, i_first, i_last, orig_pos = -1, new_pos = -1;
4164 	bitstr_t *orig_bitmap;
4165 
4166 	orig_bitmap = bit_copy(job_ptr->node_bitmap);
4167 	make_node_idle(node_ptr, job_ptr); /* updates bitmap */
4168 	xfree(job_ptr->nodes);
4169 	job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap);
4170 	i_first = bit_ffs(orig_bitmap);
4171 	if (i_first >= 0)
4172 		i_last = bit_fls(orig_bitmap);
4173 	else
4174 		i_last = -2;
4175 	for (i = i_first; i <= i_last; i++) {
4176 		if (!bit_test(orig_bitmap,i))
4177 			continue;
4178 		orig_pos++;
4179 		if (!bit_test(job_ptr->node_bitmap, i))
4180 			continue;
4181 		new_pos++;
4182 		if (orig_pos == new_pos)
4183 			continue;
4184 		memcpy(&job_ptr->node_addr[new_pos],
4185 		       &job_ptr->node_addr[orig_pos], sizeof(slurm_addr_t));
4186 		/*
4187 		 * NOTE: The job's allocation in the job_ptr->job_resrcs
4188 		 * data structure is unchanged even after a node allocated
4189 		 * to the job goes DOWN.
4190 		 */
4191 	}
4192 
4193 	job_ptr->total_nodes = job_ptr->node_cnt = new_pos + 1;
4194 
4195 	FREE_NULL_BITMAP(orig_bitmap);
4196 	(void) select_g_job_resized(job_ptr, node_ptr);
4197 }
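
/*
 * Example (illustrative): if the original allocation was {n1,n2,n3} and n2
 * goes DOWN, the loop above shifts n3's slurm_addr_t entry from index 2 to
 * index 1 in node_addr[], node_cnt and total_nodes drop from 3 to 2, while
 * job_ptr->job_resrcs still describes the original three-node layout.
 */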
4198 
4199 /*
4200  * dump_job_desc - dump the incoming job submit request message
4201  * IN job_specs - job specification from RPC
4202  */
4203 void dump_job_desc(job_desc_msg_t * job_specs)
4204 {
4205 	long pn_min_cpus, pn_min_tmp_disk, min_cpus;
4206 	uint64_t pn_min_memory;
4207 	long time_limit, priority, contiguous, nice, time_min;
4208 	long kill_on_node_fail, shared, immediate, wait_all_nodes;
4209 	long cpus_per_task, requeue, num_tasks, overcommit;
4210 	long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
4211 	int spec_count;
4212 	char *mem_type, buf[100], cpu_freq_buf[64];
	char *signal_flags, *spec_type, *job_id;
4213 
4214 	if (get_log_level() < LOG_LEVEL_DEBUG3)
4215 		return;
4216 
4217 	if (job_specs == NULL)
4218 		return;
4219 
4220 	if (job_specs->job_id_str)
4221 		job_id = job_specs->job_id_str;
4222 	else if (job_specs->job_id == NO_VAL)
4223 		job_id = "N/A";
4224 	else {
4225 		snprintf(buf, sizeof(buf), "%u", job_specs->job_id);
4226 		job_id = buf;
4227 	}
4228 	debug3("JobDesc: user_id=%u JobId=%s partition=%s name=%s",
4229 	       job_specs->user_id, job_id,
4230 	       job_specs->partition, job_specs->name);
4231 
4232 	min_cpus = (job_specs->min_cpus != NO_VAL) ?
4233 		(long) job_specs->min_cpus : -1L;
4234 	pn_min_cpus    = (job_specs->pn_min_cpus != NO_VAL16) ?
4235 		(long) job_specs->pn_min_cpus : -1L;
4236 	if (job_specs->core_spec == NO_VAL16) {
4237 		spec_type  = "core";
4238 		spec_count = -1;
4239 	} else if (job_specs->core_spec & CORE_SPEC_THREAD) {
4240 		spec_type  = "thread";
4241 		spec_count = job_specs->core_spec & (~CORE_SPEC_THREAD);
4242 	} else {
4243 		spec_type  = "core";
4244 		spec_count = job_specs->core_spec;
4245 	}
4246 	debug3("   cpus=%ld-%u pn_min_cpus=%ld %s_spec=%d",
4247 	       min_cpus, job_specs->max_cpus, pn_min_cpus,
4248 	       spec_type, spec_count);
4249 
4250 	debug3("   Nodes=%u-[%u] Sock/Node=%u Core/Sock=%u Thread/Core=%u",
4251 	       job_specs->min_nodes, job_specs->max_nodes,
4252 	       job_specs->sockets_per_node, job_specs->cores_per_socket,
4253 	       job_specs->threads_per_core);
4254 
4255 	if (job_specs->pn_min_memory == NO_VAL64) {
4256 		pn_min_memory = -1L;
4257 		mem_type = "job";
4258 	} else if (job_specs->pn_min_memory & MEM_PER_CPU) {
4259 		pn_min_memory = job_specs->pn_min_memory & (~MEM_PER_CPU);
4260 		mem_type = "cpu";
4261 	} else {
4262 		pn_min_memory = job_specs->pn_min_memory;
4263 		mem_type = "job";
4264 	}
4265 	pn_min_tmp_disk = (job_specs->pn_min_tmp_disk != NO_VAL) ?
4266 		(long) job_specs->pn_min_tmp_disk : -1L;
4267 	debug3("   pn_min_memory_%s=%"PRIu64" pn_min_tmp_disk=%ld",
4268 	       mem_type, pn_min_memory, pn_min_tmp_disk);
4269 	immediate = (job_specs->immediate == 0) ? 0L : 1L;
4270 	debug3("   immediate=%ld reservation=%s",
4271 	       immediate, job_specs->reservation);
4272 	debug3("   features=%s batch_features=%s cluster_features=%s",
4273 	       job_specs->features, job_specs->batch_features,
4274 	       job_specs->cluster_features);
4275 
4276 	debug3("   req_nodes=%s exc_nodes=%s",
4277 	       job_specs->req_nodes, job_specs->exc_nodes);
4278 
4279 	time_limit = (job_specs->time_limit != NO_VAL) ?
4280 		(long) job_specs->time_limit : -1L;
4281 	time_min = (job_specs->time_min != NO_VAL) ?
4282 		(long) job_specs->time_min : time_limit;
4283 	priority   = (job_specs->priority != NO_VAL) ?
4284 		(long) job_specs->priority : -1L;
4285 	contiguous = (job_specs->contiguous != NO_VAL16) ?
4286 		(long) job_specs->contiguous : -1L;
4287 	shared = (job_specs->shared != NO_VAL16) ?
4288 		(long) job_specs->shared : -1L;
4289 	debug3("   time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld",
4290 	       time_min, time_limit, priority, contiguous, shared);
4291 
4292 	kill_on_node_fail = (job_specs->kill_on_node_fail !=
4293 			     NO_VAL16) ?
4294 		(long) job_specs->kill_on_node_fail : -1L;
4295 	if (job_specs->script)	/* logger has trouble with long strings and NULLs */
4296 		debug3("   kill_on_node_fail=%ld script=%.40s...",
4297 		       kill_on_node_fail, job_specs->script);
4298 	else
4299 		debug3("   kill_on_node_fail=%ld script=(null)",
4300 		       kill_on_node_fail);
4301 
4302 	if (job_specs->argc == 1)
4303 		debug3("   argv=\"%s\"",
4304 		       job_specs->argv[0]);
4305 	else if (job_specs->argc == 2)
4306 		debug3("   argv=%s,%s",
4307 		       job_specs->argv[0],
4308 		       job_specs->argv[1]);
4309 	else if (job_specs->argc > 2)
4310 		debug3("   argv=%s,%s,%s,...",
4311 		       job_specs->argv[0],
4312 		       job_specs->argv[1],
4313 		       job_specs->argv[2]);
4314 
4315 	if (job_specs->env_size == 1)
4316 		debug3("   environment=\"%s\"",
4317 		       job_specs->environment[0]);
4318 	else if (job_specs->env_size == 2)
4319 		debug3("   environment=%s,%s",
4320 		       job_specs->environment[0],
4321 		       job_specs->environment[1]);
4322 	else if (job_specs->env_size > 2)
4323 		debug3("   environment=%s,%s,%s,...",
4324 		       job_specs->environment[0],
4325 		       job_specs->environment[1],
4326 		       job_specs->environment[2]);
4327 
4328 	if (job_specs->spank_job_env_size == 1)
4329 		debug3("   spank_job_env=\"%s\"",
4330 		       job_specs->spank_job_env[0]);
4331 	else if (job_specs->spank_job_env_size == 2)
4332 		debug3("   spank_job_env=%s,%s",
4333 		       job_specs->spank_job_env[0],
4334 		       job_specs->spank_job_env[1]);
4335 	else if (job_specs->spank_job_env_size > 2)
4336 		debug3("   spank_job_env=%s,%s,%s,...",
4337 		       job_specs->spank_job_env[0],
4338 		       job_specs->spank_job_env[1],
4339 		       job_specs->spank_job_env[2]);
4340 
4341 	debug3("   stdin=%s stdout=%s stderr=%s",
4342 	       job_specs->std_in, job_specs->std_out, job_specs->std_err);
4343 
4344 	debug3("   work_dir=%s alloc_node:sid=%s:%u",
4345 	       job_specs->work_dir,
4346 	       job_specs->alloc_node, job_specs->alloc_sid);
4347 
4348 	debug3("   power_flags=%s",
4349 	       power_flags_str(job_specs->power_flags));
4350 
4351 	debug3("   resp_host=%s alloc_resp_port=%u other_port=%u",
4352 	       job_specs->resp_host,
4353 	       job_specs->alloc_resp_port, job_specs->other_port);
4354 	debug3("   dependency=%s account=%s qos=%s comment=%s",
4355 	       job_specs->dependency, job_specs->account,
4356 	       job_specs->qos, job_specs->comment);
4357 
4358 	num_tasks = (job_specs->num_tasks != NO_VAL) ?
4359 		(long) job_specs->num_tasks : -1L;
4360 	overcommit = (job_specs->overcommit != NO_VAL8) ?
4361 		(long) job_specs->overcommit : -1L;
4362 	nice = (job_specs->nice != NO_VAL) ?
4363 		((int64_t)job_specs->nice - NICE_OFFSET) : 0;
4364 	debug3("   mail_type=%u mail_user=%s nice=%ld num_tasks=%ld "
4365 	       "open_mode=%u overcommit=%ld acctg_freq=%s",
4366 	       job_specs->mail_type, job_specs->mail_user, nice, num_tasks,
4367 	       job_specs->open_mode, overcommit, job_specs->acctg_freq);
4368 
4369 	slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf));
4370 	cpus_per_task = (job_specs->cpus_per_task != NO_VAL16) ?
4371 		(long) job_specs->cpus_per_task : -1L;
4372 	requeue = (job_specs->requeue != NO_VAL16) ?
4373 		(long) job_specs->requeue : -1L;
4374 	debug3("   network=%s begin=%s cpus_per_task=%ld requeue=%ld "
4375 	       "licenses=%s",
4376 	       job_specs->network, buf, cpus_per_task, requeue,
4377 	       job_specs->licenses);
4378 
4379 	slurm_make_time_str(&job_specs->end_time, buf, sizeof(buf));
4380 	wait_all_nodes = (job_specs->wait_all_nodes != NO_VAL16) ?
4381 			 (long) job_specs->wait_all_nodes : -1L;
4382 	if (job_specs->warn_flags & KILL_JOB_BATCH)
4383 		signal_flags = "B:";
4384 	else
4385 		signal_flags = "";
4386 	cpu_freq_debug(NULL, NULL, cpu_freq_buf, sizeof(cpu_freq_buf),
4387 		       job_specs->cpu_freq_gov, job_specs->cpu_freq_min,
4388 		       job_specs->cpu_freq_max, NO_VAL);
4389 	debug3("   end_time=%s signal=%s%u@%u wait_all_nodes=%ld cpu_freq=%s",
4390 	       buf, signal_flags, job_specs->warn_signal, job_specs->warn_time,
4391 	       wait_all_nodes, cpu_freq_buf);
4392 
4393 	ntasks_per_node = (job_specs->ntasks_per_node != NO_VAL16) ?
4394 		(long) job_specs->ntasks_per_node : -1L;
4395 	ntasks_per_socket = (job_specs->ntasks_per_socket !=
4396 			     NO_VAL16) ?
4397 		(long) job_specs->ntasks_per_socket : -1L;
4398 	ntasks_per_core = (job_specs->ntasks_per_core != NO_VAL16) ?
4399 		(long) job_specs->ntasks_per_core : -1L;
4400 	debug3("   ntasks_per_node=%ld ntasks_per_socket=%ld "
4401 	       "ntasks_per_core=%ld",
4402 	       ntasks_per_node, ntasks_per_socket, ntasks_per_core);
4403 
4404 	debug3("   mem_bind=%u:%s plane_size:%u",
4405 	       job_specs->mem_bind_type, job_specs->mem_bind,
4406 	       job_specs->plane_size);
4407 	debug3("   array_inx=%s", job_specs->array_inx);
4408 	debug3("   burst_buffer=%s", job_specs->burst_buffer);
4409 	debug3("   mcs_label=%s", job_specs->mcs_label);
4410 	slurm_make_time_str(&job_specs->deadline, buf, sizeof(buf));
4411 	debug3("   deadline=%s", buf);
4412 	debug3("   bitflags=%u delay_boot=%u", job_specs->bitflags,
4413 	       job_specs->delay_boot);
4414 
4415 	if (job_specs->cpus_per_tres)
4416 		debug3("   CPUs_per_TRES=%s", job_specs->cpus_per_tres);
4417 	if (job_specs->mem_per_tres)
4418 		debug3("   Mem_per_TRES=%s", job_specs->mem_per_tres);
4419 	if (job_specs->tres_bind)
4420 		debug3("   TRES_bind=%s", job_specs->tres_bind);
4421 	if (job_specs->tres_freq)
4422 		debug3("   TRES_freq=%s", job_specs->tres_freq);
4423 	if (job_specs->tres_per_job)
4424 		debug3("   TRES_per_job=%s", job_specs->tres_per_job);
4425 	if (job_specs->tres_per_node)
4426 		debug3("   TRES_per_node=%s", job_specs->tres_per_node);
4427 	if (job_specs->tres_per_socket)
4428 		debug3("   TRES_per_socket=%s", job_specs->tres_per_socket);
4429 	if (job_specs->tres_per_task)
4430 		debug3("   TRES_per_task=%s", job_specs->tres_per_task);
4431 
4432 	select_g_select_jobinfo_sprint(job_specs->select_jobinfo,
4433 				       buf, sizeof(buf), SELECT_PRINT_MIXED);
4434 	if (buf[0] != '\0')
4435 		debug3("   %s", buf);
4436 }
4437 
4438 /*
4439  * init_job_conf - initialize the job configuration tables and values.
4440  *	this should be called after creating node information, but
4441  *	before creating any job entries. Pre-existing job entries are
4442  *	left unchanged.
4443  *	NOTE: The job hash table size does not change after initial creation.
4444  * RET 0 if no error, otherwise an error code
4445  * global: last_job_update - time of last job table update
4446  *	job_list - pointer to global job list
4447  */
4448 int init_job_conf(void)
4449 {
4450 	if (job_list == NULL) {
4451 		job_count = 0;
4452 		job_list = list_create(_list_delete_job);
4453 	}
4454 
4455 	last_job_update = time(NULL);
4456 
4457 	if (!purge_files_list) {
4458 		purge_files_list = list_create(xfree_ptr);
4459 	}
4460 
4461 	return SLURM_SUCCESS;
4462 }
4463 
4464 /*
4465  * rehash_jobs - Create or rebuild the job hash table.
4466  */
4467 extern void rehash_jobs(void)
4468 {
4469 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
4470 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
4471 
4472 	if (job_hash == NULL) {
4473 		hash_table_size = slurmctld_conf.max_job_cnt;
4474 		job_hash = xcalloc(hash_table_size, sizeof(job_record_t *));
4475 		job_array_hash_j = xcalloc(hash_table_size,
4476 					   sizeof(job_record_t *));
4477 		job_array_hash_t = xcalloc(hash_table_size,
4478 					   sizeof(job_record_t *));
4479 	} else if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) {
4480 		/* If the MaxJobCount grows by too much, the hash table will
4481 		 * be ineffective without rebuilding. We don't presently bother
4482 		 * to rebuild the hash table, but cut MaxJobCount back as
4483 		 * needed. */
4484 		error ("MaxJobCount reset too high, restart slurmctld");
4485 		slurmctld_conf.max_job_cnt = hash_table_size;
4486 	}
4487 }
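
/*
 * Example (illustrative): with MaxJobCount=10000 the three tables are sized
 * at 10000 entries on the first call.  If MaxJobCount is later raised above
 * 20000 (more than double the table size), the tables are not rebuilt;
 * instead slurmctld_conf.max_job_cnt is clamped back to 10000 and an error
 * is logged, so a slurmctld restart is required for the larger value to
 * take effect.
 */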
4488 
4489 /* Create an exact copy of an existing job record for a job array.
4490  * IN job_ptr - META job record for a job array, which is to become an
4491  *		individual task of the job array.
4492  *		Set the job's array_task_id to the task to be split out.
4493  * RET - The new job record, which is the new META job record. */
4494 extern job_record_t *job_array_split(job_record_t *job_ptr)
4495 {
4496 	job_record_t *job_ptr_pend = NULL, *save_job_next;
4497 	struct job_details *job_details, *details_new, *save_details;
4498 	uint32_t save_job_id;
4499 	uint64_t save_db_index = job_ptr->db_index;
4500 	priority_factors_object_t *save_prio_factors;
4501 	List save_step_list;
4502 	int i;
4503 
4504 	job_ptr_pend = _create_job_record(0);
4505 	if (!job_ptr_pend)
4506 		return NULL;
4507 
4508 	_remove_job_hash(job_ptr, JOB_HASH_JOB);
4509 	job_ptr_pend->job_id = job_ptr->job_id;
4510 	if (_set_job_id(job_ptr) != SLURM_SUCCESS)
4511 		fatal("%s: _set_job_id error", __func__);
4512 	if (!job_ptr->array_recs) {
4513 		fatal_abort("%s: %pJ record lacks array structure",
4514 			    __func__, job_ptr);
4515 	}
4516 
4517 	/*
4518 	 * Copy most of original job data.
4519 	 * This could be done in parallel, but performance was worse.
4520 	 */
4521 	save_job_id   = job_ptr_pend->job_id;
4522 	save_job_next = job_ptr_pend->job_next;
4523 	save_details  = job_ptr_pend->details;
4524 	save_prio_factors = job_ptr_pend->prio_factors;
4525 	save_step_list = job_ptr_pend->step_list;
4526 	memcpy(job_ptr_pend, job_ptr, sizeof(job_record_t));
4527 
4528 	job_ptr_pend->job_id   = save_job_id;
4529 	job_ptr_pend->job_next = save_job_next;
4530 	job_ptr_pend->details  = save_details;
4531 	job_ptr_pend->db_flags = 0;
4532 	job_ptr_pend->step_list = save_step_list;
4533 	job_ptr_pend->db_index = save_db_index;
4534 
4535 	job_ptr_pend->prio_factors = save_prio_factors;
4536 	slurm_copy_priority_factors_object(job_ptr_pend->prio_factors,
4537 					   job_ptr->prio_factors);
4538 
4539 	job_ptr_pend->account = xstrdup(job_ptr->account);
4540 	job_ptr_pend->admin_comment = xstrdup(job_ptr->admin_comment);
4541 	job_ptr_pend->alias_list = xstrdup(job_ptr->alias_list);
4542 	job_ptr_pend->alloc_node = xstrdup(job_ptr->alloc_node);
4543 
4544 	job_ptr_pend->array_recs = job_ptr->array_recs;
4545 	job_ptr->array_recs = NULL;
4546 
4547 	if (job_ptr_pend->array_recs &&
4548 	    job_ptr_pend->array_recs->task_id_bitmap) {
4549 		bit_clear(job_ptr_pend->array_recs->task_id_bitmap,
4550 			  job_ptr_pend->array_task_id);
4551 	}
4552 	xfree(job_ptr_pend->array_recs->task_id_str);
4553 	if (job_ptr_pend->array_recs->task_cnt) {
4554 		job_ptr_pend->array_recs->task_cnt--;
4555 	} else {
4556 		error("%pJ array_recs->task_cnt underflow",
4557 		      job_ptr);
4558 	}
4559 	job_ptr_pend->array_task_id = NO_VAL;
4560 
4561 	job_ptr_pend->batch_host = NULL;
4562 	job_ptr_pend->burst_buffer = xstrdup(job_ptr->burst_buffer);
4563 	job_ptr_pend->burst_buffer_state = xstrdup(job_ptr->burst_buffer_state);
4564 	job_ptr_pend->clusters = xstrdup(job_ptr->clusters);
4565 	job_ptr_pend->comment = xstrdup(job_ptr->comment);
4566 
4567 	job_ptr_pend->fed_details = _dup_job_fed_details(job_ptr->fed_details);
4568 
4569 	job_ptr_pend->front_end_ptr = NULL;
4570 	/* struct job_details *details;		*** NOTE: Copied below */
4571 	if (job_ptr->gres_list) {
4572 		job_ptr_pend->gres_list =
4573 			gres_plugin_job_state_dup(job_ptr->gres_list);
4574 	}
4575 	job_ptr_pend->gres_detail_cnt = 0;
4576 	job_ptr_pend->gres_detail_str = NULL;
4577 	job_ptr_pend->gres_alloc = NULL;
4578 	job_ptr_pend->gres_req = NULL;
4579 	job_ptr_pend->gres_used = NULL;
4580 
4581 	job_ptr_pend->limit_set.tres = xcalloc(slurmctld_tres_cnt,
4582 					       sizeof(uint16_t));
4583 	memcpy(job_ptr_pend->limit_set.tres, job_ptr->limit_set.tres,
4584 	       sizeof(uint16_t) * slurmctld_tres_cnt);
4585 
4586 	_add_job_hash(job_ptr);		/* Sets job_next */
4587 	_add_job_hash(job_ptr_pend);	/* Sets job_next */
4588 	_add_job_array_hash(job_ptr);
4589 	job_ptr_pend->job_resrcs = NULL;
4590 
4591 	job_ptr_pend->licenses = xstrdup(job_ptr->licenses);
4592 	job_ptr_pend->license_list = license_job_copy(job_ptr->license_list);
4593 	job_ptr_pend->mail_user = xstrdup(job_ptr->mail_user);
4594 	job_ptr_pend->mcs_label = xstrdup(job_ptr->mcs_label);
4595 	job_ptr_pend->name = xstrdup(job_ptr->name);
4596 	job_ptr_pend->network = xstrdup(job_ptr->network);
4597 	job_ptr_pend->node_addr = NULL;
4598 	job_ptr_pend->node_bitmap = NULL;
4599 	job_ptr_pend->node_bitmap_cg = NULL;
4600 	job_ptr_pend->nodes = NULL;
4601 	job_ptr_pend->nodes_completing = NULL;
4602 	job_ptr_pend->origin_cluster = xstrdup(job_ptr->origin_cluster);
4603 	job_ptr_pend->partition = xstrdup(job_ptr->partition);
4604 	job_ptr_pend->part_ptr_list = part_list_copy(job_ptr->part_ptr_list);
4605 	/* On jobs that are held the priority_array isn't set up yet,
4606 	 * so check to see if it exists before copying. */
4607 	if (job_ptr->part_ptr_list && job_ptr->priority_array) {
4608 		i = list_count(job_ptr->part_ptr_list) * sizeof(uint32_t);
4609 		job_ptr_pend->priority_array = xmalloc(i);
4610 		memcpy(job_ptr_pend->priority_array,
4611 		       job_ptr->priority_array, i);
4612 	}
4613 	job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name);
4614 	job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host);
4615 	if (job_ptr->select_jobinfo) {
4616 		job_ptr_pend->select_jobinfo =
4617 			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
4618 	}
4619 	job_ptr_pend->sched_nodes = NULL;
4620 	if (job_ptr->spank_job_env_size) {
4621 		job_ptr_pend->spank_job_env =
4622 			xcalloc((job_ptr->spank_job_env_size + 1),
4623 				sizeof(char *));
4624 		for (i = 0; i < job_ptr->spank_job_env_size; i++) {
4625 			job_ptr_pend->spank_job_env[i] =
4626 				xstrdup(job_ptr->spank_job_env[i]);
4627 		}
4628 	}
4629 	job_ptr_pend->state_desc = xstrdup(job_ptr->state_desc);
4630 
4631 	job_ptr_pend->system_comment = xstrdup(job_ptr->system_comment);
4632 
4633 	i = sizeof(uint64_t) * slurmctld_tres_cnt;
4634 	job_ptr_pend->tres_req_cnt = xmalloc(i);
4635 	memcpy(job_ptr_pend->tres_req_cnt, job_ptr->tres_req_cnt, i);
4636 	job_ptr_pend->tres_req_str = xstrdup(job_ptr->tres_req_str);
4637 	job_ptr_pend->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_req_str);
4638 	job_ptr_pend->tres_alloc_str = NULL;
4639 	job_ptr_pend->tres_fmt_alloc_str = NULL;
4640 	job_ptr_pend->tres_alloc_cnt = NULL;
4641 
4642 	job_ptr_pend->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres);
4643 	job_ptr_pend->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
4644 	job_ptr_pend->tres_bind = xstrdup(job_ptr->tres_bind);
4645 	job_ptr_pend->tres_freq = xstrdup(job_ptr->tres_freq);
4646 	job_ptr_pend->tres_per_job = xstrdup(job_ptr->tres_per_job);
4647 	job_ptr_pend->tres_per_node = xstrdup(job_ptr->tres_per_node);
4648 	job_ptr_pend->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
4649 	job_ptr_pend->tres_per_task = xstrdup(job_ptr->tres_per_task);
4650 
4651 	job_ptr_pend->user_name = xstrdup(job_ptr->user_name);
4652 	job_ptr_pend->wckey = xstrdup(job_ptr->wckey);
4653 	job_ptr_pend->deadline = job_ptr->deadline;
4654 
4655 	job_details = job_ptr->details;
4656 	details_new = job_ptr_pend->details;
4657 	memcpy(details_new, job_details, sizeof(struct job_details));
4658 
4659 	/*
4660 	 * Reset the preempt_start_time or high priority array jobs will hang
4661 	 * for a period before preempting more jobs.
4662 	 */
4663 	details_new->preempt_start_time = 0;
4664 
4665 	details_new->acctg_freq = xstrdup(job_details->acctg_freq);
4666 	if (job_details->argc) {
4667 		details_new->argv =
4668 			xcalloc((job_details->argc + 1), sizeof(char *));
4669 		for (i = 0; i < job_details->argc; i++) {
4670 			details_new->argv[i] = xstrdup(job_details->argv[i]);
4671 		}
4672 	}
4673 	details_new->cpu_bind = xstrdup(job_details->cpu_bind);
4674 	details_new->cpu_bind_type = job_details->cpu_bind_type;
4675 	details_new->cpu_freq_min = job_details->cpu_freq_min;
4676 	details_new->cpu_freq_max = job_details->cpu_freq_max;
4677 	details_new->cpu_freq_gov = job_details->cpu_freq_gov;
4678 	details_new->depend_list = depended_list_copy(job_details->depend_list);
4679 	details_new->dependency = xstrdup(job_details->dependency);
4680 	details_new->orig_dependency = xstrdup(job_details->orig_dependency);
4681 	if (job_details->env_cnt) {
4682 		details_new->env_sup =
4683 			xcalloc((job_details->env_cnt + 1), sizeof(char *));
4684 		for (i = 0; i < job_details->env_cnt; i++) {
4685 			details_new->env_sup[i] =
4686 				xstrdup(job_details->env_sup[i]);
4687 		}
4688 	}
4689 	if (job_details->exc_node_bitmap) {
4690 		details_new->exc_node_bitmap =
4691 			bit_copy(job_details->exc_node_bitmap);
4692 	}
4693 	details_new->exc_nodes = xstrdup(job_details->exc_nodes);
4694 	details_new->feature_list =
4695 		feature_list_copy(job_details->feature_list);
4696 	details_new->features = xstrdup(job_details->features);
4697 	details_new->cluster_features = xstrdup(job_details->cluster_features);
4698 	if (job_details->mc_ptr) {
4699 		i = sizeof(multi_core_data_t);
4700 		details_new->mc_ptr = xmalloc(i);
4701 		memcpy(details_new->mc_ptr, job_details->mc_ptr, i);
4702 	}
4703 	details_new->mem_bind = xstrdup(job_details->mem_bind);
4704 	details_new->mem_bind_type = job_details->mem_bind_type;
4705 	if (job_details->req_node_bitmap) {
4706 		details_new->req_node_bitmap =
4707 			bit_copy(job_details->req_node_bitmap);
4708 	}
4709 	details_new->req_nodes = xstrdup(job_details->req_nodes);
4710 	details_new->std_err = xstrdup(job_details->std_err);
4711 	details_new->std_in = xstrdup(job_details->std_in);
4712 	details_new->std_out = xstrdup(job_details->std_out);
4713 	details_new->work_dir = xstrdup(job_details->work_dir);
4714 	details_new->x11_magic_cookie = xstrdup(job_details->x11_magic_cookie);
4715 
4716 	if (job_ptr->fed_details) {
4717 		add_fed_job_info(job_ptr);
4718 		/*
4719 		 * The new (split) job needs its remote dependencies tested
4720 		 * separately from just the meta job, so send remote
4721 		 * dependencies to siblings if needed.
4722 		 */
4723 		if (job_ptr->details->dependency &&
4724 		    job_ptr->details->depend_list)
4725 			fed_mgr_submit_remote_dependencies(job_ptr, false,
4726 							   false);
4727 	}
4728 
4729 	return job_ptr_pend;
4730 }
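
/*
 * Usage sketch (assumption): a scheduling path that wants to start one
 * task of a pending array first marks that task on the meta record, then
 * splits it out. On return job_ptr describes the single task and the
 * returned record is the new meta record for the remaining tasks.
 *
 *	job_ptr->array_task_id = task_id;
 *	new_meta = job_array_split(job_ptr);
 *	if (!new_meta)
 *		error("%s: job_array_split failed", __func__);
 */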
4731 
4732 /* Add job array data structure to the job record */
4733 static void _create_job_array(job_record_t *job_ptr, job_desc_msg_t *job_specs)
4734 {
4735 	struct job_details *details;
4736 	char *sep = NULL;
4737 	int max_run_tasks, min_task_id, max_task_id, step_task_id = 1, task_cnt;
4738 	uint32_t i_cnt;
4739 
4740 	if (!job_specs->array_bitmap)
4741 		return;
4742 
4743 	i_cnt = bit_set_count(job_specs->array_bitmap);
4744 	if (i_cnt == 0) {
4745 		info("%s: %pJ array_bitmap is empty", __func__, job_ptr);
4746 		return;
4747 	}
4748 
4749 	job_ptr->array_job_id = job_ptr->job_id;
4750 	job_ptr->array_recs = xmalloc(sizeof(job_array_struct_t));
4751 	min_task_id = bit_ffs(job_specs->array_bitmap);
4752 	max_task_id = bit_fls(job_specs->array_bitmap);
4753 	task_cnt = bit_set_count(job_specs->array_bitmap);
4754 	i_cnt = max_task_id + 1;
4755 	job_specs->array_bitmap = bit_realloc(job_specs->array_bitmap, i_cnt);
4756 	job_ptr->array_recs->task_id_bitmap = job_specs->array_bitmap;
4757 	job_specs->array_bitmap = NULL;
4758 	job_ptr->array_recs->task_cnt =
4759 		bit_set_count(job_ptr->array_recs->task_id_bitmap);
4760 	if (job_ptr->array_recs->task_cnt > 1)
4761 		job_count += (job_ptr->array_recs->task_cnt - 1);
4762 
4763 	if (job_specs->array_inx)
4764 		sep = strchr(job_specs->array_inx, '%');
4765 	if (sep) {
4766 		max_run_tasks = atoi(sep + 1);
4767 		if (max_run_tasks > 0)
4768 			job_ptr->array_recs->max_run_tasks = max_run_tasks;
4769 	}
4770 
4771 	details = job_ptr->details;
4772 	if (details) {
4773 		if (job_specs->array_inx) {
4774 			sep = strchr(job_specs->array_inx, ':');
4775 			if (sep)
4776 				step_task_id = atoi(sep + 1);
4777 		}
4778 		details->env_sup = xrealloc(details->env_sup,
4779 					    (sizeof(char *) *
4780 					    (details->env_cnt + 4)));
4781 		xstrfmtcat(details->env_sup[details->env_cnt++],
4782 			   "SLURM_ARRAY_TASK_COUNT=%d", task_cnt);
4783 		xstrfmtcat(details->env_sup[details->env_cnt++],
4784 			   "SLURM_ARRAY_TASK_MIN=%d", min_task_id);
4785 		xstrfmtcat(details->env_sup[details->env_cnt++],
4786 			   "SLURM_ARRAY_TASK_MAX=%d", max_task_id);
4787 		xstrfmtcat(details->env_sup[details->env_cnt++],
4788 			   "SLURM_ARRAY_TASK_STEP=%d", step_task_id);
4789 	}
4790 }
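
/*
 * Example (sketch): an array specification such as "1-99:2%4" would, per
 * the parsing above, set step_task_id = 2 (the text after ':') and
 * max_run_tasks = 4 (the text after '%'), and the supplemental
 * environment would carry values along the lines of
 *
 *	SLURM_ARRAY_TASK_COUNT=50
 *	SLURM_ARRAY_TASK_MIN=1
 *	SLURM_ARRAY_TASK_MAX=99
 *	SLURM_ARRAY_TASK_STEP=2
 */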
4791 
4792 /*
4793  * Wrapper for select_nodes() function that will test all valid partitions
4794  * for a new job
4795  * IN job_ptr - pointer to the job record
4796  * IN test_only - if set do not allocate nodes, just confirm they
4797  *	could be allocated now
4798  * IN select_node_bitmap - bitmap of nodes to be used for the
4799  *	job's resource allocation (not returned if NULL), caller
4800  *	must free
4801  * OUT err_msg - error message for job, caller must xfree
4802  */
4803 static int _select_nodes_parts(job_record_t *job_ptr, bool test_only,
4804 			       bitstr_t **select_node_bitmap, char **err_msg)
4805 {
4806 	part_record_t *part_ptr;
4807 	ListIterator iter;
4808 	int rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
4809 	int best_rc = -1, part_limits_rc = WAIT_NO_REASON;
4810 	bitstr_t *save_avail_node_bitmap = NULL;
4811 
4812 	save_avail_node_bitmap = bit_copy(avail_node_bitmap);
4813 	bit_or(avail_node_bitmap, rs_node_bitmap);
4814 
4815 	if (job_ptr->part_ptr_list) {
4816 		list_sort(job_ptr->part_ptr_list, priority_sort_part_tier);
4817 		iter = list_iterator_create(job_ptr->part_ptr_list);
4818 		while ((part_ptr = list_next(iter))) {
4819 			job_ptr->part_ptr = part_ptr;
4820 			debug2("Try %pJ on next partition %s",
4821 			       job_ptr, part_ptr->name);
4822 
4823 			part_limits_rc = job_limits_check(&job_ptr, false);
4824 
4825 			if ((part_limits_rc != WAIT_NO_REASON) &&
4826 			    (slurmctld_conf.enforce_part_limits ==
4827 			     PARTITION_ENFORCE_ANY))
4828 				continue;
4829 			if ((part_limits_rc != WAIT_NO_REASON) &&
4830 			    (slurmctld_conf.enforce_part_limits ==
4831 			     PARTITION_ENFORCE_ALL)) {
4832 				if (part_limits_rc != WAIT_PART_DOWN) {
4833 					best_rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
4834 					break;
4835 				} else {
4836 					best_rc = ESLURM_PARTITION_DOWN;
4837 				}
4838 			}
4839 
4840 			if (part_limits_rc == WAIT_NO_REASON) {
4841 				rc = select_nodes(job_ptr, test_only,
4842 						  select_node_bitmap, err_msg,
4843 						  true,
4844 						  SLURMDB_JOB_FLAG_SUBMIT);
4845 			} else {
4846 				rc = select_nodes(job_ptr, true,
4847 						  select_node_bitmap, err_msg,
4848 						  true,
4849 						  SLURMDB_JOB_FLAG_SUBMIT);
4850 				if ((rc == SLURM_SUCCESS) &&
4851 				    (part_limits_rc == WAIT_PART_DOWN))
4852 					rc = ESLURM_PARTITION_DOWN;
4853 			}
4854 			if ((rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
4855 			    (slurmctld_conf.enforce_part_limits ==
4856 			     PARTITION_ENFORCE_ALL)) {
4857 				best_rc = rc;	/* Job can not run */
4858 				break;
4859 			}
4860 			if ((rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
4861 			    (rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
4862 			    (rc != ESLURM_RESERVATION_BUSY) &&
4863 			    (rc != ESLURM_NODES_BUSY)) {
4864 				best_rc = rc;	/* Job can run now */
4865 				if ((slurmctld_conf.enforce_part_limits ==
4866 				     PARTITION_ENFORCE_ANY) ||
4867 				    (slurmctld_conf.enforce_part_limits ==
4868 				     PARTITION_ENFORCE_NONE) ||
4869 				    (!test_only &&
4870 				     (part_limits_rc == WAIT_NO_REASON))) {
4871 					break;
4872 				}
4873 			}
4874 			if (((rc == ESLURM_NODES_BUSY) ||
4875 			     (rc == ESLURM_RESERVATION_BUSY)) &&
4876 			    (best_rc == -1) &&
4877 			    ((slurmctld_conf.enforce_part_limits ==
4878 			      PARTITION_ENFORCE_ANY) ||
4879 			     (slurmctld_conf.enforce_part_limits ==
4880 			      PARTITION_ENFORCE_NONE))) {
4881 				if (test_only)
4882 					break;
4883 				best_rc = rc;	/* Keep looking for partition
4884 						 * where job can start now */
4885 			}
4886 			if ((job_ptr->preempt_in_progress) &&
4887 			    (rc != ESLURM_NODES_BUSY)) {
4888 				/* Already started preempting jobs, don't
4889 				 * consider starting this job in another
4890 			 * partition as we iterate over the others. */
4891 				test_only = true;
4892 			}
4893 		}
4894 		list_iterator_destroy(iter);
4895 		if (best_rc != -1)
4896 			rc = best_rc;
4897 		else if (part_limits_rc == WAIT_PART_DOWN)
4898 			rc = ESLURM_PARTITION_DOWN;
4899 	} else {
4900 		part_limits_rc = job_limits_check(&job_ptr, false);
4901 		if (part_limits_rc == WAIT_NO_REASON) {
4902 			rc = select_nodes(job_ptr, test_only,
4903 					  select_node_bitmap, err_msg, true,
4904 					  SLURMDB_JOB_FLAG_SUBMIT);
4905 		} else if (part_limits_rc == WAIT_PART_DOWN) {
4906 			rc = select_nodes(job_ptr, true,
4907 					  select_node_bitmap, err_msg, true,
4908 					  SLURMDB_JOB_FLAG_SUBMIT);
4909 			if (rc == SLURM_SUCCESS)
4910 				rc = ESLURM_PARTITION_DOWN;
4911 		}
4912 	}
4913 
4914 	if (rc == ESLURM_NODES_BUSY)
4915 		job_ptr->state_reason = WAIT_RESOURCES;
4916 	else if ((rc == ESLURM_RESERVATION_BUSY) ||
4917 		 (rc == ESLURM_RESERVATION_NOT_USABLE))
4918 		job_ptr->state_reason = WAIT_RESERVATION;
4919 	else if (rc == ESLURM_JOB_HELD)
4920 		/* Do not reset the state_reason field here. select_nodes()
4921 		 * already set the state_reason field, and this error code
4922 		 * does not distinguish between user and admin holds. */
4923 		;
4924 	else if (rc == ESLURM_NODE_NOT_AVAIL)
4925 		job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
4926 	else if (rc == ESLURM_QOS_THRES)
4927 		job_ptr->state_reason = WAIT_QOS_THRES;
4928 	else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)
4929 		job_ptr->state_reason = WAIT_PART_CONFIG;
4930 	else if (rc == ESLURM_POWER_NOT_AVAIL)
4931 		job_ptr->state_reason = WAIT_POWER_NOT_AVAIL;
4932 	else if (rc == ESLURM_BURST_BUFFER_WAIT)
4933 		job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE;
4934 	else if (rc == ESLURM_POWER_RESERVED)
4935 		job_ptr->state_reason = WAIT_POWER_RESERVED;
4936 	else if (rc == ESLURM_PARTITION_DOWN)
4937 		job_ptr->state_reason = WAIT_PART_DOWN;
4938 	else if (rc == ESLURM_INVALID_QOS)
4939 		job_ptr->state_reason = FAIL_QOS;
4940 	else if (rc == ESLURM_INVALID_ACCOUNT)
4941 		job_ptr->state_reason = FAIL_ACCOUNT;
4942 
4943 	FREE_NULL_BITMAP(avail_node_bitmap);
4944 	avail_node_bitmap = save_avail_node_bitmap;
4945 
4946 	return rc;
4947 }
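
/*
 * Illustrative note (sketch): for a request naming several partitions,
 * e.g. "sbatch -p debug,batch ...", the loop above walks the partitions
 * in priority-tier order. With EnforcePartLimits=ANY a partition that
 * fails job_limits_check() is simply skipped; with EnforcePartLimits=ALL
 * a hard limit failure rejects the whole request with
 * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE, while a down partition maps
 * to ESLURM_PARTITION_DOWN.
 */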
4948 
4949 static inline bool _has_deadline(job_record_t *job_ptr)
4950 {
4951 	if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
4952 		queue_job_scheduler();
4953 		return true;
4954 	}
4955 	return false;
4956 }
4957 
4958 /*
4959  * job_allocate - create job_records for the supplied job specification and
4960  *	allocate nodes for it.
4961  * IN job_specs - job specifications
4962  * IN immediate - if set then either initiate the job immediately or fail
4963  * IN will_run - don't initiate the job if set, just test if it could run
4964  *	now or later
4965  * OUT resp - will run response (includes start location, time, etc.)
4966  * IN allocate - resource allocation request only if set, batch job if zero
4967  * IN submit_uid - uid of the user issuing the request
4968  * OUT job_pptr - set to pointer to job record
4969  * OUT err_msg - Custom error message to the user, caller to xfree results
4970  * IN protocol_version - version of the code the caller is using
4971  * RET 0 or an error code. If the job would only be able to execute with
4972  *	some change in partition configuration then
4973  *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
4974  * globals: job_list - pointer to global job list
4975  *	list_part - global list of partition info
4976  *	default_part_loc - pointer to default partition
4977  */
4978 extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
4979 			int will_run, will_run_response_msg_t **resp,
4980 			int allocate, uid_t submit_uid,
4981 			job_record_t **job_pptr, char **err_msg,
4982 			uint16_t protocol_version)
4983 {
4984 	static time_t sched_update = 0;
4985 	static int defer_sched = 0;
4986 	char *sched_params, *tmp_ptr;
4987 	int error_code, i;
4988 	bool no_alloc, top_prio, test_only, too_fragmented, independent;
4989 	job_record_t *job_ptr;
4990 	time_t now = time(NULL);
4991 	bool held_user = false;
4992 
4993 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
4994 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
4995 	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
4996 	xassert(verify_lock(PART_LOCK, READ_LOCK));
4997 
4998 	if (sched_update != slurmctld_conf.last_update) {
4999 		sched_update = slurmctld_conf.last_update;
5000 		sched_params = slurm_get_sched_params();
5001 		if (xstrcasestr(sched_params, "defer"))
5002 			defer_sched = 1;
5003 		else
5004 			defer_sched = 0;
5005 		if ((tmp_ptr = xstrcasestr(sched_params, "delay_boot="))) {
5006 			char *tmp_comma;
5007 			if ((tmp_comma = xstrstr(tmp_ptr, ",")))
5008 				*tmp_comma = '\0';
5009 			i = time_str2secs(tmp_ptr + 11);
5010 			if (i != NO_VAL)
5011 				delay_boot = i;
5012 			if (tmp_comma)
5013 				*tmp_comma = ',';
5014 		}
5015 		bf_min_age_reserve = 0;
5016 		if ((tmp_ptr = xstrcasestr(sched_params,
5017 					   "bf_min_age_reserve="))) {
5018 			int min_age = atoi(tmp_ptr + 19);
5019 			if (min_age > 0)
5020 				bf_min_age_reserve = min_age;
5021 		}
5022 
5023 		if (xstrcasestr(sched_params, "allow_zero_lic"))
5024 			validate_cfgd_licenses = false;
5025 
5026 		xfree(sched_params);
5027 	}
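
	/*
	 * Example (sketch, hypothetical slurm.conf fragment) of the options
	 * parsed above:
	 *
	 *	SchedulerParameters=defer,delay_boot=10:00,bf_min_age_reserve=300
	 *
	 * "defer" suppresses the immediate scheduling attempt further below,
	 * delay_boot= is converted with time_str2secs(), and
	 * bf_min_age_reserve= is read as whole seconds with atoi().
	 */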
5028 
5029 	if (job_specs->array_bitmap)
5030 		i = bit_set_count(job_specs->array_bitmap);
5031 	else
5032 		i = 1;
5033 
5034 	if ((job_count + i) >= slurmctld_conf.max_job_cnt) {
5035 		error("%s: MaxJobCount limit from slurm.conf reached (%u)",
5036 		      __func__, slurmctld_conf.max_job_cnt);
5037 		return EAGAIN;
5038 	}
5039 
5040 	error_code = _job_create(job_specs, allocate, will_run,
5041 				 &job_ptr, submit_uid, err_msg,
5042 				 protocol_version);
5043 	*job_pptr = job_ptr;
5044 	if (error_code) {
5045 		if (job_ptr && (immediate || will_run)) {
5046 			/* this should never really happen here */
5047 			job_ptr->job_state = JOB_FAILED;
5048 			job_ptr->exit_code = 1;
5049 			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
5050 			xfree(job_ptr->state_desc);
5051 			job_ptr->start_time = job_ptr->end_time = now;
5052 			job_completion_logger(job_ptr, false);
5053 			error("%s: setting %pJ to \"%s\"",
5054 			      __func__, job_ptr,
5055 			      job_reason_string(job_ptr->state_reason));
5056 		}
5057 		return error_code;
5058 	}
5059 	xassert(job_ptr);
5060 	if (job_specs->array_bitmap)
5061 		independent = false;
5062 	else
5063 		independent = job_independent(job_ptr);
5064 	/*
5065 	 * priority needs to be calculated after this since we set a
5066 	 * begin time in job_independent and that lets us know if the
5067 	 * job is eligible.
5068 	 */
5069 	if (job_ptr->priority == NO_VAL)
5070 		set_job_prio(job_ptr);
5071 
5072 	if (job_ptr->state_reason == WAIT_HELD_USER)
5073 		held_user = true;
5074 
5075 	if (independent &&
5076 	    (license_job_test(job_ptr, time(NULL), true) != SLURM_SUCCESS))
5077 		independent = false;
5078 
5079 	/* Avoid resource fragmentation if important */
5080 	if ((submit_uid || (job_specs->req_nodes == NULL)) &&
5081 	    independent && job_is_completing(NULL))
5082 		too_fragmented = true;	/* Don't pick nodes for job now */
5083 	/*
5084 	 * FIXME: Ideally we only want to refuse the request if the
5085 	 * required node list is insufficient to satisfy the job's
5086 	 * processor or node count requirements, but the overhead is
5087 	 * rather high to do that right here. We let requests from
5088 	 * user root proceed if a node list is specified, for
5089 	 * meta-schedulers (e.g. Maui, Moab, etc.).
5090 	 */
5091 	else
5092 		too_fragmented = false;
5093 
5094 	if (independent && (!too_fragmented) && !defer_sched)
5095 		top_prio = _top_priority(job_ptr, job_specs->het_job_offset);
5096 	else
5097 		top_prio = true;	/* don't bother testing,
5098 					 * it is not runnable anyway */
5099 
5100 	if (immediate &&
5101 	    (too_fragmented || (!top_prio) || (!independent) || defer_sched)) {
5102 		job_ptr->job_state  = JOB_FAILED;
5103 		job_ptr->exit_code  = 1;
5104 		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
5105 		xfree(job_ptr->state_desc);
5106 		job_ptr->start_time = job_ptr->end_time = now;
5107 		job_completion_logger(job_ptr, false);
5108 		if (!independent) {
5109 			debug2("%s: setting %pJ to \"%s\" due to dependency (%s)",
5110 			       __func__, job_ptr,
5111 			       job_reason_string(job_ptr->state_reason),
5112 			       slurm_strerror(ESLURM_DEPENDENCY));
5113 			return ESLURM_DEPENDENCY;
5114 		}
5115 		else if (too_fragmented) {
5116 			debug2("%s: setting %pJ to \"%s\" due to fragmentation (%s)",
5117 			       __func__, job_ptr,
5118 			       job_reason_string(job_ptr->state_reason),
5119 			       slurm_strerror(ESLURM_FRAGMENTATION));
5120 			return ESLURM_FRAGMENTATION;
5121 		}
5122 		else if (!top_prio) {
5123 			debug2("%s: setting %pJ to \"%s\" because it's not top priority (%s)",
5124 			       __func__, job_ptr,
5125 			       job_reason_string(job_ptr->state_reason),
5126 			       slurm_strerror(ESLURM_NOT_TOP_PRIORITY));
5127 			return ESLURM_NOT_TOP_PRIORITY;
5128 		} else {
5129 			job_ptr->state_reason = FAIL_DEFER;
5130 			debug2("%s: setting %pJ to \"%s\" due to SchedulerParameters=defer (%s)",
5131 			       __func__, job_ptr,
5132 			       job_reason_string(job_ptr->state_reason),
5133 			       slurm_strerror(ESLURM_DEFER));
5134 			return ESLURM_DEFER;
5135 		}
5136 	}
5137 
5138 	if (will_run && resp) {
5139 		job_desc_msg_t job_desc_msg;
5140 		int rc;
5141 		slurm_init_job_desc_msg(&job_desc_msg);
5142 		job_desc_msg.job_id = job_ptr->job_id;
5143 		rc = job_start_data(&job_desc_msg, resp);
5144 		job_ptr->job_state  = JOB_FAILED;
5145 		job_ptr->exit_code  = 1;
5146 		job_ptr->start_time = job_ptr->end_time = now;
5147 		purge_job_record(job_ptr->job_id);
5148 		return rc;
5149 	}
5150 
5151 	/*
5152 	 * fed jobs need to go to the siblings first so don't attempt to
5153 	 * schedule the job now.
5154 	 */
5155 	test_only = will_run || job_ptr->deadline || (allocate == 0) ||
5156 		    job_ptr->fed_details;
5157 
5158 	no_alloc = test_only || too_fragmented || _has_deadline(job_ptr) ||
5159 		   (!top_prio) || (!independent) || !avail_front_end(job_ptr) ||
5160 		   (job_specs->het_job_offset != NO_VAL) || defer_sched;
5161 
5162 	no_alloc = no_alloc || (bb_g_job_test_stage_in(job_ptr, no_alloc) != 1);
5163 
5164 	error_code = _select_nodes_parts(job_ptr, no_alloc, NULL, err_msg);
5165 	if (!test_only) {
5166 		last_job_update = now;
5167 	}
5168 
5169 	if (held_user)
5170 		job_ptr->state_reason = WAIT_HELD_USER;
5171 	/*
5172 	 * Create the job array records (_create_job_array) here rather
5173 	 * than earlier, since we want to know the array task count when
5174 	 * we check the job against QOS/Assoc limits.
5175 	 */
5177 	_create_job_array(job_ptr, job_specs);
5178 
5179 	slurmctld_diag_stats.jobs_submitted +=
5180 		(job_ptr->array_recs && job_ptr->array_recs->task_cnt) ?
5181 		job_ptr->array_recs->task_cnt : 1;
5182 
5183 	acct_policy_add_job_submit(job_ptr);
5184 
5185 	if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
5186 	    (slurmctld_conf.enforce_part_limits != PARTITION_ENFORCE_NONE))
5187 		;	/* Reject job submission */
5188 	else if ((error_code == ESLURM_NODES_BUSY) ||
5189 		 (error_code == ESLURM_RESERVATION_BUSY) ||
5190 		 (error_code == ESLURM_JOB_HELD) ||
5191 		 (error_code == ESLURM_NODE_NOT_AVAIL) ||
5192 		 (error_code == ESLURM_QOS_THRES) ||
5193 		 (error_code == ESLURM_ACCOUNTING_POLICY) ||
5194 		 (error_code == ESLURM_RESERVATION_NOT_USABLE) ||
5195 		 (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) ||
5196 		 (error_code == ESLURM_POWER_NOT_AVAIL) ||
5197 		 (error_code == ESLURM_BURST_BUFFER_WAIT) ||
5198 		 (error_code == ESLURM_POWER_RESERVED) ||
5199 		 (error_code == ESLURM_PARTITION_DOWN)) {
5200 		/* Not fatal error, but job can't be scheduled right now */
5201 		if (immediate) {
5202 			job_ptr->job_state  = JOB_FAILED;
5203 			job_ptr->exit_code  = 1;
5204 			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
5205 			xfree(job_ptr->state_desc);
5206 			job_ptr->start_time = job_ptr->end_time = now;
5207 			job_completion_logger(job_ptr, false);
5208 			debug2("%s: setting %pJ to \"%s\" because it cannot be immediately allocated (%s)",
5209 			       __func__, job_ptr,
5210 			       job_reason_string(job_ptr->state_reason),
5211 			       slurm_strerror(error_code));
5212 		} else {	/* job remains queued */
5213 			if ((error_code == ESLURM_NODES_BUSY) ||
5214 			    (error_code == ESLURM_BURST_BUFFER_WAIT) ||
5215 			    (error_code == ESLURM_RESERVATION_BUSY) ||
5216 			    (error_code == ESLURM_ACCOUNTING_POLICY) ||
5217 			    ((error_code == ESLURM_PARTITION_DOWN) &&
5218 			    (job_ptr->batch_flag))) {
5219 				error_code = SLURM_SUCCESS;
5220 			}
5221 		}
5222 		return error_code;
5223 	}
5224 
5225 	if (error_code) {	/* fundamental flaw in job request */
5226 		job_ptr->job_state  = JOB_FAILED;
5227 		job_ptr->exit_code  = 1;
5228 		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
5229 		xfree(job_ptr->state_desc);
5230 		job_ptr->start_time = job_ptr->end_time = now;
5231 		job_completion_logger(job_ptr, false);
5232 		debug2("%s: setting %pJ to \"%s\" due to a flaw in the job request (%s)",
5233 		       __func__, job_ptr,
5234 		       job_reason_string(job_ptr->state_reason),
5235 		       slurm_strerror(error_code));
5236 		return error_code;
5237 	}
5238 
5239 	if (will_run) {		/* job would run, flag job destruction */
5240 		job_ptr->job_state  = JOB_FAILED;
5241 		job_ptr->exit_code  = 1;
5242 		job_ptr->start_time = job_ptr->end_time = now;
5243 		purge_job_record(job_ptr->job_id);
5244 	} else if (!with_slurmdbd)
5245 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
5246 
5247 	if (!will_run) {
5248 		sched_debug2("%pJ allocated resources: NodeList=%s",
5249 			     job_ptr, job_ptr->nodes);
5250 		rebuild_job_part_list(job_ptr);
5251 	}
5252 
5253 	return SLURM_SUCCESS;
5254 }
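
/*
 * Usage sketch (assumption): a batch-submit RPC handler would typically
 * call this with immediate = 0, will_run = 0 and allocate = 0, while
 * holding the config/job/node/partition locks asserted at the top of the
 * function:
 *
 *	error_code = job_allocate(job_desc, 0, 0, NULL, 0, uid,
 *				  &job_ptr, &err_msg, protocol_version);
 */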
5255 
5256 /*
5257  * _job_fail - terminate a job due to initiation failure
5258  * IN job_ptr - Pointer to job to be killed
5259  * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
5260  * RET 0 on success, otherwise ESLURM error code
5261  */
5262 static int _job_fail(job_record_t *job_ptr, uint32_t job_state)
5263 {
5264 	time_t now = time(NULL);
5265 	bool suspended = false;
5266 
5267 	if (IS_JOB_FINISHED(job_ptr))
5268 		return ESLURM_ALREADY_DONE;
5269 	if (IS_JOB_SUSPENDED(job_ptr)) {
5270 		uint32_t suspend_job_state = job_ptr->job_state;
5271 		/*
5272 		 * we can't have it as suspended when we call the
5273 		 * accounting stuff.
5274 		 */
5275 		job_ptr->job_state = JOB_CANCELLED;
5276 		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
5277 		job_ptr->job_state = suspend_job_state;
5278 		suspended = true;
5279 	}
5280 
5281 	if (IS_JOB_CONFIGURING(job_ptr) || IS_JOB_RUNNING(job_ptr) ||
5282 	    suspended) {
5283 		/* No need to signal steps, deallocate kills them */
5284 		job_ptr->time_last_active       = now;
5285 		if (suspended) {
5286 			job_ptr->end_time       = job_ptr->suspend_time;
5287 			job_ptr->tot_sus_time  +=
5288 				difftime(now, job_ptr->suspend_time);
5289 		} else
5290 			job_ptr->end_time       = now;
5291 		last_job_update                 = now;
5292 		job_ptr->job_state = job_state | JOB_COMPLETING;
5293 		job_ptr->exit_code = 1;
5294 		job_ptr->state_reason = FAIL_LAUNCH;
5295 		xfree(job_ptr->state_desc);
5296 		job_completion_logger(job_ptr, false);
5297 		if (job_ptr->node_bitmap) {
5298 			build_cg_bitmap(job_ptr);
5299 			deallocate_nodes(job_ptr, false, suspended, false);
5300 		}
5301 		return SLURM_SUCCESS;
5302 	}
5303 	/* All other states */
5304 	verbose("job_fail: %pJ can't be killed from state=%s",
5305 		job_ptr, job_state_string(job_ptr->job_state));
5306 
5307 	return ESLURM_TRANSITION_STATE_NO_UPDATE;
5308 
5309 }
5310 
5311 /*
5312  * job_fail - terminate a job due to initiation failure
5313  * IN job_id - ID of the job to be killed
5314  * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
5315  * RET 0 on success, otherwise ESLURM error code
5316  */
5317 extern int job_fail(uint32_t job_id, uint32_t job_state)
5318 {
5319 	job_record_t *job_ptr, *het_job, *het_job_leader;
5320 	ListIterator iter;
5321 	int rc = SLURM_SUCCESS, rc1;
5322 
5323 	job_ptr = find_job_record(job_id);
5324 	if (job_ptr == NULL) {
5325 		error("job_fail: invalid JobId=%u", job_id);
5326 		return ESLURM_INVALID_JOB_ID;
5327 	}
5328 
5329 	if (job_ptr->het_job_id == 0)
5330 		return _job_fail(job_ptr, job_state);
5331 
5332 	het_job_leader = find_job_record(job_ptr->het_job_id);
5333 	if (!het_job_leader) {
5334 		error("%s: Hetjob leader %pJ not found",
5335 		      __func__, job_ptr);
5336 		return _job_fail(job_ptr, job_state);
5337 	}
5338 	if (!het_job_leader->het_job_list) {
5339 		error("%s: Hetjob leader %pJ job list is NULL",
5340 		      __func__, job_ptr);
5341 		return _job_fail(job_ptr, job_state);
5342 	}
5343 
5344 	iter = list_iterator_create(het_job_leader->het_job_list);
5345 	while ((het_job = list_next(iter))) {
5346 		if (het_job_leader->het_job_id != het_job->het_job_id) {
5347 			error("%s: Bad het_job_list for %pJ",
5348 			      __func__, het_job_leader);
5349 			continue;
5350 		}
5351 		rc1 = _job_fail(het_job, job_state);
5352 		if (rc1 != SLURM_SUCCESS)
5353 			rc = rc1;
5354 	}
5355 	list_iterator_destroy(iter);
5356 
5357 	return rc;
5358 }
5359 
5360 /*
5361  * Signal a job based upon job pointer.
5362  * Authentication and authorization checks must be performed before calling.
5363  */
5364 extern int job_signal(job_record_t *job_ptr, uint16_t signal,
5365 		      uint16_t flags, uid_t uid, bool preempt)
5366 {
5367 	uint16_t job_term_state;
5368 	time_t now = time(NULL);
5369 
5370 	trace_job(job_ptr, __func__, "enter");
5371 
5372 	if (IS_JOB_STAGE_OUT(job_ptr) && (flags & KILL_HURRY)) {
5373 		job_ptr->bit_flags |= JOB_KILL_HURRY;
5374 		return bb_g_job_cancel(job_ptr);
5375 	}
5376 
5377 	if (IS_JOB_FINISHED(job_ptr))
5378 		return ESLURM_ALREADY_DONE;
5379 
5380 	/*
5381 	 * If is origin job then cancel siblings -- if they exist.
5382 	 * If this is the origin job then cancel its siblings -- if they
5383 	 * exist. The origin job is used because it knows where the siblings
5384 	 * are. If the job is running locally just do the normal signaling.
5385 	if (!(flags & KILL_NO_SIBS) && !IS_JOB_RUNNING(job_ptr) &&
5386 	    job_ptr->fed_details && fed_mgr_fed_rec) {
5387 		uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
5388 		slurmdb_cluster_rec_t *origin =
5389 			fed_mgr_get_cluster_by_id(origin_id);
5390 
5391 		if (origin && (origin == fed_mgr_cluster_rec) &&
5392 		    fed_mgr_job_started_on_sib(job_ptr)) {
5393 			/*
5394 			 * If the job is running on a remote cluster then wait
5395 			 * for the job to report back that it's completed,
5396 			 * otherwise just signal the pending siblings and itself
5397 			 * (by not returning).
5398 			 */
5399 			return fed_mgr_job_cancel(job_ptr, signal, flags, uid,
5400 						  false);
5401 		} else if (origin && (origin == fed_mgr_cluster_rec)) {
5402 			/* cancel origin job and revoke sibling jobs */
5403 			fed_mgr_job_revoke_sibs(job_ptr);
5404 			fed_mgr_remove_remote_dependencies(job_ptr);
5405 		} else if (!origin ||
5406 			   !origin->fed.send ||
5407 			   (((slurm_persist_conn_t *)origin->fed.send)->fd
5408 			    == -1)) {
5409 			/*
5410 			 * The origin is down, so just signal all of the
5411 			 * viable sibling jobs.
5412 			 */
5413 			fed_mgr_job_cancel(job_ptr, signal, flags, uid, true);
5414 		}
5415 	}
5416 
5417 	/* let node select plugin do any state-dependent signaling actions */
5418 	select_g_job_signal(job_ptr, signal);
5419 	last_job_update = now;
5420 
5421 	/* save user ID of the one who requested the job be cancelled */
5422 	if (signal == SIGKILL)
5423 		job_ptr->requid = uid;
5424 	if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) &&
5425 	    (signal == SIGKILL)) {
5426 		/* Prevent job requeue, otherwise preserve state */
5427 		job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
5428 
5429 		/* build_cg_bitmap() not needed, job already completing */
5430 		verbose("%s: %u of requeuing %pJ successful",
5431 			__func__, signal, job_ptr);
5432 		return SLURM_SUCCESS;
5433 	}
5434 
5435 	if (flags & KILL_HURRY)
5436 		job_ptr->bit_flags |= JOB_KILL_HURRY;
5437 
5438 	if (IS_JOB_CONFIGURING(job_ptr) && (signal == SIGKILL)) {
5439 		last_job_update         = now;
5440 		job_ptr->end_time       = now;
5441 		job_ptr->job_state      = JOB_CANCELLED | JOB_COMPLETING;
5442 		if (flags & KILL_FED_REQUEUE)
5443 			job_ptr->job_state |= JOB_REQUEUE;
5444 		build_cg_bitmap(job_ptr);
5445 		job_completion_logger(job_ptr, false);
5446 		deallocate_nodes(job_ptr, false, false, false);
5447 		if (flags & KILL_FED_REQUEUE) {
5448 			job_ptr->job_state &= (~JOB_REQUEUE);
5449 		}
5450 		verbose("%s: %u of configuring %pJ successful",
5451 			__func__, signal, job_ptr);
5452 		return SLURM_SUCCESS;
5453 	}
5454 
5455 	if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) {
5456 		job_ptr->job_state	= JOB_CANCELLED;
5457 		if (flags & KILL_FED_REQUEUE)
5458 			job_ptr->job_state |= JOB_REQUEUE;
5459 		job_ptr->start_time	= now;
5460 		job_ptr->end_time	= now;
5461 		srun_allocate_abort(job_ptr);
5462 		job_completion_logger(job_ptr, false);
5463 		if (flags & KILL_FED_REQUEUE) {
5464 			job_ptr->job_state &= (~JOB_REQUEUE);
5465 		}
5466 		/*
5467 		 * Send back a response to the origin cluster. In other cases,
5468 		 * where the job is running, the response is sent back after
5469 		 * the job is completed. This can happen when the pending
5470 		 * origin job is put into a hold state and the siblings are
5471 		 * removed, or when the job is canceled from the origin.
5472 		 */
5473 		fed_mgr_job_complete(job_ptr, 0, now);
5474 		verbose("%s: %u of pending %pJ successful",
5475 			__func__, signal, job_ptr);
5476 		return SLURM_SUCCESS;
5477 	}
5478 
5479 	if (preempt)
5480 		job_term_state = JOB_PREEMPTED;
5481 	else
5482 		job_term_state = JOB_CANCELLED;
5483 	if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) {
5484 		last_job_update         = now;
5485 		job_ptr->end_time       = job_ptr->suspend_time;
5486 		job_ptr->tot_sus_time  += difftime(now, job_ptr->suspend_time);
5487 		job_ptr->job_state      = job_term_state | JOB_COMPLETING;
5488 		if (flags & KILL_FED_REQUEUE)
5489 			job_ptr->job_state |= JOB_REQUEUE;
5490 		build_cg_bitmap(job_ptr);
5491 		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
5492 		job_completion_logger(job_ptr, false);
5493 		if (flags & KILL_FED_REQUEUE)
5494 			job_ptr->job_state &= (~JOB_REQUEUE);
5495 		deallocate_nodes(job_ptr, false, true, preempt);
5496 		verbose("%s: %u of suspended %pJ successful",
5497 			__func__, signal, job_ptr);
5498 		return SLURM_SUCCESS;
5499 	}
5500 
5501 	if (IS_JOB_RUNNING(job_ptr)) {
5502 
5503 		if ((signal == SIGSTOP) || (signal == SIGCONT)) {
5504 			if (IS_JOB_SIGNALING(job_ptr)) {
5505 				verbose("%s: %u not sent to %pJ 0x%x",
5506 					__func__, signal, job_ptr,
5507 					job_ptr->job_state);
5508 				return ESLURM_TRANSITION_STATE_NO_UPDATE;
5509 			}
5510 			job_ptr->job_state |= JOB_SIGNALING;
5511 		}
5512 
5513 		if ((signal == SIGKILL)
5514 		    && !(flags & KILL_STEPS_ONLY)
5515 		    && !(flags & KILL_JOB_BATCH)) {
5516 			/* No need to signal steps, deallocate kills them
5517 			 */
5518 			job_ptr->time_last_active	= now;
5519 			job_ptr->end_time		= now;
5520 			last_job_update			= now;
5521 			job_ptr->job_state = job_term_state | JOB_COMPLETING;
5522 			if (flags & KILL_FED_REQUEUE)
5523 				job_ptr->job_state |= JOB_REQUEUE;
5524 			build_cg_bitmap(job_ptr);
5525 			job_completion_logger(job_ptr, false);
5526 			deallocate_nodes(job_ptr, false, false, preempt);
5527 			if (flags & KILL_FED_REQUEUE)
5528 				job_ptr->job_state &= (~JOB_REQUEUE);
5529 		} else if (job_ptr->batch_flag && (flags & KILL_JOB_BATCH)) {
5530 			_signal_batch_job(job_ptr, signal, flags);
5531 		} else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) {
5532 			if ((signal == SIGSTOP) || (signal == SIGCONT))
5533 				job_ptr->job_state &= ~JOB_SIGNALING;
5534 			return ESLURM_JOB_SCRIPT_MISSING;
5535 		} else {
5536 			_signal_job(job_ptr, signal, flags);
5537 		}
5538 		verbose("%s: %u of running %pJ successful 0x%x",
5539 			__func__, signal, job_ptr, job_ptr->job_state);
5540 		return SLURM_SUCCESS;
5541 	}
5542 
5543 	verbose("%s: %pJ can't be sent signal %u from state=%s",
5544 		__func__, job_ptr, signal,
5545 		job_state_string(job_ptr->job_state));
5546 
5547 	trace_job(job_ptr, __func__, "return");
5548 
5549 	return ESLURM_TRANSITION_STATE_NO_UPDATE;
5550 }
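
/*
 * Usage sketch (assumption): cancelling a running job after the RPC layer
 * has already authenticated and authorized the requester:
 *
 *	rc = job_signal(job_ptr, SIGKILL, 0, uid, false);
 *
 * Passing KILL_JOB_BATCH instead of 0 would, per the logic above, only
 * signal the batch script via _signal_batch_job().
 */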
5551 
5552 /*
5553  * job_signal_id - signal the specified job
5554  * IN job_id - id of the job to be signaled
5555  * IN signal - signal to send, SIGKILL == cancel the job
5556  * IN flags  - see KILL_JOB_* flags in slurm.h
5557  * IN uid - uid of requesting user
5558  * IN preempt - true if job being preempted
5559  * RET 0 on success, otherwise ESLURM error code
5560  */
5561 extern int job_signal_id(uint32_t job_id, uint16_t signal, uint16_t flags,
5562 			 uid_t uid, bool preempt)
5563 {
5564 	job_record_t *job_ptr;
5565 
5566 	job_ptr = find_job_record(job_id);
5567 	if (job_ptr == NULL) {
5568 		info("%s: invalid JobId=%u", __func__, job_id);
5569 		return ESLURM_INVALID_JOB_ID;
5570 	}
5571 
5572 	if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5573 	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5574 					  job_ptr->account)) {
5575 		error("Security violation, JOB_CANCEL RPC for %pJ from uid %u",
5576 		      job_ptr, uid);
5577 		return ESLURM_ACCESS_DENIED;
5578 	}
5579 
5580 	return job_signal(job_ptr, signal, flags, uid, preempt);
5581 }
5582 
5583 /* Signal all components of a hetjob */
5584 extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal,
5585 			  uint16_t flags, uid_t uid, bool preempt)
5586 {
5587 	ListIterator iter;
5588 	int rc = SLURM_SUCCESS, rc1;
5589 	job_record_t *het_job;
5590 
5591 	iter = list_iterator_create(het_job_leader->het_job_list);
5592 	while ((het_job = list_next(iter))) {
5593 		if (het_job_leader->het_job_id != het_job->het_job_id) {
5594 			error("%s: Bad het_job_list for %pJ",
5595 			      __func__, het_job_leader);
5596 			continue;
5597 		}
5598 		rc1 = job_signal(het_job, signal, flags, uid, preempt);
5599 		if (rc1 != SLURM_SUCCESS)
5600 			rc = rc1;
5601 	}
5602 	list_iterator_destroy(iter);
5603 
5604 	return rc;
5605 }
5606 
5607 static bool _get_whole_hetjob(void)
5608 {
5609 	static time_t sched_update = 0;
5610 	static bool whole_hetjob = false;
5611 	char *sched_params = NULL;
5612 
5613 	if (sched_update != slurmctld_conf.last_update) {
5614 		sched_update = slurmctld_conf.last_update;
5615 		sched_params = slurm_get_sched_params();
5616 		if (xstrcasestr(sched_params, "whole_hetjob") ||
5617 		    xstrcasestr(sched_params, "whole_pack"))
5618 			whole_hetjob = true;
5619 		else
5620 			whole_hetjob = false;
5621 		xfree(sched_params);
5622 	}
5623 
5624 	return whole_hetjob;
5625 }
5626 
5627 /*
5628  * job_str_signal - signal the specified job
5629  * IN job_id_str - id of the job to be signaled, valid formats include "#"
5630  *	"#_#" and "#_[expr]"
5631  * IN signal - signal to send, SIGKILL == cancel the job
5632  * IN flags  - see KILL_JOB_* flags in slurm.h
5633  * IN uid - uid of requesting user
5634  * IN preempt - true if job being preempted
5635  * RET 0 on success, otherwise ESLURM error code
5636  */
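/*
 * Examples (sketch) of the accepted forms: "123" (a whole job or a full
 * job array), "123_7" (a single array task), "123_[1-3,9]" (a set of
 * array tasks, parsed with _parse_array_tok() below) and "123+2" (one
 * component of a hetjob).
 */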
5637 extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags,
5638 			  uid_t uid, bool preempt)
5639 {
5640 	job_record_t *job_ptr;
5641 	uint32_t job_id;
5642 	time_t now = time(NULL);
5643 	char *end_ptr = NULL, *tok, *tmp;
5644 	long int long_id;
5645 	bitstr_t *array_bitmap = NULL;
5646 	bool valid = true;
5647 	int32_t i, i_first, i_last;
5648 	int rc = SLURM_SUCCESS, rc2, len;
5649 
5650 	if (max_array_size == NO_VAL) {
5651 		max_array_size = slurmctld_conf.max_array_sz;
5652 	}
5653 
5654 	long_id = strtol(job_id_str, &end_ptr, 10);
5655 	if ((long_id <= 0) || (long_id == LONG_MAX) ||
5656 	    ((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
5657 	     (end_ptr[0] != '+'))) {
5658 		info("%s(1): invalid JobId=%s", __func__, job_id_str);
5659 		return ESLURM_INVALID_JOB_ID;
5660 	}
5661 	if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
5662 		end_ptr += 2;	/* Defaults to full job array */
5663 
5664 	if (end_ptr[0] == '+') {	/* Signal hetjob element */
5665 		job_id = (uint32_t) long_id;
5666 		long_id = strtol(end_ptr + 1, &end_ptr, 10);
5667 		if ((long_id < 0) || (long_id == LONG_MAX) ||
5668 		    (end_ptr[0] != '\0')) {
5669 			info("%s(2): invalid JobId=%s", __func__, job_id_str);
5670 			return ESLURM_INVALID_JOB_ID;
5671 		}
5672 		job_ptr = find_het_job_record(job_id, (uint32_t) long_id);
5673 		if (!job_ptr)
5674 			return ESLURM_ALREADY_DONE;
5675 		if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5676 		    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5677 						  job_ptr->account)) {
5678 			error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
5679 			      job_ptr, uid);
5680 			return ESLURM_ACCESS_DENIED;
5681 		}
5682 		if (IS_JOB_PENDING(job_ptr))
5683 			return ESLURM_NOT_WHOLE_HET_JOB;
5684 		return job_signal(job_ptr, signal, flags, uid, preempt);
5685 	}
5686 
5687 	last_job_update = now;
5688 	job_id = (uint32_t) long_id;
5689 	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
5690 		int jobs_done = 0, jobs_signaled = 0;
5691 		job_record_t *job_ptr_done = NULL;
5692 		job_ptr = find_job_record(job_id);
5693 		if (job_ptr && (job_ptr->user_id != uid) &&
5694 		    !validate_operator(uid) &&
5695 		    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5696 						  job_ptr->account)) {
5697 			error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
5698 			      job_ptr, uid);
5699 			return ESLURM_ACCESS_DENIED;
5700 		}
5701 		if (job_ptr && job_ptr->het_job_list) {   /* Hetjob leader */
5702 			return het_job_signal(job_ptr, signal, flags, uid,
5703 					      preempt);
5704 		}
5705 		if (job_ptr && job_ptr->het_job_id && _get_whole_hetjob()) {
5706 			job_record_t *het_job_leader;
5707 			het_job_leader = find_job_record(job_ptr->het_job_id);
5708 			if (het_job_leader && het_job_leader->het_job_list) {
5709 				return het_job_signal(het_job_leader, signal,
5710 						      flags, uid, preempt);
5711 			}
5712 			error("%s: Hetjob leader %pJ not found",
5713 			      __func__, job_ptr);
5714 		}
5715 		if (job_ptr && job_ptr->het_job_id && IS_JOB_PENDING(job_ptr))
5716 			return ESLURM_NOT_WHOLE_HET_JOB;/* Hetjob child */
5717 		if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
5718 		    (job_ptr->array_recs == NULL)) {
5719 			/* This is a regular job, not a job array */
5720 			return job_signal_id(job_id, signal, flags, uid, preempt);
5721 		}
5722 
5723 		/*
5724 		 * This will kill the meta record that holds all
5725 		 * pending jobs.  We want to kill this first so we
5726 		 * don't start jobs just to kill them as we are
5727 		 * killing other elements of the array.
5728 		 */
5729 		if (job_ptr && job_ptr->array_recs) {
5730 			/* This is a job array */
5731 			job_ptr_done = job_ptr;
5732 			rc = job_signal(job_ptr, signal, flags, uid, preempt);
5733 			if (rc == ESLURM_ACCESS_DENIED)
5734 				return rc;
5735 			jobs_signaled++;
5736 			if (rc == ESLURM_ALREADY_DONE) {
5737 				jobs_done++;
5738 				rc = SLURM_SUCCESS;
5739 			}
5740 		}
5741 
5742 		/* Signal all tasks of this job array */
5743 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
5744 		if (!job_ptr && !job_ptr_done) {
5745 			info("%s(3): invalid JobId=%u", __func__, job_id);
5746 			return ESLURM_INVALID_JOB_ID;
5747 		}
5748 		while (job_ptr) {
5749 			if (job_ptr->array_job_id == job_id)
5750 				break;
5751 			job_ptr = job_ptr->job_array_next_j;
5752 		}
5753 		while (job_ptr) {
5754 			if ((job_ptr->array_job_id == job_id) &&
5755 			    (job_ptr != job_ptr_done)) {
5756 				rc2 = job_signal(job_ptr, signal, flags, uid,
5757 						 preempt);
5758 				jobs_signaled++;
5759 				if (rc2 == ESLURM_ALREADY_DONE) {
5760 					jobs_done++;
5761 				} else {
5762 					rc = MAX(rc, rc2);
5763 				}
5764 			}
5765 			job_ptr = job_ptr->job_array_next_j;
5766 		}
5767 		if ((rc == SLURM_SUCCESS) && (jobs_done == jobs_signaled))
5768 			return ESLURM_ALREADY_DONE;
5769 		return rc;
5770 
5771 	}
5772 
5773 	array_bitmap = bit_alloc(max_array_size);
5774 	tmp = xstrdup(end_ptr + 1);
5775 	tok = strtok_r(tmp, ",", &end_ptr);
5776 	while (tok && valid) {
5777 		valid = _parse_array_tok(tok, array_bitmap,
5778 					 max_array_size);
5779 		tok = strtok_r(NULL, ",", &end_ptr);
5780 	}
5781 	xfree(tmp);
5782 	if (valid) {
5783 		i_last = bit_fls(array_bitmap);
5784 		if (i_last < 0)
5785 			valid = false;
5786 	}
5787 	if (!valid) {
5788 		info("%s(4): invalid JobId=%s", __func__, job_id_str);
5789 		rc = ESLURM_INVALID_JOB_ID;
5790 		goto endit;
5791 	}
5792 
5793 	/* Find some job record and validate the user signaling the job */
5794 	job_ptr = find_job_record(job_id);
5795 	if (job_ptr == NULL) {
5796 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
5797 		while (job_ptr) {
5798 			if (job_ptr->array_job_id == job_id)
5799 				break;
5800 			job_ptr = job_ptr->job_array_next_j;
5801 		}
5802 	}
5803 	if ((job_ptr == NULL) ||
5804 	    ((job_ptr->array_task_id == NO_VAL) &&
5805 	     (job_ptr->array_recs == NULL))) {
5806 		info("%s(5): invalid JobId=%s", __func__, job_id_str);
5807 		rc = ESLURM_INVALID_JOB_ID;
5808 		goto endit;
5809 	}
5810 
5811 	if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5812 	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5813 					  job_ptr->account)) {
5814 		error("%s: Security violation JOB_CANCEL RPC for %pJ from uid %u",
5815 		      __func__, job_ptr, uid);
5816 		rc = ESLURM_ACCESS_DENIED;
5817 		goto endit;
5818 	}
5819 
5820 	if (IS_JOB_PENDING(job_ptr) &&
5821 	    job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
5822 		/* Ensure bitmap sizes match for AND operations */
5823 		len = bit_size(job_ptr->array_recs->task_id_bitmap);
5824 		i_last++;
5825 		if (i_last < len) {
5826 			array_bitmap = bit_realloc(array_bitmap, len);
5827 		} else {
5828 			array_bitmap = bit_realloc(array_bitmap, i_last);
5829 			job_ptr->array_recs->task_id_bitmap = bit_realloc(
5830 				job_ptr->array_recs->task_id_bitmap, i_last);
5831 		}
5832 		if (signal == SIGKILL) {
5833 			uint32_t orig_task_cnt, new_task_count;
5834 			/* task_id_bitmap changes, so we need a copy of it */
5835 			bitstr_t *task_id_bitmap_orig =
5836 				bit_copy(job_ptr->array_recs->task_id_bitmap);
5837 
5838 			bit_and_not(job_ptr->array_recs->task_id_bitmap,
5839 				array_bitmap);
5840 			xfree(job_ptr->array_recs->task_id_str);
5841 			orig_task_cnt = job_ptr->array_recs->task_cnt;
5842 			new_task_count = bit_set_count(job_ptr->array_recs->
5843 						       task_id_bitmap);
5844 			if (!new_task_count) {
5845 				last_job_update		= now;
5846 				job_ptr->job_state	= JOB_CANCELLED;
5847 				job_ptr->start_time	= now;
5848 				job_ptr->end_time	= now;
5849 				job_ptr->requid		= uid;
5850 				srun_allocate_abort(job_ptr);
5851 				job_completion_logger(job_ptr, false);
5852 				/*
5853 				 * Master job record, even without tasks,
5854 				 * counts as one job record
5855 				 */
5856 				job_count -= (orig_task_cnt - 1);
5857 			} else {
5858 				_job_array_comp(job_ptr, false, false);
5859 				job_count -= (orig_task_cnt - new_task_count);
5860 				/*
5861 				 * Since we are altering the job array's
5862 				 * task_cnt, we must also adjust this count in
5863 				 * the acct_policy code as if those tasks were
5864 				 * finishing (accrue_cnt/job_submit, etc.).
5865 				 */
5866 				if (job_ptr->array_recs->task_cnt >
5867 				    new_task_count) {
5868 					uint32_t tmp_state = job_ptr->job_state;
5869 					job_ptr->job_state = JOB_CANCELLED;
5870 
5871 					job_ptr->array_recs->task_cnt -=
5872 						new_task_count;
5873 					acct_policy_remove_job_submit(job_ptr);
5874 					job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
5875 					job_ptr->job_state = tmp_state;
5876 				}
5877 			}
5878 
5879 			/*
5880 			 * Set the task_cnt here since
5881 			 * job_completion_logger needs the total
5882 			 * pending count to handle the acct_policy
5883 			 * limit for submitted jobs correctly.
5884 			 */
5885 			job_ptr->array_recs->task_cnt = new_task_count;
5886 			bit_and_not(array_bitmap, task_id_bitmap_orig);
5887 			FREE_NULL_BITMAP(task_id_bitmap_orig);
5888 		} else {
5889 			bit_and_not(array_bitmap,
5890 				    job_ptr->array_recs->task_id_bitmap);
5891 			rc = ESLURM_TRANSITION_STATE_NO_UPDATE;
5892 		}
5893 	}
5894 
5895 	i_first = bit_ffs(array_bitmap);
5896 	if (i_first >= 0)
5897 		i_last = bit_fls(array_bitmap);
5898 	else
5899 		i_last = -2;
5900 	for (i = i_first; i <= i_last; i++) {
5901 		if (!bit_test(array_bitmap, i))
5902 			continue;
5903 		job_ptr = find_job_array_rec(job_id, i);
5904 		if (job_ptr == NULL) {
5905 			info("%s(6): invalid JobId=%u_%d",
5906 			      __func__, job_id, i);
5907 			rc = ESLURM_INVALID_JOB_ID;
5908 			continue;
5909 		}
5910 
5911 		rc2 = job_signal(job_ptr, signal, flags, uid, preempt);
5912 		rc = MAX(rc, rc2);
5913 	}
5914 endit:
5915 	FREE_NULL_BITMAP(array_bitmap);
5916 
5917 	return rc;
5918 }
5919 
5920 static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
5921 			      uint16_t flags)
5922 {
5923 	bitoff_t i;
5924 	signal_tasks_msg_t *signal_tasks_msg = NULL;
5925 	agent_arg_t *agent_args = NULL;
5926 
5927 	xassert(job_ptr);
5928 	xassert(job_ptr->batch_host);
5929 	i = bit_ffs(job_ptr->node_bitmap);
5930 	if (i < 0) {
5931 		error("%s: %pJ lacks assigned nodes", __func__, job_ptr);
5932 		return;
5933 	}
5934 
5935 	agent_args = xmalloc(sizeof(agent_arg_t));
5936 	agent_args->msg_type	= REQUEST_SIGNAL_TASKS;
5937 	agent_args->retry	= 1;
5938 	agent_args->node_count  = 1;
5939 #ifdef HAVE_FRONT_END
5940 	if (job_ptr->front_end_ptr)
5941 		agent_args->protocol_version =
5942 			job_ptr->front_end_ptr->protocol_version;
5943 #else
5944 	node_record_t *node_ptr;
5945 	if ((node_ptr = find_node_record(job_ptr->batch_host)))
5946 		agent_args->protocol_version = node_ptr->protocol_version;
5947 #endif
5948 	agent_args->hostlist	= hostlist_create(job_ptr->batch_host);
5949 	signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t));
5950 	signal_tasks_msg->job_id      = job_ptr->job_id;
5951 	signal_tasks_msg->job_step_id = NO_VAL;
5952 
5953 	signal_tasks_msg->flags = flags;
5954 	signal_tasks_msg->signal = signal;
5955 
5956 	agent_args->msg_args = signal_tasks_msg;
5957 	agent_queue_request(agent_args);
5958 	return;
5959 }
5960 
5961 /*
5962  * prolog_complete - note the normal termination of the prolog
5963  * IN job_id - id of the job which completed
5964  * IN prolog_return_code - prolog's return code,
5965  *    if set then set job state to FAILED
5966  * RET - 0 on success, otherwise ESLURM error code
5967  * global: job_list - pointer to the global job list
5968  *	last_job_update - time of last job table update
5969  */
5970 extern int prolog_complete(uint32_t job_id,
5971 			   uint32_t prolog_return_code)
5972 {
5973 	job_record_t *job_ptr;
5974 
5975 	job_ptr = find_job_record(job_id);
5976 	if (job_ptr == NULL) {
5977 		info("prolog_complete: invalid JobId=%u", job_id);
5978 		return ESLURM_INVALID_JOB_ID;
5979 	}
5980 
5981 	if (IS_JOB_COMPLETING(job_ptr))
5982 		return SLURM_SUCCESS;
5983 
5984 	if (prolog_return_code)
5985 		error("Prolog launch failure, %pJ", job_ptr);
5986 
5987 	job_ptr->state_reason = WAIT_NO_REASON;
5988 
5989 	return SLURM_SUCCESS;
5990 }
5991 
5992 static int _job_complete(job_record_t *job_ptr, uid_t uid, bool requeue,
5993 			 bool node_fail, uint32_t job_return_code)
5994 {
5995 	node_record_t *node_ptr;
5996 	time_t now = time(NULL);
5997 	uint32_t job_comp_flag = 0;
5998 	bool suspended = false;
5999 	int i;
6000 	int use_cloud = false;
6001 	uint16_t over_time_limit;
6002 
6003 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
6004 	xassert(verify_lock(FED_LOCK, READ_LOCK));
6005 
6006 	if (IS_JOB_FINISHED(job_ptr)) {
6007 		if (job_ptr->exit_code == 0)
6008 			job_ptr->exit_code = job_return_code;
6009 		return ESLURM_ALREADY_DONE;
6010 	}
6011 
6012 	if (IS_JOB_COMPLETING(job_ptr))
6013 		return SLURM_SUCCESS;	/* avoid replay */
6014 
6015 	if ((job_return_code & 0xff) == SIG_OOM) {
6016 		info("%s: %pJ OOM failure",  __func__, job_ptr);
6017 	} else if (WIFSIGNALED(job_return_code)) {
6018 		info("%s: %pJ WTERMSIG %d",
6019 		     __func__, job_ptr, WTERMSIG(job_return_code));
6020 	} else if (WIFEXITED(job_return_code)) {
6021 		info("%s: %pJ WEXITSTATUS %d",
6022 		     __func__, job_ptr, WEXITSTATUS(job_return_code));
6023 	}
6024 
6025 	if (IS_JOB_RUNNING(job_ptr))
6026 		job_comp_flag = JOB_COMPLETING;
6027 	else if (IS_JOB_PENDING(job_ptr)) {
6028 		job_return_code = NO_VAL;
6029 		job_ptr->start_time = now;
6030 		fed_mgr_job_revoke_sibs(job_ptr);
6031 	}
6032 
6033 	if ((job_return_code == NO_VAL) &&
6034 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) {
6035 		if (node_fail) {
6036 			info("%s: %pJ cancelled by node failure",
6037 			     __func__, job_ptr);
6038 		} else {
6039 			info("%s: %pJ cancelled by interactive user",
6040 			     __func__, job_ptr);
6041 		}
6042 	}
6043 
6044 	if (IS_JOB_SUSPENDED(job_ptr)) {
6045 		uint32_t suspend_job_state = job_ptr->job_state;
6046 		/*
6047 		 * we can't have it as suspended when we call the
6048 		 * accounting stuff.
6049 		 */
6050 		job_ptr->job_state = JOB_CANCELLED;
6051 		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
6052 		job_ptr->job_state = suspend_job_state;
6053 		job_comp_flag = JOB_COMPLETING;
6054 		suspended = true;
6055 	}
6056 
6057 	if (job_comp_flag && (job_ptr->node_cnt == 0)) {
6058 		/*
6059 		 * Job has no resources left (used to expand another job).
6060 		 * Avoid duplicate run of epilog and underflow in CPU count.
6061 		 */
6062 		job_comp_flag = 0;
6063 	}
6064 
6065 	if (requeue && job_ptr->details && job_ptr->batch_flag) {
6066 		/*
6067 		 * We want this job to look like it was terminated in the
6068 		 * accounting logs. Set a new submit time so the restarted
6069 		 * job looks like a new job.
6070 		 */
6071 		job_ptr->end_time = now;
6072 		job_ptr->job_state  = JOB_NODE_FAIL;
6073 		job_completion_logger(job_ptr, true);
6074 		/*
6075 		 * Do this after the epilog complete.
6076 		 * Setting it here is too early.
6077 		 */
6078 		//job_ptr->db_index = 0;
6079 		//job_ptr->details->submit_time = now + 1;
6080 		if (job_ptr->node_bitmap) {
6081 			i = bit_ffs(job_ptr->node_bitmap);
6082 			if (i >= 0) {
6083 				node_ptr = node_record_table_ptr + i;
6084 				if (IS_NODE_CLOUD(node_ptr))
6085 					use_cloud = true;
6086 			}
6087 		}
6088 		if (!use_cloud)
6089 			job_ptr->batch_flag++;	/* only one retry */
6090 		job_ptr->restart_cnt++;
6091 
6092 		/* clear signal sent flag on requeue */
6093 		job_ptr->warn_flags &= ~WARN_SENT;
6094 
6095 		job_ptr->job_state = JOB_PENDING | job_comp_flag;
6096 		/*
6097 		 * Since the job completion logger removes the job submit
6098 		 * information, we need to add it again.
6099 		 */
6100 		acct_policy_add_job_submit(job_ptr);
6101 		if (node_fail) {
6102 			info("%s: requeue %pJ due to node failure",
6103 			     __func__, job_ptr);
6104 		} else {
6105 			info("%s: requeue %pJ per user/system request",
6106 			     __func__, job_ptr);
6107 		}
6108 		/*
6109 		 * If we have reached the maximum number of requeue
6110 		 * attempts, hold the job with the HoldMaxRequeue reason.
6111 		 */
6112 		if (job_ptr->batch_flag > MAX_BATCH_REQUEUE) {
6113 			job_ptr->job_state |= JOB_REQUEUE_HOLD;
6114 			job_ptr->state_reason = WAIT_MAX_REQUEUE;
6115 			job_ptr->batch_flag = 1;
6116 			debug("%s: Holding %pJ, repeated requeue failures",
6117 			      __func__, job_ptr);
6118 			job_ptr->priority = 0;
6119 		}
6120 	} else if (IS_JOB_PENDING(job_ptr) && job_ptr->details &&
6121 		   job_ptr->batch_flag) {
6122 		/*
6123 		 * Possible failure mode with DOWN node and job requeue.
6124 		 * The DOWN node might actually respond to the cancel and
6125 		 * take us here.  Don't run job_completion_logger here, since
6126 		 * this path exists only to catch duplicate cancels from
6127 		 * slowly responding slurmds.
6128 		 */
6129 		return SLURM_SUCCESS;
6130 	} else {
6131 		if (job_ptr->part_ptr &&
6132 		    (job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
6133 			over_time_limit = job_ptr->part_ptr->over_time_limit;
6134 		} else {
6135 			over_time_limit = slurmctld_conf.over_time_limit;
6136 		}
6137 
6138 		if (node_fail) {
6139 			job_ptr->job_state = JOB_NODE_FAIL | job_comp_flag;
6140 			job_ptr->requid = uid;
6141 		} else if (job_return_code == NO_VAL) {
6142 			job_ptr->job_state = JOB_CANCELLED | job_comp_flag;
6143 			job_ptr->requid = uid;
6144 		} else if ((job_return_code & 0xff) == SIG_OOM) {
6145 			job_ptr->job_state = JOB_OOM | job_comp_flag;
6146 			job_ptr->exit_code = job_return_code;
6147 			job_ptr->state_reason = FAIL_OOM;
6148 			xfree(job_ptr->state_desc);
6149 		} else if (WIFEXITED(job_return_code) &&
6150 			   WEXITSTATUS(job_return_code)) {
6151 			job_ptr->job_state = JOB_FAILED   | job_comp_flag;
6152 			job_ptr->exit_code = job_return_code;
6153 			job_ptr->state_reason = FAIL_EXIT_CODE;
6154 			xfree(job_ptr->state_desc);
6155 		} else if (WIFSIGNALED(job_return_code)) {
6156 			job_ptr->job_state = JOB_FAILED | job_comp_flag;
6157 			job_ptr->exit_code = job_return_code;
6158 			job_ptr->state_reason = FAIL_LAUNCH;
6159 		} else if (job_comp_flag
6160 			   && ((job_ptr->end_time
6161 				+ over_time_limit * 60) < now)) {
6162 			/*
6163 			 * The job ran past its time limit plus the allowed
6164 			 * OverTimeLimit, so record it as a timeout.
6165 			 */
6166 			job_ptr->job_state = JOB_TIMEOUT  | job_comp_flag;
6167 			job_ptr->state_reason = FAIL_TIMEOUT;
6168 			xfree(job_ptr->state_desc);
6169 		} else {
6170 			job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
6171 			job_ptr->exit_code = job_return_code;
6172 			if (nonstop_ops.job_fini)
6173 				(nonstop_ops.job_fini)(job_ptr);
6174 		}
6175 
6176 		if (suspended) {
6177 			job_ptr->end_time = job_ptr->suspend_time;
6178 			job_ptr->tot_sus_time +=
6179 				difftime(now, job_ptr->suspend_time);
6180 		} else
6181 			job_ptr->end_time = now;
6182 		job_completion_logger(job_ptr, false);
6183 	}
6184 
6185 	last_job_update = now;
6186 	job_ptr->time_last_active = now;   /* Timer for resending kill RPC */
6187 	if (job_comp_flag) {	/* job was running */
6188 		build_cg_bitmap(job_ptr);
6189 		deallocate_nodes(job_ptr, false, suspended, false);
6190 	}
6191 
6192 	/* Check for and cleanup stuck scripts */
6193 	if (job_ptr->details && job_ptr->details->prolog_running)
6194 		track_script_flush_job(job_ptr->job_id);
6195 
6196 	info("%s: %pJ done", __func__, job_ptr);
6197 	return SLURM_SUCCESS;
6198 }
6199 
6200 
6201 /*
6202  * job_complete - note the normal termination of the specified job
6203  * IN job_id - id of the job which completed
6204  * IN uid - user id of user issuing the RPC
6205  * IN requeue - job should be run again if possible
6206  * IN node_fail - true if job terminated due to node failure
6207  * IN job_return_code - job's return code, if set then set state to FAILED
6208  * RET - 0 on success, otherwise ESLURM error code
6209  * global: job_list - pointer to global job list
6210  *	last_job_update - time of last job table update
6211  */
6212 extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
6213 			bool node_fail, uint32_t job_return_code)
6214 {
6215 	job_record_t *job_ptr, *het_job_ptr;
6216 	ListIterator iter;
6217 	int rc, rc1;
6218 
6219 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
6220 	xassert(verify_lock(FED_LOCK, READ_LOCK));
6221 
6222 	job_ptr = find_job_record(job_id);
6223 	if (job_ptr == NULL) {
6224 		info("%s: invalid JobId=%u", __func__, job_id);
6225 		return ESLURM_INVALID_JOB_ID;
6226 	}
6227 
6228 	if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) {
6229 		error("%s: Security violation, JOB_COMPLETE RPC for %pJ from uid %u",
6230 		      __func__, job_ptr, uid);
6231 		return ESLURM_USER_ID_MISSING;
6232 	}
6233 
6234 	if (job_ptr->het_job_list) {
6235 		rc = SLURM_SUCCESS;
6236 		iter = list_iterator_create(job_ptr->het_job_list);
6237 		while ((het_job_ptr = list_next(iter))) {
6238 			if (job_ptr->het_job_id != het_job_ptr->het_job_id) {
6239 				error("%s: Bad het_job_list for %pJ",
6240 				      __func__, job_ptr);
6241 				continue;
6242 			}
6243 			rc1 = _job_complete(het_job_ptr, uid, requeue,
6244 					    node_fail, job_return_code);
6245 			if (rc1 != SLURM_SUCCESS)
6246 				rc = rc1;
6247 		}
6248 		list_iterator_destroy(iter);
6249 	} else {
6250 		rc = _job_complete(job_ptr, uid, requeue, node_fail,
6251 				   job_return_code);
6252 	}
6253 
6254 	return rc;
6255 }
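
/*
 * Illustrative sketch (comment only, not compiled): a typical call from a
 * job-completion RPC handler.  The caller must hold the job write lock and
 * the federation read lock, as asserted above; lock and argument names are
 * placeholders.
 *
 *	lock_slurmctld(job_write_lock);
 *	rc = job_complete(msg_job_id, rpc_uid, false, false, exit_status);
 *	unlock_slurmctld(job_write_lock);
 */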
6256 
6257 static int _alt_part_test(part_record_t *part_ptr, part_record_t **part_ptr_new)
6258 {
6259 	part_record_t *alt_part_ptr = NULL;
6260 	char *alt_name;
6261 
6262 	*part_ptr_new = NULL;
6263 	if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) {
6264 		info("_alt_part_test: original partition is not available "
6265 		     "(drain or inactive): %s", part_ptr->name);
6266 		alt_name = part_ptr->alternate;
6267 		while (alt_name) {
6268 			alt_part_ptr = find_part_record(alt_name);
6269 			if (alt_part_ptr == NULL) {
6270 				info("_alt_part_test: invalid alternate "
6271 				     "partition name specified: %s", alt_name);
6272 				return ESLURM_INVALID_PARTITION_NAME;
6273 			}
6274 			if (alt_part_ptr == part_ptr) {
6275 				info("_alt_part_test: no valid alternate "
6276 				     "partition is available");
6277 				return ESLURM_PARTITION_NOT_AVAIL;
6278 			}
6279 			if (alt_part_ptr->state_up & PARTITION_SUBMIT)
6280 				break;
6281 			/* Try next alternate in the sequence */
6282 			alt_name = alt_part_ptr->alternate;
6283 		}
6284 		if (alt_name == NULL) {
6285 			info("_alt_part_test: no valid alternate partition is "
6286 			     "available");
6287 			return ESLURM_PARTITION_NOT_AVAIL;
6288 		}
6289 		*part_ptr_new = alt_part_ptr;
6290 	}
6291 	return SLURM_SUCCESS;
6292 }
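
/*
 * Worked example (configuration values are illustrative only): with
 *
 *	PartitionName=debug State=DRAIN    Alternate=short
 *	PartitionName=short State=INACTIVE Alternate=long
 *	PartitionName=long  State=UP
 *
 * a submission aimed at "debug" cannot be accepted there (no PARTITION_SUBMIT
 * bit), so _alt_part_test() walks the alternate chain debug -> short -> long
 * and returns "long", the first partition in the chain that still accepts
 * submissions.
 */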
6293 
6294 /*
6295  * Test if this job can use this partition
6296  *
6297  * NOTE: This function is also called with a dummy job_desc_msg_t from
6298  * job_limits_check(). If any new check is added here, you may also have to
6299  * set that field in the job_desc_msg_t built in that function.
6300  */
6301 static int _part_access_check(part_record_t *part_ptr, job_desc_msg_t *job_desc,
6302 			      bitstr_t *req_bitmap, uid_t submit_uid,
6303 			      slurmdb_qos_rec_t *qos_ptr, char *acct)
6304 {
6305 	uint32_t total_nodes, min_nodes_tmp, max_nodes_tmp;
6306 	uint32_t job_min_nodes, job_max_nodes;
6307 	int rc = SLURM_SUCCESS;
6308 
6309 	if ((part_ptr->flags & PART_FLAG_REQ_RESV) &&
6310 	    (!job_desc->reservation || job_desc->reservation[0] == '\0')) {
6311 		debug2("%s: uid %u access to partition %s "
6312 		     "denied, requires reservation", __func__,
6313 		     (unsigned int) submit_uid, part_ptr->name);
6314 		return ESLURM_ACCESS_DENIED;
6315 	}
6316 
6317 	if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) &&
6318 	    (submit_uid != slurmctld_conf.slurm_user_id)) {
6319 		debug2("%s: uid %u access to partition %s "
6320 		     "denied, not root", __func__,
6321 		     (unsigned int) submit_uid, part_ptr->name);
6322 		return ESLURM_ACCESS_DENIED;
6323 	}
6324 
6325 	if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) {
6326 		error("%s: Security violation, SUBMIT_JOB for "
6327 		      "user root disabled", __func__);
6328 		return ESLURM_USER_ID_MISSING;
6329 	}
6330 
6331 	if (validate_group(part_ptr, job_desc->user_id) == 0) {
6332 		debug2("%s: uid %u access to partition %s "
6333 		     "denied, bad group", __func__,
6334 		     (unsigned int) job_desc->user_id, part_ptr->name);
6335 		return ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP;
6336 	}
6337 
6338 	if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) {
6339 		debug2("%s: uid %u access to partition %s "
6340 		     "denied, bad allocating node: %s", __func__,
6341 		     (unsigned int) job_desc->user_id, part_ptr->name,
6342 		     job_desc->alloc_node);
6343 		return ESLURM_ACCESS_DENIED;
6344 	}
6345 
6346 	if ((part_ptr->state_up & PARTITION_SCHED) &&
6347 	    (job_desc->min_cpus != NO_VAL)) {
6348 		if (job_desc->min_cpus > part_ptr->total_cpus) {
6349 			debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
6350 			     __func__, job_desc->min_cpus, part_ptr->name,
6351 			     part_ptr->total_cpus);
6352 			return ESLURM_TOO_MANY_REQUESTED_CPUS;
6353 		} else if (job_desc->min_cpus >
6354 			   (part_ptr->max_cpus_per_node *
6355 			    part_ptr->total_nodes)) {
6356 			debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
6357 			     __func__, job_desc->min_cpus, part_ptr->name,
6358 			     (part_ptr->max_cpus_per_node *
6359 			     part_ptr->total_nodes));
6360 			return ESLURM_TOO_MANY_REQUESTED_CPUS;
6361 		}
6362 	}
6363 
6364 	/* Check against total nodes on the partition */
6365 	total_nodes = part_ptr->total_nodes;
6366 	if ((part_ptr->state_up & PARTITION_SCHED) &&
6367 	    (job_desc->min_nodes != NO_VAL) &&
6368 	    (job_desc->min_nodes > total_nodes)) {
6369 		debug2("%s: Job requested too many nodes (%u) "
6370 		     "of partition %s(%u)", __func__,
6371 		     job_desc->min_nodes, part_ptr->name, total_nodes);
6372 		return ESLURM_INVALID_NODE_COUNT;
6373 	}
6374 
6375 	if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) {
6376 		debug2("%s: requested nodes %s not in partition %s", __func__,
6377 		     job_desc->req_nodes, part_ptr->name);
6378 		return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
6379 	}
6380 
6381 	/* The node counts have not been altered yet, so do not figure them out
6382 	 * by using the cpu counts.  The partitions have already been altered
6383 	 * so we have to use the original values.
6384 	 */
6385 	job_min_nodes = job_desc->min_nodes;
6386 	job_max_nodes = job_desc->max_nodes;
6387 	min_nodes_tmp = part_ptr->min_nodes;
6388 	max_nodes_tmp = part_ptr->max_nodes;
6389 
6390 	/* Check against min/max node limits in the partition */
6391 
6392 	if ((part_ptr->state_up & PARTITION_SCHED) &&
6393 	    (job_min_nodes != NO_VAL) &&
6394 	    (job_min_nodes < min_nodes_tmp) &&
6395 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags
6396 				       & QOS_FLAG_PART_MIN_NODE)))) {
6397 		debug2("%s: Job requested for nodes (%u) "
6398 		     "smaller than partition %s(%u) min nodes", __func__,
6399 		     job_min_nodes, part_ptr->name, min_nodes_tmp);
6400 		return  ESLURM_INVALID_NODE_COUNT;
6401 	}
6402 
6403 	if ((part_ptr->state_up & PARTITION_SCHED) &&
6404 	    (job_max_nodes != NO_VAL) &&
6405 	    (job_max_nodes > max_nodes_tmp) &&
6406 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags
6407 				       & QOS_FLAG_PART_MAX_NODE)))) {
6408 		debug2("%s: Job requested for nodes (%u) greater than partition"
6409 		     " %s(%u) max nodes", __func__, job_max_nodes,
6410 		     part_ptr->name, max_nodes_tmp);
6411 		return ESLURM_INVALID_NODE_COUNT;
6412 	}
6413 
6414 	if ((part_ptr->state_up & PARTITION_SCHED) &&
6415 	    (job_desc->time_limit != NO_VAL) &&
6416 	    (job_desc->time_limit > part_ptr->max_time) &&
6417 	    (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT))) {
6418 		debug2("%s: Job time limit (%u) exceeds limit of partition "
6419 		     "%s(%u)", __func__, job_desc->time_limit, part_ptr->name,
6420 		     part_ptr->max_time);
6421 		return ESLURM_INVALID_TIME_LIMIT;
6422 	}
6423 
6424 	if (slurmctld_conf.enforce_part_limits) {
6425 		if ((rc = part_policy_valid_acct(part_ptr, acct, NULL))
6426 		    != SLURM_SUCCESS)
6427 			goto fini;
6428 
6429 		if ((rc = part_policy_valid_qos(part_ptr, qos_ptr, NULL))
6430 		    != SLURM_SUCCESS)
6431 			goto fini;
6432 	}
6433 
6434 fini:
6435 	return rc;
6436 }
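
/*
 * Illustrative sketch (comment only, not compiled): building a minimal
 * job_desc_msg_t to probe partition access, mirroring what
 * job_limits_check() does further below.  The field values and "some_uid"
 * are placeholders.
 *
 *	job_desc_msg_t jd;
 *	slurm_init_job_desc_msg(&jd);
 *	jd.user_id   = some_uid;
 *	jd.min_nodes = 2;
 *	rc = _part_access_check(part_ptr, &jd, NULL, some_uid, NULL, NULL);
 *	if (rc != SLURM_SUCCESS)
 *		debug2("access denied: %s", slurm_strerror(rc));
 */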
6437 
6438 static int _get_job_parts(job_desc_msg_t *job_desc, part_record_t **part_pptr,
6439 			  List *part_pptr_list, char **err_msg)
6440 {
6441 	part_record_t *part_ptr = NULL, *part_ptr_new = NULL;
6442 	List part_ptr_list = NULL;
6443 	int rc = SLURM_SUCCESS;
6444 
6445 	/* Identify partition(s) and set pointer(s) to their struct */
6446 	if (job_desc->partition) {
6447 		char *err_part = NULL;
6448 		part_ptr = find_part_record(job_desc->partition);
6449 		if (part_ptr == NULL) {
6450 			part_ptr_list = get_part_list(job_desc->partition,
6451 						      &err_part);
6452 			if (part_ptr_list) {
6453 				part_ptr = list_peek(part_ptr_list);
6454 				if (list_count(part_ptr_list) == 1)
6455 					FREE_NULL_LIST(part_ptr_list);
6456 			}
6457 		}
6458 		if (part_ptr == NULL) {
6459 			info("%s: invalid partition specified: %s",
6460 			     __func__, job_desc->partition);
6461 			if (err_msg) {
6462 				xfree(*err_msg);
6463 				xstrfmtcat(*err_msg,
6464 					"invalid partition specified: %s",
6465 					err_part);
6466 				xfree(err_part);
6467 			}
6468 			return ESLURM_INVALID_PARTITION_NAME;
6469 		}
6470 	} else if (job_desc->reservation && job_desc->reservation[0] != '\0' ) {
6471 		slurmctld_resv_t *resv_ptr = NULL;
6472 		resv_ptr = find_resv_name(job_desc->reservation);
6473 		if (resv_ptr)
6474 			part_ptr = resv_ptr->part_ptr;
6475 		if (part_ptr)
6476 			job_desc->partition = xstrdup(part_ptr->name);
6477 	}
6478 
6479 	if (!part_ptr) {
6480 		if (default_part_loc == NULL) {
6481 			error("%s: default partition not set", __func__);
6482 			return ESLURM_DEFAULT_PARTITION_NOT_SET;
6483 		}
6484 		part_ptr = default_part_loc;
6485 		job_desc->partition = xstrdup(part_ptr->name);
6486 	}
6487 
6488 	/* Change partition pointer(s) to alternates as needed */
6489 	if (part_ptr_list) {
6490 		int fail_rc = SLURM_SUCCESS;
6491 		part_record_t *part_ptr_tmp;
6492 		bool rebuild_name_list = false;
6493 		ListIterator iter = list_iterator_create(part_ptr_list);
6494 
6495 		while ((part_ptr_tmp = list_next(iter))) {
6496 			rc = _alt_part_test(part_ptr_tmp, &part_ptr_new);
6497 			if (rc != SLURM_SUCCESS) {
6498 				fail_rc = rc;
6499 				list_remove(iter);
6500 				rebuild_name_list = true;
6501 				continue;
6502 			}
6503 			if (part_ptr_new) {
6504 				list_insert(iter, part_ptr_new);
6505 				list_remove(iter);
6506 				rebuild_name_list = true;
6507 			}
6508 		}
6509 		list_iterator_destroy(iter);
6510 		if (list_is_empty(part_ptr_list)) {
6511 			if (fail_rc != SLURM_SUCCESS)
6512 				rc = fail_rc;
6513 			else
6514 				rc = ESLURM_PARTITION_NOT_AVAIL;
6515 			goto fini;
6516 		}
6517 		rc = SLURM_SUCCESS;	/* At least some partition usable */
6518 		if (rebuild_name_list) {
6519 			part_ptr = NULL;
6520 			xfree(job_desc->partition);
6521 			iter = list_iterator_create(part_ptr_list);
6522 			while ((part_ptr_tmp = list_next(iter))) {
6523 				if (job_desc->partition)
6524 					xstrcat(job_desc->partition, ",");
6525 				else
6526 					part_ptr = part_ptr_tmp;
6527 				xstrcat(job_desc->partition,
6528 					part_ptr_tmp->name);
6529 			}
6530 			list_iterator_destroy(iter);
6531 			if (!part_ptr) {
6532 				rc = ESLURM_PARTITION_NOT_AVAIL;
6533 				goto fini;
6534 			}
6535 		}
6536 	} else {
6537 		rc = _alt_part_test(part_ptr, &part_ptr_new);
6538 		if (rc != SLURM_SUCCESS)
6539 			goto fini;
6540 		if (part_ptr_new) {
6541 			part_ptr = part_ptr_new;
6542 			xfree(job_desc->partition);
6543 			job_desc->partition = xstrdup(part_ptr->name);
6544 		}
6545 	}
6546 
6547 	*part_pptr = part_ptr;
6548 	if (part_pptr_list) {
6549 		*part_pptr_list = part_ptr_list;
6550 		part_ptr_list = NULL;
6551 	} else
6552 		FREE_NULL_LIST(part_ptr_list);
6553 
6554 fini:
6555 	return rc;
6556 }
6557 
6558 static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
6559 			   bitstr_t *req_bitmap, part_record_t *part_ptr,
6560 			   List part_ptr_list,
6561 			   slurmdb_assoc_rec_t *assoc_ptr,
6562 			   slurmdb_qos_rec_t *qos_ptr)
6563 {
6564 	int rc = SLURM_SUCCESS;
6565 	part_record_t *part_ptr_tmp;
6566 	slurmdb_assoc_rec_t assoc_rec;
6567 	uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1;
6568 	uint32_t max_time = 0;
6569 	bool any_check = false;
6570 
6571 	/* Change partition pointer(s) to alternates as needed */
6572 	if (part_ptr_list) {
6573 		int fail_rc = SLURM_SUCCESS;
6574 		ListIterator iter = list_iterator_create(part_ptr_list);
6575 
6576 		while ((part_ptr_tmp = list_next(iter))) {
6577 			/*
6578 			 * FIXME: When dealing with multiple partitions we
6579 			 * currently can't deal with partition based
6580 			 * associations.
6581 			 */
6582 			memset(&assoc_rec, 0, sizeof(assoc_rec));
6583 			if (assoc_ptr) {
6584 				assoc_rec.acct      = assoc_ptr->acct;
6585 				assoc_rec.partition = part_ptr_tmp->name;
6586 				assoc_rec.uid       = job_desc->user_id;
6587 				(void) assoc_mgr_fill_in_assoc(
6588 					acct_db_conn, &assoc_rec,
6589 					accounting_enforce, NULL, false);
6590 			}
6591 
6592 			if (assoc_ptr && assoc_rec.id != assoc_ptr->id) {
6593 				info("%s: can't check multiple "
6594 				     "partitions with partition based "
6595 				     "associations", __func__);
6596 				rc = SLURM_ERROR;
6597 			} else {
6598 				rc = _part_access_check(part_ptr_tmp, job_desc,
6599 							req_bitmap, submit_uid,
6600 							qos_ptr, assoc_ptr ?
6601 							assoc_ptr->acct : NULL);
6602 			}
6603 			if ((rc != SLURM_SUCCESS) &&
6604 			    ((rc == ESLURM_ACCESS_DENIED) ||
6605 			     (rc == ESLURM_USER_ID_MISSING) ||
6606 			     (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) ||
6607 			     (slurmctld_conf.enforce_part_limits ==
6608 			      PARTITION_ENFORCE_ALL))) {
6609 				fail_rc = rc;
6610 				break;
6611 			} else if (rc != SLURM_SUCCESS) {
6612 				fail_rc = rc;
6613 			} else {
6614 				any_check = true;
6615 			}
6616 
6617 			/* Set to success since we found a usable partition */
6618 			if (any_check && slurmctld_conf.enforce_part_limits ==
6619 			    PARTITION_ENFORCE_ANY)
6620 				fail_rc = SLURM_SUCCESS;
6621 
6622 			min_nodes_orig = MIN(min_nodes_orig,
6623 					     part_ptr_tmp->min_nodes_orig);
6624 			max_nodes_orig = MAX(max_nodes_orig,
6625 					     part_ptr_tmp->max_nodes_orig);
6626 			max_time = MAX(max_time, part_ptr_tmp->max_time);
6627 		}
6628 		list_iterator_destroy(iter);
6629 
6630 		if (list_is_empty(part_ptr_list) ||
6631 		    (slurmctld_conf.enforce_part_limits &&
6632 		     (fail_rc != SLURM_SUCCESS))) {
6633 			if (slurmctld_conf.enforce_part_limits ==
6634 			    PARTITION_ENFORCE_ALL)
6635 				rc = fail_rc;
6636 			else if (slurmctld_conf.enforce_part_limits ==
6637 				 PARTITION_ENFORCE_ANY && !any_check)
6638 				rc = fail_rc;
6639 			else {
6640 				rc = ESLURM_PARTITION_NOT_AVAIL;
6641 			}
6642 			goto fini;
6643 		}
6644 		rc = SLURM_SUCCESS;	/* At least some partition usable */
6645 	} else {
6646 		min_nodes_orig = part_ptr->min_nodes_orig;
6647 		max_nodes_orig = part_ptr->max_nodes_orig;
6648 		max_time = part_ptr->max_time;
6649 		rc = _part_access_check(part_ptr, job_desc, req_bitmap,
6650 					submit_uid, qos_ptr,
6651 					assoc_ptr ? assoc_ptr->acct : NULL);
6652 		if ((rc != SLURM_SUCCESS) &&
6653 		    ((rc == ESLURM_ACCESS_DENIED) ||
6654 		     (rc == ESLURM_USER_ID_MISSING) ||
6655 		     (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) ||
6656 		     slurmctld_conf.enforce_part_limits))
6657 			goto fini;
6658 		/* Enforce Part Limit = no */
6659 		rc = SLURM_SUCCESS;
6660 	}
6661 
6662 	/* Validate job limits against partition limits */
6663 
6664 	/* Check the partition with the highest limits when there are multiple */
6665 	if (job_desc->min_nodes == NO_VAL) {
6666 		/* Avoid setting the job request to 0 nodes unless requested */
6667 		if (!min_nodes_orig)
6668 			job_desc->min_nodes = 1;
6669 		else
6670 			job_desc->min_nodes = min_nodes_orig;
6671 	} else if ((job_desc->min_nodes > max_nodes_orig) &&
6672 		   slurmctld_conf.enforce_part_limits &&
6673 		   (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
6674 					      QOS_FLAG_PART_MAX_NODE)))) {
6675 		info("%s: job's min nodes greater than "
6676 		     "partition's max nodes (%u > %u)",
6677 		     __func__, job_desc->min_nodes, max_nodes_orig);
6678 		rc = ESLURM_INVALID_NODE_COUNT;
6679 		goto fini;
6680 	} else if ((job_desc->min_nodes < min_nodes_orig) &&
6681 		   ((job_desc->max_nodes == NO_VAL) ||
6682 		    (job_desc->max_nodes >= min_nodes_orig))) {
6683 		job_desc->min_nodes = min_nodes_orig;
6684 	}
6685 
6686 	if ((job_desc->max_nodes != NO_VAL) &&
6687 	    slurmctld_conf.enforce_part_limits &&
6688 	    (job_desc->max_nodes < min_nodes_orig) &&
6689 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags
6690 				       & QOS_FLAG_PART_MIN_NODE)))) {
6691 		info("%s: job's max nodes less than partition's "
6692 		     "min nodes (%u < %u)",
6693 		     __func__, job_desc->max_nodes, min_nodes_orig);
6694 		rc = ESLURM_INVALID_NODE_COUNT;
6695 		goto fini;
6696 	}
6697 #ifndef HAVE_FRONT_END
6698 	/* Zero node count OK for persistent burst buffer create or destroy */
6699 	if ((job_desc->min_nodes == 0) &&
6700 	    (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
6701 	     (!job_desc->burst_buffer && !job_desc->script))) {
6702 		info("%s: min_nodes is zero", __func__);
6703 		rc = ESLURM_INVALID_NODE_COUNT;
6704 		goto fini;
6705 	}
6706 #endif
6707 
6708 	if ((job_desc->time_limit   == NO_VAL) &&
6709 	    (part_ptr->default_time == 0)) {
6710 		info("%s: job's default time is 0", __func__);
6711 		rc = ESLURM_INVALID_TIME_LIMIT;
6712 		goto fini;
6713 	}
6714 
6715 	if ((job_desc->time_limit   == NO_VAL) &&
6716 	    (part_ptr->default_time != NO_VAL))
6717 		job_desc->time_limit = part_ptr->default_time;
6718 
6719 	if ((job_desc->time_min != NO_VAL) &&
6720 	    (job_desc->time_min >  max_time) &&
6721 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
6722 				       QOS_FLAG_PART_TIME_LIMIT)))) {
6723 		info("%s: job's min time greater than "
6724 		     "partition's (%u > %u)",
6725 		     __func__, job_desc->time_min, max_time);
6726 		rc = ESLURM_INVALID_TIME_MIN_LIMIT;
6727 		goto fini;
6728 	}
6729 	if ((job_desc->time_limit != NO_VAL) &&
6730 	    (job_desc->time_limit >  max_time) &&
6731 	    (job_desc->time_min   == NO_VAL) &&
6732 	    slurmctld_conf.enforce_part_limits &&
6733 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
6734 				       QOS_FLAG_PART_TIME_LIMIT)))) {
6735 		info("%s: job's time limit greater than "
6736 		     "partition's (%u > %u)",
6737 		     __func__, job_desc->time_limit, max_time);
6738 		rc = ESLURM_INVALID_TIME_LIMIT;
6739 		goto fini;
6740 	}
6741 	if ((job_desc->time_min != NO_VAL) &&
6742 	    (job_desc->time_min >  job_desc->time_limit) &&
6743 	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
6744 				       QOS_FLAG_PART_TIME_LIMIT)))) {
6745 		info("%s: job's min_time greater time limit "
6746 		     "(%u > %u)",
6747 		     __func__, job_desc->time_min, job_desc->time_limit);
6748 		rc = ESLURM_INVALID_TIME_MIN_LIMIT;
6749 		goto fini;
6750 	}
6751 	if ((job_desc->deadline) && (job_desc->deadline != NO_VAL)) {
6752 		char time_str_now[32];
6753 		char time_str_deadline[32];
6754 		time_t now = time(NULL);
6755 		slurm_make_time_str(&job_desc->deadline, time_str_deadline,
6756 				    sizeof(time_str_deadline));
6757 		slurm_make_time_str(&now, time_str_now, sizeof(time_str_now));
6758 		if (job_desc->deadline < now) {
6759 			info("%s: job's deadline smaller than now (%s < %s)",
6760 			     __func__, time_str_deadline, time_str_now);
6761 			rc = ESLURM_INVALID_TIME_LIMIT;
6762 			goto fini;
6763 		}
6764 		if ((job_desc->time_min) && (job_desc->time_min != NO_VAL) &&
6765 		    (job_desc->deadline < (now + job_desc->time_min * 60))) {
6766 			info("%s: job's min_time greater than deadline (%u > %s)",
6767 			     __func__, job_desc->time_min, time_str_deadline);
6768 			rc = ESLURM_INVALID_TIME_MIN_LIMIT;
6769 			goto fini;
6770 		}
6771 		if ((job_desc->time_min == 0) && (job_desc->time_limit) &&
6772 		    (job_desc->time_limit != NO_VAL) &&
6773 		    (job_desc->deadline < (now + job_desc->time_limit * 60))) {
6774 			info("%s: job's time_limit greater than deadline (%u > %s)",
6775 			     __func__, job_desc->time_limit, time_str_deadline);
6776 			rc = ESLURM_INVALID_TIME_LIMIT;
6777 			goto fini;
6778 		}
6779 	}
6780 
6781 fini:
6782 	return rc;
6783 }
6784 
6785 /*
6786  * job_limits_check - check the limits specified for the job.
6787  * IN job_ptr - pointer to job table entry.
6788  * IN check_min_time - if true test job's minimum time limit,
6789  *		otherwise test maximum time limit
6790  * RET WAIT_NO_REASON on success, fail status otherwise.
6791  */
6792 extern int job_limits_check(job_record_t **job_pptr, bool check_min_time)
6793 {
6794 	struct job_details *detail_ptr;
6795 	enum job_state_reason fail_reason;
6796 	part_record_t *part_ptr = NULL;
6797 	job_record_t *job_ptr = NULL;
6798 	slurmdb_qos_rec_t  *qos_ptr;
6799 	slurmdb_assoc_rec_t *assoc_ptr;
6800 	job_desc_msg_t job_desc;
6801 	int rc;
6802 
6803 	job_ptr = *job_pptr;
6804 	detail_ptr = job_ptr->details;
6805 	part_ptr = job_ptr->part_ptr;
6806 	qos_ptr = job_ptr->qos_ptr;
6807 	assoc_ptr = job_ptr->assoc_ptr;
6808 	if (!detail_ptr || !part_ptr) {
6809 		fatal_abort("%pJ has NULL details_ptr and/or part_ptr",
6810 			    job_ptr);
6811 		return WAIT_NO_REASON;	/* To prevent CLANG error */
6812 	}
6813 
6814 	fail_reason = WAIT_NO_REASON;
6815 
6816 	/*
6817 	 * Here we need to pretend we are just submitting the job so we can
6818 	 * utilize the already existing function _part_access_check. If any
6819 	 * additional fields in that function are ever checked, the fields set
6820 	 * below will need to be modified.
6821 	 */
6822 	slurm_init_job_desc_msg(&job_desc);
6823 	job_desc.reservation = job_ptr->resv_name;
6824 	job_desc.user_id = job_ptr->user_id;
6825 	job_desc.alloc_node = job_ptr->alloc_node;
6826 	job_desc.min_cpus = detail_ptr->orig_min_cpus;
6827 	job_desc.min_nodes = detail_ptr->min_nodes;
6828 	/* _part_access_check looks for NO_VAL instead of 0 */
6829 	job_desc.max_nodes = detail_ptr->max_nodes ?
6830 		detail_ptr->max_nodes : NO_VAL;
6831 	if (check_min_time && job_ptr->time_min)
6832 		job_desc.time_limit = job_ptr->time_min;
6833 	else
6834 		job_desc.time_limit = job_ptr->time_limit;
6835 
6836 	if ((rc = _part_access_check(part_ptr, &job_desc, NULL,
6837 				     job_ptr->user_id, qos_ptr,
6838 				     job_ptr->account))) {
6839 		debug2("%pJ can't run in partition %s: %s",
6840 		       job_ptr, part_ptr->name, slurm_strerror(rc));
6841 		switch (rc) {
6842 		case ESLURM_INVALID_TIME_LIMIT:
6843 		case ESLURM_INVALID_TIME_MIN_LIMIT:
6844 			if (job_ptr->limit_set.time != ADMIN_SET_LIMIT)
6845 				fail_reason = WAIT_PART_TIME_LIMIT;
6846 			break;
6847 		case ESLURM_INVALID_NODE_COUNT:
6848 			fail_reason = WAIT_PART_NODE_LIMIT;
6849 			break;
6850 		/* FIXME */
6851 		/* case ESLURM_TOO_MANY_REQUESTED_CPUS: */
6852 		/* 	failt_reason = NON_EXISTANT_WAIT_PART_CPU_LIMIT; */
6853 		/* 	break; */
6854 		default:
6855 			fail_reason = WAIT_PART_CONFIG;
6856 			break;
6857 		}
6858 	} else if (part_ptr->state_up == PARTITION_DOWN) {
6859 		debug2("%pJ requested down partition %s",
6860 		       job_ptr, part_ptr->name);
6861 		fail_reason = WAIT_PART_DOWN;
6862 	} else if (part_ptr->state_up == PARTITION_INACTIVE) {
6863 		debug2("%pJ requested inactive partition %s",
6864 		       job_ptr, part_ptr->name);
6865 		fail_reason = WAIT_PART_INACTIVE;
6866 	} else if (qos_ptr && assoc_ptr &&
6867 		   (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) &&
6868 		   (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) {
6869 		if (!job_ptr->prio_factors) {
6870 			job_ptr->prio_factors =
6871 				xmalloc(sizeof(priority_factors_object_t));
6872 		}
6873 		if (!job_ptr->prio_factors->priority_fs) {
6874 			if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL))
6875 				priority_g_set_assoc_usage(assoc_ptr);
6876 			job_ptr->prio_factors->priority_fs =
6877 				priority_g_calc_fs_factor(
6878 					assoc_ptr->usage->usage_efctv,
6879 					(long double)assoc_ptr->usage->
6880 					shares_norm);
6881 		}
6882 		if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){
6883 			debug2("%pJ exceeds usage threshold", job_ptr);
6884 			fail_reason = WAIT_QOS_THRES;
6885 		}
6886 	} else if (fail_reason == WAIT_NO_REASON) {
6887 		/*
6888 		 * Here we need to pretend we are just submitting the job so we
6889 		 * can utilize the already existing function _valid_pn_min_mem.
6890 		 * If anything else is ever checked in that function this will
6891 		 * most likely have to be updated. Some of the needed members
6892 		 * were already initialized above to call _part_access_check, as
6893 		 * well as the memset for job_desc.
6894 		 */
6895 		if (job_ptr->bit_flags & JOB_MEM_SET)
6896 			job_desc.pn_min_memory = detail_ptr->orig_pn_min_memory;
6897 		else if (part_ptr->def_mem_per_cpu)
6898 			job_desc.pn_min_memory = part_ptr->def_mem_per_cpu;
6899 		else
6900 			job_desc.pn_min_memory = slurmctld_conf.def_mem_per_cpu;
6901 		if (detail_ptr->orig_cpus_per_task == NO_VAL16)
6902 			job_desc.cpus_per_task = 1;
6903 		else
6904 			job_desc.cpus_per_task = detail_ptr->orig_cpus_per_task;
6905 		if (detail_ptr->num_tasks)
6906 			job_desc.num_tasks = detail_ptr->num_tasks;
6907 		else {
6908 			job_desc.num_tasks = job_desc.min_nodes;
6909 			if (detail_ptr->ntasks_per_node != NO_VAL16)
6910 				job_desc.num_tasks *=
6911 					detail_ptr->ntasks_per_node;
6912 		}
6913 		//job_desc.min_cpus = detail_ptr->min_cpus; /* init'ed above */
6914 		job_desc.max_cpus = detail_ptr->orig_max_cpus;
6915 		job_desc.shared = (uint16_t)detail_ptr->share_res;
6916 		job_desc.ntasks_per_node = detail_ptr->ntasks_per_node;
6917 		job_desc.pn_min_cpus = detail_ptr->orig_pn_min_cpus;
6918 		job_desc.job_id = job_ptr->job_id;
6919 		if (!_valid_pn_min_mem(&job_desc, part_ptr)) {
6920 			/* debug2 message already logged inside the function. */
6921 			fail_reason = WAIT_PN_MEM_LIMIT;
6922 		} else {
6923 			/* Copy back to job_record adjusted members */
6924 			detail_ptr->pn_min_memory = job_desc.pn_min_memory;
6925 			detail_ptr->cpus_per_task = job_desc.cpus_per_task;
6926 			detail_ptr->min_cpus = job_desc.min_cpus;
6927 			detail_ptr->max_cpus = job_desc.max_cpus;
6928 			detail_ptr->pn_min_cpus = job_desc.pn_min_cpus;
6929 		}
6930 	}
6931 
6932 	return (fail_reason);
6933 }
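
/*
 * Illustrative sketch (comment only, not compiled): how a scheduling pass
 * might consume job_limits_check(), recording the failure reason on the
 * job so squeue/scontrol can report it.
 *
 *	fail_reason = job_limits_check(&job_ptr, false);
 *	if (fail_reason != WAIT_NO_REASON) {
 *		job_ptr->state_reason = fail_reason;
 *		xfree(job_ptr->state_desc);
 *	}
 */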
6934 
6935 /*
6936  * _job_create - create a job table record for the supplied specifications.
6937  *	This performs only basic tests for request validity (access to
6938  *	partition, nodes count in partition, and sufficient processors in
6939  *	partition).
6940  * IN job_desc - job specifications
6941  * IN allocate - resource allocation request if set rather than job submit
6942  * IN will_run - job is not to be created, test of validity only
6943  * OUT job_pptr - pointer to the job (NULL on error)
6944  * OUT err_msg - Error message for user
6945  * RET 0 on success, otherwise ESLURM error code. If the job would only be
6946  *	able to execute with some change in partition configuration then
6947  *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
6948  */
6949 
6950 static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run,
6951 		       job_record_t **job_pptr, uid_t submit_uid,
6952 		       char **err_msg, uint16_t protocol_version)
6953 {
6954 	int error_code = SLURM_SUCCESS, i, qos_error;
6955 	part_record_t *part_ptr = NULL;
6956 	List part_ptr_list = NULL;
6957 	bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
6958 	job_record_t *job_ptr = NULL;
6959 	slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;
6960 	List license_list = NULL, gres_list = NULL;
6961 	bool valid;
6962 	slurmdb_qos_rec_t qos_rec, *qos_ptr;
6963 	uint32_t user_submit_priority, acct_reason = 0;
6964 	acct_policy_limit_set_t acct_policy_limit_set;
6965 
6966 	memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
6967 	acct_policy_limit_set.tres = xcalloc(slurmctld_tres_cnt,
6968 					     sizeof(uint16_t));
6969 
6970 	*job_pptr = NULL;
6971 
6972 	user_submit_priority = job_desc->priority;
6973 
6974 	/*
6975 	 * Reject X11 forwarding requests from 18.08 clients since the
6976 	 * implementation has changed, and support for setting up tunnels in
6977 	 * the older style was removed with no backwards compatibility.
6978 	 * Remove this two versions after 19.05 is released.
6979 	 */
6980 	if (job_desc->x11 && (protocol_version < SLURM_19_05_PROTOCOL_VERSION)) {
6981 		info("%s: cannot support X11 tunnelling from older salloc/srun",
6982 		     __func__);
6983 		error_code = ESLURM_X11_NOT_AVAIL;
6984 		goto cleanup_fail;
6985 	}
6986 
6987 	/* ensure that selected nodes are in this partition */
6988 	if (job_desc->req_nodes) {
6989 		error_code = node_name2bitmap(job_desc->req_nodes, false,
6990 					      &req_bitmap);
6991 		if (error_code) {
6992 			error_code = ESLURM_INVALID_NODE_NAME;
6993 			goto cleanup_fail;
6994 		}
6995 		if ((job_desc->contiguous != NO_VAL16) &&
6996 		    (job_desc->contiguous))
6997 			bit_fill_gaps(req_bitmap);
6998 		i = bit_set_count(req_bitmap);
6999 		if (i > job_desc->min_nodes)
7000 			job_desc->min_nodes = i;
7001 		if (i > job_desc->min_cpus)
7002 			job_desc->min_cpus = i;
7003 		if (job_desc->max_nodes &&
7004 		    (job_desc->min_nodes > job_desc->max_nodes)) {
7005 #if 0
7006 			info("%s: max node count less than required hostlist "
7007 			     "size for user %u", __func__, job_desc->user_id);
7008 			job_desc->max_nodes = job_desc->min_nodes;
7009 #else
7010 			error_code = ESLURM_INVALID_NODE_COUNT;
7011 			goto cleanup_fail;
7012 #endif
7013 		}
7014 	}
7015 
7016 	/* Zero node count OK for persistent burst buffer create or destroy */
7017 	if ((job_desc->max_nodes == 0) &&
7018 	    (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
7019 	     (!job_desc->burst_buffer && !job_desc->script))) {
7020 		info("%s: max_nodes is zero", __func__);
7021 		error_code = ESLURM_INVALID_NODE_COUNT;
7022 		goto cleanup_fail;
7023 	}
7024 
7025 	error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list,
7026 				    err_msg);
7027 	if (error_code != SLURM_SUCCESS)
7028 		goto cleanup_fail;
7029 
7030 	memset(&assoc_rec, 0, sizeof(assoc_rec));
7031 	assoc_rec.acct      = job_desc->account;
7032 	assoc_rec.partition = part_ptr->name;
7033 	assoc_rec.uid       = job_desc->user_id;
7034 	/*
7035 	 * Checks are done later to validate assoc_ptr, so we don't
7036 	 * need to lock outside of fill_in_assoc.
7037 	 */
7038 	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
7039 				    accounting_enforce, &assoc_ptr, false)) {
7040 		info("%s: invalid account or partition for user %u, "
7041 		     "account '%s', and partition '%s'", __func__,
7042 		     job_desc->user_id, assoc_rec.acct, assoc_rec.partition);
7043 		error_code = ESLURM_INVALID_ACCOUNT;
7044 		goto cleanup_fail;
7045 	} else if (association_based_accounting &&
7046 		   !assoc_ptr &&
7047 		   !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
7048 		/*
7049 		 * If not enforcing associations we want to look for the
7050 		 * default account and use it to avoid getting trash in the
7051 		 * accounting records.
7052 		 */
7053 		assoc_rec.acct = NULL;
7054 		(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
7055 					       accounting_enforce, &assoc_ptr,
7056 					       false);
7057 		if (assoc_ptr) {
7058 			info("%s: account '%s' has no association for user %u "
7059 			     "using default account '%s'",
7060 			     __func__, job_desc->account, job_desc->user_id,
7061 			     assoc_rec.acct);
7062 			xfree(job_desc->account);
7063 		}
7064 	}
7065 
7066 	if (job_desc->account == NULL)
7067 		job_desc->account = xstrdup(assoc_rec.acct);
7068 
7069 	/* This must be done after we have the assoc_ptr set */
7070 	memset(&qos_rec, 0, sizeof(qos_rec));
7071 	qos_rec.name = job_desc->qos;
7072 
7073 	qos_ptr = _determine_and_validate_qos(
7074 		job_desc->reservation, assoc_ptr, false, &qos_rec, &qos_error,
7075 		false, LOG_LEVEL_ERROR);
7076 
7077 	if (qos_error != SLURM_SUCCESS) {
7078 		error_code = qos_error;
7079 		goto cleanup_fail;
7080 	}
7081 
7082 	error_code = _valid_job_part(job_desc, submit_uid, req_bitmap,
7083 				     part_ptr, part_ptr_list,
7084 				     assoc_ptr, qos_ptr);
7085 	if (error_code != SLURM_SUCCESS)
7086 		goto cleanup_fail;
7087 
7088 	if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid,
7089 					     part_ptr, part_ptr_list))) {
7090 		goto cleanup_fail;
7091 	}
7092 
7093 	job_desc->tres_req_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));
7094 	job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes;
7095 	job_desc->tres_req_cnt[TRES_ARRAY_CPU]  = job_desc->min_cpus;
7096 	job_desc->tres_req_cnt[TRES_ARRAY_MEM]  = job_get_tres_mem(NULL,
7097 					job_desc->pn_min_memory,
7098 					job_desc->tres_req_cnt[TRES_ARRAY_CPU],
7099 					job_desc->min_nodes);
7100 
7101 	license_list = license_validate(job_desc->licenses,
7102 					validate_cfgd_licenses, true,
7103 					job_desc->tres_req_cnt, &valid);
7104 	if (!valid) {
7105 		info("Job's requested licenses are invalid: %s",
7106 		     job_desc->licenses);
7107 		error_code = ESLURM_INVALID_LICENSES;
7108 		goto cleanup_fail;
7109 	}
7110 
7111 	if ((error_code = gres_plugin_job_state_validate(
7112 						job_desc->cpus_per_tres,
7113 						job_desc->tres_freq,
7114 						job_desc->tres_per_job,
7115 						job_desc->tres_per_node,
7116 						job_desc->tres_per_socket,
7117 						job_desc->tres_per_task,
7118 						job_desc->mem_per_tres,
7119 						&job_desc->num_tasks,
7120 						&job_desc->min_nodes,
7121 						&job_desc->max_nodes,
7122 						&job_desc->ntasks_per_node,
7123 						&job_desc->ntasks_per_socket,
7124 						&job_desc->sockets_per_node,
7125 						&job_desc->cpus_per_task,
7126 						&gres_list)))
7127 		goto cleanup_fail;
7128 
7129 	if (!valid_tres_cnt(job_desc->cpus_per_tres)	||
7130 	    !valid_tres_cnt(job_desc->mem_per_tres)	||
7131 	    tres_bind_verify_cmdline(job_desc->tres_bind) ||
7132 	    tres_freq_verify_cmdline(job_desc->tres_freq) ||
7133 	    !valid_tres_cnt(job_desc->mem_per_tres)	||
7134 	    !valid_tres_cnt(job_desc->tres_per_job)	||
7135 	    !valid_tres_cnt(job_desc->tres_per_node)	||
7136 	    !valid_tres_cnt(job_desc->tres_per_socket)	||
7137 	    !valid_tres_cnt(job_desc->tres_per_task)) {
7138 		error_code = ESLURM_INVALID_TRES;
7139 		goto cleanup_fail;
7140 	}
7141 
7142 	gres_set_job_tres_cnt(gres_list,
7143 			      job_desc->min_nodes,
7144 			      job_desc->tres_req_cnt,
7145 			      false);
7146 
7147 	/*
7148 	 * Do this last, after the other TRES values have been set, as it uses
7149 	 * them to calculate the billing value.
7150 	 */
7151 	job_desc->tres_req_cnt[TRES_ARRAY_BILLING] =
7152 		assoc_mgr_tres_weighted(job_desc->tres_req_cnt,
7153 					part_ptr->billing_weights,
7154 					slurmctld_conf.priority_flags, false);
7155 
7156 	if ((error_code = bb_g_job_validate(job_desc, submit_uid))
7157 	    != SLURM_SUCCESS)
7158 		goto cleanup_fail;
7159 
7160 	if (job_desc->deadline && (job_desc->time_limit == NO_VAL) &&
7161 	    (job_desc->time_min == NO_VAL))
7162 		job_desc->time_min = 1;
7163 	if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) &&
7164 	    (!acct_policy_validate(job_desc, part_ptr,
7165 				   assoc_ptr, qos_ptr, &acct_reason,
7166 				   &acct_policy_limit_set, 0))) {
7167 		if (err_msg) {
7168 			xfree(*err_msg);
7169 			*err_msg = xstrdup(job_reason_string(acct_reason));
7170 		}
7171 		info("%s: exceeded association/QOS limit for user %u: %s",
7172 		     __func__, job_desc->user_id,
7173 		     err_msg ? *err_msg : job_reason_string(acct_reason));
7174 		error_code = ESLURM_ACCOUNTING_POLICY;
7175 		goto cleanup_fail;
7176 	}
7177 
7178 	if (job_desc->exc_nodes) {
7179 		error_code = node_name2bitmap(job_desc->exc_nodes, false,
7180 					      &exc_bitmap);
7181 		if (error_code) {
7182 			error_code = ESLURM_INVALID_NODE_NAME;
7183 			goto cleanup_fail;
7184 		}
7185 	}
7186 	if (exc_bitmap && req_bitmap) {
7187 		bitstr_t *tmp_bitmap = NULL;
7188 		bitoff_t first_set;
7189 		tmp_bitmap = bit_copy(exc_bitmap);
7190 		bit_and(tmp_bitmap, req_bitmap);
7191 		first_set = bit_ffs(tmp_bitmap);
7192 		FREE_NULL_BITMAP(tmp_bitmap);
7193 		if (first_set != -1) {
7194 			info("Job's required and excluded node lists overlap");
7195 			error_code = ESLURM_INVALID_NODE_NAME;
7196 			goto cleanup_fail;
7197 		}
7198 	}
7199 
7200 	if (job_desc->min_nodes == NO_VAL)
7201 		job_desc->min_nodes = 1;
7202 
7203 	if (job_desc->max_nodes == NO_VAL)
7204 		job_desc->max_nodes = 0;
7205 
7206 	if (job_desc->max_nodes &&
7207 	    (job_desc->max_nodes < job_desc->min_nodes)) {
7208 		info("%s: Job's max_nodes(%u) < min_nodes(%u)",
7209 		     __func__, job_desc->max_nodes, job_desc->min_nodes);
7210 		error_code = ESLURM_INVALID_NODE_COUNT;
7211 		goto cleanup_fail;
7212 	}
7213 
7214 	if ((error_code = _copy_job_desc_to_job_record(job_desc,
7215 						       job_pptr,
7216 						       &req_bitmap,
7217 						       &exc_bitmap))) {
7218 		if (error_code == SLURM_ERROR)
7219 			error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY;
7220 		job_ptr = *job_pptr;
7221 		goto cleanup_fail;
7222 	}
7223 
7224 	job_ptr = *job_pptr;
7225 	job_ptr->start_protocol_ver = protocol_version;
7226 	job_ptr->part_ptr = part_ptr;
7227 	job_ptr->part_ptr_list = part_ptr_list;
7228 	job_ptr->bit_flags |= JOB_DEPENDENT;
7229 	job_ptr->last_sched_eval = time(NULL);
7230 
7231 	part_ptr_list = NULL;
7232 
7233 	memcpy(&job_ptr->limit_set, &acct_policy_limit_set,
7234 	       sizeof(acct_policy_limit_set_t));
7235 	acct_policy_limit_set.tres = NULL;
7236 
7237 	job_ptr->assoc_id = assoc_rec.id;
7238 	job_ptr->assoc_ptr = (void *) assoc_ptr;
7239 	job_ptr->qos_ptr = (void *) qos_ptr;
7240 	job_ptr->qos_id = qos_rec.id;
7241 
7242 	if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label) != 0 ) {
7243 		if (job_desc->mcs_label == NULL) {
7244 			error("Failed to create job: No valid mcs_label found");
7245 		} else {
7246 			error("Failed to create job: Invalid mcs-label: %s",
7247 			      job_desc->mcs_label);
7248 		}
7249 		error_code = ESLURM_INVALID_MCS_LABEL;
7250 		goto cleanup_fail;
7251 	}
7252 
7253 	/*
7254 	 * Permission for altering priority was confirmed above. The job_submit
7255 	 * plugin may have set the priority directly or put the job on hold. If
7256 	 * the priority is not given, we will figure it out later after we see
7257 	 * if the job is eligible or not. So we want NO_VAL if not set.
7258 	 */
7259 	job_ptr->priority = job_desc->priority;
7260 	if (job_ptr->priority == 0) {
7261 		if (user_submit_priority == 0)
7262 			job_ptr->state_reason = WAIT_HELD_USER;
7263 		else
7264 			job_ptr->state_reason = WAIT_HELD;
7265 	} else if (job_ptr->priority != NO_VAL) {
7266 		job_ptr->direct_set_prio = 1;
7267 	}
7268 
7269 	/*
7270 	 * The job submit plugin sets site_factor to NO_VAL so that it can
7271 	 * only be set by the job submit plugin at submission.
7272 	 */
7273 	if (job_desc->site_factor != NO_VAL)
7274 		job_ptr->site_factor = job_desc->site_factor;
7275 
7276 	error_code = update_job_dependency(job_ptr, job_desc->dependency);
7277 	if (error_code != SLURM_SUCCESS)
7278 		goto cleanup_fail;
7279 	job_ptr->details->orig_dependency = xstrdup(job_ptr->details->
7280 						    dependency);
7281 
7282 	if ((error_code = build_feature_list(job_ptr)))
7283 		goto cleanup_fail;
7284 
7285 	/*
7286 	 * NOTE: If this job is being used to expand another job, this job's
7287 	 * gres_list has already been filled in with a copy of the gres_list
7288 	 * of the job to be expanded by update_job_dependency().
7289 	 */
7290 	if (!job_ptr->details->expanding_jobid) {
7291 		job_ptr->gres_list = gres_list;
7292 		gres_list = NULL;
7293 	}
7294 
7295 	job_ptr->gres_detail_cnt = 0;
7296 	job_ptr->gres_detail_str = NULL;
7297 	gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);
7298 
7299 	if ((error_code = validate_job_resv(job_ptr)))
7300 		goto cleanup_fail;
7301 
7302 	if (job_desc->script
7303 	    &&  (!will_run)) {	/* don't bother with copy if just a test */
7304 		if ((error_code = _copy_job_desc_to_file(job_desc,
7305 							 job_ptr->job_id))) {
7306 			error_code = ESLURM_WRITING_TO_FILE;
7307 			goto cleanup_fail;
7308 		}
7309 		job_ptr->batch_flag = 1;
7310 	} else
7311 		job_ptr->batch_flag = 0;
7312 	if (!will_run &&
7313 	    (error_code = bb_g_job_validate2(job_ptr, err_msg)))
7314 		goto cleanup_fail;
7315 
7316 	job_ptr->license_list = license_list;
7317 	license_list = NULL;
7318 
7319 	if (job_desc->req_switch != NO_VAL) {	/* Max # of switches */
7320 		job_ptr->req_switch = job_desc->req_switch;
7321 		if (job_desc->wait4switch != NO_VAL) {
7322 			job_ptr->wait4switch =
7323 				_max_switch_wait(job_desc->wait4switch);
7324 		} else
7325 			job_ptr->wait4switch = _max_switch_wait(INFINITE);
7326 	}
7327 	job_ptr->best_switch = true;
7328 
7329 	FREE_NULL_LIST(license_list);
7330 	FREE_NULL_LIST(gres_list);
7331 	FREE_NULL_BITMAP(req_bitmap);
7332 	FREE_NULL_BITMAP(exc_bitmap);
7333 	return error_code;
7334 
7335 cleanup_fail:
7336 	if (job_ptr) {
7337 		job_ptr->job_state = JOB_FAILED;
7338 		job_ptr->exit_code = 1;
7339 		job_ptr->state_reason = FAIL_SYSTEM;
7340 		xfree(job_ptr->state_desc);
7341 		job_ptr->start_time = job_ptr->end_time = time(NULL);
7342 		purge_job_record(job_ptr->job_id);
7343 		*job_pptr = NULL;
7344 	}
7345 	FREE_NULL_LIST(license_list);
7346 	xfree(acct_policy_limit_set.tres);
7347 	FREE_NULL_LIST(gres_list);
7348 	FREE_NULL_LIST(part_ptr_list);
7349 	FREE_NULL_BITMAP(req_bitmap);
7350 	FREE_NULL_BITMAP(exc_bitmap);
7351 	return error_code;
7352 }
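
/*
 * Illustrative sketch (comment only, not compiled): the submission path is
 * expected to call _job_create() along these lines, treating
 * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE as a queue-and-wait case rather
 * than a hard failure (per the return-code note above).  Variable names are
 * placeholders.
 *
 *	job_record_t *new_job_ptr = NULL;
 *	error_code = _job_create(job_desc, allocate, will_run, &new_job_ptr,
 *				 submit_uid, err_msg, protocol_version);
 *	if (error_code &&
 *	    (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE))
 *		return error_code;
 */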
7353 
7354 static int _test_strlen(char *test_str, char *str_name, int max_str_len)
7355 {
7356 	int i = 0;
7357 
7358 	if (test_str)
7359 		i = strlen(test_str);
7360 	if (i > max_str_len) {
7361 		info("job_create_request: strlen(%s) too big (%d > %d)",
7362 		     str_name, i, max_str_len);
7363 		return ESLURM_PATHNAME_TOO_LONG;
7364 	}
7365 	return SLURM_SUCCESS;
7366 }
7367 
7368 /* For each token in a comma-delimited job array expression, set the matching
7369  * bitmap entries */
7370 static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max)
7371 {
7372 	char *end_ptr = NULL;
7373 	int i, first, last, step = 1;
7374 
7375 	if (tok[0] == '[')	/* Strip leading "[" */
7376 		tok++;
7377 	first = strtol(tok, &end_ptr, 10);
7378 	if (end_ptr[0] == ']')	/* Strip trailing "]" */
7379 		end_ptr++;
7380 	if (first < 0)
7381 		return false;
7382 	if (end_ptr[0] == '-') {
7383 		last = strtol(end_ptr + 1, &end_ptr, 10);
7384 		if (end_ptr[0] == ']')	/* Strip trailing "]" */
7385 			end_ptr++;
7386 		if (end_ptr[0] == ':') {
7387 			step = strtol(end_ptr + 1, &end_ptr, 10);
7388 			if (end_ptr[0] == ']')	/* Strip trailing "]" */
7389 				end_ptr++;
7390 			if ((end_ptr[0] != '\0') && (end_ptr[0] != '%'))
7391 				return false;
7392 			if (step <= 0)
7393 				return false;
7394 		} else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) {
7395 			return false;
7396 		}
7397 		if (last < first)
7398 			return false;
7399 	} else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) {
7400 		return false;
7401 	} else {
7402 		last = first;
7403 	}
7404 
7405 	if (last >= max)
7406 		return false;
7407 
7408 	for (i = first; i <= last; i += step) {
7409 		bit_set(array_bitmap, i);
7410 	}
7411 
7412 	return true;
7413 }
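
/*
 * Illustrative sketch (comment only, not compiled): parsing the token
 * "1-9:2" sets task IDs 1, 3, 5, 7 and 9.  The bitmap size of 32 is a
 * placeholder for max_array_size.
 *
 *	bitstr_t *bm = bit_alloc(32);
 *	bool ok = _parse_array_tok("1-9:2", bm, 32);
 *	... ok is true and bit_set_count(bm) returns 5 ...
 *	FREE_NULL_BITMAP(bm);
 */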
7414 
7415 /* Translate a job array expression into the equivalent bitmap */
7416 static bool _valid_array_inx(job_desc_msg_t *job_desc)
7417 {
7418 	static time_t sched_update = 0;
7419 	static uint32_t max_task_cnt = NO_VAL;
7420 	uint32_t task_cnt;
7421 	bool valid = true;
7422 	char *tmp, *tok, *last = NULL;
7423 
7424 	FREE_NULL_BITMAP(job_desc->array_bitmap);
7425 	if (!job_desc->array_inx || !job_desc->array_inx[0])
7426 		return true;
7427 	if (!job_desc->script || !job_desc->script[0])
7428 		return false;
7429 
7430 	if (max_array_size == NO_VAL) {
7431 		max_array_size = slurmctld_conf.max_array_sz;
7432 	}
7433 	if (max_array_size == 0) {
7434 		verbose("Job arrays disabled, MaxArraySize=0");
7435 		return false;
7436 	}
7437 
7438 	if (sched_update != slurmctld_conf.last_update) {
7439 		char *sched_params = slurm_get_sched_params();
7440 		char *key;
7441 		max_task_cnt = max_array_size;
7442 		sched_update = slurmctld_conf.last_update;
7443 		if ((key = xstrcasestr(sched_params, "max_array_tasks="))) {
7444 			key += 16;
7445 			max_task_cnt = atoi(key);
7446 		}
7447 		xfree(sched_params);
7448 	}
7449 
7450 	/* We have a job array request */
7451 	job_desc->immediate = 0;	/* Disable immediate option */
7452 	job_desc->array_bitmap = bit_alloc(max_array_size);
7453 
7454 	tmp = xstrdup(job_desc->array_inx);
7455 	tok = strtok_r(tmp, ",", &last);
7456 	while (tok && valid) {
7457 		valid = _parse_array_tok(tok, job_desc->array_bitmap,
7458 					 max_array_size);
7459 		tok = strtok_r(NULL, ",", &last);
7460 	}
7461 	xfree(tmp);
7462 
7463 	if (valid && (max_task_cnt < max_array_size)) {
7464 		task_cnt = bit_set_count(job_desc->array_bitmap);
7465 		if (task_cnt > max_task_cnt) {
7466 			debug("max_array_tasks exceeded (%u > %u)",
7467 			      task_cnt, max_task_cnt);
7468 			valid = false;
7469 		}
7470 	}
7471 
7472 	return valid;
7473 }
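
/*
 * Worked example (values are illustrative only): with MaxArraySize=1000 and
 * SchedulerParameters=max_array_tasks=100 in slurm.conf, a request of
 * --array=0-999 parses into a valid bitmap but is rejected here because its
 * 1000 tasks exceed the 100-task cap, while --array=0-99 is accepted.
 */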
7474 
7475 /* Make sure a job descriptor's strings are not huge, which could result in
7476  * a denial of service attack due to memory demands by the slurmctld */
7477 static int _test_job_desc_fields(job_desc_msg_t * job_desc)
7478 {
7479 	static int max_script = -1;
7480 
7481 	if (max_script == -1) {
7482 		char *sched_params = slurm_get_sched_params();
7483 		char *tmp_ptr;
7484 		max_script = 4 * 1024 * 1024;
7485 		if ((tmp_ptr = xstrcasestr(sched_params, "max_script_size="))) {
7486 			max_script = atoi(tmp_ptr + 16);
7487 		}
7488 		xfree(sched_params);
7489 	}
7490 
7491 	if (_test_strlen(job_desc->account, "account", 1024)		||
7492 	    _test_strlen(job_desc->alloc_node, "alloc_node", 1024)	||
7493 	    _test_strlen(job_desc->array_inx, "array_inx", 1024 * 4)	||
7494 	    _test_strlen(job_desc->burst_buffer, "burst_buffer",1024*8) ||
7495 	    _test_strlen(job_desc->comment, "comment", 1024)		||
7496 	    _test_strlen(job_desc->cpu_bind, "cpu-bind", 1024 * 128)	||
7497 	    _test_strlen(job_desc->cpus_per_tres, "cpus_per_tres", 1024)||
7498 	    _test_strlen(job_desc->dependency, "dependency", 1024*128)	||
7499 	    _test_strlen(job_desc->features, "features", 1024)		||
7500 	    _test_strlen(
7501 		job_desc->cluster_features, "cluster_features", 1024)   ||
7502 	    _test_strlen(job_desc->licenses, "licenses", 1024)		||
7503 	    _test_strlen(job_desc->mail_user, "mail_user", 1024)	||
7504 	    _test_strlen(job_desc->mcs_label, "mcs_label", 1024)	||
7505 	    _test_strlen(job_desc->mem_bind, "mem-bind", 1024 * 128)	||
7506 	    _test_strlen(job_desc->mem_per_tres, "mem_per_tres", 1024)	||
7507 	    _test_strlen(job_desc->name, "name", 1024)			||
7508 	    _test_strlen(job_desc->network, "network", 1024)		||
7509 	    _test_strlen(job_desc->partition, "partition", 1024)	||
7510 	    _test_strlen(job_desc->qos, "qos", 1024)			||
7511 	    _test_strlen(job_desc->reservation, "reservation", 1024)	||
7512 	    _test_strlen(job_desc->script, "script", max_script)	||
7513 	    _test_strlen(job_desc->std_err, "std_err", MAXPATHLEN)	||
7514 	    _test_strlen(job_desc->std_in, "std_in", MAXPATHLEN)	||
7515 	    _test_strlen(job_desc->std_out, "std_out", MAXPATHLEN)	||
7516 	    _test_strlen(job_desc->tres_bind, "tres_bind", 1024)	||
7517 	    _test_strlen(job_desc->tres_freq, "tres_freq", 1024)	||
7518 	    _test_strlen(job_desc->tres_per_job, "tres_per_job", 1024)	||
7519 	    _test_strlen(job_desc->tres_per_node, "tres_per_node", 1024)||
7520 	    _test_strlen(job_desc->tres_per_socket, "tres_per_socket", 1024) ||
7521 	    _test_strlen(job_desc->tres_per_task, "tres_per_task", 1024)||
7522 	    _test_strlen(job_desc->wckey, "wckey", 1024)		||
7523 	    _test_strlen(job_desc->work_dir, "work_dir", MAXPATHLEN))
7524 		return ESLURM_PATHNAME_TOO_LONG;
7525 
7526 	return SLURM_SUCCESS;
7527 }
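
/*
 * Example (illustrative sketch): the per-field length cap applied above.
 * _test_strlen() is defined elsewhere in this file; the stand-in below is a
 * hypothetical minimal version with the same intent (the real helper's exact
 * message and return value may differ).  The script limit defaults to 4 MB
 * and can be raised with SchedulerParameters=max_script_size=#, as parsed at
 * the top of this function.
 *
 *	static int demo_test_strlen(const char *str, const char *name,
 *				    size_t max_len)
 *	{
 *		if (str && (strlen(str) > max_len)) {
 *			info("job_create_request: strlen(%s) too big (%zu > %zu)",
 *			     name, strlen(str), max_len);
 *			return -1;	// caller maps this to ESLURM_PATHNAME_TOO_LONG
 *		}
 *		return 0;
 *	}
 */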
7528 
7529 /* Perform some size checks on strings we store to prevent a
7530  * malicious user from filling slurmctld's memory
7531  * IN job_desc   - user job submit request
7532  * IN submit_uid - UID making job submit request
7533  * OUT err_msg   - custom error message to return
7534  * RET 0 or error code */
7535 extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid,
7536 				   char **err_msg)
7537 {
7538 	int rc;
7539 
7540 	/*
7541 	 * Check user permission for negative 'nice' and non-0 priority values
7542 	 * (restricted to root, SlurmUser, or SLURMDB_ADMIN_OPERATOR) _before_
7543 	 * running the job_submit plugin.
7544 	 */
7545 	if (!validate_operator(submit_uid)) {
7546 		if (job_desc->priority != 0)
7547 			job_desc->priority = NO_VAL;
7548 		if (job_desc->nice < NICE_OFFSET)
7549 			return ESLURM_INVALID_NICE;
7550 	}
7551 
7552 	if (!validate_super_user(submit_uid)) {
7553 		/* AdminComment can only be set by an Admin. */
7554 		if (job_desc->admin_comment)
7555 			return ESLURM_ACCESS_DENIED;
7556 
7557 		if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) {
7558 			*err_msg = xstrdup("rebooting of nodes is only allowed for admins");
7559 			return ESLURM_ACCESS_DENIED;
7560 		}
7561 	}
7562 
7563 	rc = job_submit_plugin_submit(job_desc, (uint32_t) submit_uid, err_msg);
7564 	if (rc != SLURM_SUCCESS)
7565 		return rc;
7566 	rc = node_features_g_job_valid(job_desc->features);
7567 	if (rc != SLURM_SUCCESS)
7568 		return rc;
7569 
7570 	rc = _test_job_desc_fields(job_desc);
7571 	if (rc != SLURM_SUCCESS)
7572 		return rc;
7573 
7574 	if (!_valid_array_inx(job_desc))
7575 		return ESLURM_INVALID_ARRAY;
7576 
7577 	if (job_desc->x11 && !(slurmctld_conf.prolog_flags & PROLOG_FLAG_X11))
7578 		return ESLURM_X11_NOT_AVAIL;
7579 
7580 	/* Make sure anything that may be put in the database will be
7581 	 * lower case */
7582 	xstrtolower(job_desc->account);
7583 	xstrtolower(job_desc->wckey);
7584 
7585 	/* Basic validation of some parameters */
7586 	if (job_desc->req_nodes) {
7587 		hostlist_t hl;
7588 		uint32_t host_cnt;
7589 		hl = hostlist_create(job_desc->req_nodes);
7590 		if (hl == NULL) {
7591 			/* likely a badly formatted hostlist */
7592 			error("validate_job_create_req: bad hostlist");
7593 			return ESLURM_INVALID_NODE_NAME;
7594 		}
7595 		host_cnt = hostlist_count(hl);
7596 		hostlist_destroy(hl);
7597 		if ((job_desc->min_nodes == NO_VAL) ||
7598 		    (job_desc->min_nodes <  host_cnt))
7599 			job_desc->min_nodes = host_cnt;
7600 	}
7601 
7602 	/* If max nodes differs from min nodes, don't set tasks or
7603 	 * it will hard-code the range.
7604 	 */
7605 	if ((job_desc->ntasks_per_node != NO_VAL16) &&
7606 	    (job_desc->min_nodes       != NO_VAL) &&
7607 	    (job_desc->num_tasks       == NO_VAL)) {
7608 		job_desc->num_tasks =
7609 			job_desc->ntasks_per_node * job_desc->min_nodes;
7610 	}
7611 
7612 	/* Only set min and max cpus if overcommit isn't set */
7613 	if ((job_desc->overcommit == NO_VAL8) &&
7614 	    (job_desc->min_cpus   != NO_VAL)  &&
7615 	    (job_desc->num_tasks  != NO_VAL)  &&
7616 	    (job_desc->num_tasks > job_desc->min_cpus)) {
7617 		if (job_desc->num_tasks != NO_VAL)
7618 			job_desc->min_cpus = job_desc->num_tasks;
7619 		else if (job_desc->min_nodes != NO_VAL)
7620 			job_desc->min_cpus = job_desc->min_nodes;
7621 		else
7622 			job_desc->min_cpus = 1;
7623 
7624 		if (job_desc->cpus_per_task != NO_VAL16)
7625 			job_desc->min_cpus *= job_desc->cpus_per_task;
7626 		/* This is just a sanity check as we wouldn't ever have a
7627 		 * max_cpus if we didn't have a min_cpus.
7628 		 */
7629 		if ((job_desc->max_cpus != NO_VAL) &&
7630 		    (job_desc->max_cpus < job_desc->min_cpus))
7631 			job_desc->max_cpus = job_desc->min_cpus;
7632 	}
7633 
7634 	if (job_desc->reboot && (job_desc->reboot != NO_VAL16))
7635 		job_desc->shared = 0;
7636 
7637 	return SLURM_SUCCESS;
7638 }
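
/*
 * Worked example (illustrative) for the derivations above: a request with
 * ntasks_per_node = 4 and min_nodes = 3 but no explicit task count gets
 * num_tasks = 4 * 3 = 12.  If cpus_per_task = 2 and overcommit is not set,
 * min_cpus is then raised to num_tasks * cpus_per_task = 24, and max_cpus is
 * bumped up to at least 24 if it was set lower.
 */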
7639 
7640 /* _copy_job_desc_to_file - copy the job script and environment from the RPC
7641  *	structure into a file */
7642 static int
7643 _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id)
7644 {
7645 	int error_code = 0, hash;
7646 	char *dir_name, *file_name;
7647 	DEF_TIMERS;
7648 
7649 	START_TIMER;
7650 
7651 	if (!job_desc->environment || job_desc->env_size == 0) {
7652 		error("%s: batch job cannot run without an environment",
7653 		      __func__);
7654 		return ESLURM_ENVIRONMENT_MISSING;
7655 	}
7656 
7657 	/* Create directory based upon job ID due to limitations on the number
7658 	 * of files possible in a directory on some file system types (e.g.
7659 	 * up to 64k files on a FAT32 file system). */
7660 	hash = job_id % 10;
7661 	dir_name = xstrdup_printf("%s/hash.%d",
7662 				  slurmctld_conf.state_save_location, hash);
7663 	(void) mkdir(dir_name, 0700);
7664 
7665 	/* Create job_id specific directory */
7666 	xstrfmtcat(dir_name, "/job.%u", job_id);
7667 	if (mkdir(dir_name, 0700)) {
7668 		if (!slurmctld_primary && (errno == EEXIST)) {
7669 			error("Apparent duplicate JobId=%u. Two primary slurmctld daemons might currently be active",
7670 			      job_id);
7671 		}
7672 		error("mkdir(%s) error %m", dir_name);
7673 		xfree(dir_name);
7674 		return ESLURM_WRITING_TO_FILE;
7675 	}
7676 
7677 	/* Create environment file, and write data to it */
7678 	file_name = xstrdup_printf("%s/environment", dir_name);
7679 	error_code = _write_data_array_to_file(file_name,
7680 					       job_desc->environment,
7681 					       job_desc->env_size);
7682 	xfree(file_name);
7683 
7684 	if (error_code == 0) {
7685 		/* Create script file */
7686 		file_name = xstrdup_printf("%s/script", dir_name);
7687 		error_code = _write_data_to_file(file_name, job_desc->script);
7688 		xfree(file_name);
7689 	}
7690 
7691 	xfree(dir_name);
7692 	END_TIMER2("_copy_job_desc_to_file");
7693 	return error_code;
7694 }
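
/*
 * Example (illustrative, assuming StateSaveLocation=/var/spool/slurmctld):
 * for JobId=1234 (1234 % 10 == 4) the function above creates
 *
 *	/var/spool/slurmctld/hash.4/job.1234/environment
 *	/var/spool/slurmctld/hash.4/job.1234/script
 *
 * A stand-alone path builder following the same convention:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char path[4096];
 *		unsigned job_id = 1234;
 *
 *		snprintf(path, sizeof(path), "%s/hash.%u/job.%u/script",
 *			 "/var/spool/slurmctld", job_id % 10, job_id);
 *		puts(path);
 *		return 0;
 *	}
 */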
7695 
7696 /* Return true if the specified job ID already has a batch directory so
7697  * that a different job ID can be created. This is to help limit damage from
7698  * split-brain, where two slurmctld daemons are running as primary. */
7699 static bool _dup_job_file_test(uint32_t job_id)
7700 {
7701 	char *dir_name_src;
7702 	struct stat buf;
7703 	int rc, hash = job_id % 10;
7704 
7705 	dir_name_src = xstrdup_printf("%s/hash.%d/job.%u",
7706 				      slurmctld_conf.state_save_location,
7707 				      hash, job_id);
7708 	rc = stat(dir_name_src, &buf);
7709 	xfree(dir_name_src);
7710 	if (rc == 0) {
7711 		error("Vestigial state files for JobId=%u, but no job record. This may be the result of two slurmctld running in primary mode",
7712 		      job_id);
7713 		return true;
7714 	}
7715 	return false;
7716 }
7717 
7718 /*
7719  * Create file with specified name and write the supplied data array to it
7720  * IN file_name - file to create and write to
7721  * IN data - array of pointers to strings (e.g. env)
7722  * IN size - number of elements in data
7723  */
7724 static int
7725 _write_data_array_to_file(char *file_name, char **data, uint32_t size)
7726 {
7727 	int fd, i, pos, nwrite, amount;
7728 
7729 	fd = creat(file_name, 0600);
7730 	if (fd < 0) {
7731 		error("Error creating file %s, %m", file_name);
7732 		return ESLURM_WRITING_TO_FILE;
7733 	}
7734 
7735 	amount = write(fd, &size, sizeof(uint32_t));
7736 	if (amount < sizeof(uint32_t)) {
7737 		error("Error writing file %s, %m", file_name);
7738 		close(fd);
7739 		return ESLURM_WRITING_TO_FILE;
7740 	}
7741 
7742 	if (data == NULL) {
7743 		close(fd);
7744 		return SLURM_SUCCESS;
7745 	}
7746 
7747 	for (i = 0; i < size; i++) {
7748 		nwrite = strlen(data[i]) + 1;
7749 		pos = 0;
7750 		while (nwrite > 0) {
7751 			amount = write(fd, &data[i][pos], nwrite);
7752 			if ((amount < 0) && (errno != EINTR)) {
7753 				error("Error writing file %s, %m",
7754 				      file_name);
7755 				close(fd);
7756 				return ESLURM_WRITING_TO_FILE;
7757 			}
7758 			nwrite -= amount;
7759 			pos    += amount;
7760 		}
7761 	}
7762 
7763 	close(fd);
7764 	return SLURM_SUCCESS;
7765 }
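
/*
 * Example (illustrative sketch): the loop above keeps writing until the
 * buffer is drained, treating short writes as retryable.  The same pattern,
 * stand-alone and with EINTR retried explicitly (demo_write_all is a
 * hypothetical helper, not a Slurm function):
 *
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	static int demo_write_all(int fd, const char *buf, size_t len)
 *	{
 *		size_t pos = 0;
 *
 *		while (pos < len) {
 *			ssize_t n = write(fd, buf + pos, len - pos);
 *			if (n < 0) {
 *				if (errno == EINTR)
 *					continue;	// retry interrupted write
 *				return -1;
 *			}
 *			pos += n;
 *		}
 *		return 0;
 *	}
 *
 * The file produced above starts with a uint32_t record count followed by
 * that many NUL-terminated strings, which is the layout
 * _read_data_array_from_file() below expects.
 */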
7766 
7767 /*
7768  * Create file with specified name and write the supplied string to it
7769  * IN file_name - file to create and write to
7770  * IN data - pointer to string
7771  */
7772 static int _write_data_to_file(char *file_name, char *data)
7773 {
7774 	int fd, pos, nwrite, amount;
7775 
7776 	if (data == NULL) {
7777 		(void) unlink(file_name);
7778 		return SLURM_SUCCESS;
7779 	}
7780 
7781 	fd = creat(file_name, 0700);
7782 	if (fd < 0) {
7783 		error("Error creating file %s, %m", file_name);
7784 		return ESLURM_WRITING_TO_FILE;
7785 	}
7786 
7787 	nwrite = strlen(data) + 1;
7788 	pos = 0;
7789 	while (nwrite > 0) {
7790 		amount = write(fd, &data[pos], nwrite);
7791 		if ((amount < 0) && (errno != EINTR)) {
7792 			error("Error writing file %s, %m", file_name);
7793 			close(fd);
7794 			return ESLURM_WRITING_TO_FILE;
7795 		}
7796 		nwrite -= amount;
7797 		pos    += amount;
7798 	}
7799 	close(fd);
7800 	return SLURM_SUCCESS;
7801 }
7802 
7803 /*
7804  * get_job_env - return the environment variables and their count for a
7805  *	given job
7806  * IN job_ptr - pointer to job for which data is required
7807  * OUT env_size - number of elements to read
7808  * RET point to array of string pointers containing environment variables
7809  */
7810 char **get_job_env(job_record_t *job_ptr, uint32_t *env_size)
7811 {
7812 	char *file_name = NULL, **environment = NULL;
7813 	int cc, fd = -1, hash;
7814 	uint32_t use_id;
7815 
7816 	use_id = (job_ptr->array_task_id != NO_VAL) ?
7817 		job_ptr->array_job_id : job_ptr->job_id;
7818 	hash = use_id % 10;
7819 	file_name = xstrdup_printf("%s/hash.%d/job.%u/environment",
7820 				   slurmctld_conf.state_save_location,
7821 				   hash, use_id);
7822 	fd = open(file_name, 0);
7823 
7824 	if (fd >= 0) {
7825 		cc = _read_data_array_from_file(fd, file_name, &environment,
7826 						env_size, job_ptr);
7827 		if (cc < 0)
7828 			environment = NULL;
7829 		close(fd);
7830 	} else {
7831 		error("Could not open environment file for %pJ", job_ptr);
7832 	}
7833 
7834 	xfree(file_name);
7835 	return environment;
7836 }
7837 
7838 /*
7839  * get_job_script - return the script for a given job
7840  * IN job_ptr - pointer to job for which data is required
7841  * RET Buf containing job script
7842  */
7843 Buf get_job_script(const job_record_t *job_ptr)
7844 {
7845 	char *file_name = NULL;
7846 	int hash;
7847 	uint32_t use_id;
7848 	Buf buf;
7849 
7850 	if (!job_ptr->batch_flag)
7851 		return NULL;
7852 
7853 	use_id = (job_ptr->array_task_id != NO_VAL) ?
7854 		job_ptr->array_job_id : job_ptr->job_id;
7855 	hash = use_id % 10;
7856 	file_name = xstrdup_printf("%s/hash.%d/job.%u/script",
7857 				   slurmctld_conf.state_save_location,
7858 				   hash, use_id);
7859 
7860 	if (!(buf = create_mmap_buf(file_name)))
7861 		error("Could not open script file for %pJ", job_ptr);
7862 	xfree(file_name);
7863 
7864 	return buf;
7865 }
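
/*
 * Example (illustrative, with made-up job IDs): every task of a job array
 * shares the batch directory of the array's base job.  For JobId=1300 with
 * ArrayJobId=1234 and ArrayTaskId=7, use_id is 1234, so the script is read
 * from <StateSaveLocation>/hash.4/job.1234/script; a regular (non-array)
 * JobId=1300 would instead read <StateSaveLocation>/hash.0/job.1300/script.
 */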
7866 
7867 /*
7868  * Read a collection of strings from a file
7869  * IN fd - file descriptor
7870  * IN file_name - file to read from
7871  * OUT data - pointer to array of pointers to strings (e.g. env),
7872  *	must be xfreed when no longer needed
7873  * OUT size - number of elements in data
7874  * IN job_ptr - job
7875  * RET 0 on success, -1 on error
7876  * NOTE: The output format of this must be identical with _xduparray2()
7877  */
7878 static int _read_data_array_from_file(int fd, char *file_name, char ***data,
7879 				      uint32_t *size, job_record_t *job_ptr)
7880 {
7881 	int pos, buf_size, amount, i, j;
7882 	char *buffer, **array_ptr;
7883 	uint32_t rec_cnt;
7884 
7885 	xassert(file_name);
7886 	xassert(data);
7887 	xassert(size);
7888 	*data = NULL;
7889 	*size = 0;
7890 
7891 	amount = read(fd, &rec_cnt, sizeof(uint32_t));
7892 	if (amount < sizeof(uint32_t)) {
7893 		if (amount != 0)	/* incomplete write */
7894 			error("Error reading file %s, %m", file_name);
7895 		else
7896 			verbose("File %s has zero size", file_name);
7897 		return -1;
7898 	}
7899 
7900 	if (rec_cnt >= INT_MAX) {
7901 		error("%s: unreasonable record counter %d in file %s",
7902 		      __func__, rec_cnt, file_name);
7903 		return -1;
7904 	}
7905 
7906 	if (rec_cnt == 0) {
7907 		*data = NULL;
7908 		*size = 0;
7909 		return 0;
7910 	}
7911 
7912 	pos = 0;
7913 	buf_size = BUF_SIZE;
7914 	buffer = xmalloc(buf_size + 1);
7915 	while (1) {
7916 		amount = read(fd, &buffer[pos], BUF_SIZE);
7917 		if (amount < 0) {
7918 			error("Error reading file %s, %m", file_name);
7919 			xfree(buffer);
7920 			return -1;
7921 		}
7922 		buffer[pos + amount] = '\0';
7923 		pos += amount;
7924 		if (amount < BUF_SIZE)	/* end of file */
7925 			break;
7926 		buf_size += amount;
7927 		xrealloc(buffer, buf_size + 1);
7928 	}
7929 
7930 	/* Allocate extra space for supplemental environment variables */
7931 	if (job_ptr->details->env_cnt) {
7932 		for (j = 0; j < job_ptr->details->env_cnt; j++)
7933 			pos += (strlen(job_ptr->details->env_sup[j]) + 1);
7934 		xrealloc(buffer, pos);
7935 	}
7936 
7937 	/* We have all the data, now let's compute the pointers */
7938 	array_ptr = xcalloc((rec_cnt + job_ptr->details->env_cnt),
7939 			    sizeof(char *));
7940 	for (i = 0, pos = 0; i < rec_cnt; i++) {
7941 		array_ptr[i] = &buffer[pos];
7942 		pos += strlen(&buffer[pos]) + 1;
7943 		if ((pos > buf_size) && ((i + 1) < rec_cnt)) {
7944 			error("Bad environment file %s", file_name);
7945 			rec_cnt = i;
7946 			break;
7947 		}
7948 	}
7949 
7950 	/* Add supplemental environment variables */
7951 	if (job_ptr->details->env_cnt) {
7952 		char *tmp_chr;
7953 		int env_len, name_len;
7954 		for (j = 0; j < job_ptr->details->env_cnt; j++) {
7955 			tmp_chr = strchr(job_ptr->details->env_sup[j], '=');
7956 			if (tmp_chr == NULL) {
7957 				error("Invalid supplemental environment "
7958 				      "variable: %s",
7959 				      job_ptr->details->env_sup[j]);
7960 				continue;
7961 			}
7962 			env_len  = strlen(job_ptr->details->env_sup[j]) + 1;
7963 			name_len = tmp_chr - job_ptr->details->env_sup[j] + 1;
7964 			/* search for duplicate */
7965 			for (i = 0; i < rec_cnt; i++) {
7966 				if (xstrncmp(array_ptr[i],
7967 					     job_ptr->details->env_sup[j],
7968 					     name_len)) {
7969 					continue;
7970 				}
7971 
7972 				/*
7973 				 * If we are at the front we cannot overwrite
7974 				 * that spot; we can clear it and then add to the
7975 				 * end of the array.
7976 				 */
7977 				if (i == 0) {
7978 					array_ptr[0][0] = '\0';
7979 					i = rec_cnt;
7980 					break;
7981 				}
7982 				/* over-write duplicate */
7983 				memcpy(&buffer[pos],
7984 				       job_ptr->details->env_sup[j], env_len);
7985 				array_ptr[i] = &buffer[pos];
7986 				pos += env_len;
7987 				break;
7988 			}
7989 			if (i >= rec_cnt) {	/* add env to array end */
7990 				memcpy(&buffer[pos],
7991 				       job_ptr->details->env_sup[j], env_len);
7992 				array_ptr[rec_cnt++] = &buffer[pos];
7993 				pos += env_len;
7994 			}
7995 		}
7996 	}
7997 
7998 	*size = rec_cnt;
7999 	*data = array_ptr;
8000 	return 0;
8001 }
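
/*
 * Worked example (illustrative) of the supplemental-variable merge above:
 * given a stored environment of
 *
 *	PATH=/bin
 *	FOO=1
 *
 * and details->env_sup = { "FOO=2", "BAZ=3" }, the returned array contains
 * PATH=/bin, FOO=2 and BAZ=3: the FOO duplicate's array slot is repointed at
 * a fresh copy appended to the buffer, and BAZ=3 is appended with rec_cnt
 * incremented.  If the duplicate had been the very first entry, it would be
 * blanked and the new value appended at the end instead.  The comparison
 * uses the "NAME=" prefix (name_len includes the '='), so FOO= does not
 * collide with FOOBAR=.
 */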
8002 
8003 /* Given a job request, return a multi_core_data struct.
8004  * Returns NULL if no values set in the job/step request */
8005 static multi_core_data_t *
8006 _set_multi_core_data(job_desc_msg_t * job_desc)
8007 {
8008 	multi_core_data_t * mc_ptr;
8009 
8010 	if ((job_desc->sockets_per_node  == NO_VAL16)	&&
8011 	    (job_desc->cores_per_socket  == NO_VAL16)	&&
8012 	    (job_desc->threads_per_core  == NO_VAL16)	&&
8013 	    (job_desc->ntasks_per_socket == NO_VAL16)	&&
8014 	    (job_desc->ntasks_per_core   == NO_VAL16)	&&
8015 	    (job_desc->plane_size        == NO_VAL16))
8016 		return NULL;
8017 
8018 	mc_ptr = xmalloc(sizeof(multi_core_data_t));
8019 	mc_ptr->sockets_per_node = job_desc->sockets_per_node;
8020 	mc_ptr->cores_per_socket = job_desc->cores_per_socket;
8021 	mc_ptr->threads_per_core = job_desc->threads_per_core;
8022 	if (job_desc->ntasks_per_socket != NO_VAL16)
8023 		mc_ptr->ntasks_per_socket  = job_desc->ntasks_per_socket;
8024 	else
8025 		mc_ptr->ntasks_per_socket  = INFINITE16;
8026 	if (job_desc->ntasks_per_core != NO_VAL16)
8027 		mc_ptr->ntasks_per_core    = job_desc->ntasks_per_core;
8028 	else if (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE)
8029 		mc_ptr->ntasks_per_core    = 1;
8030 	else
8031 		mc_ptr->ntasks_per_core    = INFINITE16;
8032 	if (job_desc->plane_size != NO_VAL16)
8033 		mc_ptr->plane_size         = job_desc->plane_size;
8034 	else
8035 		mc_ptr->plane_size         = 0;
8036 
8037 	return mc_ptr;
8038 }
8039 
8040 /* Return default "wait_all_nodes" option for a new job */
8041 static uint16_t _default_wait_all_nodes(job_desc_msg_t *job_desc)
8042 {
8043 	static uint16_t default_batch_wait = NO_VAL16;
8044 	static time_t sched_update = 0;
8045 	char *sched_params;
8046 
8047 	if (!job_desc->script)
8048 		return 0;
8049 
8050 	if ((default_batch_wait != NO_VAL16) &&
8051 	    (sched_update == slurmctld_conf.last_update))
8052 		return default_batch_wait;
8053 
8054 	sched_params = slurm_get_sched_params();
8055 	if (xstrcasestr(sched_params, "sbatch_wait_nodes"))
8056 		default_batch_wait = 1;
8057 	else
8058 		default_batch_wait = 0;
8059 	xfree(sched_params);
8060 	sched_update = slurmctld_conf.last_update;
8061 
8062 	return default_batch_wait;
8063 }
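
/*
 * Example (illustrative sketch): the cache-until-reconfigure pattern used
 * above and by several other helpers in this file.  A static value is
 * recomputed only when slurmctld_conf.last_update changes (the variable
 * names below are illustrative only):
 *
 *	static uint16_t cached = NO_VAL16;
 *	static time_t cache_time = 0;
 *
 *	if ((cached == NO_VAL16) ||
 *	    (cache_time != slurmctld_conf.last_update)) {
 *		char *params = slurm_get_sched_params();
 *		cached = xstrcasestr(params, "sbatch_wait_nodes") ? 1 : 0;
 *		cache_time = slurmctld_conf.last_update;
 *		xfree(params);
 *	}
 */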
8064 
8065 /* _copy_job_desc_to_job_record - copy the job descriptor from the RPC
8066  *	structure into the actual slurmctld job record */
8067 static int _copy_job_desc_to_job_record(job_desc_msg_t *job_desc,
8068 					job_record_t **job_rec_ptr,
8069 					bitstr_t **req_bitmap,
8070 					bitstr_t **exc_bitmap)
8071 {
8072 	int error_code;
8073 	struct job_details *detail_ptr;
8074 	job_record_t *job_ptr;
8075 
8076 	if (slurm_get_track_wckey()) {
8077 		if (!job_desc->wckey) {
8078 			/* get the default wckey for this user since none was
8079 			 * given */
8080 			slurmdb_user_rec_t user_rec;
8081 			memset(&user_rec, 0, sizeof(user_rec));
8082 			user_rec.uid = job_desc->user_id;
8083 			assoc_mgr_fill_in_user(acct_db_conn, &user_rec,
8084 					       accounting_enforce, NULL, false);
8085 			if (user_rec.default_wckey)
8086 				job_desc->wckey = xstrdup_printf(
8087 					"*%s", user_rec.default_wckey);
8088 			else if (!(accounting_enforce &
8089 				   ACCOUNTING_ENFORCE_WCKEYS))
8090 				job_desc->wckey = xstrdup("*");
8091 			else {
8092 				error("Job didn't specify wckey and user "
8093 				      "%d has no default.", job_desc->user_id);
8094 				return ESLURM_INVALID_WCKEY;
8095 			}
8096 		} else if (job_desc->wckey) {
8097 			slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL;
8098 
8099 			memset(&wckey_rec, 0, sizeof(wckey_rec));
8100 			wckey_rec.uid       = job_desc->user_id;
8101 			wckey_rec.name      = job_desc->wckey;
8102 
8103 			if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
8104 						    accounting_enforce,
8105 						    &wckey_ptr, false)) {
8106 				if (accounting_enforce &
8107 				    ACCOUNTING_ENFORCE_WCKEYS) {
8108 					error("%s: invalid wckey '%s' for "
8109 					      "user %u.",
8110 					      __func__, wckey_rec.name,
8111 					      job_desc->user_id);
8112 					return ESLURM_INVALID_WCKEY;
8113 				}
8114 			}
8115 		} else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) {
8116 			/* This should never happen */
8117 			info("%s: no wckey was given for job submit", __func__);
8118 			return ESLURM_INVALID_WCKEY;
8119 		}
8120 	}
8121 
8122 	job_ptr = _create_job_record(1);
8123 	if (!job_ptr)
8124 		return SLURM_ERROR;
8125 
8126 	*job_rec_ptr = job_ptr;
8127 	job_ptr->partition = xstrdup(job_desc->partition);
8128 	if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET)
8129 		job_ptr->profile = job_desc->profile;
8130 
8131 	if (job_desc->job_id != NO_VAL) {	/* already confirmed unique */
8132 		job_ptr->job_id = job_desc->job_id;
8133 	} else {
8134 		error_code = _set_job_id(job_ptr);
8135 		if (error_code)
8136 			return error_code;
8137 	}
8138 
8139 	job_ptr->name = xstrdup(job_desc->name);
8140 	job_ptr->wckey = xstrdup(job_desc->wckey);
8141 
8142 	/* Since this is only used in the slurmctld, copy it now. */
8143 	job_ptr->tres_req_cnt = job_desc->tres_req_cnt;
8144 	job_desc->tres_req_cnt = NULL;
8145 	set_job_tres_req_str(job_ptr, false);
8146 	_add_job_hash(job_ptr);
8147 
8148 	job_ptr->user_id    = (uid_t) job_desc->user_id;
8149 	job_ptr->group_id   = (gid_t) job_desc->group_id;
8150 	job_ptr->job_state  = JOB_PENDING;
8151 	job_ptr->time_limit = job_desc->time_limit;
8152 	job_ptr->deadline   = job_desc->deadline;
8153 	if (job_desc->delay_boot == NO_VAL)
8154 		job_ptr->delay_boot   = delay_boot;
8155 	else
8156 		job_ptr->delay_boot   = job_desc->delay_boot;
8157 	if (job_desc->time_min != NO_VAL)
8158 		job_ptr->time_min = job_desc->time_min;
8159 	job_ptr->alloc_sid  = job_desc->alloc_sid;
8160 	job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
8161 	job_ptr->account    = xstrdup(job_desc->account);
8162 	job_ptr->batch_features = xstrdup(job_desc->batch_features);
8163 	job_ptr->burst_buffer = xstrdup(job_desc->burst_buffer);
8164 	job_ptr->network    = xstrdup(job_desc->network);
8165 	job_ptr->resv_name  = xstrdup(job_desc->reservation);
8166 	job_ptr->restart_cnt = job_desc->restart_cnt;
8167 	job_ptr->comment    = xstrdup(job_desc->comment);
8168 	job_ptr->admin_comment = xstrdup(job_desc->admin_comment);
8169 
8170 	if (job_desc->kill_on_node_fail != NO_VAL16)
8171 		job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;
8172 
8173 	job_ptr->resp_host = xstrdup(job_desc->resp_host);
8174 	job_ptr->alloc_resp_port = job_desc->alloc_resp_port;
8175 	job_ptr->other_port = job_desc->other_port;
8176 	job_ptr->power_flags = job_desc->power_flags;
8177 	job_ptr->time_last_active = time(NULL);
8178 	job_ptr->cr_enabled = 0;
8179 	job_ptr->derived_ec = 0;
8180 
8181 	job_ptr->licenses  = xstrdup(job_desc->licenses);
8182 	job_ptr->mail_user = _get_mail_user(job_desc->mail_user,
8183 					    job_ptr->user_id);
8184 	if (job_desc->mail_type &&
8185 	    (job_desc->mail_type != NO_VAL16)) {
8186 		job_ptr->mail_type = job_desc->mail_type;
8187 	}
8188 
8189 	job_ptr->bit_flags = job_desc->bitflags;
8190 	job_ptr->bit_flags &= ~BACKFILL_TEST;
8191 	job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
8192 	job_ptr->spank_job_env = job_desc->spank_job_env;
8193 	job_ptr->spank_job_env_size = job_desc->spank_job_env_size;
8194 	job_desc->spank_job_env = (char **) NULL; /* nothing left to free */
8195 	job_desc->spank_job_env_size = 0;         /* nothing left to free */
8196 	job_ptr->mcs_label = xstrdup(job_desc->mcs_label);
8197 	job_ptr->origin_cluster = xstrdup(job_desc->origin_cluster);
8198 
8199 	job_ptr->cpus_per_tres = xstrdup(job_desc->cpus_per_tres);
8200 	job_ptr->mem_per_tres = xstrdup(job_desc->mem_per_tres);
8201 	job_ptr->tres_bind = xstrdup(job_desc->tres_bind);
8202 	job_ptr->tres_freq = xstrdup(job_desc->tres_freq);
8203 	job_ptr->tres_per_job = xstrdup(job_desc->tres_per_job);
8204 	job_ptr->tres_per_node = xstrdup(job_desc->tres_per_node);
8205 	job_ptr->tres_per_socket = xstrdup(job_desc->tres_per_socket);
8206 	job_ptr->tres_per_task = xstrdup(job_desc->tres_per_task);
8207 
8208 	if (job_desc->wait_all_nodes == NO_VAL16)
8209 		job_ptr->wait_all_nodes = _default_wait_all_nodes(job_desc);
8210 	else
8211 		job_ptr->wait_all_nodes = job_desc->wait_all_nodes;
8212 	job_ptr->warn_flags  = job_desc->warn_flags;
8213 	job_ptr->warn_signal = job_desc->warn_signal;
8214 	job_ptr->warn_time   = job_desc->warn_time;
8215 
8216 	detail_ptr = job_ptr->details;
8217 	detail_ptr->argc = job_desc->argc;
8218 	detail_ptr->argv = job_desc->argv;
8219 	job_desc->argv   = (char **) NULL; /* nothing left to free */
8220 	job_desc->argc   = 0;		   /* nothing left to free */
8221 	detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq);
8222 	detail_ptr->cpu_bind_type = job_desc->cpu_bind_type;
8223 	detail_ptr->cpu_bind   = xstrdup(job_desc->cpu_bind);
8224 	detail_ptr->cpu_freq_gov = job_desc->cpu_freq_gov;
8225 	detail_ptr->cpu_freq_max = job_desc->cpu_freq_max;
8226 	detail_ptr->cpu_freq_min = job_desc->cpu_freq_min;
8227 	detail_ptr->extra      = job_desc->extra;
8228 	detail_ptr->nice       = job_desc->nice;
8229 	detail_ptr->open_mode  = job_desc->open_mode;
8230 	detail_ptr->min_cpus   = job_desc->min_cpus;
8231 	detail_ptr->orig_min_cpus   = job_desc->min_cpus;
8232 	detail_ptr->max_cpus   = job_desc->max_cpus;
8233 	detail_ptr->orig_max_cpus   = job_desc->max_cpus;
8234 	detail_ptr->min_nodes  = job_desc->min_nodes;
8235 	detail_ptr->max_nodes  = job_desc->max_nodes;
8236 	detail_ptr->x11        = job_desc->x11;
8237 	detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
8238 	detail_ptr->x11_target = xstrdup(job_desc->x11_target);
8239 	detail_ptr->x11_target_port = job_desc->x11_target_port;
8240 	if (job_desc->req_nodes) {
8241 		detail_ptr->req_nodes =
8242 			_copy_nodelist_no_dup(job_desc->req_nodes);
8243 		detail_ptr->req_node_bitmap = *req_bitmap;
8244 		*req_bitmap = NULL;	/* Reused nothing left to free */
8245 	}
8246 	if (job_desc->exc_nodes) {
8247 		detail_ptr->exc_nodes =
8248 			_copy_nodelist_no_dup(job_desc->exc_nodes);
8249 		detail_ptr->exc_node_bitmap = *exc_bitmap;
8250 		*exc_bitmap = NULL;	/* Reused nothing left to free */
8251 	}
8252 	detail_ptr->features = xstrdup(job_desc->features);
8253 	detail_ptr->cluster_features = xstrdup(job_desc->cluster_features);
8254 	if (job_desc->fed_siblings_viable) {
8255 		job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t));
8256 		job_ptr->fed_details->siblings_viable =
8257 			job_desc->fed_siblings_viable;
8258 		update_job_fed_details(job_ptr);
8259 	}
8260 	if (job_desc->shared == JOB_SHARED_NONE) {
8261 		detail_ptr->share_res  = 0;
8262 		detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
8263 	} else if (job_desc->shared == JOB_SHARED_OK) {
8264 		detail_ptr->share_res  = 1;
8265 		detail_ptr->whole_node = 0;
8266 	} else if (job_desc->shared == JOB_SHARED_USER) {
8267 		detail_ptr->share_res  = NO_VAL8;
8268 		detail_ptr->whole_node = WHOLE_NODE_USER;
8269 	} else if (job_desc->shared == JOB_SHARED_MCS) {
8270 		detail_ptr->share_res  = NO_VAL8;
8271 		detail_ptr->whole_node = WHOLE_NODE_MCS;
8272 	} else {
8273 		detail_ptr->share_res  = NO_VAL8;
8274 		detail_ptr->whole_node = 0;
8275 	}
8276 	if (job_desc->contiguous != NO_VAL16)
8277 		detail_ptr->contiguous = job_desc->contiguous;
8278 	if (slurmctld_conf.conf_flags & CTL_CONF_ASRU)
8279 		detail_ptr->core_spec = job_desc->core_spec;
8280 	else
8281 		detail_ptr->core_spec = NO_VAL16;
8282 	if (detail_ptr->core_spec != NO_VAL16)
8283 		detail_ptr->whole_node = 1;
8284 	if (job_desc->task_dist != NO_VAL)
8285 		detail_ptr->task_dist = job_desc->task_dist;
8286 	if (job_desc->cpus_per_task == NO_VAL16) {
8287 		detail_ptr->cpus_per_task = 1;
8288 		detail_ptr->orig_cpus_per_task = NO_VAL16;
8289 	} else {
8290 		detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1);
8291 		detail_ptr->orig_cpus_per_task = detail_ptr->cpus_per_task;
8292 	}
8293 	if (job_desc->pn_min_cpus != NO_VAL16)
8294 		detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
8295 	if (job_desc->overcommit != NO_VAL8)
8296 		detail_ptr->overcommit = job_desc->overcommit;
8297 	if (job_desc->num_tasks != NO_VAL)
8298 		detail_ptr->num_tasks = job_desc->num_tasks;
8299 	if (job_desc->ntasks_per_node != NO_VAL16) {
8300 		detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
8301 		if ((detail_ptr->overcommit == 0) &&
8302 		    (detail_ptr->num_tasks > 1)) {
8303 			detail_ptr->pn_min_cpus =
8304 				MAX(detail_ptr->pn_min_cpus,
8305 				    (detail_ptr->cpus_per_task *
8306 				     detail_ptr->ntasks_per_node));
8307 		}
8308 	}
8309 	detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus,
8310 				      detail_ptr->cpus_per_task);
8311 	detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus;
8312 	if (job_desc->reboot != NO_VAL16)
8313 		job_ptr->reboot = MIN(job_desc->reboot, 1);
8314 	else
8315 		job_ptr->reboot = 0;
8316 	if (job_desc->requeue != NO_VAL16)
8317 		detail_ptr->requeue = MIN(job_desc->requeue, 1);
8318 	else
8319 		detail_ptr->requeue = slurmctld_conf.job_requeue;
8320 	if (job_desc->pn_min_memory != NO_VAL64)
8321 		detail_ptr->pn_min_memory = job_desc->pn_min_memory;
8322 	detail_ptr->orig_pn_min_memory = detail_ptr->pn_min_memory;
8323 	if (job_desc->pn_min_tmp_disk != NO_VAL)
8324 		detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk;
8325 	detail_ptr->std_err = xstrdup(job_desc->std_err);
8326 	detail_ptr->std_in = xstrdup(job_desc->std_in);
8327 	detail_ptr->std_out = xstrdup(job_desc->std_out);
8328 	detail_ptr->work_dir = xstrdup(job_desc->work_dir);
8329 	if (job_desc->begin_time > time(NULL))
8330 		detail_ptr->begin_time = job_desc->begin_time;
8331 	job_ptr->select_jobinfo =
8332 		select_g_select_jobinfo_copy(job_desc->select_jobinfo);
8333 
8334 	select_g_select_jobinfo_set(job_ptr->select_jobinfo,
8335 				    SELECT_JOBDATA_NETWORK,
8336 				    job_ptr->network);
8337 
8338 	job_ptr->clusters = xstrdup(job_desc->clusters);
8339 
8340 	/*
8341 	 * The priority needs to be set after this since we don't have
8342 	 * an association rec yet
8343 	 */
8344 	detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
8345 
8346 	if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) &&
8347 	    (detail_ptr->num_tasks != 0)) {
8348 		if (detail_ptr->min_nodes == 0)
8349 			detail_ptr->min_nodes = 1;
8350 		detail_ptr->max_nodes =
8351 			MIN(node_record_count, detail_ptr->num_tasks);
8352 	}
8353 
8354 	return SLURM_SUCCESS;
8355 }
8356 
8357 /*
8358  * _copy_nodelist_no_dup - Take a node_list string and convert it to an
8359  *	expression without duplicate names. For example, we want to convert
8360  *	a user's request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]"
8361  * node_list IN - string describing a list of nodes
8362  * RET a compact node expression, must be xfreed by the user
8363  */
8364 static char *_copy_nodelist_no_dup(char *node_list)
8365 {
8366 	char *buf;
8367 
8368 	hostlist_t hl = hostlist_create(node_list);
8369 	if (hl == NULL)
8370 		return NULL;
8371 	hostlist_uniq(hl);
8372 	buf = hostlist_ranged_string_xmalloc(hl);
8373 	hostlist_destroy(hl);
8374 
8375 	return buf;
8376 }
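
/*
 * Example (illustrative sketch) of the hostlist calls used above:
 *
 *	hostlist_t hl = hostlist_create("lx1,lx2,lx1,lx3");
 *	hostlist_uniq(hl);
 *	char *ranged = hostlist_ranged_string_xmalloc(hl);	// "lx[1-3]"
 *	hostlist_destroy(hl);
 *	...
 *	xfree(ranged);
 */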
8377 
8378 /* Return the number of CPUs on the first node in the identified partition */
8379 static uint16_t _cpus_per_node_part(part_record_t *part_ptr)
8380 {
8381 	int node_inx = -1;
8382 	node_record_t *node_ptr;
8383 
8384 	if (part_ptr->node_bitmap)
8385 		node_inx = bit_ffs(part_ptr->node_bitmap);
8386 	if (node_inx >= 0) {
8387 		node_ptr = node_record_table_ptr + node_inx;
8388 		return node_ptr->config_ptr->cpus;
8389 	}
8390 	return 0;
8391 }
8392 
8393 /*
8394  * Test if this job exceeds any of MaxMemPer[CPU|Node] limits and potentially
8395  * adjust mem / cpu ratios.
8396  *
8397  * NOTE: This function is also called with a dummy job_desc_msg_t from
8398  * job_limits_check(), if there is any new check added here you may also have to
8399  * add that parameter to the job_desc_msg_t in that function.
8400  */
8401 static bool _valid_pn_min_mem(job_desc_msg_t *job_desc_msg,
8402 			      part_record_t *part_ptr)
8403 {
8404 	uint64_t job_mem_limit = job_desc_msg->pn_min_memory;
8405 	uint64_t sys_mem_limit;
8406 	uint16_t cpus_per_node;
8407 
8408 	if (part_ptr && part_ptr->max_mem_per_cpu)
8409 		sys_mem_limit = part_ptr->max_mem_per_cpu;
8410 	else
8411 		sys_mem_limit = slurmctld_conf.max_mem_per_cpu;
8412 
8413 	if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU))
8414 		return true;
8415 
8416 	if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) {
8417 		uint64_t mem_ratio;
8418 		job_mem_limit &= (~MEM_PER_CPU);
8419 		sys_mem_limit &= (~MEM_PER_CPU);
8420 		if (job_mem_limit <= sys_mem_limit)
8421 			return true;
8422 		mem_ratio = (job_mem_limit + sys_mem_limit - 1);
8423 		mem_ratio /= sys_mem_limit;
8424 		debug("increasing cpus_per_task and decreasing mem_per_cpu by "
8425 		      "factor of %"PRIu64" based upon mem_per_cpu limits",
8426 		      mem_ratio);
8427 		if (job_desc_msg->cpus_per_task == NO_VAL16)
8428 			job_desc_msg->cpus_per_task = mem_ratio;
8429 		else
8430 			job_desc_msg->cpus_per_task *= mem_ratio;
8431 		job_desc_msg->pn_min_memory = ((job_mem_limit + mem_ratio - 1) /
8432 					       mem_ratio) | MEM_PER_CPU;
8433 		if ((job_desc_msg->num_tasks != NO_VAL) &&
8434 		    (job_desc_msg->num_tasks != 0) &&
8435 		    (job_desc_msg->min_cpus  != NO_VAL)) {
8436 			job_desc_msg->min_cpus =
8437 				job_desc_msg->num_tasks *
8438 				job_desc_msg->cpus_per_task;
8439 
8440 			if ((job_desc_msg->max_cpus != NO_VAL) &&
8441 			    (job_desc_msg->max_cpus < job_desc_msg->min_cpus)) {
8442 				job_desc_msg->max_cpus = job_desc_msg->min_cpus;
8443 			}
8444 		}
8445 		return true;
8446 	}
8447 
8448 	if (((job_mem_limit & MEM_PER_CPU) == 0) &&
8449 	    ((sys_mem_limit & MEM_PER_CPU) == 0)) {
8450 		if (job_mem_limit <= sys_mem_limit)
8451 			return true;
8452 		debug2("JobId=%u mem=%"PRIu64"M > MaxMemPerNode=%"PRIu64"M in partition %s",
8453 		       job_desc_msg->job_id, job_mem_limit, sys_mem_limit,
8454 		       (part_ptr && part_ptr->name) ? part_ptr->name : "N/A");
8455 		return false;
8456 	}
8457 
8458 	/* Job and system have different memory limit forms (i.e. one is a
8459 	 * per-CPU and the other is per-node). Convert them both to per-node
8460 	 * values for comparison. */
8461 	if (part_ptr && (!part_ptr->max_share || !job_desc_msg->shared)) {
8462 		/* Whole node allocation */
8463 		cpus_per_node = _cpus_per_node_part(part_ptr);
8464 	} else {
8465 		if ((job_desc_msg->ntasks_per_node != NO_VAL16) &&
8466 		    (job_desc_msg->ntasks_per_node != 0))
8467 			cpus_per_node = job_desc_msg->ntasks_per_node;
8468 		else
8469 			cpus_per_node = 1;
8470 
8471 		if ((job_desc_msg->num_tasks != NO_VAL) &&
8472 		    (job_desc_msg->num_tasks != 0)     &&
8473 		    (job_desc_msg->max_nodes != NO_VAL) &&
8474 		    (job_desc_msg->max_nodes != 0)) {
8475 			cpus_per_node = MAX(cpus_per_node,
8476 				((job_desc_msg->num_tasks +
8477 				  job_desc_msg->max_nodes - 1) /
8478 				 job_desc_msg->max_nodes));
8479 		}
8480 
8481 		if ((job_desc_msg->cpus_per_task != NO_VAL16) &&
8482 		    (job_desc_msg->cpus_per_task != 0))
8483 			cpus_per_node *= job_desc_msg->cpus_per_task;
8484 
8485 		if ((job_desc_msg->pn_min_cpus != NO_VAL16) &&
8486 		    (job_desc_msg->pn_min_cpus > cpus_per_node))
8487 			cpus_per_node = job_desc_msg->pn_min_cpus;
8488 	}
8489 
8490 	if (job_mem_limit & MEM_PER_CPU) {
8491 		/* Job has per-CPU memory limit, system has per-node limit */
8492 		job_mem_limit &= (~MEM_PER_CPU);
8493 		job_mem_limit *= cpus_per_node;
8494 	} else {
8495 		/* Job has per-node memory limit, system has per-CPU limit */
8496 		uint32_t min_cpus;
8497 		sys_mem_limit &= (~MEM_PER_CPU);
8498 		min_cpus = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit;
8499 
8500 		if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
8501 		    (job_desc_msg->pn_min_cpus < min_cpus)) {
8502 			debug("Setting job's pn_min_cpus to %u due to memory "
8503 			      "limit", min_cpus);
8504 			job_desc_msg->pn_min_cpus = min_cpus;
8505 			cpus_per_node = MAX(cpus_per_node, min_cpus);
8506 		}
8507 		sys_mem_limit *= cpus_per_node;
8508 	}
8509 
8510 	if (job_mem_limit <= sys_mem_limit)
8511 		return true;
8512 
8513 	debug2("JobId=%u mem=%"PRIu64"M > MaxMemPer%s=%"PRIu64"M in partition:%s",
8514 	       job_desc_msg->job_id, job_mem_limit,
8515 	       (job_mem_limit & MEM_PER_CPU) ? "CPU" : "Node", sys_mem_limit,
8516 	       (part_ptr && part_ptr->name) ? part_ptr->name : "N/A");
8517 
8518 	return false;
8519 }
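
/*
 * Worked example (illustrative): with MaxMemPerCPU=4096 and a request of
 * --mem-per-cpu=10000 (both per-CPU forms):
 *
 *	mem_ratio     = ceil(10000 / 4096) = 3
 *	cpus_per_task = 1 * 3              = 3
 *	pn_min_memory = ceil(10000 / 3)    = 3334 MB per CPU
 *
 * Each task still gets at least the requested memory (3 * 3334 = 10002 MB)
 * while every CPU stays within the per-CPU limit, and min_cpus/max_cpus are
 * rescaled to num_tasks * cpus_per_task.
 */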
8520 
8521 /*
8522  * Validate TRES specification of the form:
8523  * "name=[type:]#[,[type:]#][;name=[type:]#]"
8524  * For example: "gpu:kepler:2,craynetwork=1"
8525  */
8526 extern bool valid_tres_cnt(char *tres)
8527 {
8528 	char *end_ptr = NULL, *colon, *save_ptr = NULL, *sep, *tok, *tmp;
8529 	bool rc = true;
8530 	long long int val;
8531 
8532 	if (!tres || (tres[0] == '\0'))
8533 		return true;
8534 
8535 	tmp = xstrdup(tres);
8536 	tok = strtok_r(tmp, ",", &save_ptr);
8537 	while (tok) {
8538 		bool valid_name = false;
8539 		sep = strchr(tok, ':');
8540 		if (sep) {
8541 			sep[0] = '\0';
8542 			sep++;
8543 		}
8544 		if (valid_tres_name(tok))
8545 			valid_name = true;
8546 		if (!sep) {	/* No model or count. Implicit count of 1 */
8547 			if (!valid_name) {
8548 				rc = false;
8549 				break;
8550 			}
8551 		} else if ((colon = strchr(sep, ':'))) {
8552 			/* Includes explicit "name:type:count" */
8553 			sep = colon + 1;	/* Points to count */
8554 			val = strtoll(sep, &end_ptr, 10);
8555 			/* First only check numeric component for validity */
8556 			if (((val < 0) ||
8557 			    (val == LLONG_MAX)) ||
8558 			    (!valid_name && (val != 0))) {
8559 				rc = false;
8560 				break;
8561 			}
8562 
8563 			/*
8564 			 * Now check that any count suffix is valid.
8565 			 */
8566 			if (suffix_mult(end_ptr) == NO_VAL64) {
8567 				rc = false;
8568 				break;
8569 			}
8570 		} else {
8571 			/*
8572 			 * Includes "name:type" or "name:count"
8573 			 * Since we don't know if there is a count,
8574 			 * we can not do more now.
8575 			 */
8576 		}
8577 		tok = strtok_r(NULL, ",", &save_ptr);
8578 	}
8579 	xfree(tmp);
8580 
8581 	return rc;
8582 }
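
/*
 * Examples (illustrative; they assume "gpu" is a configured GRES name and
 * that suffix_mult() accepts the usual k/M/G multipliers):
 *
 *	valid_tres_cnt("gpu")            // true  - implicit count of 1
 *	valid_tres_cnt("gpu:kepler:2")   // true  - name:type:count
 *	valid_tres_cnt("gpu:kepler:2G")  // true  - count suffix via suffix_mult()
 *	valid_tres_cnt("gpu:kepler:-1")  // false - negative count
 *	valid_tres_cnt("bogus")          // false - unknown TRES name
 *
 * A two-field token such as "gpu:2" is ambiguous between name:type and
 * name:count, so no count validation is possible for it here (the final
 * else branch above).
 */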
8583 
8584 /*
8585  * Validate that the named TRES is valid for scheduling parameters.
8586  * Returns FALSE if the name is invalid or the GRES count is zero.
8587  */
8588 extern bool valid_tres_name(char *name)
8589 {
8590 	if (!name || (name[0] == '\0'))
8591 		return false;
8592 	if (gres_get_system_cnt(name) != NO_VAL64)
8593 		return true;
8594 
8595 	return false;
8596 }
8597 
8598 /*
8599  * Increment time limit of one job record for node configuration.
8600  */
8601 static void _job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id)
8602 {
8603 	time_t delta_t, now = time(NULL);
8604 
8605 	delta_t = difftime(now, job_ptr->start_time);
8606 	if ((job_ptr->job_id != boot_job_id) && !IS_JOB_CONFIGURING(job_ptr))
8607 		job_ptr->tot_sus_time = delta_t;
8608 
8609 	if ((job_ptr->time_limit != INFINITE) &&
8610 	    ((job_ptr->job_id == boot_job_id) || (delta_t != 0))) {
8611 		if (delta_t && !IS_JOB_CONFIGURING(job_ptr)) {
8612 			verbose("Extending %pJ time limit by %u secs for configuration",
8613 				job_ptr, (uint32_t) delta_t);
8614 		}
8615 		job_ptr->end_time = now + (job_ptr->time_limit * 60);
8616 		job_ptr->end_time_exp = job_ptr->end_time;
8617 	}
8618 }
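
/*
 * Worked example (illustrative): a job with time_limit = 60 (minutes) whose
 * nodes took roughly 300 seconds to boot and configure gets end_time reset
 * to now + 60 * 60 seconds once configuration completes, so the time already
 * spent since start_time no longer counts against the limit; the verbose()
 * message above reports that extension.
 */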
8619 
8620 /*
8621  * Increment time limit for all components of a hetjob for node configuration.
8622  * job_ptr IN - pointer to job record for which configuration is complete
8623  * boot_job_id - job ID of record with newly powered up node or 0
8624  */
8625 static void _het_job_time_limit_incr(job_record_t *job_ptr,
8626 				     uint32_t boot_job_id)
8627 {
8628 	job_record_t *het_job_leader, *het_job;
8629 	ListIterator iter;
8630 
8631 	if (!job_ptr->het_job_id) {
8632 		_job_time_limit_incr(job_ptr, boot_job_id);
8633 		return;
8634 	}
8635 
8636 	het_job_leader = find_job_record(job_ptr->het_job_id);
8637 	if (!het_job_leader) {
8638 		error("%s: Hetjob leader %pJ not found",
8639 		      __func__, job_ptr);
8640 		_job_time_limit_incr(job_ptr, boot_job_id);
8641 		return;
8642 	}
8643 	if (!het_job_leader->het_job_list) {
8644 		error("%s: Hetjob leader %pJ job list is NULL",
8645 		      __func__, job_ptr);
8646 		_job_time_limit_incr(job_ptr, boot_job_id);
8647 		return;
8648 	}
8649 
8650 	iter = list_iterator_create(het_job_leader->het_job_list);
8651 	while ((het_job = list_next(iter))) {
8652 		_job_time_limit_incr(het_job, boot_job_id);
8653 	}
8654 	list_iterator_destroy(iter);
8655 }
8656 
8657 /* Clear job's CONFIGURING flag and advance end time as needed */
8658 extern void job_config_fini(job_record_t *job_ptr)
8659 {
8660 	time_t now = time(NULL);
8661 
8662 	last_job_update = now;
8663 	job_ptr->job_state &= ~JOB_CONFIGURING;
8664 	if (IS_JOB_POWER_UP_NODE(job_ptr)) {
8665 		info("Resetting %pJ start time for node power up", job_ptr);
8666 		job_ptr->job_state &= ~JOB_POWER_UP_NODE;
8667 		job_ptr->start_time = now;
8668 		_het_job_time_limit_incr(job_ptr, job_ptr->job_id);
8669 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
8670 	} else {
8671 		_het_job_time_limit_incr(job_ptr, 0);
8672 	}
8673 
8674 	/*
8675 	 * Request asynchronous launch of a prolog for a non-batch job.
8676 	 * PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC.
8677 	 */
8678 	if (slurmctld_conf.prolog_flags & PROLOG_FLAG_ALLOC)
8679 		launch_prolog(job_ptr);
8680 }
8681 
8682 /*
8683  * Determine if the nodes are ready to run a job
8684  * RET true if ready
8685  */
8686 extern bool test_job_nodes_ready(job_record_t *job_ptr)
8687 {
8688 	if (IS_JOB_PENDING(job_ptr))
8689 		return false;
8690 	if (!job_ptr->node_bitmap)	/* Revoked allocation */
8691 		return true;
8692 	if (bit_overlap_any(job_ptr->node_bitmap, power_node_bitmap))
8693 		return false;
8694 
8695 	if (!job_ptr->batch_flag ||
8696 	    job_ptr->batch_features ||
8697 	    job_ptr->wait_all_nodes || job_ptr->burst_buffer) {
8698 		/* Make sure all nodes ready to start job */
8699 		if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0)
8700 			return false;
8701 	} else if (job_ptr->batch_flag) {
8702 
8703 #ifdef HAVE_FRONT_END
8704 		/* Make sure frontend node is ready to start batch job */
8705 		front_end_record_t *front_end_ptr =
8706 			find_front_end_record(job_ptr->batch_host);
8707 		if (!front_end_ptr ||
8708 		    IS_NODE_POWER_SAVE(front_end_ptr) ||
8709 		    IS_NODE_POWER_UP(front_end_ptr)) {
8710 			return false;
8711 		}
8712 #else
8713 		/* Make sure first node is ready to start batch job */
8714 		node_record_t *node_ptr =
8715 			find_node_record(job_ptr->batch_host);
8716 		if (!node_ptr ||
8717 		    IS_NODE_POWER_SAVE(node_ptr) ||
8718 		    IS_NODE_POWER_UP(node_ptr)) {
8719 			return false;
8720 		}
8721 #endif
8722 	}
8723 
8724 	return true;
8725 }
8726 
8727 /*
8728  * For non-hetjob, return true if this job is configuring.
8729  * For hetjob, return true if any component of the job is configuring.
8730  */
8731 static bool _het_job_configuring_test(job_record_t *job_ptr)
8732 {
8733 	job_record_t *het_job_leader, *het_job;
8734 	ListIterator iter;
8735 	bool result = false;
8736 
8737 	if (IS_JOB_CONFIGURING(job_ptr))
8738 		return true;
8739 	if (!job_ptr->het_job_id)
8740 		return false;
8741 
8742 	het_job_leader = find_job_record(job_ptr->het_job_id);
8743 	if (!het_job_leader) {
8744 		error("%s: Hetjob leader %pJ not found", __func__, job_ptr);
8745 		return false;
8746 	}
8747 	if (!het_job_leader->het_job_list) {
8748 		error("%s: Hetjob leader %pJ job list is NULL",
8749 		      __func__, job_ptr);
8750 		return false;
8751 	}
8752 
8753 	iter = list_iterator_create(het_job_leader->het_job_list);
8754 	while ((het_job = list_next(iter))) {
8755 		if (IS_JOB_CONFIGURING(het_job)) {
8756 			result = true;
8757 			break;
8758 		}
8759 	}
8760 	list_iterator_destroy(iter);
8761 
8762 	return result;
8763 }
8764 
8765 /*
8766  * job_time_limit - terminate jobs which have exceeded their time limit
8767  * global: job_list - pointer global job list
8768  *	last_job_update - time of last job table update
8769  */
8770 void job_time_limit(void)
8771 {
8772 	ListIterator job_iterator;
8773 	job_record_t *job_ptr;
8774 	time_t now = time(NULL);
8775 	time_t old = now - ((slurmctld_conf.inactive_limit * 4 / 3) +
8776 			    slurmctld_conf.msg_timeout + 1);
8777 	time_t over_run;
8778 	uint16_t over_time_limit;
8779 	uint8_t prolog;
8780 	int job_test_count = 0;
8781 	uint32_t resv_over_run = slurmctld_conf.resv_over_run;
8782 
8783 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
8784 
8785 	if (resv_over_run == INFINITE16)
8786 		resv_over_run = YEAR_SECONDS;
8787 	else
8788 		resv_over_run *= 60;
8789 
8790 	/*
8791 	 * Locks are the same as in _slurmctld_background() (the only place
8792 	 * this is currently called).
8793 	 */
8794 	slurmctld_lock_t job_write_lock = {
8795 		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
8796 	DEF_TIMERS;
8797 
8798 	job_iterator = list_iterator_create(job_list);
8799 	START_TIMER;
8800 	while ((job_ptr = list_next(job_iterator))) {
8801 		xassert (job_ptr->magic == JOB_MAGIC);
8802 		job_test_count++;
8803 
8804 		if (job_ptr->details)
8805 			prolog = job_ptr->details->prolog_running;
8806 		else
8807 			prolog = 0;
8808 		if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) &&
8809 		    test_job_nodes_ready(job_ptr)) {
8810 			info("%s: Configuration for %pJ complete",
8811 			     __func__, job_ptr);
8812 			job_config_fini(job_ptr);
8813 			if (job_ptr->bit_flags & NODE_REBOOT) {
8814 				job_ptr->bit_flags &= (~NODE_REBOOT);
8815 				if (job_ptr->batch_flag)
8816 					launch_job(job_ptr);
8817 			}
8818 		}
8819 
8820 		/*
8821 		 * Features have been changed on some node; make the job eligible
8822 		 * to run and test to see if it can run now
8823 		 */
8824 		if (node_features_updated &&
8825 		    (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) &&
8826 		    IS_JOB_PENDING(job_ptr) && (job_ptr->priority == 0)) {
8827 			job_ptr->state_reason = WAIT_NO_REASON;
8828 			set_job_prio(job_ptr);
8829 			last_job_update = now;
8830 		}
8831 
8832 		/* Don't enforce time limits for configuring hetjobs */
8833 		if (_het_job_configuring_test(job_ptr))
8834 			continue;
8835 
8836 		/*
8837 		 * Only running jobs can be killed due to timeout. Do not kill
8838 		 * suspended jobs due to timeout.
8839 		 */
8840 		if (!IS_JOB_RUNNING(job_ptr))
8841 			continue;
8842 
8843 		/*
8844 		 * everything above here is considered "quick", and skips the
8845 		 * timeout at the bottom of the loop by using a continue.
8846 		 * everything below is considered "slow", and needs to jump to
8847 		 * time_check before the next job is tested
8848 		 */
8849 		if (job_ptr->preempt_time) {
8850 			(void)slurm_job_preempt(job_ptr, NULL,
8851 						slurm_job_preempt_mode(job_ptr),
8852 						false);
8853 			goto time_check;
8854 		}
8855 
8856 		if (slurmctld_conf.inactive_limit &&
8857 		    (job_ptr->batch_flag == 0)    &&
8858 		    (job_ptr->time_last_active <= old) &&
8859 		    (job_ptr->other_port) &&
8860 		    (job_ptr->part_ptr) &&
8861 		    (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) {
8862 			/* job inactive, kill it */
8863 			info("%s: inactivity time limit reached for %pJ",
8864 			     __func__, job_ptr);
8865 			_job_timed_out(job_ptr, false);
8866 			job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
8867 			xfree(job_ptr->state_desc);
8868 			goto time_check;
8869 		}
8870 		if (job_ptr->time_limit != INFINITE) {
8871 			send_job_warn_signal(job_ptr, false);
8872 			if ((job_ptr->mail_type & MAIL_JOB_TIME100) &&
8873 			    (now >= job_ptr->end_time)) {
8874 				job_ptr->mail_type &= (~MAIL_JOB_TIME100);
8875 				mail_job_info(job_ptr, MAIL_JOB_TIME100);
8876 			}
8877 			if ((job_ptr->mail_type & MAIL_JOB_TIME90) &&
8878 			    (now + (job_ptr->time_limit * 60 * 0.1) >=
8879 			     job_ptr->end_time)) {
8880 				job_ptr->mail_type &= (~MAIL_JOB_TIME90);
8881 				mail_job_info(job_ptr, MAIL_JOB_TIME90);
8882 			}
8883 			if ((job_ptr->mail_type & MAIL_JOB_TIME80) &&
8884 			    (now + (job_ptr->time_limit * 60 * 0.2) >=
8885 			     job_ptr->end_time)) {
8886 				job_ptr->mail_type &= (~MAIL_JOB_TIME80);
8887 				mail_job_info(job_ptr, MAIL_JOB_TIME80);
8888 			}
8889 			if ((job_ptr->mail_type & MAIL_JOB_TIME50) &&
8890 			    (now + (job_ptr->time_limit * 60 * 0.5) >=
8891 			     job_ptr->end_time)) {
8892 				job_ptr->mail_type &= (~MAIL_JOB_TIME50);
8893 				mail_job_info(job_ptr, MAIL_JOB_TIME50);
8894 			}
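
			/*
			 * Worked example (illustrative): with time_limit =
			 * 100 minutes, MAIL_JOB_TIME50 above fires once less
			 * than 50 minutes remain before end_time,
			 * MAIL_JOB_TIME80 at 20 minutes remaining,
			 * MAIL_JOB_TIME90 at 10 minutes remaining and
			 * MAIL_JOB_TIME100 once end_time itself has passed.
			 * Each flag is cleared after its mail is sent, so
			 * every notice goes out at most once.
			 */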
8895 
8896 			if (job_ptr->part_ptr &&
8897 			    (job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
8898 				over_time_limit =
8899 					job_ptr->part_ptr->over_time_limit;
8900 			} else {
8901 				over_time_limit =
8902 					slurmctld_conf.over_time_limit;
8903 			}
8904 			if (over_time_limit == INFINITE16)
8905 				over_run = now - YEAR_SECONDS;
8906 			else
8907 				over_run = now - (over_time_limit  * 60);
8908 			if (job_ptr->end_time <= over_run) {
8909 				last_job_update = now;
8910 				info("Time limit exhausted for %pJ", job_ptr);
8911 				_job_timed_out(job_ptr, false);
8912 				job_ptr->state_reason = FAIL_TIMEOUT;
8913 				xfree(job_ptr->state_desc);
8914 				goto time_check;
8915 			}
8916 		}
8917 
8918 		if (job_ptr->resv_ptr &&
8919 		    !(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) &&
8920 		    (job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)){
8921 			last_job_update = now;
8922 			info("Reservation ended for %pJ", job_ptr);
8923 			_job_timed_out(job_ptr, false);
8924 			job_ptr->state_reason = FAIL_TIMEOUT;
8925 			xfree(job_ptr->state_desc);
8926 			goto time_check;
8927 		}
8928 
8929 		/*
8930 		 * check if any individual job steps have exceeded
8931 		 * their time limit
8932 		 */
8933 		if (job_ptr->step_list &&
8934 		    (list_count(job_ptr->step_list) > 0))
8935 			check_job_step_time_limit(job_ptr, now);
8936 
8937 		acct_policy_job_time_out(job_ptr);
8938 
8939 		if (job_ptr->state_reason == FAIL_TIMEOUT) {
8940 			last_job_update = now;
8941 			_job_timed_out(job_ptr, false);
8942 			xfree(job_ptr->state_desc);
8943 			goto time_check;
8944 		}
8945 
8946 		/* Give srun command warning message about pending timeout */
8947 		if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
8948 			srun_timeout (job_ptr);
8949 
8950 		/*
8951 		 * _job_timed_out() and other calls can take a long time on
8952 		 * some platforms. This loop is holding the job_write lock;
8953 		 * if a lot of jobs need to be timed out within the same cycle
8954 		 * this stalls other threads from running and causes
8955 		 * communication issues within the cluster.
8956 		 *
8957 		 * This test happens last, as job_ptr may be pointing to a job
8958 		 * that would be deleted by a separate thread when the job_write
8959 		 * lock is released. However, list_next itself is thread safe,
8960 		 * and can be used again once the locks are reacquired.
8961 		 * list_peek_next is used in the unlikely event the timer has
8962 		 * expired just as the end of the job_list is reached.
8963 		 */
8964 time_check:
8965 		/* Use a hard-coded 3 second timeout, with a 1 second sleep. */
8966 		if (slurm_delta_tv(&tv1) >= 3000000 && list_peek_next(job_iterator) ) {
8967 			END_TIMER;
8968 			debug("%s: yielding locks after testing"
8969 			      " %d jobs, %s",
8970 			      __func__, job_test_count, TIME_STR);
8971 			unlock_slurmctld(job_write_lock);
8972 			usleep(1000000);
8973 			lock_slurmctld(job_write_lock);
8974 			START_TIMER;
8975 			job_test_count = 0;
8976 		}
8977 	}
8978 	list_iterator_destroy(job_iterator);
8979 	node_features_updated = false;
8980 }
8981 
8982 extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
8983 {
8984 	uint32_t cpu_cnt = 0, node_cnt = 0;
8985 	uint64_t mem_cnt = 0;
8986 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
8987 
8988 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
8989 
8990 	xfree(job_ptr->tres_req_str);
8991 	xfree(job_ptr->tres_fmt_req_str);
8992 	xfree(job_ptr->tres_req_cnt);
8993 
8994 	if (!assoc_mgr_locked)
8995 		assoc_mgr_lock(&locks);
8996 
8997 	job_ptr->tres_req_cnt = xcalloc(g_tres_count, sizeof(uint64_t));
8998 
8999 	if (job_ptr->details) {
9000 		node_cnt = job_ptr->details->min_nodes;
9001 		cpu_cnt = job_ptr->details->min_cpus;
9002 		if (job_ptr->details->pn_min_memory)
9003 			mem_cnt = job_ptr->details->pn_min_memory;
9004 	}
9005 
9006 	/* if this is set just override */
9007 	if (job_ptr->total_cpus)
9008 		cpu_cnt = job_ptr->total_cpus;
9009 
9010 	if (job_ptr->node_cnt)
9011 		node_cnt = job_ptr->node_cnt;
9012 
9013 	job_ptr->tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)node_cnt;
9014 	job_ptr->tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)cpu_cnt;
9015 	job_ptr->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
9016 							job_ptr->job_resrcs,
9017 							mem_cnt, cpu_cnt,
9018 							node_cnt);
9019 
9020 	license_set_job_tres_cnt(job_ptr->license_list,
9021 				 job_ptr->tres_req_cnt,
9022 				 true);
9023 
9024 	/* FIXME: this assumes that all nodes have equal TRES */
9025 	gres_set_job_tres_cnt(job_ptr->gres_list,
9026 			      node_cnt,
9027 			      job_ptr->tres_req_cnt,
9028 			      true);
9029 
9030 	bb_g_job_set_tres_cnt(job_ptr,
9031 			      job_ptr->tres_req_cnt,
9032 			      true);
9033 
9034 	/*
9035 	 * Do this last as it calculates off of everything else.
9036 	 * Don't use calc_job_billable_tres() as it relies on allocated TRES.
9037 	 * If the partition was destroyed the part_ptr will be NULL, as this
9038 	 * could be run on already-finished jobs still in the assoc mgr
9039 	 * cache.
9040 	 */
9041 	if (job_ptr->part_ptr)
9042 		job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
9043 			assoc_mgr_tres_weighted(
9044 				job_ptr->tres_req_cnt,
9045 				job_ptr->part_ptr->billing_weights,
9046 				slurmctld_conf.priority_flags, true);
9047 
9048 	/* now that the array is filled lets make the string from it */
9049 	set_job_tres_req_str(job_ptr, true);
9050 
9051 	if (!assoc_mgr_locked)
9052 		assoc_mgr_unlock(&locks);
9053 }
9054 
9055 extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
9056 {
9057 	uint32_t alloc_nodes = 0;
9058 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
9059 
9060 	xfree(job_ptr->tres_alloc_str);
9061 	xfree(job_ptr->tres_alloc_cnt);
9062 	xfree(job_ptr->tres_fmt_alloc_str);
9063 
9064 	/*
9065 	 * We only need to do this on non-pending jobs.
9066 	 * Requeued jobs are marked as PENDING|COMPLETING until the epilog is
9067 	 * finished so we still need the alloc tres until then.
9068 	 */
9069 	if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
9070 		return;
9071 
9072 	if (!assoc_mgr_locked)
9073 		assoc_mgr_lock(&locks);
9074 
9075 	job_ptr->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));
9076 
9077 	job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU] = (uint64_t)job_ptr->total_cpus;
9078 
9079 	alloc_nodes = job_ptr->node_cnt;
9080 	job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE] = (uint64_t)alloc_nodes;
9081 	job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] =
9082 		job_get_tres_mem(
9083 			job_ptr->job_resrcs,
9084 			job_ptr->details->pn_min_memory,
9085 			job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU],
9086 			job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE]);
9087 
9088 	job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = NO_VAL64;
9089 
9090 	license_set_job_tres_cnt(job_ptr->license_list,
9091 				 job_ptr->tres_alloc_cnt,
9092 				 true);
9093 
9094 	gres_set_job_tres_cnt(job_ptr->gres_list,
9095 			      alloc_nodes,
9096 			      job_ptr->tres_alloc_cnt,
9097 			      true);
9098 
9099 	bb_g_job_set_tres_cnt(job_ptr,
9100 			      job_ptr->tres_alloc_cnt,
9101 			      true);
9102 
9103 	/* Do this last as it is calculated from everything else. */
9104 	job_ptr->tres_alloc_cnt[TRES_ARRAY_BILLING] =
9105 		calc_job_billable_tres(job_ptr, job_ptr->start_time, true);
9106 
9107 	/* now that the array is filled, let's build the string from it */
9108 	set_job_tres_alloc_str(job_ptr, true);
9109 
9110 	if (!assoc_mgr_locked)
9111 		assoc_mgr_unlock(&locks);
9112 
9113 	return;
9114 }
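
/*
 * Illustrative sketch (not from the original source): once
 * job_set_alloc_tres() has filled the array, individual allocation
 * counts can be read by TRES index, e.g.
 *
 *	uint64_t alloc_cpus = job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU];
 *	uint64_t alloc_mem  = job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM];
 *	debug2("%pJ allocated %"PRIu64" CPUs, %"PRIu64" MB",
 *	       job_ptr, alloc_cpus, alloc_mem);
 *
 * Note the array stays NULL for jobs that are pending and not completing,
 * since the function returns early for them.
 */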
9115 
9116 /*
9117  * job_update_tres_cnt - when a job is completing, remove its allocated
9118  *                      TRES from the count.
9119  * IN/OUT job_ptr - job structure to be updated
9120  * IN node_inx    - index of the node that is finished with the job.
9121  * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow
9122  */
9123 extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx)
9124 {
9125 	int cpu_cnt, offset = -1, rc = SLURM_SUCCESS;
9126 
9127 	xassert(job_ptr);
9128 
9129 	if (job_ptr->details->whole_node == 1) {
9130 		/*
9131 		 * Since we are allocating whole nodes, don't rely on
9132 		 * job_resrcs since its count could be lower because the
9133 		 * node could have used only 1 thread per core.
9134 		 */
9135 		node_record_t *node_ptr =
9136 			node_record_table_ptr + node_inx;
9137 		cpu_cnt = node_ptr->config_ptr->cpus;
9138 	} else {
9139 		if ((offset = job_resources_node_inx_to_cpu_inx(
9140 				job_ptr->job_resrcs, node_inx)) < 0) {
9141 			error("%s: problem getting offset of %pJ",
9142 			      __func__, job_ptr);
9143 			job_ptr->cpu_cnt = 0;
9144 			return SLURM_ERROR;
9145 		}
9146 
9147 		cpu_cnt = job_ptr->job_resrcs->cpus[offset];
9148 	}
9149 	if (cpu_cnt > job_ptr->cpu_cnt) {
9150 		error("%s: cpu_cnt underflow (%d > %u) on %pJ", __func__,
9151 		      cpu_cnt, job_ptr->cpu_cnt, job_ptr);
9152 		job_ptr->cpu_cnt = 0;
9153 		rc = SLURM_ERROR;
9154 	} else
9155 		job_ptr->cpu_cnt -= cpu_cnt;
9156 
9157 	if (IS_JOB_RESIZING(job_ptr)) {
9158 		if (cpu_cnt > job_ptr->total_cpus) {
9159 			error("%s: total_cpus underflow on %pJ",
9160 			       __func__, job_ptr);
9161 			job_ptr->total_cpus = 0;
9162 			rc = SLURM_ERROR;
9163 		} else
9164 			job_ptr->total_cpus -= cpu_cnt;
9165 
9166 		job_set_alloc_tres(job_ptr, false);
9167 	}
9168 	return rc;
9169 }
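
/*
 * Illustrative sketch (an assumption, not from the original source): a
 * completion path might call job_update_tres_cnt() once per node that is
 * finished with the job, e.g. while walking the completing bitmap:
 *
 *	for (int i = 0; i < node_record_count; i++) {
 *		if (!bit_test(job_ptr->node_bitmap_cg, i))
 *			continue;
 *		if (job_update_tres_cnt(job_ptr, i) != SLURM_SUCCESS)
 *			error("TRES underflow for %pJ on node index %d",
 *			      job_ptr, i);
 *	}
 */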
9170 
9171 /* Terminate a job that has exhausted its time limit */
9172 static void _job_timed_out(job_record_t *job_ptr, bool preempted)
9173 {
9174 	xassert(job_ptr);
9175 
9176 	srun_timeout(job_ptr);
9177 	if (job_ptr->details) {
9178 		time_t now      = time(NULL);
9179 		job_ptr->end_time           = now;
9180 		job_ptr->time_last_active   = now;
9181 		if (!job_ptr->preempt_time)
9182 			job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
9183 		build_cg_bitmap(job_ptr);
9184 		job_completion_logger(job_ptr, false);
9185 		deallocate_nodes(job_ptr, !preempted, false, preempted);
9186 	} else
9187 		job_signal(job_ptr, SIGKILL, 0, 0, false);
9188 	return;
9189 }
9190 
9191 /* _validate_job_desc - validate that a job descriptor for job submit or
9192  *	allocate has valid data, set values to defaults as required
9193  * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
9194  * IN allocate - if clear, the job is to be queued; if set, allocate it for the user now
9195  * IN submit_uid - uid from which the request originated
9196  */
9197 static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
9198 			      uid_t submit_uid, part_record_t *part_ptr,
9199 			      List part_list)
9200 {
9201 	if ((job_desc_msg->min_cpus  == NO_VAL) &&
9202 	    (job_desc_msg->min_nodes == NO_VAL) &&
9203 	    (job_desc_msg->req_nodes == NULL)) {
9204 		info("%s: job specified no min_cpus, min_nodes or req_nodes",
9205 		     __func__);
9206 		return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
9207 	}
9208 	if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
9209 	    (job_desc_msg->script == NULL)) {
9210 		info("%s: job failed to specify Script", __func__);
9211 		return ESLURM_JOB_SCRIPT_MISSING;
9212 	}
9213 	if (job_desc_msg->script && job_desc_msg->x11) {
9214 		info("%s: batch job cannot use X11 forwarding", __func__);
9215 		return ESLURM_X11_NOT_AVAIL;
9216 	}
9217 	if (job_desc_msg->user_id == NO_VAL) {
9218 		info("%s: job failed to specify User", __func__);
9219 		return ESLURM_USER_ID_MISSING;
9220 	}
9221 	if ( job_desc_msg->group_id == NO_VAL ) {
9222 		debug("%s: job failed to specify group", __func__);
9223 		return ESLURM_GROUP_ID_MISSING;
9224 	}
9225 	if (job_desc_msg->contiguous == NO_VAL16)
9226 		job_desc_msg->contiguous = 0;
9227 
9228 	if (job_desc_msg->task_dist == NO_VAL) {
9229 		/* not typically set by salloc or sbatch */
9230 		job_desc_msg->task_dist = SLURM_DIST_CYCLIC;
9231 	}
9232 	if (job_desc_msg->plane_size == NO_VAL16)
9233 		job_desc_msg->plane_size = 0;
9234 
9235 	if (job_desc_msg->kill_on_node_fail == NO_VAL16)
9236 		job_desc_msg->kill_on_node_fail = 1;
9237 
9238 	if (job_desc_msg->job_id != NO_VAL) {
9239 		job_record_t *dup_job_ptr;
9240 		if (!fed_mgr_fed_rec &&
9241 		    (submit_uid != 0) &&
9242 		    (submit_uid != slurmctld_conf.slurm_user_id)) {
9243 			info("attempt by uid %u to set JobId=%u",
9244 			     submit_uid, job_desc_msg->job_id);
9245 			return ESLURM_INVALID_JOB_ID;
9246 		}
9247 		if (job_desc_msg->job_id == 0) {
9248 			info("attempt by uid %u to set JobId=0",
9249 			     submit_uid);
9250 			return ESLURM_INVALID_JOB_ID;
9251 		}
9252 		dup_job_ptr = find_job_record(job_desc_msg->job_id);
9253 		if (dup_job_ptr) {
9254 			info("attempt to re-use active %pJ", dup_job_ptr);
9255 			return ESLURM_DUPLICATE_JOB_ID;
9256 		}
9257 	}
9258 
9259 	if (job_desc_msg->nice == NO_VAL)
9260 		job_desc_msg->nice = NICE_OFFSET;
9261 
9262 	if (job_desc_msg->pn_min_memory == NO_VAL64) {
9263 		/* Default memory limit is DefMemPerCPU (if set) or no limit */
9264 		if (part_ptr && part_ptr->def_mem_per_cpu) {
9265 			job_desc_msg->pn_min_memory =
9266 					part_ptr->def_mem_per_cpu;
9267 		} else {
9268 			job_desc_msg->pn_min_memory =
9269 					slurmctld_conf.def_mem_per_cpu;
9270 		}
9271 	} else if (!_validate_min_mem_partition(job_desc_msg, part_ptr,
9272 						part_list)) {
9273 		return ESLURM_INVALID_TASK_MEMORY;
9274 	} else {
9275 		/* Memory limit explicitly set by user */
9276 		job_desc_msg->bitflags |= JOB_MEM_SET;
9277 	}
9278 
9279 	if (job_desc_msg->pn_min_memory == MEM_PER_CPU) {
9280 		/* Map --mem-per-cpu=0 to --mem=0 for simpler logic */
9281 		job_desc_msg->pn_min_memory = 0;
9282 	}
9283 
9284 	/* Validate a job's accounting frequency, if specified */
9285 	if (acct_gather_check_acct_freq_task(
9286 		    job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq))
9287 		return ESLURMD_INVALID_ACCT_FREQ;
9288 
9289 	if (job_desc_msg->min_nodes == NO_VAL)
9290 		job_desc_msg->min_nodes = 1;	/* default node count of 1 */
9291 	if (job_desc_msg->min_cpus == NO_VAL)
9292 		job_desc_msg->min_cpus = job_desc_msg->min_nodes;
9293 
9294 	if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
9295 	    (job_desc_msg->pn_min_cpus == 0))
9296 		job_desc_msg->pn_min_cpus = 1;   /* default 1 cpu per node */
9297 	if (job_desc_msg->pn_min_tmp_disk == NO_VAL)
9298 		job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */
9299 
9300 	return SLURM_SUCCESS;
9301 }
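
/*
 * Worked example (not from the original source): fields left at their
 * NO_VAL sentinels by the client are filled with defaults above.  A
 * descriptor initialized by slurm_init_job_desc_msg() with only a script
 * and a node count set would end up with:
 *
 *	min_cpus        -> min_nodes (one CPU per requested node)
 *	nice            -> NICE_OFFSET
 *	pn_min_cpus     -> 1
 *	pn_min_tmp_disk -> 0
 *	pn_min_memory   -> partition DefMemPerCPU if set, otherwise the
 *	                   cluster-wide DefMemPerCPU
 */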
9302 
9303 /*
9304  * Traverse the list of partitions and invoke the
9305  * function validating the job memory specification.
9306  */
9307 static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
9308 					part_record_t *part_ptr, List part_list)
9309 {
9310 	ListIterator iter;
9311 	part_record_t *part;
9312 	uint64_t tmp_pn_min_memory;
9313 	uint16_t tmp_cpus_per_task;
9314 	uint32_t tmp_min_cpus;
9315 	uint32_t tmp_max_cpus;
9316 	uint32_t tmp_pn_min_cpus;
9317 	bool cc = false;
9318 
9319 	/* no reason to check them here as we aren't enforcing them */
9320 	if (!slurmctld_conf.enforce_part_limits)
9321 		return true;
9322 
9323 	tmp_pn_min_memory = job_desc_msg->pn_min_memory;
9324 	tmp_cpus_per_task = job_desc_msg->cpus_per_task;
9325 	tmp_min_cpus = job_desc_msg->min_cpus;
9326 	tmp_max_cpus = job_desc_msg->max_cpus;
9327 	tmp_pn_min_cpus = job_desc_msg->pn_min_cpus;
9328 
9329 	if (part_list == NULL) {
9330 		cc = _valid_pn_min_mem(job_desc_msg, part_ptr);
9331 	} else {
9332 		iter = list_iterator_create(part_list);
9333 		while ((part = list_next(iter))) {
9334 			cc = _valid_pn_min_mem(job_desc_msg, part);
9335 
9336 			/* for ALL we have to test them all */
9337 			if (slurmctld_conf.enforce_part_limits ==
9338 			    PARTITION_ENFORCE_ALL) {
9339 				if (!cc)
9340 					break;
9341 			} else if (cc) /* break, we found one! */
9342 				break;
9343 			else if (slurmctld_conf.enforce_part_limits ==
9344 				 PARTITION_ENFORCE_ANY) {
9345 				debug("%s: Job memory request of %"PRIu64" MB is invalid"
9346 				      " for partition %s",
9347 				      __func__, job_desc_msg->pn_min_memory,
9348 				      part->name);
9349 			}
9350 
9351 			job_desc_msg->pn_min_memory = tmp_pn_min_memory;
9352 			job_desc_msg->cpus_per_task = tmp_cpus_per_task;
9353 			job_desc_msg->min_cpus = tmp_min_cpus;
9354 			job_desc_msg->max_cpus = tmp_max_cpus;
9355 			job_desc_msg->pn_min_cpus = tmp_pn_min_cpus;
9356 		}
9357 		list_iterator_destroy(iter);
9358 	}
9359 
9360 	/*
9361 	 * Restore the original values; if necessary, they will be
9362 	 * modified again in job_limits_check()
9363 	 */
9364 	job_desc_msg->pn_min_memory = tmp_pn_min_memory;
9365 	job_desc_msg->cpus_per_task = tmp_cpus_per_task;
9366 	job_desc_msg->min_cpus = tmp_min_cpus;
9367 	job_desc_msg->max_cpus = tmp_max_cpus;
9368 	job_desc_msg->pn_min_cpus = tmp_pn_min_cpus;
9369 
9370 	return cc;
9371 }
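
/*
 * Illustrative summary (not from the original source) of the return value
 * under EnforcePartLimits:
 *
 *	NO  - always true (enforcement disabled, checked at the top)
 *	ANY - true if at least one partition in part_list (or the single
 *	      part_ptr when no list is given) accepts the memory request
 *	ALL - true only if every partition accepts it; the loop breaks on
 *	      the first failure
 */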
9372 
9373 extern void free_null_array_recs(job_record_t *job_ptr)
9374 {
9375 	if (!job_ptr || !job_ptr->array_recs)
9376 		return;
9377 
9378 	FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
9379 	xfree(job_ptr->array_recs->task_id_str);
9380 	xfree(job_ptr->array_recs);
9381 }
9382 
9383 static void _delete_job_common(job_record_t *job_ptr)
9384 {
9385 	/* Remove record from fed_job_list */
9386 	fed_mgr_remove_fed_job_info(job_ptr->job_id);
9387 
9388 	/* Remove the record from job hash table */
9389 	_remove_job_hash(job_ptr, JOB_HASH_JOB);
9390 
9391 	/* Remove the record from job array hash tables, if applicable */
9392 	if (job_ptr->array_task_id != NO_VAL) {
9393 		_remove_job_hash(job_ptr, JOB_HASH_ARRAY_JOB);
9394 		_remove_job_hash(job_ptr, JOB_HASH_ARRAY_TASK);
9395 	}
9396 }
9397 
9398 /*
9399  * _list_delete_job - delete a job record and its corresponding job_details,
9400  *	see common/list.h for documentation
9401  * IN job_entry - pointer to job_record to delete
9402  */
9403 static void _list_delete_job(void *job_entry)
9404 {
9405 	job_record_t *job_ptr = (job_record_t *) job_entry;
9406 	int job_array_size, i;
9407 
9408 	xassert(job_entry);
9409 	xassert (job_ptr->magic == JOB_MAGIC);
9410 	job_ptr->magic = 0;	/* make sure we don't delete record twice */
9411 
9412 	_delete_job_common(job_ptr);
9413 
9414 	if (job_ptr->array_recs) {
9415 		job_array_size = MAX(1, job_ptr->array_recs->task_cnt);
9416 	} else {
9417 		job_array_size = 1;
9418 	}
9419 
9420 	_delete_job_details(job_ptr);
9421 	xfree(job_ptr->account);
9422 	xfree(job_ptr->admin_comment);
9423 	xfree(job_ptr->alias_list);
9424 	xfree(job_ptr->alloc_node);
9425 	free_null_array_recs(job_ptr);
9426 	if (job_ptr->array_recs) {
9427 		FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
9428 		xfree(job_ptr->array_recs->task_id_str);
9429 		xfree(job_ptr->array_recs);
9430 	}
9431 	xfree(job_ptr->batch_features);
9432 	xfree(job_ptr->batch_host);
9433 	xfree(job_ptr->burst_buffer);
9434 	xfree(job_ptr->comment);
9435 	xfree(job_ptr->clusters);
9436 	xfree(job_ptr->cpus_per_tres);
9437 	free_job_fed_details(&job_ptr->fed_details);
9438 	free_job_resources(&job_ptr->job_resrcs);
9439 	xfree(job_ptr->gres_alloc);
9440 	_clear_job_gres_details(job_ptr);
9441 	xfree(job_ptr->gres_req);
9442 	xfree(job_ptr->gres_used);
9443 	FREE_NULL_LIST(job_ptr->gres_list);
9444 	xfree(job_ptr->licenses);
9445 	FREE_NULL_LIST(job_ptr->license_list);
9446 	xfree(job_ptr->limit_set.tres);
9447 	xfree(job_ptr->mail_user);
9448 	xfree(job_ptr->mcs_label);
9449 	xfree(job_ptr->mem_per_tres);
9450 	xfree(job_ptr->name);
9451 	xfree(job_ptr->network);
9452 	xfree(job_ptr->node_addr);
9453 	FREE_NULL_BITMAP(job_ptr->node_bitmap);
9454 	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
9455 	xfree(job_ptr->nodes);
9456 	xfree(job_ptr->nodes_completing);
9457 	xfree(job_ptr->origin_cluster);
9458 	if (job_ptr->het_details && job_ptr->het_job_id) {
9459 		/* xfree struct if hetjob leader and NULL ptr otherwise. */
9460 		if (job_ptr->het_job_offset == 0)
9461 			xfree(job_ptr->het_details);
9462 		else
9463 			job_ptr->het_details = NULL;
9464 	}
9465 	xfree(job_ptr->het_job_id_set);
9466 	FREE_NULL_LIST(job_ptr->het_job_list);
9467 	xfree(job_ptr->partition);
9468 	FREE_NULL_LIST(job_ptr->part_ptr_list);
9469 	xfree(job_ptr->priority_array);
9470 	slurm_destroy_priority_factors_object(job_ptr->prio_factors);
9471 	xfree(job_ptr->resp_host);
9472 	xfree(job_ptr->resv_name);
9473 	xfree(job_ptr->sched_nodes);
9474 	for (i = 0; i < job_ptr->spank_job_env_size; i++)
9475 		xfree(job_ptr->spank_job_env[i]);
9476 	xfree(job_ptr->spank_job_env);
9477 	xfree(job_ptr->state_desc);
9478 	xfree(job_ptr->system_comment);
9479 	xfree(job_ptr->tres_alloc_cnt);
9480 	xfree(job_ptr->tres_alloc_str);
9481 	xfree(job_ptr->tres_bind);
9482 	xfree(job_ptr->tres_freq);
9483 	xfree(job_ptr->tres_fmt_alloc_str);
9484 	xfree(job_ptr->tres_per_job);
9485 	xfree(job_ptr->tres_per_node);
9486 	xfree(job_ptr->tres_per_socket);
9487 	xfree(job_ptr->tres_per_task);
9488 	xfree(job_ptr->tres_req_cnt);
9489 	xfree(job_ptr->tres_req_str);
9490 	xfree(job_ptr->tres_fmt_req_str);
9491 	step_list_purge(job_ptr);
9492 	select_g_select_jobinfo_free(job_ptr->select_jobinfo);
9493 	xfree(job_ptr->user_name);
9494 	xfree(job_ptr->wckey);
9495 	if (job_array_size > job_count) {
9496 		error("job_count underflow");
9497 		job_count = 0;
9498 	} else {
9499 		job_count -= job_array_size;
9500 	}
9501 	job_ptr->job_id = 0;
9502 	xfree(job_ptr);
9503 }
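
/*
 * Illustrative sketch (an assumption about the wiring, not shown in this
 * excerpt): _list_delete_job() is meant to be installed as the destructor
 * of the global job list, so that deleting list entries reclaims the full
 * job records automatically, e.g.
 *
 *	job_list = list_create(_list_delete_job);
 */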
9504 
9505 
9506 /*
9507  * find specific job_id entry in the job list, key is job_id_ptr
9508  */
9509 static int _list_find_job_id(void *job_entry, void *key)
9510 {
9511 	job_record_t *job_ptr = (job_record_t *) job_entry;
9512 	uint32_t *job_id_ptr = (uint32_t *) key;
9513 
9514 	if (job_ptr->job_id == *job_id_ptr)
9515 		return 1;
9516 
9517 	return 0;
9518 }
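
/*
 * Illustrative sketch (not from the original source): a ListFindF such as
 * _list_find_job_id() is used with list_find_first(), e.g.
 *
 *	uint32_t job_id = 1234;		// hypothetical job ID
 *	job_record_t *job_ptr =
 *		list_find_first(job_list, _list_find_job_id, &job_id);
 *
 * In practice find_job_record() is normally preferred, since it uses the
 * job hash table instead of a linear scan of job_list.
 */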
9519 
9520 /*
9521  * _list_find_job_old - find old entries in the job list,
9522  *	see common/list.h for documentation
9523  * job_entry IN - job pointer
9524  * key IN - if not NULL, then skip hetjob components
9525  */
9526 static int _list_find_job_old(void *job_entry, void *key)
9527 {
9528 	time_t kill_age, min_age, now = time(NULL);
9529 	job_record_t *job_ptr = (job_record_t *) job_entry;
9530 	uint16_t cleaning = 0;
9531 
9532 	if ((job_ptr->job_id == NO_VAL) && IS_JOB_REVOKED(job_ptr))
9533 		return 1;
9534 
9535 	if (key && job_ptr->het_job_id)
9536 		return 0;
9537 
9538 	if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) {
9539 		kill_age = now - (slurmctld_conf.kill_wait +
9540 				  2 * slurm_get_msg_timeout());
9541 		if (job_ptr->time_last_active < kill_age) {
9542 			job_ptr->time_last_active = now;
9543 			re_kill_job(job_ptr);
9544 		}
9545 		return 0;       /* Job still completing */
9546 	}
9547 
9548 	if (job_ptr->epilog_running)
9549 		return 0;       /* EpilogSlurmctld still running */
9550 
9551 	if (slurmctld_conf.min_job_age == 0)
9552 		return 0;	/* No job record purging */
9553 
9554 	if (fed_mgr_fed_rec && job_ptr->fed_details &&
9555 	    !fed_mgr_is_origin_job(job_ptr)) {
9556 		uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
9557 		slurmdb_cluster_rec_t *origin =
9558 			fed_mgr_get_cluster_by_id(origin_id);
9559 
9560 		/* keep job around until origin comes back and is synced */
9561 		if (origin &&
9562 		    (!origin->fed.send ||
9563 		     (((slurm_persist_conn_t *)origin->fed.send)->fd == -1) ||
9564 		     !origin->fed.sync_sent))
9565 		    return 0;
9566 	}
9567 
9568 	min_age  = now - slurmctld_conf.min_job_age;
9569 	if (job_ptr->end_time > min_age)
9570 		return 0;	/* Too new to purge */
9571 
9572 	if (!(IS_JOB_COMPLETED(job_ptr)))
9573 		return 0;	/* Job still active */
9574 
9575 	if (job_ptr->step_list && list_count(job_ptr->step_list)) {
9576 		debug("%pJ still has %d active steps",
9577 		      job_ptr, list_count(job_ptr->step_list));
9578 		/*
9579 		 * If the job has been around for more than 30 days, the steps
9580 		 * are bogus.  Blow the job away.  This was witnessed in
9581 		 * releases <= 16.05 but hasn't been seen since.  This is here
9582 		 * just to clear them out if this ever shows up again.
9583 		 */
9584 		min_age = now - PURGE_OLD_JOB_IN_SEC;
9585 		if (job_ptr->end_time <= min_age) {
9586 			info("Force purge of %pJ. It ended over 30 days ago, the slurmctld thinks there are still steps running but they are most likely bogus. In any case you might want to check nodes %s to make sure nothing remains of the job.",
9587 			     job_ptr, job_ptr->nodes);
9588 			goto end_it;
9589 		} else
9590 			return 0;	/* steps are still active */
9591 	}
9592 
9593 	if (job_ptr->array_recs) {
9594 		if (job_ptr->array_recs->tot_run_tasks ||
9595 		    !_test_job_array_purged(job_ptr->array_job_id)) {
9596 			/* Some tasks from this job array still active */
9597 			return 0;
9598 		}
9599 	}
9600 
9601 	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
9602 				    SELECT_JOBDATA_CLEANING,
9603 				    &cleaning);
9604 	if (cleaning)
9605 		return 0;      /* Job hasn't finished yet */
9606 
9607 	if (bb_g_job_test_stage_out(job_ptr) != 1)
9608 		return 0;      /* Stage out in progress */
9609 
9610 	/* If we don't have a db_index by now and we are running with
9611 	 * the slurmdbd, let's put it on the list to be handled later
9612 	 * when slurmdbd comes back up since we won't get another chance.
9613 	 * job_start won't pend for job_db_inx when the job is finished.
9614 	 */
9615 end_it:
9616 	if (with_slurmdbd && !job_ptr->db_index)
9617 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
9618 
9619 	return 1;		/* Purge the job */
9620 }
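
/*
 * Illustrative sketch (an assumption, not shown in this excerpt): the
 * periodic purge walks the job list with this ListFindF and deletes every
 * match through the list's destructor, e.g.
 *
 *	int purged = list_delete_all(job_list, _list_find_job_old, NULL);
 *	if (purged)
 *		debug2("purged %d old job records", purged);
 *
 * Passing a non-NULL key would make the scan skip hetjob components, per
 * the comment above.
 */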
9621 
9622 /* Determine if ALL partitions associated with a job are hidden */
9623 static bool _all_parts_hidden(job_record_t *job_ptr, uid_t uid)
9624 {
9625 	bool rc;
9626 	ListIterator part_iterator;
9627 	part_record_t *part_ptr;
9628 
9629 	if (job_ptr->part_ptr_list) {
9630 		rc = true;
9631 		part_iterator = list_iterator_create(job_ptr->part_ptr_list);
9632 		while ((part_ptr = list_next(part_iterator))) {
9633 			if (part_is_visible(part_ptr, uid)) {
9634 				rc = false;
9635 				break;
9636 			}
9637 		}
9638 		list_iterator_destroy(part_iterator);
9639 		return rc;
9640 	}
9641 
9642 	if (job_ptr->part_ptr && part_is_visible(job_ptr->part_ptr, uid))
9643 		return false;
9644 	return true;
9645 }
9646 
9647 /* Determine if a given job should be seen by a specific user */
9648 static bool _hide_job(job_record_t *job_ptr, uid_t uid, uint16_t show_flags)
9649 {
9650 	if (!(show_flags & SHOW_ALL) && IS_JOB_REVOKED(job_ptr))
9651 		return true;
9652 
9653 	if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
9654 	    (job_ptr->user_id != uid) && !validate_operator(uid) &&
9655 	    (((slurm_mcs_get_privatedata() == 0) &&
9656 	      !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
9657 					    job_ptr->account)) ||
9658 	     ((slurm_mcs_get_privatedata() == 1) &&
9659 	      (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0))))
9660 		return true;
9661 	return false;
9662 }
9663 
9664 static void _pack_job(job_record_t *job_ptr,
9665 		      _foreach_pack_job_info_t *pack_info)
9666 {
9667 	xassert (job_ptr->magic == JOB_MAGIC);
9668 
9669 	if ((pack_info->filter_uid != NO_VAL) &&
9670 	    (pack_info->filter_uid != job_ptr->user_id))
9671 		return;
9672 
9673 	if (((pack_info->show_flags & SHOW_ALL) == 0) &&
9674 	    (pack_info->uid != 0) &&
9675 	    _all_parts_hidden(job_ptr, pack_info->uid))
9676 		return;
9677 
9678 	if (_hide_job(job_ptr, pack_info->uid, pack_info->show_flags))
9679 		return;
9680 
9681 	pack_job(job_ptr, pack_info->show_flags, pack_info->buffer,
9682 		 pack_info->protocol_version, pack_info->uid);
9683 
9684 	(*pack_info->jobs_packed)++;
9685 }
9686 
9687 static int _foreach_pack_jobid(void *object, void *arg)
9688 {
9689 	job_record_t *job_ptr;
9690 	uint32_t job_id = *(uint32_t *)object;
9691 	_foreach_pack_job_info_t *info = (_foreach_pack_job_info_t *)arg;
9692 
9693 	if (!(job_ptr = find_job_record(job_id)))
9694 		return SLURM_SUCCESS;
9695 
9696 	_pack_job(job_ptr, info);
9697 
9698 	return SLURM_SUCCESS;
9699 }
9700 
9701 /*
9702  * pack_all_jobs - dump all job information for all jobs in
9703  *	machine independent form (for network transmission)
9704  * OUT buffer_ptr - the pointer is set to the allocated buffer.
9705  * OUT buffer_size - set to size of the buffer in bytes
9706  * IN show_flags - job filtering options
9707  * IN uid - uid of user making request (for partition filtering)
9708  * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
9709  * global: job_list - global list of job records
9710  * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9711  * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9712  *	whenever the data format changes
9713  */
9714 extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
9715 			  uint16_t show_flags, uid_t uid, uint32_t filter_uid,
9716 			  uint16_t protocol_version)
9717 {
9718 	uint32_t jobs_packed = 0, tmp_offset;
9719 	_foreach_pack_job_info_t pack_info = {0};
9720 	Buf buffer;
9721 	ListIterator itr;
9722 	job_record_t *job_ptr = NULL;
9723 
9724 	buffer_ptr[0] = NULL;
9725 	*buffer_size = 0;
9726 
9727 	buffer = init_buf(BUF_SIZE);
9728 
9729 	/* write message body header : size and time */
9730 	/* put in a placeholder job record count of 0 for now */
9731 	pack32(jobs_packed, buffer);
9732 	pack_time(time(NULL), buffer);
9733 
9734 	/* write individual job records */
9735 	pack_info.buffer           = buffer;
9736 	pack_info.filter_uid       = filter_uid;
9737 	pack_info.jobs_packed      = &jobs_packed;
9738 	pack_info.protocol_version = protocol_version;
9739 	pack_info.show_flags       = show_flags;
9740 	pack_info.uid              = uid;
9741 
9742 	itr = list_iterator_create(job_list);
9743 	while ((job_ptr = list_next(itr))) {
9744 		_pack_job(job_ptr, &pack_info);
9745 	}
9746 	list_iterator_destroy(itr);
9747 
9748 	/* put the real record count in the message body header */
9749 	tmp_offset = get_buf_offset(buffer);
9750 	set_buf_offset(buffer, 0);
9751 	pack32(jobs_packed, buffer);
9752 	set_buf_offset(buffer, tmp_offset);
9753 
9754 	*buffer_size = get_buf_offset(buffer);
9755 	buffer_ptr[0] = xfer_buf_data(buffer);
9756 }
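
/*
 * Illustrative sketch (an assumption, not from the original source): an
 * RPC handler calling pack_all_jobs() owns the returned buffer and must
 * xfree() it once the response has been sent, e.g.
 *
 *	char *dump = NULL;
 *	int dump_size = 0;
 *
 *	pack_all_jobs(&dump, &dump_size, show_flags, uid, NO_VAL,
 *		      protocol_version);
 *	// ... send dump/dump_size back to the client ...
 *	xfree(dump);
 */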
9757 
9758 /*
9759  * pack_spec_jobs - dump job information for specified jobs in
9760  *	machine independent form (for network transmission)
9761  * OUT buffer_ptr - the pointer is set to the allocated buffer.
9762  * OUT buffer_size - set to size of the buffer in bytes
9763  * IN show_flags - job filtering options
9764  * IN job_ids - list of job_ids to pack
9765  * IN uid - uid of user making request (for partition filtering)
9766  * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
9767  * global: job_list - global list of job records
9768  * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9769  * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9770  *	whenever the data format changes
9771  */
9772 extern void pack_spec_jobs(char **buffer_ptr, int *buffer_size, List job_ids,
9773 			   uint16_t show_flags, uid_t uid, uint32_t filter_uid,
9774 			   uint16_t protocol_version)
9775 {
9776 	uint32_t jobs_packed = 0, tmp_offset;
9777 	_foreach_pack_job_info_t pack_info = {0};
9778 	Buf buffer;
9779 
9780 	xassert(job_ids);
9781 
9782 	buffer_ptr[0] = NULL;
9783 	*buffer_size = 0;
9784 
9785 	buffer = init_buf(BUF_SIZE);
9786 
9787 	/* write message body header : size and time */
9788 	/* put in a placeholder job record count of 0 for now */
9789 	pack32(jobs_packed, buffer);
9790 	pack_time(time(NULL), buffer);
9791 
9792 	/* write individual job records */
9793 	pack_info.buffer           = buffer;
9794 	pack_info.filter_uid       = filter_uid;
9795 	pack_info.jobs_packed      = &jobs_packed;
9796 	pack_info.protocol_version = protocol_version;
9797 	pack_info.show_flags       = show_flags;
9798 	pack_info.uid              = uid;
9799 
9800 	list_for_each(job_ids, _foreach_pack_jobid, &pack_info);
9801 
9802 	/* put the real record count in the message body header */
9803 	tmp_offset = get_buf_offset(buffer);
9804 	set_buf_offset(buffer, 0);
9805 	pack32(jobs_packed, buffer);
9806 	set_buf_offset(buffer, tmp_offset);
9807 
9808 	*buffer_size = get_buf_offset(buffer);
9809 	buffer_ptr[0] = xfer_buf_data(buffer);
9810 }
9811 
9812 static int _pack_het_job(job_record_t *job_ptr, uint16_t show_flags,
9813 			    Buf buffer, uint16_t protocol_version, uid_t uid)
9814 {
9815 	job_record_t *het_job_ptr;
9816 	int job_cnt = 0;
9817 	ListIterator iter;
9818 
9819 	iter = list_iterator_create(job_ptr->het_job_list);
9820 	while ((het_job_ptr = list_next(iter))) {
9821 		if (het_job_ptr->het_job_id == job_ptr->het_job_id) {
9822 			pack_job(het_job_ptr, show_flags, buffer,
9823 				 protocol_version, uid);
9824 			job_cnt++;
9825 		} else {
9826 			error("%s: Bad het_job_list for %pJ",
9827 			      __func__, job_ptr);
9828 		}
9829 	}
9830 	list_iterator_destroy(iter);
9831 
9832 	return job_cnt;
9833 }
9834 
9835 /*
9836  * pack_one_job - dump information for one job in
9837  *	machine independent form (for network transmission)
9838  * OUT buffer_ptr - the pointer is set to the allocated buffer.
9839  * OUT buffer_size - set to size of the buffer in bytes
9840  * IN job_id - ID of job that we want info for
9841  * IN show_flags - job filtering options
9842  * IN uid - uid of user making request (for partition filtering)
9843  * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9844  * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9845  *	whenever the data format changes
9846  */
9847 extern int pack_one_job(char **buffer_ptr, int *buffer_size,
9848 			uint32_t job_id, uint16_t show_flags, uid_t uid,
9849 			uint16_t protocol_version)
9850 {
9851 	job_record_t *job_ptr;
9852 	uint32_t jobs_packed = 0, tmp_offset;
9853 	Buf buffer;
9854 
9855 	buffer_ptr[0] = NULL;
9856 	*buffer_size = 0;
9857 
9858 	buffer = init_buf(BUF_SIZE);
9859 
9860 	/* write message body header : size and time */
9861 	/* put in a placeholder job record count of 0 for now */
9862 	pack32(jobs_packed, buffer);
9863 	pack_time(time(NULL), buffer);
9864 
9865 	job_ptr = find_job_record(job_id);
9866 	if (job_ptr && job_ptr->het_job_list) {
9867 		/* Pack heterogeneous job components */
9868 		if (!_hide_job(job_ptr, uid, show_flags)) {
9869 			jobs_packed = _pack_het_job(job_ptr, show_flags,
9870 						       buffer, protocol_version,
9871 						       uid);
9872 		}
9873 	} else if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
9874 		   !job_ptr->array_recs) {
9875 		/* Pack regular (not array) job */
9876 		if (!_hide_job(job_ptr, uid, show_flags)) {
9877 			pack_job(job_ptr, show_flags, buffer, protocol_version,
9878 				 uid);
9879 			jobs_packed++;
9880 		}
9881 	} else {
9882 		bool packed_head = false;
9883 
9884 		/* Either the job is not found or it is a job array */
9885 		if (job_ptr) {
9886 			packed_head = true;
9887 			if (!_hide_job(job_ptr, uid, show_flags)) {
9888 				pack_job(job_ptr, show_flags, buffer,
9889 					 protocol_version, uid);
9890 				jobs_packed++;
9891 			}
9892 		}
9893 
9894 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
9895 		while (job_ptr) {
9896 			if ((job_ptr->job_id == job_id) && packed_head) {
9897 				;	/* Already packed */
9898 			} else if (job_ptr->array_job_id == job_id) {
9899 				if (_hide_job(job_ptr, uid, show_flags))
9900 					break;
9901 				pack_job(job_ptr, show_flags, buffer,
9902 					 protocol_version, uid);
9903 				jobs_packed++;
9904 			}
9905 			job_ptr = job_ptr->job_array_next_j;
9906 		}
9907 	}
9908 
9909 	if (jobs_packed == 0) {
9910 		free_buf(buffer);
9911 		return ESLURM_INVALID_JOB_ID;
9912 	}
9913 
9914 	/* put the real record count in the message body header */
9915 	tmp_offset = get_buf_offset(buffer);
9916 	set_buf_offset(buffer, 0);
9917 	pack32(jobs_packed, buffer);
9918 	set_buf_offset(buffer, tmp_offset);
9919 
9920 	*buffer_size = get_buf_offset(buffer);
9921 	buffer_ptr[0] = xfer_buf_data(buffer);
9922 
9923 	return SLURM_SUCCESS;
9924 }
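
/*
 * Illustrative sketch (not from the original source): unlike
 * pack_all_jobs(), this function can fail, so a caller should check the
 * return code before using the buffer, e.g.
 *
 *	char *dump = NULL;
 *	int dump_size = 0, rc;
 *
 *	rc = pack_one_job(&dump, &dump_size, job_id, show_flags, uid,
 *			  protocol_version);
 *	if (rc != SLURM_SUCCESS)
 *		return rc;	// typically ESLURM_INVALID_JOB_ID
 *	// ... send dump/dump_size, then xfree(dump) ...
 */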
9925 
9926 static void _pack_job_gres(job_record_t *dump_job_ptr, Buf buffer,
9927 			   uint16_t protocol_version)
9928 {
9929 	if (!IS_JOB_STARTED(dump_job_ptr) || IS_JOB_FINISHED(dump_job_ptr) ||
9930 	    (dump_job_ptr->gres_list == NULL)) {
9931 		packstr_array(NULL, 0, buffer);
9932 		return;
9933 	}
9934 
9935 	packstr_array(dump_job_ptr->gres_detail_str,
9936 		      dump_job_ptr->gres_detail_cnt, buffer);
9937 }
9938 
9939 /*
9940  * pack_job - dump all configuration information about a specific job in
9941  *	machine independent form (for network transmission)
9942  * IN dump_job_ptr - pointer to job for which information is requested
9943  * IN show_flags - job filtering options
9944  * IN/OUT buffer - buffer in which data is placed, pointers automatically
9945  *	updated
9946  * IN uid - user requesting the data
9947  * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c
9948  *	  whenever the data format changes
9949  */
9950 void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, Buf buffer,
9951 	      uint16_t protocol_version, uid_t uid)
9952 {
9953 	struct job_details *detail_ptr;
9954 	time_t accrue_time = 0, begin_time = 0, start_time = 0, end_time = 0;
9955 	uint32_t time_limit;
9956 	char *nodelist = NULL;
9957 	assoc_mgr_lock_t locks = { .qos = READ_LOCK };
9958 
9959 	if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
9960 		detail_ptr = dump_job_ptr->details;
9961 		pack32(dump_job_ptr->array_job_id, buffer);
9962 		pack32(dump_job_ptr->array_task_id, buffer);
9963 		if (dump_job_ptr->array_recs) {
9964 			build_array_str(dump_job_ptr);
9965 			packstr(dump_job_ptr->array_recs->task_id_str, buffer);
9966 			pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
9967 		} else {
9968 			job_record_t *array_head = NULL;
9969 			packnull(buffer);
9970 			if (dump_job_ptr->array_job_id) {
9971 				array_head = find_job_record(
9972 						dump_job_ptr->array_job_id);
9973 			}
9974 			if (array_head && array_head->array_recs) {
9975 				pack32(array_head->array_recs->max_run_tasks,
9976 				       buffer);
9977 			} else {
9978 				pack32((uint32_t) 0, buffer);
9979 			}
9980 		}
9981 
9982 		pack32(dump_job_ptr->assoc_id, buffer);
9983 		pack32(dump_job_ptr->delay_boot, buffer);
9984 		pack32(dump_job_ptr->job_id,   buffer);
9985 		pack32(dump_job_ptr->user_id,  buffer);
9986 		pack32(dump_job_ptr->group_id, buffer);
9987 		pack32(dump_job_ptr->het_job_id, buffer);
9988 		packstr(dump_job_ptr->het_job_id_set, buffer);
9989 		pack32(dump_job_ptr->het_job_offset, buffer);
9990 		pack32(dump_job_ptr->profile,  buffer);
9991 
9992 		pack32(dump_job_ptr->job_state,    buffer);
9993 		pack16(dump_job_ptr->batch_flag,   buffer);
9994 		pack16(dump_job_ptr->state_reason, buffer);
9995 		pack8(dump_job_ptr->power_flags,   buffer);
9996 		pack8(dump_job_ptr->reboot,        buffer);
9997 		pack16(dump_job_ptr->restart_cnt,  buffer);
9998 		pack16(show_flags,  buffer);
9999 		pack_time(dump_job_ptr->deadline, buffer);
10000 
10001 		pack32(dump_job_ptr->alloc_sid, buffer);
10002 		if ((dump_job_ptr->time_limit == NO_VAL)
10003 		    && dump_job_ptr->part_ptr)
10004 			time_limit = dump_job_ptr->part_ptr->max_time;
10005 		else
10006 			time_limit = dump_job_ptr->time_limit;
10007 
10008 		pack32(time_limit, buffer);
10009 		pack32(dump_job_ptr->time_min, buffer);
10010 
10011 		if (dump_job_ptr->details) {
10012 			pack32(dump_job_ptr->details->nice,  buffer);
10013 			pack_time(dump_job_ptr->details->submit_time, buffer);
10014 			/* Earliest possible begin time */
10015 			begin_time = dump_job_ptr->details->begin_time;
10016 			/* When we started accruing time for priority */
10017 			accrue_time = dump_job_ptr->details->accrue_time;
10018 		} else {   /* Some job details may be purged after completion */
10019 			pack32(NICE_OFFSET, buffer);	/* Best guess */
10020 			pack_time((time_t) 0, buffer);
10021 		}
10022 
10023 		pack_time(begin_time, buffer);
10024 		pack_time(accrue_time, buffer);
10025 
10026 		if (IS_JOB_STARTED(dump_job_ptr)) {
10027 			/* Report actual start time, in past */
10028 			start_time = dump_job_ptr->start_time;
10029 			end_time = dump_job_ptr->end_time;
10030 		} else if (dump_job_ptr->start_time != 0) {
10031 			/* Report expected start time,
10032 			 * making sure that time is not in the past */
10033 			start_time = MAX(dump_job_ptr->start_time, time(NULL));
10034 			if (time_limit != NO_VAL) {
10035 				end_time = MAX(dump_job_ptr->end_time,
10036 					       (start_time + time_limit * 60));
10037 			}
10038 		} else	if (begin_time > time(NULL)) {
10039 			/* earliest start time in the future */
10040 			start_time = begin_time;
10041 			if (time_limit != NO_VAL) {
10042 				end_time = MAX(dump_job_ptr->end_time,
10043 					       (start_time + time_limit * 60));
10044 			}
10045 		}
10046 		pack_time(start_time, buffer);
10047 		pack_time(end_time, buffer);
10048 
10049 		pack_time(dump_job_ptr->suspend_time, buffer);
10050 		pack_time(dump_job_ptr->pre_sus_time, buffer);
10051 		pack_time(dump_job_ptr->resize_time, buffer);
10052 		pack_time(dump_job_ptr->last_sched_eval, buffer);
10053 		pack_time(dump_job_ptr->preempt_time, buffer);
10054 		pack32(dump_job_ptr->priority, buffer);
10055 		packdouble(dump_job_ptr->billable_tres, buffer);
10056 
10057 		packstr(slurmctld_conf.cluster_name, buffer);
10058 		/* Only send the allocated nodelist since we are only sending
10059 		 * the number of cpus and nodes that are currently allocated. */
10060 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10061 			packstr(dump_job_ptr->nodes, buffer);
10062 		else {
10063 			nodelist =
10064 				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10065 			packstr(nodelist, buffer);
10066 			xfree(nodelist);
10067 		}
10068 
10069 		packstr(dump_job_ptr->sched_nodes, buffer);
10070 
10071 		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10072 			packstr(dump_job_ptr->part_ptr->name, buffer);
10073 		else
10074 			packstr(dump_job_ptr->partition, buffer);
10075 		packstr(dump_job_ptr->account, buffer);
10076 		packstr(dump_job_ptr->admin_comment, buffer);
10077 		pack32(dump_job_ptr->site_factor, buffer);
10078 		packstr(dump_job_ptr->network, buffer);
10079 		packstr(dump_job_ptr->comment, buffer);
10080 		packstr(dump_job_ptr->batch_features, buffer);
10081 		packstr(dump_job_ptr->batch_host, buffer);
10082 		packstr(dump_job_ptr->burst_buffer, buffer);
10083 		packstr(dump_job_ptr->burst_buffer_state, buffer);
10084 		packstr(dump_job_ptr->system_comment, buffer);
10085 
10086 		assoc_mgr_lock(&locks);
10087 		if (dump_job_ptr->qos_ptr)
10088 			packstr(dump_job_ptr->qos_ptr->name, buffer);
10089 		else {
10090 			if (assoc_mgr_qos_list) {
10091 				packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10092 							dump_job_ptr->qos_id),
10093 					buffer);
10094 			} else
10095 				packnull(buffer);
10096 		}
10097 
10098 		if (IS_JOB_STARTED(dump_job_ptr) &&
10099 		    (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) &&
10100 		    (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) {
10101 			time_t preemptable = acct_policy_get_preemptable_time(
10102 						dump_job_ptr);
10103 			pack_time(preemptable, buffer);
10104 		} else {
10105 			pack_time(0, buffer);
10106 		}
10107 		assoc_mgr_unlock(&locks);
10108 
10109 		packstr(dump_job_ptr->licenses, buffer);
10110 		packstr(dump_job_ptr->state_desc, buffer);
10111 		packstr(dump_job_ptr->resv_name, buffer);
10112 		packstr(dump_job_ptr->mcs_label, buffer);
10113 
10114 		pack32(dump_job_ptr->exit_code, buffer);
10115 		pack32(dump_job_ptr->derived_ec, buffer);
10116 
10117 		packstr(dump_job_ptr->gres_used, buffer);
10118 		if (show_flags & SHOW_DETAIL) {
10119 			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10120 					   protocol_version);
10121 			_pack_job_gres(dump_job_ptr, buffer, protocol_version);
10122 		} else {
10123 			pack32(NO_VAL, buffer);
10124 			pack32((uint32_t) 0, buffer);
10125 		}
10126 
10127 		packstr(dump_job_ptr->name, buffer);
10128 		packstr(dump_job_ptr->user_name, buffer);
10129 		packstr(dump_job_ptr->wckey, buffer);
10130 		pack32(dump_job_ptr->req_switch, buffer);
10131 		pack32(dump_job_ptr->wait4switch, buffer);
10132 
10133 		packstr(dump_job_ptr->alloc_node, buffer);
10134 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10135 			pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10136 		else
10137 			pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10138 
10139 		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10140 					     buffer, protocol_version);
10141 
10142 		/* A few details are always dumped here */
10143 		_pack_default_job_details(dump_job_ptr, buffer,
10144 					  protocol_version);
10145 
10146 		/* other job details are only dumped until the job starts
10147 		 * running (at which time they become meaningless) */
10148 		if (detail_ptr)
10149 			_pack_pending_job_details(detail_ptr, buffer,
10150 						  protocol_version);
10151 		else
10152 			_pack_pending_job_details(NULL, buffer,
10153 						  protocol_version);
10154 		pack32(dump_job_ptr->bit_flags, buffer);
10155 		packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10156 		packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10157 		pack16(dump_job_ptr->start_protocol_ver, buffer);
10158 
10159 		if (dump_job_ptr->fed_details) {
10160 			packstr(dump_job_ptr->fed_details->origin_str, buffer);
10161 			pack64(dump_job_ptr->fed_details->siblings_active,
10162 			       buffer);
10163 			packstr(dump_job_ptr->fed_details->siblings_active_str,
10164 				buffer);
10165 			pack64(dump_job_ptr->fed_details->siblings_viable,
10166 			       buffer);
10167 			packstr(dump_job_ptr->fed_details->siblings_viable_str,
10168 				buffer);
10169 		} else {
10170 			packnull(buffer);
10171 			pack64((uint64_t)0, buffer);
10172 			packnull(buffer);
10173 			pack64((uint64_t)0, buffer);
10174 			packnull(buffer);
10175 		}
10176 
10177 		packstr(dump_job_ptr->cpus_per_tres, buffer);
10178 		packstr(dump_job_ptr->mem_per_tres, buffer);
10179 		packstr(dump_job_ptr->tres_bind, buffer);
10180 		packstr(dump_job_ptr->tres_freq, buffer);
10181 		packstr(dump_job_ptr->tres_per_job, buffer);
10182 		packstr(dump_job_ptr->tres_per_node, buffer);
10183 		packstr(dump_job_ptr->tres_per_socket, buffer);
10184 		packstr(dump_job_ptr->tres_per_task, buffer);
10185 
10186 		pack16(dump_job_ptr->mail_type, buffer);
10187 		packstr(dump_job_ptr->mail_user, buffer);
10188 	} else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
10189 		detail_ptr = dump_job_ptr->details;
10190 		pack32(dump_job_ptr->array_job_id, buffer);
10191 		pack32(dump_job_ptr->array_task_id, buffer);
10192 		if (dump_job_ptr->array_recs) {
10193 			build_array_str(dump_job_ptr);
10194 			packstr(dump_job_ptr->array_recs->task_id_str, buffer);
10195 			pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
10196 		} else {
10197 			job_record_t *array_head = NULL;
10198 			packnull(buffer);
10199 			if (dump_job_ptr->array_job_id) {
10200 				array_head = find_job_record(
10201 						dump_job_ptr->array_job_id);
10202 			}
10203 			if (array_head && array_head->array_recs) {
10204 				pack32(array_head->array_recs->max_run_tasks,
10205 				       buffer);
10206 			} else {
10207 				pack32((uint32_t) 0, buffer);
10208 			}
10209 		}
10210 
10211 		pack32(dump_job_ptr->assoc_id, buffer);
10212 		pack32(dump_job_ptr->delay_boot, buffer);
10213 		pack32(dump_job_ptr->job_id,   buffer);
10214 		pack32(dump_job_ptr->user_id,  buffer);
10215 		pack32(dump_job_ptr->group_id, buffer);
10216 		pack32(dump_job_ptr->het_job_id, buffer);
10217 		packstr(dump_job_ptr->het_job_id_set, buffer);
10218 		pack32(dump_job_ptr->het_job_offset, buffer);
10219 		pack32(dump_job_ptr->profile,  buffer);
10220 
10221 		pack32(dump_job_ptr->job_state,    buffer);
10222 		pack16(dump_job_ptr->batch_flag,   buffer);
10223 		pack16(dump_job_ptr->state_reason, buffer);
10224 		pack8(dump_job_ptr->power_flags,   buffer);
10225 		pack8(dump_job_ptr->reboot,        buffer);
10226 		pack16(dump_job_ptr->restart_cnt,  buffer);
10227 		pack16(show_flags,  buffer);
10228 		pack_time(dump_job_ptr->deadline, buffer);
10229 
10230 		pack32(dump_job_ptr->alloc_sid, buffer);
10231 		if ((dump_job_ptr->time_limit == NO_VAL)
10232 		    && dump_job_ptr->part_ptr)
10233 			time_limit = dump_job_ptr->part_ptr->max_time;
10234 		else
10235 			time_limit = dump_job_ptr->time_limit;
10236 
10237 		pack32(time_limit, buffer);
10238 		pack32(dump_job_ptr->time_min, buffer);
10239 
10240 		if (dump_job_ptr->details) {
10241 			pack32(dump_job_ptr->details->nice,  buffer);
10242 			pack_time(dump_job_ptr->details->submit_time, buffer);
10243 			/* Earliest possible begin time */
10244 			begin_time = dump_job_ptr->details->begin_time;
10245 			/* When we started accruing time for priority */
10246 			accrue_time = dump_job_ptr->details->accrue_time;
10247 		} else {   /* Some job details may be purged after completion */
10248 			pack32(NICE_OFFSET, buffer);	/* Best guess */
10249 			pack_time((time_t) 0, buffer);
10250 		}
10251 
10252 		pack_time(begin_time, buffer);
10253 		pack_time(accrue_time, buffer);
10254 
10255 		if (IS_JOB_STARTED(dump_job_ptr)) {
10256 			/* Report actual start time, in past */
10257 			start_time = dump_job_ptr->start_time;
10258 			end_time = dump_job_ptr->end_time;
10259 		} else if (dump_job_ptr->start_time != 0) {
10260 			/* Report expected start time,
10261 			 * making sure that time is not in the past */
10262 			start_time = MAX(dump_job_ptr->start_time, time(NULL));
10263 			if (time_limit != NO_VAL) {
10264 				end_time = MAX(dump_job_ptr->end_time,
10265 					       (start_time + time_limit * 60));
10266 			}
10267 		} else	if (begin_time > time(NULL)) {
10268 			/* earliest start time in the future */
10269 			start_time = begin_time;
10270 			if (time_limit != NO_VAL) {
10271 				end_time = MAX(dump_job_ptr->end_time,
10272 					       (start_time + time_limit * 60));
10273 			}
10274 		}
10275 		pack_time(start_time, buffer);
10276 		pack_time(end_time, buffer);
10277 
10278 		pack_time(dump_job_ptr->suspend_time, buffer);
10279 		pack_time(dump_job_ptr->pre_sus_time, buffer);
10280 		pack_time(dump_job_ptr->resize_time, buffer);
10281 		pack_time(dump_job_ptr->last_sched_eval, buffer);
10282 		pack_time(dump_job_ptr->preempt_time, buffer);
10283 		pack32(dump_job_ptr->priority, buffer);
10284 		packdouble(dump_job_ptr->billable_tres, buffer);
10285 
10286 		packstr(slurmctld_conf.cluster_name, buffer);
10287 		/* Only send the allocated nodelist since we are only sending
10288 		 * the number of cpus and nodes that are currently allocated. */
10289 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10290 			packstr(dump_job_ptr->nodes, buffer);
10291 		else {
10292 			nodelist =
10293 				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10294 			packstr(nodelist, buffer);
10295 			xfree(nodelist);
10296 		}
10297 
10298 		packstr(dump_job_ptr->sched_nodes, buffer);
10299 
10300 		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10301 			packstr(dump_job_ptr->part_ptr->name, buffer);
10302 		else
10303 			packstr(dump_job_ptr->partition, buffer);
10304 		packstr(dump_job_ptr->account, buffer);
10305 		packstr(dump_job_ptr->admin_comment, buffer);
10306 		pack32(dump_job_ptr->site_factor, buffer);
10307 		packstr(dump_job_ptr->network, buffer);
10308 		packstr(dump_job_ptr->comment, buffer);
10309 		packstr(dump_job_ptr->batch_features, buffer);
10310 		packstr(dump_job_ptr->batch_host, buffer);
10311 		packstr(dump_job_ptr->burst_buffer, buffer);
10312 		packstr(dump_job_ptr->burst_buffer_state, buffer);
10313 		packstr(dump_job_ptr->system_comment, buffer);
10314 
10315 		assoc_mgr_lock(&locks);
10316 		if (dump_job_ptr->qos_ptr)
10317 			packstr(dump_job_ptr->qos_ptr->name, buffer);
10318 		else {
10319 			if (assoc_mgr_qos_list) {
10320 				packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10321 							dump_job_ptr->qos_id),
10322 					buffer);
10323 			} else
10324 				packnull(buffer);
10325 		}
10326 
10327 		if (IS_JOB_STARTED(dump_job_ptr) &&
10328 		    (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) &&
10329 		    (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) {
10330 			time_t preemptable = acct_policy_get_preemptable_time(
10331 						dump_job_ptr);
10332 			pack_time(preemptable, buffer);
10333 		} else {
10334 			pack_time(0, buffer);
10335 		}
10336 		assoc_mgr_unlock(&locks);
10337 
10338 		packstr(dump_job_ptr->licenses, buffer);
10339 		packstr(dump_job_ptr->state_desc, buffer);
10340 		packstr(dump_job_ptr->resv_name, buffer);
10341 		packstr(dump_job_ptr->mcs_label, buffer);
10342 
10343 		pack32(dump_job_ptr->exit_code, buffer);
10344 		pack32(dump_job_ptr->derived_ec, buffer);
10345 
10346 		if (show_flags & SHOW_DETAIL) {
10347 			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10348 					   protocol_version);
10349 			_pack_job_gres(dump_job_ptr, buffer, protocol_version);
10350 		} else {
10351 			pack32(NO_VAL, buffer);
10352 			pack32((uint32_t) 0, buffer);
10353 		}
10354 
10355 		packstr(dump_job_ptr->name, buffer);
10356 		packstr(dump_job_ptr->user_name, buffer);
10357 		packstr(dump_job_ptr->wckey, buffer);
10358 		pack32(dump_job_ptr->req_switch, buffer);
10359 		pack32(dump_job_ptr->wait4switch, buffer);
10360 
10361 		packstr(dump_job_ptr->alloc_node, buffer);
10362 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10363 			pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10364 		else
10365 			pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10366 
10367 		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10368 					     buffer, protocol_version);
10369 
10370 		/* A few details are always dumped here */
10371 		_pack_default_job_details(dump_job_ptr, buffer,
10372 					  protocol_version);
10373 
10374 		/* other job details are only dumped until the job starts
10375 		 * running (at which time they become meaningless) */
10376 		if (detail_ptr)
10377 			_pack_pending_job_details(detail_ptr, buffer,
10378 						  protocol_version);
10379 		else
10380 			_pack_pending_job_details(NULL, buffer,
10381 						  protocol_version);
10382 		pack32(dump_job_ptr->bit_flags, buffer);
10383 		packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10384 		packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10385 		pack16(dump_job_ptr->start_protocol_ver, buffer);
10386 
10387 		if (dump_job_ptr->fed_details) {
10388 			packstr(dump_job_ptr->fed_details->origin_str, buffer);
10389 			pack64(dump_job_ptr->fed_details->siblings_active,
10390 			       buffer);
10391 			packstr(dump_job_ptr->fed_details->siblings_active_str,
10392 				buffer);
10393 			pack64(dump_job_ptr->fed_details->siblings_viable,
10394 			       buffer);
10395 			packstr(dump_job_ptr->fed_details->siblings_viable_str,
10396 				buffer);
10397 		} else {
10398 			packnull(buffer);
10399 			pack64((uint64_t)0, buffer);
10400 			packnull(buffer);
10401 			pack64((uint64_t)0, buffer);
10402 			packnull(buffer);
10403 		}
10404 
10405 		packstr(dump_job_ptr->cpus_per_tres, buffer);
10406 		packstr(dump_job_ptr->mem_per_tres, buffer);
10407 		packstr(dump_job_ptr->tres_bind, buffer);
10408 		packstr(dump_job_ptr->tres_freq, buffer);
10409 		packstr(dump_job_ptr->tres_per_job, buffer);
10410 		packstr(dump_job_ptr->tres_per_node, buffer);
10411 		packstr(dump_job_ptr->tres_per_socket, buffer);
10412 		packstr(dump_job_ptr->tres_per_task, buffer);
10413 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
10414 		detail_ptr = dump_job_ptr->details;
10415 		pack32(dump_job_ptr->array_job_id, buffer);
10416 		pack32(dump_job_ptr->array_task_id, buffer);
10417 		if (dump_job_ptr->array_recs) {
10418 			build_array_str(dump_job_ptr);
10419 			packstr(dump_job_ptr->array_recs->task_id_str, buffer);
10420 			pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
10421 		} else {
10422 			packnull(buffer);
10423 			pack32((uint32_t) 0, buffer);
10424 		}
10425 
10426 		pack32(dump_job_ptr->assoc_id, buffer);
10427 		pack32(dump_job_ptr->delay_boot, buffer);
10428 		pack32(dump_job_ptr->job_id,   buffer);
10429 		pack32(dump_job_ptr->user_id,  buffer);
10430 		pack32(dump_job_ptr->group_id, buffer);
10431 		pack32(dump_job_ptr->het_job_id, buffer);
10432 		packstr(dump_job_ptr->het_job_id_set, buffer);
10433 		pack32(dump_job_ptr->het_job_offset, buffer);
10434 		pack32(dump_job_ptr->profile,  buffer);
10435 
10436 		pack32(dump_job_ptr->job_state,    buffer);
10437 		pack16(dump_job_ptr->batch_flag,   buffer);
10438 		pack16(dump_job_ptr->state_reason, buffer);
10439 		pack8(dump_job_ptr->power_flags,   buffer);
10440 		pack8(dump_job_ptr->reboot,        buffer);
10441 		pack16(dump_job_ptr->restart_cnt,  buffer);
10442 		pack16(show_flags,  buffer);
10443 		pack_time(dump_job_ptr->deadline, buffer);
10444 
10445 		pack32(dump_job_ptr->alloc_sid, buffer);
10446 		if ((dump_job_ptr->time_limit == NO_VAL)
10447 		    && dump_job_ptr->part_ptr)
10448 			time_limit = dump_job_ptr->part_ptr->max_time;
10449 		else
10450 			time_limit = dump_job_ptr->time_limit;
10451 
10452 		pack32(time_limit, buffer);
10453 		pack32(dump_job_ptr->time_min, buffer);
10454 
10455 		if (dump_job_ptr->details) {
10456 			pack32(dump_job_ptr->details->nice,  buffer);
10457 			pack_time(dump_job_ptr->details->submit_time, buffer);
10458 			/* Earliest possible begin time */
10459 			begin_time = dump_job_ptr->details->begin_time;
10460 			/* When we started accruing time for priority */
10461 			accrue_time = dump_job_ptr->details->accrue_time;
10462 		} else {   /* Some job details may be purged after completion */
10463 			pack32(NICE_OFFSET, buffer);	/* Best guess */
10464 			pack_time((time_t) 0, buffer);
10465 		}
10466 
10467 		pack_time(begin_time, buffer);
10468 		pack_time(accrue_time, buffer);
10469 
10470 		if (IS_JOB_STARTED(dump_job_ptr)) {
10471 			/* Report actual start time, in past */
10472 			start_time = dump_job_ptr->start_time;
10473 			end_time = dump_job_ptr->end_time;
10474 		} else if (dump_job_ptr->start_time != 0) {
10475 			/* Report expected start time,
10476 			 * making sure that time is not in the past */
10477 			start_time = MAX(dump_job_ptr->start_time, time(NULL));
10478 			if (time_limit != NO_VAL) {
10479 				end_time = MAX(dump_job_ptr->end_time,
10480 					       (start_time + time_limit * 60));
10481 			}
10482 		} else	if (begin_time > time(NULL)) {
10483 			/* earliest start time in the future */
10484 			start_time = begin_time;
10485 			if (time_limit != NO_VAL) {
10486 				end_time = MAX(dump_job_ptr->end_time,
10487 					       (start_time + time_limit * 60));
10488 			}
10489 		}
10490 		pack_time(start_time, buffer);
10491 		pack_time(end_time, buffer);
10492 
10493 		pack_time(dump_job_ptr->suspend_time, buffer);
10494 		pack_time(dump_job_ptr->pre_sus_time, buffer);
10495 		pack_time(dump_job_ptr->resize_time, buffer);
10496 		pack_time(dump_job_ptr->last_sched_eval, buffer);
10497 		pack_time(dump_job_ptr->preempt_time, buffer);
10498 		pack32(dump_job_ptr->priority, buffer);
10499 		packdouble(dump_job_ptr->billable_tres, buffer);
10500 
10501 		packstr(slurmctld_conf.cluster_name, buffer);
10502 		/* Only send the allocated nodelist since we are only sending
10503 		 * the number of cpus and nodes that are currently allocated. */
10504 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10505 			packstr(dump_job_ptr->nodes, buffer);
10506 		else {
10507 			nodelist =
10508 				bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10509 			packstr(nodelist, buffer);
10510 			xfree(nodelist);
10511 		}
10512 
10513 		packstr(dump_job_ptr->sched_nodes, buffer);
10514 
10515 		if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10516 			packstr(dump_job_ptr->part_ptr->name, buffer);
10517 		else
10518 			packstr(dump_job_ptr->partition, buffer);
10519 		packstr(dump_job_ptr->account, buffer);
10520 		packstr(dump_job_ptr->admin_comment, buffer);
10521 		packstr(dump_job_ptr->network, buffer);
10522 		packstr(dump_job_ptr->comment, buffer);
10523 		packstr(dump_job_ptr->batch_features, buffer);
10524 		packstr(dump_job_ptr->batch_host, buffer);
10525 		packstr(dump_job_ptr->burst_buffer, buffer);
10526 		packstr(dump_job_ptr->burst_buffer_state, buffer);
10527 		packstr(dump_job_ptr->system_comment, buffer);
10528 
10529 		assoc_mgr_lock(&locks);
10530 		if (dump_job_ptr->qos_ptr)
10531 			packstr(dump_job_ptr->qos_ptr->name, buffer);
10532 		else {
10533 			if (assoc_mgr_qos_list) {
10534 				packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10535 							dump_job_ptr->qos_id),
10536 					buffer);
10537 			} else
10538 				packnull(buffer);
10539 		}
10540 		assoc_mgr_unlock(&locks);
10541 
10542 		packstr(dump_job_ptr->licenses, buffer);
10543 		packstr(dump_job_ptr->state_desc, buffer);
10544 		packstr(dump_job_ptr->resv_name, buffer);
10545 		packstr(dump_job_ptr->mcs_label, buffer);
10546 
10547 		pack32(dump_job_ptr->exit_code, buffer);
10548 		pack32(dump_job_ptr->derived_ec, buffer);
10549 
10550 		if (show_flags & SHOW_DETAIL) {
10551 			pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10552 					   protocol_version);
10553 			_pack_job_gres(dump_job_ptr, buffer, protocol_version);
10554 		} else {
10555 			pack32(NO_VAL, buffer);
10556 			pack32((uint32_t) 0, buffer);
10557 		}
10558 
10559 		packstr(dump_job_ptr->name, buffer);
10560 		packstr(dump_job_ptr->user_name, buffer);
10561 		packstr(dump_job_ptr->wckey, buffer);
10562 		pack32(dump_job_ptr->req_switch, buffer);
10563 		pack32(dump_job_ptr->wait4switch, buffer);
10564 
10565 		packstr(dump_job_ptr->alloc_node, buffer);
10566 		if (!IS_JOB_COMPLETING(dump_job_ptr))
10567 			pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10568 		else
10569 			pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10570 
10571 		select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10572 					     buffer, protocol_version);
10573 
10574 		/* A few details are always dumped here */
10575 		_pack_default_job_details(dump_job_ptr, buffer,
10576 					  protocol_version);
10577 
10578 		/* other job details are only dumped until the job starts
10579 		 * running (at which time they become meaningless) */
10580 		if (detail_ptr)
10581 			_pack_pending_job_details(detail_ptr, buffer,
10582 						  protocol_version);
10583 		else
10584 			_pack_pending_job_details(NULL, buffer,
10585 						  protocol_version);
10586 		pack32(dump_job_ptr->bit_flags, buffer);
10587 		packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10588 		packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10589 		pack16(dump_job_ptr->start_protocol_ver, buffer);
10590 
10591 		if (dump_job_ptr->fed_details) {
10592 			packstr(dump_job_ptr->fed_details->origin_str, buffer);
10593 			pack64(dump_job_ptr->fed_details->siblings_active,
10594 			       buffer);
10595 			packstr(dump_job_ptr->fed_details->siblings_active_str,
10596 				buffer);
10597 			pack64(dump_job_ptr->fed_details->siblings_viable,
10598 			       buffer);
10599 			packstr(dump_job_ptr->fed_details->siblings_viable_str,
10600 				buffer);
10601 		} else {
10602 			packnull(buffer);
10603 			pack64((uint64_t)0, buffer);
10604 			packnull(buffer);
10605 			pack64((uint64_t)0, buffer);
10606 			packnull(buffer);
10607 		}
10608 
10609 		packstr(dump_job_ptr->cpus_per_tres, buffer);
10610 		packstr(dump_job_ptr->mem_per_tres, buffer);
10611 		packstr(dump_job_ptr->tres_bind, buffer);
10612 		packstr(dump_job_ptr->tres_freq, buffer);
10613 		packstr(dump_job_ptr->tres_per_job, buffer);
10614 		packstr(dump_job_ptr->tres_per_node, buffer);
10615 		packstr(dump_job_ptr->tres_per_socket, buffer);
10616 		packstr(dump_job_ptr->tres_per_task, buffer);
10617 	} else {
10618 		error("pack_job: protocol_version "
10619 		      "%hu not supported", protocol_version);
10620 	}
10621 }
10622 
10623 static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr)
10624 {
10625 	static int max_cpu_cnt = -1, max_core_cnt = -1;
10626 	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
10627 	int i;
10628 	node_record_t *node_ptr = node_record_table_ptr;
10629 
10630 	slurm_mutex_lock(&lock);
10631 	if (max_cpu_cnt == -1) {
10632 		for (i = 0; i < node_record_count; i++, node_ptr++) {
10633 			/* Only data from config_record used for scheduling */
10634 			max_cpu_cnt = MAX(max_cpu_cnt,
10635 					  node_ptr->config_ptr->cpus);
10636 			max_core_cnt = MAX(max_core_cnt,
10637 					   node_ptr->config_ptr->cores);
10638 		}
10639 	}
10640 	slurm_mutex_unlock(&lock);
10641 
10642 	*cpu_cnt_ptr  = max_cpu_cnt;
10643 	*core_cnt_ptr = max_core_cnt;
10644 
10645 	return;
10646 
10647 }
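/*
 * Illustrative sketch (not part of the original source): the mutex-guarded
 * static cache above means the O(node_record_count) scan is paid only on the
 * first call; later callers simply read the cached maxima. A hypothetical
 * caller would look like:
 *
 *	int cpus = 0, cores = 0;
 *	_find_node_config(&cpus, &cores);	// first call scans all nodes
 *	_find_node_config(&cpus, &cores);	// later calls reuse the cache
 */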
10648 
10649 /* pack default job details for "get_job_info" RPC */
10650 static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer,
10651 				      uint16_t protocol_version)
10652 {
10653 	int max_cpu_cnt = -1, max_core_cnt = -1;
10654 	int i;
10655 	struct job_details *detail_ptr = job_ptr->details;
10656 	uint16_t shared = 0;
10657 
10658 	if (!detail_ptr)
10659 		shared = NO_VAL16;
10660 	else if (detail_ptr->share_res == 1)	/* User --share */
10661 		shared = 1;
10662 	else if ((detail_ptr->share_res == 0) ||
10663 		 (detail_ptr->whole_node == 1))
10664 		shared = 0;			/* User --exclusive */
10665 	else if (detail_ptr->whole_node == WHOLE_NODE_USER)
10666 		shared = JOB_SHARED_USER;	/* User --exclusive=user */
10667 	else if (detail_ptr->whole_node == WHOLE_NODE_MCS)
10668 		shared = JOB_SHARED_MCS;	/* User --exclusive=mcs */
10669 	else if (job_ptr->part_ptr) {
10670 		/* Report shared status based upon latest partition info */
10671 		if (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)
10672 			shared = JOB_SHARED_USER;
10673 		else if ((job_ptr->part_ptr->max_share & SHARED_FORCE) &&
10674 			 ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1))
10675 			shared = 1;		/* Partition Shared=force */
10676 		else if (job_ptr->part_ptr->max_share == 0)
10677 			shared = 0;		/* Partition Shared=exclusive */
10678 		else
10679 			shared = NO_VAL16;  /* Part Shared=yes or no */
10680 	} else
10681 		shared = NO_VAL16;	/* No user or partition info */
10682 
10683 	if (job_ptr->part_ptr && job_ptr->part_ptr->max_cpu_cnt) {
10684 		max_cpu_cnt  = job_ptr->part_ptr->max_cpu_cnt;
10685 		max_core_cnt = job_ptr->part_ptr->max_core_cnt;
10686 	} else
10687 		_find_node_config(&max_cpu_cnt, &max_core_cnt);
10688 
10689 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
10690 		if (detail_ptr) {
10691 			packstr(detail_ptr->features,   buffer);
10692 			packstr(detail_ptr->cluster_features, buffer);
10693 			packstr(detail_ptr->work_dir,   buffer);
10694 			packstr(detail_ptr->dependency, buffer);
10695 
10696 			if (detail_ptr->argv) {
10697 				char *cmd_line = NULL, *pos = NULL;
10698 				for (i = 0; detail_ptr->argv[i]; i++) {
10699 					xstrfmtcatat(cmd_line, &pos, "%s%s",
10700 					             (i ? " " : ""),
10701 						     detail_ptr->argv[i]);
10702 				}
10703 				packstr(cmd_line, buffer);
10704 				xfree(cmd_line);
10705 			} else
10706 				packnull(buffer);
10707 
10708 			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) {
10709 				pack32(job_ptr->cpu_cnt, buffer);
10710 				pack32((uint32_t) 0, buffer);
10711 			} else if (job_ptr->total_cpus &&
10712 				   !IS_JOB_PENDING(job_ptr)) {
10713 				/* If the job is PENDING, ignore total_cpus,
10714 				 * which may have been set by a previous run
10715 				 * followed by a job requeue. */
10716 				pack32(job_ptr->total_cpus, buffer);
10717 				pack32((uint32_t) 0, buffer);
10718 			} else {
10719 				pack32(detail_ptr->min_cpus, buffer);
10720 				if (detail_ptr->max_cpus != NO_VAL)
10721 					pack32(detail_ptr->max_cpus, buffer);
10722 				else
10723 					pack32((uint32_t) 0, buffer);
10724 			}
10725 
10726 			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) {
10727 				pack32(job_ptr->node_cnt, buffer);
10728 				pack32((uint32_t) 0, buffer);
10729 			} else if (job_ptr->total_nodes) {
10730 				pack32(job_ptr->total_nodes, buffer);
10731 				pack32((uint32_t) 0, buffer);
10732 			} else if (job_ptr->node_cnt_wag) {
10733 				/* This should catch everything else, but
10734 				 * just in case this is 0 (startup or
10735 				 * whatever) we will keep the rest of
10736 				 * this if statement around.
10737 				 */
10738 				pack32(job_ptr->node_cnt_wag, buffer);
10739 				pack32((uint32_t) detail_ptr->max_nodes,
10740 				       buffer);
10741 			} else if (detail_ptr->ntasks_per_node) {
10742 				/* min_nodes based upon task count and ntasks
10743 				 * per node */
10744 				uint32_t min_nodes;
10745 				min_nodes = detail_ptr->num_tasks /
10746 					    detail_ptr->ntasks_per_node;
10747 				min_nodes = MAX(min_nodes,
10748 						detail_ptr->min_nodes);
10749 				pack32(min_nodes, buffer);
10750 				pack32(detail_ptr->max_nodes, buffer);
10751 			} else if (detail_ptr->cpus_per_task > 1) {
10752 				/* min_nodes based upon task count and cpus
10753 				 * per task */
10754 				uint32_t ntasks_per_node, min_nodes;
10755 				ntasks_per_node = max_cpu_cnt /
10756 						  detail_ptr->cpus_per_task;
10757 				ntasks_per_node = MAX(ntasks_per_node, 1);
10758 				min_nodes = detail_ptr->num_tasks /
10759 					    ntasks_per_node;
10760 				min_nodes = MAX(min_nodes,
10761 						detail_ptr->min_nodes);
10762 				if (detail_ptr->num_tasks % ntasks_per_node)
10763 					min_nodes++;
10764 				pack32(min_nodes, buffer);
10765 				pack32(detail_ptr->max_nodes, buffer);
10766 			} else if (detail_ptr->mc_ptr &&
10767 				   detail_ptr->mc_ptr->ntasks_per_core &&
10768 				   (detail_ptr->mc_ptr->ntasks_per_core
10769 				    != INFINITE16)) {
10770 				/* min_nodes based upon task count and ntasks
10771 				 * per core */
10772 				uint32_t min_cores, min_nodes;
10773 				min_cores = detail_ptr->num_tasks +
10774 					    detail_ptr->mc_ptr->ntasks_per_core
10775 					    - 1;
10776 				min_cores /= detail_ptr->mc_ptr->ntasks_per_core;
10777 
10778 				min_nodes = min_cores + max_core_cnt - 1;
10779 				min_nodes /= max_core_cnt;
10780 				min_nodes = MAX(min_nodes,
10781 						detail_ptr->min_nodes);
10782 				pack32(min_nodes, buffer);
10783 				pack32(detail_ptr->max_nodes, buffer);
10784 			} else {
10785 				/* min_nodes based upon task count only */
10786 				uint32_t min_nodes;
10787 				min_nodes = detail_ptr->num_tasks +
10788 					    max_cpu_cnt - 1;
10789 				min_nodes /= max_cpu_cnt;
10790 				min_nodes = MAX(min_nodes,
10791 						detail_ptr->min_nodes);
10792 				pack32(min_nodes, buffer);
10793 				pack32(detail_ptr->max_nodes, buffer);
10794 			}
10795 
10796 			pack16(detail_ptr->requeue,   buffer);
10797 			pack16(detail_ptr->ntasks_per_node, buffer);
10798 			if (detail_ptr->num_tasks)
10799 				pack32(detail_ptr->num_tasks, buffer);
10800 			else if (IS_JOB_PENDING(job_ptr))
10801 				pack32(detail_ptr->min_nodes, buffer);
10802 			else
10803 				pack32(job_ptr->node_cnt, buffer);
10804 			pack16(shared, buffer);
10805 			pack32(detail_ptr->cpu_freq_min, buffer);
10806 			pack32(detail_ptr->cpu_freq_max, buffer);
10807 			pack32(detail_ptr->cpu_freq_gov, buffer);
10808 		} else {
10809 			packnull(buffer);
10810 			packnull(buffer);
10811 			packnull(buffer);
10812 			packnull(buffer);
10813 
10814 			if (job_ptr->total_cpus)
10815 				pack32(job_ptr->total_cpus, buffer);
10816 			else
10817 				pack32(job_ptr->cpu_cnt, buffer);
10818 			pack32((uint32_t) 0, buffer);
10819 
10820 			pack32(job_ptr->node_cnt, buffer);
10821 			pack32((uint32_t) 0, buffer);
10822 			pack16((uint16_t) 0, buffer);
10823 			pack16((uint16_t) 0, buffer);
10824 			pack16((uint16_t) 0, buffer);
10825 			pack32((uint32_t) 0, buffer);
10826 			pack32((uint32_t) 0, buffer);
10827 			pack32((uint32_t) 0, buffer);
10828 		}
10829 	} else {
10830 		error("_pack_default_job_details: protocol_version "
10831 		      "%hu not supported", protocol_version);
10832 	}
10833 }
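/*
 * Worked example (illustrative only; the numbers are hypothetical) of the
 * min_nodes estimates packed above for a pending job with min_nodes=1:
 *   - num_tasks=10, ntasks_per_node=4:
 *	min_nodes = MAX(10 / 4, 1) = 2
 *   - num_tasks=10, cpus_per_task=3, max_cpu_cnt=8:
 *	ntasks_per_node = MAX(8 / 3, 1) = 2
 *	min_nodes = MAX(10 / 2, 1) = 5 (10 % 2 == 0, so no +1 adjustment)
 *   - num_tasks=10, ntasks_per_core=2, max_core_cnt=4:
 *	min_cores = (10 + 2 - 1) / 2 = 5
 *	min_nodes = MAX((5 + 4 - 1) / 4, 1) = 2
 */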
10834 
10835 /* pack pending job details for "get_job_info" RPC */
10836 static void _pack_pending_job_details(struct job_details *detail_ptr,
10837 				      Buf buffer, uint16_t protocol_version)
10838 {
10839 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
10840 		if (detail_ptr) {
10841 			pack16(detail_ptr->contiguous, buffer);
10842 			pack16(detail_ptr->core_spec, buffer);
10843 			pack16(detail_ptr->cpus_per_task, buffer);
10844 			pack16(detail_ptr->pn_min_cpus, buffer);
10845 
10846 			pack64(detail_ptr->pn_min_memory, buffer);
10847 			pack32(detail_ptr->pn_min_tmp_disk, buffer);
10848 
10849 			packstr(detail_ptr->req_nodes, buffer);
10850 			pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer);
10851 			packstr(detail_ptr->exc_nodes, buffer);
10852 			pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer);
10853 
10854 			packstr(detail_ptr->std_err, buffer);
10855 			packstr(detail_ptr->std_in, buffer);
10856 			packstr(detail_ptr->std_out, buffer);
10857 
10858 			pack_multi_core_data(detail_ptr->mc_ptr, buffer,
10859 					     protocol_version);
10860 		} else {
10861 			pack16((uint16_t) 0, buffer);
10862 			pack16((uint16_t) 0, buffer);
10863 			pack16((uint16_t) 0, buffer);
10864 			pack16((uint16_t) 0, buffer);
10865 
10866 			pack64((uint64_t) 0, buffer);
10867 			pack32((uint32_t) 0, buffer);
10868 
10869 			packnull(buffer);
10870 			packnull(buffer);
10871 			packnull(buffer);
10872 			packnull(buffer);
10873 
10874 			packnull(buffer);
10875 			packnull(buffer);
10876 			packnull(buffer);
10877 
10878 			pack_multi_core_data(NULL, buffer, protocol_version);
10879 		}
10880 	} else {
10881 		error("%s: protocol_version %hu not supported", __func__,
10882 		      protocol_version);
10883 	}
10884 }
10885 
10886 static int _purge_het_job_filter(void *x, void *key)
10887 {
10888 	job_record_t *job_ptr = (job_record_t *) x;
10889 	job_record_t *job_filter = (job_record_t *) key;
10890 	if (job_ptr->het_job_id == job_filter->het_job_id)
10891 		return 1;
10892 	return 0;
10893 }
10894 
10895 /* If this is a hetjob leader and all components are complete,
10896  * then purge all of its hetjob records
10897  * RET true if this record purged */
10898 static inline bool _purge_complete_het_job(job_record_t *het_job_leader)
10899 {
10900 	job_record_t purge_job_rec;
10901 	job_record_t *het_job;
10902 	ListIterator iter;
10903 	bool incomplete_job = false;
10904 	int i;
10905 
10906 	if (!het_job_leader->het_job_list)
10907 		return false;		/* Not hetjob leader */
10908 	if (!IS_JOB_FINISHED(het_job_leader))
10909 		return false;		/* Hetjob leader incomplete */
10910 
10911 	iter = list_iterator_create(het_job_leader->het_job_list);
10912 	while ((het_job = list_next(iter))) {
10913 		if (het_job_leader->het_job_id != het_job->het_job_id) {
10914 			error("%s: Bad het_job_list for %pJ",
10915 			      __func__, het_job_leader);
10916 			continue;
10917 		}
10918 		if (!_list_find_job_old(het_job, NULL)) {
10919 			incomplete_job = true;
10920 			break;
10921 		}
10922 	}
10923 	list_iterator_destroy(iter);
10924 
10925 	if (incomplete_job)
10926 		return false;
10927 
10928 	purge_job_rec.het_job_id = het_job_leader->het_job_id;
10929 	i = list_delete_all(job_list, &_purge_het_job_filter, &purge_job_rec);
10930 	if (i) {
10931 		debug2("%s: purged %d old job records", __func__, i);
10932 		last_job_update = time(NULL);
10933 		slurm_mutex_lock(&purge_thread_lock);
10934 		slurm_cond_signal(&purge_thread_cond);
10935 		slurm_mutex_unlock(&purge_thread_lock);
10936 	}
10937 	return true;
10938 }
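/*
 * Illustrative note (not from the original source): list_delete_all() above
 * invokes _purge_het_job_filter() on every record and removes those for
 * which it returns 1, so a leader with het_job_id 1234 and all of its
 * components are purged in one pass. The stack-allocated purge_job_rec is
 * used purely as a lookup key:
 *
 *	job_record_t key = { 0 };	// hypothetical example
 *	key.het_job_id = 1234;
 *	(void) list_delete_all(job_list, &_purge_het_job_filter, &key);
 */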
10939 
10940 /*
10941  * If the job or slurm.conf requests to not kill on invalid dependency,
10942  * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the
10943  * job.
10944  */
10945 void handle_invalid_dependency(job_record_t *job_ptr)
10946 {
10947 	job_ptr->state_reason = WAIT_DEP_INVALID;
10948 	xfree(job_ptr->state_desc);
10949 	if (job_ptr->bit_flags & KILL_INV_DEP) {
10950 		_kill_dependent(job_ptr);
10951 	} else if (job_ptr->bit_flags & NO_KILL_INV_DEP) {
10952 		debug("%s: %pJ job dependency never satisfied",
10953 		      __func__, job_ptr);
10954 	} else if (kill_invalid_dep) {
10955 		_kill_dependent(job_ptr);
10956 	} else {
10957 		debug("%s: %pJ job dependency never satisfied",
10958 		      __func__, job_ptr);
10959 		job_ptr->state_reason = WAIT_DEP_INVALID;
10960 	}
10961 	fed_mgr_remove_remote_dependencies(job_ptr);
10962 }
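/*
 * Example (illustrative, not from the original source): a job submitted with
 * --dependency=afterok:123 whose dependency job 123 failed reaches this path
 * with FAIL_DEPEND. It is killed if it was submitted with
 * --kill-on-invalid-dep=yes or if the cluster-wide kill_invalid_dep default
 * is set; otherwise it stays pending with state reason WAIT_DEP_INVALID.
 */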
10963 
10964 /*
10965  * purge_old_job - purge old job records.
10966  *	The jobs must have completed at least MIN_JOB_AGE minutes ago.
10967  *	Test job dependencies, handle after_ok, after_not_ok before
10968  *	purging any jobs.
10969  */
10970 void purge_old_job(void)
10971 {
10972 	ListIterator job_iterator;
10973 	job_record_t *job_ptr;
10974 	int i, purge_job_count;
10975 
10976 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
10977 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
10978 	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
10979 	xassert(verify_lock(FED_LOCK, READ_LOCK));
10980 
10981 	if ((purge_job_count = list_count(purge_files_list)))
10982 		debug("%s: job file deletion is falling behind, "
10983 		      "%d left to remove", __func__, purge_job_count);
10984 
10985 	job_iterator = list_iterator_create(job_list);
10986 	while ((job_ptr = list_next(job_iterator))) {
10987 		if (_purge_complete_het_job(job_ptr))
10988 			continue;
10989 		if (!IS_JOB_PENDING(job_ptr))
10990 			continue;
10991 		/*
10992 		 * If the dependency is already invalid there's no reason to
10993 		 * keep checking it.
10994 		 */
10995 		if (job_ptr->state_reason == WAIT_DEP_INVALID)
10996 			continue;
10997 		if (test_job_dependency(job_ptr, NULL) == FAIL_DEPEND) {
10998 			/* Check the job's disposition
10999 			 * for dealing with invalid dependencies
11000 			 */
11001 			handle_invalid_dependency(job_ptr);
11002 		}
11003 	}
11004 	list_iterator_destroy(job_iterator);
11005 	fed_mgr_test_remote_dependencies();
11006 
11007 	i = list_delete_all(job_list, &_list_find_job_old, "");
11008 	if (i) {
11009 		debug2("purge_old_job: purged %d old job records", i);
11010 		last_job_update = time(NULL);
11011 		slurm_mutex_lock(&purge_thread_lock);
11012 		slurm_cond_signal(&purge_thread_cond);
11013 		slurm_mutex_unlock(&purge_thread_lock);
11014 	}
11015 }
11016 
11017 
11018 /*
11019  * purge_job_record - purge specific job record. No testing is performed to
11020  *	ensure the job record has no active references. Use only for job
11021  *	records that were never fully operational (e.g. WILL_RUN test, failed
11022  *	job load, failed job create, etc.).
11023  * IN job_id - job_id of job record to be purged
11024  * RET int - count of jobs purged
11025  * global: job_list - global job table
11026  */
11027 extern int purge_job_record(uint32_t job_id)
11028 {
11029 	int count = 0;
11030 	count = list_delete_all(job_list, _list_find_job_id, (void *)&job_id);
11031 	if (count) {
11032 		last_job_update = time(NULL);
11033 		slurm_mutex_lock(&purge_thread_lock);
11034 		slurm_cond_signal(&purge_thread_cond);
11035 		slurm_mutex_unlock(&purge_thread_lock);
11036 	}
11037 
11038 	return count;
11039 }
11040 
11041 extern void unlink_job_record(job_record_t *job_ptr)
11042 {
11043 	uint32_t *job_id;
11044 
11045 	xassert(job_ptr->magic == JOB_MAGIC);
11046 
11047 	_delete_job_common(job_ptr);
11048 
11049 	job_id = xmalloc(sizeof(uint32_t));
11050 	*job_id = job_ptr->job_id;
11051 	list_enqueue(purge_files_list, job_id);
11052 
11053 	job_ptr->job_id = NO_VAL;
11054 
11055 	last_job_update = time(NULL);
11056 	slurm_mutex_lock(&purge_thread_lock);
11057 	slurm_cond_signal(&purge_thread_cond);
11058 	slurm_mutex_unlock(&purge_thread_lock);
11059 }
11060 
11061 /*
11062  * reset_job_bitmaps - reestablish bitmaps for existing jobs.
11063  *	this should be called after rebuilding node information,
11064  *	but before using any job entries.
11065  * global: last_job_update - time of last job table update
11066  *	job_list - pointer to global job list
11067  */
11068 void reset_job_bitmaps(void)
11069 {
11070 	ListIterator job_iterator;
11071 	job_record_t *job_ptr;
11072 	part_record_t *part_ptr;
11073 	List part_ptr_list = NULL;
11074 	bool job_fail = false;
11075 	time_t now = time(NULL);
11076 	bool gang_flag = false;
11077 	static uint32_t cr_flag = NO_VAL;
11078 
11079 	xassert(job_list);
11080 
11081 	if (cr_flag == NO_VAL) {
11082 		cr_flag = 0;  /* call is a no-op for select/linear and others */
11083 		if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
11084 						  NULL, &cr_flag)) {
11085 			cr_flag = NO_VAL;	/* error */
11086 		}
11087 
11088 	}
11089 	if (slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG)
11090 		gang_flag = true;
11091 
11092 	job_iterator = list_iterator_create(job_list);
11093 	while ((job_ptr = list_next(job_iterator))) {
11094 		xassert (job_ptr->magic == JOB_MAGIC);
11095 		job_fail = false;
11096 
11097 		if (job_ptr->partition == NULL) {
11098 			error("No partition for %pJ", job_ptr);
11099 			part_ptr = NULL;
11100 			job_fail = true;
11101 		} else {
11102 			char *err_part = NULL;
11103 			part_ptr = find_part_record(job_ptr->partition);
11104 			if (part_ptr == NULL) {
11105 				part_ptr_list = get_part_list(
11106 						job_ptr->partition,
11107 						&err_part);
11108 				if (part_ptr_list) {
11109 					part_ptr = list_peek(part_ptr_list);
11110 					if (list_count(part_ptr_list) == 1)
11111 						FREE_NULL_LIST(part_ptr_list);
11112 				}
11113 			}
11114 			if (part_ptr == NULL) {
11115 				error("Invalid partition (%s) for %pJ",
11116 				      err_part, job_ptr);
11117 				xfree(err_part);
11118 				job_fail = true;
11119 			}
11120 		}
11121 		job_ptr->part_ptr = part_ptr;
11122 		FREE_NULL_LIST(job_ptr->part_ptr_list);
11123 		if (part_ptr_list) {
11124 			job_ptr->part_ptr_list = part_ptr_list;
11125 			part_ptr_list = NULL;	/* clear for next job */
11126 		}
11127 
11128 		FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
11129 		if (job_ptr->nodes_completing &&
11130 		    node_name2bitmap(job_ptr->nodes_completing,
11131 				     false,  &job_ptr->node_bitmap_cg)) {
11132 			error("Invalid nodes (%s) for %pJ",
11133 			      job_ptr->nodes_completing, job_ptr);
11134 			job_fail = true;
11135 		}
11136 		FREE_NULL_BITMAP(job_ptr->node_bitmap);
11137 		if (job_ptr->nodes &&
11138 		    node_name2bitmap(job_ptr->nodes, false,
11139 				     &job_ptr->node_bitmap) && !job_fail) {
11140 			error("Invalid nodes (%s) for %pJ",
11141 			      job_ptr->nodes, job_ptr);
11142 			job_fail = true;
11143 		}
11144 		if (reset_node_bitmap(job_ptr))
11145 			job_fail = true;
11146 		if (!job_fail && !IS_JOB_FINISHED(job_ptr) &&
11147 		    job_ptr->job_resrcs && (cr_flag || gang_flag) &&
11148 		    valid_job_resources(job_ptr->job_resrcs,
11149 					node_record_table_ptr)) {
11150 			error("Aborting %pJ due to change in socket/core configuration of allocated nodes",
11151 			      job_ptr);
11152 			job_fail = true;
11153 		}
11154 		if (!job_fail && !IS_JOB_FINISHED(job_ptr) &&
11155 		    gres_plugin_job_revalidate(job_ptr->gres_list)) {
11156 			error("Aborting %pJ due to use of unsupported GRES options",
11157 			      job_ptr);
11158 			job_fail = true;
11159 		}
11160 
11161 		if (!job_fail && job_ptr->job_resrcs &&
11162 		    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) &&
11163 		    gres_plugin_job_revalidate2(job_ptr->job_id,
11164 					job_ptr->gres_list,
11165 					job_ptr->job_resrcs->node_bitmap)) {
11166 			/*
11167 			 * This can be due to the job being allocated GRES
11168 			 * which no longer exist (i.e. the GRES count on some
11169 			 * allocated node changed since when the job started).
11170 			 */
11171 			error("Aborting %pJ due to use of invalid GRES configuration",
11172 			      job_ptr);
11173 			job_fail = true;
11174 		}
11175 
11176 		_reset_step_bitmaps(job_ptr);
11177 
11178 		/* Do not increase the job->node_cnt for completed jobs */
11179 		if (! IS_JOB_COMPLETED(job_ptr))
11180 			build_node_details(job_ptr, false); /* set node_addr */
11181 
11182 		if (_reset_detail_bitmaps(job_ptr))
11183 			job_fail = true;
11184 
11185 		if (job_fail) {
11186 			if (IS_JOB_PENDING(job_ptr)) {
11187 				job_ptr->start_time =
11188 					job_ptr->end_time = time(NULL);
11189 				job_ptr->job_state = JOB_NODE_FAIL;
11190 			} else if (IS_JOB_RUNNING(job_ptr)) {
11191 				job_ptr->end_time = time(NULL);
11192 				job_ptr->job_state = JOB_NODE_FAIL |
11193 						     JOB_COMPLETING;
11194 				build_cg_bitmap(job_ptr);
11195 			} else if (IS_JOB_SUSPENDED(job_ptr)) {
11196 				job_ptr->end_time = job_ptr->suspend_time;
11197 				job_ptr->job_state = JOB_NODE_FAIL |
11198 						     JOB_COMPLETING;
11199 				build_cg_bitmap(job_ptr);
11200 				job_ptr->tot_sus_time +=
11201 					difftime(now, job_ptr->suspend_time);
11202 				jobacct_storage_g_job_suspend(acct_db_conn,
11203 							      job_ptr);
11204 			}
11205 			job_ptr->state_reason = FAIL_DOWN_NODE;
11206 			xfree(job_ptr->state_desc);
11207 			job_completion_logger(job_ptr, false);
11208 			if (job_ptr->job_state == JOB_NODE_FAIL) {
11209 				/* build_cg_bitmap() may clear JOB_COMPLETING */
11210 				epilog_slurmctld(job_ptr);
11211 			}
11212 		}
11213 	}
11214 
11215 	list_iterator_reset(job_iterator);
11216 	/* This will reinitialize the select plugin database, which
11217 	 * we can only do after ALL job's states and bitmaps are set
11218 	 * (i.e. it needs to be in this second loop) */
11219 	while ((job_ptr = list_next(job_iterator))) {
11220 		if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
11221 			error("select_g_select_nodeinfo_set(%pJ): %m",
11222 			      job_ptr);
11223 		}
11224 	}
11225 	list_iterator_destroy(job_iterator);
11226 
11227 	last_job_update = now;
11228 }
11229 
11230 static int _reset_detail_bitmaps(job_record_t *job_ptr)
11231 {
11232 	if (job_ptr->details == NULL)
11233 		return SLURM_SUCCESS;
11234 
11235 	FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
11236 
11237 	if ((job_ptr->details->req_nodes) &&
11238 	    (node_name2bitmap(job_ptr->details->req_nodes, false,
11239 			      &job_ptr->details->req_node_bitmap))) {
11240 		error("Invalid req_nodes (%s) for %pJ",
11241 		      job_ptr->details->req_nodes, job_ptr);
11242 		return SLURM_ERROR;
11243 	}
11244 
11245 	FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
11246 	if ((job_ptr->details->exc_nodes) &&
11247 	    (node_name2bitmap(job_ptr->details->exc_nodes, true,
11248 			      &job_ptr->details->exc_node_bitmap))) {
11249 		error("Invalid exc_nodes (%s) for %pJ",
11250 		      job_ptr->details->exc_nodes, job_ptr);
11251 		return SLURM_ERROR;
11252 	}
11253 
11254 	return SLURM_SUCCESS;
11255 }
11256 
11257 static void _reset_step_bitmaps(job_record_t *job_ptr)
11258 {
11259 	ListIterator step_iterator;
11260 	step_record_t *step_ptr;
11261 
11262 	step_iterator = list_iterator_create (job_ptr->step_list);
11263 	while ((step_ptr = list_next(step_iterator))) {
11264 		if (step_ptr->state < JOB_RUNNING)
11265 			continue;
11266 		FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
11267 		if (step_ptr->step_layout &&
11268 		    step_ptr->step_layout->node_list &&
11269 		    (node_name2bitmap(step_ptr->step_layout->node_list, false,
11270 				      &step_ptr->step_node_bitmap))) {
11271 			error("Invalid step_node_list (%s) for %pS",
11272 			      step_ptr->step_layout->node_list, step_ptr);
11273 			delete_step_record (job_ptr, step_ptr->step_id);
11274 		} else if (step_ptr->step_node_bitmap == NULL) {
11275 			error("Missing node_list for %pS", step_ptr);
11276 			delete_step_record (job_ptr, step_ptr->step_id);
11277 		}
11278 	}
11279 
11280 	list_iterator_destroy (step_iterator);
11281 	return;
11282 }
11283 
11284 /* update first assigned job id as needed on reconfigure */
11285 void reset_first_job_id(void)
11286 {
11287 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
11288 	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
11289 }
11290 
11291 /*
11292  * Return the next available job_id to be used.
11293  *
11294  * IN test_only - if true, doesn't advance the job_id sequence, just returns
11295  * 	what the next job id will be.
11296  * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
11297  */
11298 extern uint32_t get_next_job_id(bool test_only)
11299 {
11300 	int i;
11301 	uint32_t new_id, max_jobs, tmp_id_sequence;
11302 
11303 	xassert(verify_lock(JOB_LOCK, READ_LOCK));
11304 	xassert(test_only || verify_lock(JOB_LOCK, WRITE_LOCK));
11305 	xassert(verify_lock(FED_LOCK, READ_LOCK));
11306 
11307 	max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id;
11308 	tmp_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
11309 
11310 	/* Ensure no conflict in job id if we roll over 32 bits */
11311 	for (i = 0; i < max_jobs; i++) {
11312 		if (++tmp_id_sequence >= slurmctld_conf.max_job_id)
11313 			tmp_id_sequence = slurmctld_conf.first_job_id;
11314 
11315 		new_id = fed_mgr_get_job_id(tmp_id_sequence);
11316 
11317 		if (find_job_record(new_id))
11318 			continue;
11319 		if (_dup_job_file_test(new_id))
11320 			continue;
11321 
11322 		if (!test_only)
11323 			job_id_sequence = tmp_id_sequence;
11324 
11325 		return new_id;
11326 	}
11327 
11328 	error("We have exhausted our supply of valid job id values. "
11329 	      "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id,
11330 	      slurmctld_conf.max_job_id);
11331 	return SLURM_ERROR;
11332 }
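/*
 * Worked example (illustrative; the configuration values are hypothetical),
 * ignoring the federation mapping: with FirstJobId=1001, MaxJobId=1005 and
 * job_id_sequence=1003, the loop tries 1004, wraps past MaxJobId back to
 * 1001, then 1002 and 1003, and gives up after max_jobs=4 attempts if every
 * candidate is still in use or has a leftover state file.
 */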
11333 
11334 /*
11335  * _set_job_id - set a default job_id, ensure that it is unique
11336  * IN job_ptr - pointer to the job_record
11337  */
11338 static int _set_job_id(job_record_t *job_ptr)
11339 {
11340 	uint32_t new_id;
11341 
11342 	xassert(job_ptr);
11343 	xassert (job_ptr->magic == JOB_MAGIC);
11344 
11345 	if ((new_id = get_next_job_id(false)) != SLURM_ERROR) {
11346 		job_ptr->job_id = new_id;
11347 		/* When we get a new job id, we might as well make sure
11348 		 * the db_index is 0 since there is no way it will be
11349 		 * correct otherwise :). */
11350 		job_ptr->db_index = 0;
11351 		return SLURM_SUCCESS;
11352 	}
11353 
11354 	job_ptr->job_id = NO_VAL;
11355 	return EAGAIN;
11356 }
11357 
11358 
11359 /*
11360  * set_job_prio - set a default job priority
11361  * IN job_ptr - pointer to the job_record
11362  */
11363 extern void set_job_prio(job_record_t *job_ptr)
11364 {
11365 	uint32_t relative_prio;
11366 
11367 	xassert(job_ptr);
11368 	xassert (job_ptr->magic == JOB_MAGIC);
11369 
11370 	if (IS_JOB_FINISHED(job_ptr))
11371 		return;
11372 	job_ptr->priority = slurm_sched_g_initial_priority(lowest_prio,
11373 							   job_ptr);
11374 	if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio))
11375 		return;
11376 
11377 	relative_prio = job_ptr->priority;
11378 	if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) {
11379 		int64_t offset = job_ptr->details->nice;
11380 		offset -= NICE_OFFSET;
11381 		relative_prio += offset;
11382 	}
11383 	lowest_prio = MIN(relative_prio, lowest_prio);
11384 }
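/*
 * Worked example (illustrative numbers): if the initial priority is 1000 and
 * details->nice is NICE_OFFSET + 100, then offset = 100 and
 * relative_prio = 1000 + 100 = 1100, so lowest_prio = MIN(1100, lowest_prio).
 */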
11385 
11386 /* After recovering job state, if using priority/basic then we increment the
11387  * priorities of all jobs to avoid decrementing the base down to zero */
11388 extern void sync_job_priorities(void)
11389 {
11390 	ListIterator job_iterator;
11391 	job_record_t *job_ptr;
11392 	uint32_t prio_boost = 0;
11393 
11394 	if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY))
11395 		prio_boost = TOP_PRIORITY - highest_prio;
11396 	if (xstrcmp(slurmctld_conf.priority_type, "priority/basic") ||
11397 	    (prio_boost < 1000000))
11398 		return;
11399 
11400 	job_iterator = list_iterator_create(job_list);
11401 	while ((job_ptr = list_next(job_iterator))) {
11402 		if ((job_ptr->priority) && (job_ptr->direct_set_prio == 0))
11403 			job_ptr->priority += prio_boost;
11404 	}
11405 	list_iterator_destroy(job_iterator);
11406 	lowest_prio += prio_boost;
11407 }
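/*
 * Illustrative note (not from the original source): the boost is
 * prio_boost = TOP_PRIORITY - highest_prio, and it is applied to every job
 * whose priority is non-zero and whose direct_set_prio is 0, but only when
 * PriorityType=priority/basic and the boost is at least 1000000; lowest_prio
 * is shifted by the same amount so later calls to set_job_prio() stay
 * consistent.
 */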
11408 
11409 /*
11410  * _top_priority - determine if any other job has a higher priority than the
11411  *	specified job
11412  * IN job_ptr - pointer to selected job
11413  * RET true if selected job has highest priority
11414  */
11415 static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset)
11416 {
11417 	struct job_details *detail_ptr = job_ptr->details;
11418 	time_t now = time(NULL);
11419 	int pend_time;
11420 	bool top;
11421 
11422 	if (job_ptr->priority == 0)	/* user held */
11423 		top = false;
11424 	else {
11425 		ListIterator job_iterator;
11426 		job_record_t *job_ptr2;
11427 
11428 		top = true;	/* assume top priority until found otherwise */
11429 		job_iterator = list_iterator_create(job_list);
11430 		while ((job_ptr2 = list_next(job_iterator))) {
11431 			if (job_ptr2 == job_ptr)
11432 				continue;
11433 			if ((het_job_offset != NO_VAL) && (job_ptr->job_id ==
11434 			    (job_ptr2->job_id + het_job_offset)))
11435 				continue;
11436 			if (!IS_JOB_PENDING(job_ptr2))
11437 				continue;
11438 			if (IS_JOB_COMPLETING(job_ptr2)) {
11439 				/* Job is hung in pending & completing state,
11440 				 * indicative of job requeue */
11441 				continue;
11442 			}
11443 
11444 			if (bf_min_age_reserve) {
11445 				if (job_ptr2->details->begin_time == 0)
11446 					continue;
11447 				pend_time = difftime(now, job_ptr2->
11448 						     details->begin_time);
11449 				if (pend_time < bf_min_age_reserve)
11450 					continue;
11451 			}
11452 			if (!acct_policy_job_runnable_state(job_ptr2) ||
11453 			    !misc_policy_job_runnable_state(job_ptr2) ||
11454 			    !part_policy_job_runnable_state(job_ptr2) ||
11455 			    !job_independent(job_ptr2))
11456 				continue;
11457 
11458 			if (!xstrcmp(job_ptr2->resv_name, job_ptr->resv_name) ||
11459 			    (job_ptr2->resv_ptr &&
11460 			     (job_ptr->warn_time <=
11461 			      job_ptr2->resv_ptr->max_start_delay) &&
11462 			     (job_ptr->warn_flags & KILL_JOB_RESV))) {
11463 				/* same reservation */
11464 				if (job_ptr2->priority <= job_ptr->priority)
11465 					continue;
11466 				top = false;
11467 				break;
11468 			} else if ((job_ptr2->resv_name &&
11469 				    (!job_ptr->resv_name)) ||
11470 				   ((!job_ptr2->resv_name) &&
11471 				    job_ptr->resv_name))
11472 				continue;	/* different reservation */
11473 
11474 
11475 			if (bb_g_job_test_stage_in(job_ptr2, true) != 1)
11476 				continue;	/* Waiting for buffer */
11477 
11478 			if (job_ptr2->part_ptr == job_ptr->part_ptr) {
11479 				/* same partition */
11480 				if (job_ptr2->priority <= job_ptr->priority)
11481 					continue;
11482 				top = false;
11483 				break;
11484 			}
11485 			if (bit_overlap_any(job_ptr->part_ptr->node_bitmap,
11486 					    job_ptr2->part_ptr->node_bitmap) == 0)
11487 				continue;   /* no node overlap in partitions */
11488 			if ((job_ptr2->part_ptr->priority_tier >
11489 			     job_ptr ->part_ptr->priority_tier) ||
11490 			    ((job_ptr2->part_ptr->priority_tier ==
11491 			      job_ptr ->part_ptr->priority_tier) &&
11492 			     (job_ptr2->priority >  job_ptr->priority))) {
11493 				top = false;
11494 				break;
11495 			}
11496 		}
11497 		list_iterator_destroy(job_iterator);
11498 	}
11499 
11500 	if ((!top) && detail_ptr) {	/* not top prio */
11501 		if (job_ptr->priority == 0) {		/* user/admin hold */
11502 			if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS
11503 			    && (job_ptr->state_reason != WAIT_RESV_DELETED)
11504 			    && (job_ptr->state_reason != FAIL_BURST_BUFFER_OP)
11505 			    && (job_ptr->state_reason != FAIL_ACCOUNT)
11506 			    && (job_ptr->state_reason != FAIL_QOS)
11507 			    && (job_ptr->state_reason != WAIT_HELD)
11508 			    && (job_ptr->state_reason != WAIT_HELD_USER)
11509 			    && job_ptr->state_reason != WAIT_MAX_REQUEUE) {
11510 				job_ptr->state_reason = WAIT_HELD;
11511 				xfree(job_ptr->state_desc);
11512 			}
11513 		} else if (job_ptr->state_reason == WAIT_NO_REASON &&
11514 			   het_job_offset == NO_VAL) {
11515 			job_ptr->state_reason = WAIT_PRIORITY;
11516 			xfree(job_ptr->state_desc);
11517 		}
11518 	}
11519 	return top;
11520 }
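/*
 * Illustrative example (hypothetical jobs and partitions, not from the
 * original source): a pending job with priority 900 in partition "p1"
 * (priority_tier 10) loses top priority to another pending job in "p1" with
 * priority 1000, or to a job in a node-overlapping partition with
 * priority_tier 20, or with priority_tier 10 and priority 1000.
 */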
11521 
11522 static void _merge_job_licenses(job_record_t *shrink_job_ptr,
11523 				job_record_t *expand_job_ptr)
11524 {
11525 	xassert(shrink_job_ptr);
11526 	xassert(expand_job_ptr);
11527 
11528 	/* FIXME: do we really need to update accounting here?  It
11529 	 * might already happen */
11530 
11531 	if (!shrink_job_ptr->licenses)		/* No licenses to add */
11532 		return;
11533 
11534 	if (!expand_job_ptr->licenses) {	/* Just transfer licenses */
11535 		expand_job_ptr->licenses = shrink_job_ptr->licenses;
11536 		shrink_job_ptr->licenses = NULL;
11537 		FREE_NULL_LIST(expand_job_ptr->license_list);
11538 		expand_job_ptr->license_list = shrink_job_ptr->license_list;
11539 		shrink_job_ptr->license_list = NULL;
11540 		return;
11541 	}
11542 
11543 	/* Merge the license information into expanding job */
11544 	xstrcat(expand_job_ptr->licenses, ",");
11545 	xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses);
11546 	xfree(shrink_job_ptr->licenses);
11547 	FREE_NULL_LIST(expand_job_ptr->license_list);
11548 	FREE_NULL_LIST(shrink_job_ptr->license_list);
11549 	license_job_merge(expand_job_ptr);
11550 	return;
11551 }
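/*
 * Example (illustrative license names): shrinking a job holding "lscratch:2"
 * into an expanding job holding "matlab:1" leaves the expanding job's
 * license string as "matlab:1,lscratch:2"; license_job_merge() then rebuilds
 * its license_list from that merged string.
 */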
11552 
11553 static void _hold_job_rec(job_record_t *job_ptr, uid_t uid)
11554 {
11555 	int i, j;
11556 
11557 	job_ptr->direct_set_prio = 1;
11558 	job_ptr->priority = 0;
11559 
11560 	if (IS_JOB_PENDING(job_ptr))
11561 		acct_policy_remove_accrue_time(job_ptr, false);
11562 
11563 	if (job_ptr->part_ptr_list && job_ptr->priority_array) {
11564 		j = list_count(job_ptr->part_ptr_list);
11565 		for (i = 0; i < j; i++) {
11566 			job_ptr->priority_array[i] = 0;
11567 		}
11568 	}
11569 	sched_info("%s: hold on %pJ by uid %u", __func__, job_ptr, uid);
11570 }
11571 
11572 static void _hold_job(job_record_t *job_ptr, uid_t uid)
11573 {
11574 	job_record_t *het_job_leader = NULL, *het_job;
11575 	ListIterator iter;
11576 
11577 	if (job_ptr->het_job_id && _get_whole_hetjob())
11578 		het_job_leader = find_job_record(job_ptr->het_job_id);
11579 	if (het_job_leader && het_job_leader->het_job_list) {
11580 		iter = list_iterator_create(het_job_leader->het_job_list);
11581 		while ((het_job = list_next(iter)))
11582 			_hold_job_rec(het_job, uid);
11583 		list_iterator_destroy(iter);
11584 		return;
11585 	}
11586 	_hold_job_rec(job_ptr, uid);
11587 }
11588 
11589 static void _release_job_rec(job_record_t *job_ptr, uid_t uid)
11590 {
11591 	time_t now = time(NULL);
11592 	if (job_ptr->details && (job_ptr->details->begin_time < now))
11593 		job_ptr->details->begin_time = 0;
11594 	job_ptr->direct_set_prio = 0;
11595 	set_job_prio(job_ptr);
11596 	job_ptr->state_reason = WAIT_NO_REASON;
11597 	job_ptr->state_reason_prev = WAIT_NO_REASON;
11598 	job_ptr->job_state &= ~JOB_SPECIAL_EXIT;
11599 	xfree(job_ptr->state_desc);
11600 	job_ptr->exit_code = 0;
11601 	fed_mgr_job_requeue(job_ptr); /* submit sibling jobs */
11602 	sched_info("%s: release hold on %pJ by uid %u",
11603 		   __func__, job_ptr, uid);
11604 }
11605 
11606 static void _release_job(job_record_t *job_ptr, uid_t uid)
11607 {
11608 	job_record_t *het_job_leader = NULL, *het_job;
11609 	ListIterator iter;
11610 
11611 	if (job_ptr->het_job_id && _get_whole_hetjob())
11612 		het_job_leader = find_job_record(job_ptr->het_job_id);
11613 	if (het_job_leader && het_job_leader->het_job_list) {
11614 		iter = list_iterator_create(het_job_leader->het_job_list);
11615 		while ((het_job = list_next(iter)))
11616 			_release_job_rec(het_job, uid);
11617 		list_iterator_destroy(iter);
11618 		return;
11619 	}
11620 	_release_job_rec(job_ptr, uid);
11621 }
11622 
11623 /*
11624  * Gets a new association giving priority to the given parameters in job_desc,
11625  * and if not possible using the job_ptr ones.
11626  * IN job_desc: The new job description to use for getting the assoc_ptr.
11627  * IN job_ptr: The original job_ptr to use when parameters are not in job_desc.
11628  * RET assoc_rec, the new association combining the most updated information
11629  * from job_desc.
11630  */
11631 static slurmdb_assoc_rec_t *_retrieve_new_assoc(job_desc_msg_t *job_desc,
11632 						job_record_t *job_ptr)
11633 {
11634 	slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;
11635 
11636 	memset(&assoc_rec, 0, sizeof(assoc_rec));
11637 
11638 	if (job_desc->partition) {
11639 		part_record_t *part_ptr = NULL;
11640 		int error_code =
11641 			_get_job_parts(job_desc, &part_ptr, NULL, NULL);
11642 		/* We don't need this; we only care about part_ptr */
11643 		if (error_code != SLURM_SUCCESS) {
11644 			errno = error_code;
11645 			return NULL;
11646 		} else if (!(part_ptr->state_up & PARTITION_SUBMIT)) {
11647 			errno = ESLURM_PARTITION_NOT_AVAIL;
11648 			return NULL;
11649 		}
11650 
11651 		assoc_rec.partition = part_ptr->name;
11652 	} else if (job_ptr->part_ptr)
11653 		assoc_rec.partition = job_ptr->part_ptr->name;
11654 
11655 	if (job_desc->account)
11656 		assoc_rec.acct = job_desc->account;
11657 	else
11658 		assoc_rec.acct = job_ptr->account;
11659 
11660 	assoc_rec.uid = job_ptr->user_id;
11661 
11662 	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
11663 				    accounting_enforce,
11664 				    &assoc_ptr, false)) {
11665 		info("%s: invalid account %s for %pJ",
11666 		     __func__, assoc_rec.acct, job_ptr);
11667 		errno = ESLURM_INVALID_ACCOUNT;
11668 		return NULL;
11669 	} else if (association_based_accounting &&
11670 		   !assoc_ptr &&
11671 		   !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) &&
11672 		   assoc_rec.acct) {
11673 		/* if not enforcing associations we want to look for
11674 		 * the default account and use it to avoid getting
11675 		 * trash in the accounting records.
11676 		 */
11677 		assoc_rec.acct = NULL;
11678 		(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
11679 					       accounting_enforce,
11680 					       &assoc_ptr, false);
11681 	}
11682 
11683 	return assoc_ptr;
11684 }
11685 
11686 /* Allocate nodes to new job. Old job info will be cleared at epilog complete */
11687 static void _realloc_nodes(job_record_t *job_ptr, bitstr_t *orig_node_bitmap)
11688 {
11689 	int i, i_first, i_last;
11690 	node_record_t *node_ptr;
11691 
11692 	xassert(job_ptr);
11693 	xassert(orig_node_bitmap);
11694 	if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap)
11695 		return;
11696 	i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
11697 	if (i_first >= 0)
11698 		i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
11699 	else
11700 		i_last = -1;
11701 	for (i = i_first; i <= i_last; i++) {
11702 		if (!bit_test(job_ptr->job_resrcs->node_bitmap, i) ||
11703 		    bit_test(orig_node_bitmap, i))
11704 			continue;
11705 		node_ptr = node_record_table_ptr + i;
11706 		make_node_alloc(node_ptr, job_ptr);
11707 	}
11708 }
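/*
 * Illustrative note (hypothetical node indices): the bit_ffs()/bit_fls()
 * pair bounds the bitmap walk; with job_resrcs->node_bitmap = {2,5} and
 * orig_node_bitmap = {2}, only node index 5 is newly allocated via
 * make_node_alloc().
 */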
11709 
11710 extern bool permit_job_expansion(void)
11711 {
11712 	static time_t sched_update = 0;
11713 	static bool permit_job_expansion = false;
11714 
11715 	if (sched_update != slurmctld_conf.last_update) {
11716 		char *sched_params = slurm_get_sched_params();
11717 		sched_update = slurmctld_conf.last_update;
11718 		if (xstrcasestr(sched_params, "permit_job_expansion"))
11719 			permit_job_expansion = true;
11720 		else
11721 			permit_job_expansion = false;
11722 		xfree(sched_params);
11723 	}
11724 
11725 	return permit_job_expansion;
11726 }
11727 
11728 extern bool permit_job_shrink(void)
11729 {
11730 	static time_t sched_update = 0;
11731 	static bool permit_job_shrink = false;
11732 
11733 	if (sched_update != slurmctld_conf.last_update) {
11734 		char *sched_params = slurm_get_sched_params();
11735 		sched_update = slurmctld_conf.last_update;
11736 		if (xstrcasestr(sched_params, "disable_job_shrink"))
11737 			permit_job_shrink = false;
11738 		else
11739 			permit_job_shrink = true;
11740 		xfree(sched_params);
11741 	}
11742 
11743 	return permit_job_shrink;
11744 }
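/*
 * Illustrative note (not from the original source): both helpers above
 * re-parse SchedulerParameters only when slurmctld_conf.last_update changes.
 * For example, SchedulerParameters=permit_job_expansion,disable_job_shrink
 * would make permit_job_expansion() return true and permit_job_shrink()
 * return false until the next reconfigure.
 */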
11745 
11746 static int _update_job(job_record_t *job_ptr, job_desc_msg_t *job_specs,
11747 		       uid_t uid)
11748 {
11749 	int error_code = SLURM_SUCCESS;
11750 	enum job_state_reason fail_reason;
11751 	bool operator = false;
11752 	bool is_coord_oldacc = false, is_coord_newacc = false;
11753 	uint32_t save_min_nodes = 0, save_max_nodes = 0;
11754 	uint32_t save_min_cpus = 0, save_max_cpus = 0;
11755 	struct job_details *detail_ptr;
11756 	part_record_t *new_part_ptr = NULL, *use_part_ptr = NULL;
11757 	bitstr_t *exc_bitmap = NULL, *new_req_bitmap = NULL;
11758 	time_t now = time(NULL);
11759 	multi_core_data_t *mc_ptr = NULL;
11760 	bool update_accounting = false, new_req_bitmap_given = false;
11761 	acct_policy_limit_set_t acct_policy_limit_set;
11762 	uint16_t tres[slurmctld_tres_cnt];
11763 	bool acct_limit_already_exceeded;
11764 	bool tres_changed = false;
11765 	int tres_pos;
11766 	uint64_t tres_req_cnt[slurmctld_tres_cnt];
11767 	bool tres_req_cnt_set = false, valid_licenses = false;
11768 	List gres_list = NULL;
11769 	List license_list = NULL;
11770 	List part_ptr_list = NULL;
11771 	uint32_t orig_time_limit;
11772 	bool gres_update = false;
11773 	slurmdb_assoc_rec_t *new_assoc_ptr = NULL, *use_assoc_ptr = NULL;
11774 	slurmdb_qos_rec_t *new_qos_ptr = NULL, *use_qos_ptr = NULL;
11775 	slurmctld_resv_t *new_resv_ptr = NULL;
11776 	uint32_t user_site_factor;
11777 
11778 	assoc_mgr_lock_t locks = { .tres = READ_LOCK };
11779 
11780 	/*
11781 	 * This means we are in the middle of requesting the db_inx from the
11782 	 * database. So we can't update right now.  You should try again outside
11783 	 * the job_write lock in a second or so.
11784 	 */
11785 	if (job_ptr->db_index == NO_VAL64)
11786 		return ESLURM_JOB_SETTING_DB_INX;
11787 
11788 	operator = validate_operator(uid);
11789 	if (job_specs->burst_buffer) {
11790 		/*
11791 		 * burst_buffer contents are validated at job submit time and
11792 		 * data is possibly being staged at later times. It can not
11793 		 * be changed except to clear the value on a completed job and
11794 		 * purge the record in order to recover from a failure mode
11795 		 */
11796 		if (IS_JOB_COMPLETED(job_ptr) && operator &&
11797 		    (job_specs->burst_buffer[0] == '\0')) {
11798 			xfree(job_ptr->burst_buffer);
11799 			last_job_update = now;
11800 		} else {
11801 			error_code = ESLURM_NOT_SUPPORTED;
11802 		}
11803 	}
11804 	if (error_code != SLURM_SUCCESS)
11805 		goto fini;
11806 
11807 	if (job_specs->array_inx && job_ptr->array_recs) {
11808 		int throttle;
11809 		throttle = strtoll(job_specs->array_inx, (char **) NULL, 10);
11810 		if (throttle >= 0) {
11811 			info("%s: set max_run_tasks to %d for job array %pJ",
11812 			     __func__, throttle, job_ptr);
11813 			job_ptr->array_recs->max_run_tasks = throttle;
11814 		} else {
11815 			info("%s: invalid max_run_tasks of %d for job array %pJ, ignored",
11816 			     __func__, throttle, job_ptr);
11817 			error_code = ESLURM_BAD_TASK_COUNT;
11818 		}
11819 		/*
11820 		 * Even if the job is complete, permit changing
11821 		 * ArrayTaskThrottle for other elements of the task array
11822 		 */
11823 		if (IS_JOB_FINISHED(job_ptr))
11824 			goto fini;
11825 	}
11826 
11827 	if (IS_JOB_FINISHED(job_ptr)) {
11828 		error_code = ESLURM_JOB_FINISHED;
11829 		goto fini;
11830 	}
11831 
11832 	/*
11833 	 * Validate before job_submit_plugin_modify() so that the job_submit
11834 	 * plugin can make changes to the field without triggering an auth
11835 	 * issue.
11836 	 */
11837 	if (job_specs->admin_comment && !validate_super_user(uid)) {
11838 		error("Attempt to change admin_comment for %pJ", job_ptr);
11839 		error_code = ESLURM_ACCESS_DENIED;
11840 		goto fini;
11841 	}
11842 
11843 	/* Save before submit plugin potentially modifies it. */
11844 	user_site_factor = job_specs->site_factor;
11845 
11846 	if (job_specs->user_id == NO_VAL) {
11847 		/*
11848 		 * Used by job_submit/lua to find default partition and
11849 		 * access control logic below to validate partition change
11850 		 */
11851 		job_specs->user_id = job_ptr->user_id;
11852 	}
11853 	error_code = job_submit_plugin_modify(job_specs, job_ptr,
11854 					      (uint32_t) uid);
11855 	if (error_code != SLURM_SUCCESS)
11856 		return error_code;
11857 	error_code = node_features_g_job_valid(job_specs->features);
11858 	if (error_code != SLURM_SUCCESS)
11859 		return error_code;
11860 
11861 	error_code = _test_job_desc_fields(job_specs);
11862 	if (error_code != SLURM_SUCCESS)
11863 		return error_code;
11864 
11865 	memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
11866 	acct_policy_limit_set.tres = tres;
11867 
11868 	if (operator) {
11869 		/* set up the acct_policy if we are at least an operator */
11870 		for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++)
11871 			acct_policy_limit_set.tres[tres_pos] = ADMIN_SET_LIMIT;
11872 		acct_policy_limit_set.time = ADMIN_SET_LIMIT;
11873 		acct_policy_limit_set.qos = ADMIN_SET_LIMIT;
11874 	} else
11875 		memset(tres, 0, sizeof(tres));
11876 
11877 	/* Check authorization for modifying this job */
11878 	is_coord_oldacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
11879 							     job_ptr->account);
11880 	is_coord_newacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
11881 							   job_specs->account);
11882 	if ((job_ptr->user_id != uid) && !operator) {
11883 		/*
11884 		 * Fail if we are not coordinators of the current account or
11885 		 * if we are changing an account and  we are not coordinators
11886 		 * of both src and dest accounts.
11887 		 */
11888 		if (!is_coord_oldacc ||
11889 		    (!is_coord_newacc && job_specs->account)) {
11890 			error("Security violation, JOB_UPDATE RPC from uid %d",
11891 			      uid);
11892 			return ESLURM_USER_ID_MISSING;
11893 		}
11894 	}
11895 
11896 	detail_ptr = job_ptr->details;
11897 	if (detail_ptr)
11898 		mc_ptr = detail_ptr->mc_ptr;
11899 	last_job_update = now;
11900 
11901 	/*
11902 	 * Check to see if the new requested job_specs exceeds any
11903 	 * existing limit. If it passes, cool, we will check the new
11904 	 * association/qos/part later in the code and fail if it is wrong.
11905 	 *
11906 	 * If it doesn't pass, this means some limit was already exceeded
11907 	 * before the update request, so let the user keep operating over that
11908 	 * limit if that is what they want. We do this by not exiting
11909 	 * on the later call to acct_policy_validate() if it fails.
11910 	 *
11911 	 * We will also prevent the update to return an error code that is
11912 	 * confusing since many things could successfully update and we are now
11913 	 * just already violating a limit. The job won't be allowed to run,
11914 	 * but it will allow the update to happen which is most likely what
11915 	 * was desired.
11916 	 *
11917 	 * Changes in between this check and the next acct_policy_validate()
11918 	 * will not be constrained to accounting enforce limits.
11919 	 */
11920 	orig_time_limit = job_specs->time_limit;
11921 
11922 	memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt));
11923 	job_specs->tres_req_cnt = tres_req_cnt;
11924 	tres_req_cnt_set = true;
11925 
11926 	acct_limit_already_exceeded = false;
11927 
11928 	if (!operator && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
11929 		if (!acct_policy_validate(job_specs, job_ptr->part_ptr,
11930 					  job_ptr->assoc_ptr, job_ptr->qos_ptr,
11931 					  NULL, &acct_policy_limit_set,
11932 					  true)) {
11933 			debug("%s: already exceeded association's cpu, node, "
11934 			      "memory or time limit for user %u",
11935 			      __func__, job_specs->user_id);
11936 			acct_limit_already_exceeded = true;
11937 		}
11938 		job_specs->time_limit = orig_time_limit;
11939 	}
11940 
11941 	/*
11942 	 * The partition, assoc, qos, reservation, and req_node_bitmap all have
11943 	 * to be set before checking later.  So here we set them into temporary
11944 	 * variables set in the job way later.
11945 	 */
11946 	if (job_specs->partition &&
11947 	    !xstrcmp(job_specs->partition, job_ptr->partition)) {
11948 		sched_debug("%s: new partition identical to old partition %pJ",
11949 			    __func__, job_ptr);
11950 	} else if (job_specs->partition) {
11951 		if (!IS_JOB_PENDING(job_ptr)) {
11952 			error_code = ESLURM_JOB_NOT_PENDING;
11953 			goto fini;
11954 		}
11955 
11956 		error_code = _get_job_parts(job_specs,
11957 					    &new_part_ptr,
11958 					    &part_ptr_list, NULL);
11959 
11960 		if (error_code != SLURM_SUCCESS)
11961 			;
11962 		else if ((new_part_ptr->state_up & PARTITION_SUBMIT) == 0)
11963 			error_code = ESLURM_PARTITION_NOT_AVAIL;
11964 		else if (!part_ptr_list &&
11965 			 !xstrcmp(new_part_ptr->name, job_ptr->partition)) {
11966 			sched_debug("%s: 2 new partition identical to old partition %pJ",
11967 				    __func__, job_ptr);
11968 			new_part_ptr = NULL;
11969 		}
11970 		if (error_code != SLURM_SUCCESS)
11971 			goto fini;
11972 	}
11973 
11974 	use_part_ptr = new_part_ptr ? new_part_ptr : job_ptr->part_ptr;
11975 
11976 	/* Check the account and the partition as both affect the association */
11977 	if (job_specs->account || new_part_ptr) {
11978 		if (!IS_JOB_PENDING(job_ptr))
11979 			error_code = ESLURM_JOB_NOT_PENDING;
11980 		else {
11981 			new_assoc_ptr = _retrieve_new_assoc(job_specs, job_ptr);
11982 
11983 			if (!new_assoc_ptr)
11984 				error_code = errno;
11985 			else if (new_assoc_ptr == job_ptr->assoc_ptr) {
11986 				new_assoc_ptr = NULL;
11987 				sched_debug("%s: new association identical to old association %u",
11988 					    __func__, job_ptr->job_id);
11989 			}
11990 
11991 			/*
11992 			 * Clear errno that may have been set by
11993 			 * _retrieve_new_assoc.
11994 			 */
11995 			errno = 0;
11996 		}
11997 
11998 		if (error_code != SLURM_SUCCESS)
11999 			goto fini;
12000 	}
12001 
12002 	use_assoc_ptr = new_assoc_ptr ?	new_assoc_ptr : job_ptr->assoc_ptr;
12003 
12004 	if (job_specs->qos) {
12005 		slurmdb_qos_rec_t qos_rec;
12006 		char *resv_name;
12007 
12008 		if (job_specs->reservation
12009 		    && job_specs->reservation[0] != '\0')
12010 			resv_name = job_specs->reservation;
12011 		else
12012 			resv_name = job_ptr->resv_name;
12013 
12014 		memset(&qos_rec, 0, sizeof(qos_rec));
12015 
12016 		/* If the qos is blank that means we want the default */
12017 		if (job_specs->qos[0])
12018 			qos_rec.name = job_specs->qos;
12019 
12020 		new_qos_ptr = _determine_and_validate_qos(
12021 			resv_name, use_assoc_ptr,
12022 			operator, &qos_rec, &error_code, false,
12023 			LOG_LEVEL_ERROR);
12024 		if ((error_code == SLURM_SUCCESS) && new_qos_ptr) {
12025 			if (job_ptr->qos_ptr == new_qos_ptr) {
12026 				sched_debug("%s: new QOS identical to old QOS %pJ",
12027 					    __func__, job_ptr);
12028 				new_qos_ptr = NULL;
12029 			} else if (!IS_JOB_PENDING(job_ptr)) {
12030 				error_code = ESLURM_JOB_NOT_PENDING;
12031 				new_qos_ptr = NULL;
12032 			}
12033 		}
12034 
12035 		if (error_code != SLURM_SUCCESS)
12036 			goto fini;
12037 	}
12038 
12039 	use_qos_ptr = new_qos_ptr ? new_qos_ptr : job_ptr->qos_ptr;
12040 
12041 	if (job_specs->bitflags & RESET_ACCRUE_TIME) {
12042 		if (!IS_JOB_PENDING(job_ptr) || !detail_ptr) {
12043 			error_code = ESLURM_JOB_NOT_PENDING;
12044 			goto fini;
12045 		} else
12046 			acct_policy_remove_accrue_time(job_ptr, false);
12047 	}
12048 
12049 	/*
12050 	 * Must check req_nodes to set the job_ptr->details->req_node_bitmap
12051 	 * before we validate it later.
12052 	 */
12053 	if (job_specs->req_nodes &&
12054 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
12055 		/*
12056 		 * Use req_nodes to change the nodes associated with a running job,
12057 		 * for lack of any other field in the job request to use
12058 		 */
12059 		if (!permit_job_shrink()) {
12060 			error("%s: request to shrink %pJ denied by configuration",
12061 			      __func__, job_ptr);
12062 			error_code = ESLURM_NOT_SUPPORTED;
12063 			goto fini;
12064 		} else if ((job_specs->req_nodes[0] == '\0') ||
12065 		    node_name2bitmap(job_specs->req_nodes,
12066 				     false, &new_req_bitmap) ||
12067 		    !bit_super_set(new_req_bitmap, job_ptr->node_bitmap) ||
12068 		    (job_ptr->details && job_ptr->details->expanding_jobid)) {
12069 			sched_info("%s: Invalid node list (%s) for %pJ update",
12070 				   __func__, job_specs->req_nodes, job_ptr);
12071 			error_code = ESLURM_INVALID_NODE_NAME;
12072 			goto fini;
12073 		} else if (new_req_bitmap) {
12074 			int i, i_first, i_last;
12075 			node_record_t *node_ptr;
12076 			bitstr_t *rem_nodes;
12077 
12078 			/*
12079 			 * They requested a new list of nodes for the job. If
12080 			 * the batch host isn't in this list, then deny this
12081 			 * request.
12082 			 */
12083 			if (job_ptr->batch_flag) {
12084 				bitstr_t *batch_host_bitmap;
12085 				if (node_name2bitmap(job_ptr->batch_host, false,
12086 						     &batch_host_bitmap))
12087 					error("%s: Invalid batch host %s for %pJ; this should never happen",
12088 					      __func__, job_ptr->batch_host,
12089 					      job_ptr);
12090 				else if (!bit_overlap_any(batch_host_bitmap,
12091 							  new_req_bitmap)) {
12092 					error("%s: Batch host %s for %pJ is not in the requested node list %s. You cannot remove the batch host from a job when resizing.",
12093 					      __func__, job_ptr->batch_host,
12094 					      job_ptr, job_specs->req_nodes);
12095 					error_code = ESLURM_INVALID_NODE_NAME;
12096 					bit_free(batch_host_bitmap);
12097 					goto fini;
12098 				} else
12099 					bit_free(batch_host_bitmap);
12100 			}
12101 
12102 			sched_info("%s: setting nodes to %s for %pJ",
12103 				   __func__, job_specs->req_nodes, job_ptr);
12104 			job_pre_resize_acctg(job_ptr);
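			/*
			 * Build a bitmap of the nodes being removed:
			 * every node currently allocated to the job
			 * that is not in the new node list.
			 */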
12105 			i_first = bit_ffs(job_ptr->node_bitmap);
12106 			if (i_first >= 0)
12107 				i_last  = bit_fls(job_ptr->node_bitmap);
12108 			else
12109 				i_last = -2;
12110 			rem_nodes = bit_alloc(bit_size(job_ptr->node_bitmap));
12111 			for (i = i_first; i <= i_last; i++) {
12112 				if (bit_test(new_req_bitmap, i) ||
12113 				    !bit_test(job_ptr->node_bitmap, i))
12114 					continue;
12115 				bit_set(rem_nodes, i);
12116 			}
12117 #ifndef HAVE_FRONT_END
12118 			abort_job_on_nodes(job_ptr, rem_nodes);
12119 #endif
12120 			for (i = i_first; i <= i_last; i++) {
12121 				if (!bit_test(rem_nodes, i))
12122 					continue;
12123 				node_ptr = node_record_table_ptr + i;
12124 				kill_step_on_node(job_ptr, node_ptr, false);
12125 				excise_node_from_job(job_ptr, node_ptr);
12126 			}
12127 			bit_free(rem_nodes);
12128 			(void) gs_job_start(job_ptr);
12129 			gres_build_job_details(job_ptr->gres_list,
12130 					       &job_ptr->gres_detail_cnt,
12131 					       &job_ptr->gres_detail_str,
12132 					       &job_ptr->gres_used);
12133 			job_post_resize_acctg(job_ptr);
12134 			/*
12135 			 * Since job_post_resize_acctg will restart
12136 			 * things, don't do it again.
12137 			 */
12138 			update_accounting = false;
12139 		} else {
12140 			update_accounting = true;
12141 		}
12142 		FREE_NULL_BITMAP(new_req_bitmap);
12143 	} else if (job_specs->req_nodes) {
12144 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12145 			error_code = ESLURM_JOB_NOT_PENDING;
12146 		else if (job_specs->req_nodes[0] == '\0')
12147 			new_req_bitmap_given = true;
12148 		else {
12149 			if (node_name2bitmap(job_specs->req_nodes, false,
12150 					     &new_req_bitmap)) {
12151 				sched_info("%s: Invalid node list for job_update: %s",
12152 					   __func__, job_specs->req_nodes);
12153 				FREE_NULL_BITMAP(new_req_bitmap);
12154 				error_code = ESLURM_INVALID_NODE_NAME;
12155 			} else
12156 				new_req_bitmap_given = true;
12157 		}
12158 	}
12159 
12160 	if (error_code != SLURM_SUCCESS)
12161 		goto fini;
12162 
12163 	/* this needs to be after partition and QOS checks */
12164 	if (job_specs->reservation
12165 	    && (!xstrcmp(job_specs->reservation, job_ptr->resv_name) ||
12166 		(!job_ptr->resv_name && job_specs->reservation[0] == '\0'))) {
12167 		sched_debug("%s: new reservation identical to old reservation %pJ",
12168 			    __func__, job_ptr);
12169 	} else if (job_specs->reservation) {
12170 		if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
12171 			error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
12172 		} else {
12173 			job_record_t tmp_job_rec;
12174 
12175 			memcpy(&tmp_job_rec, job_ptr, sizeof(job_record_t));
12176 			tmp_job_rec.resv_name = xstrdup(job_specs->reservation);
12177 			tmp_job_rec.resv_ptr = NULL;
12178 			tmp_job_rec.part_ptr = use_part_ptr;
12179 			tmp_job_rec.qos_ptr = use_qos_ptr;
12180 			tmp_job_rec.assoc_ptr = use_assoc_ptr;
12181 
12182 			error_code = validate_job_resv(&tmp_job_rec);
12183 
			/*
			 * Save the resulting reservation pointer regardless
			 * of the return code; on failure it will be NULL.
			 */
12188 			new_resv_ptr = tmp_job_rec.resv_ptr;
12189 
12190 			/*
12191 			 * Make sure this job isn't using a partition or QOS
12192 			 * that requires it to be in a reservation.
12193 			 */
12194 			if ((error_code == SLURM_SUCCESS) && !new_resv_ptr) {
12195 				if (use_part_ptr
12196 				    && use_part_ptr->flags & PART_FLAG_REQ_RESV)
12197 					error_code = ESLURM_ACCESS_DENIED;
12198 
12199 				if (use_qos_ptr
12200 				    && use_qos_ptr->flags & QOS_FLAG_REQ_RESV)
12201 					error_code = ESLURM_INVALID_QOS;
12202 			}
12203 
12204 			xfree(tmp_job_rec.resv_name);
12205 		}
12206 		if (error_code != SLURM_SUCCESS)
12207 			goto fini;
12208 	}
12209 
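	/*
	 * A change to any of the TRES/GRES options below requires the full
	 * GRES request to be revalidated, so just note that one was given.
	 */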
12210 	if (job_specs->cpus_per_tres   || job_specs->tres_per_job    ||
12211 	    job_specs->tres_per_node   || job_specs->tres_per_socket ||
12212 	    job_specs->tres_per_task   || job_specs->mem_per_tres)
12213 		gres_update = true;
12214 	if (gres_update) {
12215 		uint16_t orig_ntasks_per_socket = NO_VAL16;
12216 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
12217 		    (detail_ptr->expanding_jobid != 0)) {
12218 			error_code = ESLURM_JOB_NOT_PENDING;
12219 			goto fini;
12220 		}
12221 
12222 		if (job_specs->num_tasks == NO_VAL)
12223 			job_specs->num_tasks = detail_ptr->num_tasks;
12224 		if (job_specs->min_nodes == NO_VAL)
12225 			job_specs->min_nodes = detail_ptr->min_nodes;
12226 		if (job_specs->max_nodes == NO_VAL)
12227 			job_specs->max_nodes = detail_ptr->max_nodes;
12228 		if (job_specs->ntasks_per_node == NO_VAL16)
12229 			job_specs->ntasks_per_node = detail_ptr->ntasks_per_node;
12230 		if ((job_specs->ntasks_per_socket == NO_VAL16) &&
12231 		    (detail_ptr->mc_ptr) &&
12232 		    (detail_ptr->mc_ptr->ntasks_per_socket != INFINITE16)) {
			job_specs->ntasks_per_socket =
				detail_ptr->mc_ptr->ntasks_per_socket;
12235 			orig_ntasks_per_socket = job_specs->ntasks_per_socket;
12236 		}
12237 		if (job_specs->cpus_per_task == NO_VAL16)
12238 			job_specs->cpus_per_task = detail_ptr->cpus_per_task;
12239 		gres_list = gres_plugin_job_state_dup(job_ptr->gres_list);
12240 		if ((error_code = gres_plugin_job_state_validate(
12241 						job_specs->cpus_per_tres,
12242 						job_specs->tres_freq,
12243 						job_specs->tres_per_job,
12244 						job_specs->tres_per_node,
12245 						job_specs->tres_per_socket,
12246 						job_specs->tres_per_task,
12247 						job_specs->mem_per_tres,
12248 						&job_specs->num_tasks,
12249 						&job_specs->min_nodes,
12250 						&job_specs->max_nodes,
12251 						&job_specs->ntasks_per_node,
12252 						&job_specs->ntasks_per_socket,
12253 						&job_specs->sockets_per_node,
12254 						&job_specs->cpus_per_task,
12255 						&gres_list))) {
12256 			sched_info("%s: invalid GRES for %pJ",
12257 				   __func__, job_ptr);
12258 			goto fini;
12259 		}
12260 		if (job_specs->num_tasks == detail_ptr->num_tasks)
12261 			job_specs->num_tasks = NO_VAL;	/* Unchanged */
12262 		if (job_specs->min_nodes == detail_ptr->min_nodes)
12263 			job_specs->min_nodes = NO_VAL;	/* Unchanged */
12264 		if (job_specs->max_nodes == detail_ptr->max_nodes)
12265 			job_specs->max_nodes = NO_VAL;	/* Unchanged */
12266 		if (job_specs->ntasks_per_node == detail_ptr->ntasks_per_node)
12267 			job_specs->ntasks_per_node = NO_VAL16;	/* Unchanged */
12268 		if (job_specs->ntasks_per_socket == orig_ntasks_per_socket)
12269 			job_specs->ntasks_per_socket = NO_VAL16; /* Unchanged */
12270 		if (job_specs->cpus_per_task == detail_ptr->cpus_per_task)
12271 			job_specs->cpus_per_task = NO_VAL16;	/* Unchanged */
12272 	}
12273 	if (gres_update) {
12274 		gres_set_job_tres_cnt(gres_list, detail_ptr->min_nodes,
12275 				      job_specs->tres_req_cnt, false);
12276 	}
12277 
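	/*
	 * When the requested node count changes, keep the CPU request
	 * consistent with it: the job needs at least pn_min_cpus CPUs per
	 * node and at least one task per node (scaled by cpus_per_task), so
	 * raise min_cpus if the current request falls short.
	 */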
12278 	if ((job_specs->min_nodes != NO_VAL) &&
12279 	    (job_specs->min_nodes != INFINITE)) {
12280 		uint32_t min_cpus = (job_specs->pn_min_cpus != NO_VAL16 ?
12281 			job_specs->pn_min_cpus : detail_ptr->pn_min_cpus) *
12282 			job_specs->min_nodes;
12283 		uint32_t num_cpus = job_specs->min_cpus != NO_VAL ?
12284 			job_specs->min_cpus :
12285 			job_ptr->tres_req_cnt[TRES_ARRAY_CPU];
12286 		uint32_t num_tasks = job_specs->num_tasks != NO_VAL ?
12287 			job_specs->num_tasks : detail_ptr->num_tasks;
12288 
12289 		if (!num_tasks) {
			num_tasks = detail_ptr->min_nodes;
		} else if (num_tasks < job_specs->min_nodes) {
12293 			info("%s: adjusting num_tasks (prev: %u) to be at least min_nodes: %u",
12294 			     __func__, num_tasks, job_specs->min_nodes);
12295 			num_tasks = job_specs->min_nodes;
12296 			if (IS_JOB_PENDING(job_ptr))
12297 				job_specs->num_tasks = num_tasks;
12298 		}
12299 
12300 		num_tasks *= job_specs->cpus_per_task != NO_VAL16 ?
12301 			job_specs->cpus_per_task : detail_ptr->cpus_per_task;
12302 		num_tasks = MAX(num_tasks, min_cpus);
12303 		if (num_tasks > num_cpus) {
			info("%s: adjusting min_cpus (prev: %u) to be at least: %u",
12305 			     __func__, num_cpus, num_tasks);
12306 			job_specs->min_cpus = num_tasks;
12307 
12308 			job_specs->pn_min_memory =
12309 				job_specs->pn_min_memory != NO_VAL64 ?
12310 				job_specs->pn_min_memory :
12311 				detail_ptr->pn_min_memory;
12312 		}
12313 
12314 		assoc_mgr_lock(&locks);
12315 
12316 		if (!job_specs->licenses) {
12317 			license_set_job_tres_cnt(job_ptr->license_list,
12318 						 job_specs->tres_req_cnt,
12319 						 true);
12320 		}
12321 		assoc_mgr_unlock(&locks);
12322 
12323 
12324 		job_specs->tres_req_cnt[TRES_ARRAY_NODE] = job_specs->min_nodes;
12325 	}
12326 
12327 	if (job_specs->min_cpus != NO_VAL)
12328 		job_specs->tres_req_cnt[TRES_ARRAY_CPU] = job_specs->min_cpus;
12329 	else if ((job_specs->pn_min_cpus != NO_VAL16) &&
12330 		 (job_specs->pn_min_cpus != 0)) {
12331 		job_specs->tres_req_cnt[TRES_ARRAY_CPU] =
12332 			job_specs->pn_min_cpus *
12333 			(job_specs->min_nodes != NO_VAL ?
12334 			 job_specs->min_nodes :
12335 			 detail_ptr ? detail_ptr->min_nodes : 1);
12336 		job_specs->min_cpus = job_specs->tres_req_cnt[TRES_ARRAY_CPU];
12337 	}
12338 
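	/*
	 * Recompute the memory TRES request from the (possibly updated)
	 * memory limit, CPU count and node count.
	 */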
12339 	job_specs->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(NULL,
12340 		job_specs->pn_min_memory,
12341 		job_specs->tres_req_cnt[TRES_ARRAY_CPU] ?
12342 		job_specs->tres_req_cnt[TRES_ARRAY_CPU] :
12343 		job_ptr->tres_req_cnt[TRES_ARRAY_CPU],
12344 		job_specs->min_nodes != NO_VAL ?
12345 		job_specs->min_nodes :
12346 		detail_ptr ? detail_ptr->min_nodes : 1);
12347 
12348 	if (job_specs->licenses && !xstrcmp(job_specs->licenses,
12349 					    job_ptr->licenses)) {
12350 		sched_debug("%s: new licenses identical to old licenses \"%s\"",
12351 			    __func__, job_ptr->licenses);
12352 	} else if (job_specs->licenses) {
12353 		bool pending = IS_JOB_PENDING(job_ptr);
12354 		license_list = license_validate(job_specs->licenses, true, true,
12355 						pending ?
12356 						job_specs->tres_req_cnt : NULL,
12357 						&valid_licenses);
12358 
12359 		if (!valid_licenses) {
12360 			sched_info("%s: invalid licenses: %s",
12361 				   __func__, job_specs->licenses);
12362 			error_code = ESLURM_INVALID_LICENSES;
12363 		}
12364 	}
12365 
12366 	if (error_code != SLURM_SUCCESS)
12367 		goto fini;
12368 
12369 	if (job_specs->exc_nodes && detail_ptr &&
12370 	    !xstrcmp(job_specs->exc_nodes, detail_ptr->exc_nodes)) {
12371 		sched_debug("%s: new exc_nodes identical to old exc_nodes %s",
12372 			    __func__, job_specs->exc_nodes);
12373 	} else if (job_specs->exc_nodes) {
12374 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12375 			error_code = ESLURM_JOB_NOT_PENDING;
12376 		else if (job_specs->exc_nodes[0] == '\0') {
12377 			xfree(detail_ptr->exc_nodes);
12378 			FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
12379 		} else {
12380 			if (node_name2bitmap(job_specs->exc_nodes, false,
12381 					     &exc_bitmap)) {
12382 				sched_error("%s: Invalid node list for update of %pJ: %s",
12383 					    __func__, job_ptr,
12384 					    job_specs->exc_nodes);
12385 				FREE_NULL_BITMAP(exc_bitmap);
12386 				error_code = ESLURM_INVALID_NODE_NAME;
12387 			}
12388 			if (exc_bitmap) {
12389 				xfree(detail_ptr->exc_nodes);
12390 				detail_ptr->exc_nodes =
12391 					xstrdup(job_specs->exc_nodes);
12392 				FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
12393 				detail_ptr->exc_node_bitmap = exc_bitmap;
12394 				sched_info("%s: setting exc_nodes to %s for %pJ",
12395 					   __func__, job_specs->exc_nodes, job_ptr);
12396 			}
12397 		}
12398 	}
12399 	if (error_code != SLURM_SUCCESS)
12400 		goto fini;
12401 
12402 	if (job_specs->min_nodes == INFINITE) {
12403 		/* Used by scontrol just to get current configuration info */
12404 		job_specs->min_nodes = NO_VAL;
12405 	}
12406 	if ((job_specs->min_nodes != NO_VAL) &&
12407 	    (job_specs->min_nodes > job_ptr->node_cnt) &&
12408 	    !permit_job_expansion() &&
12409 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
12410 		info("%s: Change of size for %pJ not supported",  __func__,
12411 		     job_ptr);
12412 		error_code = ESLURM_NOT_SUPPORTED;
12413 		goto fini;
12414 	}
12415 
12416 	if (job_specs->req_switch != NO_VAL) {
12417 		job_ptr->req_switch = job_specs->req_switch;
12418 		info("%s: Change of switches to %u %pJ",
12419 		     __func__, job_specs->req_switch, job_ptr);
12420 	}
12421 	if (job_specs->wait4switch != NO_VAL) {
12422 		job_ptr->wait4switch = _max_switch_wait(job_specs->wait4switch);
12423 		info("%s: Change of switch wait to %u secs %pJ",
12424 		     __func__, job_ptr->wait4switch, job_ptr);
12425 	}
12426 
12427 	if (job_specs->admin_comment) {
12428 		if (!validate_super_user(uid)) {
12429 			error("%s: Attempt to change admin_comment for %pJ",
12430 			      __func__, job_ptr);
12431 			error_code = ESLURM_ACCESS_DENIED;
12432 		} else {
12433 			xfree(job_ptr->admin_comment);
12434 			job_ptr->admin_comment =
12435 				xstrdup(job_specs->admin_comment);
12436 			info("%s: setting admin_comment to %s for %pJ",
12437 			     __func__, job_ptr->admin_comment, job_ptr);
12438 		}
12439 	}
12440 
12441 	if (job_specs->comment) {
12442 		xfree(job_ptr->comment);
12443 		job_ptr->comment = xstrdup(job_specs->comment);
12444 		info("%s: setting comment to %s for %pJ",
12445 		     __func__, job_ptr->comment, job_ptr);
12446 	}
12447 
12448 	if (error_code != SLURM_SUCCESS)
12449 		goto fini;
12450 
	/*
	 * Now that we know what the new partition, QOS, and association are
	 * going to be, let's check the limits.
	 * If a limit was already exceeded before this update request, assume
	 * it is expected and allow the change to happen.
	 */
12457 	if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
12458 		if (!operator &&
12459 		    (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
12460 			uint32_t acct_reason = 0;
12461 			char *resv_orig = NULL;
12462 			bool resv_reset = false, min_reset = false,
12463 				max_reset = false,
12464 				time_min_reset = false;
12465 			if (!acct_policy_validate(job_specs, use_part_ptr,
12466 						  use_assoc_ptr, use_qos_ptr,
12467 						  &acct_reason,
12468 						  &acct_policy_limit_set,
12469 						  true)
12470 			    && !acct_limit_already_exceeded) {
12471 				info("%s: exceeded association/QOS limit for user %u: %s",
12472 				     __func__, job_specs->user_id,
12473 				     job_reason_string(acct_reason));
12474 				error_code = ESLURM_ACCOUNTING_POLICY;
12475 				goto fini;
12476 			}
12477 			/*
12478 			 * We need to set the various parts of job_specs below
12479 			 * to something since _valid_job_part() will validate
12480 			 * them.  Note the reservation part is validated in the
12481 			 * sub call to _part_access_check().
12482 			 */
12483 			if (job_specs->min_nodes == NO_VAL) {
12484 				job_specs->min_nodes = detail_ptr->min_nodes;
12485 				min_reset = true;
12486 			}
12487 			if ((job_specs->max_nodes == NO_VAL) &&
12488 			    (detail_ptr->max_nodes != 0)) {
12489 				job_specs->max_nodes = detail_ptr->max_nodes;
12490 				max_reset = true;
12491 			}
12492 
12493 			if ((job_specs->time_min == NO_VAL) &&
12494 			    (job_ptr->time_min != 0)) {
12495 				job_specs->time_min = job_ptr->time_min;
12496 				time_min_reset = true;
12497 			}
12498 
12499 			/*
12500 			 * This always gets reset, so don't worry about tracking
12501 			 * it.
12502 			 */
12503 			if (job_specs->time_limit == NO_VAL)
12504 				job_specs->time_limit = job_ptr->time_limit;
12505 
12506 			if (!job_specs->reservation
12507 			    || job_specs->reservation[0] == '\0') {
12508 				resv_reset = true;
12509 				resv_orig = job_specs->reservation;
12510 				job_specs->reservation = job_ptr->resv_name;
12511 			}
12512 
12513 			if ((error_code = _valid_job_part(
12514 				     job_specs, uid,
12515 				     new_req_bitmap_given ?
12516 				     new_req_bitmap :
12517 				     job_ptr->details->req_node_bitmap,
12518 				     use_part_ptr,
12519 				     new_part_ptr ?
12520 				     part_ptr_list : job_ptr->part_ptr_list,
12521 				     use_assoc_ptr, use_qos_ptr)))
12522 				goto fini;
12523 
12524 			if (min_reset)
12525 				job_specs->min_nodes = NO_VAL;
12526 			if (max_reset)
12527 				job_specs->max_nodes = NO_VAL;
12528 			if (time_min_reset)
12529 				job_specs->time_min = NO_VAL;
12530 			if (resv_reset)
12531 				job_specs->reservation = resv_orig;
12532 
12533 			job_specs->time_limit = orig_time_limit;
12534 		}
12535 
		/*
		 * Since we have been successful to this point, remove the job
		 * from the old QOS/association.
		 */
12540 		acct_policy_remove_job_submit(job_ptr);
12541 		acct_policy_remove_accrue_time(job_ptr, false);
12542 	}
12543 
12544 	if (new_qos_ptr) {
12545 		/* Change QOS */
12546 		job_ptr->qos_id = new_qos_ptr->id;
12547 		job_ptr->qos_ptr = new_qos_ptr;
12548 		job_ptr->limit_set.qos = acct_policy_limit_set.qos;
12549 
12550 		if (job_ptr->state_reason == FAIL_QOS) {
12551 			job_ptr->state_reason = WAIT_NO_REASON;
12552 			xfree(job_ptr->state_desc);
12553 		}
12554 
12555 		info("%s: setting QOS to %s for %pJ",
12556 		     __func__, new_qos_ptr->name, job_ptr);
12557 	}
12558 
12559 	if (new_assoc_ptr) {
12560 		/* Change account/association */
12561 		xfree(job_ptr->account);
12562 		job_ptr->account = xstrdup(new_assoc_ptr->acct);
12563 		job_ptr->assoc_id = new_assoc_ptr->id;
12564 		job_ptr->assoc_ptr = new_assoc_ptr;
12565 
12566 		if (job_ptr->state_reason == FAIL_ACCOUNT) {
12567 			job_ptr->state_reason = WAIT_NO_REASON;
12568 			xfree(job_ptr->state_desc);
12569 		}
12570 
12571 		info("%s: setting account to %s for %pJ",
12572 		     __func__, job_ptr->account, job_ptr);
12573 	}
12574 
12575 	if (new_part_ptr) {
12576 		/* Change partition */
12577 		job_ptr->part_ptr = new_part_ptr;
12578 		FREE_NULL_LIST(job_ptr->part_ptr_list);
12579 		job_ptr->part_ptr_list = part_ptr_list;
12580 		part_ptr_list = NULL;	/* nothing to free */
12581 
12582 		_rebuild_part_name_list(job_ptr);
12583 
12584 		/* Rebuilt in priority/multifactor plugin */
12585 		xfree(job_ptr->priority_array);
12586 
12587 		info("%s: setting partition to %s for %pJ",
12588 		     __func__, job_specs->partition, job_ptr);
12589 	}
12590 
12591 	/* Now add the job to the new qos/assoc's */
12592 	if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
12593 		update_accounting = true;
12594 		acct_policy_add_job_submit(job_ptr);
12595 	}
12596 
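	/* Install (or clear) the required node list and bitmap set above */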
12597 	if (new_req_bitmap_given) {
12598 		xfree(detail_ptr->req_nodes);
12599 		if (job_specs->req_nodes[0] != '\0')
12600 			detail_ptr->req_nodes =	xstrdup(job_specs->req_nodes);
12601 		FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
12602 		detail_ptr->req_node_bitmap = new_req_bitmap;
12603 		new_req_bitmap = NULL;
12604 		sched_info("%s: setting req_nodes to %s for %pJ",
12605 			   __func__, job_specs->req_nodes, job_ptr);
12606 	}
12607 
12608 	if (new_resv_ptr) {
		xfree(job_ptr->resv_name);	/* don't leak any prior name */
		job_ptr->resv_name = xstrdup(new_resv_ptr->name);
12610 		job_ptr->resv_ptr = new_resv_ptr;
12611 		sched_info("%s: setting reservation to %s for %pJ", __func__,
12612 			   job_ptr->resv_name, job_ptr);
12613 		update_accounting = true;
12614 	} else if (job_specs->reservation &&
12615 		   job_specs->reservation[0] == '\0' &&
12616 		   job_ptr->resv_name) {
12617 		xfree(job_ptr->resv_name);
12618 		job_ptr->resv_id    = 0;
12619 		job_ptr->resv_ptr   = NULL;
12620 		sched_info("%s: setting reservation to '' for %pJ",
12621 			   __func__, job_ptr);
12622 		update_accounting = true;
12623 	}
12624 
12625 	/* Reset min and max cpu counts as needed, ensure consistency */
12626 	if (job_specs->min_cpus != NO_VAL) {
12627 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12628 			error_code = ESLURM_JOB_NOT_PENDING;
12629 		else if (job_specs->min_cpus < 1)
12630 			error_code = ESLURM_INVALID_CPU_COUNT;
12631 		else {
12632 			save_min_cpus = detail_ptr->min_cpus;
12633 			detail_ptr->min_cpus = job_specs->min_cpus;
12634 		}
12635 	}
12636 	if (job_specs->max_cpus != NO_VAL) {
12637 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12638 			error_code = ESLURM_JOB_NOT_PENDING;
12639 		else {
12640 			save_max_cpus = detail_ptr->max_cpus;
12641 			detail_ptr->max_cpus = job_specs->max_cpus;
12642 		}
12643 	}
12644 	if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus &&
12645 	    (detail_ptr->max_cpus < detail_ptr->min_cpus)) {
12646 		error_code = ESLURM_INVALID_CPU_COUNT;
12647 		if (save_min_cpus) {
12648 			detail_ptr->min_cpus = save_min_cpus;
12649 			save_min_cpus = 0;
12650 		}
12651 		if (save_max_cpus) {
12652 			detail_ptr->max_cpus = save_max_cpus;
12653 			save_max_cpus = 0;
12654 		}
12655 	}
12656 
12657 	if (error_code != SLURM_SUCCESS)
12658 		goto fini;
12659 
12660 	if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) {
12661 		info("%s: setting min_cpus from %u to %u for %pJ",
12662 		     __func__, save_min_cpus, detail_ptr->min_cpus, job_ptr);
12663 		job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
12664 			acct_policy_limit_set.tres[TRES_ARRAY_CPU];
12665 		detail_ptr->orig_min_cpus = job_specs->min_cpus;
12666 		update_accounting = true;
12667 	}
12668 	if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) {
12669 		info("%s: setting max_cpus from %u to %u for %pJ",
12670 		     __func__, save_max_cpus, detail_ptr->max_cpus, job_ptr);
		/*
		 * Always use the acct_policy_limit_set.* since if set by a
		 * super user it will be set correctly
		 */
12675 		job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
12676 			acct_policy_limit_set.tres[TRES_ARRAY_CPU];
12677 		detail_ptr->orig_max_cpus = job_specs->max_cpus;
12678 		update_accounting = true;
12679 	}
12680 
12681 	if ((job_specs->pn_min_cpus != NO_VAL16) &&
12682 	    (job_specs->pn_min_cpus != 0)) {
12683 
12684 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
12685 			error_code = ESLURM_JOB_NOT_PENDING;
12686 		} else {
12687 			detail_ptr->pn_min_cpus = job_specs->pn_min_cpus;
12688 			detail_ptr->orig_pn_min_cpus = job_specs->pn_min_cpus;
12689 			info("%s: setting pn_min_cpus to %u for %pJ",
12690 			     __func__, job_specs->pn_min_cpus, job_ptr);
12691 		}
12692 	}
12693 
12694 	if (error_code != SLURM_SUCCESS)
12695 		goto fini;
12696 
12697 	if (job_specs->cpus_per_task != NO_VAL16) {
12698 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
12699 			error_code = ESLURM_JOB_NOT_PENDING;
12700 		} else if (detail_ptr->cpus_per_task !=
12701 			   job_specs->cpus_per_task) {
12702 			info("%s: setting cpus_per_task from %u to %u for %pJ",
12703 			     __func__, detail_ptr->cpus_per_task,
12704 			     job_specs->cpus_per_task, job_ptr);
12705 			detail_ptr->cpus_per_task = job_specs->cpus_per_task;
12706 			detail_ptr->orig_cpus_per_task =
12707 					job_specs->cpus_per_task;
12708 		}
12709 	}
12710 
12711 	if (error_code != SLURM_SUCCESS)
12712 		goto fini;
12713 
12714 	/* Reset min and max node counts as needed, ensure consistency */
12715 	if (job_specs->min_nodes != NO_VAL) {
12716 		if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
12717 			;	/* shrink running job, processed later */
12718 		else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12719 			error_code = ESLURM_JOB_NOT_PENDING;
12720 		else if (job_specs->min_nodes < 1) {
12721 			info("%s: min_nodes < 1 for %pJ", __func__, job_ptr);
12722 			error_code = ESLURM_INVALID_NODE_COUNT;
12723 		} else {
12724 			/* Resize of pending job */
12725 			save_min_nodes = detail_ptr->min_nodes;
12726 			detail_ptr->min_nodes = job_specs->min_nodes;
12727 		}
12728 	}
12729 	if (job_specs->max_nodes != NO_VAL) {
12730 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12731 			error_code = ESLURM_JOB_NOT_PENDING;
12732 		else {
12733 			save_max_nodes = detail_ptr->max_nodes;
12734 			detail_ptr->max_nodes = job_specs->max_nodes;
12735 		}
12736 	}
12737 	if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes &&
12738 	    (detail_ptr->max_nodes < detail_ptr->min_nodes)) {
12739 		info("%s: max_nodes < min_nodes (%u < %u) for %pJ", __func__,
12740 		     detail_ptr->max_nodes, detail_ptr->min_nodes,
12741 		     job_ptr);
12742 		error_code = ESLURM_INVALID_NODE_COUNT;
12743 		if (save_min_nodes) {
12744 			detail_ptr->min_nodes = save_min_nodes;
12745 			save_min_nodes = 0;
12746 		}
12747 		if (save_max_nodes) {
12748 			detail_ptr->max_nodes = save_max_nodes;
12749 			save_max_nodes = 0;
12750 		}
12751 	}
12752 	if (error_code != SLURM_SUCCESS)
12753 		goto fini;
12754 
	if (save_min_nodes && (save_min_nodes != detail_ptr->min_nodes)) {
12756 		info("%s: setting min_nodes from %u to %u for %pJ", __func__,
12757 		     save_min_nodes, detail_ptr->min_nodes, job_ptr);
12758 		job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
12759 			acct_policy_limit_set.tres[TRES_ARRAY_NODE];
12760 		update_accounting = true;
12761 	}
12762 	if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) {
12763 		info("%s: setting max_nodes from %u to %u for %pJ", __func__,
12764 		     save_max_nodes, detail_ptr->max_nodes, job_ptr);
		/*
		 * Always use the acct_policy_limit_set.* since if set by a
		 * super user it will be set correctly
		 */
12769 		job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
12770 			acct_policy_limit_set.tres[TRES_ARRAY_NODE];
12771 		update_accounting = true;
12772 	}
12773 
12774 	if (job_specs->num_tasks != NO_VAL) {
12775 		if (!IS_JOB_PENDING(job_ptr))
12776 			error_code = ESLURM_JOB_NOT_PENDING;
12777 		else if (job_specs->num_tasks < 1)
12778 			error_code = ESLURM_BAD_TASK_COUNT;
12779 		else {
12780 			detail_ptr->num_tasks = job_specs->num_tasks;
12781 			info("%s: setting num_tasks to %u for %pJ",
12782 			     __func__, job_specs->num_tasks, job_ptr);
12783 		}
12784 	}
12785 	if (error_code != SLURM_SUCCESS)
12786 		goto fini;
12787 
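	/*
	 * Time limit changes (e.g. "scontrol update JobId=<id>
	 * TimeLimit=<time>"): an operator may raise or lower the limit,
	 * while other users may only lower it, or set it for a pending job
	 * within the partition's maximum.
	 */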
12788 	if (job_specs->time_limit != NO_VAL) {
12789 		if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time)
12790 			error_code = ESLURM_JOB_FINISHED;
12791 		else if (job_ptr->time_limit == job_specs->time_limit) {
12792 			sched_debug("%s: new time limit identical to old time limit %pJ",
12793 				    __func__, job_ptr);
12794 		} else if (operator ||
12795 			   (job_ptr->time_limit > job_specs->time_limit)) {
12796 			time_t old_time =  job_ptr->time_limit;
12797 			uint32_t use_time_min = job_specs->time_min != NO_VAL ?
12798 				job_specs->time_min : job_ptr->time_min;
12799 			if (old_time == INFINITE)	/* one year in mins */
12800 				old_time = (365 * 24 * 60);
12801 			if (job_specs->time_limit < use_time_min) {
12802 				sched_info("%s: attempt to set time_limit < time_min (%u < %u)",
12803 					   __func__,
12804 					   job_specs->time_limit,
12805 					   use_time_min);
12806 				error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
12807 				goto fini;
12808 			}
12809 			acct_policy_alter_job(job_ptr, job_specs->time_limit);
12810 			job_ptr->time_limit = job_specs->time_limit;
12811 			if (IS_JOB_RUNNING(job_ptr) ||
12812 			    IS_JOB_SUSPENDED(job_ptr)) {
12813 				if (job_ptr->preempt_time) {
12814 					;	/* Preemption in progress */
12815 				} else if (job_ptr->time_limit == INFINITE) {
12816 					/* Set end time in one year */
12817 					job_ptr->end_time = now +
12818 						(365 * 24 * 60 * 60);
12819 				} else {
12820 					/*
12821 					 * Update end_time based upon change
12822 					 * to preserve suspend time info
12823 					 */
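				/*
				 * For example, raising a 60 minute
				 * limit to 90 minutes pushes end_time
				 * out by (90 - 60) * 60 = 1800 seconds.
				 */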
12824 					job_ptr->end_time = job_ptr->end_time +
12825 						((job_ptr->time_limit -
12826 						  old_time) * 60);
12827 				}
12828 				if (job_ptr->end_time < now)
12829 					job_ptr->end_time = now;
12830 				if (IS_JOB_RUNNING(job_ptr) &&
12831 				    (list_is_empty(job_ptr->step_list) == 0)) {
12832 					_xmit_new_end_time(job_ptr);
12833 				}
12834 				job_ptr->end_time_exp = job_ptr->end_time;
12835 			}
12836 			sched_info("%s: setting time_limit to %u for %pJ",
12837 				   __func__, job_specs->time_limit, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.*
			 * since if set by a super user it will be set correctly
			 */
12842 			job_ptr->limit_set.time = acct_policy_limit_set.time;
12843 			update_accounting = true;
12844 		} else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr &&
12845 			   (job_ptr->part_ptr->max_time >=
12846 			    job_specs->time_limit)) {
12847 			job_ptr->time_limit = job_specs->time_limit;
12848 			sched_info("%s: setting time_limit to %u for %pJ",
12849 				   __func__, job_specs->time_limit, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.*
			 * since if set by a super user it will be set correctly
			 */
12854 			job_ptr->limit_set.time = acct_policy_limit_set.time;
12855 			update_accounting = true;
12856 		} else {
12857 			sched_info("%s: Attempt to increase time limit for %pJ",
12858 				   __func__, job_ptr);
12859 			error_code = ESLURM_ACCESS_DENIED;
12860 		}
12861 	}
12862 	if (error_code != SLURM_SUCCESS)
12863 		goto fini;
12864 
12865 	if ((job_specs->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) {
12866 		if (job_specs->time_min > job_ptr->time_limit) {
12867 			info("%s: attempt to set TimeMin > TimeLimit (%u > %u)",
12868 			     __func__, job_specs->time_min, job_ptr->time_limit);
12869 			error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
12870 		} else if (job_ptr->time_min != job_specs->time_min) {
12871 			job_ptr->time_min = job_specs->time_min;
12872 			info("%s: setting TimeMin to %u for %pJ",
12873 			     __func__, job_specs->time_min, job_ptr);
12874 		}
12875 	}
12876 	if (error_code != SLURM_SUCCESS)
12877 		goto fini;
12878 
12879 	if (job_specs->end_time) {
12880 		if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) {
12881 			/*
12882 			 * We may want to use this for deadline scheduling
12883 			 * at some point in the future. For now only reset
12884 			 * the time limit of running jobs.
12885 			 */
12886 			error_code = ESLURM_JOB_NOT_RUNNING;
12887 		} else if (job_specs->end_time < now) {
12888 			error_code = ESLURM_INVALID_TIME_VALUE;
12889 		} else if (operator ||
12890 			   (job_ptr->end_time > job_specs->end_time)) {
12891 			int delta_t  = job_specs->end_time - job_ptr->end_time;
12892 			job_ptr->end_time = job_specs->end_time;
12893 			job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */
12894 			sched_info("%s: setting time_limit to %u for %pJ",
12895 				   __func__, job_ptr->time_limit, job_ptr);
			/* Always use the acct_policy_limit_set.*
			 * since if set by a super user it will be set correctly */
12898 			job_ptr->limit_set.time = acct_policy_limit_set.time;
12899 			update_accounting = true;
12900 		} else {
12901 			sched_info("%s: Attempt to extend end time for %pJ",
12902 				   __func__, job_ptr);
12903 			error_code = ESLURM_ACCESS_DENIED;
12904 		}
12905 	}
12906 
12907 	if ((job_specs->deadline) && (!IS_JOB_RUNNING(job_ptr))) {
12908 		char time_str[32];
		slurm_make_time_str(&job_specs->deadline, time_str,
12910 				    sizeof(time_str));
12911 		if (job_specs->deadline < now) {
12912 			error_code = ESLURM_INVALID_TIME_VALUE;
12913 		} else if (operator) {
12914 			/* update deadline */
12915 			job_ptr->deadline = job_specs->deadline;
12916 			sched_info("%s: setting deadline to %s for %pJ",
12917 				   __func__, time_str, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.*
			 * since if set by a super user it will be set correctly
			 */
12922 			job_ptr->limit_set.time = acct_policy_limit_set.time;
12923 			update_accounting = true;
12924 		} else {
			sched_info("%s: Attempt to modify deadline for %pJ",
				   __func__, job_ptr);
12927 			error_code = ESLURM_ACCESS_DENIED;
12928 		}
12929 	}
12930 	if (error_code != SLURM_SUCCESS)
12931 		goto fini;
12932 
12933 	if (job_specs->delay_boot != NO_VAL) {
12934 		job_ptr->delay_boot = job_specs->delay_boot;
12935 		sched_info("%s: setting delay_boot to %u for %pJ",
12936 			   __func__, job_specs->delay_boot, job_ptr);
12937 	}
12938 
12939 	if ((job_specs->requeue != NO_VAL16) && detail_ptr) {
12940 		detail_ptr->requeue = MIN(job_specs->requeue, 1);
12941 		sched_info("%s: setting requeue to %u for %pJ",
12942 			   __func__, job_specs->requeue, job_ptr);
12943 	}
12944 
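	/*
	 * Priority updates double as hold/release operations: a priority of
	 * 0 holds the job, INFINITE releases it so its priority can be
	 * recalculated, and any other explicit value is a direct override
	 * that only an operator may make persistent.
	 */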
12945 	if (job_specs->priority != NO_VAL) {
		/*
		 * If we are doing time slicing, we could update the
		 * priority of the job while running to give it a better
		 * position (larger time slices) than competing jobs.
		 */
12951 		if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL))
12952 			error_code = ESLURM_JOB_FINISHED;
12953 		else if (job_ptr->priority == job_specs->priority) {
12954 			debug("%s: setting priority to current value",__func__);
12955 			if ((job_ptr->priority == 0) && operator) {
12956 				/*
12957 				 * Authorized user can change from user hold
12958 				 * to admin hold or admin hold to user hold
12959 				 */
12960 				if (job_specs->alloc_sid == ALLOC_SID_USER_HOLD)
12961 					job_ptr->state_reason = WAIT_HELD_USER;
12962 				else
12963 					job_ptr->state_reason = WAIT_HELD;
12964 			}
12965 		} else if ((job_ptr->priority == 0) &&
12966 			   (job_specs->priority == INFINITE) &&
12967 			   (operator ||
12968 			    (job_ptr->state_reason == WAIT_RESV_DELETED) ||
12969 			    (job_ptr->state_reason == WAIT_HELD_USER))) {
12970 			_release_job(job_ptr, uid);
12971 		} else if ((job_ptr->priority == 0) &&
12972 			   (job_specs->priority != INFINITE)) {
12973 			info("%s: ignore priority reset request on held %pJ",
12974 			     __func__, job_ptr);
12975 			error_code = ESLURM_JOB_HELD;
12976 		} else if (operator ||
12977 			 (job_ptr->priority > job_specs->priority)) {
12978 			if (job_specs->priority != 0)
12979 				job_ptr->details->nice = NICE_OFFSET;
12980 			if (job_specs->priority == INFINITE) {
12981 				job_ptr->direct_set_prio = 0;
12982 				set_job_prio(job_ptr);
12983 			} else if (job_specs->priority == 0) {
12984 				_hold_job(job_ptr, uid);
12985 			} else {
12986 				if (operator) {
12987 					/*
12988 					 * Only administrator can make
12989 					 * persistent change to a job's
12990 					 * priority, except holding a job
12991 					 */
12992 					job_ptr->direct_set_prio = 1;
12993 				} else
12994 					error_code = ESLURM_PRIO_RESET_FAIL;
12995 				job_ptr->priority = job_specs->priority;
12996 				if (job_ptr->part_ptr_list &&
12997 				    job_ptr->priority_array) {
12998 					int i, j = list_count(
12999 						job_ptr->part_ptr_list);
13000 					for (i = 0; i < j; i++) {
13001 						job_ptr->priority_array[i] =
13002 						job_specs->priority;
13003 					}
13004 				}
13005 			}
13006 			sched_info("%s: set priority to %u for %pJ",
13007 				   __func__, job_ptr->priority, job_ptr);
13008 			update_accounting = true;
13009 			if (job_ptr->priority == 0) {
13010 				if (!operator ||
13011 				    (job_specs->alloc_sid ==
13012 				     ALLOC_SID_USER_HOLD)) {
13013 					job_ptr->state_reason = WAIT_HELD_USER;
13014 				} else
13015 					job_ptr->state_reason = WAIT_HELD;
13016 				xfree(job_ptr->state_desc);
13017 
13018 				/* remove pending remote sibling jobs */
13019 				if (IS_JOB_PENDING(job_ptr) &&
13020 				    !IS_JOB_REVOKED(job_ptr)) {
13021 					fed_mgr_job_revoke_sibs(job_ptr);
13022 				}
13023 			}
13024 		} else if ((job_ptr->priority != 0) &&
13025 			   (job_specs->priority == INFINITE)) {
13026 			/*
13027 			 * If the job was already released, ignore another
13028 			 * release request.
13029 			 */
13030 			debug("%s: %pJ already released, ignoring request",
13031 			      __func__, job_ptr);
13032 		} else {
			sched_error("%s: Attempt to modify priority for %pJ",
				    __func__, job_ptr);
13035 			error_code = ESLURM_ACCESS_DENIED;
13036 		}
13037 	} else if (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) {
		/*
		 * We need to check if the state is BadConstraints here: since
		 * we are altering the job, the bad constraint might have gone
		 * away.  If it did, the priority (0) wouldn't get reset and
		 * the job would just go into JobAdminHeld otherwise.
		 */
13044 		job_ptr->direct_set_prio = 0;
13045 		set_job_prio(job_ptr);
13046 		sched_debug("%s: job request changed somehow, removing the bad constraints to reevaluate %pJ uid %u",
13047 			    __func__, job_ptr, uid);
13048 		job_ptr->state_reason = WAIT_NO_REASON;
13049 	}
13050 
13051 	if (error_code != SLURM_SUCCESS)
13052 		goto fini;
13053 
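	/*
	 * Nice adjustments: only an operator may request a nice value below
	 * NICE_OFFSET (a priority boost); with priority/basic the priority
	 * is recomputed here directly.
	 */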
13054 	if (job_specs->nice != NO_VAL) {
13055 		if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL))
13056 			error_code = ESLURM_JOB_FINISHED;
13057 		else if (job_ptr->details &&
13058 			 (job_ptr->details->nice == job_specs->nice))
13059 			sched_debug("%s: new nice identical to old nice %pJ",
13060 				    __func__, job_ptr);
13061 		else if (job_ptr->direct_set_prio && job_ptr->priority != 0)
13062 			info("%s: ignore nice set request on %pJ",
13063 			     __func__, job_ptr);
13064 		else if (operator || (job_specs->nice >= NICE_OFFSET)) {
13065 			if (!xstrcmp(slurmctld_conf.priority_type,
13066 			             "priority/basic")) {
13067 				int64_t new_prio = job_ptr->priority;
13068 				new_prio += job_ptr->details->nice;
13069 				new_prio -= job_specs->nice;
13070 				job_ptr->priority = MAX(new_prio, 2);
13071 				sched_info("%s: nice changed from %u to %u, setting priority to %u for %pJ",
13072 					   __func__, job_ptr->details->nice,
13073 					   job_specs->nice,
13074 					   job_ptr->priority, job_ptr);
13075 			}
13076 			job_ptr->details->nice = job_specs->nice;
13077 			update_accounting = true;
13078 		} else {
13079 			sched_error("%s: Attempt to modify nice for %pJ",
13080 				    __func__, job_ptr);
13081 			error_code = ESLURM_ACCESS_DENIED;
13082 		}
13083 	}
13084 	if (error_code != SLURM_SUCCESS)
13085 		goto fini;
13086 
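	/*
	 * pn_min_memory carries the MEM_PER_CPU flag bit when the limit is
	 * per-CPU rather than per-node; the flag is masked off below only
	 * for logging.
	 */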
13087 	if (job_specs->pn_min_memory != NO_VAL64) {
13088 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13089 			error_code = ESLURM_JOB_NOT_PENDING;
13090 		} else if (job_specs->pn_min_memory
13091 			   == detail_ptr->pn_min_memory) {
13092 			sched_debug("%s: new memory limit identical to old limit for %pJ",
13093 				    __func__, job_ptr);
13094 		} else {
13095 			char *entity;
13096 			if (job_specs->pn_min_memory == MEM_PER_CPU) {
13097 				/* Map --mem-per-cpu=0 to --mem=0 */
13098 				job_specs->pn_min_memory = 0;
13099 			}
13100 			if (job_specs->pn_min_memory & MEM_PER_CPU)
13101 				entity = "cpu";
13102 			else
13103 				entity = "job";
13104 
13105 			detail_ptr->pn_min_memory = job_specs->pn_min_memory;
13106 			detail_ptr->orig_pn_min_memory =
13107 					job_specs->pn_min_memory;
13108 			job_ptr->bit_flags |= JOB_MEM_SET;
13109 			sched_info("%s: setting min_memory_%s to %"PRIu64" for %pJ",
13110 				   __func__, entity,
13111 				   (job_specs->pn_min_memory & (~MEM_PER_CPU)),
13112 				   job_ptr);
			/*
			 * Always use the acct_policy_limit_set.*
			 * since if set by a super user it will be set correctly
			 */
13117 			job_ptr->limit_set.tres[TRES_ARRAY_MEM] =
13118 				acct_policy_limit_set.tres[TRES_ARRAY_MEM];
13119 		}
13120 	}
13121 	if (error_code != SLURM_SUCCESS)
13122 		goto fini;
13123 
13124 	if (job_specs->pn_min_tmp_disk != NO_VAL) {
13125 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13126 			error_code = ESLURM_JOB_NOT_PENDING;
13127 		} else {
13128 			detail_ptr->pn_min_tmp_disk =
13129 				job_specs->pn_min_tmp_disk;
13130 
13131 			sched_info("%s: setting job_min_tmp_disk to %u for %pJ",
13132 				   __func__, job_specs->pn_min_tmp_disk,
13133 				   job_ptr);
13134 		}
13135 	}
13136 	if (error_code != SLURM_SUCCESS)
13137 		goto fini;
13138 
13139 	if (job_specs->sockets_per_node != NO_VAL16) {
13140 		if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13141 			error_code = ESLURM_JOB_NOT_PENDING;
13142 			goto fini;
13143 		} else {
13144 			mc_ptr->sockets_per_node = job_specs->sockets_per_node;
13145 			sched_info("%s: setting sockets_per_node to %u for %pJ",
13146 				   __func__, job_specs->sockets_per_node,
13147 				   job_ptr);
13148 		}
13149 	}
13150 
13151 	if (job_specs->cores_per_socket != NO_VAL16) {
13152 		if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13153 			error_code = ESLURM_JOB_NOT_PENDING;
13154 			goto fini;
13155 		} else {
13156 			mc_ptr->cores_per_socket = job_specs->cores_per_socket;
13157 			sched_info("%s: setting cores_per_socket to %u for %pJ",
13158 				   __func__, job_specs->cores_per_socket,
13159 				   job_ptr);
13160 		}
13161 	}
13162 
13163 	if ((job_specs->threads_per_core != NO_VAL16)) {
13164 		if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13165 			error_code = ESLURM_JOB_NOT_PENDING;
13166 			goto fini;
13167 		} else {
13168 			mc_ptr->threads_per_core = job_specs->threads_per_core;
13169 			sched_info("%s: setting threads_per_core to %u for %pJ",
13170 				   __func__, job_specs->threads_per_core,
13171 				   job_ptr);
13172 		}
13173 	}
13174 
13175 	if (job_specs->shared != NO_VAL16) {
13176 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13177 			error_code = ESLURM_JOB_NOT_PENDING;
13178 		} else if (!operator) {
13179 			sched_error("%s: Attempt to change sharing for %pJ",
13180 				    __func__, job_ptr);
13181 			error_code = ESLURM_ACCESS_DENIED;
13182 		} else {
13183 			if (job_specs->shared) {
13184 				detail_ptr->share_res = 1;
13185 				detail_ptr->whole_node = 0;
13186 			} else {
13187 				detail_ptr->share_res = 0;
13188 			}
13189 			sched_info("%s: setting shared to %u for %pJ",
13190 				   __func__, job_specs->shared, job_ptr);
13191 		}
13192 	}
13193 	if (error_code != SLURM_SUCCESS)
13194 		goto fini;
13195 
13196 	if (job_specs->contiguous != NO_VAL16) {
13197 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13198 			error_code = ESLURM_JOB_NOT_PENDING;
13199 		else if (operator
13200 			 || (detail_ptr->contiguous > job_specs->contiguous)) {
13201 			detail_ptr->contiguous = job_specs->contiguous;
13202 			sched_info("%s: setting contiguous to %u for %pJ",
13203 				   __func__, job_specs->contiguous, job_ptr);
13204 		} else {
13205 			sched_error("%s: Attempt to add contiguous for %pJ",
13206 				    __func__, job_ptr);
13207 			error_code = ESLURM_ACCESS_DENIED;
13208 		}
13209 	}
13210 	if (error_code != SLURM_SUCCESS)
13211 		goto fini;
13212 
13213 	if (job_specs->core_spec != NO_VAL16) {
13214 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13215 			error_code = ESLURM_JOB_NOT_PENDING;
13216 		else if (operator &&
13217 			 (slurmctld_conf.conf_flags & CTL_CONF_ASRU)) {
13218 			if (job_specs->core_spec == INFINITE16)
13219 				detail_ptr->core_spec = NO_VAL16;
13220 			else
13221 				detail_ptr->core_spec = job_specs->core_spec;
13222 			sched_info("%s: setting core_spec to %u for %pJ",
13223 				   __func__, detail_ptr->core_spec, job_ptr);
13224 			if (detail_ptr->core_spec != NO_VAL16)
13225 				detail_ptr->whole_node = 1;
13226 		} else {
			sched_error("%s: Attempt to modify core_spec for %pJ",
13228 				    __func__, job_ptr);
13229 			error_code = ESLURM_ACCESS_DENIED;
13230 		}
13231 	}
13232 	if (error_code != SLURM_SUCCESS)
13233 		goto fini;
13234 
13235 	if (job_specs->features && detail_ptr &&
13236 	    !xstrcmp(job_specs->features, detail_ptr->features)) {
13237 		sched_debug("%s: new features identical to old features %s",
13238 			    __func__, job_specs->features);
13239 	} else if (job_specs->features) {
13240 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13241 			error_code = ESLURM_JOB_NOT_PENDING;
13242 		else if (job_specs->features[0] != '\0') {
13243 			char *old_features = detail_ptr->features;
13244 			List old_list = detail_ptr->feature_list;
13245 			detail_ptr->features = xstrdup(job_specs->features);
13246 			detail_ptr->feature_list = NULL;
13247 			if (build_feature_list(job_ptr)) {
13248 				sched_info("%s: invalid features(%s) for %pJ",
13249 					   __func__, job_specs->features,
13250 					   job_ptr);
13251 				FREE_NULL_LIST(detail_ptr->feature_list);
13252 				detail_ptr->features = old_features;
13253 				detail_ptr->feature_list = old_list;
13254 				error_code = ESLURM_INVALID_FEATURE;
13255 			} else {
13256 				sched_info("%s: setting features to %s for %pJ",
13257 					   __func__, job_specs->features,
13258 					   job_ptr);
13259 				xfree(old_features);
13260 				FREE_NULL_LIST(old_list);
13261 			}
13262 		} else {
13263 			sched_info("%s: cleared features for %pJ", __func__,
13264 				   job_ptr);
13265 			xfree(detail_ptr->features);
13266 			FREE_NULL_LIST(detail_ptr->feature_list);
13267 		}
13268 	}
13269 	if (error_code != SLURM_SUCCESS)
13270 		goto fini;
13271 
13272 	if (job_specs->cluster_features &&
13273 	    (error_code = fed_mgr_update_job_cluster_features(
13274 					job_ptr, job_specs->cluster_features)))
13275 		goto fini;
13276 
13277 	if (job_specs->clusters &&
13278 	    (error_code = fed_mgr_update_job_clusters(job_ptr,
13279 						     job_specs->clusters)))
13280 		goto fini;
13281 
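	/*
	 * Install the GRES changes validated earlier: take ownership of the
	 * per-TRES strings from job_specs and swap in the new gres_list.
	 */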
13282 	if (gres_list) {
13283 		char *tmp = NULL;
13284 		if (job_specs->cpus_per_tres) {
13285 			xstrfmtcat(tmp, "cpus_per_tres:%s ",
13286 				   job_specs->cpus_per_tres);
13287 			xfree(job_ptr->cpus_per_tres);
13288 			job_ptr->cpus_per_tres = job_specs->cpus_per_tres;
13289 			job_specs->cpus_per_tres = NULL;
13290 		}
13291 		if (job_specs->tres_per_job) {
13292 			xstrfmtcat(tmp, "tres_per_job:%s ",
13293 				   job_specs->tres_per_job);
13294 			xfree(job_ptr->tres_per_job);
13295 			job_ptr->tres_per_job = job_specs->tres_per_job;
13296 			job_specs->tres_per_job = NULL;
13297 		}
13298 		if (job_specs->tres_per_node) {
13299 			xstrfmtcat(tmp, "tres_per_node:%s ",
13300 				   job_specs->tres_per_node);
13301 			xfree(job_ptr->tres_per_node);
13302 			job_ptr->tres_per_node = job_specs->tres_per_node;
13303 			job_specs->tres_per_node = NULL;
13304 		}
13305 		if (job_specs->tres_per_socket) {
13306 			xstrfmtcat(tmp, "tres_per_socket:%s ",
13307 				   job_specs->tres_per_socket);
13308 			xfree(job_ptr->tres_per_socket);
13309 			job_ptr->tres_per_socket = job_specs->tres_per_socket;
13310 			job_specs->tres_per_socket = NULL;
13311 		}
13312 		if (job_specs->tres_per_task) {
13313 			xstrfmtcat(tmp, "tres_per_task:%s ",
13314 				   job_specs->tres_per_task);
13315 			xfree(job_ptr->tres_per_task);
13316 			job_ptr->tres_per_task = job_specs->tres_per_task;
13317 			job_specs->tres_per_task = NULL;
13318 		}
13319 		if (job_specs->mem_per_tres) {
13320 			xstrfmtcat(tmp, "mem_per_tres:%s ",
13321 				   job_specs->mem_per_tres);
13322 			xfree(job_ptr->mem_per_tres);
13323 			job_ptr->mem_per_tres = job_specs->mem_per_tres;
13324 			job_specs->mem_per_tres = NULL;
13325 		}
13326 		sched_info("%s: setting %sfor %pJ", __func__, tmp, job_ptr);
13327 		xfree(tmp);
13328 		FREE_NULL_LIST(job_ptr->gres_list);
13329 		job_ptr->gres_list = gres_list;
13330 		gres_build_job_details(job_ptr->gres_list,
13331 				       &job_ptr->gres_detail_cnt,
13332 				       &job_ptr->gres_detail_str,
13333 				       &job_ptr->gres_used);
13334 		gres_list = NULL;
13335 	}
13336 
13337 	if (job_specs->name) {
13338 		if (IS_JOB_FINISHED(job_ptr)) {
13339 			error_code = ESLURM_JOB_FINISHED;
13340 			goto fini;
13341 		} else if (!xstrcmp(job_specs->name, job_ptr->name)) {
13342 			sched_debug("%s: new name identical to old name %pJ",
13343 				    __func__, job_ptr);
13344 		} else {
13345 			xfree(job_ptr->name);
13346 			job_ptr->name = xstrdup(job_specs->name);
13347 
13348 			sched_info("%s: setting name to %s for %pJ",
13349 				   __func__, job_ptr->name, job_ptr);
13350 			update_accounting = true;
13351 		}
13352 	}
13353 
13354 	if (job_specs->work_dir && detail_ptr &&
13355 	    !xstrcmp(job_specs->work_dir, detail_ptr->work_dir)) {
13356 		sched_debug("%s: new work_dir identical to old work_dir %s",
13357 			    __func__, job_specs->work_dir);
13358 	} else if (job_specs->work_dir) {
13359 		if (!IS_JOB_PENDING(job_ptr)) {
13360 			error_code = ESLURM_JOB_NOT_PENDING;
13361 			goto fini;
13362 		} else if (detail_ptr) {
13363 			xfree(detail_ptr->work_dir);
13364 			detail_ptr->work_dir = xstrdup(job_specs->work_dir);
13365 			sched_info("%s: setting work_dir to %s for %pJ",
13366 				   __func__, detail_ptr->work_dir, job_ptr);
13367 			update_accounting = true;
13368 		}
13369 	}
13370 
13371 	if (job_specs->std_out && detail_ptr &&
13372 	    !xstrcmp(job_specs->std_out, detail_ptr->std_out)) {
13373 		sched_debug("%s: new std_out identical to old std_out %s",
13374 			    __func__, job_specs->std_out);
13375 	} else if (job_specs->std_out) {
13376 		if (!IS_JOB_PENDING(job_ptr))
13377 			error_code = ESLURM_JOB_NOT_PENDING;
13378 		else if (detail_ptr) {
13379 			xfree(detail_ptr->std_out);
13380 			detail_ptr->std_out = xstrdup(job_specs->std_out);
13381 		}
13382 	}
13383 	if (error_code != SLURM_SUCCESS)
13384 		goto fini;
13385 
13386 	if (job_specs->wckey
13387 	    && !xstrcmp(job_specs->wckey, job_ptr->wckey)) {
13388 		sched_debug("%s: new wckey identical to old wckey %pJ",
13389 			    __func__, job_ptr);
13390 	} else if (job_specs->wckey) {
13391 		if (!IS_JOB_PENDING(job_ptr))
13392 			error_code = ESLURM_JOB_NOT_PENDING;
13393 		else {
13394 			int rc = update_job_wckey((char *) __func__,
13395 						  job_ptr, job_specs->wckey);
13396 			if (rc != SLURM_SUCCESS)
13397 				error_code = rc;
13398 			else
13399 				update_accounting = true;
13400 		}
13401 	}
13402 	if (error_code != SLURM_SUCCESS)
13403 		goto fini;
13404 
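	/*
	 * Node count changes for a running or suspended job: min_nodes == 0
	 * with an expanding_jobid merges this job's resources into the
	 * target job, while a smaller positive value shrinks the allocation
	 * in place.
	 */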
13405 	if ((job_specs->min_nodes != NO_VAL) &&
13406 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
		/*
		 * Use min_nodes to change the node count of a running job
		 * (shrink it or merge it into another job), for lack of
		 * another field in the job request to use
		 */
13411 		if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) &&
13412 		    job_ptr->details && job_ptr->details->expanding_jobid) {
13413 			job_record_t *expand_job_ptr;
13414 			bitstr_t *orig_job_node_bitmap, *orig_jobx_node_bitmap;
13415 
13416 			expand_job_ptr = find_job_record(job_ptr->details->
13417 							 expanding_jobid);
13418 			if (expand_job_ptr == NULL) {
13419 				info("%s: Invalid node count (%u) for %pJ update, JobId=%u to expand not found",
13420 				     __func__, job_specs->min_nodes, job_ptr,
13421 				     job_ptr->details->expanding_jobid);
13422 				error_code = ESLURM_INVALID_JOB_ID;
13423 				goto fini;
13424 			}
13425 			if (IS_JOB_SUSPENDED(job_ptr) ||
13426 			    IS_JOB_SUSPENDED(expand_job_ptr)) {
13427 				info("%s: Can not expand %pJ from %pJ, job is suspended",
13428 				     __func__, expand_job_ptr, job_ptr);
13429 				error_code = ESLURM_JOB_SUSPENDED;
13430 				goto fini;
13431 			}
13432 			if ((job_ptr->step_list != NULL) &&
13433 			    (list_count(job_ptr->step_list) != 0)) {
13434 				info("%s: Attempt to merge %pJ with active steps into %pJ",
13435 				     __func__, job_ptr, expand_job_ptr);
13436 				error_code = ESLURMD_STEP_EXISTS;
13437 				goto fini;
13438 			}
13439 			sched_info("%s: killing %pJ and moving all resources to %pJ",
13440 				   __func__, job_ptr, expand_job_ptr);
13441 			job_pre_resize_acctg(job_ptr);
13442 			job_pre_resize_acctg(expand_job_ptr);
13443 			_send_job_kill(job_ptr);
13444 
13445 			xassert(job_ptr->job_resrcs);
13446 			xassert(job_ptr->job_resrcs->node_bitmap);
13447 			xassert(expand_job_ptr->job_resrcs->node_bitmap);
13448 			orig_job_node_bitmap = bit_copy(job_ptr->node_bitmap);
13449 			orig_jobx_node_bitmap = bit_copy(expand_job_ptr->
13450 							 job_resrcs->
13451 							 node_bitmap);
13452 			error_code = select_g_job_expand(job_ptr,
13453 							 expand_job_ptr);
13454 			if (error_code == SLURM_SUCCESS) {
13455 				_merge_job_licenses(job_ptr, expand_job_ptr);
13456 				FREE_NULL_BITMAP(job_ptr->node_bitmap);
13457 				job_ptr->node_bitmap = orig_job_node_bitmap;
13458 				orig_job_node_bitmap = NULL;
13459 				deallocate_nodes(job_ptr, false, false, false);
13460 				bit_clear_all(job_ptr->node_bitmap);
13461 				job_ptr->job_state &= JOB_STATE_FLAGS;
13462 				job_ptr->job_state |= JOB_COMPLETE;
13463 				_realloc_nodes(expand_job_ptr,
13464 					       orig_jobx_node_bitmap);
13465 				rebuild_step_bitmaps(expand_job_ptr,
13466 						     orig_jobx_node_bitmap);
13467 				(void) gs_job_fini(job_ptr);
13468 				(void) gs_job_start(expand_job_ptr);
13469 			}
13470 			FREE_NULL_BITMAP(orig_job_node_bitmap);
13471 			FREE_NULL_BITMAP(orig_jobx_node_bitmap);
13472 			job_post_resize_acctg(job_ptr);
13473 			job_post_resize_acctg(expand_job_ptr);
13474 			/*
13475 			 * Since job_post_resize_acctg will restart things,
13476 			 * don't do it again.
13477 			 */
13478 			update_accounting = false;
13479 			if (error_code)
13480 				goto fini;
13481 		} else if ((job_specs->min_nodes == 0) ||
13482 			   (job_specs->min_nodes > job_ptr->node_cnt) ||
13483 			   job_ptr->details->expanding_jobid) {
13484 			sched_info("%s: Invalid node count (%u) for %pJ update",
13485 				   __func__, job_specs->min_nodes, job_ptr);
13486 			error_code = ESLURM_INVALID_NODE_COUNT;
13487 			goto fini;
13488 		} else if (job_specs->min_nodes == job_ptr->node_cnt) {
13489 			debug2("%s: No change in node count update for %pJ",
13490 			       __func__, job_ptr);
13491 		} else if (!permit_job_shrink()) {
13492 			error("%s: request to shrink %pJ denied by configuration",
13493 			      __func__, job_ptr);
13494 			error_code = ESLURM_NOT_SUPPORTED;
13495 			goto fini;
13496 		} else {
13497 			int i, i_first, i_last, total = 0;
13498 			node_record_t *node_ptr;
13499 			bitstr_t *rem_nodes, *tmp_nodes;
13500 			sched_info("%s: set node count to %u for %pJ", __func__,
13501 				   job_specs->min_nodes, job_ptr);
13502 			job_pre_resize_acctg(job_ptr);
13503 
13504 			/*
13505 			 * Don't remove the batch host from the job. The batch
13506 			 * host isn't guaranteed to be the first bit set in
13507 			 * job_ptr->node_bitmap because the batch host can be
13508 			 * selected with the --batch and --constraint sbatch
13509 			 * flags.
13510 			 */
13511 			tmp_nodes = bit_copy(job_ptr->node_bitmap);
13512 			if (job_ptr->batch_host) {
13513 				bitstr_t *batch_host_bitmap;
13514 				if (node_name2bitmap(job_ptr->batch_host, false,
13515 						     &batch_host_bitmap))
13516 					error("%s: Invalid batch host %s for %pJ; this should never happen",
13517 					      __func__, job_ptr->batch_host,
13518 					      job_ptr);
13519 				else {
13520 					bit_and_not(tmp_nodes,
13521 						    batch_host_bitmap);
13522 					bit_free(batch_host_bitmap);
13523 					/*
13524 					 * Set total to 1 since we're
13525 					 * guaranteeing that we won't remove the
13526 					 * batch host.
13527 					 */
13528 					total = 1;
13529 				}
13530 			}
13531 
13532 			i_first = bit_ffs(tmp_nodes);
13533 			if (i_first >= 0)
13534 				i_last  = bit_fls(tmp_nodes);
13535 			else
13536 				i_last = -2;
13537 			rem_nodes = bit_alloc(bit_size(tmp_nodes));
13538 			for (i = i_first; i <= i_last; i++) {
13539 				if (!bit_test(tmp_nodes, i))
13540 					continue;
13541 				if (++total <= job_specs->min_nodes)
13542 					continue;
13543 				bit_set(rem_nodes, i);
13544 			}
13545 #ifndef HAVE_FRONT_END
13546 			abort_job_on_nodes(job_ptr, rem_nodes);
13547 #endif
13548 			for (i = i_first, total = 0; i <= i_last; i++) {
13549 				if (!bit_test(rem_nodes, i))
13550 					continue;
13551 				node_ptr = node_record_table_ptr + i;
13552 				kill_step_on_node(job_ptr, node_ptr, false);
13553 				excise_node_from_job(job_ptr, node_ptr);
13554 			}
13555 			bit_free(rem_nodes);
13556 			bit_free(tmp_nodes);
13557 			(void) gs_job_start(job_ptr);
13558 			job_post_resize_acctg(job_ptr);
13559 			sched_info("%s: set nodes to %s for %pJ",
13560 				   __func__, job_ptr->nodes, job_ptr);
			/*
			 * Since job_post_resize_acctg() will restart
			 * things, don't do it again.
			 */
13565 			update_accounting = false;
13566 		}
13567 		gres_build_job_details(job_ptr->gres_list,
13568 				       &job_ptr->gres_detail_cnt,
13569 				       &job_ptr->gres_detail_str,
13570 				       &job_ptr->gres_used);
13571 	}
13572 
13573 	if (job_specs->ntasks_per_node != NO_VAL16) {
13574 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13575 			error_code = ESLURM_JOB_NOT_PENDING;
13576 		else if (operator) {
13577 			detail_ptr->ntasks_per_node =
13578 				job_specs->ntasks_per_node;
13579 			sched_info("%s: setting ntasks_per_node to %u for %pJ",
13580 				   __func__, job_specs->ntasks_per_node, job_ptr);
13581 		} else {
13582 			sched_error("%s: Not super user: ignore ntasks_per_node change for job %pJ",
13583 				    __func__, job_ptr);
13584 			error_code = ESLURM_ACCESS_DENIED;
13585 		}
13586 	}
13587 	if (error_code != SLURM_SUCCESS)
13588 		goto fini;
13589 
13590 	if (job_specs->ntasks_per_socket != NO_VAL16) {
13591 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
13592 		    (detail_ptr->mc_ptr == NULL)) {
13593 			error_code = ESLURM_JOB_NOT_PENDING;
13594 		} else if (operator) {
13595 			detail_ptr->mc_ptr->ntasks_per_socket =
13596 				job_specs->ntasks_per_socket;
13597 			sched_info("%s: setting ntasks_per_socket to %u for %pJ",
13598 				   __func__, job_specs->ntasks_per_socket,
13599 				   job_ptr);
13600 		} else {
13601 			sched_error("%s: Not super user: ignore ntasks_per_socket change for %pJ",
13602 				    __func__, job_ptr);
13603 			error_code = ESLURM_ACCESS_DENIED;
13604 		}
13605 	}
13606 	if (error_code != SLURM_SUCCESS)
13607 		goto fini;
13608 
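	/*
	 * Illustrative dependency strings handled by update_job_dependency()
	 * below (standard Slurm syntax; the job IDs are examples):
	 * "afterok:1234", "afterok:1234,afterany:1235" (',' = all must be
	 * satisfied) or "afterok:1234?afterany:1235" ('?' = any one may be
	 * satisfied).
	 */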
13609 	if (job_specs->dependency) {
13610 		/* Can't update dependency of revoked job */
13611 		if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL) ||
13612 		    IS_JOB_REVOKED(job_ptr))
13613 			error_code = ESLURM_JOB_NOT_PENDING;
13614 		else if (!fed_mgr_is_origin_job(job_ptr)) {
13615 			/*
13616 			 * If the job became independent because of a dependency
13617 			 * update, that job gets requeued on siblings and then
13618 			 * the dependency update gets sent to siblings. So we
13619 			 * silently ignore this update on the sibling.
13620 			 */
13621 		} else {
13622 			int rc;
13623 			rc = update_job_dependency(job_ptr,
13624 						   job_specs->dependency);
13625 			if (rc != SLURM_SUCCESS)
13626 				error_code = rc;
13627 			/*
13628 			 * Because the dependencies were updated and we don't
13629 			 * know what they used to be, send the new dependencies
13630 			 * to all siblings so they can update their lists.
13631 			 */
13632 			else {
13633 				rc = fed_mgr_submit_remote_dependencies(job_ptr,
13634 									true,
13635 									false);
13636 				if (rc) {
13637 					error("%s: %pJ Failed to send remote dependencies to some or all siblings.",
13638 					      __func__, job_ptr);
13639 					error_code = rc;
13640 				}
13641 				/*
13642 				 * Even if we fail to send remote dependencies,
13643 				 * we already succeeded in updating the job's
13644 				 * dependency locally, so we still need to
13645 				 * do these things.
13646 				 */
13647 				job_ptr->details->orig_dependency =
13648 					xstrdup(job_ptr->details->dependency);
13649 				sched_info("%s: setting dependency to %s for %pJ",
13650 					   __func__,
13651 					   job_ptr->details->dependency,
13652 					   job_ptr);
13653 				/*
13654 				 * If the job isn't independent, remove pending
13655 				 * remote sibling jobs
13656 				 */
13657 				if (!job_independent(job_ptr))
13658 					fed_mgr_job_revoke_sibs(job_ptr);
13659 			}
13660 		}
13661 	}
13662 	if (error_code != SLURM_SUCCESS)
13663 		goto fini;
13664 
13665 	if (job_specs->begin_time) {
13666 		if (IS_JOB_PENDING(job_ptr) && detail_ptr) {
13667 			char time_str[32];
13668 			/*
13669 			 * Make sure this time is current; it does no good for
13670 			 * accounting to say this job could have started before
13671 			 * now.
13672 			 */
13673 			if (job_specs->begin_time < now)
13674 				job_specs->begin_time = now;
13675 
13676 			if (detail_ptr->begin_time != job_specs->begin_time) {
13677 				detail_ptr->begin_time = job_specs->begin_time;
13678 				update_accounting = true;
13679 				slurm_make_time_str(&detail_ptr->begin_time,
13680 						    time_str, sizeof(time_str));
13681 				sched_info("%s: setting begin to %s for %pJ",
13682 					   __func__, time_str, job_ptr);
13683 				acct_policy_remove_accrue_time(job_ptr, false);
13684 			} else
13685 				sched_debug("%s: new begin time identical to old begin time %pJ",
13686 					    __func__, job_ptr);
13687 		} else {
13688 			error_code = ESLURM_JOB_NOT_PENDING;
13689 			goto fini;
13690 		}
13691 	}
13692 
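	/*
	 * Illustrative only: a license change typically arrives via
	 * "scontrol update JobId=<id> Licenses=<name>:<count>" (e.g.
	 * Licenses=matlab:2). For a running job the old licenses are
	 * returned and the new ones acquired below, which can temporarily
	 * oversubscribe a license.
	 */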
13693 	if (valid_licenses) {
13694 		if (IS_JOB_PENDING(job_ptr)) {
13695 			FREE_NULL_LIST(job_ptr->license_list);
13696 			job_ptr->license_list = license_list;
13697 			license_list = NULL;
13698 			sched_info("%s: changing licenses from '%s' to '%s' for pending %pJ",
13699 				   __func__, job_ptr->licenses,
13700 				   job_specs->licenses, job_ptr);
13701 			xfree(job_ptr->licenses);
13702 			job_ptr->licenses = xstrdup(job_specs->licenses);
13703 		} else if (IS_JOB_RUNNING(job_ptr)) {
13704 			/*
13705 			 * Operators can modify license counts on running jobs,
13706 			 * regular users can only completely remove license
13707 			 * counts on running jobs.
13708 			 */
13709 			if (!operator && license_list) {
13710 				sched_error("%s: Not operator user: ignore licenses change for %pJ",
13711 					    __func__, job_ptr);
13712 				error_code = ESLURM_ACCESS_DENIED;
13713 				goto fini;
13714 			}
13715 
13716 			/*
13717 			 * NOTE: This can result in oversubscription of
13718 			 * licenses
13719 			 */
13720 			license_job_return(job_ptr);
13721 			FREE_NULL_LIST(job_ptr->license_list);
13722 			job_ptr->license_list = license_list;
13723 			license_list = NULL;
13724 			sched_info("%s: changing licenses from '%s' to '%s' for running %pJ",
13725 				   __func__, job_ptr->licenses,
13726 				   job_specs->licenses, job_ptr);
13727 			xfree(job_ptr->licenses);
13728 			job_ptr->licenses = xstrdup(job_specs->licenses);
13729 			license_job_get(job_ptr);
13730 		} else {
13731 			/*
13732 			 * licenses are valid, but job state or user not
13733 			 * allowed to make changes
13734 			 */
13735 			sched_info("%s: could not change licenses for %pJ",
13736 				   __func__, job_ptr);
13737 			error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
13738 			FREE_NULL_LIST(license_list);
13739 		}
13740 
13741 		update_accounting = true;
13742 	}
13743 	if (error_code != SLURM_SUCCESS)
13744 		goto fini;
13745 
13746 	fail_reason = job_limits_check(&job_ptr, false);
13747 	if (fail_reason != WAIT_NO_REASON) {
13748 		if (fail_reason == WAIT_QOS_THRES)
13749 			error_code = ESLURM_QOS_THRES;
13750 		else if ((fail_reason == WAIT_PART_TIME_LIMIT) ||
13751 			 (fail_reason == WAIT_PART_NODE_LIMIT) ||
13752 			 (fail_reason == WAIT_PART_DOWN) ||
13753 			 (fail_reason == WAIT_HELD))
13754 			error_code = SLURM_SUCCESS;
13755 		else
13756 			error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
13757 
13758 		if (error_code != SLURM_SUCCESS) {
13759 			if ((job_ptr->state_reason != WAIT_HELD) &&
13760 			    (job_ptr->state_reason != WAIT_HELD_USER) &&
13761 			    (job_ptr->state_reason != WAIT_RESV_DELETED)) {
13762 				job_ptr->state_reason = fail_reason;
13763 				xfree(job_ptr->state_desc);
13764 			}
13765 			goto fini;
13766 		}
13767 	} else if ((job_ptr->state_reason != WAIT_HELD)
13768 		   && (job_ptr->state_reason != WAIT_HELD_USER)
13769 		   && (job_ptr->state_reason != WAIT_RESV_DELETED)
13770 		   /*
13771 		    * A job update can come while the prolog is running.
13772 		    * Don't change state_reason if the prolog is running.
13773 		    * _is_prolog_finished() relies on state_reason==WAIT_PROLOG
13774 		    * to know if the prolog is running. If we change it here,
13775 		    * then slurmctld will think that the prolog isn't running
13776 		    * anymore and _slurm_rpc_job_ready will tell srun that the
13777 		    * prolog is done even if it isn't. Then srun can launch a
13778 		    * job step before the prolog is done, which breaks the
13779 		    * behavior of PrologFlags=alloc and means that the job step
13780 		    * could launch before the extern step sets up x11.
13781 		    */
13782 		   && (job_ptr->state_reason != WAIT_PROLOG)
13783 		   && (job_ptr->state_reason != WAIT_MAX_REQUEUE)) {
13784 		job_ptr->state_reason = WAIT_NO_REASON;
13785 		xfree(job_ptr->state_desc);
13786 	}
13787 
13788 	if (job_specs->reboot != NO_VAL16) {
13789 		if (!validate_super_user(uid)) {
13790 			error("%s: Attempt to change reboot for %pJ",
13791 			      __func__, job_ptr);
13792 			error_code = ESLURM_ACCESS_DENIED;
13793 		} else if (!IS_JOB_PENDING(job_ptr)) {
13794 			error_code = ESLURM_JOB_NOT_PENDING;
13795 			goto fini;
13796 		} else {
13797 			sched_info("%s: setting reboot to %u for %pJ",
13798 				   __func__, job_specs->reboot, job_ptr);
13799 			if (job_specs->reboot == 0)
13800 				job_ptr->reboot = 0;
13801 			else
13802 				job_ptr->reboot = MAX(1, job_specs->reboot);
13803 		}
13804 	}
13805 
13806 	if (job_specs->network && !xstrcmp(job_specs->network,
13807 					   job_ptr->network)) {
13808 		sched_debug("%s: new network identical to old network %s",
13809 			    __func__, job_ptr->network);
13810 	} else if (job_specs->network) {
13811 		xfree(job_ptr->network);
13812 		if (!strlen(job_specs->network)
13813 		    || !xstrcmp(job_specs->network, "none")) {
13814 			sched_info("%s: clearing Network option for %pJ",
13815 				   __func__, job_ptr);
13816 		} else {
13817 			job_ptr->network = xstrdup(job_specs->network);
13818 			sched_info("%s: setting Network to %s for %pJ",
13819 				   __func__, job_ptr->network, job_ptr);
13820 			select_g_select_jobinfo_set(
13821 				job_ptr->select_jobinfo,
13822 				SELECT_JOBDATA_NETWORK,
13823 				job_ptr->network);
13824 		}
13825 	}
13826 
13827 	if (job_specs->fed_siblings_viable) {
13828 		if (!job_ptr->fed_details) {
13829 			error_code = ESLURM_JOB_NOT_FEDERATED;
13830 			goto fini;
13831 		}
13832 
13833 		info("%s: setting fed_siblings from %"PRIu64" to %"PRIu64" for %pJ",
13834 		     __func__, job_ptr->fed_details->siblings_viable,
13835 		     job_specs->fed_siblings_viable, job_ptr);
13836 
13837 		job_ptr->fed_details->siblings_viable =
13838 			job_specs->fed_siblings_viable;
13839 		update_job_fed_details(job_ptr);
13840 	}
13841 
13842 	if (job_specs->cpus_per_tres) {
13843 		if (!valid_tres_cnt(job_specs->cpus_per_tres)) {
13844 			error_code = ESLURM_INVALID_TRES;
13845 			goto fini;
13846 		}
13847 		xfree(job_ptr->cpus_per_tres);
13848 		if (!strlen(job_specs->cpus_per_tres)) {
13849 			sched_info("%s: clearing CpusPerTres option for %pJ",
13850 				   __func__, job_ptr);
13851 		} else {
13852 			job_ptr->cpus_per_tres =
13853 				xstrdup(job_specs->cpus_per_tres);
13854 			sched_info("%s: setting CpusPerTres to %s for %pJ",
13855 				   __func__, job_ptr->cpus_per_tres, job_ptr);
13856 		}
13857 	}
13858 
13859 	if (job_specs->mem_per_tres) {
13860 		if (!valid_tres_cnt(job_specs->mem_per_tres)) {
13861 			error_code = ESLURM_INVALID_TRES;
13862 			goto fini;
13863 		}
13864 		xfree(job_ptr->mem_per_tres);
13865 		if (!strlen(job_specs->mem_per_tres)) {
13866 			sched_info("%s: clearing MemPerTres option for %pJ",
13867 				   __func__, job_ptr);
13868 		} else {
13869 			job_ptr->mem_per_tres =
13870 				xstrdup(job_specs->mem_per_tres);
13871 			sched_info("%s: setting MemPerTres to %s for %pJ",
13872 				   __func__, job_ptr->mem_per_tres, job_ptr);
13873 		}
13874 	}
13875 
13876 	if (job_specs->tres_bind) {
13877 		if (tres_bind_verify_cmdline(job_specs->tres_bind)) {
13878 			error_code = ESLURM_INVALID_TRES;
13879 			goto fini;
13880 		}
13881 		xfree(job_ptr->tres_bind);
13882 		if (!strlen(job_specs->tres_bind)) {
13883 			sched_info("%s: clearing TresBind option for %pJ",
13884 				   __func__, job_ptr);
13885 		} else {
13886 			job_ptr->tres_bind = xstrdup(job_specs->tres_bind);
13887 			sched_info("%s: setting TresBind to %s for %pJ",
13888 				   __func__, job_ptr->tres_bind, job_ptr);
13889 		}
13890 	}
13891 
13892 	if (job_specs->tres_freq) {
13893 		if (tres_freq_verify_cmdline(job_specs->tres_freq)) {
13894 			error_code = ESLURM_INVALID_TRES;
13895 			goto fini;
13896 		}
13897 		xfree(job_ptr->tres_freq);
13898 		if (!strlen(job_specs->tres_freq)) {
13899 			sched_info("%s: clearing TresFreq option for %pJ",
13900 				   __func__, job_ptr);
13901 		} else {
13902 			job_ptr->tres_freq = xstrdup(job_specs->tres_freq);
13903 			sched_info("%s: setting TresFreq to %s for %pJ",
13904 				   __func__, job_ptr->tres_freq, job_ptr);
13905 		}
13906 	}
13907 
13908 	if (job_specs->tres_per_job) {
13909 		if (!valid_tres_cnt(job_specs->tres_per_job)) {
13910 			error_code = ESLURM_INVALID_TRES;
13911 			goto fini;
13912 		}
13913 		xfree(job_ptr->tres_per_job);
13914 		if (!strlen(job_specs->tres_per_job)) {
13915 			sched_info("%s: clearing TresPerJob option for %pJ",
13916 				   __func__, job_ptr);
13917 		} else {
13918 			job_ptr->tres_per_job =
13919 					xstrdup(job_specs->tres_per_job);
13920 			sched_info("%s: setting TresPerJob to %s for %pJ",
13921 				   __func__, job_ptr->tres_per_job, job_ptr);
13922 		}
13923 	}
13924 	if (job_specs->tres_per_node) {
13925 		if (!valid_tres_cnt(job_specs->tres_per_node)) {
13926 			error_code = ESLURM_INVALID_TRES;
13927 			goto fini;
13928 		}
13929 		xfree(job_ptr->tres_per_node);
13930 		if (!strlen(job_specs->tres_per_node)) {
13931 			sched_info("%s: clearing TresPerNode option for %pJ",
13932 				   __func__, job_ptr);
13933 		} else {
13934 			job_ptr->tres_per_node =
13935 					xstrdup(job_specs->tres_per_node);
13936 			sched_info("%s: setting TresPerNode to %s for %pJ",
13937 				   __func__, job_ptr->tres_per_node, job_ptr);
13938 		}
13939 	}
13940 
13941 	if (job_specs->tres_per_socket) {
13942 		if (!valid_tres_cnt(job_specs->tres_per_socket)) {
13943 			error_code = ESLURM_INVALID_TRES;
13944 			goto fini;
13945 		}
13946 		xfree(job_ptr->tres_per_socket);
13947 		if (!strlen(job_specs->tres_per_socket)) {
13948 			sched_info("%s: clearing TresPerSocket option for %pJ",
13949 				   __func__, job_ptr);
13950 		} else {
13951 			job_ptr->tres_per_socket =
13952 				xstrdup(job_specs->tres_per_socket);
13953 			sched_info("%s: setting TresPerSocket to %s for %pJ",
13954 				   __func__, job_ptr->tres_per_socket, job_ptr);
13955 		}
13956 	}
13957 
13958 	if (job_specs->tres_per_task) {
13959 		if (!valid_tres_cnt(job_specs->tres_per_task)) {
13960 			error_code = ESLURM_INVALID_TRES;
13961 			goto fini;
13962 		}
13963 		xfree(job_ptr->tres_per_task);
13964 		if (!strlen(job_specs->tres_per_task)) {
13965 			sched_info("%s: clearing TresPerTask option for %pJ",
13966 				   __func__, job_ptr);
13967 		} else {
13968 			job_ptr->tres_per_task =
13969 				xstrdup(job_specs->tres_per_task);
13970 			sched_info("%s: setting TresPerTask to %s for %pJ",
13971 				   __func__, job_ptr->tres_per_task, job_ptr);
13972 		}
13973 	}
13974 
13975 	if (job_specs->mail_type != NO_VAL16) {
13976 		job_ptr->mail_type = job_specs->mail_type;
13977 		sched_info("%s: setting mail_type to %u for %pJ",
13978 			   __func__, job_ptr->mail_type, job_ptr);
13979 	}
13980 
13981 	if (job_specs->mail_user) {
13982 		xfree(job_ptr->mail_user);
13983 		job_ptr->mail_user = _get_mail_user(job_specs->mail_user,
13984 						    job_ptr->user_id);
13985 		sched_info("%s: setting mail_user to %s for %pJ",
13986 			   __func__, job_ptr->mail_user, job_ptr);
13987 	}
13988 
13989 	/*
13990 	 * The job submit plugin sets site_factor to NO_VAL before calling
13991 	 * the plugin to prevent the user from specifying it.
13992 	 */
13993 	if (user_site_factor != NO_VAL) {
13994 		if (!operator) {
13995 			error("%s: Attempt to change SiteFactor for %pJ",
13996 			      __func__, job_ptr);
13997 			error_code = ESLURM_ACCESS_DENIED;
13998 			job_specs->site_factor = NO_VAL;
13999 		} else
14000 			job_specs->site_factor = user_site_factor;
14001 	}
14002 	if (job_specs->site_factor != NO_VAL) {
14003 		sched_info("%s: setting AdminPrioFactor to %u for %pJ",
14004 			   __func__, job_specs->site_factor, job_ptr);
14005 		job_ptr->site_factor = job_specs->site_factor;
14006 	}
14007 
14008 fini:
14009 	FREE_NULL_BITMAP(new_req_bitmap);
14010 	FREE_NULL_LIST(part_ptr_list);
14011 
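	/*
	 * Illustrative only: TRES_ARRAY_BILLING below is recomputed from the
	 * partition's TRESBillingWeights. For example, with
	 * TRESBillingWeights="CPU=1.0,Mem=0.25G" a request for 4 CPUs and
	 * 8 GB of memory bills 4*1.0 + 8*0.25 = 6 under the default summing
	 * behavior (PriorityFlags can alter the calculation).
	 */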
14012 	if ((error_code == SLURM_SUCCESS) && tres_req_cnt_set) {
14013 		for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) {
14014 			if (!tres_req_cnt[tres_pos] ||
14015 			    (tres_req_cnt[tres_pos] ==
14016 			     job_ptr->tres_req_cnt[tres_pos]))
14017 				continue;
14018 
14019 			job_ptr->tres_req_cnt[tres_pos] =
14020 				tres_req_cnt[tres_pos];
14021 			tres_changed = true;
14022 		}
14023 		if (tres_changed) {
14024 			job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
14025 				assoc_mgr_tres_weighted(
14026 					job_ptr->tres_req_cnt,
14027 					job_ptr->part_ptr->billing_weights,
14028 					slurmctld_conf.priority_flags,
14029 					false);
14030 			set_job_tres_req_str(job_ptr, false);
14031 			update_accounting = true;
14032 			job_ptr->node_cnt_wag = 0;
14033 		}
14034 	}
14035 
14036 	/* This was a local variable, so set it back to NULL */
14037 	job_specs->tres_req_cnt = NULL;
14038 
14039 	FREE_NULL_LIST(gres_list);
14040 	FREE_NULL_LIST(license_list);
14041 	if (update_accounting) {
14042 		info("%s: updating accounting",  __func__);
14043 		/* Update job record in accounting to reflect changes */
14044 		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
14045 	}
14046 
14047 	/*
14048 	 * If the job isn't held, recalculate its priority unless
14049 	 * priority/basic is in use, since many aspects of an update may
14050 	 * affect priority. Do this whether or not the update was
14051 	 * successful.
14052 	 */
14053 	if ((job_ptr->priority != 0) &&
14054 	    xstrcmp(slurmctld_conf.priority_type, "priority/basic"))
14055 		set_job_prio(job_ptr);
14056 
14057 	if ((error_code == SLURM_SUCCESS) &&
14058 	    fed_mgr_fed_rec &&
14059 	    job_ptr->fed_details && fed_mgr_is_origin_job(job_ptr)) {
14060 		/* Send updates to sibling jobs */
14061 		/* Add the siblings_active to be updated. They could have been
14062 		 * updated if the job's ClusterFeatures were updated. */
14063 		job_specs->fed_siblings_viable =
14064 			job_ptr->fed_details->siblings_viable;
14065 		fed_mgr_update_job(job_ptr->job_id, job_specs,
14066 				   job_ptr->fed_details->siblings_active, uid);
14067 	}
14068 
14069 	return error_code;
14070 }
14071 
14072 /*
14073  * update_job - update a job's parameters per the supplied specifications
14074  * IN msg - RPC to update job, including change specification
14075  * IN uid - uid of user issuing RPC
14076  * IN send_msg - whether to send msg back or not
14077  * RET returns an error code from slurm_errno.h
14078  * global: job_list - global list of job entries
14079  *	last_job_update - time of last job table update
14080  */
14081 extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg)
14082 {
14083 	job_desc_msg_t *job_specs = (job_desc_msg_t *) msg->data;
14084 	job_record_t *job_ptr;
14085 	char *hostname = g_slurm_auth_get_host(msg->auth_cred);
14086 	int rc;
14087 
14088 	xfree(job_specs->job_id_str);
14089 	xstrfmtcat(job_specs->job_id_str, "%u", job_specs->job_id);
14090 
14091 	if (hostname) {
14092 		xfree(job_specs->alloc_node);
14093 		job_specs->alloc_node = hostname;
14094 	}
14095 
14096 	job_ptr = find_job_record(job_specs->job_id);
14097 	if (job_ptr == NULL) {
14098 		info("%s: JobId=%u does not exist",
14099 		     __func__, job_specs->job_id);
14100 		rc = ESLURM_INVALID_JOB_ID;
14101 	} else {
14102 		if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap)
14103 			job_specs->array_bitmap =
14104 				bit_copy(job_ptr->array_recs->task_id_bitmap);
14105 
14106 		rc = _update_job(job_ptr, job_specs, uid);
14107 	}
14108 	if (send_msg && rc != ESLURM_JOB_SETTING_DB_INX)
14109 		slurm_send_rc_msg(msg, rc);
14110 	xfree(job_specs->job_id_str);
14111 
14112 	return rc;
14113 }
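
/*
 * Usage sketch (illustrative only, not called from this file): given an
 * authenticated REQUEST_UPDATE_JOB message and the caller's uid, the update
 * is applied and the reply sent in one call:
 *
 *	int rc = update_job(msg, uid, true);
 *
 * rc is an error code from slurm_errno.h; no reply is sent while the DB
 * index is still being assigned (ESLURM_JOB_SETTING_DB_INX).
 */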
14114 
14115 /*
14116  * IN msg - RPC to update job, including change specification
14117  * IN job_specs - a job's specification
14118  * IN uid - uid of user issuing RPC
14119  * RET returns an error code from slurm_errno.h
14120  * global: job_list - global list of job entries
14121  *	last_job_update - time of last job table update
14122  */
14123 extern int update_job_str(slurm_msg_t *msg, uid_t uid)
14124 {
14125 
14126 	slurm_msg_t resp_msg;
14127 	job_desc_msg_t *job_specs = (job_desc_msg_t *) msg->data;
14128 	job_record_t *job_ptr, *new_job_ptr, *het_job;
14129 	char *hostname = g_slurm_auth_get_host(msg->auth_cred);
14130 	ListIterator iter;
14131 	long int long_id;
14132 	uint32_t job_id = 0, het_job_offset;
14133 	bitstr_t *array_bitmap = NULL, *tmp_bitmap;
14134 	bool valid = true;
14135 	int32_t i, i_first, i_last;
14136 	int len, rc = SLURM_SUCCESS, rc2;
14137 	char *end_ptr, *tok, *tmp = NULL;
14138 	char *job_id_str;
14139 	resp_array_struct_t *resp_array = NULL;
14140 	job_array_resp_msg_t *resp_array_msg = NULL;
14141 	return_code_msg_t rc_msg;
14142 
14143 	job_id_str = job_specs->job_id_str;
14144 
14145 	if (hostname) {
14146 		xfree(job_specs->alloc_node);
14147 		job_specs->alloc_node = hostname;
14148 
14149 	}
14150 
14151 	if (max_array_size == NO_VAL)
14152 		max_array_size = slurmctld_conf.max_array_sz;
14153 
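	/*
	 * job_id_str forms handled below (IDs are examples):
	 *   "1234"          whole job, full job array, or entire hetjob
	 *   "1234+2"        hetjob component at offset 2
	 *   "1234_<tasks>"  array tasks, parsed as comma-separated tokens by
	 *                   _parse_array_tok() (e.g. "1234_3,5-9")
	 */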
14154 	long_id = strtol(job_id_str, &end_ptr, 10);
14155 	if ((long_id <= 0) || (long_id == LONG_MAX) ||
14156 	    ((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
14157 	     (end_ptr[0] != '+'))) {
14158 		info("%s: invalid JobId=%s", __func__, job_id_str);
14159 		rc = ESLURM_INVALID_JOB_ID;
14160 		goto reply;
14161 	}
14162 	job_id = (uint32_t) long_id;
14163 	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
14164 		job_record_t *job_ptr_done = NULL;
14165 		job_ptr = find_job_record(job_id);
14166 		if (job_ptr && job_ptr->het_job_list) {
14167 			iter = list_iterator_create(job_ptr->het_job_list);
14168 			while ((het_job = list_next(iter))) {
14169 				if (job_ptr->het_job_id !=
14170 				    het_job->het_job_id) {
14171 					error("%s: Bad het_job_list for %pJ",
14172 					      __func__, job_ptr);
14173 					continue;
14174 				}
14175 				rc = _update_job(het_job, job_specs, uid);
14176 			}
14177 			list_iterator_destroy(iter);
14178 			goto reply;
14179 		}
14180 		if (job_ptr &&
14181 		    (((job_ptr->array_task_id == NO_VAL) &&
14182 		      (job_ptr->array_recs == NULL)) ||
14183 		     ((job_ptr->array_task_id != NO_VAL) &&
14184 		      (job_ptr->array_job_id  != job_id)))) {
14185 			/* This is a regular job or single task of job array */
14186 			rc = _update_job(job_ptr, job_specs, uid);
14187 			goto reply;
14188 		}
14189 
14190 		if (job_ptr && job_ptr->array_recs) {
14191 			/* This is a job array */
14192 			job_ptr_done = job_ptr;
14193 			if (job_ptr->array_recs->task_id_bitmap)
14194 				job_specs->array_bitmap = bit_copy(
14195 					job_ptr->array_recs->task_id_bitmap);
14196 			rc2 = _update_job(job_ptr, job_specs, uid);
14197 			if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
14198 				rc = rc2;
14199 				goto reply;
14200 			}
14201 			_resp_array_add(&resp_array, job_ptr, rc2);
14202 		}
14203 
14204 		/* Update all tasks of this job array */
14205 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
14206 		if (!job_ptr && !job_ptr_done) {
14207 			info("%s: invalid JobId=%u", __func__, job_id);
14208 			rc = ESLURM_INVALID_JOB_ID;
14209 			goto reply;
14210 		}
14211 		while (job_ptr) {
14212 			if ((job_ptr->array_job_id == job_id) &&
14213 			    (job_ptr != job_ptr_done)) {
14214 				rc2 = _update_job(job_ptr, job_specs, uid);
14215 				if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
14216 					rc = rc2;
14217 					goto reply;
14218 				}
14219 				_resp_array_add(&resp_array, job_ptr, rc2);
14220 			}
14221 			job_ptr = job_ptr->job_array_next_j;
14222 		}
14223 		goto reply;
14224 	} else if (end_ptr[0] == '+') {	/* Hetjob element */
14225 		long_id = strtol(end_ptr+1, &tmp, 10);
14226 		if ((long_id < 0) || (long_id == LONG_MAX) ||
14227 		    (tmp[0] != '\0')) {
14228 			info("%s: invalid JobId=%s", __func__, job_id_str);
14229 			rc = ESLURM_INVALID_JOB_ID;
14230 			goto reply;
14231 		}
14232 		het_job_offset = (uint32_t) long_id;
14233 		job_ptr = find_het_job_record(job_id, het_job_offset);
14234 		if (!job_ptr) {
14235 			info("%s: invalid JobId=%u", __func__, job_id);
14236 			rc = ESLURM_INVALID_JOB_ID;
14237 			goto reply;
14238 		}
14239 		rc = _update_job(job_ptr, job_specs, uid);
14240 		goto reply;
14241 	}
14242 
14243 	array_bitmap = bit_alloc(max_array_size);
14244 	tmp = xstrdup(end_ptr + 1);
14245 	tok = strtok_r(tmp, ",", &end_ptr);
14246 	while (tok && valid) {
14247 		valid = _parse_array_tok(tok, array_bitmap,
14248 					 max_array_size);
14249 		tok = strtok_r(NULL, ",", &end_ptr);
14250 	}
14251 	xfree(tmp);
14252 	if (valid) {
14253 		i_last = bit_fls(array_bitmap);
14254 		if (i_last < 0)
14255 			valid = false;
14256 	}
14257 	if (!valid) {
14258 		info("%s: invalid JobId=%s", __func__, job_id_str);
14259 		rc = ESLURM_INVALID_JOB_ID;
14260 		goto reply;
14261 	}
14262 
14263 	job_ptr = find_job_record(job_id);
14264 	if (job_ptr && IS_JOB_PENDING(job_ptr) &&
14265 	    job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
14266 		/* Ensure bitmap sizes match for AND operations */
14267 		len = bit_size(job_ptr->array_recs->task_id_bitmap);
14268 		i_last++;
14269 		if (i_last < len) {
14270 			array_bitmap = bit_realloc(array_bitmap, len);
14271 		} else {
14272 			array_bitmap = bit_realloc(array_bitmap, i_last);
14273 			job_ptr->array_recs->task_id_bitmap = bit_realloc(
14274 				job_ptr->array_recs->task_id_bitmap, i_last);
14275 		}
14276 		if (!bit_overlap_any(job_ptr->array_recs->task_id_bitmap,
14277 				     array_bitmap)) {
14278 			/* Nothing to do with this job record */
14279 		} else if (bit_super_set(job_ptr->array_recs->task_id_bitmap,
14280 					 array_bitmap)) {
14281 			/* Update the record with all pending tasks */
14282 			job_specs->array_bitmap =
14283 				bit_copy(job_ptr->array_recs->task_id_bitmap);
14284 			rc2 = _update_job(job_ptr, job_specs, uid);
14285 			if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
14286 				rc = rc2;
14287 				goto reply;
14288 			}
14289 			_resp_array_add(&resp_array, job_ptr, rc2);
14290 			bit_and_not(array_bitmap, job_specs->array_bitmap);
14291 		} else {
14292 			/* Need to split out tasks to separate job records */
14293 			tmp_bitmap = bit_copy(job_ptr->array_recs->
14294 					      task_id_bitmap);
14295 			bit_and(tmp_bitmap, array_bitmap);
14296 			i_first = bit_ffs(tmp_bitmap);
14297 			if (i_first >= 0)
14298 				i_last = bit_fls(tmp_bitmap);
14299 			else
14300 				i_last = -2;
14301 			for (i = i_first; i <= i_last; i++) {
14302 				if (!bit_test(tmp_bitmap, i))
14303 					continue;
14304 				job_ptr->array_task_id = i;
14305 				new_job_ptr = job_array_split(job_ptr);
14306 				if (!new_job_ptr) {
14307 					error("%s: Unable to copy record for %pJ",
14308 					      __func__, job_ptr);
14309 				} else {
14310 					/* The array_recs structure is moved
14311 					 * to the new job record copy */
14312 					bb_g_job_validate2(job_ptr, NULL);
14313 					job_ptr = new_job_ptr;
14314 				}
14315 			}
14316 			FREE_NULL_BITMAP(tmp_bitmap);
14317 		}
14318 	}
14319 
14320 	i_first = bit_ffs(array_bitmap);
14321 	if (i_first >= 0)
14322 		i_last = bit_fls(array_bitmap);
14323 	else
14324 		i_last = -2;
14325 	for (i = i_first; i <= i_last; i++) {
14326 		if (!bit_test(array_bitmap, i))
14327 			continue;
14328 		job_ptr = find_job_array_rec(job_id, i);
14329 		if (job_ptr == NULL) {
14330 			info("%s: invalid JobId=%u_%d", __func__, job_id, i);
14331 			_resp_array_add_id(&resp_array, job_id, i,
14332 					   ESLURM_INVALID_JOB_ID);
14333 			continue;
14334 		}
14335 
14336 		rc2 = _update_job(job_ptr, job_specs, uid);
14337 		if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
14338 			rc = rc2;
14339 			goto reply;
14340 		}
14341 		_resp_array_add(&resp_array, job_ptr, rc2);
14342 	}
14343 
14344 reply:
14345 	if ((rc != ESLURM_JOB_SETTING_DB_INX) && (msg->conn_fd >= 0)) {
14346 		slurm_msg_t_init(&resp_msg);
14347 		resp_msg.protocol_version = msg->protocol_version;
14348 		if (resp_array) {
14349 			resp_array_msg = _resp_array_xlate(resp_array, job_id);
14350 			resp_msg.msg_type  = RESPONSE_JOB_ARRAY_ERRORS;
14351 			resp_msg.data      = resp_array_msg;
14352 		} else {
14353 			resp_msg.msg_type  = RESPONSE_SLURM_RC;
14354 			rc_msg.return_code = rc;
14355 			resp_msg.data      = &rc_msg;
14356 		}
14357 		resp_msg.conn = msg->conn;
14358 		slurm_send_node_msg(msg->conn_fd, &resp_msg);
14359 
14360 		if (resp_array_msg) {
14361 			slurm_free_job_array_resp(resp_array_msg);
14362 			resp_msg.data = NULL;
14363 		}
14364 	}
14365 	_resp_array_free(resp_array);
14366 
14367 	FREE_NULL_BITMAP(array_bitmap);
14368 
14369 	return rc;
14370 }
14371 
14372 static void _send_job_kill(job_record_t *job_ptr)
14373 {
14374 	kill_job_msg_t *kill_job = NULL;
14375 	agent_arg_t *agent_args = NULL;
14376 #ifdef HAVE_FRONT_END
14377 	front_end_record_t *front_end_ptr;
14378 #else
14379 	int i;
14380 	node_record_t *node_ptr;
14381 #endif
14382 
14383 	xassert(job_ptr);
14384 	xassert(job_ptr->details);
14385 
14386 	agent_args = xmalloc(sizeof(agent_arg_t));
14387 	agent_args->msg_type = REQUEST_TERMINATE_JOB;
14388 	agent_args->retry = 0;	/* re_kill_job() resends as needed */
14389 	agent_args->hostlist = hostlist_create(NULL);
14390 	kill_job = xmalloc(sizeof(kill_job_msg_t));
14391 	last_node_update    = time(NULL);
14392 	kill_job->job_gres_info	=
14393 		gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes);
14394 	kill_job->job_id    = job_ptr->job_id;
14395 	kill_job->het_job_id = job_ptr->het_job_id;
14396 	kill_job->step_id   = NO_VAL;
14397 	kill_job->job_state = job_ptr->job_state;
14398 	kill_job->job_uid   = job_ptr->user_id;
14399 	kill_job->job_gid   = job_ptr->group_id;
14400 	kill_job->nodes     = xstrdup(job_ptr->nodes);
14401 	kill_job->time      = time(NULL);
14402 	kill_job->start_time = job_ptr->start_time;
14403 	kill_job->select_jobinfo = select_g_select_jobinfo_copy(
14404 			job_ptr->select_jobinfo);
14405 	kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size,
14406 					    job_ptr->spank_job_env);
14407 	kill_job->spank_job_env_size = job_ptr->spank_job_env_size;
14408 
14409 #ifdef HAVE_FRONT_END
14410 	if (job_ptr->batch_host &&
14411 	    (front_end_ptr = job_ptr->front_end_ptr)) {
14412 		agent_args->protocol_version = front_end_ptr->protocol_version;
14413 		hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
14414 		agent_args->node_count++;
14415 	}
14416 #else
14417 	if (!job_ptr->node_bitmap_cg)
14418 		build_cg_bitmap(job_ptr);
14419 	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
14420 	for (i = 0, node_ptr = node_record_table_ptr;
14421 	     i < node_record_count; i++, node_ptr++) {
14422 		if (!bit_test(job_ptr->node_bitmap_cg, i))
14423 			continue;
14424 		if (agent_args->protocol_version > node_ptr->protocol_version)
14425 			agent_args->protocol_version =
14426 				node_ptr->protocol_version;
14427 		hostlist_push_host(agent_args->hostlist, node_ptr->name);
14428 		agent_args->node_count++;
14429 	}
14430 #endif
14431 	if (agent_args->node_count == 0) {
14432 		if (job_ptr->details->expanding_jobid == 0) {
14433 			error("%s: %pJ allocated no nodes to be killed on",
14434 			      __func__, job_ptr);
14435 		}
14436 		xfree(kill_job->nodes);
14437 		xfree(kill_job);
14438 		hostlist_destroy(agent_args->hostlist);
14439 		xfree(agent_args);
14440 		return;
14441 	}
14442 
14443 	agent_args->msg_args = kill_job;
14444 	agent_queue_request(agent_args);
14445 	return;
14446 }
14447 
14448 /* Record accounting information for a job immediately before changing size */
14449 extern void job_pre_resize_acctg(job_record_t *job_ptr)
14450 {
14451 	/* If we don't have a db_index yet, go ahead and start this job in
14452 	   the database now, since when running with slurmdbd the job may
14453 	   not have been started there yet. */
14454 
14455 	if ((!job_ptr->db_index || job_ptr->db_index == NO_VAL64)
14456 	    && !job_ptr->resize_time)
14457 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
14458 
14459 	job_ptr->job_state |= JOB_RESIZING;
14460 	/* NOTE: job_completion_logger() calls
14461 	 *	 acct_policy_remove_job_submit() */
14462 	job_completion_logger(job_ptr, false);
14463 
14464 	/* This doesn't happen in job_completion_logger, but gets
14465 	 * added back in with job_post_resize_acctg so remove it here. */
14466 	acct_policy_job_fini(job_ptr);
14467 
14468 	/* NOTE: The RESIZING flag must be cleared by
14469 	   job_post_resize_acctg() */
14470 }
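
/*
 * Illustrative call sequence for a resize (see the node-count update path
 * in _update_job() above):
 *
 *	job_pre_resize_acctg(job_ptr);   // log completion of the old size
 *	// ... remove nodes, e.g. via excise_node_from_job() ...
 *	job_post_resize_acctg(job_ptr);  // restart accounting at the new size
 */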
14471 
14472 /* Record accounting information for a job immediately after changing size */
14473 extern void job_post_resize_acctg(job_record_t *job_ptr)
14474 {
14475 	time_t org_submit = job_ptr->details->submit_time;
14476 
14477 	/*
14478 	 * NOTE: The RESIZING flag must have been set by job_pre_resize_acctg();
14479 	 * the assert is here to make sure we code it that way.
14480 	 */
14481 	xassert(IS_JOB_RESIZING(job_ptr));
14482 	acct_policy_add_job_submit(job_ptr);
14483 	/* job_set_alloc_tres() must be called before acct_policy_job_begin() */
14484 	job_set_alloc_tres(job_ptr, false);
14485 	acct_policy_job_begin(job_ptr);
14486 	job_claim_resv(job_ptr);
14487 
14488 	if (job_ptr->resize_time)
14489 		job_ptr->details->submit_time = job_ptr->resize_time;
14490 
14491 	job_ptr->resize_time = time(NULL);
14492 
14493 	/* FIXME: see if this can be changed to job_start_direct() */
14494 	jobacct_storage_g_job_start(acct_db_conn, job_ptr);
14495 
14496 	job_ptr->details->submit_time = org_submit;
14497 	job_ptr->job_state &= (~JOB_RESIZING);
14498 
14499 	/*
14500 	 * Reset end_time_exp, which was probably set to NO_VAL when the job
14501 	 * was ended for the resize.  With the priority/multifactor plugin,
14502 	 * if end_time_exp is NO_VAL the plugin will not run again for the
14503 	 * job.
14504 	 */
14505 	job_ptr->end_time_exp = job_ptr->end_time;
14506 
14507 	/*
14508 	 * If a job is resized, the core bitmap will differ in the step.
14509 	 * See rebuild_step_bitmaps(). The problem will go away when we have
14510 	 * per-node core bitmaps. For now just set a flag that the job was
14511 	 * resized while there were active job steps.
14512 	 */
14513 	if (job_ptr->step_list && (list_count(job_ptr->step_list) > 0))
14514 		job_ptr->bit_flags |= JOB_RESIZED;
14515 }
14516 
14517 static char *_build_step_id(char *buf, int buf_len, uint32_t step_id)
14518 {
14519 	if (step_id == SLURM_BATCH_SCRIPT)
14520 		snprintf(buf, buf_len, "StepId=Batch");
14521 	else
14522 		snprintf(buf, buf_len, "StepId=%u", step_id);
14523 	return buf;
14524 }
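
/*
 * Example: _build_step_id(buf, sizeof(buf), SLURM_BATCH_SCRIPT) yields
 * "StepId=Batch", while _build_step_id(buf, sizeof(buf), 7) yields
 * "StepId=7".
 */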
14525 
14526 /*
14527  * validate_jobs_on_node - validate that any jobs that should be on the node
14528  *	are actually running, if not clean up the job records and/or node
14529  *	records.
14530  * IN reg_msg - node registration message
14531  */
14532 extern void
14533 validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
14534 {
14535 	int i, node_inx, jobs_on_node;
14536 	node_record_t *node_ptr;
14537 	job_record_t *job_ptr;
14538 	step_record_t *step_ptr;
14539 	char step_str[64];
14540 	time_t now = time(NULL);
14541 
14542 	node_ptr = find_node_record(reg_msg->node_name);
14543 	if (node_ptr == NULL) {
14544 		error("slurmd registered on unknown node %s",
14545 			reg_msg->node_name);
14546 		return;
14547 	}
14548 
14549 	if (reg_msg->energy)
14550 		memcpy(node_ptr->energy, reg_msg->energy,
14551 		       sizeof(acct_gather_energy_t));
14552 
14553 	if (node_ptr->up_time > reg_msg->up_time) {
14554 		verbose("Node %s rebooted %u secs ago",
14555 			reg_msg->node_name, reg_msg->up_time);
14556 	}
14557 
14558 	if (reg_msg->up_time <= now) {
14559 		node_ptr->up_time = reg_msg->up_time;
14560 		node_ptr->boot_time = now - reg_msg->up_time;
14561 		node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
14562 	} else {
14563 		error("Node up_time is invalid: %u>%u", reg_msg->up_time,
14564 		      (uint32_t) now);
14565 	}
14566 
14567 	if (waiting_for_node_boot(node_ptr))
14568 		return;
14569 
14570 	node_inx = node_ptr - node_record_table_ptr;
14571 
14572 	/* Check that jobs running are really supposed to be there */
14573 	for (i = 0; i < reg_msg->job_count; i++) {
14574 		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
14575 		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
14576 			info("NoAllocate JobId=%u %s reported on node %s",
14577 			     reg_msg->job_id[i],
14578 			     _build_step_id(step_str, sizeof(step_str),
14579 					    reg_msg->step_id[i]),
14580 			     reg_msg->node_name);
14581 			continue;
14582 		}
14583 
14584 		job_ptr = find_job_record(reg_msg->job_id[i]);
14585 		if (job_ptr == NULL) {
14586 			error("Orphan JobId=%u %s reported on node %s",
14587 			      reg_msg->job_id[i],
14588 			      _build_step_id(step_str, sizeof(step_str),
14589 					     reg_msg->step_id[i]),
14590 			      reg_msg->node_name);
14591 			abort_job_on_node(reg_msg->job_id[i],
14592 					  job_ptr, node_ptr->name);
14593 		}
14594 
14595 		else if (IS_JOB_RUNNING(job_ptr) ||
14596 			 IS_JOB_SUSPENDED(job_ptr)) {
14597 			if (bit_test(job_ptr->node_bitmap, node_inx)) {
14598 				if ((job_ptr->batch_flag) &&
14599 				    (node_inx == bit_ffs(
14600 						job_ptr->node_bitmap))) {
14601 					/* NOTE: Used for purging defunct
14602 					 * batch jobs */
14603 					job_ptr->time_last_active = now;
14604 				}
14605 				step_ptr = find_step_record(job_ptr,
14606 							    reg_msg->
14607 							    step_id[i]);
14608 				if (step_ptr)
14609 					step_ptr->time_last_active = now;
14610 				debug3("Registered %pS on node %s",
14611 				       step_ptr, reg_msg->node_name);
14612 			} else {
14613 				/* Typically indicates a job requeue and
14614 				 * restart on other nodes. A node from the
14615 				 * original allocation just responded here. */
14616 				error("Registered %pJ %s on wrong node %s",
14617 				      job_ptr,
14618 				       _build_step_id(step_str,
14619 						      sizeof(step_str),
14620 						      reg_msg->step_id[i]),
14621 				      reg_msg->node_name);
14622 				info("%s: job nodes %s count %d inx %d",
14623 				     __func__, job_ptr->nodes,
14624 				     job_ptr->node_cnt, node_inx);
14625 				abort_job_on_node(reg_msg->job_id[i], job_ptr,
14626 						  node_ptr->name);
14627 			}
14628 		}
14629 
14630 		else if (IS_JOB_COMPLETING(job_ptr)) {
14631 			/*
14632 			 * Re-send kill request as needed,
14633 			 * not necessarily an error
14634 			 */
14635 			kill_job_on_node(job_ptr, node_ptr);
14636 		}
14637 
14638 
14639 		else if (IS_JOB_PENDING(job_ptr)) {
14640 			/* Typically indicates a job requeue and the hung
14641 			 * slurmd that went DOWN is now responding */
14642 			error("Registered PENDING %pJ %s on node %s",
14643 			      job_ptr,
14644 			      _build_step_id(step_str, sizeof(step_str),
14645 					     reg_msg->step_id[i]),
14646 			      reg_msg->node_name);
14647 			abort_job_on_node(reg_msg->job_id[i],
14648 					  job_ptr, node_ptr->name);
14649 		}
14650 
14651 		else if (difftime(now, job_ptr->end_time) <
14652 			 slurm_get_msg_timeout()) {	/* Race condition */
14653 			debug("Registered newly completed %pJ %s on %s",
14654 			      job_ptr,
14655 			      _build_step_id(step_str, sizeof(step_str),
14656 					     reg_msg->step_id[i]),
14657 			      node_ptr->name);
14658 		}
14659 
14660 		else {		/* else job is supposed to be done */
14661 			error("Registered %pJ %s in state %s on node %s",
14662 			      job_ptr,
14663 			      _build_step_id(step_str, sizeof(step_str),
14664 					     reg_msg->step_id[i]),
14665 			      job_state_string(job_ptr->job_state),
14666 			      reg_msg->node_name);
14667 			kill_job_on_node(job_ptr, node_ptr);
14668 		}
14669 	}
14670 
14671 	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
14672 	if (jobs_on_node)
14673 		_purge_missing_jobs(node_inx, now);
14674 
14675 	if (jobs_on_node != reg_msg->job_count) {
14676 		/* slurmd will not know of a job unless the job has
14677 		 * steps active at registration time, so this is not
14678 		 * an error condition; slurmd is also reporting steps
14679 		 * rather than jobs */
14680 		debug3("resetting job_count on node %s from %u to %d",
14681 			reg_msg->node_name, reg_msg->job_count, jobs_on_node);
14682 		reg_msg->job_count = jobs_on_node;
14683 	}
14684 
14685 	return;
14686 }
14687 
14688 /* Purge any batch job that should have its script running on node
14689  * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds
14690  * for startup.
14691  *
14692  * Purge all job steps that were started before the node was last booted.
14693  *
14694  * Also notify srun if any job steps should be active on this node
14695  * but are not found. */
14696 static void _purge_missing_jobs(int node_inx, time_t now)
14697 {
14698 	ListIterator job_iterator;
14699 	job_record_t *job_ptr;
14700 	node_record_t *node_ptr = node_record_table_ptr + node_inx;
14701 	uint16_t batch_start_timeout	= slurm_get_batch_start_timeout();
14702 	uint16_t msg_timeout		= slurm_get_msg_timeout();
14703 	uint16_t resume_timeout		= slurm_get_resume_timeout();
14704 	uint32_t suspend_time		= slurm_get_suspend_time();
14705 	time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time;
14706 
14707 	if (node_ptr->boot_time > (msg_timeout + 5)) {
14708 		/* allow for message timeout and other delays */
14709 		node_boot_time = node_ptr->boot_time - (msg_timeout + 5);
14710 	}
14711 	batch_startup_time  = now - batch_start_timeout;
14712 	batch_startup_time -= MIN(DEFAULT_MSG_TIMEOUT, msg_timeout);
14713 
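	/*
	 * Illustrative timing (actual values come from slurm.conf): with
	 * BatchStartTimeout=10 and MessageTimeout=10, batch_startup_time is
	 * roughly now - 20 seconds; when power saving is in use and the node
	 * rebooted, ResumeTimeout is subtracted as well before a missing
	 * batch script triggers job_complete() below.
	 */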
14714 	job_iterator = list_iterator_create(job_list);
14715 	while ((job_ptr = list_next(job_iterator))) {
14716 		if ((IS_JOB_CONFIGURING(job_ptr) ||
14717 		    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) ||
14718 		    (!bit_test(job_ptr->node_bitmap, node_inx)))
14719 			continue;
14720 		if ((job_ptr->batch_flag != 0)			&&
14721 		    (suspend_time != 0) /* power mgmt on */	&&
14722 		    (job_ptr->start_time < node_boot_time)) {
14723 			startup_time = batch_startup_time - resume_timeout;
14724 		} else
14725 			startup_time = batch_startup_time;
14726 
14727 		if ((job_ptr->batch_flag != 0)			&&
14728 		    (job_ptr->het_job_offset == 0)		&&
14729 		    (job_ptr->time_last_active < startup_time)	&&
14730 		    (job_ptr->start_time       < startup_time)	&&
14731 		    (node_ptr == find_node_record(job_ptr->batch_host))) {
14732 			bool requeue = false;
14733 			char *requeue_msg = "";
14734 			if (job_ptr->details && job_ptr->details->requeue) {
14735 				requeue = true;
14736 				requeue_msg = ", Requeuing job";
14737 			}
14738 			info("Batch %pJ missing from batch node %s (not found BatchStartTime after startup)%s",
14739 			     job_ptr, job_ptr->batch_host, requeue_msg);
14740 			job_ptr->exit_code = 1;
14741 			job_complete(job_ptr->job_id,
14742 				     slurmctld_conf.slurm_user_id,
14743 				     requeue, true, NO_VAL);
14744 		} else {
14745 			_notify_srun_missing_step(job_ptr, node_inx,
14746 						  now, node_boot_time);
14747 		}
14748 	}
14749 	list_iterator_destroy(job_iterator);
14750 }
14751 
14752 static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx,
14753 				      time_t now, time_t node_boot_time)
14754 {
14755 	ListIterator step_iterator;
14756 	step_record_t *step_ptr;
14757 	char *node_name = node_record_table_ptr[node_inx].name;
14758 
14759 	xassert(job_ptr);
14760 	step_iterator = list_iterator_create (job_ptr->step_list);
14761 	while ((step_ptr = list_next(step_iterator))) {
14762 		if ((step_ptr->step_id == SLURM_EXTERN_CONT) ||
14763 		    (step_ptr->step_id == SLURM_BATCH_SCRIPT) ||
14764 		    (step_ptr->state != JOB_RUNNING))
14765 			continue;
14766 		if (!bit_test(step_ptr->step_node_bitmap, node_inx))
14767 			continue;
14768 		if (step_ptr->time_last_active >= now) {
14769 			/* Back up timer in case more than one node
14770 			 * registration happens at this same time.
14771 			 * We don't want this node's registration
14772 			 * to count toward a different node's
14773 			 * registration message. */
14774 			step_ptr->time_last_active = now - 1;
14775 		} else if (step_ptr->host && step_ptr->port) {
14776 			/* srun may be able to verify step exists on
14777 			 * this node using I/O sockets and kill the
14778 			 * job as needed */
14779 			srun_step_missing(step_ptr, node_name);
14780 		} else if ((step_ptr->start_time < node_boot_time) &&
14781 			   (step_ptr->no_kill == 0)) {
14782 			/* There is a risk that the job step's tasks completed
14783 			 * on this node before its reboot, but that should be
14784 			 * very rare and there is no srun to work with (POE) */
14785 			info("Node %s rebooted, killing missing step %u.%u",
14786 			     node_name, job_ptr->job_id, step_ptr->step_id);
14787 			signal_step_tasks_on_node(node_name, step_ptr, SIGKILL,
14788 						  REQUEST_TERMINATE_TASKS);
14789 		}
14790 	}
14791 	list_iterator_destroy (step_iterator);
14792 }
14793 
14794 /*
14795  * abort_job_on_node - Kill the specific job_id on a specific node,
14796  *	the request is not processed immediately, but queued.
14797  *	This is to prevent a flood of pthreads if slurmctld restarts
14798  *	without saved state and slurmd daemons register with a
14799  *	multitude of running jobs. Slurmctld will not recognize
14800  *	these jobs and use this function to kill them - one
14801  *	agent request per node as they register.
14802  * IN job_id - id of the job to be killed
14803  * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. job reported
14804  *		by slurmd on some node, but job records already purged from
14805  *		slurmctld)
14806  * IN node_name - name of the node on which the job resides
14807  */
14808 extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr,
14809 			      char *node_name)
14810 {
14811 	agent_arg_t *agent_info;
14812 	kill_job_msg_t *kill_req;
14813 
14814 	kill_req = xmalloc(sizeof(kill_job_msg_t));
14815 	kill_req->job_id	= job_id;
14816 	kill_req->step_id	= NO_VAL;
14817 	kill_req->time          = time(NULL);
14818 	kill_req->nodes		= xstrdup(node_name);
14819 	if (job_ptr) {  /* NULL if unknown */
14820 		kill_req->job_gres_info	=
14821 			gres_plugin_epilog_build_env(job_ptr->gres_list,
14822 						     job_ptr->nodes);
14823 		kill_req->het_job_id	= job_ptr->het_job_id;
14824 		kill_req->start_time = job_ptr->start_time;
14825 		kill_req->select_jobinfo =
14826 			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
14827 		kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
14828 						    job_ptr->spank_job_env);
14829 		kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
14830 	} else {
14831 		/* kill_req->start_time = 0;  Default value */
14832 	}
14833 
14834 	agent_info = xmalloc(sizeof(agent_arg_t));
14835 	agent_info->node_count	= 1;
14836 	agent_info->retry	= 0;
14837 	agent_info->hostlist	= hostlist_create(node_name);
14838 #ifdef HAVE_FRONT_END
14839 	if (job_ptr && job_ptr->front_end_ptr)
14840 		agent_info->protocol_version =
14841 			job_ptr->front_end_ptr->protocol_version;
14842 	if (job_ptr) {
14843 		debug("Aborting %pJ on front end node %s", job_ptr, node_name);
14844 	} else {
14845 		debug("Aborting JobId=%u on front end node %s", job_id,
14846 		      node_name);
14847 	}
14848 #else
14849 	node_record_t *node_ptr;
14850 	if ((node_ptr = find_node_record(node_name)))
14851 		agent_info->protocol_version = node_ptr->protocol_version;
14852 	if (job_ptr)
14853 		debug("Aborting %pJ on node %s", job_ptr, node_name);
14854 	else
14855 		debug("Aborting JobId=%u on node %s", job_id, node_name);
14856 #endif
14857 	agent_info->msg_type	= REQUEST_ABORT_JOB;
14858 	agent_info->msg_args	= kill_req;
14859 
14860 	agent_queue_request(agent_info);
14861 }
14862 
14863 /*
14864  * abort_job_on_nodes - Kill the specific job on the specific nodes,
14865  *	the request is not processed immediately, but queued.
14866  *	This is to prevent a flood of pthreads if slurmctld restarts
14867  *	without saved state and slurmd daemons register with a
14868  *	multitude of running jobs. Slurmctld will not recognize
14869  *	these jobs and use this function to kill them - one
14870  *	agent request per node as they register.
14871  * IN job_ptr - pointer to terminating job
14872  * IN node_bitmap - bitmap of the nodes on which the job resides
14873  */
14874 extern void abort_job_on_nodes(job_record_t *job_ptr,
14875 			       bitstr_t *node_bitmap)
14876 {
14877 	bitstr_t *full_node_bitmap, *tmp_node_bitmap;
14878 	node_record_t *node_ptr;
14879 	int i, i_first, i_last;
14880 	agent_arg_t *agent_info;
14881 	kill_job_msg_t *kill_req;
14882 	uint16_t protocol_version;
14883 
14884 #ifdef HAVE_FRONT_END
14885 	fatal("%s: front-end mode not supported", __func__);
14886 #endif
14887 	xassert(node_bitmap);
14888 	/* Send a separate message for nodes at different protocol_versions */
14889 	full_node_bitmap = bit_copy(node_bitmap);
14890 	while ((i_first = bit_ffs(full_node_bitmap)) >= 0) {
14891 		i_last = bit_fls(full_node_bitmap);
14892 		node_ptr = node_record_table_ptr + i_first;
14893 		protocol_version = node_ptr->protocol_version;
14894 		tmp_node_bitmap = bit_alloc(bit_size(node_bitmap));
14895 		for (i = i_first; i <= i_last; i++) {
14896 			if (!bit_test(full_node_bitmap, i))
14897 				continue;
14898 			node_ptr = node_record_table_ptr + i;
14899 			if (node_ptr->protocol_version != protocol_version)
14900 				continue;
14901 			bit_clear(full_node_bitmap, i);
14902 			bit_set(tmp_node_bitmap, i);
14903 		}
14904 		kill_req = xmalloc(sizeof(kill_job_msg_t));
14905 		kill_req->job_gres_info	=
14906 			gres_plugin_epilog_build_env(job_ptr->gres_list,
14907 						     job_ptr->nodes);
14908 		kill_req->job_id	= job_ptr->job_id;
14909 		kill_req->step_id	= NO_VAL;
14910 		kill_req->time          = time(NULL);
14911 		kill_req->nodes		= bitmap2node_name(tmp_node_bitmap);
14912 		kill_req->het_job_id	= job_ptr->het_job_id;
14913 		kill_req->start_time	= job_ptr->start_time;
14914 		kill_req->select_jobinfo =
14915 			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
14916 		kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
14917 						    job_ptr->spank_job_env);
14918 		kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
14919 		agent_info = xmalloc(sizeof(agent_arg_t));
14920 		agent_info->node_count	= bit_set_count(tmp_node_bitmap);
14921 		agent_info->retry	= 1;
14922 		agent_info->hostlist	= hostlist_create(kill_req->nodes);
14923 		debug("Aborting %pJ on nodes %s", job_ptr, kill_req->nodes);
14924 		agent_info->msg_type	= REQUEST_ABORT_JOB;
14925 		agent_info->msg_args	= kill_req;
14926 		agent_info->protocol_version = protocol_version;
14927 		agent_queue_request(agent_info);
14928 		bit_free(tmp_node_bitmap);
14929 	}
14930 	bit_free(full_node_bitmap);
14931 }
14932 
14933 /*
14934  * kill_job_on_node - Kill the specific job on a specific node.
14935  * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
14936  * IN node_ptr - pointer to the node on which the job resides
14937  */
14938 extern void kill_job_on_node(job_record_t *job_ptr,
14939 			     node_record_t *node_ptr)
14940 {
14941 	agent_arg_t *agent_info;
14942 	kill_job_msg_t *kill_req;
14943 
14944 	kill_req = xmalloc(sizeof(kill_job_msg_t));
14945 	kill_req->job_gres_info	=
14946 		gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes);
14947 	kill_req->het_job_id	= job_ptr->het_job_id;
14948 	kill_req->job_id	= job_ptr->job_id;
14949 	kill_req->step_id	= NO_VAL;
14950 	kill_req->time          = time(NULL);
14951 	kill_req->start_time	= job_ptr->start_time;
14952 	kill_req->nodes		= xstrdup(node_ptr->name);
14953 	kill_req->select_jobinfo =
14954 			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
14955 	kill_req->job_state	= job_ptr->job_state;
14956 	kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
14957 					    job_ptr->spank_job_env);
14958 	kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
14959 
14960 	agent_info = xmalloc(sizeof(agent_arg_t));
14961 	agent_info->node_count	= 1;
14962 	agent_info->retry	= 0;
14963 #ifdef HAVE_FRONT_END
14964 	xassert(job_ptr->batch_host);
14965 	if (job_ptr->front_end_ptr)
14966 		agent_info->protocol_version =
14967 			job_ptr->front_end_ptr->protocol_version;
14968 	agent_info->hostlist	= hostlist_create(job_ptr->batch_host);
14969 	debug("Killing %pJ on front end node %s",
14970 	      job_ptr, job_ptr->batch_host);
14971 #else
14972 	agent_info->protocol_version = node_ptr->protocol_version;
14973 	agent_info->hostlist	= hostlist_create(node_ptr->name);
14974 	debug("Killing %pJ on node %s", job_ptr, node_ptr->name);
14975 #endif
14976 	agent_info->msg_type	= REQUEST_TERMINATE_JOB;
14977 	agent_info->msg_args	= kill_req;
14978 
14979 	agent_queue_request(agent_info);
14980 }
14981 
14982 /*
14983  * Return true if this job is complete (including all elements of a hetjob)
14984  */
14985 static bool _job_all_finished(job_record_t *job_ptr)
14986 {
14987 	job_record_t *het_job;
14988 	ListIterator iter;
14989 	bool finished = true;
14990 
14991 	if (!IS_JOB_FINISHED(job_ptr))
14992 		return false;
14993 
14994 	if (!job_ptr->het_job_list)
14995 		return true;
14996 
14997 	iter = list_iterator_create(job_ptr->het_job_list);
14998 	while ((het_job = list_next(iter))) {
14999 		if (!IS_JOB_FINISHED(het_job)) {
15000 			finished = false;
15001 			break;
15002 		}
15003 	}
15004 	list_iterator_destroy(iter);
15005 
15006 	return finished;
15007 }
15008 
15009 /*
15010  * job_alloc_info_ptr - get details about an existing job allocation
15011  * IN uid - uid of the user issuing the request
15012  * IN job_ptr - pointer to job record
15013  * NOTE: See job_alloc_info() if job pointer not known
15014  */
15015 extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr)
15016 {
15017 	uint8_t prolog = 0;
15018 
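	/*
	 * Summarizing the check below: with PrivateData=jobs configured,
	 * access is limited to the job's owner, operators, the account's
	 * coordinator, or (when MCS private data is enabled) callers whose
	 * MCS label matches the job's.
	 */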
15019 	if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
15020 	    (job_ptr->user_id != uid) && !validate_operator(uid) &&
15021 	    (((slurm_mcs_get_privatedata() == 0) &&
15022 	      !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
15023 					    job_ptr->account)) ||
15024 	     ((slurm_mcs_get_privatedata() == 1) &&
15025 	      (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0))))
15026 		return ESLURM_ACCESS_DENIED;
15027 	if (IS_JOB_PENDING(job_ptr))
15028 		return ESLURM_JOB_PENDING;
15029 	if (_job_all_finished(job_ptr))
15030 		return ESLURM_ALREADY_DONE;
15031 	if (job_ptr->details)
15032 		prolog = job_ptr->details->prolog_running;
15033 
15034 	if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
15035 	    (prolog == 0) && job_ptr->node_bitmap &&
15036 	    (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
15037 		last_job_update = time(NULL);
15038 		set_job_alias_list(job_ptr);
15039 	}
15040 
15041 	return SLURM_SUCCESS;
15042 }
15043 
15044 /*
15045  * job_alloc_info - get details about an existing job allocation
15046  * IN uid - uid of the user issuing the request
15047  * IN job_id - ID of job for which info is requested
15048  * OUT job_pptr - set to pointer to job record
15049  * NOTE: See job_alloc_info_ptr() if job pointer is known
15050  */
15051 extern int job_alloc_info(uint32_t uid, uint32_t job_id,
15052 			  job_record_t **job_pptr)
15053 {
15054 	job_record_t *job_ptr;
15055 
15056 	job_ptr = find_job_record(job_id);
15057 	if (job_ptr == NULL)
15058 		return ESLURM_INVALID_JOB_ID;
15059 	if (job_pptr)
15060 		*job_pptr = job_ptr;
15061 	return job_alloc_info_ptr(uid, job_ptr);
15062 }
15063 
15064 /*
15065  * Synchronize the batch job in the system with their files.
15066  * Synchronize the batch jobs in the system with their files.
15067  * All pending batch jobs must have script and environment files;
15068  * no other jobs should have such files.
15069 int sync_job_files(void)
15070 {
15071 	List batch_dirs;
15072 
15073 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
15074 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
15075 
15076 	if (!slurmctld_primary)	/* Don't purge files from backup slurmctld */
15077 		return SLURM_SUCCESS;
15078 
15079 	batch_dirs = list_create(xfree_ptr);
15080 	_get_batch_job_dir_ids(batch_dirs);
15081 	_validate_job_files(batch_dirs);
15082 	_remove_defunct_batch_dirs(batch_dirs);
15083 	FREE_NULL_LIST(batch_dirs);
15084 	return SLURM_SUCCESS;
15085 }
15086 
15087 /* Append to the batch_dirs list the job IDs associated with
15088  *	every batch job directory in existence
15089  */
15090 static void _get_batch_job_dir_ids(List batch_dirs)
15091 {
15092 	DIR *f_dir, *h_dir;
15093 	struct dirent *dir_ent, *hash_ent;
15094 	long long_job_id;
15095 	uint32_t *job_id_ptr;
15096 	char *endptr;
15097 
15098 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
15099 
15100 	xassert(slurmctld_conf.state_save_location);
15101 	f_dir = opendir(slurmctld_conf.state_save_location);
15102 	if (!f_dir) {
15103 		error("opendir(%s): %m",
15104 		      slurmctld_conf.state_save_location);
15105 		return;
15106 	}
15107 
15108 	while ((dir_ent = readdir(f_dir))) {
15109 		if (!xstrncmp("hash.#", dir_ent->d_name, 5)) {
15110 			char *h_path = NULL;
15111 			xstrfmtcat(h_path, "%s/%s",
15112 				   slurmctld_conf.state_save_location,
15113 				   dir_ent->d_name);
15114 			h_dir = opendir(h_path);
15115 			xfree(h_path);
15116 			if (!h_dir)
15117 				continue;
15118 			while ((hash_ent = readdir(h_dir))) {
15119 				if (xstrncmp("job.#", hash_ent->d_name, 4))
15120 					continue;
15121 				long_job_id = strtol(&hash_ent->d_name[4],
15122 						     &endptr, 10);
15123 				if ((long_job_id == 0) || (endptr[0] != '\0'))
15124 					continue;
15125 				debug3("Found batch directory for JobId=%ld",
15126 				      long_job_id);
15127 				job_id_ptr = xmalloc(sizeof(uint32_t));
15128 				*job_id_ptr = long_job_id;
15129 				list_append(batch_dirs, job_id_ptr);
15130 			}
15131 			closedir(h_dir);
15132 		}
15133 	}
15134 
15135 	closedir(f_dir);
15136 }
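
/*
 * For reference (hedged summary added for clarity): the layout scanned above
 * is expected to look roughly like
 *
 *	<StateSaveLocation>/hash.<N>/job.<job_id>/
 *
 * where <N> is a single digit derived from the job ID and each job.<job_id>
 * directory holds the batch script and environment files. The exact hashing
 * scheme is defined elsewhere in slurmctld.
 */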
15137 
15138 static int _clear_state_dir_flag(void *x, void *arg)
15139 {
15140 	job_record_t *job_ptr = (job_record_t *) x;
15141 	job_ptr->bit_flags &= ~HAS_STATE_DIR;
15142 	return 0;
15143 }
15144 
15145 static int _test_state_dir_flag(void *x, void *arg)
15146 {
15147 	job_record_t *job_ptr = (job_record_t *) x;
15148 
15149 	if (job_ptr->bit_flags & HAS_STATE_DIR) {
15150 		job_ptr->bit_flags &= ~HAS_STATE_DIR;
15151 		return 0;
15152 	}
15153 
15154 	if (!job_ptr->batch_flag || !IS_JOB_PENDING(job_ptr) ||
15155 	    (job_ptr->het_job_offset > 0))
15156 		return 0;	/* No files expected */
15157 
15158 	error("Script for %pJ lost, state set to FAILED", job_ptr);
15159 	job_ptr->job_state = JOB_FAILED;
15160 	job_ptr->exit_code = 1;
15161 	job_ptr->state_reason = FAIL_SYSTEM;
15162 	xfree(job_ptr->state_desc);
15163 	job_ptr->start_time = job_ptr->end_time = time(NULL);
15164 	job_completion_logger(job_ptr, false);
15165 	return 0;
15166 }
15167 
15168 /* All pending batch jobs must have a batch_dir entry,
15169  *	otherwise we flag the job as FAILED and don't schedule it.
15170  * If the batch_dir entry exists for a PENDING or RUNNING batch job,
15171  *	remove it from the list (of directories to be deleted) */
15172 static void _validate_job_files(List batch_dirs)
15173 {
15174 	job_record_t *job_ptr;
15175 	ListIterator batch_dir_iter;
15176 	uint32_t *job_id_ptr, array_job_id;
15177 
15178 	list_for_each(job_list, _clear_state_dir_flag, NULL);
15179 
15180 	batch_dir_iter = list_iterator_create(batch_dirs);
15181 	while ((job_id_ptr = list_next(batch_dir_iter))) {
15182 		job_ptr = find_job_record(*job_id_ptr);
15183 		if (job_ptr) {
15184 			job_ptr->bit_flags |= HAS_STATE_DIR;
15185 			list_delete_item(batch_dir_iter);
15186 		}
15187 		if (job_ptr && job_ptr->array_recs) { /* Update all tasks */
15188 			array_job_id = job_ptr->array_job_id;
15189 			job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)];
15190 			while (job_ptr) {
15191 				if (job_ptr->array_job_id == array_job_id)
15192 					job_ptr->bit_flags |= HAS_STATE_DIR;
15193 				job_ptr = job_ptr->job_array_next_j;
15194 			}
15195 		}
15196 	}
15197 	list_iterator_destroy(batch_dir_iter);
15198 
15199 	list_for_each(job_list, _test_state_dir_flag, NULL);
15200 }
15201 
15202 /* Remove all batch_dir entries in the list */
15203 static void _remove_defunct_batch_dirs(List batch_dirs)
15204 {
15205 	ListIterator batch_dir_inx;
15206 	uint32_t *job_id_ptr;
15207 
15208 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
15209 
15210 	batch_dir_inx = list_iterator_create(batch_dirs);
15211 	while ((job_id_ptr = list_next(batch_dir_inx))) {
15212 		info("Purged files for defunct batch JobId=%u",
15213 		     *job_id_ptr);
15214 		delete_job_desc_files(*job_id_ptr);
15215 	}
15216 	list_iterator_destroy(batch_dir_inx);
15217 }
15218 
15219 /*
15220  *  _xmit_new_end_time
15221  *	Tell all slurmd daemons associated with a job of its new end time
15222  * IN job_ptr - pointer to terminating job
15223  * globals: node_record_count - number of nodes in the system
15224  *	node_record_table_ptr - pointer to global node table
15225  */
15226 static void _xmit_new_end_time(job_record_t *job_ptr)
15227 {
15228 #ifndef HAVE_FRONT_END
15229 	int i;
15230 #endif
15231 	job_time_msg_t *job_time_msg_ptr;
15232 	agent_arg_t *agent_args;
15233 
15234 	agent_args = xmalloc(sizeof(agent_arg_t));
15235 	agent_args->msg_type = REQUEST_UPDATE_JOB_TIME;
15236 	agent_args->retry = 1;
15237 	agent_args->hostlist = hostlist_create(NULL);
15238 	job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t));
15239 	job_time_msg_ptr->job_id          = job_ptr->job_id;
15240 	job_time_msg_ptr->expiration_time = job_ptr->end_time;
15241 
15242 #ifdef HAVE_FRONT_END
15243 	xassert(job_ptr->batch_host);
15244 	if (job_ptr->front_end_ptr)
15245 		agent_args->protocol_version =
15246 			job_ptr->front_end_ptr->protocol_version;
15247 	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
15248 	agent_args->node_count  = 1;
15249 #else
15250 	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
15251 	for (i = 0; i < node_record_count; i++) {
15252 		if (bit_test(job_ptr->node_bitmap, i) == 0)
15253 			continue;
15254 		if (agent_args->protocol_version >
15255 		    node_record_table_ptr[i].protocol_version)
15256 			agent_args->protocol_version =
15257 				node_record_table_ptr[i].protocol_version;
15258 		hostlist_push_host(agent_args->hostlist,
15259 			      node_record_table_ptr[i].name);
15260 		agent_args->node_count++;
15261 	}
15262 #endif
15263 
15264 	agent_args->msg_args = job_time_msg_ptr;
15265 	agent_queue_request(agent_args);
15266 	return;
15267 }
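
/*
 * Note (added commentary): in the non-front-end path above, the agent's
 * protocol_version is lowered to the minimum version found among the job's
 * allocated nodes, so a single REQUEST_UPDATE_JOB_TIME message can be packed
 * in a format every targeted slurmd understands.
 */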
15268 
15269 /*
15270  * Return total amount of memory allocated to a job. This can be based upon
15271  * a GRES specification with various GRES/memory allocations on each node.
15272  * If current allocation information is not available, estimate memory based
15273  * upon pn_min_memory and either CPU or node count.
15274  */
15275 extern uint64_t job_get_tres_mem(struct job_resources *job_res,
15276 				 uint64_t pn_min_memory, uint32_t cpu_cnt,
15277 				 uint32_t node_cnt)
15278 {
15279 	uint64_t mem_total = 0;
15280 	int i;
15281 
15282 	if (job_res) {
15283 		for (i = 0; i < job_res->nhosts; i++) {
15284 			mem_total += job_res->memory_allocated[i];
15285 		}
15286 		return mem_total;
15287 	}
15288 
15289 	if (pn_min_memory == NO_VAL64)
15290 		return mem_total;
15291 
15292 	if (pn_min_memory & MEM_PER_CPU) {
15293 		if (cpu_cnt != NO_VAL) {
15294 			mem_total = pn_min_memory & (~MEM_PER_CPU);
15295 			mem_total *= cpu_cnt;
15296 		}
15297 	} else if (node_cnt != NO_VAL)
15298 		mem_total = pn_min_memory * node_cnt;
15299 
15300 	return mem_total;
15301 }
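
/*
 * Worked example (added for clarity, values are illustrative only): with no
 * job_resources available and pn_min_memory = (MEM_PER_CPU | 2048), i.e.
 * 2048 MB per CPU, a request with cpu_cnt = 16 yields
 *	mem_total = 2048 * 16 = 32768 MB.
 * Without the MEM_PER_CPU flag, pn_min_memory = 2048 and node_cnt = 4 yields
 *	mem_total = 2048 * 4 = 8192 MB.
 */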
15302 
15303 /*
15304  * job_epilog_complete - Note the completion of the epilog script for a
15305  *	given job
15306  * IN job_id      - id of the job for which the epilog was executed
15307  * IN node_name   - name of the node on which the epilog was executed
15308  * IN return_code - return code from epilog script
15309  * RET true if job is COMPLETED, otherwise false
15310  */
15311 extern bool job_epilog_complete(uint32_t job_id, char *node_name,
15312 				uint32_t return_code)
15313 {
15314 #ifdef HAVE_FRONT_END
15315 	int i;
15316 #endif
15317 	job_record_t *job_ptr = find_job_record(job_id);
15318 	node_record_t *node_ptr;
15319 
15320 	if (job_ptr == NULL) {
15321 		debug("%s: unable to find JobId=%u for node=%s with return_code=%u.",
15322 		      __func__, job_id, node_name, return_code);
15323 		return true;
15324 	}
15325 
15326 	trace_job(job_ptr, __func__, "enter");
15327 
15328 	/*
15329 	 * There is a potential race condition this handles.
15330 	 * If slurmctld cold-starts while slurmd keeps running, slurmd could
15331 	 * notify slurmctld of a job epilog completion before getting synced
15332 	 * up with slurmctld state. If a new job arrives and the job_id is
15333 	 * reused, we could try to note the termination of a job that hasn't
15334 	 * really started. Very rare obviously.
15335 	 */
15336 	if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) ||
15337 	    (job_ptr->node_bitmap == NULL)) {
15338 #ifndef HAVE_FRONT_END
15339 		uint32_t base_state = NODE_STATE_UNKNOWN;
15340 		node_ptr = find_node_record(node_name);
15341 		if (node_ptr)
15342 			base_state = node_ptr->node_state & NODE_STATE_BASE;
15343 		if (base_state == NODE_STATE_DOWN) {
15344 			debug("%s: %pJ complete response from DOWN node %s",
15345 			      __func__, job_ptr, node_name);
15346 		} else if (job_ptr->restart_cnt) {
15347 			/*
15348 			 * Duplicate epilog complete can be due to race
15349 			 */
15350 			debug("%s: %pJ duplicate epilog complete response",
15351 			      __func__, job_ptr);
15352 		} else {
15353 			error("%s: %pJ is non-running slurmctld and slurmd out of sync",
15354 			      __func__, job_ptr);
15355 		}
15356 #endif
15357 		return false;
15358 	}
15359 
15360 #ifdef HAVE_FRONT_END
15361 	xassert(job_ptr->batch_host);
15362 	/*
15363 	 * If there is a bad epilog error don't down the frontend node.
15364 	 * If needed the nodes in use by the job will be downed below.
15365 	 */
15366 	if (return_code)
15367 		error("%s: %pJ epilog error on %s",
15368 		      __func__, job_ptr, job_ptr->batch_host);
15369 
15370 	if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) {
15371 		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
15372 		if (front_end_ptr->job_cnt_comp)
15373 			front_end_ptr->job_cnt_comp--;
15374 		else {
15375 			error("%s: %pJ job_cnt_comp underflow on front end %s",
15376 			      __func__, job_ptr, front_end_ptr->name);
15377 		}
15378 		if (front_end_ptr->job_cnt_comp == 0)
15379 			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
15380 	}
15381 
15382 	if ((job_ptr->total_nodes == 0) && IS_JOB_COMPLETING(job_ptr)) {
15383 		/*
15384 		 * Job resources moved into another job and
15385 		 * tasks already killed
15386 		 */
15387 		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
15388 		if (front_end_ptr)
15389 			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
15390 	} else {
15391 		for (i = 0; i < node_record_count; i++) {
15392 			if (!bit_test(job_ptr->node_bitmap, i))
15393 				continue;
15394 			node_ptr = &node_record_table_ptr[i];
15395 			if (return_code) {
15396 				drain_nodes(node_ptr->name, "Epilog error",
15397 					    slurmctld_conf.slurm_user_id);
15398 			}
15399 			/* Change job from completing to completed */
15400 			make_node_idle(node_ptr, job_ptr);
15401 		}
15402 	}
15403 #else
15404 	if (return_code) {
15405 		error("%s: %pJ epilog error on %s, draining the node",
15406 		      __func__, job_ptr, node_name);
15407 		drain_nodes(node_name, "Epilog error",
15408 			    slurmctld_conf.slurm_user_id);
15409 	}
15410 	/* Change job from completing to completed */
15411 	node_ptr = find_node_record(node_name);
15412 	if (node_ptr)
15413 		make_node_idle(node_ptr, job_ptr);
15414 #endif
15415 
15416 	step_epilog_complete(job_ptr, node_name);
15417 	/* nodes_completing is out of date, rebuild when next saved */
15418 	xfree(job_ptr->nodes_completing);
15419 	if (!IS_JOB_COMPLETING(job_ptr)) {	/* COMPLETED */
15420 		batch_requeue_fini(job_ptr);
15421 		return true;
15422 	} else
15423 		return false;
15424 }
15425 
15426 /* Complete a batch job requeue logic after all steps complete so that
15427  * subsequent jobs appear in a separate accounting record. */
15428 void batch_requeue_fini(job_record_t *job_ptr)
15429 {
15430 	if (IS_JOB_COMPLETING(job_ptr) ||
15431 	    !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag)
15432 		return;
15433 
15434 	info("Requeuing %pJ", job_ptr);
15435 
15436 	/* Clear everything so this appears to be a new job and then restart
15437 	 * it in accounting. */
15438 	job_ptr->start_time = 0;
15439 	job_ptr->end_time_exp = job_ptr->end_time = 0;
15440 	job_ptr->total_cpus = 0;
15441 	job_ptr->pre_sus_time = 0;
15442 	job_ptr->preempt_time = 0;
15443 	job_ptr->suspend_time = 0;
15444 	job_ptr->tot_sus_time = 0;
15445 	/* Current code (<= 2.1) has it so we start the new job with the next
15446 	 * step id.  This could be used when restarting to figure out which
15447 	 * step the previous run of this job stopped on. */
15448 	//job_ptr->next_step_id = 0;
15449 
15450 	job_ptr->node_cnt = 0;
15451 	xfree(job_ptr->nodes);
15452 	xfree(job_ptr->nodes_completing);
15453 	FREE_NULL_BITMAP(job_ptr->node_bitmap);
15454 	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
15455 
15456 	job_resv_clear_promiscous_flag(job_ptr);
15457 
15458 	if (job_ptr->details) {
15459 		time_t now = time(NULL);
15460 		/* The time stamp on the new batch launch credential must be
15461 		 * larger than the time stamp on the revoke request. Also the
15462 		 * I/O must be all cleared out, the named socket purged and
15463 		 * the job credential purged by slurmd. */
15464 		if (job_ptr->details->begin_time <= now) {
15465 			/* See src/common/slurm_cred.c
15466 			 * #define DEFAULT_EXPIRATION_WINDOW 1200 */
15467 			int cred_lifetime = 1200;
15468 			(void) slurm_cred_ctx_get(slurmctld_config.cred_ctx,
15469 						  SLURM_CRED_OPT_EXPIRY_WINDOW,
15470 						  &cred_lifetime);
15471 			job_ptr->details->begin_time = now + cred_lifetime + 1;
15472 		}
15473 
15474 		/* Since this could happen on a launch, make sure the new
15475 		 * submit time differs from the previous one (use now + 1) so
15476 		 * we get distinct records in the database */
15477 		if (now == job_ptr->details->submit_time)
15478 			now++;
15479 		job_ptr->details->submit_time = now;
15480 
15481 		/* clear the accrue flag */
15482 		job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
15483 		job_ptr->details->accrue_time = 0;
15484 
15485 		if ((job_ptr->details->whole_node == 1) && job_ptr->gres_list) {
15486 			/*
15487 			 * We need to reset the gres_list to what was requested
15488 			 * instead of what was given exclusively.
15489 			 */
15490 			FREE_NULL_LIST(job_ptr->gres_list);
15491 			(void)gres_plugin_job_state_validate(
15492 				job_ptr->cpus_per_tres,
15493 				job_ptr->tres_freq,
15494 				job_ptr->tres_per_job,
15495 				job_ptr->tres_per_node,
15496 				job_ptr->tres_per_socket,
15497 				job_ptr->tres_per_task,
15498 				job_ptr->mem_per_tres,
15499 				&job_ptr->details->num_tasks,
15500 				&job_ptr->details->min_nodes,
15501 				&job_ptr->details->max_nodes,
15502 				&job_ptr->details->ntasks_per_node,
15503 				&job_ptr->details->mc_ptr->ntasks_per_socket,
15504 				&job_ptr->details->mc_ptr->sockets_per_node,
15505 				&job_ptr->details->cpus_per_task,
15506 				&job_ptr->gres_list);
15507 		}
15508 	}
15509 
15510 	/*
15511 	 * If a reservation ended and was a repeated (e.g., daily, weekly)
15512 	 * reservation, its ID will be different; make sure
15513 	 * job->resv_id matches the reservation id.
15514 	 */
15515 	if (job_ptr->resv_ptr)
15516 		job_ptr->resv_id = job_ptr->resv_ptr->resv_id;
15517 
15518 	/* Reset this after the batch step has finished or the batch step
15519 	 * information will be attributed to the next run of the job. */
15520 	job_ptr->db_index = 0;
15521 	if (!with_slurmdbd)
15522 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
15523 
15524 	/* Submit new sibling jobs for fed jobs */
15525 	if (fed_mgr_is_origin_job(job_ptr)) {
15526 		if (fed_mgr_job_requeue(job_ptr)) {
15527 			error("failed to submit requeued sibling jobs for fed %pJ",
15528 			      job_ptr);
15529 		}
15530 	}
15531 }
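
/*
 * Worked example (added for clarity): with the default credential expiration
 * window of 1200 seconds, a job requeued at time "now" whose begin_time has
 * already passed gets begin_time = now + 1201, guaranteeing the new batch
 * launch credential postdates the revoke request for the old one.
 */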
15532 
15533 
15534 /* job_fini - free all memory associated with job records */
15535 void job_fini (void)
15536 {
15537 	FREE_NULL_LIST(job_list);
15538 	xfree(job_hash);
15539 	xfree(job_array_hash_j);
15540 	xfree(job_array_hash_t);
15541 	FREE_NULL_LIST(purge_files_list);
15542 	FREE_NULL_BITMAP(requeue_exit);
15543 	FREE_NULL_BITMAP(requeue_exit_hold);
15544 }
15545 
15546 /* Record the start of one job array task */
15547 extern void job_array_start(job_record_t *job_ptr)
15548 {
15549 	job_record_t *base_job_ptr;
15550 
15551 	if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15552 		base_job_ptr = find_job_record(job_ptr->array_job_id);
15553 		if (base_job_ptr && base_job_ptr->array_recs) {
15554 			base_job_ptr->array_recs->tot_run_tasks++;
15555 		}
15556 	}
15557 }
15558 
15559 /* Return true if a job array task can be started */
15560 extern bool job_array_start_test(job_record_t *job_ptr)
15561 {
15562 	job_record_t *base_job_ptr;
15563 	time_t now = time(NULL);
15564 
15565 	if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15566 		base_job_ptr = find_job_record(job_ptr->array_job_id);
15567 		if (base_job_ptr && base_job_ptr->array_recs &&
15568 		    (base_job_ptr->array_recs->max_run_tasks != 0) &&
15569 		    (base_job_ptr->array_recs->tot_run_tasks >=
15570 		     base_job_ptr->array_recs->max_run_tasks)) {
15571 			if (job_ptr->details &&
15572 			    (job_ptr->details->begin_time <= now))
15573 				job_ptr->details->begin_time = (time_t) 0;
15574 			xfree(job_ptr->state_desc);
15575 			job_ptr->state_reason = WAIT_ARRAY_TASK_LIMIT;
15576 			return false;
15577 		}
15578 	}
15579 
15580 	return true;
15581 }
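
/*
 * Example (added for clarity): for an array submitted with a task throttle,
 * e.g. "sbatch --array=1-100%5 ...", max_run_tasks is 5. Once five tasks are
 * running this test fails for additional tasks, which are held with the
 * WAIT_ARRAY_TASK_LIMIT reason until a running task completes.
 */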
15582 
15583 static void _job_array_comp(job_record_t *job_ptr, bool was_running,
15584 			    bool requeue)
15585 {
15586 	job_record_t *base_job_ptr;
15587 	uint32_t status;
15588 
15589 	if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15590 		status = job_ptr->exit_code;
15591 		if ((status == 0) && !IS_JOB_COMPLETE(job_ptr)) {
15592 			/* Avoid max_exit_code == 0 if task did not run to
15593 			 * successful completion (e.g. Cancelled, NodeFail) */
15594 			status = 9;
15595 		}
15596 		base_job_ptr = find_job_record(job_ptr->array_job_id);
15597 		if (base_job_ptr && base_job_ptr->array_recs) {
15598 			if (requeue) {
15599 				base_job_ptr->array_recs->array_flags |=
15600 					ARRAY_TASK_REQUEUED;
15601 			} else if (!base_job_ptr->array_recs->tot_comp_tasks) {
15602 				base_job_ptr->array_recs->min_exit_code =
15603 					status;
15604 				base_job_ptr->array_recs->max_exit_code =
15605 					status;
15606 			} else {
15607 				base_job_ptr->array_recs->min_exit_code =
15608 					MIN(status, base_job_ptr->
15609 					    array_recs->min_exit_code);
15610 				base_job_ptr->array_recs->max_exit_code =
15611 					MAX(status, base_job_ptr->
15612 					    array_recs->max_exit_code);
15613 			}
15614 			if (was_running &&
15615 			    base_job_ptr->array_recs->tot_run_tasks)
15616 				base_job_ptr->array_recs->tot_run_tasks--;
15617 			base_job_ptr->array_recs->tot_comp_tasks++;
15618 		}
15619 	}
15620 }
15621 
15622 /* log the completion of the specified job */
15623 extern void job_completion_logger(job_record_t *job_ptr, bool requeue)
15624 {
15625 	int base_state;
15626 	bool arr_finished = false, task_failed = false, task_requeued = false;
15627 	bool was_running = false;
15628 	job_record_t *master_job = NULL;
15629 	uint32_t max_exit_code = 0;
15630 
15631 	xassert(job_ptr);
15632 
15633 	acct_policy_remove_job_submit(job_ptr);
15634 	if (job_ptr->nodes && ((job_ptr->bit_flags & JOB_KILL_HURRY) == 0)
15635 	    && !IS_JOB_RESIZING(job_ptr)) {
15636 		(void) bb_g_job_start_stage_out(job_ptr);
15637 	} else if (job_ptr->nodes && IS_JOB_RESIZING(job_ptr)){
15638 		debug("%s: %pJ resizing, skipping bb stage_out",
15639 		      __func__, job_ptr);
15640 	} else {
15641 		/*
15642 		 * Never allocated compute nodes.
15643 		 * Unless job ran, there is no data to stage-out
15644 		 */
15645 		(void) bb_g_job_cancel(job_ptr);
15646 	}
15647 	if (job_ptr->bit_flags & JOB_WAS_RUNNING) {
15648 		job_ptr->bit_flags &= ~JOB_WAS_RUNNING;
15649 		was_running = true;
15650 	}
15651 
15652 	_job_array_comp(job_ptr, was_running, requeue);
15653 
15654 	if (!IS_JOB_RESIZING(job_ptr) &&
15655 	    !IS_JOB_PENDING(job_ptr)  &&
15656 	    !IS_JOB_REVOKED(job_ptr)  &&
15657 	    ((job_ptr->array_task_id == NO_VAL) ||
15658 	     (job_ptr->mail_type & MAIL_ARRAY_TASKS) ||
15659 	     (arr_finished = test_job_array_finished(job_ptr->array_job_id)))) {
15660 		/* Remove configuring state just to make sure it isn't there
15661 		 * since it will throw off displays of the job. */
15662 		job_ptr->job_state &= ~JOB_CONFIGURING;
15663 
15664 		/* make sure all parts of the job are notified
15665 		 * Fed Jobs: only signal the srun from where the job is running
15666 		 * or from the origin if the job wasn't running. */
15667 		if (!job_ptr->fed_details ||
15668 		    fed_mgr_job_is_self_owned(job_ptr) ||
15669 		    (fed_mgr_is_origin_job(job_ptr) &&
15670 		     !fed_mgr_job_is_locked(job_ptr)))
15671 			srun_job_complete(job_ptr);
15672 
15673 		/* mail out notifications of completion */
15674 		if (arr_finished) {
15675 			/* We need to summarize different tasks states. */
15676 			master_job = find_job_record(job_ptr->array_job_id);
15677 			if (master_job && master_job->array_recs) {
15678 				task_requeued =
15679 					(master_job->array_recs->array_flags &
15680 					 ARRAY_TASK_REQUEUED);
15681 				if (task_requeued &&
15682 				    (job_ptr->mail_type & MAIL_JOB_REQUEUE)) {
15683 					/*
15684 					 * At least 1 task requeued and job
15685 					 * req. to be notified on requeues.
15686 					 */
15687 					mail_job_info(master_job,
15688 						      MAIL_JOB_REQUEUE);
15689 				}
15690 
15691 				max_exit_code =
15692 					master_job->array_recs->max_exit_code;
15693 				task_failed = (WIFEXITED(max_exit_code) &&
15694 					       WEXITSTATUS(max_exit_code));
15695 				if (task_failed &&
15696 				    (job_ptr->mail_type & MAIL_JOB_FAIL)) {
15697 					/*
15698 					 * At least 1 task failed and job
15699 					 * req. to be notified on failures.
15700 					 */
15701 					mail_job_info(master_job,
15702 						      MAIL_JOB_FAIL);
15703 				} else if (job_ptr->mail_type & MAIL_JOB_END) {
15704 					/*
15705 					 * Job req. to be notified on END.
15706 					 */
15707 					mail_job_info(job_ptr, MAIL_JOB_END);
15708 				}
15709 			}
15710 		} else {
15711 			base_state = job_ptr->job_state & JOB_STATE_BASE;
15712 			if ((base_state == JOB_COMPLETE) ||
15713 			    (base_state == JOB_CANCELLED)) {
15714 				if (requeue &&
15715 				    (job_ptr->mail_type & MAIL_JOB_REQUEUE)) {
15716 					mail_job_info(job_ptr,
15717 						      MAIL_JOB_REQUEUE);
15718 				} else if (job_ptr->mail_type & MAIL_JOB_END) {
15719 					mail_job_info(job_ptr, MAIL_JOB_END);
15720 				}
15721 			} else {	/* JOB_FAILED, JOB_TIMEOUT, etc. */
15722 				if (job_ptr->mail_type & MAIL_JOB_FAIL)
15723 					mail_job_info(job_ptr, MAIL_JOB_FAIL);
15724 				else if (job_ptr->mail_type & MAIL_JOB_END)
15725 					mail_job_info(job_ptr, MAIL_JOB_END);
15726 			}
15727 		}
15728 	}
15729 
15730 	g_slurm_jobcomp_write(job_ptr);
15731 
15732 	/* When starting the resized job everything is taken care of
15733 	 * elsewhere, so don't call it here. */
15734 	if (IS_JOB_RESIZING(job_ptr))
15735 		return;
15736 
15737 	if (!with_slurmdbd && !job_ptr->db_index)
15738 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
15739 
15740 	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
15741 	    job_ptr->tres_alloc_cnt &&
15742 	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
15743 		set_job_tres_alloc_str(job_ptr, false);
15744 
15745 	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
15746 }
15747 
15748 /*
15749  * job_independent - determine if this job has a dependent job pending
15750  *	or if the job's scheduled begin time is in the future
15751  * IN job_ptr - pointer to job being tested
15752  * RET - true if job no longer must be deferred for another job
15753  */
15754 extern bool job_independent(job_record_t *job_ptr)
15755 {
15756 	struct job_details *detail_ptr = job_ptr->details;
15757 	time_t now = time(NULL);
15758 	int depend_rc;
15759 
15760 	if ((job_ptr->state_reason == FAIL_BURST_BUFFER_OP) ||
15761 	    (job_ptr->state_reason == FAIL_ACCOUNT) ||
15762 	    (job_ptr->state_reason == FAIL_QOS) ||
15763 	    (job_ptr->state_reason == WAIT_HELD) ||
15764 	    (job_ptr->state_reason == WAIT_HELD_USER) ||
15765 	    (job_ptr->state_reason == WAIT_MAX_REQUEUE) ||
15766 	    (job_ptr->state_reason == WAIT_RESV_DELETED) ||
15767 	    (job_ptr->state_reason == WAIT_DEP_INVALID))
15768 		return false;
15769 
15770 	/* Test dependencies first so we can cancel jobs before dependent
15771 	 * job records get purged (e.g. afterok, afternotok) */
15772 	depend_rc = test_job_dependency(job_ptr, NULL);
15773 	if ((depend_rc == LOCAL_DEPEND) || (depend_rc == REMOTE_DEPEND)) {
15774 		/* start_time has passed but still has dependency which
15775 		 * makes it ineligible */
15776 		if (detail_ptr->begin_time < now)
15777 			detail_ptr->begin_time = 0;
15778 		job_ptr->state_reason = WAIT_DEPENDENCY;
15779 		xfree(job_ptr->state_desc);
15780 		return false;
15781 	} else if (depend_rc == FAIL_DEPEND) {
15782 		handle_invalid_dependency(job_ptr);
15783 		return false;
15784 	}
15785 	/* Job is eligible to start now */
15786 	if (job_ptr->state_reason == WAIT_DEPENDENCY) {
15787 		job_ptr->state_reason = WAIT_NO_REASON;
15788 		xfree(job_ptr->state_desc);
15789 		/* Submit the job to its siblings. */
15790 		if (job_ptr->details) {
15791 			fed_mgr_job_requeue(job_ptr);
15792 		}
15793 	}
15794 
15795 	/* Check for maximum number of running tasks in a job array */
15796 	if (!job_array_start_test(job_ptr))
15797 		return false;
15798 
15799 	if (detail_ptr && (detail_ptr->begin_time > now)) {
15800 		job_ptr->state_reason = WAIT_TIME;
15801 		xfree(job_ptr->state_desc);
15802 		return false;	/* not yet time */
15803 	}
15804 
15805 	if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) {
15806 		job_ptr->state_reason = WAIT_RESERVATION;
15807 		xfree(job_ptr->state_desc);
15808 		return false;	/* not yet time */
15809 	}
15810 
15811 	if ((detail_ptr && (detail_ptr->begin_time == 0) &&
15812 	    (job_ptr->priority != 0))) {
15813 		detail_ptr->begin_time = now;
15814 		/*
15815 		 * If the job is already in the database, send the begin time
15816 		 * now; otherwise it won't get there until the job starts.
15817 		 */
15818 		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
15819 	} else if (job_ptr->state_reason == WAIT_TIME) {
15820 		job_ptr->state_reason = WAIT_NO_REASON;
15821 		xfree(job_ptr->state_desc);
15822 	}
15823 	return true;
15824 }
15825 
15826 /*
15827  * determine if job is ready to execute per the node select plugin
15828  * IN job_id - job to test
15829  * OUT ready - 1 if job is ready to execute, 0 otherwise
15830  * RET Slurm error code
15831  */
15832 extern int job_node_ready(uint32_t job_id, int *ready)
15833 {
15834 	int rc;
15835 	job_record_t *job_ptr;
15836 	xassert(ready);
15837 
15838 	*ready = 0;
15839 	job_ptr = find_job_record(job_id);
15840 	if (job_ptr == NULL)
15841 		return ESLURM_INVALID_JOB_ID;
15842 
15843 	/* Always call select_g_job_ready() so that select/bluegene can
15844 	 * test and update block state information. */
15845 	rc = select_g_job_ready(job_ptr);
15846 	if (rc == READY_JOB_FATAL)
15847 		return ESLURM_INVALID_PARTITION_NAME;
15848 	if (rc == READY_JOB_ERROR)
15849 		return EAGAIN;
15850 	if (rc)
15851 		rc = READY_NODE_STATE;
15852 
15853 	if (job_ptr->details && job_ptr->details->prolog_running)
15854 		rc &= (~READY_NODE_STATE);
15855 
15856 	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
15857 		rc |= READY_JOB_STATE;
15858 	if ((rc == (READY_NODE_STATE | READY_JOB_STATE)) &&
15859 	    job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
15860 	    job_ptr->node_bitmap &&
15861 	    (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
15862 		last_job_update = time(NULL);
15863 		set_job_alias_list(job_ptr);
15864 	}
15865 
15866 	*ready = rc;
15867 	return SLURM_SUCCESS;
15868 }
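
/*
 * Note (added commentary): *ready is a bitmask rather than a boolean.
 * READY_NODE_STATE is set when the allocated nodes are usable (and cleared
 * while a prolog is still running); READY_JOB_STATE is set when the job is
 * running or suspended. Clients such as srun and "scontrol wait_job" poll
 * this value to decide when a job can actually launch tasks.
 */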
15869 
15870 /* Send specified signal to all steps associated with a job */
15871 static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags)
15872 {
15873 #ifndef HAVE_FRONT_END
15874 	int i;
15875 #endif
15876 	agent_arg_t *agent_args = NULL;
15877 	signal_tasks_msg_t *signal_job_msg = NULL;
15878 	static int notify_srun_static = -1;
15879 	int notify_srun = 0;
15880 
15881 	if (notify_srun_static == -1) {
15882 		/* do this for all but slurm (poe, aprun, etc...) */
15883 		if (xstrcmp(slurmctld_conf.launch_type, "launch/slurm"))
15884 			notify_srun_static = 1;
15885 		else
15886 			notify_srun_static = 0;
15887 	}
15888 
15889 #ifdef HAVE_FRONT_END
15890 	/* On a front end system always notify_srun instead of slurmd */
15891 	if (notify_srun_static)
15892 		notify_srun = 1;
15893 #else
15894 	/* For launch/poe all signals are forwarded by srun to poe to tasks
15895 	 * except SIGSTOP/SIGCONT, which are used for job preemption. In that
15896 	 * case the slurmd must directly suspend tasks and switch resources. */
15897 	if (notify_srun_static && (signal != SIGSTOP) && (signal != SIGCONT))
15898 		notify_srun = 1;
15899 #endif
15900 
15901 	if (notify_srun) {
15902 		ListIterator step_iterator;
15903 		step_record_t *step_ptr;
15904 		step_iterator = list_iterator_create(job_ptr->step_list);
15905 		while ((step_ptr = list_next(step_iterator))) {
15906 			/* Since we have already checked the uid,
15907 			 * we can send this signal as uid 0. */
15908 			job_step_signal(job_ptr->job_id, step_ptr->step_id,
15909 					signal, 0, 0);
15910 		}
15911 		list_iterator_destroy (step_iterator);
15912 
15913 		return;
15914 	}
15915 
15916 	agent_args = xmalloc(sizeof(agent_arg_t));
15917 	agent_args->msg_type = REQUEST_SIGNAL_TASKS;
15918 	agent_args->retry = 1;
15919 	agent_args->hostlist = hostlist_create(NULL);
15920 	signal_job_msg = xmalloc(sizeof(signal_tasks_msg_t));
15921 	signal_job_msg->job_id = job_ptr->job_id;
15922 
15923 	/*
15924 	 * We don't ever want to kill a step with this message.  The flags below
15925 	 * will make sure that does not happen.  Just in case though, set the
15926 	 * step_id to an impossible number.
15927 	 */
15928 	signal_job_msg->job_step_id = slurmctld_conf.max_step_cnt + 1;
15929 
15930 	/*
15931 	 * Encode the flags so the slurmstepd knows which steps get signaled.
15932 	 * If we aren't explicitly signaling the full job or the batch step,
15933 	 * we default to signaling only the other steps.
15934 	 */
15935 	if ((flags & KILL_FULL_JOB) ||
15936 	    (flags & KILL_JOB_BATCH) ||
15937 	    (flags & KILL_STEPS_ONLY))
15938 		signal_job_msg->flags = flags;
15939 	else
15940 		signal_job_msg->flags = KILL_STEPS_ONLY;
15941 
15942 	signal_job_msg->signal = signal;
15943 
15944 #ifdef HAVE_FRONT_END
15945 	xassert(job_ptr->batch_host);
15946 	if (job_ptr->front_end_ptr)
15947 		agent_args->protocol_version =
15948 			job_ptr->front_end_ptr->protocol_version;
15949 	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
15950 	agent_args->node_count = 1;
15951 #else
15952 	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
15953 	for (i = 0; i < node_record_count; i++) {
15954 		if (bit_test(job_ptr->node_bitmap, i) == 0)
15955 			continue;
15956 		if (agent_args->protocol_version >
15957 		    node_record_table_ptr[i].protocol_version)
15958 			agent_args->protocol_version =
15959 				node_record_table_ptr[i].protocol_version;
15960 		hostlist_push_host(agent_args->hostlist,
15961 			      node_record_table_ptr[i].name);
15962 		agent_args->node_count++;
15963 	}
15964 #endif
15965 
15966 	if (agent_args->node_count == 0) {
15967 		xfree(signal_job_msg);
15968 		xfree(agent_args);
15969 		return;
15970 	}
15971 
15972 	agent_args->msg_args = signal_job_msg;
15973 	agent_queue_request(agent_args);
15974 	return;
15975 }
15976 
15977 static void *_switch_suspend_info(job_record_t *job_ptr)
15978 {
15979 	ListIterator step_iterator;
15980 	step_record_t *step_ptr;
15981 	void *switch_suspend_info = NULL;
15982 
15983 	step_iterator = list_iterator_create (job_ptr->step_list);
15984 	while ((step_ptr = list_next(step_iterator))) {
15985 		if (step_ptr->state != JOB_RUNNING)
15986 			continue;
15987 		switch_g_job_suspend_info_get(step_ptr->switch_job,
15988 					      &switch_suspend_info);
15989 	}
15990 	list_iterator_destroy (step_iterator);
15991 
15992 	return switch_suspend_info;
15993 }
15994 
15995 /* Send suspend request to the slurmd of all nodes associated with a job
15996  * job_ptr IN - job to be suspended or resumed
15997  * op IN - SUSPEND_JOB or RESUME_JOB
15998  * indf_susp IN - set if job is being suspended indefinitely by user
15999  *                or admin, otherwise suspended for gang scheduling
16000  */
16001 static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp)
16002 {
16003 #ifndef HAVE_FRONT_END
16004 	int i;
16005 #endif
16006 	agent_arg_t *agent_args;
16007 	suspend_int_msg_t *sus_ptr;
16008 
16009 	agent_args = xmalloc(sizeof(agent_arg_t));
16010 	agent_args->msg_type = REQUEST_SUSPEND_INT;
16011 	agent_args->retry = 0;	/* don't resend, gang scheduler can
16012 				 * quickly induce huge backlog
16013 				 * of agent.c RPCs */
16014 	agent_args->hostlist = hostlist_create(NULL);
16015 	sus_ptr = xmalloc(sizeof(suspend_int_msg_t));
16016 	sus_ptr->job_core_spec = job_ptr->details->core_spec;
16017 	sus_ptr->job_id = job_ptr->job_id;
16018 	sus_ptr->op = op;
16019 	sus_ptr->indf_susp = indf_susp;
16020 	sus_ptr->switch_info = _switch_suspend_info(job_ptr);
16021 
16022 #ifdef HAVE_FRONT_END
16023 	xassert(job_ptr->batch_host);
16024 	if (job_ptr->front_end_ptr) {
16025 		agent_args->protocol_version =
16026 			job_ptr->front_end_ptr->protocol_version;
16027 	}
16028 	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
16029 	agent_args->node_count = 1;
16030 #else
16031 	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
16032 	for (i = 0; i < node_record_count; i++) {
16033 		if (bit_test(job_ptr->node_bitmap, i) == 0)
16034 			continue;
16035 		if (agent_args->protocol_version >
16036 		    node_record_table_ptr[i].protocol_version)
16037 			agent_args->protocol_version =
16038 				node_record_table_ptr[i].protocol_version;
16039 		hostlist_push_host(agent_args->hostlist,
16040 				   node_record_table_ptr[i].name);
16041 		agent_args->node_count++;
16042 	}
16043 #endif
16044 
16045 	if (agent_args->node_count == 0) {
16046 		slurm_free_suspend_int_msg(sus_ptr);
16047 		xfree(agent_args);
16048 		return;
16049 	}
16050 
16051 	agent_args->msg_args = sus_ptr;
16052 	agent_queue_request(agent_args);
16053 	return;
16054 }
16055 
16056 /*
16057  * Specified job is being suspended, release allocated nodes
16058  * job_ptr IN - job to be suspended
16059  * indf_susp IN - set if job is being suspended indefinitely by user
16060  *                or admin, otherwise suspended for gang scheduling
16061  */
16062 static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp)
16063 {
16064 	int i, i_first, i_last, rc = SLURM_SUCCESS;
16065 	node_record_t *node_ptr;
16066 	uint32_t node_flags;
16067 	time_t now = time(NULL);
16068 
16069 	if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS)
16070 		return rc;
16071 
16072 	i_first = bit_ffs(job_ptr->node_bitmap);
16073 	if (i_first >= 0)
16074 		i_last = bit_fls(job_ptr->node_bitmap);
16075 	else
16076 		i_last = -2;
16077 	node_ptr = node_record_table_ptr + i_first;
16078 	for (i = i_first; i <= i_last; i++, node_ptr++) {
16079 		if (!bit_test(job_ptr->node_bitmap, i))
16080 			continue;
16081 		node_ptr->sus_job_cnt++;
16082 		if (node_ptr->run_job_cnt)
16083 			(node_ptr->run_job_cnt)--;
16084 		else {
16085 			error("%s: %pJ node %s run_job_cnt underflow",
16086 			      __func__, job_ptr, node_ptr->name);
16087 		}
16088 		if (job_ptr->details && (job_ptr->details->share_res == 0)) {
16089 			if (node_ptr->no_share_job_cnt)
16090 				(node_ptr->no_share_job_cnt)--;
16091 			else {
16092 				error("%s: %pJ node %s no_share_job_cnt underflow",
16093 				      __func__, job_ptr, node_ptr->name);
16094 			}
16095 			if (node_ptr->no_share_job_cnt == 0)
16096 				bit_set(share_node_bitmap, i);
16097 		}
16098 		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
16099 		if ((node_ptr->run_job_cnt  == 0) &&
16100 		    (node_ptr->comp_job_cnt == 0)) {
16101 			bit_set(idle_node_bitmap, i);
16102 		}
16103 		if (IS_NODE_DOWN(node_ptr)) {
16104 			debug3("%s: %pJ node %s left DOWN",
16105 			       __func__, job_ptr, node_ptr->name);
16106 		} else if (node_ptr->run_job_cnt) {
16107 			node_ptr->node_state = NODE_STATE_ALLOCATED |
16108 					       node_flags;
16109 		} else {
16110 			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
16111 			node_ptr->last_idle  = now;
16112 		}
16113 	}
16114 	last_job_update = last_node_update = now;
16115 	return rc;
16116 }
16117 
16118 /*
16119  * Specified job is being resumed, re-allocate the nodes
16120  * job_ptr IN - job to be resumed
16121  * indf_susp IN - set if job is being resumed from indefinite suspend by user
16122  *                or admin, otherwise resume from gang scheduling
16123  */
16124 static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp)
16125 {
16126 	int i, i_first, i_last, rc = SLURM_SUCCESS;
16127 	node_record_t *node_ptr;
16128 	uint32_t node_flags;
16129 
16130 	if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS)
16131 		return rc;
16132 
16133 	i_first = bit_ffs(job_ptr->node_bitmap);
16134 	if (i_first >= 0)
16135 		i_last = bit_fls(job_ptr->node_bitmap);
16136 	else
16137 		i_last = -2;
16138 	node_ptr = node_record_table_ptr + i_first;
16139 	for (i = i_first; i <= i_last; i++, node_ptr++) {
16140 		if (!bit_test(job_ptr->node_bitmap, i))
16141 			continue;
16142 		if (IS_NODE_DOWN(node_ptr))
16143 			return SLURM_ERROR;
16144 	}
16145 
16146 	node_ptr = node_record_table_ptr + i_first;
16147 	for (i = i_first; i <= i_last; i++, node_ptr++) {
16148 		if (!bit_test(job_ptr->node_bitmap, i))
16149 			continue;
16150 
16151 		if (node_ptr->sus_job_cnt)
16152 			(node_ptr->sus_job_cnt)--;
16153 		else {
16154 			error("Node %s sus_job_cnt underflow",
16155 			      node_ptr->name);
16156 		}
16157 		node_ptr->run_job_cnt++;
16158 		if (job_ptr->details &&
16159 		    (job_ptr->details->share_res == 0)) {
16160 			node_ptr->no_share_job_cnt++;
16161 			if (node_ptr->no_share_job_cnt)
16162 				bit_clear(share_node_bitmap, i);
16163 		}
16164 
16165 		if (slurm_mcs_get_select(job_ptr) == 1) {
16166 			xfree(node_ptr->mcs_label);
16167 			node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
16168 		}
16169 
16170 		bit_clear(idle_node_bitmap, i);
16171 		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
16172 		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
16173 	}
16174 	last_job_update = last_node_update = time(NULL);
16175 	return rc;
16176 }
16177 
16178 static int _job_suspend_switch_test(job_record_t *job_ptr)
16179 {
16180 	int rc = SLURM_SUCCESS;
16181 	ListIterator step_iterator;
16182 	step_record_t *step_ptr;
16183 
16184 	step_iterator = list_iterator_create(job_ptr->step_list);
16185 	while ((step_ptr = list_next(step_iterator))) {
16186 		if (step_ptr->state != JOB_RUNNING)
16187 			continue;
16188 		rc = switch_g_job_suspend_test(step_ptr->switch_job);
16189 		if (rc != SLURM_SUCCESS)
16190 			break;
16191 	}
16192 	list_iterator_destroy (step_iterator);
16193 
16194 	return rc;
16195 }
16196 
16197 /*
16198  * Determine if a job can be resumed.
16199  * Check for multiple jobs on the same nodes with core specialization.
16200  * RET 0 on success, otherwise ESLURM error code
16201  */
16202 static int _job_resume_test(job_record_t *job_ptr)
16203 {
16204 	int rc = SLURM_SUCCESS;
16205 	ListIterator job_iterator;
16206 	job_record_t *test_job_ptr;
16207 
16208 	if ((job_ptr->details == NULL) ||
16209 	    (job_ptr->details->core_spec == NO_VAL16) ||
16210 	    (job_ptr->node_bitmap == NULL))
16211 		return rc;
16212 
16213 	job_iterator = list_iterator_create(job_list);
16214 	while ((test_job_ptr = list_next(job_iterator))) {
16215 		if (test_job_ptr->details &&
16216 		    (test_job_ptr->details->core_spec != NO_VAL16) &&
16217 		    IS_JOB_RUNNING(test_job_ptr) &&
16218 		    test_job_ptr->node_bitmap &&
16219 		    bit_overlap_any(test_job_ptr->node_bitmap,
16220 				    job_ptr->node_bitmap)) {
16221 			rc = ESLURM_NODES_BUSY;
16222 			break;
16223 		}
16224 /* FIXME: Also test for ESLURM_INTERCONNECT_BUSY */
16225 	}
16226 	list_iterator_destroy(job_iterator);
16227 
16228 	return rc;
16229 }
16230 
16231 /*
16232  * _job_suspend_op - perform some suspend/resume operation on a job
16233  * op IN - operation: suspend/resume
16234  * indf_susp IN - set if job is being suspended indefinitely by user or admin
16235  *                and we should clear its priority, otherwise suspended
16236  *		  temporarily for gang scheduling
16237  * RET 0 on success, otherwise ESLURM error code
16238  */
16239 static int _job_suspend_op(job_record_t *job_ptr, uint16_t op, bool indf_susp)
16240 {
16241 	int rc = SLURM_SUCCESS;
16242 	time_t now = time(NULL);
16243 
16244 	if (IS_JOB_PENDING(job_ptr))
16245 		return ESLURM_JOB_PENDING;
16246 	if (IS_JOB_FINISHED(job_ptr))
16247 		return ESLURM_ALREADY_DONE;
16248 	if ((op == SUSPEND_JOB) &&
16249 	    (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS))
16250 		return ESLURM_NOT_SUPPORTED;
16251 	if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr)))
16252 		return rc;
16253 
16254 	/* perform the operation */
16255 	if (op == SUSPEND_JOB) {
16256 		if (IS_JOB_SUSPENDED(job_ptr) && indf_susp) {
16257 			debug("%s: Holding %pJ, re-suspend operation",
16258 			      __func__, job_ptr);
16259 			job_ptr->priority = 0;	/* Prevent gang sched resume */
16260 			return SLURM_SUCCESS;
16261 		}
16262 		if (!IS_JOB_RUNNING(job_ptr))
16263 			return ESLURM_JOB_NOT_RUNNING;
16264 		rc = _suspend_job_nodes(job_ptr, indf_susp);
16265 		if (rc != SLURM_SUCCESS)
16266 			return rc;
16267 		_suspend_job(job_ptr, op, indf_susp);
16268 		job_ptr->job_state = JOB_SUSPENDED;
16269 		if (indf_susp) {    /* Job being manually suspended, not gang */
16270 			debug("%s: Holding %pJ, suspend operation",
16271 			      __func__, job_ptr);
16272 			job_ptr->priority = 0;
16273 			(void) gs_job_fini(job_ptr);
16274 		}
16275 		if (job_ptr->suspend_time) {
16276 			job_ptr->pre_sus_time +=
16277 				difftime(now, job_ptr->suspend_time);
16278 		} else {
16279 			job_ptr->pre_sus_time +=
16280 				difftime(now, job_ptr->start_time);
16281 		}
16282 		suspend_job_step(job_ptr);
16283 	} else if (op == RESUME_JOB) {
16284 		if (!IS_JOB_SUSPENDED(job_ptr))
16285 			return ESLURM_JOB_NOT_SUSPENDED;
16286 		rc = _resume_job_nodes(job_ptr, indf_susp);
16287 		power_g_job_resume(job_ptr);
16288 		if (rc != SLURM_SUCCESS)
16289 			return rc;
16290 		_suspend_job(job_ptr, op, indf_susp);
16291 		if (job_ptr->priority == 0) {
16292 			/* Job was manually suspended, not gang */
16293 			set_job_prio(job_ptr);
16294 			(void) gs_job_start(job_ptr);
16295 		}
16296 		job_ptr->job_state = JOB_RUNNING;
16297 		job_ptr->tot_sus_time +=
16298 			difftime(now, job_ptr->suspend_time);
16299 
16300 		if ((job_ptr->time_limit != INFINITE) &&
16301 		    (!job_ptr->preempt_time)) {
16302 			debug3("%pJ resumed, updating end_time", job_ptr);
16303 			job_ptr->end_time_exp = job_ptr->end_time =
16304 				now + (job_ptr->time_limit * 60)
16305 				- job_ptr->pre_sus_time;
16306 		}
16307 		resume_job_step(job_ptr);
16308 	}
16309 
16310 	job_ptr->time_last_active = now;
16311 	job_ptr->suspend_time = now;
16312 	jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
16313 
16314 	return rc;
16315 }
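
/*
 * Worked example (added for clarity): a job with a 60 minute time limit that
 * ran 10 minutes before being suspended has pre_sus_time = 600 seconds. On
 * resume the code above recomputes
 *	end_time = now + (60 * 60) - 600
 * so the job still gets its remaining 50 minutes of wall time.
 */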
16316 
16317 
16318 /*
16319  * _job_suspend - perform some suspend/resume operation, if the specified
16320  *                job records is a hetjob leader, perform the operation on all
16321  *                components of the hetjob
16322  * job_ptr - job to operate upon
16323  * op IN - operation: suspend/resume
16324  * indf_susp IN - set if job is being suspended indefinitely by user or admin
16325  *                and we should clear its priority, otherwise suspended
16326  *		  temporarily for gang scheduling
16327  * RET 0 on success, otherwise ESLURM error code
16328  */
16329 static int _job_suspend(job_record_t *job_ptr, uint16_t op, bool indf_susp)
16330 {
16331 	job_record_t *het_job;
16332 	int rc = SLURM_SUCCESS, rc1;
16333 	ListIterator iter;
16334 
16335 	if (job_ptr->het_job_id && !job_ptr->het_job_list)
16336 		return ESLURM_NOT_WHOLE_HET_JOB;
16337 
16338 	/* Notify salloc/srun of suspend/resume */
16339 	srun_job_suspend(job_ptr, op);
16340 
16341 	if (job_ptr->het_job_list) {
16342 		iter = list_iterator_create(job_ptr->het_job_list);
16343 		while ((het_job = list_next(iter))) {
16344 			if (job_ptr->het_job_id != het_job->het_job_id) {
16345 				error("%s: Bad het_job_list for %pJ",
16346 				      __func__, job_ptr);
16347 				continue;
16348 			}
16349 			rc1 = _job_suspend_op(het_job, op, indf_susp);
16350 			if (rc1 != SLURM_SUCCESS)
16351 				rc = rc1;
16352 		}
16353 		list_iterator_destroy(iter);
16354 	} else {
16355 		rc = _job_suspend_op(job_ptr, op, indf_susp);
16356 	}
16357 
16358 	return rc;
16359 }
16360 
16361 /*
16362  * job_suspend - perform some suspend/resume operation
16363  * NOTE: job_suspend  - Uses the job_id field and ignores job_id_str
16364  *
16365  * IN sus_ptr - suspend/resume request message
16366  * IN uid - user id of the user issuing the RPC
16367  * IN conn_fd - file descriptor on which to send reply,
16368  *              -1 if none
16369  * indf_susp IN - set if job is being suspended indefinitely by user or admin
16370  *                and we should clear its priority, otherwise suspended
16371  *		  temporarily for gang scheduling
16372  * IN protocol_version - slurm protocol version of client
16373  * RET 0 on success, otherwise ESLURM error code
16374  */
16375 extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
16376 		       int conn_fd, bool indf_susp,
16377 		       uint16_t protocol_version)
16378 {
16379 	int rc = SLURM_SUCCESS;
16380 	job_record_t *job_ptr = NULL;
16381 	slurm_msg_t resp_msg;
16382 	return_code_msg_t rc_msg;
16383 
16384 	xfree(sus_ptr->job_id_str);
16385 	xstrfmtcat(sus_ptr->job_id_str, "%u", sus_ptr->job_id);
16386 
16387 	/* validate the request */
16388 	if (!validate_operator(uid)) {
16389 		error("SECURITY VIOLATION: Attempt to suspend job from user %u",
16390 		      (int) uid);
16391 		rc = ESLURM_ACCESS_DENIED;
16392 		goto reply;
16393 	}
16394 
16395 	/* find the job */
16396 	job_ptr = find_job_record (sus_ptr->job_id);
16397 	if (job_ptr == NULL) {
16398 		rc = ESLURM_INVALID_JOB_ID;
16399 		goto reply;
16400 	}
16401 
16402 	rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16403 
16404     reply:
16405 
16406 	/* Since we have already used it, let's make sure we don't leak
16407 	   memory */
16408 	xfree(sus_ptr->job_id_str);
16409 
16410 	if (conn_fd >= 0) {
16411 		slurm_msg_t_init(&resp_msg);
16412 		resp_msg.protocol_version = protocol_version;
16413 		resp_msg.msg_type  = RESPONSE_SLURM_RC;
16414 		memset(&rc_msg, 0, sizeof(rc_msg));
16415 		rc_msg.return_code = rc;
16416 		resp_msg.data      = &rc_msg;
16417 		slurm_send_node_msg(conn_fd, &resp_msg);
16418 	}
16419 	return rc;
16420 }
16421 
16422 /*
16423  * job_suspend2 - perform some suspend/resume operation
16424  * NOTE: job_suspend2 - Ignores the job_id field and uses job_id_str
16425  *
16426  * IN sus_ptr - suspend/resume request message
16427  * IN uid - user id of the user issuing the RPC
16428  * IN conn_fd - file descriptor on which to send reply,
16429  *              -1 if none
16430  * indf_susp IN - set if job is being suspended indefinitely by user or admin
16431  *                and we should clear its priority, otherwise suspended
16432  *		  temporarily for gang scheduling
16433  * IN protocol_version - slurm protocol version of client
16434  * RET 0 on success, otherwise ESLURM error code
16435  */
16436 extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid,
16437 			int conn_fd, bool indf_susp,
16438 			uint16_t protocol_version)
16439 {
16440 	int rc = SLURM_SUCCESS, rc2;
16441 	job_record_t *job_ptr = NULL;
16442 	long int long_id;
16443 	uint32_t job_id = 0;
16444 	char *end_ptr = NULL, *tok, *tmp;
16445 	bitstr_t *array_bitmap = NULL;
16446 	bool valid = true;
16447 	int32_t i, i_first, i_last;
16448 	slurm_msg_t resp_msg;
16449 	return_code_msg_t rc_msg;
16450 	resp_array_struct_t *resp_array = NULL;
16451 	job_array_resp_msg_t *resp_array_msg = NULL;
16452 
16453 	if (max_array_size == NO_VAL) {
16454 		max_array_size = slurmctld_conf.max_array_sz;
16455 	}
16456 
16457 	/* validate the request */
16458 	if (!validate_operator(uid)) {
16459 		error("SECURITY VIOLATION: Attempt to suspend job from user %u",
16460 		      (int) uid);
16461 		rc = ESLURM_ACCESS_DENIED;
16462 		goto reply;
16463 	}
16464 
16465 	long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10);
16466 	if (end_ptr[0] == '+')
16467 		rc = ESLURM_NOT_WHOLE_HET_JOB;
16468 	else if ((long_id <= 0) || (long_id == LONG_MAX) ||
16469 		 ((end_ptr[0] != '\0') && (end_ptr[0] != '_')))
16470 		rc = ESLURM_INVALID_JOB_ID;
16471 	if (rc != SLURM_SUCCESS) {
16472 		info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
16473 		goto reply;
16474 	}
16475 
16476 	job_id = (uint32_t) long_id;
16477 	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
16478 		job_record_t *job_ptr_done = NULL;
16479 		job_ptr = find_job_record(job_id);
16480 		if (job_ptr &&
16481 		    (((job_ptr->array_task_id == NO_VAL) &&
16482 		      (job_ptr->array_recs == NULL)) ||
16483 		     ((job_ptr->array_task_id != NO_VAL) &&
16484 		      (job_ptr->array_job_id  != job_id)))) {
16485 			/* This is a regular job or single task of job array */
16486 			rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16487 			goto reply;
16488 		}
16489 
16490 		if (job_ptr && job_ptr->array_recs) {
16491 			/* This is a job array */
16492 			job_ptr_done = job_ptr;
16493 			rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16494 			_resp_array_add(&resp_array, job_ptr, rc2);
16495 		}
16496 
16497 		/* Suspend all tasks of this job array */
16498 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
16499 		if (!job_ptr && !job_ptr_done) {
16500 			rc = ESLURM_INVALID_JOB_ID;
16501 			goto reply;
16502 		}
16503 		while (job_ptr) {
16504 			if ((job_ptr->array_job_id == job_id) &&
16505 			    (job_ptr != job_ptr_done)) {
16506 				rc2 = _job_suspend(job_ptr, sus_ptr->op,
16507 						   indf_susp);
16508 				_resp_array_add(&resp_array, job_ptr, rc2);
16509 			}
16510 			job_ptr = job_ptr->job_array_next_j;
16511 		}
16512 		goto reply;
16513 	}
16514 
16515 	array_bitmap = bit_alloc(max_array_size);
16516 	tmp = xstrdup(end_ptr + 1);
16517 	tok = strtok_r(tmp, ",", &end_ptr);
16518 	while (tok && valid) {
16519 		valid = _parse_array_tok(tok, array_bitmap,
16520 					 max_array_size);
16521 		tok = strtok_r(NULL, ",", &end_ptr);
16522 	}
16523 	xfree(tmp);
16524 	if (valid) {
16525 		i_last = bit_fls(array_bitmap);
16526 		if (i_last < 0)
16527 			valid = false;
16528 	}
16529 	if (!valid) {
16530 		info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
16531 		rc = ESLURM_INVALID_JOB_ID;
16532 		goto reply;
16533 	}
16534 
16535 	i_first = bit_ffs(array_bitmap);
16536 	if (i_first >= 0)
16537 		i_last = bit_fls(array_bitmap);
16538 	else
16539 		i_last = -2;
16540 	for (i = i_first; i <= i_last; i++) {
16541 		if (!bit_test(array_bitmap, i))
16542 			continue;
16543 		job_ptr = find_job_array_rec(job_id, i);
16544 		if (job_ptr == NULL) {
16545 			info("%s: invalid JobId=%u_%d", __func__, job_id, i);
16546 			_resp_array_add_id(&resp_array, job_id, i,
16547 					   ESLURM_INVALID_JOB_ID);
16548 			continue;
16549 		}
16550 		rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16551 		_resp_array_add(&resp_array, job_ptr, rc2);
16552 	}
16553 
16554     reply:
16555 	if (conn_fd >= 0) {
16556 		slurm_msg_t_init(&resp_msg);
16557 		resp_msg.protocol_version = protocol_version;
16558 		if (resp_array) {
16559 			resp_array_msg = _resp_array_xlate(resp_array, job_id);
16560 			resp_msg.msg_type  = RESPONSE_JOB_ARRAY_ERRORS;
16561 			resp_msg.data      = resp_array_msg;
16562 		} else {
16563 			resp_msg.msg_type  = RESPONSE_SLURM_RC;
16564 			rc_msg.return_code = rc;
16565 			resp_msg.data      = &rc_msg;
16566 		}
16567 		slurm_send_node_msg(conn_fd, &resp_msg);
16568 
16569 		if (resp_array_msg) {
16570 			slurm_free_job_array_resp(resp_array_msg);
16571 			resp_msg.data = NULL;
16572 		}
16573 	}
16574 	_resp_array_free(resp_array);
16575 
16576 	FREE_NULL_BITMAP(array_bitmap);
16577 
16578 	return rc;
16579 }
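
/*
 * Example (added for clarity): job_suspend2() accepts job ID strings in the
 * forms handled above, e.g. "1234" (a regular job or an entire job array),
 * "1234_7" (a single array task) or "1234_1-5,9" (a set of array task IDs
 * parsed into array_bitmap via _parse_array_tok()). A trailing hetjob
 * component spec such as "1234+1" is rejected with ESLURM_NOT_WHOLE_HET_JOB.
 */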
16580 
16581 /*
16582  * _job_requeue_op - Requeue a running or pending batch job
16583  * IN uid - user id of user issuing the RPC
16584  * IN job_ptr - job to be requeued
16585  * IN preempt - true if job being preempted
16586  * RET 0 on success, otherwise ESLURM error code
16587  */
16588 static int _job_requeue_op(uid_t uid, job_record_t *job_ptr, bool preempt,
16589 			   uint32_t flags)
16590 {
16591 	bool is_running = false, is_suspended = false, is_completed = false;
16592 	bool is_completing = false;
16593 	time_t now = time(NULL);
16594 	uint32_t completing_flags = 0;
16595 
16596 	/* validate the request */
16597 	if ((uid != job_ptr->user_id) && !validate_operator(uid) &&
16598 	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
16599 					  job_ptr->account)) {
16600 		return ESLURM_ACCESS_DENIED;
16601 	}
16602 
16603 	if (((flags & JOB_STATE_BASE) == JOB_RUNNING) &&
16604 	    !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
16605 		return SLURM_SUCCESS;
16606 	}
16607 
16608 	if (flags & JOB_RECONFIG_FAIL)
16609 		node_features_g_get_node(job_ptr->nodes);
16610 
16611 	/*
16612 	 * If the partition was removed, don't allow the job to be
16613 	 * requeued.  If the job has no details, something is very wrong;
16614 	 * and if the job doesn't want to be requeued, don't requeue it.
16615 	 */
16616 	if (!job_ptr->part_ptr || !job_ptr->details
16617 	    || !job_ptr->details->requeue) {
16618 		if (flags & JOB_RECONFIG_FAIL)
16619 			(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
16620 		return ESLURM_DISABLED;
16621 	}
16622 
16623 	if (job_ptr->batch_flag == 0) {
16624 		debug("Job-requeue can only be done for batch jobs");
16625 		if (flags & JOB_RECONFIG_FAIL)
16626 			(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
16627 		return ESLURM_BATCH_ONLY;
16628 	}
16629 
16630 	/*
16631 	 * If the job is already pending, just return an error.
16632 	 * A federated origin job can be pending and revoked with a sibling job
16633 	 * on another cluster.
16634 	 */
16635 	if (IS_JOB_PENDING(job_ptr) &&
16636 	    (!job_ptr->fed_details || !job_ptr->fed_details->cluster_lock))
16637 		return ESLURM_JOB_PENDING;
16638 
16639 	if ((flags & JOB_RECONFIG_FAIL) && IS_JOB_CANCELLED(job_ptr)) {
16640 		/*
16641 		 * Job was cancelled (likely by the user) while node
16642 		 * reconfiguration was in progress, so don't requeue it
16643 		 * if the node reconfiguration failed.
16644 		 */
16645 		return ESLURM_DISABLED;
16646 	}
16647 
16648 	if (job_ptr->fed_details) {
16649 		int rc;
16650 		if ((rc = fed_mgr_job_requeue_test(job_ptr, flags)))
16651 			return rc;
16652 
16653 		/* Sent requeue request to origin cluster */
16654 		if (job_ptr->job_state & JOB_REQUEUE_FED)
16655 			return SLURM_SUCCESS;
16656 	}
16657 
16658 	last_job_update = now;
16659 
16660 	/*
16661 	 * If the job is in the process of completing,
16662 	 * return SLURM_SUCCESS and set the state
16663 	 * to JOB_PENDING, since we support requeue
16664 	 * of done/exited/exiting jobs.
16665 	 */
16666 	if (IS_JOB_COMPLETING(job_ptr)) {
16667 		completing_flags = job_ptr->job_state & JOB_STATE_FLAGS;
16668 		is_completing = true;
16669 	}
16670 
16671 	if (IS_JOB_SUSPENDED(job_ptr)) {
16672 		uint32_t suspend_job_state = job_ptr->job_state;
16673 		/*
16674 		 * we can't have it as suspended when we call the
16675 		 * accounting stuff.
16676 		 */
16677 		job_ptr->job_state = JOB_REQUEUE;
16678 		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
16679 		job_ptr->job_state = suspend_job_state;
16680 		is_suspended = true;
16681 	}
16682 
16683 	job_ptr->time_last_active  = now;
16684 	if (is_suspended)
16685 		job_ptr->end_time = job_ptr->suspend_time;
16686 	else if (!is_completing)
16687 		job_ptr->end_time = now;
16688 
16689 	/*
16690 	 * Save the state of the job so that
16691 	 * we deallocate the nodes if it is in
16692 	 * a running state.
16693 	 */
16694 	if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr))
16695 		is_running = true;
16696 	else if (IS_JOB_COMPLETED(job_ptr))
16697 		is_completed = true;
16698 
16699 	/* Only change state to requeue for local jobs */
16700 	if (fed_mgr_is_origin_job(job_ptr) &&
16701 	    !fed_mgr_is_tracker_only_job(job_ptr)) {
16702 		/*
16703 		 * We want this job to have the requeued/preempted state in the
16704 		 * accounting logs. Set a new submit time so the restarted
16705 		 * job looks like a new job.
16706 		 */
16707 		if (preempt) {
16708 			job_ptr->job_state = JOB_PREEMPTED;
16709 			build_cg_bitmap(job_ptr);
16710 			job_completion_logger(job_ptr, false);
16711 			job_ptr->job_state = JOB_REQUEUE;
16712 		} else {
16713 			job_ptr->job_state = JOB_REQUEUE;
16714 			build_cg_bitmap(job_ptr);
16715 			job_completion_logger(job_ptr, true);
16716 		}
16717 	}
16718 
16719 	/*
16720 	 * Increment restart counter before completing reply so that completing
16721 	 * jobs get counted and so that fed jobs get counted before submitting
16722 	 * new siblings in batch_requeue_fini()
16723 	 */
16724 	job_ptr->restart_cnt++;
16725 
16726 	if (is_completing) {
16727 		job_ptr->job_state = JOB_PENDING | completing_flags;
16728 		goto reply;
16729 	}
16730 
16731 	/*
16732 	 * Deallocate resources only if the job has some.
16733 	 * JOB_COMPLETING is needed to properly clean up steps.
16734 	 */
16735 	if (is_running) {
16736 		job_ptr->job_state |= JOB_COMPLETING;
16737 		deallocate_nodes(job_ptr, false, is_suspended, preempt);
16738 		job_ptr->job_state &= (~JOB_COMPLETING);
16739 	}
16740 
16741 	/* do this after the epilog complete, setting it here is too early */
16742 	//job_ptr->db_index = 0;
16743 	//job_ptr->details->submit_time = now;
16744 
16745 	job_ptr->job_state = JOB_PENDING;
16746 	if (job_ptr->node_cnt)
16747 		job_ptr->job_state |= JOB_COMPLETING;
16748 
16749 	/*
16750 	 * Mark the origin job as requeueing. Will finish requeueing fed job
16751 	 * after job has completed.
16752 	 * If it's completed, batch_requeue_fini is called below and will call
16753 	 * fed_mgr_job_requeue() to submit new siblings.
16754 	 * If it's not completed, batch_requeue_fini will either be called when
16755 	 * the running origin job finishes or the running remote sibling job
16756 	 * reports that the job is finished.
16757 	 */
16758 	if (job_ptr->fed_details && !is_completed) {
16759 		job_ptr->job_state |= JOB_COMPLETING;
16760 		job_ptr->job_state |= JOB_REQUEUE_FED;
16761 	}
16762 
16763 	/*
16764 	 * If we set the time limit it means the user didn't, so reset
16765 	 * it here or we could bust some limit when the job runs again.
16766 	 */
16767 	if (job_ptr->limit_set.time == 1) {
16768 		job_ptr->time_limit = NO_VAL;
16769 		job_ptr->limit_set.time = 0;
16770 	}
16771 
16772 reply:
16773 	job_ptr->pre_sus_time = (time_t) 0;
16774 	job_ptr->suspend_time = (time_t) 0;
16775 	job_ptr->tot_sus_time = (time_t) 0;
16776 
16777 	job_ptr->db_flags = 0;
16778 
16779 	/* clear signal sent flag on requeue */
16780 	job_ptr->warn_flags &= ~WARN_SENT;
16781 
16782 	/*
16783 	 * Since the job completion logger removes the job from the accounting
16784 	 * policy submit counts, we need to add it back here.
16785 	 */
16786 	acct_policy_add_job_submit(job_ptr);
16787 
16788 	acct_policy_update_pending_job(job_ptr);
16789 
16790 	if (flags & JOB_SPECIAL_EXIT) {
16791 		job_ptr->job_state |= JOB_SPECIAL_EXIT;
16792 		job_ptr->state_reason = WAIT_HELD_USER;
16793 		xfree(job_ptr->state_desc);
16794 		job_ptr->state_desc =
16795 			xstrdup("job requeued in special exit state");
16796 		debug("%s: Holding %pJ, special exit", __func__, job_ptr);
16797 		job_ptr->priority = 0;
16798 	}
16799 	if (flags & JOB_REQUEUE_HOLD) {
16800 		job_ptr->state_reason = WAIT_HELD_USER;
16801 		xfree(job_ptr->state_desc);
16802 		if (flags & JOB_LAUNCH_FAILED) {
16803 			job_ptr->state_desc
16804 				= xstrdup("launch failed requeued held");
16805 		} else {
16806 			job_ptr->state_desc
16807 				= xstrdup("job requeued in held state");
16808 		}
16809 		debug("%s: Holding %pJ, requeue-hold exit", __func__, job_ptr);
16810 		job_ptr->priority = 0;
16811 	}
16812 
16813 	/*
16814 	 * When jobs are requeued while running/completing batch_requeue_fini is
16815 	 * called after the job is completely finished.  If the job is already
16816 	 * finished it needs to be called to clear out states (especially the
16817 	 * db_index or we will just write over the last job in the database).
16818 	 * Call batch_requeue_fini after setting priority to 0 for requeue_hold
16819 	 * and special_exit so federation doesn't submit siblings for held job.
16820 	 */
16821 	if (is_completed)
16822 		batch_requeue_fini(job_ptr);
16823 
16824 	debug("%s: %pJ state 0x%x reason %u priority %d",
16825 	      __func__, job_ptr, job_ptr->job_state,
16826 	      job_ptr->state_reason, job_ptr->priority);
16827 
16828 	return SLURM_SUCCESS;
16829 }
16830 
16831 /*
16832  * _job_requeue - Requeue a running or pending batch job, if the specified
16833  *		  job record is a hetjob leader, perform the operation on all
16834  *		  components of the hetjob
16835  * IN uid - user id of user issuing the RPC
16836  * IN job_ptr - job to be requeued
16837  * IN preempt - true if job being preempted
16838  * RET 0 on success, otherwise ESLURM error code
16839  */
16840 static int _job_requeue(uid_t uid, job_record_t *job_ptr, bool preempt,
16841 			uint32_t flags)
16842 {
16843 	job_record_t *het_job;
16844 	int rc = SLURM_SUCCESS, rc1;
16845 	ListIterator iter;
16846 
16847 	if (job_ptr->het_job_id && !job_ptr->het_job_list)
16848 		return ESLURM_NOT_HET_JOB_LEADER;
16849 
16850 	if (job_ptr->het_job_list) {
16851 		iter = list_iterator_create(job_ptr->het_job_list);
16852 		while ((het_job = list_next(iter))) {
16853 			if (job_ptr->het_job_id != het_job->het_job_id) {
16854 				error("%s: Bad het_job_list for %pJ",
16855 				      __func__, job_ptr);
16856 				continue;
16857 			}
16858 			rc1 = _job_requeue_op(uid, het_job, preempt, flags);
16859 			if (rc1 != SLURM_SUCCESS)
16860 				rc = rc1;
16861 		}
16862 		list_iterator_destroy(iter);
16863 	} else {
16864 		rc = _job_requeue_op(uid, job_ptr, preempt, flags);
16865 	}
16866 
16867 	return rc;
16868 }
16869 
16870 /*
16871  * job_requeue - Requeue a running or pending batch job
16872  * IN uid - user id of user issuing the RPC
16873  * IN job_id - id of the job to be requeued
16874  * IN msg - slurm_msg to send response back on
16875  * IN preempt - true if job being preempted
16876  * IN flags - JobExitRequeue | Hold | JobFailed | etc.
16877  * RET 0 on success, otherwise ESLURM error code
16878  */
16879 extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg,
16880 		       bool preempt, uint32_t flags)
16881 {
16882 	int rc = SLURM_SUCCESS;
16883 	job_record_t *job_ptr = NULL;
16884 
16885 	/* find the job */
16886 	job_ptr = find_job_record(job_id);
16887 	if (job_ptr == NULL) {
16888 		rc = ESLURM_INVALID_JOB_ID;
16889 	} else {
16890 		/* _job_requeue already handles het jobs */
16891 		rc = _job_requeue(uid, job_ptr, preempt, flags);
16892 	}
16893 
16894 	if (msg) {
16895 		slurm_send_rc_msg(msg, rc);
16896 	}
16897 
16898 	return rc;
16899 }
16900 
16901 /*
16902  * job_requeue2 - Requeue a running or pending batch job
16903  * IN uid - user id of user issuing the RPC
16904  * IN req_ptr - request including ID of the job to be requeued
16905  * IN msg - slurm_msg to send response back on
16906  * IN preempt - true if job being preempted
16907  * RET 0 on success, otherwise ESLURM error code
16908  */
16909 extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg,
16910 			bool preempt)
16911 {
16912 	int rc = SLURM_SUCCESS, rc2;
16913 	job_record_t *job_ptr = NULL;
16914 	long int long_id;
16915 	uint32_t job_id = 0;
16916 	char *end_ptr = NULL, *tok, *tmp;
16917 	bitstr_t *array_bitmap = NULL;
16918 	bool valid = true;
16919 	int32_t i, i_first, i_last;
16920 	slurm_msg_t resp_msg;
16921 	return_code_msg_t rc_msg;
16922 	uint32_t flags = req_ptr->flags;
16923 	char *job_id_str = req_ptr->job_id_str;
16924 	resp_array_struct_t *resp_array = NULL;
16925 	job_array_resp_msg_t *resp_array_msg = NULL;
16926 
16927 	if (max_array_size == NO_VAL) {
16928 		max_array_size = slurmctld_conf.max_array_sz;
16929 	}
16930 
16931 	long_id = strtol(job_id_str, &end_ptr, 10);
16932 	if ((long_id <= 0) || (long_id == LONG_MAX) ||
16933 	    ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
16934 		info("%s: invalid JobId=%s", __func__, job_id_str);
16935 		rc = ESLURM_INVALID_JOB_ID;
16936 		goto reply;
16937 	}
16938 	if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
16939 		end_ptr += 2;	/* Defaults to full job array */
16940 
16941 	job_id = (uint32_t) long_id;
16942 	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
16943 		job_record_t *job_ptr_done = NULL;
16944 		job_ptr = find_job_record(job_id);
16945 		if (job_ptr &&
16946 		    (((job_ptr->array_task_id == NO_VAL) &&
16947 		      (job_ptr->array_recs == NULL)) ||
16948 		     ((job_ptr->array_task_id != NO_VAL) &&
16949 		      (job_ptr->array_job_id  != job_id)))) {
16950 			/* This is a regular job or single task of job array */
16951 			rc = _job_requeue(uid, job_ptr, preempt, flags);
16952 			goto reply;
16953 		}
16954 
16955 		if (job_ptr && job_ptr->array_recs) {
16956 			/* This is a job array */
16957 			job_ptr_done = job_ptr;
16958 			rc2 = _job_requeue(uid, job_ptr, preempt, flags);
16959 			_resp_array_add(&resp_array, job_ptr, rc2);
16960 		}
16961 
16962 		/* Requeue all tasks of this job array */
16963 		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
16964 		if (!job_ptr && !job_ptr_done) {
16965 			rc = ESLURM_INVALID_JOB_ID;
16966 			goto reply;
16967 		}
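		/*
		 * Walk this job_id's bucket in the job array hash table; any
		 * record whose array_job_id matches (other than one already
		 * handled above) is a task of this array.
		 */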
16968 		while (job_ptr) {
16969 			if ((job_ptr->array_job_id == job_id) &&
16970 			    (job_ptr != job_ptr_done)) {
16971 				rc2 = _job_requeue(uid, job_ptr, preempt,flags);
16972 				_resp_array_add(&resp_array, job_ptr, rc2);
16973 			}
16974 			job_ptr = job_ptr->job_array_next_j;
16975 		}
16976 		goto reply;
16977 	}
16978 
16979 	array_bitmap = bit_alloc(max_array_size);
16980 	tmp = xstrdup(end_ptr + 1);
16981 	tok = strtok_r(tmp, ",", &end_ptr);
16982 	while (tok && valid) {
16983 		valid = _parse_array_tok(tok, array_bitmap,
16984 					 max_array_size);
16985 		tok = strtok_r(NULL, ",", &end_ptr);
16986 	}
16987 	xfree(tmp);
16988 	if (valid) {
16989 		i_last = bit_fls(array_bitmap);
16990 		if (i_last < 0)
16991 			valid = false;
16992 	}
16993 	if (!valid) {
16994 		info("%s: invalid JobId=%s", __func__, job_id_str);
16995 		rc = ESLURM_INVALID_JOB_ID;
16996 		goto reply;
16997 	}
16998 
16999 	i_first = bit_ffs(array_bitmap);
17000 	if (i_first >= 0)
17001 		i_last = bit_fls(array_bitmap);
17002 	else
17003 		i_last = -2;
17004 	for (i = i_first; i <= i_last; i++) {
17005 		if (!bit_test(array_bitmap, i))
17006 			continue;
17007 		job_ptr = find_job_array_rec(job_id, i);
17008 		if (job_ptr == NULL) {
17009 			info("%s: invalid JobId=%u_%d", __func__, job_id, i);
17010 			_resp_array_add_id(&resp_array, job_id, i,
17011 					   ESLURM_INVALID_JOB_ID);
17012 			continue;
17013 		}
17014 
17015 		rc2 = _job_requeue(uid, job_ptr, preempt, flags);
17016 		_resp_array_add(&resp_array, job_ptr, rc2);
17017 	}
17018 
17019     reply:
17020 	if (msg) {
17021 		response_init(&resp_msg, msg);
17022 		if (resp_array) {
17023 			resp_array_msg = _resp_array_xlate(resp_array, job_id);
17024 			resp_msg.msg_type  = RESPONSE_JOB_ARRAY_ERRORS;
17025 			resp_msg.data      = resp_array_msg;
17026 		} else {
17027 			resp_msg.msg_type  = RESPONSE_SLURM_RC;
17028 			rc_msg.return_code = rc;
17029 			resp_msg.data      = &rc_msg;
17030 		}
17031 		slurm_send_node_msg(msg->conn_fd, &resp_msg);
17032 
17033 		if (resp_array_msg) {
17034 			slurm_free_job_array_resp(resp_array_msg);
17035 			resp_msg.data = NULL;
17036 		}
17037 	}
17038 	_resp_array_free(resp_array);
17039 
17040 	FREE_NULL_BITMAP(array_bitmap);
17041 
17042 	return rc;
17043 }
17044 
17045 static int _top_job_flag_clear(void *x, void *arg)
17046 {
17047 	job_record_t *job_ptr = (job_record_t *) x;
17048 	job_ptr->bit_flags &= (~TOP_PRIO_TMP);
17049 	return 0;
17050 }
17051 
17052 /* This sorts so the highest priorities come off the list first */
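/*
 * Note: the comparator receives pointers to the list's stored (uint32_t *)
 * elements, hence the double indirection in the casts below.
 */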
17053 static int _top_job_prio_sort(void *x, void *y)
17054 {
17055 	uint32_t *prio1, *prio2;
17056 	prio1 = *(uint32_t **) x;
17057 	prio2 = *(uint32_t **) y;
17058 	if (*prio1 < *prio2)
17059 		return 1;
17060 	if (*prio1 > *prio2)
17061 		return -1;
17062 	return 0;
17063 }
17064 
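/*
 * _set_top - Give the listed jobs the highest priorities currently held by
 *	comparable pending jobs (same user, partition, QOS and association),
 *	adjusting nice values so the overall priority budget is preserved.
 * Illustrative example (hypothetical values): if the "top" job has priority
 *	100 and a comparable pending job has priority 500, the top job is
 *	raised to 500 (its nice reduced accordingly) and the other job's
 *	priority is lowered so it stays below the new top job.
 * IN top_job_list - job_record_t entries to move to the top of the queue
 * IN uid - user id issuing the request (0 when issued by an operator)
 * RET SLURM_SUCCESS or an ESLURM error code
 */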
17065 static int _set_top(List top_job_list, uid_t uid)
17066 {
17067 	List prio_list, other_job_list;
17068 	ListIterator iter;
17069 	job_record_t *job_ptr, *first_job_ptr = NULL;
17070 	int rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
17071 	uint32_t last_prio = NO_VAL, next_prio;
17072 	int64_t delta_prio, delta_nice, total_delta = 0;
17073 	int other_job_cnt = 0;
17074 	uint32_t *prio_elem;
17075 
17076 	xassert(job_list);
17077 	xassert(top_job_list);
17078 	prio_list = list_create(xfree_ptr);
17079 	(void) list_for_each(job_list, _top_job_flag_clear, NULL);
17080 
17081 	/* Validate the jobs in our "top" list */
17082 	iter = list_iterator_create(top_job_list);
17083 	while ((job_ptr = list_next(iter))) {
17084 		if ((job_ptr->user_id != uid) && (uid != 0)) {
17085 			error("Security violation: REQUEST_TOP_JOB for %pJ from uid=%u",
17086 			      job_ptr, uid);
17087 			rc = ESLURM_ACCESS_DENIED;
17088 			break;
17089 		}
17090 		if (!IS_JOB_PENDING(job_ptr) || (job_ptr->details == NULL)) {
17091 			debug("%s: %pJ not pending",  __func__, job_ptr);
17092 			list_remove(iter);
17093 			rc2 = ESLURM_JOB_NOT_PENDING;
17094 			continue;
17095 		}
17096 		if (job_ptr->part_ptr_list) {
17097 			debug("%s: %pJ in partition list", __func__, job_ptr);
17098 			list_remove(iter);
17099 			rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
17100 			break;
17101 		}
17102 		if (job_ptr->priority == 0) {
17103 			debug("%s: %pJ is held", __func__, job_ptr);
17104 			list_remove(iter);
17105 			rc2 = ESLURM_JOB_HELD;
17106 			continue;
17107 		}
17108 		if (job_ptr->bit_flags & TOP_PRIO_TMP) {
17109 			/* Duplicate job ID */
17110 			list_remove(iter);
17111 			continue;
17112 		}
17113 		if (!first_job_ptr)
17114 			first_job_ptr = job_ptr;
17115 		job_ptr->bit_flags |= TOP_PRIO_TMP;
17116 		prio_elem = xmalloc(sizeof(uint32_t));
17117 		*prio_elem = job_ptr->priority;
17118 		list_append(prio_list, prio_elem);
17119 	}
17120 	list_iterator_destroy(iter);
17121 	if (rc != SLURM_SUCCESS) {
17122 		FREE_NULL_LIST(prio_list);
17123 		return rc;
17124 	}
17125 	if (!first_job_ptr) {
17126 		FREE_NULL_LIST(prio_list);
17127 		return rc2;
17128 	}
17129 
17130 	/* Identify other jobs which we can adjust the nice value of */
17131 	other_job_list = list_create(NULL);
17132 	iter = list_iterator_create(job_list);
17133 	while ((job_ptr = list_next(iter))) {
17134 		/*
17135 		 * Do not select jobs with priority 0 (held), or
17136 		 * priority 1 (would be held if we lowered the priority).
17137 		 */
17138 		if ((job_ptr->bit_flags & TOP_PRIO_TMP) ||
17139 		    (job_ptr->details == NULL) ||
17140 		    (job_ptr->part_ptr_list)   ||
17141 		    (job_ptr->priority <= 1)   ||
17142 		    (job_ptr->assoc_ptr != first_job_ptr->assoc_ptr) ||
17143 		    (job_ptr->part_ptr  != first_job_ptr->part_ptr)  ||
17144 		    (job_ptr->qos_ptr   != first_job_ptr->qos_ptr)   ||
17145 		    (job_ptr->user_id   != first_job_ptr->user_id)   ||
17146 		    (!IS_JOB_PENDING(job_ptr)))
17147 			continue;
17148 		other_job_cnt++;
17149 		job_ptr->bit_flags |= TOP_PRIO_TMP;
17150 		prio_elem = xmalloc(sizeof(uint32_t));
17151 		*prio_elem = job_ptr->priority;
17152 		list_append(prio_list, prio_elem);
17153 		list_append(other_job_list, job_ptr);
17154 	}
17155 	list_iterator_destroy(iter);
17156 
17157 	/* Now adjust nice values and priorities of the listed "top" jobs */
17158 	list_sort(prio_list, _top_job_prio_sort);
17159 	iter = list_iterator_create(top_job_list);
17160 	while ((job_ptr = list_next(iter))) {
17161 		prio_elem = list_pop(prio_list);
17162 		next_prio = *prio_elem;
17163 		xfree(prio_elem);
17164 		if ((last_prio != NO_VAL) && (next_prio == last_prio) &&
17165 		    (last_prio > 2))
17166 			/*
17167 			 * We don't want to set job priority lower than 1, so
17168 			 * last_prio cannot be smaller than 2, since we will
17169 			 * later use last_prio - 1 for the new job priority.
17170 			 */
17171 			next_prio = last_prio - 1;
17172 		last_prio = next_prio;
17173 		delta_prio = (int64_t) next_prio - job_ptr->priority;
17174 		delta_nice = MIN(job_ptr->details->nice, delta_prio);
17175 		total_delta += delta_nice;
17176 		job_ptr->priority = next_prio;
17177 		job_ptr->details->nice -= delta_nice;
17178 		job_ptr->bit_flags &= (~TOP_PRIO_TMP);
17179 	}
17180 	list_iterator_destroy(iter);
17181 	FREE_NULL_LIST(prio_list);
17182 
17183 	/* Now adjust nice values and priorities of remaining affected jobs */
17184 	if (other_job_cnt) {
17185 		iter = list_iterator_create(other_job_list);
17186 		while ((job_ptr = list_next(iter))) {
17187 			delta_prio = total_delta / other_job_cnt;
17188 			next_prio = job_ptr->priority - delta_prio;
17189 			if (next_prio >= last_prio) {
17190 				next_prio = last_prio - 1;
17191 				delta_prio = job_ptr->priority - next_prio;
17192 			}
17193 			delta_nice = delta_prio;
17194 			job_ptr->priority = next_prio;
17195 			job_ptr->details->nice += delta_nice;
17196 			job_ptr->bit_flags &= (~TOP_PRIO_TMP);
17197 			total_delta -= delta_nice;
17198 			if (--other_job_cnt == 0)
17199 				break;	/* Count will match list size anyway */
17200 		}
17201 		list_iterator_destroy(iter);
17202 	}
17203 	FREE_NULL_LIST(other_job_list);
17204 
17205 	last_job_update = time(NULL);
17206 
17207 	return rc;
17208 }
17209 
17210 /*
17211  * job_set_top - Move the specified jobs to the top of the queue (at least
17212  *	for that user ID, partition, account, and QOS).
17213  *
17214  * IN top_ptr - user request
17215  * IN uid - user id of the user issuing the RPC
17216  * IN conn_fd - file descriptor on which to send reply,
17217  *              -1 if none
17218  * IN protocol_version - slurm protocol version of client
17219  * RET 0 on success, otherwise ESLURM error code
17220  */
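/*
 * Accepted job_id_str formats (comma separated), as parsed below:
 *	"123"    - a single job or an entire job array
 *	"123_*"  - an entire job array (explicit form)
 *	"123_7"  - a single task of a job array
 */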
17221 extern int job_set_top(top_job_msg_t *top_ptr, uid_t uid, int conn_fd,
17222 		       uint16_t protocol_version)
17223 {
17224 	int rc = SLURM_SUCCESS;
17225 	List top_job_list = NULL;
17226 	char *job_str_tmp = NULL, *tok, *save_ptr = NULL, *end_ptr = NULL;
17227 	job_record_t *job_ptr = NULL;
17228 	long int long_id;
17229 	uint32_t job_id = 0, task_id = 0;
17230 	slurm_msg_t resp_msg;
17231 	return_code_msg_t rc_msg;
17232 
17233 	if (validate_operator(uid)) {
17234 		uid = 0;
17235 	} else {
17236 		bool disable_user_top = true;
17237 		char *sched_params = slurm_get_sched_params();
17238 		if (xstrcasestr(sched_params, "enable_user_top"))
17239 			disable_user_top = false;
17240 		xfree(sched_params);
17241 		if (disable_user_top) {
17242 			rc = ESLURM_ACCESS_DENIED;
17243 			goto reply;
17244 		}
17245 	}
17246 
17247 	top_job_list = list_create(NULL);
17248 	job_str_tmp = xstrdup(top_ptr->job_id_str);
17249 	tok = strtok_r(job_str_tmp, ",", &save_ptr);
17250 	while (tok) {
17251 		long_id = strtol(tok, &end_ptr, 10);
17252 		if ((long_id <= 0) || (long_id == LONG_MAX) ||
17253 		    ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
17254 			info("%s: invalid job id %s", __func__, tok);
17255 			rc = ESLURM_INVALID_JOB_ID;
17256 			goto reply;
17257 		}
17258 		job_id = (uint32_t) long_id;
17259 		if ((end_ptr[0] == '\0') || /* Single job (or full job array) */
17260 		    ((end_ptr[0] == '_') && (end_ptr[1] == '*') &&
17261 		     (end_ptr[2] == '\0'))) {
17262 			job_ptr = find_job_record(job_id);
17263 			if (!job_ptr) {
17264 				rc = ESLURM_INVALID_JOB_ID;
17265 				goto reply;
17266 			}
17267 			list_append(top_job_list, job_ptr);
17268 		} else if (end_ptr[0] != '_') {        /* Invalid job ID spec */
17269 			rc = ESLURM_INVALID_JOB_ID;
17270 			goto reply;
17271 		} else {		/* Single task of a job array */
17272 			task_id = strtol(end_ptr + 1, &end_ptr, 10);
17273 			if (end_ptr[0] != '\0') {      /* Invalid job ID spec */
17274 				rc = ESLURM_INVALID_JOB_ID;
17275 				goto reply;
17276 			}
17277 			job_ptr = find_job_array_rec(job_id, task_id);
17278 			if (!job_ptr) {
17279 				rc = ESLURM_INVALID_JOB_ID;
17280 				goto reply;
17281 			}
17282 			list_append(top_job_list, job_ptr);
17283 		}
17284 		tok = strtok_r(NULL, ",", &save_ptr);
17285 	}
17286 
17287 	if (list_count(top_job_list) == 0) {
17288 		rc = ESLURM_INVALID_JOB_ID;
17289 		goto reply;
17290 	}
17291 	rc = _set_top(top_job_list, uid);
17292 
17293 reply:	FREE_NULL_LIST(top_job_list);
17294 	xfree(job_str_tmp);
17295 	if (conn_fd >= 0) {
17296 		slurm_msg_t_init(&resp_msg);
17297 		resp_msg.protocol_version = protocol_version;
17298 		resp_msg.msg_type  = RESPONSE_SLURM_RC;
17299 		memset(&rc_msg, 0, sizeof(rc_msg));
17300 		rc_msg.return_code = rc;
17301 		resp_msg.data      = &rc_msg;
17302 		slurm_send_node_msg(conn_fd, &resp_msg);
17303 	}
17304 
17305 	return rc;
17306 }
17307 
17308 /*
17309  * job_end_time - Process JOB_END_TIME
17310  * IN time_req_msg - job end time request
17311  * OUT timeout_msg - job timeout response to be sent
17312  * RET SLURM_SUCCESS or an error code
17313  */
17314 extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
17315 			srun_timeout_msg_t *timeout_msg)
17316 {
17317 	job_record_t *job_ptr;
17318 	xassert(timeout_msg);
17319 
17320 	job_ptr = find_job_record(time_req_msg->job_id);
17321 	if (!job_ptr)
17322 		return ESLURM_INVALID_JOB_ID;
17323 
17324 	memset(timeout_msg, 0, sizeof(srun_timeout_msg_t));
17325 	timeout_msg->job_id  = time_req_msg->job_id;
17326 	timeout_msg->step_id = NO_VAL;
17327 	timeout_msg->timeout = job_ptr->end_time;
17328 	return SLURM_SUCCESS;
17329 }
17330 
17331 /* Reset nodes_completing field for all jobs. */
17332 extern void update_job_nodes_completing(void)
17333 {
17334 	ListIterator job_iterator;
17335 	job_record_t *job_ptr;
17336 
17337 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
17338 
17339 	if (!job_list)
17340 		return;
17341 
17342 	job_iterator = list_iterator_create(job_list);
17343 	while ((job_ptr = list_next(job_iterator))) {
17344 		if ((!IS_JOB_COMPLETING(job_ptr)) ||
17345 		    (job_ptr->node_bitmap == NULL))
17346 			continue;
17347 		xfree(job_ptr->nodes_completing);
17348 		if (job_ptr->node_bitmap_cg) {
17349 			job_ptr->nodes_completing =
17350 				bitmap2node_name(job_ptr->node_bitmap_cg);
17351 		} else {
17352 			job_ptr->nodes_completing =
17353 				bitmap2node_name(job_ptr->node_bitmap);
17354 		}
17355 	}
17356 	list_iterator_destroy(job_iterator);
17357 }
17358 
17359 /*
17360  * job_hold_by_assoc_id - Hold all pending jobs with a given
17361  *	association ID. This happens when an association is deleted (e.g. when
17362  *	a user is removed from the association database).
17363  * RET count of held jobs
17364  */
17365 extern int job_hold_by_assoc_id(uint32_t assoc_id)
17366 {
17367 	int cnt = 0;
17368 	ListIterator job_iterator;
17369 	job_record_t *job_ptr;
17370 	/* Write lock on jobs */
17371 	slurmctld_lock_t job_write_lock =
17372 		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
17373 
17374 	if (!job_list)
17375 		return cnt;
17376 
17377 	lock_slurmctld(job_write_lock);
17378 	job_iterator = list_iterator_create(job_list);
17379 	while ((job_ptr = list_next(job_iterator))) {
17380 		if (job_ptr->assoc_id != assoc_id)
17381 			continue;
17382 
17383 		cnt += _job_fail_account(job_ptr, __func__);
17384 	}
17385 	list_iterator_destroy(job_iterator);
17386 	unlock_slurmctld(job_write_lock);
17387 	return cnt;
17388 }
17389 
17390 /*
17391  * job_hold_by_qos_id - Hold all pending jobs with a given
17392  *	QOS ID. This happens when a QOS is deleted (e.g. when
17393  *	a QOS is removed from the association database).
17394  * RET count of held jobs
17395  */
17396 extern int job_hold_by_qos_id(uint32_t qos_id)
17397 {
17398 	int cnt = 0;
17399 	ListIterator job_iterator;
17400 	job_record_t *job_ptr;
17401 	/* Write lock on jobs */
17402 	slurmctld_lock_t job_write_lock =
17403 		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
17404 
17405 	if (!job_list)
17406 		return cnt;
17407 
17408 	lock_slurmctld(job_write_lock);
17409 	job_iterator = list_iterator_create(job_list);
17410 	while ((job_ptr = list_next(job_iterator))) {
17411 		if (job_ptr->qos_blocking_ptr &&
17412 		    ((slurmdb_qos_rec_t *)job_ptr->qos_blocking_ptr)->id
17413 		    == qos_id)
17414 			job_ptr->qos_blocking_ptr = NULL;
17415 		if (job_ptr->qos_id != qos_id)
17416 			continue;
17417 
17418 		cnt += job_fail_qos(job_ptr, __func__);
17419 	}
17420 	list_iterator_destroy(job_iterator);
17421 	unlock_slurmctld(job_write_lock);
17422 	return cnt;
17423 }
17424 
17425 /*
17426  * Modify the wckey associated with a pending job
17427  * IN module - where this is called from
17428  * IN job_ptr - pointer to job which should be modified
17429  * IN new_wckey - desired wckey name
17430  * RET SLURM_SUCCESS or error code
17431  */
17432 extern int update_job_wckey(char *module, job_record_t *job_ptr,
17433 			    char *new_wckey)
17434 {
17435 	slurmdb_wckey_rec_t wckey_rec, *wckey_ptr;
17436 
17437 	if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) {
17438 		info("%s: attempt to modify wckey for non-pending %pJ",
17439 		     module, job_ptr);
17440 		return ESLURM_JOB_NOT_PENDING;
17441 	}
17442 
17443 	memset(&wckey_rec, 0, sizeof(wckey_rec));
17444 	wckey_rec.uid       = job_ptr->user_id;
17445 	wckey_rec.name      = new_wckey;
17446 	if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
17447 				    accounting_enforce, &wckey_ptr, false)) {
17448 		info("%s: invalid wckey %s for %pJ",
17449 		     module, new_wckey, job_ptr);
17450 		return ESLURM_INVALID_WCKEY;
17451 	} else if (association_based_accounting
17452 		  && !wckey_ptr
17453 		  && !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) {
17454 		/* if not enforcing associations we want to look for
17455 		   the default wckey and use it to avoid getting
17456 		   trash in the accounting records.
17457 		*/
17458 		wckey_rec.name = NULL;
17459 		assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
17460 					accounting_enforce, &wckey_ptr, false);
17461 		if (!wckey_ptr) {
17462 			debug("%s: we didn't have a wckey record for wckey "
17463 			      "'%s' and user '%u', and we can't seem to find "
17464 			      "a default one either.  Setting it anyway. "
17465 			      "This will produce trash in accounting.  "
17466 			      "If this is not what you desire please put "
17467 			      "AccountingStorageEnforce=wckeys in your slurm.conf "
17468 			      "file.", module, new_wckey,
17469 			      job_ptr->user_id);
17470 			wckey_rec.name = new_wckey;
17471 		}
17472 	}
17473 
17474 	xfree(job_ptr->wckey);
17475 	if (wckey_rec.name && wckey_rec.name[0] != '\0') {
17476 		job_ptr->wckey = xstrdup(wckey_rec.name);
17477 		info("%s: setting wckey to %s for %pJ",
17478 		     module, wckey_rec.name, job_ptr);
17479 	} else {
17480 		info("%s: cleared wckey for %pJ", module, job_ptr);
17481 	}
17482 
17483 	last_job_update = time(NULL);
17484 
17485 	return SLURM_SUCCESS;
17486 }
17487 
17488 extern int send_jobs_to_accounting(void)
17489 {
17490 	ListIterator itr = NULL;
17491 	job_record_t *job_ptr;
17492 	slurmctld_lock_t job_write_lock = {
17493 		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
17494 
17495 	/* send jobs in pending or running state */
17496 	lock_slurmctld(job_write_lock);
17497 	itr = list_iterator_create(job_list);
17498 	while ((job_ptr = list_next(itr))) {
17499 		if (!job_ptr->assoc_id) {
17500 			slurmdb_assoc_rec_t assoc_rec;
17501 			memset(&assoc_rec, 0,
17502 			       sizeof(assoc_rec));
17503 			assoc_rec.acct      = job_ptr->account;
17504 			if (job_ptr->part_ptr)
17505 				assoc_rec.partition = job_ptr->part_ptr->name;
17506 			assoc_rec.uid       = job_ptr->user_id;
17507 
17508 			if (assoc_mgr_fill_in_assoc(
17509 				   acct_db_conn, &assoc_rec,
17510 				   accounting_enforce,
17511 				   &job_ptr->assoc_ptr, false) &&
17512 			    (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
17513 			    && (!IS_JOB_FINISHED(job_ptr))) {
17514 				_job_fail_account(job_ptr, __func__);
17515 				continue;
17516 			} else
17517 				job_ptr->assoc_id = assoc_rec.id;
17518 		}
17519 
17520 		/* we only want active, not yet accounted for jobs */
17521 		if (job_ptr->db_index || IS_JOB_FINISHED(job_ptr))
17522 			continue;
17523 
17524 		debug("first reg: starting %pJ in accounting", job_ptr);
17525 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
17526 
17527 		if (IS_JOB_SUSPENDED(job_ptr))
17528 			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
17529 	}
17530 	list_iterator_destroy(itr);
17531 	unlock_slurmctld(job_write_lock);
17532 
17533 	return SLURM_SUCCESS;
17534 }
17535 
17536 /*
17537  * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job.
17538  * IN job_ptr - the job record
17539  * RET the job_desc_msg_t, NULL on error
17540  */
17541 extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr)
17542 {
17543 	job_desc_msg_t *job_desc;
17544 	struct job_details *details = job_ptr->details;
17545 	multi_core_data_t *mc_ptr = details->mc_ptr;
17546 	int i;
17547 
17548 	/* construct a job_desc_msg_t from job */
17549 	job_desc = xmalloc(sizeof(job_desc_msg_t));
17550 
17551 	job_desc->account           = xstrdup(job_ptr->account);
17552 	job_desc->acctg_freq        = xstrdup(details->acctg_freq);
17553 	job_desc->alloc_node        = xstrdup(job_ptr->alloc_node);
17554 	/* Since the allocating salloc or srun is not expected to exist
17555 	 * when this checkpointed job is restarted, do not save these:
17556 	 *
17557 	 * job_desc->alloc_resp_port   = job_ptr->alloc_resp_port;
17558 	 * job_desc->alloc_sid         = job_ptr->alloc_sid;
17559 	 */
17560 	job_desc->argc              = details->argc;
17561 	job_desc->argv              = xcalloc(job_desc->argc, sizeof(char *));
17562 	for (i = 0; i < job_desc->argc; i ++)
17563 		job_desc->argv[i]   = xstrdup(details->argv[i]);
17564 	job_desc->begin_time        = details->begin_time;
17565 	job_desc->bitflags 	    = job_ptr->bit_flags;
17566 	job_desc->clusters          = xstrdup(job_ptr->clusters);
17567 	job_desc->comment           = xstrdup(job_ptr->comment);
17568 	job_desc->contiguous        = details->contiguous;
17569 	job_desc->core_spec         = details->core_spec;
17570 	job_desc->cpu_bind          = xstrdup(details->cpu_bind);
17571 	job_desc->cpu_bind_type     = details->cpu_bind_type;
17572 	job_desc->cpu_freq_min      = details->cpu_freq_min;
17573 	job_desc->cpu_freq_max      = details->cpu_freq_max;
17574 	job_desc->cpu_freq_gov      = details->cpu_freq_gov;
17575 	job_desc->deadline          = job_ptr->deadline;
17576 	job_desc->dependency        = xstrdup(details->dependency);
17577 	job_desc->end_time          = 0; /* Unused today */
17578 	job_desc->environment       = get_job_env(job_ptr,
17579 						  &job_desc->env_size);
17580 	job_desc->exc_nodes         = xstrdup(details->exc_nodes);
17581 	job_desc->features          = xstrdup(details->features);
17582 	job_desc->cluster_features  = xstrdup(details->cluster_features);
17583 	job_desc->group_id          = job_ptr->group_id;
17584 	job_desc->immediate         = 0; /* nowhere to get this value */
17585 	job_desc->job_id            = job_ptr->job_id;
17586 	job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail;
17587 	job_desc->licenses          = xstrdup(job_ptr->licenses);
17588 	job_desc->mail_type         = job_ptr->mail_type;
17589 	job_desc->mail_user         = xstrdup(job_ptr->mail_user);
17590 	job_desc->mcs_label	    = xstrdup(job_ptr->mcs_label);
17591 	job_desc->mem_bind          = xstrdup(details->mem_bind);
17592 	job_desc->mem_bind_type     = details->mem_bind_type;
17593 	job_desc->name              = xstrdup(job_ptr->name);
17594 	job_desc->network           = xstrdup(job_ptr->network);
17595 	job_desc->nice              = details->nice;
17596 	job_desc->num_tasks         = details->num_tasks;
17597 	job_desc->open_mode         = details->open_mode;
17598 	job_desc->origin_cluster    = xstrdup(job_ptr->origin_cluster);
17599 	job_desc->other_port        = job_ptr->other_port;
17600 	job_desc->power_flags       = job_ptr->power_flags;
17601 	job_desc->overcommit        = details->overcommit;
17602 	job_desc->partition         = xstrdup(job_ptr->partition);
17603 	job_desc->plane_size        = details->plane_size;
17604 	job_desc->priority          = job_ptr->priority;
17605 	if (job_ptr->qos_ptr)
17606 		job_desc->qos       = xstrdup(job_ptr->qos_ptr->name);
17607 	job_desc->resp_host         = xstrdup(job_ptr->resp_host);
17608 	job_desc->req_nodes         = xstrdup(details->req_nodes);
17609 	job_desc->requeue           = details->requeue;
17610 	job_desc->reservation       = xstrdup(job_ptr->resv_name);
17611 	job_desc->restart_cnt       = job_ptr->restart_cnt;
17612 	job_desc->script_buf        = get_job_script(job_ptr);
17613 	if (details->share_res == 1)
17614 		job_desc->shared     = JOB_SHARED_OK;
17615 	else if (details->whole_node == WHOLE_NODE_REQUIRED)
17616 		job_desc->shared     =  JOB_SHARED_NONE;
17617 	else if (details->whole_node == WHOLE_NODE_USER)
17618 		job_desc->shared     =  JOB_SHARED_USER;
17619 	else if (details->whole_node == WHOLE_NODE_MCS)
17620 		job_desc->shared     =  JOB_SHARED_MCS;
17621 	else
17622 		job_desc->shared     = NO_VAL16;
17623 	job_desc->spank_job_env_size = job_ptr->spank_job_env_size;
17624 	job_desc->spank_job_env      = xcalloc(job_desc->spank_job_env_size,
17625 					       sizeof(char *));
17626 	for (i = 0; i < job_desc->spank_job_env_size; i ++)
17627 		job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]);
17628 	job_desc->std_err           = xstrdup(details->std_err);
17629 	job_desc->std_in            = xstrdup(details->std_in);
17630 	job_desc->std_out           = xstrdup(details->std_out);
17631 	job_desc->task_dist         = details->task_dist;
17632 	job_desc->time_limit        = job_ptr->time_limit;
17633 	job_desc->time_min          = job_ptr->time_min;
17634 	job_desc->user_id           = job_ptr->user_id;
17635 	job_desc->wait_all_nodes    = job_ptr->wait_all_nodes;
17636 	job_desc->warn_flags        = job_ptr->warn_flags;
17637 	job_desc->warn_signal       = job_ptr->warn_signal;
17638 	job_desc->warn_time         = job_ptr->warn_time;
17639 	job_desc->wckey             = xstrdup(job_ptr->wckey);
17640 	job_desc->work_dir          = xstrdup(details->work_dir);
17641 	job_desc->pn_min_cpus       = details->pn_min_cpus;
17642 	job_desc->pn_min_memory     = details->pn_min_memory;
17643 	job_desc->pn_min_tmp_disk   = details->pn_min_tmp_disk;
17644 	job_desc->min_cpus          = details->min_cpus;
17645 	job_desc->max_cpus          = details->max_cpus;
17646 	job_desc->min_nodes         = details->min_nodes;
17647 	job_desc->max_nodes         = details->max_nodes;
17648 	if (job_desc->max_nodes == 0) /* set 0 in _job_create() */
17649 		job_desc->max_nodes = NO_VAL;
17650 	job_desc->sockets_per_node  = mc_ptr->sockets_per_node;
17651 	job_desc->cores_per_socket  = mc_ptr->cores_per_socket;
17652 	job_desc->threads_per_core  = mc_ptr->threads_per_core;
17653 	job_desc->cpus_per_task     = details->cpus_per_task;
17654 	job_desc->ntasks_per_node   = details->ntasks_per_node;
17655 	job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket;
17656 	job_desc->ntasks_per_core   = mc_ptr->ntasks_per_core;
17657 
17658 	job_desc->cpus_per_tres     = xstrdup(job_ptr->cpus_per_tres);
17659 	job_desc->mem_per_tres      = xstrdup(job_ptr->mem_per_tres);
17660 	job_desc->tres_bind         = xstrdup(job_ptr->tres_bind);
17661 	job_desc->tres_freq         = xstrdup(job_ptr->tres_freq);
17662 	job_desc->tres_per_job      = xstrdup(job_ptr->tres_per_job);
17663 	job_desc->tres_per_node     = xstrdup(job_ptr->tres_per_node);
17664 	job_desc->tres_per_socket   = xstrdup(job_ptr->tres_per_socket);
17665 	job_desc->tres_per_task     = xstrdup(job_ptr->tres_per_task);
17666 
17667 	if (job_ptr->fed_details) {
17668 		job_desc->fed_siblings_active =
17669 			job_ptr->fed_details->siblings_active;
17670 		job_desc->fed_siblings_viable =
17671 			job_ptr->fed_details->siblings_viable;
17672 	}
17673 
17674 	return job_desc;
17675 }
17676 
17677 /* Build a bitmap of nodes completing this job */
17678 extern void build_cg_bitmap(job_record_t *job_ptr)
17679 {
17680 	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
17681 	if (job_ptr->node_bitmap) {
17682 		job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap);
17683 		if (bit_set_count(job_ptr->node_bitmap_cg) == 0)
17684 			job_ptr->job_state &= (~JOB_COMPLETING);
17685 	} else {
17686 		error("build_cg_bitmap: node_bitmap is NULL");
17687 		job_ptr->node_bitmap_cg = bit_alloc(node_record_count);
17688 		job_ptr->job_state &= (~JOB_COMPLETING);
17689 	}
17690 }
17691 
17692 /* job_hold_requeue()
17693  *
17694  * Requeue the job based upon its current state.
17695  * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state.
17696  * If JOB_REQUEUE_HOLD then requeue and hold.
17697  * If JOB_REQUEUE then requeue and let it run again.
17698  * The requeue can happen directly from job_requeue() or from
17699  * job_epilog_complete() after the last component has finished.
17700  *
17701  * RET returns true if the job was requeued
17702  */
17703 extern bool job_hold_requeue(job_record_t *job_ptr)
17704 {
17705 	uint32_t state;
17706 	uint32_t flags;
17707 	job_record_t *base_job_ptr = NULL;
17708 
17709 	xassert(job_ptr);
17710 
17711 	/* If the job is already pending it has
17712 	 * already been requeued somewhere else.
17713 	 */
17714 	if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr))
17715 		return false;
17716 
17717 	/* If the job is not on the origin cluster, then don't worry about
17718 	 * requeueing the job here. The exit code will be sent to the origin
17719 	 * cluster and the origin cluster will decide if the job should be
17720 	 * requeued or not. */
17721 	if (!fed_mgr_is_origin_job(job_ptr))
17722 		return false;
17723 
17724 	/*
17725 	 * A job may be canceled during its epilog in which case we need to
17726 	 * check that the job (or base job in the case of an array) was not
17727 	 * canceled before attempting to requeue.
17728 	 */
17729 	if (IS_JOB_CANCELLED(job_ptr) ||
17730 	    (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
17731 	     (base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
17732 	     base_job_ptr->array_recs && IS_JOB_CANCELLED(base_job_ptr)))
17733 		return false;
17734 
17735 	/* Check if the job exited with one of the
17736 	 * configured requeue exit values. */
17737 	_set_job_requeue_exit_value(job_ptr);
17738 
17739 	state = job_ptr->job_state;
17740 
17741 	if (! (state & JOB_REQUEUE))
17742 		return false;
17743 
17744 	/* Send the requeue event to the database. */
17745 	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
17746 	    job_ptr->tres_alloc_cnt &&
17747 	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
17748 		set_job_tres_alloc_str(job_ptr, false);
17749 	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
17750 
17751 	debug("%s: %pJ state 0x%x", __func__, job_ptr, state);
17752 
17753 	/* Set the job pending */
17754 	flags = job_ptr->job_state & JOB_STATE_FLAGS;
17755 	job_ptr->job_state = JOB_PENDING | flags;
17756 
17757 	job_ptr->restart_cnt++;
17758 
17759 	/* clear signal sent flag on requeue */
17760 	job_ptr->warn_flags &= ~WARN_SENT;
17761 
17762 	/*
17763 	 * Test if the user wants to requeue the job
17764 	 * held or with a special exit value.
17765 	 */
17766 	if (state & JOB_SPECIAL_EXIT) {
17767 		/*
17768 		 * JOB_SPECIAL_EXIT means requeue the job,
17769 		 * put it on hold and display state as JOB_SPECIAL_EXIT.
17770 		 */
17771 		job_ptr->job_state |= JOB_SPECIAL_EXIT;
17772 		job_ptr->state_reason = WAIT_HELD_USER;
17773 		debug("%s: Holding %pJ, special exit", __func__, job_ptr);
17774 		job_ptr->priority = 0;
17775 	}
17776 
17777 	job_ptr->job_state &= ~JOB_REQUEUE;
17778 
17779 	/*
17780 	 * Mark array as requeued. Exit codes have already been handled in
17781 	 * _job_array_comp()
17782 	 */
17783 	if (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
17784 	    (base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
17785 	    base_job_ptr->array_recs) {
17786 		base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED;
17787 	}
17788 
17789 	debug("%s: %pJ state 0x%x reason %u priority %d",
17790 	      __func__, job_ptr, job_ptr->job_state,
17791 	      job_ptr->state_reason, job_ptr->priority);
17792 
17793 	return true;
17794 }
17795 
17796 static void _parse_max_depend_depth(char *str)
17797 {
17798 	int i = atoi(str);
17799 	if (i < 0)
17800 		error("ignoring max_depend_depth value of %d", i);
17801 	else
17802 		max_depend_depth = i;
17803 }
17804 
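/*
 * init_depend_policy - Parse DependencyParameters (and, for now, the
 *	deprecated SchedulerParameters equivalents) for dependency options.
 * Illustrative slurm.conf line (assumed example):
 *	DependencyParameters=kill_invalid_depend,max_depend_depth=5
 */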
17805 extern void init_depend_policy(void)
17806 {
17807 	char *depend_params = slurm_get_dependency_params();
17808 	char *sched_params = slurm_get_sched_params();
17809 	char *tmp_ptr;
17810 
17811 	disable_remote_singleton =
17812 		(xstrcasestr(depend_params, "disable_remote_singleton")) ?
17813 		true : false;
17814 
17815 	/*
17816 	 * kill_invalid_depend and max_depend_depth are moving from
17817 	 * SchedulerParameters to DependencyParameters. Support both for 20.02,
17818 	 * then remove them from SchedulerParameters in a future release.
17819 	 */
17820 	if (xstrcasestr(sched_params, "kill_invalid_depend")) {
17821 		info("kill_invalid_depend is deprecated in SchedulerParameters and moved to DependencyParameters");
17822 		kill_invalid_dep = true;
17823 	} else
17824 		kill_invalid_dep =
17825 			(xstrcasestr(depend_params, "kill_invalid_depend")) ?
17826 			true : false;
17827 
17828 	/* 					   01234567890123456 */
17829 	if ((tmp_ptr = xstrcasestr(depend_params, "max_depend_depth=")))
17830 		_parse_max_depend_depth(tmp_ptr + 17);
17831 	else if ((tmp_ptr = xstrcasestr(sched_params, "max_depend_depth="))) {
17832 		info("max_depend_depth is deprecated in SchedulerParameters and moved to DependencyParameters");
17833 		_parse_max_depend_depth(tmp_ptr + 17);
17834 	} else
17835 		max_depend_depth = 10;
17836 
17837 	xfree(depend_params);
17838 	xfree(sched_params);
17839 
17840 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_DEPENDENCY)
17841 		info("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d",
17842 		     __func__, kill_invalid_dep, disable_remote_singleton,
17843 		     max_depend_depth);
17844 	else
17845 		debug2("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d",
17846 		       __func__, kill_invalid_dep, disable_remote_singleton,
17847 		       max_depend_depth);
17848 }
17849 
17850 /* init_requeue_policy()
17851  * Initialize the requeue exit/hold bitmaps.
17852  */
17853 extern void init_requeue_policy(void)
17854 {
17855 	/* clean first as we can be reconfiguring */
17856 	FREE_NULL_BITMAP(requeue_exit);
17857 	FREE_NULL_BITMAP(requeue_exit_hold);
17858 
17859 	requeue_exit = _make_requeue_array(slurmctld_conf.requeue_exit);
17860 	requeue_exit_hold = _make_requeue_array(
17861 		slurmctld_conf.requeue_exit_hold);
17862 }
17863 
17864 /* _make_requeue_array()
17865  *
17866  * Process a RequeueExit or RequeueExitHold configuration
17867  * parameter, creating a bitmap holding the exit values
17868  * for which jobs have to be requeued.
17869  */
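/*
 * Illustrative example (hypothetical values): a configuration string of
 * "1,2,100-199" is wrapped as "[1,2,100-199]", expanded via the hostset
 * logic, and yields a bitmap with bits 1, 2 and 100 through 199 set.
 */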
17870 static bitstr_t *_make_requeue_array(char *conf_buf)
17871 {
17872 	hostset_t hs;
17873 	bitstr_t *bs = NULL;
17874 	char *tok = NULL, *end_ptr = NULL;
17875 	long val;
17876 
17877 	if (conf_buf == NULL)
17878 		return bs;
17879 
17880 	xstrfmtcat(tok, "[%s]", conf_buf);
17881 	hs = hostset_create(tok);
17882 	xfree(tok);
17883 	if (!hs) {
17884 		error("%s: exit values: %s", __func__, conf_buf);
17885 		return bs;
17886 	}
17887 
17888 	debug("%s: exit values: %s", __func__, conf_buf);
17889 
17890 	bs = bit_alloc(MAX_EXIT_VAL + 1);
17891 	while ((tok = hostset_shift(hs))) {
17892 		val = strtol(tok, &end_ptr, 10);
17893 		if ((end_ptr[0] == '\0') &&
17894 		    (val >= 0) && (val <= MAX_EXIT_VAL)) {
17895 			bit_set(bs, val);
17896 		} else {
17897 			error("%s: exit values: %s (%s)",
17898 			      __func__, conf_buf, tok);
17899 		}
17900 		free(tok);
17901 	}
17902 	hostset_destroy(hs);
17903 
17904 	return bs;
17905 }
17906 
17907 /* _set_job_requeue_exit_value()
17908  *
17909  * Compare the job exit value with the configured
17910  * RequeueExit and RequeueExitHold values. If a match is
17911  * found, set the appropriate state for job_hold_requeue().
17912  */
17913 static void _set_job_requeue_exit_value(job_record_t *job_ptr)
17914 {
17915 	int exit_code;
17916 
17917 	exit_code = WEXITSTATUS(job_ptr->exit_code);
17918 	if ((exit_code < 0) || (exit_code > MAX_EXIT_VAL))
17919 		return;
17920 
17921 	if (requeue_exit && bit_test(requeue_exit, exit_code)) {
17922 		debug2("%s: %pJ exit code %d state JOB_REQUEUE",
17923 		       __func__, job_ptr, exit_code);
17924 		job_ptr->job_state |= JOB_REQUEUE;
17925 		return;
17926 	}
17927 
17928 	if (requeue_exit_hold && bit_test(requeue_exit_hold, exit_code)) {
17929 		/* Not sure if want to set special exit state in this case */
17930 		debug2("%s: %pJ exit code %d state JOB_SPECIAL_EXIT",
17931 		       __func__, job_ptr, exit_code);
17932 		job_ptr->job_state |= JOB_REQUEUE;
17933 		job_ptr->job_state |= JOB_SPECIAL_EXIT;
17934 		return;
17935 	}
17936 }
17937 
17938 /*
17939  * Reset a job's end_time based upon its start_time and time_limit.
17940  * NOTE: Do not reset the end_time if already being preempted
17941  */
17942 extern void job_end_time_reset(job_record_t *job_ptr)
17943 {
17944 	if (job_ptr->preempt_time)
17945 		return; /* Preemption in progress */
17946 	if (job_ptr->time_limit == INFINITE) {
17947 		job_ptr->end_time = job_ptr->start_time +
17948 				    (365 * 24 * 60 * 60); /* secs in year */
17949 	} else {
17950 		job_ptr->end_time = job_ptr->start_time +
17951 				    (job_ptr->time_limit * 60);	/* secs */
17952 	}
17953 	job_ptr->end_time_exp = job_ptr->end_time;
17954 }
17955 
17956 /* trace_job() - print the job details if
17957  *               the DEBUG_FLAG_TRACE_JOBS is set
17958  */
17959 extern void trace_job(job_record_t *job_ptr, const char *func,
17960 		      const char *extra)
17961 {
17962 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRACE_JOBS) {
17963 		info("%s: %s %pJ", func, extra, job_ptr);
17964 	}
17965 }
17966 
17967 /* If this is a job array meta-job, prepare it for being scheduled */
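/*
 * Illustrative example (hypothetical values): a meta-job whose
 * task_id_bitmap holds tasks {3,5,9} gets array_task_id set to 3 for this
 * scheduling attempt; job_array_post_sched() then separates the remaining
 * pending tasks from the task being started.
 */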
17968 extern void job_array_pre_sched(job_record_t *job_ptr)
17969 {
17970 	int32_t i;
17971 
17972 	if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
17973 		return;
17974 
17975 	i = bit_ffs(job_ptr->array_recs->task_id_bitmap);
17976 	if (i < 0) {
17977 		/* This happens if the final task in a meta-job is requeued */
17978 		if (job_ptr->restart_cnt == 0) {
17979 			error("%pJ has empty task_id_bitmap", job_ptr);
17980 		}
17981 		FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
17982 		return;
17983 	}
17984 
17985 	job_ptr->array_job_id  = job_ptr->job_id;
17986 	job_ptr->array_task_id = i;
17987 }
17988 
17989 /* If this is a job array meta-job, clean up after scheduling attempt */
17990 extern job_record_t *job_array_post_sched(job_record_t *job_ptr)
17991 {
17992 	job_record_t *new_job_ptr = NULL;
17993 
17994 	if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
17995 		return job_ptr;
17996 
17997 	if (job_ptr->array_recs->task_cnt <= 1) {
17998 		/* Preserve array_recs for min/max exit codes for job array */
17999 		if (job_ptr->array_recs->task_cnt) {
18000 			job_ptr->array_recs->task_cnt--;
18001 		} else if (job_ptr->restart_cnt) {
18002 			/* Last task of a job array has been requeued */
18003 		} else {
18004 			error("job %pJ array_recs task count underflow",
18005 			      job_ptr);
18006 		}
18007 		xfree(job_ptr->array_recs->task_id_str);
18008 		if (job_ptr->array_recs->task_cnt == 0)
18009 			FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
18010 
18011 		/* While it would be efficient to set the db_index to 0 here
18012 		 * to get the database to update the record for
18013 		 * pending tasks, it also creates a window in which, if
18014 		 * the association id is changed (different account or
18015 		 * partition), a new db_index would be created instead of
18016 		 * returning the previous one (as expected), leaving the
18017 		 * other record orphaned.  Setting JOB_UPDATE_DB in the
18018 		 * job_state keeps the db_index from being lost while the
18019 		 * start message is still sent to get the desired behavior. */
18020 		if (job_ptr->db_index)
18021 			job_ptr->job_state |= JOB_UPDATE_DB;
18022 
18023 		/* If job is requeued, it will already be in the hash table */
18024 		if (!find_job_array_rec(job_ptr->array_job_id,
18025 					job_ptr->array_task_id)) {
18026 			_add_job_array_hash(job_ptr);
18027 		}
18028 		new_job_ptr = job_ptr;
18029 	} else {
18030 		new_job_ptr = job_array_split(job_ptr);
18031 		if (new_job_ptr) {
18032 			new_job_ptr->job_state = JOB_PENDING;
18033 			new_job_ptr->start_time = (time_t) 0;
18034 			/* Do NOT set the JOB_UPDATE_DB flag here, it
18035 			 * is handled when task_id_str is created elsewhere */
18036 		} else {
18037 			error("%s: Unable to copy record for %pJ",
18038 			      __func__, job_ptr);
18039 		}
18040 	}
18041 
18042 	return new_job_ptr;
18043 }
18044 
18045 /* _kill_dependent()
18046  *
18047  * Cancel a job that has an invalid (unsatisfiable) dependency
18048  * condition.
18049  */
18050 static void _kill_dependent(job_record_t *job_ptr)
18051 {
18052 	time_t now = time(NULL);
18053 
18054 	info("%s: Job dependency can't be satisfied, cancelling %pJ",
18055 	     __func__, job_ptr);
18056 	job_ptr->job_state = JOB_CANCELLED;
18057 	job_ptr->start_time = now;
18058 	job_ptr->end_time = now;
18059 	job_completion_logger(job_ptr, false);
18060 	last_job_update = now;
18061 	srun_allocate_abort(job_ptr);
18062 }
18063 
18064 static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src)
18065 {
18066 	job_fed_details_t *dst = NULL;
18067 
18068 	if (!src)
18069 		return NULL;
18070 
18071 	dst = xmalloc(sizeof(job_fed_details_t));
18072 	memcpy(dst, src, sizeof(job_fed_details_t));
18073 	dst->origin_str          = xstrdup(src->origin_str);
18074 	dst->siblings_active_str = xstrdup(src->siblings_active_str);
18075 	dst->siblings_viable_str = xstrdup(src->siblings_viable_str);
18076 
18077 	return dst;
18078 }
18079 
18080 extern void free_job_fed_details(job_fed_details_t **fed_details_pptr)
18081 {
18082 	job_fed_details_t *fed_details_ptr = *fed_details_pptr;
18083 
18084 	if (fed_details_ptr) {
18085 		xfree(fed_details_ptr->origin_str);
18086 		xfree(fed_details_ptr->siblings_active_str);
18087 		xfree(fed_details_ptr->siblings_viable_str);
18088 		xfree(fed_details_ptr);
18089 		*fed_details_pptr = NULL;
18090 	}
18091 }
18092 
18093 static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr,
18094 				  Buf buffer)
18095 {
18096 	if (fed_details_ptr) {
18097 		pack16(1, buffer);
18098 		pack32(fed_details_ptr->cluster_lock, buffer);
18099 		packstr(fed_details_ptr->origin_str, buffer);
18100 		pack64(fed_details_ptr->siblings_active, buffer);
18101 		packstr(fed_details_ptr->siblings_active_str, buffer);
18102 		pack64(fed_details_ptr->siblings_viable, buffer);
18103 		packstr(fed_details_ptr->siblings_viable_str, buffer);
18104 	} else {
18105 		pack16(0, buffer);
18106 	}
18107 }
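/*
 * Note: the unpack sequence in _load_job_fed_details() below must mirror
 * the pack sequence used in _dump_job_fed_details() above.
 */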
18108 
18109 static int _load_job_fed_details(job_fed_details_t **fed_details_pptr,
18110 				 Buf buffer,
18111 				 uint16_t protocol_version)
18112 {
18113 	uint16_t tmp_uint16;
18114 	uint32_t tmp_uint32;
18115 	job_fed_details_t *fed_details_ptr = NULL;
18116 
18117 	xassert(fed_details_pptr);
18118 
18119 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
18120 		safe_unpack16(&tmp_uint16, buffer);
18121 		if (tmp_uint16) {
18122 			*fed_details_pptr = xmalloc(sizeof(job_fed_details_t));
18123 			fed_details_ptr = *fed_details_pptr;
18124 			safe_unpack32(&fed_details_ptr->cluster_lock, buffer);
18125 			safe_unpackstr_xmalloc(&fed_details_ptr->origin_str,
18126 					       &tmp_uint32, buffer);
18127 			safe_unpack64(&fed_details_ptr->siblings_active,
18128 				      buffer);
18129 			safe_unpackstr_xmalloc(
18130 					&fed_details_ptr->siblings_active_str,
18131 					&tmp_uint32, buffer);
18132 			safe_unpack64(&fed_details_ptr->siblings_viable,
18133 				      buffer);
18134 			safe_unpackstr_xmalloc(
18135 					&fed_details_ptr->siblings_viable_str,
18136 					&tmp_uint32, buffer);
18137 		}
18138 	} else
18139 		goto unpack_error;
18140 
18141 	return SLURM_SUCCESS;
18142 
18143 unpack_error:
18144 	free_job_fed_details(fed_details_pptr);
18145 	*fed_details_pptr = NULL;
18146 
18147 	return SLURM_ERROR;
18148 }
18149 
18150 /* Set federated job's sibling strings. */
update_job_fed_details(job_record_t * job_ptr)18151 extern void update_job_fed_details(job_record_t *job_ptr)
18152 {
18153 	xassert(job_ptr);
18154 	xassert(job_ptr->fed_details);
18155 
18156 	xfree(job_ptr->fed_details->siblings_active_str);
18157 	xfree(job_ptr->fed_details->siblings_viable_str);
18158 
18159 	job_ptr->fed_details->siblings_active_str =
18160 		fed_mgr_cluster_ids_to_names(
18161 					job_ptr->fed_details->siblings_active);
18162 	job_ptr->fed_details->siblings_viable_str =
18163 		fed_mgr_cluster_ids_to_names(
18164 					job_ptr->fed_details->siblings_viable);
18165 
18166 	/* only set once */
18167 	if (!job_ptr->fed_details->origin_str)
18168 		job_ptr->fed_details->origin_str =
18169 			fed_mgr_get_cluster_name(
18170 				fed_mgr_get_cluster_id(job_ptr->job_id));
18171 }
18172 
/*
 * Set the allocation response with the current cluster's information and the
 * addresses of the job's allocated nodes if the allocation is being filled by
 * a cluster other than the one that submitted the job.
 *
 * Note: make sure that the resp's working_cluster_rec is NULL'ed out before
 * the resp is free'd since it points to global memory.
 *
 * IN resp - allocation response being sent back to the client.
 * IN job_ptr - allocated job
 * IN req_cluster - the cluster requesting the allocation info.
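 *
 * Hedged usage sketch (editor's addition, not part of the original source;
 * slurm_free_resource_allocation_response_msg() is assumed from the public
 * Slurm API):
 *
 *	set_remote_working_response(resp, job_ptr, req_cluster);
 *	... pack and send resp to the requesting client ...
 *	resp->working_cluster_rec = NULL;   (it aliases global memory)
 *	slurm_free_resource_allocation_response_msg(resp);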
 */
extern void set_remote_working_response(
	resource_allocation_response_msg_t *resp,
	job_record_t *job_ptr, const char *req_cluster)
{
	xassert(resp);
	xassert(job_ptr);

	if (job_ptr->node_cnt &&
	    req_cluster && slurmctld_conf.cluster_name &&
	    xstrcmp(slurmctld_conf.cluster_name, req_cluster)) {
		if (job_ptr->fed_details &&
		    fed_mgr_cluster_rec) {
			resp->working_cluster_rec = fed_mgr_cluster_rec;
		} else {
			resp->working_cluster_rec = response_cluster_rec;
		}

		resp->node_addr = xcalloc(job_ptr->node_cnt,
					  sizeof(slurm_addr_t));
		memcpy(resp->node_addr, job_ptr->node_addr,
		       (sizeof(slurm_addr_t) * job_ptr->node_cnt));
	}
}

/* Build structure with job allocation details */
extern resource_allocation_response_msg_t *build_job_info_resp(
	job_record_t *job_ptr)
{
	resource_allocation_response_msg_t *job_info_resp_msg;
	int i, j;

	job_info_resp_msg = xmalloc(sizeof(resource_allocation_response_msg_t));

	if (!job_ptr->job_resrcs) {
		;
	} else if (bit_equal(job_ptr->node_bitmap,
			     job_ptr->job_resrcs->node_bitmap)) {
		job_info_resp_msg->num_cpu_groups =
			job_ptr->job_resrcs->cpu_array_cnt;
		job_info_resp_msg->cpu_count_reps =
			xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
				sizeof(uint32_t));
		memcpy(job_info_resp_msg->cpu_count_reps,
		       job_ptr->job_resrcs->cpu_array_reps,
		       (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt));
		job_info_resp_msg->cpus_per_node  =
			xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
				sizeof(uint16_t));
		memcpy(job_info_resp_msg->cpus_per_node,
		       job_ptr->job_resrcs->cpu_array_value,
		       (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt));
	} else {
		/* Job has changed size, rebuild CPU count info */
		job_info_resp_msg->num_cpu_groups = job_ptr->node_cnt;
		job_info_resp_msg->cpu_count_reps = xcalloc(job_ptr->node_cnt,
							    sizeof(uint32_t));
		job_info_resp_msg->cpus_per_node = xcalloc(job_ptr->node_cnt,
							   sizeof(uint32_t));
		for (i = 0, j = -1; i < job_ptr->job_resrcs->nhosts; i++) {
			if (job_ptr->job_resrcs->cpus[i] == 0)
				continue;
			if ((j == -1) ||
			    (job_info_resp_msg->cpus_per_node[j] !=
			     job_ptr->job_resrcs->cpus[i])) {
				j++;
				job_info_resp_msg->cpus_per_node[j] =
					job_ptr->job_resrcs->cpus[i];
				job_info_resp_msg->cpu_count_reps[j] = 1;
			} else {
				job_info_resp_msg->cpu_count_reps[j]++;
			}
		}
		job_info_resp_msg->num_cpu_groups = j + 1;
	}
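	/*
	 * Editor's illustration (not in the original source):
	 * cpus_per_node/cpu_count_reps form a run-length encoding.  If the
	 * resized job's remaining nodes provide CPU counts {4, 4, 4, 8, 8},
	 * the loop above yields cpus_per_node = {4, 8},
	 * cpu_count_reps = {3, 2} and num_cpu_groups = 2.
	 */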
	job_info_resp_msg->account        = xstrdup(job_ptr->account);
	job_info_resp_msg->alias_list     = xstrdup(job_ptr->alias_list);
	job_info_resp_msg->job_id         = job_ptr->job_id;
	job_info_resp_msg->node_cnt       = job_ptr->node_cnt;
	job_info_resp_msg->node_list      = xstrdup(job_ptr->nodes);
	job_info_resp_msg->partition      = xstrdup(job_ptr->partition);
	if (job_ptr->qos_ptr) {
		slurmdb_qos_rec_t *qos;
		qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr;
		job_info_resp_msg->qos = xstrdup(qos->name);
	}
	job_info_resp_msg->resv_name      = xstrdup(job_ptr->resv_name);
	job_info_resp_msg->select_jobinfo =
		select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
	if (job_ptr->details) {
		if (job_ptr->bit_flags & JOB_MEM_SET) {
			job_info_resp_msg->pn_min_memory =
				job_ptr->details->pn_min_memory;
		}
		if (job_ptr->details->mc_ptr) {
			job_info_resp_msg->ntasks_per_board =
				job_ptr->details->mc_ptr->ntasks_per_board;
			job_info_resp_msg->ntasks_per_core =
				job_ptr->details->mc_ptr->ntasks_per_core;
			job_info_resp_msg->ntasks_per_socket =
				job_ptr->details->mc_ptr->ntasks_per_socket;
		}
	} else {
		/* job_info_resp_msg->pn_min_memory     = 0; */
		job_info_resp_msg->ntasks_per_board  = NO_VAL16;
		job_info_resp_msg->ntasks_per_core   = NO_VAL16;
		job_info_resp_msg->ntasks_per_socket = NO_VAL16;
	}

	if (job_ptr->details && job_ptr->details->env_cnt) {
		job_info_resp_msg->env_size = job_ptr->details->env_cnt;
		job_info_resp_msg->environment =
			xcalloc(job_info_resp_msg->env_size + 1,
				sizeof(char *));
		for (i = 0; i < job_info_resp_msg->env_size; i++) {
			job_info_resp_msg->environment[i] =
				xstrdup(job_ptr->details->env_sup[i]);
		}
		job_info_resp_msg->environment[i] = NULL;
	}

	return job_info_resp_msg;
}

/*
 * Calculate billable TRES based on the partition's defined BillingWeights. If
 * none are defined, return total_cpus. The result is cached on
 * job_ptr->billable_tres and is updated if the job was resized since the last
 * iteration.
 *
 * IN job_ptr          - job to calculate billable TRES for
 * IN start_time       - time the job started or was last resized
 * IN assoc_mgr_locked - whether the assoc_mgr TRES lock is already held
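 *
 * Example (editor's illustration, not from the original source): with
 * partition BillingWeights="CPU=1.0,Mem=0.25G" and an allocation of 16 CPUs
 * and 64 GB of memory, the default SUM(TRES) policy gives
 * 16 * 1.0 + 64 * 0.25 = 32.0 billable TRES.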
 */
extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time,
				     bool assoc_mgr_locked)
{
	xassert(job_ptr);

	part_record_t *part_ptr = job_ptr->part_ptr;

	/* We don't have any resources allocated, just return 0. */
	if (!job_ptr->tres_alloc_cnt)
		return 0;

	/* Don't recalculate unless the job is new or resized */
	if ((!fuzzy_equal(job_ptr->billable_tres, NO_VAL)) &&
	    difftime(job_ptr->resize_time, start_time) < 0.0)
		return job_ptr->billable_tres;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
		info("BillingWeight: %pJ is either new or it was resized",
		     job_ptr);

	/* No billing weights defined. Return CPU count */
	if (!part_ptr || !part_ptr->billing_weights) {
		job_ptr->billable_tres = job_ptr->total_cpus;
		return job_ptr->billable_tres;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
		info("BillingWeight: %pJ using \"%s\" from partition %s",
		     job_ptr, part_ptr->billing_weights_str,
		     job_ptr->part_ptr->name);

	job_ptr->billable_tres =
		assoc_mgr_tres_weighted(job_ptr->tres_alloc_cnt,
					part_ptr->billing_weights,
					slurmctld_conf.priority_flags,
					assoc_mgr_locked);

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
		info("BillingWeight: %pJ %s = %f",
		     job_ptr,
		     (slurmctld_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES) ?
		     "MAX(node TRES) + SUM(Global TRES)" : "SUM(TRES)",
		     job_ptr->billable_tres);

	return job_ptr->billable_tres;
}

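/*
 * Editor's summary of the function below (not in the original source):
 * resize a per-job limit_set TRES array to the current slurmctld_tres_cnt
 * and, if the assoc_mgr reports that TRES positions have changed, remap the
 * existing values from their old positions to the new ones (TRES with no
 * old position default to 0).
 */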
extern void update_job_limit_set_tres(uint16_t **limits_pptr)
{
	int i, old_pos;
	int new_size = sizeof(uint16_t) * slurmctld_tres_cnt;

	xassert(limits_pptr);

	*limits_pptr = xrealloc(*limits_pptr, new_size);

	if (assoc_mgr_tres_pos_changed()) {
		uint16_t *limits_ptr, tmp_tres[slurmctld_tres_cnt];
		limits_ptr = *limits_pptr;

		for (i = 0; i < slurmctld_tres_cnt; i++) {
			if ((old_pos = assoc_mgr_get_old_tres_pos(i)) == -1)
				tmp_tres[i] = 0;
			else
				tmp_tres[i] = limits_ptr[old_pos];
		}
		memcpy(limits_ptr, tmp_tres, new_size);
	}
}

/*
 * Send warning signal to job before end time.
 *
 * IN job_ptr - job to send warn signal to.
 * IN ignore_time - If set, ignore the warn time and just send it.
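 *
 * Example (editor's illustration, not from the original source): for a job
 * submitted with "sbatch --signal=USR1@300", warn_signal is SIGUSR1 and
 * warn_time is 300, so SIGUSR1 is delivered to the job's steps roughly 300
 * seconds (plus up to PERIODIC_TIMEOUT of scheduling slack) before end_time.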
 */
extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time)
{
	if (job_ptr->warn_signal &&
	    !(job_ptr->warn_flags & WARN_SENT) &&
	    (ignore_time ||
	     (job_ptr->warn_time &&
	      ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >=
	        job_ptr->end_time)))) {
		/*
		 * If --signal B option was not specified,
		 * signal only the steps but not the batch step.
		 */
		if (!(job_ptr->warn_flags & KILL_JOB_BATCH))
			job_ptr->warn_flags |= KILL_STEPS_ONLY;

		debug("%s: warning signal %u to %pJ",
		      __func__, job_ptr->warn_signal, job_ptr);

		job_signal(job_ptr, job_ptr->warn_signal,
			   job_ptr->warn_flags, 0, false);

		/* mark job as signaled */
		job_ptr->warn_flags |= WARN_SENT;
	}
}

static int _overlap_and_running_internal(void *x, void *arg)
{
	job_record_t *job_ptr = (job_record_t *)x;
	job_overlap_args_t *overlap_args = (job_overlap_args_t *)arg;

	/* We always break if we find something not running */
	if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
		overlap_args->rc = 0;
		return 1;
	}

	/*
	 * We are just looking for something overlapping.  On a hetjob we need
	 * to check everything.
	 */
	if (job_ptr->node_bitmap &&
	    bit_overlap_any(overlap_args->node_map, job_ptr->node_bitmap))
		overlap_args->rc = 1;

	return 0;
}

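/*
 * Editor's summary of the function below (not in the original source):
 * return true when job_ptr, or any component of its hetjob, is running or
 * suspended on nodes that overlap node_map; the scan stops and returns
 * false as soon as a component that is neither running nor suspended is
 * found.
 */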
extern bool job_overlap_and_running(bitstr_t *node_map, job_record_t *job_ptr)
{
	job_overlap_args_t overlap_args = {
		.node_map = node_map
	};

	if (!job_ptr->het_job_list)
		(void)_overlap_and_running_internal(job_ptr, &overlap_args);
	else
		(void)list_for_each(job_ptr->het_job_list,
				    _overlap_and_running_internal,
				    &overlap_args);

	return overlap_args.rc;
}