1 /*****************************************************************************\
2 * job_mgr.c - manage the job information of slurm
3 * Note: there is a global job list (job_list), time stamp
4 * (last_job_update), and hash table (job_hash)
5 *****************************************************************************
6 * Copyright (C) 2002-2007 The Regents of the University of California.
7 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
8 * Portions Copyright (C) 2010-2017 SchedMD <https://www.schedmd.com>.
9 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
10 * Written by Morris Jette <jette1@llnl.gov>
11 * CODE-OCEC-09-009. All rights reserved.
12 *
13 * This file is part of Slurm, a resource management program.
14 * For details, see <https://slurm.schedmd.com/>.
15 * Please also read the included file: DISCLAIMER.
16 *
17 * Slurm is free software; you can redistribute it and/or modify it under
18 * the terms of the GNU General Public License as published by the Free
19 * Software Foundation; either version 2 of the License, or (at your option)
20 * any later version.
21 *
22 * In addition, as a special exception, the copyright holders give permission
23 * to link the code of portions of this program with the OpenSSL library under
24 * certain conditions as described in each individual source file, and
25 * distribute linked combinations including the two. You must obey the GNU
26 * General Public License in all respects for all of the code used other than
27 * OpenSSL. If you modify file(s) with this exception, you may extend this
28 * exception to your version of the file(s), but you are not obligated to do
29 * so. If you do not wish to do so, delete this exception statement from your
30 * version. If you delete this exception statement from all source files in
31 * the program, then also delete it here.
32 *
33 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
34 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
35 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
36 * details.
37 *
38 * You should have received a copy of the GNU General Public License along
39 * with Slurm; if not, write to the Free Software Foundation, Inc.,
40 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
41 \*****************************************************************************/
42
43 #include "config.h"
44 #define _GNU_SOURCE
45
46 #include <ctype.h>
47 #include <dirent.h>
48 #include <errno.h>
49 #include <fcntl.h>
50 #include <libgen.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <sys/stat.h>
56 #include <sys/types.h>
57 #include <sys/param.h>
58 #include <unistd.h>
59
60 #include "slurm/slurm_errno.h"
61
62 #include "src/common/slurm_acct_gather.h"
63 #include "src/common/assoc_mgr.h"
64 #include "src/common/bitstring.h"
65 #include "src/common/cpu_frequency.h"
66 #include "src/common/fd.h"
67 #include "src/common/forward.h"
68 #include "src/common/gres.h"
69 #include "src/common/hostlist.h"
70 #include "src/common/node_features.h"
71 #include "src/common/node_select.h"
72 #include "src/common/parse_time.h"
73 #include "src/common/power.h"
74 #include "src/common/slurm_accounting_storage.h"
75 #include "src/common/slurm_auth.h"
76 #include "src/common/slurm_jobcomp.h"
77 #include "src/common/slurm_mcs.h"
78 #include "src/common/slurm_priority.h"
79 #include "src/common/slurm_protocol_pack.h"
80 #include "src/common/switch.h"
81 #include "src/common/timers.h"
82 #include "src/common/track_script.h"
83 #include "src/common/tres_bind.h"
84 #include "src/common/tres_frequency.h"
85 #include "src/common/uid.h"
86 #include "src/common/xassert.h"
87 #include "src/common/xstring.h"
88
89 #include "src/slurmctld/acct_policy.h"
90 #include "src/slurmctld/agent.h"
91 #include "src/slurmctld/burst_buffer.h"
92 #include "src/slurmctld/fed_mgr.h"
93 #include "src/slurmctld/front_end.h"
94 #include "src/slurmctld/gang.h"
95 #include "src/slurmctld/job_scheduler.h"
96 #include "src/slurmctld/job_submit.h"
97 #include "src/slurmctld/licenses.h"
98 #include "src/slurmctld/locks.h"
99 #include "src/slurmctld/node_scheduler.h"
100 #include "src/slurmctld/preempt.h"
101 #include "src/slurmctld/proc_req.h"
102 #include "src/slurmctld/reservation.h"
103 #include "src/slurmctld/sched_plugin.h"
104 #include "src/slurmctld/slurmctld.h"
105 #include "src/slurmctld/slurmctld_plugstack.h"
106 #include "src/slurmctld/srun_comm.h"
107 #include "src/slurmctld/state_save.h"
108 #include "src/slurmctld/trigger_mgr.h"
109
110 #define ARRAY_ID_BUF_SIZE 32
111 #define DETAILS_FLAG 0xdddd
112 #define MAX_EXIT_VAL 255 /* Maximum value returned by WIFEXITED() */
113 #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
114 #define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */
115 #define PURGE_OLD_JOB_IN_SEC 2592000 /* 30 days in seconds */
116
117 #define JOB_HASH_INX(_job_id) (_job_id % hash_table_size)
118 #define JOB_ARRAY_HASH_INX(_job_id, _task_id) \
119 ((_job_id + _task_id) % hash_table_size)
120
121 /* No need to change we always pack SLURM_PROTOCOL_VERSION */
122 #define JOB_STATE_VERSION "PROTOCOL_VERSION"
123 #define JOB_CKPT_VERSION "PROTOCOL_VERSION"
124
125 typedef enum {
126 JOB_HASH_JOB,
127 JOB_HASH_ARRAY_JOB,
128 JOB_HASH_ARRAY_TASK,
129 } job_hash_type_t;
130
131 typedef struct {
132 int resp_array_cnt;
133 int resp_array_size;
134 uint32_t *resp_array_rc;
135 bitstr_t **resp_array_task_id;
136 } resp_array_struct_t;
137
138 typedef struct {
139 Buf buffer;
140 uint32_t filter_uid;
141 uint32_t *jobs_packed;
142 uint16_t protocol_version;
143 uint16_t show_flags;
144 uid_t uid;
145 } _foreach_pack_job_info_t;
146
147 typedef struct {
148 bitstr_t *node_map;
149 int rc;
150 } job_overlap_args_t;
151
152 /* Global variables */
153 List job_list = NULL; /* job_record list */
154 time_t last_job_update; /* time of last update to job records */
155
156 List purge_files_list = NULL; /* job files to delete */
157
158 /* Local variables */
159 static int bf_min_age_reserve = 0;
160 static uint32_t delay_boot = 0;
161 static uint32_t highest_prio = 0;
162 static uint32_t lowest_prio = TOP_PRIORITY;
163 static int hash_table_size = 0;
164 static int job_count = 0; /* job's in the system */
165 static uint32_t job_id_sequence = 0; /* first job_id to assign new job */
166 static struct job_record **job_hash = NULL;
167 static struct job_record **job_array_hash_j = NULL;
168 static struct job_record **job_array_hash_t = NULL;
169 static bool kill_invalid_dep;
170 static time_t last_file_write_time = (time_t) 0;
171 static uint32_t max_array_size = NO_VAL;
172 static bitstr_t *requeue_exit = NULL;
173 static bitstr_t *requeue_exit_hold = NULL;
174 static bool validate_cfgd_licenses = true;
175
176 /* Local functions */
177 static void _add_job_hash(job_record_t *job_ptr);
178 static void _add_job_array_hash(job_record_t *job_ptr);
179 static void _clear_job_gres_details(job_record_t *job_ptr);
180 static int _copy_job_desc_to_file(job_desc_msg_t * job_desc,
181 uint32_t job_id);
182 static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
183 job_record_t **job_ptr,
184 bitstr_t ** exc_bitmap,
185 bitstr_t ** req_bitmap);
186 static char *_copy_nodelist_no_dup(char *node_list);
187 static job_record_t *_create_job_record(uint32_t num_jobs);
188 static void _delete_job_details(job_record_t *job_entry);
189 static slurmdb_qos_rec_t *_determine_and_validate_qos(
190 char *resv_name, slurmdb_assoc_rec_t *assoc_ptr,
191 bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code,
192 bool locked, log_level_t log_lvl);
193 static void _dump_job_details(struct job_details *detail_ptr, Buf buffer);
194 static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer);
195 static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr,
196 Buf buffer);
197 static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src);
198 static void _get_batch_job_dir_ids(List batch_dirs);
199 static bool _get_whole_hetjob(void);
200 static void _job_array_comp(job_record_t *job_ptr, bool was_running,
201 bool requeue);
202 static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run,
203 job_record_t **job_rec_ptr, uid_t submit_uid,
204 char **err_msg, uint16_t protocol_version);
205 static void _job_timed_out(job_record_t *job_ptr, bool preempted);
206 static void _kill_dependent(job_record_t *job_ptr);
207 static void _list_delete_job(void *job_entry);
208 static int _list_find_job_old(void *job_entry, void *key);
209 static int _load_job_details(job_record_t *job_ptr, Buf buffer,
210 uint16_t protocol_version);
211 static int _load_job_fed_details(job_fed_details_t **fed_details_pptr,
212 Buf buffer, uint16_t protocol_version);
213 static int _load_job_state(Buf buffer, uint16_t protocol_version);
214 static bitstr_t *_make_requeue_array(char *conf_buf);
215 static uint32_t _max_switch_wait(uint32_t input_wait);
216 static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx,
217 time_t now, time_t node_boot_time);
218 static Buf _open_job_state_file(char **state_file);
219 static time_t _get_last_job_state_write_time(void);
220 static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer,
221 uint16_t protocol_version);
222 static void _pack_pending_job_details(struct job_details *detail_ptr,
223 Buf buffer,
224 uint16_t protocol_version);
225 static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max);
226 static void _purge_missing_jobs(int node_inx, time_t now);
227 static int _read_data_array_from_file(int fd, char *file_name, char ***data,
228 uint32_t *size, job_record_t *job_ptr);
229 static void _remove_defunct_batch_dirs(List batch_dirs);
230 static void _remove_job_hash(job_record_t *job_ptr, job_hash_type_t type);
231 static int _reset_detail_bitmaps(job_record_t *job_ptr);
232 static void _reset_step_bitmaps(job_record_t *job_ptr);
233 static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
234 uint32_t rc);
235 static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
236 uint32_t task_id, uint32_t rc);
237 static void _resp_array_free(resp_array_struct_t *resp);
238 static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
239 uint32_t job_id);
240 static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp);
241 static void _send_job_kill(job_record_t *job_ptr);
242 static int _set_job_id(job_record_t *job_ptr);
243 static void _set_job_requeue_exit_value(job_record_t *job_ptr);
244 static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
245 uint16_t flags);
246 static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags);
247 static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp);
248 static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp);
249 static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset);
250 static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
251 bitstr_t *req_bitmap, part_record_t *part_ptr,
252 List part_ptr_list,
253 slurmdb_assoc_rec_t *assoc_ptr,
254 slurmdb_qos_rec_t *qos_ptr);
255 static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
256 uid_t submit_uid, part_record_t *part_ptr,
257 List part_list);
258 static void _validate_job_files(List batch_dirs);
259 static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
260 part_record_t *part_ptr,
261 List part_list);
262 static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg,
263 part_record_t *part_ptr);
264 static int _write_data_to_file(char *file_name, char *data);
265 static int _write_data_array_to_file(char *file_name, char **data,
266 uint32_t size);
267 static void _xmit_new_end_time(job_record_t *job_ptr);
268
269
/*
 * _get_mail_user - build the mail address for job mail notifications.
 * IN user_name - explicit MailUser string from the job, may be NULL/empty
 * IN user_id - uid to fall back on when no explicit name was given
 * RET xmalloc'd address string; caller must xfree()
 */
static char *_get_mail_user(const char *user_name, uid_t user_id)
{
	char *addr;

	/* An explicitly requested mail user is used verbatim */
	if (user_name && user_name[0])
		return xstrdup(user_name);

	/* Derive the address from the submitting uid; qualify the
	 * unqualified sender with MailDomain when one is configured */
	addr = uid_to_string(user_id);
	if (slurmctld_conf.mail_domain)
		xstrfmtcat(addr, "@%s", slurmctld_conf.mail_domain);

	return addr;
}
286
/*
 * _job_fail_account - handle a job whose association became invalid:
 *	hold the job if still pending and remove its usage from the
 *	(now invalid) association.
 * IN job_ptr - job with an invalid association
 * IN func_name - name of the calling function, used in the log message
 * RET 1 if a pending job was held, otherwise 0
 */
static int _job_fail_account(job_record_t *job_ptr, const char *func_name)
{
	int rc = 0; // Return number of pending jobs held

	if (IS_JOB_PENDING(job_ptr)) {
		info("%s: %pJ ineligible due to invalid association",
		     func_name, job_ptr);

		xfree(job_ptr->state_desc);
		job_ptr->state_reason = FAIL_ACCOUNT;

		if (job_ptr->details) {
			/* reset the job: clear accrued age/priority state */
			job_ptr->details->accrue_time = 0;
			job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
			job_ptr->details->begin_time = 0;
			/* Update job with new begin_time. */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
		rc = 1;
	}

	/* This job is no longer eligible, so make it so. */
	if (job_ptr->assoc_ptr) {
		part_record_t *tmp_part = job_ptr->part_ptr;
		List tmp_part_list = job_ptr->part_ptr_list;
		slurmdb_qos_rec_t *tmp_qos = job_ptr->qos_ptr;

		/*
		 * Force a start so the association doesn't get lost. Since
		 * there could be some delay in the start of the job when
		 * running with the slurmdbd.
		 */
		if (!job_ptr->db_index)
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);

		/*
		 * Don't call acct_policy_remove_accrue_time() here, the cnt on
		 * parent associations will be handled correctly by the removal
		 * of the association.
		 */

		/*
		 * Clear ptrs so that only association usage is removed.
		 * Otherwise qos and partition limits will be double accounted
		 * for when this job finishes. Don't do this for accrual time,
		 * it has to be on both because the job is ineligible and can't
		 * accrue time.
		 */
		job_ptr->part_ptr = NULL;
		job_ptr->part_ptr_list = NULL;
		job_ptr->qos_ptr = NULL;

		acct_policy_remove_job_submit(job_ptr);

		/* Restore the temporarily cleared pointers */
		job_ptr->part_ptr = tmp_part;
		job_ptr->part_ptr_list = tmp_part_list;
		job_ptr->qos_ptr = tmp_qos;

		job_ptr->assoc_ptr = NULL;
	}

	job_ptr->assoc_id = 0;

	return rc;
}
353
/*
 * job_fail_qos - handle a job whose QOS became invalid: hold the job if
 *	still pending and remove its usage from the (now invalid) QOS.
 * IN job_ptr - job with an invalid QOS
 * IN func_name - name of the calling function, used in the log message
 * RET 1 if a pending job was held, otherwise 0
 */
extern int job_fail_qos(job_record_t *job_ptr, const char *func_name)
{
	int held_cnt = 0;	/* number of pending jobs held (0 or 1) */

	if (IS_JOB_PENDING(job_ptr)) {
		info("%s: %pJ ineligible due to invalid qos",
		     func_name, job_ptr);

		xfree(job_ptr->state_desc);
		job_ptr->state_reason = FAIL_QOS;

		if (job_ptr->details) {
			/* Reset the job's accrued age/priority state */
			job_ptr->details->accrue_time = 0;
			job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
			job_ptr->details->begin_time = 0;
			/* Push the new begin_time to accounting storage */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
		held_cnt = 1;
	}

	/* The job is no longer eligible; remove its QOS usage */
	if (job_ptr->qos_ptr) {
		slurmdb_assoc_rec_t *assoc_save = job_ptr->assoc_ptr;

		/*
		 * Force a start so the qos doesn't get lost. Since
		 * there could be some delay in the start of the job when
		 * running with the slurmdbd.
		 */
		if (!job_ptr->db_index)
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);

		/*
		 * Don't call acct_policy_remove_accrue_time() here, the cnt on
		 * parent associations will be handled correctly by the removal
		 * of the association.
		 */

		/*
		 * Temporarily clear the association pointer so that only QOS
		 * usage is removed. Otherwise association limits would be
		 * double accounted for when this job finishes. Accrual time
		 * must stay on both because the job is ineligible and can't
		 * accrue time.
		 */
		job_ptr->assoc_ptr = NULL;

		acct_policy_remove_job_submit(job_ptr);

		job_ptr->assoc_ptr = assoc_save;

		job_ptr->qos_ptr = NULL;
	}

	return held_cnt;
}
411
412 /*
413 * Functions used to manage job array responses with a separate return code
414 * possible for each task ID
415 */
416 /* Add job record to resp_array_struct_t, free with _resp_array_free() */
/*
 * _resp_array_add - record a per-task return code for a job array
 *	operation, grouping task IDs that share the same return code.
 * IN/OUT resp - response structure; allocated here on first use,
 *	free with _resp_array_free()
 * IN job_ptr - job array task (array_task_id set) or job array meta
 *	record (array_recs set)
 * IN rc - return code to associate with this job's task ID(s)
 */
static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
			    uint32_t rc)
{
	resp_array_struct_t *loc_resp;
	int array_size;
	int i;

	/* Reject records that are not part of a job array */
	if ((job_ptr->array_task_id == NO_VAL) &&
	    (job_ptr->array_recs == NULL)) {
		error("%s: called for non-job array %pJ",
		      __func__, job_ptr);
		return;
	}

	/* Lazily cache MaxArraySize from the configuration */
	if (max_array_size == NO_VAL) {
		max_array_size = slurmctld_conf.max_array_sz;
	}

	xassert(resp);
	if (*resp == NULL) {
		/* Initialize the data structure */
		loc_resp = xmalloc(sizeof(resp_array_struct_t));
		loc_resp->resp_array_cnt = 0;
		loc_resp->resp_array_size = 10;
		xrealloc(loc_resp->resp_array_rc,
			 (sizeof(uint32_t) * loc_resp->resp_array_size));
		xrealloc(loc_resp->resp_array_task_id,
			 (sizeof(bitstr_t *) * loc_resp->resp_array_size));
		*resp = loc_resp;
	} else {
		loc_resp = *resp;
	}

	/* Look for an existing record with the same return code */
	for (i = 0; i < loc_resp->resp_array_cnt; i++) {
		if (loc_resp->resp_array_rc[i] != rc)
			continue;
		/* Add to existing error code record */
		if (job_ptr->array_task_id != NO_VAL) {
			/* Single task: set its bit if within bitmap bounds */
			if (job_ptr->array_task_id <
			    bit_size(loc_resp->resp_array_task_id[i])) {
				bit_set(loc_resp->resp_array_task_id[i],
					job_ptr->array_task_id);
			} else {
				error("%s: found invalid task id %pJ",
				      __func__, job_ptr);
			}
		} else if (job_ptr->array_recs &&
			   job_ptr->array_recs->task_id_bitmap) {
			/* Meta record: grow our bitmap to match, then OR in
			 * all of this record's task IDs */
			array_size = bit_size(job_ptr->array_recs->
					      task_id_bitmap);
			if (bit_size(loc_resp->resp_array_task_id[i]) !=
			    array_size) {
				loc_resp->resp_array_task_id[i] = bit_realloc(
					loc_resp->resp_array_task_id[i],
					array_size);
			}
			bit_or(loc_resp->resp_array_task_id[i],
			       job_ptr->array_recs->task_id_bitmap);
		} else {
			error("%s: found job %pJ without task ID or bitmap",
			      __func__, job_ptr);
		}
		return;
	}

	/* Need to add a new record for this error code */
	if (loc_resp->resp_array_cnt >= loc_resp->resp_array_size) {
		/* Need to grow the table size */
		loc_resp->resp_array_size += 10;
		xrealloc(loc_resp->resp_array_rc,
			 (sizeof(uint32_t) * loc_resp->resp_array_size));
		xrealloc(loc_resp->resp_array_task_id,
			 (sizeof(bitstr_t *) * loc_resp->resp_array_size));
	}

	loc_resp->resp_array_rc[loc_resp->resp_array_cnt] = rc;
	if (job_ptr->array_task_id != NO_VAL) {
		/* Single task: new bitmap sized to MaxArraySize */
		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
			bit_alloc(max_array_size);
		if (job_ptr->array_task_id <
		    bit_size(loc_resp->resp_array_task_id
			     [loc_resp->resp_array_cnt])) {
			bit_set(loc_resp->resp_array_task_id
				[loc_resp->resp_array_cnt],
				job_ptr->array_task_id);
		}
	} else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
		/* Meta record: copy its task bitmap wholesale */
		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
			bit_copy(job_ptr->array_recs->task_id_bitmap);
	} else {
		/* Should be unreachable given the check at entry */
		error("%s: found %pJ without task ID or bitmap",
		      __func__, job_ptr);
		loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
			bit_alloc(max_array_size);
	}
	loc_resp->resp_array_cnt++;
	return;
}
515 /* Add record to resp_array_struct_t, free with _resp_array_free().
516 * This is a variant of _resp_array_add for the case where a job/task ID
517 * is not found, so we use a dummy job record based upon the input IDs. */
/* Add record to resp_array_struct_t, free with _resp_array_free().
 * This is a variant of _resp_array_add for the case where a job/task ID
 * is not found, so we use a dummy job record based upon the input IDs.
 *
 * The dummy record is zero-initialized via designated initializers:
 * previously only four fields were assigned, so error paths inside
 * _resp_array_add() that log the record with "%pJ" could read
 * indeterminate stack memory from the remaining fields. */
static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
			       uint32_t task_id, uint32_t rc)
{
	job_record_t job_ptr = {
		.job_id = job_id,
		.array_job_id = job_id,
		.array_task_id = task_id,
		.array_recs = NULL,
	};

	_resp_array_add(resp, &job_ptr, rc);
}
529
530 /* Free resp_array_struct_t built by _resp_array_add() */
/*
 * _resp_array_free - release a resp_array_struct_t built by
 *	_resp_array_add(), including every per-record task bitmap.
 * IN resp - structure to free; NULL is a no-op
 */
static void _resp_array_free(resp_array_struct_t *resp)
{
	int i;

	if (!resp)
		return;

	for (i = 0; i < resp->resp_array_cnt; i++)
		FREE_NULL_BITMAP(resp->resp_array_task_id[i]);
	xfree(resp->resp_array_task_id);
	xfree(resp->resp_array_rc);
	xfree(resp);
}
543
544 /* Translate internal job array data structure into a response message */
_resp_array_xlate(resp_array_struct_t * resp,uint32_t job_id)545 static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
546 uint32_t job_id)
547 {
548 job_array_resp_msg_t *msg;
549 char task_str[ARRAY_ID_BUF_SIZE];
550 int *ffs = NULL;
551 int i, j, low;
552
553 ffs = xcalloc(resp->resp_array_cnt, sizeof(int));
554 for (i = 0; i < resp->resp_array_cnt; i++) {
555 ffs[i] = bit_ffs(resp->resp_array_task_id[i]);
556 }
557
558 msg = xmalloc(sizeof(job_array_resp_msg_t));
559 msg->job_array_count = resp->resp_array_cnt;
560 msg->job_array_id = xcalloc(resp->resp_array_cnt, sizeof(char *));
561 msg->error_code = xcalloc(resp->resp_array_cnt, sizeof(uint32_t));
562 for (i = 0; i < resp->resp_array_cnt; i++) {
563 low = -1;
564 for (j = 0; j < resp->resp_array_cnt; j++) {
565 if ((ffs[j] != -1) &&
566 ((low == -1) || (ffs[j] < ffs[low])))
567 low = j;
568 }
569 if (low == -1)
570 break;
571 ffs[low] = -1;
572
573 msg->error_code[i] = resp->resp_array_rc[low];
574 bit_fmt(task_str, ARRAY_ID_BUF_SIZE,
575 resp->resp_array_task_id[low]);
576 if (strlen(task_str) >= ARRAY_ID_BUF_SIZE - 2) {
577 /* Append "..." to the buffer on overflow */
578 task_str[ARRAY_ID_BUF_SIZE - 4] = '.';
579 task_str[ARRAY_ID_BUF_SIZE - 3] = '.';
580 task_str[ARRAY_ID_BUF_SIZE - 2] = '.';
581 task_str[ARRAY_ID_BUF_SIZE - 1] = '\0';
582 }
583 xstrfmtcat(msg->job_array_id[i], "%u_%s", job_id, task_str);
584 }
585
586 xfree(ffs);
587 return msg;
588 }
589
590 /*
591 * _create_job_record - create an empty job_record including job_details.
592 * load its values with defaults (zeros, nulls, and magic cookie)
593 * IN num_jobs - number of jobs this record should represent
594 * = 0 - split out a job array record to its own job record
595 * = 1 - simple job OR job array with one task
596 * > 1 - job array create with the task count as num_jobs
597 * RET pointer to the record or NULL if error
598 * NOTE: allocates memory that should be xfreed with _list_delete_job
599 */
static job_record_t *_create_job_record(uint32_t num_jobs)
{
	job_record_t *job_ptr = xmalloc(sizeof(*job_ptr));
	struct job_details *detail_ptr = xmalloc(sizeof(*detail_ptr));

	/* Log (but do not reject) creation beyond the MaxJobCount limit */
	if ((job_count + num_jobs) >= slurmctld_conf.max_job_cnt) {
		error("%s: MaxJobCount limit from slurm.conf reached (%u)",
		      __func__, slurmctld_conf.max_job_cnt);
	}

	job_count += num_jobs;
	last_job_update = time(NULL);

	job_ptr->magic = JOB_MAGIC;
	job_ptr->array_task_id = NO_VAL;	/* not yet a job array task */
	job_ptr->details = detail_ptr;
	job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t));
	job_ptr->site_factor = NICE_OFFSET;
	job_ptr->step_list = list_create(NULL);

	/* NOTE: the assignment (not comparison) inside xassert is
	 * deliberate: the magic is only set when asserts are compiled in,
	 * matching the xassert check in _delete_job_details() which is
	 * compiled out under the same conditions */
	xassert (detail_ptr->magic = DETAILS_MAGIC); /* set value */
	detail_ptr->submit_time = time(NULL);
	job_ptr->requid = -1; /* force to -1 for sacct to know this
			       * hasn't been set yet */
	job_ptr->billable_tres = (double)NO_VAL;
	(void) list_append(job_list, job_ptr);

	return job_ptr;
}
629
630
631 /*
632 * _delete_job_details - delete a job's detail record and clear it's pointer
633 * IN job_entry - pointer to job_record to clear the record of
634 */
/*
 * _delete_job_details - free a job's detail record and all data hanging
 *	off of it; queues finished jobs' on-disk script/environment files
 *	for deletion by the purge thread.
 * IN job_entry - job whose details are released; no-op if details == NULL
 */
static void _delete_job_details(job_record_t *job_entry)
{
	int i;

	if (job_entry->details == NULL)
		return;

	xassert (job_entry->details->magic == DETAILS_MAGIC);

	/*
	 * Queue up job to have the batch script and environment deleted.
	 * This is handled by a separate thread to limit the amount of
	 * time purge_old_job needs to spend holding locks.
	 */
	if (IS_JOB_FINISHED(job_entry)) {
		uint32_t *job_id = xmalloc(sizeof(uint32_t));
		*job_id = job_entry->job_id;
		list_enqueue(purge_files_list, job_id);
	}

	/* Free each member of the details structure */
	xfree(job_entry->details->acctg_freq);
	for (i=0; i<job_entry->details->argc; i++)
		xfree(job_entry->details->argv[i]);
	xfree(job_entry->details->argv);
	xfree(job_entry->details->cpu_bind);
	FREE_NULL_LIST(job_entry->details->depend_list);
	xfree(job_entry->details->dependency);
	xfree(job_entry->details->orig_dependency);
	for (i=0; i<job_entry->details->env_cnt; i++)
		xfree(job_entry->details->env_sup[i]);
	xfree(job_entry->details->env_sup);
	xfree(job_entry->details->std_err);
	FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap);
	xfree(job_entry->details->exc_nodes);
	xfree(job_entry->details->extra);
	FREE_NULL_LIST(job_entry->details->feature_list);
	xfree(job_entry->details->features);
	xfree(job_entry->details->cluster_features);
	xfree(job_entry->details->std_in);
	xfree(job_entry->details->mc_ptr);
	xfree(job_entry->details->mem_bind);
	xfree(job_entry->details->std_out);
	FREE_NULL_BITMAP(job_entry->details->req_node_bitmap);
	xfree(job_entry->details->req_nodes);
	xfree(job_entry->details->work_dir);
	xfree(job_entry->details->x11_magic_cookie);
	xfree(job_entry->details->x11_target);
	xfree(job_entry->details); /* Must be last */
}
684
685 /*
686 * delete_job_desc_files - delete job descriptor related files
687 *
688 * Note that this will be called on all individual job array tasks,
689 * even though (as of 17.11) individual directories are no longer created.
690 */
/*
 * delete_job_desc_files - delete job descriptor related files
 *
 * Removes every file in the job's state save directory
 * (<StateSaveLocation>/hash.<job_id%10>/job.<job_id>) and then the
 * directory itself. Silently returns if the directory does not exist.
 *
 * Note that this will be called on all individual job array tasks,
 * even though (as of 17.11) individual directories are no longer created.
 */
extern void delete_job_desc_files(uint32_t job_id)
{
	char *dir_name, *file_name = NULL;
	struct stat sbuf;
	DIR *dirp;
	struct dirent *entry;

	dir_name = xstrdup_printf("%s/hash.%d/job.%u",
				  slurmctld_conf.state_save_location,
				  (int)(job_id % 10), job_id);

	/* Nothing to clean up if the directory is already gone */
	if (stat(dir_name, &sbuf) != 0) {
		xfree(dir_name);
		return;
	}

	if ((dirp = opendir(dir_name))) {
		while ((entry = readdir(dirp))) {
			if (!xstrcmp(entry->d_name, ".") ||
			    !xstrcmp(entry->d_name, ".."))
				continue;
			xstrfmtcat(file_name, "%s/%s", dir_name,
				   entry->d_name);
			(void) unlink(file_name);
			xfree(file_name);
		}
		closedir(dirp);
	} else {
		error("opendir(%s): %m", dir_name);
	}

	/* Attempt the rmdir even if opendir failed, as before */
	(void) rmdir(dir_name);
	xfree(dir_name);
}
726
/*
 * _max_switch_wait - cap a requested switch wait time at the configured
 *	SchedulerParameters max_switch_wait value.
 * IN input_wait - requested wait time, in seconds
 * RET the smaller of input_wait and the configured maximum
 */
static uint32_t _max_switch_wait(uint32_t input_wait)
{
	static time_t sched_update = 0;
	static uint32_t max_wait = 300;	/* default max_switch_wait, seconds */
	char *params, *opt;
	int val;

	/* Re-parse SchedulerParameters only when the config has changed */
	if (sched_update != slurmctld_conf.last_update) {
		sched_update = slurmctld_conf.last_update;
		params = slurm_get_sched_params();
		if ((opt = xstrcasestr(params, "max_switch_wait="))) {
			val = atoi(opt + strlen("max_switch_wait="));
			if (val < 0) {
				error("ignoring SchedulerParameters: "
				      "max_switch_wait of %d", val);
			} else {
				max_wait = val;
			}
		}
		xfree(params);
	}

	if (max_wait > input_wait)
		return input_wait;
	return max_wait;
}
754
/*
 * _determine_and_validate_qos - resolve the QOS for a job and verify the
 *	association is permitted to use it.
 * IN resv_name - reservation name, needed for QOS_FLAG_REQ_RESV checking
 * IN assoc_ptr - job's association, may be NULL
 * IN operator - true if the requesting user has operator privileges,
 *	which bypasses the association's valid_qos check
 * IN/OUT qos_rec - filled in with the resolved QOS information
 * OUT error_code - SLURM_SUCCESS or ESLURM_INVALID_QOS
 * IN locked - true if the assoc_mgr locks are already held
 * IN log_lvl - log level for validation failure messages
 * RET pointer to the QOS record, or NULL on validation failure
 */
static slurmdb_qos_rec_t *_determine_and_validate_qos(
	char *resv_name, slurmdb_assoc_rec_t *assoc_ptr,
	bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code,
	bool locked, log_level_t log_lvl)
{
	slurmdb_qos_rec_t *qos_ptr = NULL;

	/* If enforcing associations make sure this is a valid qos
	   with the association. If not just fill in the qos and
	   continue. */

	xassert(qos_rec);

	/* Apply the association's default QOS when none was requested */
	assoc_mgr_get_default_qos_info(assoc_ptr, qos_rec);
	if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce,
				  &qos_ptr, locked) != SLURM_SUCCESS) {
		log_var(log_lvl, "Invalid qos (%s)", qos_rec->name);
		*error_code = ESLURM_INVALID_QOS;
		return NULL;
	}

	/* Non-operators must hold the QOS in the association's valid set
	 * when QOS enforcement is enabled */
	if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
	    && assoc_ptr
	    && !operator
	    && (!assoc_ptr->usage->valid_qos
		|| !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) {
		log_var(log_lvl, "This association %d(account='%s', user='%s', partition='%s') does not have access to qos %s",
			assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
			assoc_ptr->partition, qos_rec->name);
		*error_code = ESLURM_INVALID_QOS;
		return NULL;
	}

	/* A QOS flagged RequiresReservation may only be used with one */
	if (qos_ptr && (qos_ptr->flags & QOS_FLAG_REQ_RESV)
	    && (!resv_name || resv_name[0] == '\0')) {
		log_var(log_lvl, "qos %s can only be used in a reservation",
			qos_rec->name);
		*error_code = ESLURM_INVALID_QOS;
		return NULL;
	}

	*error_code = SLURM_SUCCESS;
	return qos_ptr;
}
799
800 /*
801 * dump_all_job_state - save the state of all jobs to file for checkpoint
802 * Changes here should be reflected in load_last_job_id() and
803 * load_all_job_state().
804 * RET 0 or error code
805 */
dump_all_job_state(void)806 int dump_all_job_state(void)
807 {
808 /* Save high-water mark to avoid buffer growth with copies */
809 static int high_buffer_size = (1024 * 1024);
810 int error_code = SLURM_SUCCESS, log_fd;
811 char *old_file, *new_file, *reg_file;
812 struct stat stat_buf;
813 /* Locks: Read config and job */
814 slurmctld_lock_t job_read_lock =
815 { READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
816 ListIterator job_iterator;
817 job_record_t *job_ptr;
818 Buf buffer = init_buf(high_buffer_size);
819 time_t now = time(NULL);
820 time_t last_state_file_time;
821 DEF_TIMERS;
822
823 START_TIMER;
824 /*
825 * Check that last state file was written at expected time.
826 * This is a check for two slurmctld daemons running at the same
827 * time in primary mode (a split-brain problem).
828 */
829 last_state_file_time = _get_last_job_state_write_time();
830 if (last_file_write_time && last_state_file_time &&
831 (last_file_write_time != last_state_file_time)) {
832 error("Bad job state save file time. We wrote it at time %u, "
833 "but the file contains a time stamp of %u.",
834 (uint32_t) last_file_write_time,
835 (uint32_t) last_state_file_time);
836 if (slurmctld_primary == 0) {
837 fatal("Two slurmctld daemons are running as primary. "
838 "Shutting down this daemon to avoid inconsistent "
839 "state due to split brain.");
840 }
841 }
842
843 /* write header: version, time */
844 packstr(JOB_STATE_VERSION, buffer);
845 pack16(SLURM_PROTOCOL_VERSION, buffer);
846 pack_time(now, buffer);
847
848 /*
849 * write header: job id
850 * This is needed so that the job id remains persistent even after
851 * slurmctld is restarted.
852 */
853 pack32( job_id_sequence, buffer);
854
855 debug3("Writing job id %u to header record of job_state file",
856 job_id_sequence);
857
858 /* write individual job records */
859 lock_slurmctld(job_read_lock);
860 job_iterator = list_iterator_create(job_list);
861 while ((job_ptr = list_next(job_iterator))) {
862 _dump_job_state(job_ptr, buffer);
863 }
864 list_iterator_destroy(job_iterator);
865
866
867 /* write the buffer to file */
868 old_file = xstrdup(slurmctld_conf.state_save_location);
869 xstrcat(old_file, "/job_state.old");
870 reg_file = xstrdup(slurmctld_conf.state_save_location);
871 xstrcat(reg_file, "/job_state");
872 new_file = xstrdup(slurmctld_conf.state_save_location);
873 xstrcat(new_file, "/job_state.new");
874 unlock_slurmctld(job_read_lock);
875
876 if (stat(reg_file, &stat_buf) == 0) {
877 static time_t last_mtime = (time_t) 0;
878 int delta_t = difftime(stat_buf.st_mtime, last_mtime);
879 if (delta_t < -10) {
880 error("The modification time of %s moved backwards "
881 "by %d seconds",
882 reg_file, (0-delta_t));
883 error("The clock of the file system and this computer "
884 "appear to not be synchronized");
885 /* It could be safest to exit here. We likely mounted
886 * a different file system with the state save files */
887 }
888 last_mtime = time(NULL);
889 }
890
891 lock_state_files();
892 log_fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600);
893 if (log_fd < 0) {
894 error("Can't save state, create file %s error %m",
895 new_file);
896 error_code = errno;
897 } else {
898 int pos = 0, nwrite, amount, rc;
899 char *data;
900
901 nwrite = get_buf_offset(buffer);
902 data = (char *)get_buf_data(buffer);
903 high_buffer_size = MAX(nwrite, high_buffer_size);
904 while (nwrite > 0) {
905 amount = write(log_fd, &data[pos], nwrite);
906 if ((amount < 0) && (errno != EINTR)) {
907 error("Error writing file %s, %m", new_file);
908 error_code = errno;
909 break;
910 }
911 nwrite -= amount;
912 pos += amount;
913 }
914
915 rc = fsync_and_close(log_fd, "job");
916 if (rc && !error_code)
917 error_code = rc;
918 }
919 if (error_code)
920 (void) unlink(new_file);
921 else { /* file shuffle */
922 (void) unlink(old_file);
923 if (link(reg_file, old_file))
924 debug4("unable to create link for %s -> %s: %m",
925 reg_file, old_file);
926 (void) unlink(reg_file);
927 if (link(new_file, reg_file))
928 debug4("unable to create link for %s -> %s: %m",
929 new_file, reg_file);
930 (void) unlink(new_file);
931 last_file_write_time = now;
932 }
933 xfree(old_file);
934 xfree(reg_file);
935 xfree(new_file);
936 unlock_state_files();
937
938 free_buf(buffer);
939 END_TIMER2("dump_all_job_state");
940 return error_code;
941 }
942
_find_resv_part(void * x,void * key)943 static int _find_resv_part(void *x, void *key)
944 {
945 slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x;
946
947 if (resv_ptr->part_ptr != (part_record_t *) key)
948 return 0;
949 else
950 return 1; /* match */
951 }
952
953 /* Open the job state save file, or backup if necessary.
954 * state_file IN - the name of the state save file used
955 * RET the file description to read from or error code
956 */
_open_job_state_file(char ** state_file)957 static Buf _open_job_state_file(char **state_file)
958 {
959 Buf buf;
960
961 xassert(state_file);
962 xassert(!*state_file);
963
964 *state_file = xstrdup_printf("%s/job_state",
965 slurmctld_conf.state_save_location);
966
967 if (!(buf = create_mmap_buf(*state_file)))
968 error("Could not open job state file %s: %m", *state_file);
969 else
970 return buf;
971
972 error("NOTE: Trying backup state save file. Jobs may be lost!");
973 xstrcat(*state_file, ".old");
974 return create_mmap_buf(*state_file);
975 }
976
/*
 * If a job is held with FAIL_ACCOUNT or FAIL_QOS because its association
 * or QOS pointer is missing, attempt to re-resolve the pointer and, on
 * success, clear the hold reason so the job can be scheduled again.
 * job_ptr IN/OUT - job to repair; state_reason/state_desc are cleared and
 *	last_job_update is bumped when a lookup succeeds
 */
extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr)
{
	if (!job_ptr->assoc_ptr && (job_ptr->state_reason == FAIL_ACCOUNT)) {
		slurmdb_assoc_rec_t assoc_rec;
		memset(&assoc_rec, 0, sizeof(assoc_rec));
		/*
		 * For speed and accuracy we will first see if we once had an
		 * association record. If not look for it by
		 * account,partition, user_id.
		 */
		if (job_ptr->assoc_id)
			assoc_rec.id = job_ptr->assoc_id;
		else {
			assoc_rec.acct = job_ptr->account;
			if (job_ptr->part_ptr)
				assoc_rec.partition = job_ptr->part_ptr->name;
			assoc_rec.uid = job_ptr->user_id;
		}

		if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					    accounting_enforce,
					    &job_ptr->assoc_ptr, false) ==
		    SLURM_SUCCESS) {
			job_ptr->assoc_id = assoc_rec.id;
			debug("%s: Filling in assoc for %pJ Assoc=%u",
			      __func__, job_ptr, job_ptr->assoc_id);

			/* Association found again; release the hold */
			job_ptr->state_reason = WAIT_NO_REASON;
			xfree(job_ptr->state_desc);
			last_job_update = time(NULL);
		}
	}

	if (!job_ptr->qos_ptr && (job_ptr->state_reason == FAIL_QOS)) {
		int qos_error = SLURM_SUCCESS;
		slurmdb_qos_rec_t qos_rec;
		memset(&qos_rec, 0, sizeof(qos_rec));
		qos_rec.id = job_ptr->qos_id;
		job_ptr->qos_ptr = _determine_and_validate_qos(
			job_ptr->resv_name, job_ptr->assoc_ptr,
			job_ptr->limit_set.qos, &qos_rec,
			&qos_error, false, LOG_LEVEL_DEBUG2);

		if ((qos_error == SLURM_SUCCESS) && job_ptr->qos_ptr) {
			debug("%s: Filling in QOS for %pJ QOS=%s(%u)",
			      __func__, job_ptr, qos_rec.name, job_ptr->qos_id);
			/* QOS found again; release the hold */
			job_ptr->state_reason = WAIT_NO_REASON;
			xfree(job_ptr->state_desc);
			last_job_update = time(NULL);
		}
	}
}
1029
/*
 * Rebuild both requested-TRES strings for a job (the simple/machine form
 * and the formatted form with converted units) from tres_req_cnt.
 * job_ptr IN/OUT - job whose tres_req_str/tres_fmt_req_str are replaced
 * assoc_mgr_locked IN - true if the caller already holds the assoc_mgr
 *	tres read lock
 */
extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked)
{
	assoc_mgr_lock_t tres_read_lock = { .tres = READ_LOCK };

	xassert(job_ptr);

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&tres_read_lock);

	/* Machine-readable form */
	xfree(job_ptr->tres_req_str);
	job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array(
		job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true);

	/* Human-readable form with converted units */
	xfree(job_ptr->tres_fmt_req_str);
	job_ptr->tres_fmt_req_str = assoc_mgr_make_tres_str_from_array(
		job_ptr->tres_req_cnt, TRES_STR_CONVERT_UNITS, true);

	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&tres_read_lock);
}
1049
/*
 * Rebuild both allocated-TRES strings for a job (the simple/machine form
 * and the formatted form with converted units) from tres_alloc_cnt.
 * job_ptr IN/OUT - job whose tres_alloc_str/tres_fmt_alloc_str are
 *	replaced
 * assoc_mgr_locked IN - true if the caller already holds the assoc_mgr
 *	tres read lock
 */
extern void set_job_tres_alloc_str(job_record_t *job_ptr,
				   bool assoc_mgr_locked)
{
	assoc_mgr_lock_t tres_read_lock = { .tres = READ_LOCK };

	xassert(job_ptr);

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&tres_read_lock);

	/* Machine-readable form */
	xfree(job_ptr->tres_alloc_str);
	job_ptr->tres_alloc_str = assoc_mgr_make_tres_str_from_array(
		job_ptr->tres_alloc_cnt, TRES_STR_FLAG_SIMPLE, true);

	/* Human-readable form with converted units */
	xfree(job_ptr->tres_fmt_alloc_str);
	job_ptr->tres_fmt_alloc_str = assoc_mgr_make_tres_str_from_array(
		job_ptr->tres_alloc_cnt, TRES_STR_CONVERT_UNITS, true);

	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&tres_read_lock);
}
1071
1072 /* Note that the backup slurmctld has assumed primary control.
1073 * This function can be called multiple times. */
backup_slurmctld_restart(void)1074 extern void backup_slurmctld_restart(void)
1075 {
1076 last_file_write_time = (time_t) 0;
1077 }
1078
/* Return the time stamp in the current job state save file, 0 is returned on
 * error */
static time_t _get_last_job_state_write_time(void)
{
	int error_code = SLURM_SUCCESS;
	char *state_file = NULL;
	Buf buffer;
	time_t buf_time = (time_t) 0;	/* returned on any failure */
	char *ver_str = NULL;
	uint32_t ver_str_len;
	uint16_t protocol_version = NO_VAL16;

	/* read the file */
	if (!(buffer = _open_job_state_file(&state_file))) {
		info("No job state file (%s) found", state_file);
		error_code = ENOENT;
	}
	xfree(state_file);
	if (error_code)
		return buf_time;

	/*
	 * Header layout (mirrors the state dump): version string,
	 * protocol version, save time stamp.
	 * NOTE(review): if ver_str does not match JOB_STATE_VERSION the
	 * protocol version is not consumed, so buf_time is unpacked from
	 * an unexpected offset — confirm callers tolerate a bogus time.
	 */
	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
		safe_unpack16(&protocol_version, buffer);
	safe_unpack_time(&buf_time, buffer);

unpack_error:
	/* safe_unpack* macros branch here on a truncated buffer */
	xfree(ver_str);
	free_buf(buffer);
	return buf_time;
}
1110
/*
 * load_all_job_state - load the job state from file, recover from last
 *	checkpoint. Execute this after loading the configuration file data.
 *	Changes here should be reflected in load_last_job_id().
 * RET 0 or error code
 */
extern int load_all_job_state(void)
{
	int error_code = SLURM_SUCCESS;
	int job_cnt = 0;
	char *state_file = NULL;
	Buf buffer;
	time_t buf_time;
	uint32_t saved_job_id;
	char *ver_str = NULL;
	uint32_t ver_str_len;
	uint16_t protocol_version = NO_VAL16;

	/* read the file */
	lock_state_files();
	if (!(buffer = _open_job_state_file(&state_file))) {
		info("No job state file (%s) to recover", state_file);
		xfree(state_file);
		unlock_state_files();
		return ENOENT;
	}
	xfree(state_file);
	unlock_state_files();

	/* Never hand out job IDs below the configured floor */
	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);

	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
	debug3("Version string in job_state header is %s", ver_str);
	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
		safe_unpack16(&protocol_version, buffer);
	xfree(ver_str);

	if (protocol_version == NO_VAL16) {
		/* Header version mismatch: refuse unless running with -i */
		if (!ignore_state_errors)
			fatal("Can not recover job state, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
		error("***********************************************");
		error("Can not recover job state, incompatible version");
		error("***********************************************");
		free_buf(buffer);
		return EFAULT;
	}

	safe_unpack_time(&buf_time, buffer);
	safe_unpack32(&saved_job_id, buffer);
	/* Resume the job ID sequence where the saved state left off */
	if (saved_job_id <= slurmctld_conf.max_job_id)
		job_id_sequence = MAX(saved_job_id, job_id_sequence);
	debug3("Job id in job_state header is %u", saved_job_id);

	/*
	 * Previously we locked the tres read lock before this loop. It turned
	 * out that created a double lock when steps were being loaded during
	 * the calls to jobacctinfo_create() which also locks the read lock.
	 * It ended up being much easier to move the locks for the assoc_mgr
	 * into the _load_job_state function than any other option.
	 */
	while (remaining_buf(buffer) > 0) {
		error_code = _load_job_state(buffer, protocol_version);
		if (error_code != SLURM_SUCCESS)
			goto unpack_error;
		job_cnt++;
	}
	debug3("Set job_id_sequence to %u", job_id_sequence);

	free_buf(buffer);
	info("Recovered information about %d jobs", job_cnt);
	return error_code;

unpack_error:
	/* safe_unpack* macros also branch here on a truncated buffer */
	if (!ignore_state_errors)
		fatal("Incomplete job state save file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
	error("Incomplete job state save file");
	info("Recovered information about %d jobs", job_cnt);
	free_buf(buffer);
	return SLURM_ERROR;
}
1191
/*
 * load_last_job_id - load only the last job ID from state save file.
 *	Changes here should be reflected in load_all_job_state().
 * RET 0 or error code
 */
extern int load_last_job_id( void )
{
	char *state_file = NULL;
	Buf buffer;
	time_t buf_time;
	char *ver_str = NULL;
	uint32_t ver_str_len;
	uint16_t protocol_version = NO_VAL16;

	/* read the file */
	lock_state_files();
	if (!(buffer = _open_job_state_file(&state_file))) {
		debug("No job state file (%s) to recover", state_file);
		xfree(state_file);
		unlock_state_files();
		return ENOENT;
	}
	xfree(state_file);
	unlock_state_files();

	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
	debug3("Version string in job_state header is %s", ver_str);
	if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
		safe_unpack16(&protocol_version, buffer);
	xfree(ver_str);

	if (protocol_version == NO_VAL16) {
		/* Header version mismatch: refuse unless running with -i */
		if (!ignore_state_errors)
			fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
		debug("*************************************************");
		debug("Can not recover last job ID, incompatible version");
		debug("*************************************************");
		free_buf(buffer);
		return EFAULT;
	}

	safe_unpack_time(&buf_time, buffer);
	safe_unpack32( &job_id_sequence, buffer);
	debug3("Job ID in job_state header is %u", job_id_sequence);

	/* Ignore the state for individual jobs stored here */

	/*
	 * NOTE(review): ver_str was already freed above; this is harmless
	 * only if xfree() NULLs its argument — confirm that contract.
	 */
	xfree(ver_str);
	free_buf(buffer);
	return SLURM_SUCCESS;

unpack_error:
	/* safe_unpack* macros branch here on a truncated buffer */
	if (!ignore_state_errors)
		fatal("Invalid job data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
	error("Invalid job data checkpoint file");
	xfree(ver_str);
	free_buf(buffer);
	return SLURM_ERROR;
}
1251
/*
 * Pack the members of an acct_policy_limit_set_t into a state buffer.
 * limit_set IN - structure to pack (qos, time, per-TRES flags)
 * buffer IN/OUT - destination buffer, offset advanced as data is added
 * protocol_version IN - unused by the current format; kept for symmetry
 *	with _unpack_acct_policy_limit_members()
 */
static void _pack_acct_policy_limit(acct_policy_limit_set_t *limit_set,
				    Buf buffer, uint16_t protocol_version)
{
	xassert(limit_set);

	pack16(limit_set->qos, buffer);
	pack16(limit_set->time, buffer);
	/* tres array length is governed by the global slurmctld_tres_cnt */
	pack16_array(limit_set->tres, slurmctld_tres_cnt, buffer);
}
1261
/*
 * Unpack the members of an acct_policy_limit_set_t from a state buffer.
 * limit_set IN/OUT - structure to fill in; any existing tres array is
 *	released and replaced
 * buffer IN/OUT - source buffer, offset advanced as data is consumed
 * protocol_version IN - version the data was packed with
 * RET SLURM_SUCCESS, or SLURM_ERROR on a truncated/corrupt buffer
 */
static int _unpack_acct_policy_limit_members(
	acct_policy_limit_set_t *limit_set,
	Buf buffer, uint16_t protocol_version)
{
	uint32_t tres_array_len;

	xassert(limit_set);

	safe_unpack16(&limit_set->qos, buffer);
	safe_unpack16(&limit_set->time, buffer);
	xfree(limit_set->tres);
	safe_unpack16_array(&limit_set->tres, &tres_array_len, buffer);

	/*
	 * Because the tres array could have grown or the tres could have
	 * moved positions, the array needs to be rebuilt and the old values
	 * need to be copied into their new spots.
	 */
	if ((tres_array_len < slurmctld_tres_cnt) ||
	    assoc_mgr_tres_pos_changed())
		update_job_limit_set_tres(&limit_set->tres);

	return SLURM_SUCCESS;

unpack_error:
	/* safe_unpack* macros branch here on a truncated buffer */
	xfree(limit_set->tres);

	return SLURM_ERROR;
}
1290
/*
 * _dump_job_state - dump the state of a specific job, its details, and
 *	steps to a buffer
 * IN dump_job_ptr - pointer to job for which information is requested
 * IN/OUT buffer - location to store data, pointers automatically advanced
 * NOTE: the pack order here is the wire format consumed by
 *	_load_job_state(); any change must be mirrored there.
 */
static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer)
{
	struct job_details *detail_ptr;
	uint32_t tmp_32;

	xassert(dump_job_ptr->magic == JOB_MAGIC);

	/* Don't pack "unlinked" job. */
	if (dump_job_ptr->job_id == NO_VAL)
		return;

	/* Dump basic job info */
	pack32(dump_job_ptr->array_job_id, buffer);
	pack32(dump_job_ptr->array_task_id, buffer);
	if (dump_job_ptr->array_recs) {
		build_array_str(dump_job_ptr);
		/* bitmap size of 0 signals "no task_id_str follows" */
		if (dump_job_ptr->array_recs->task_id_bitmap) {
			tmp_32 = bit_size(dump_job_ptr->array_recs->
					  task_id_bitmap);
		} else
			tmp_32 = 0;
		pack32(tmp_32, buffer);
		if (tmp_32)
			packstr(dump_job_ptr->array_recs->task_id_str, buffer);
		pack32(dump_job_ptr->array_recs->array_flags, buffer);
		pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
		pack32(dump_job_ptr->array_recs->tot_run_tasks, buffer);
		pack32(dump_job_ptr->array_recs->min_exit_code, buffer);
		pack32(dump_job_ptr->array_recs->max_exit_code, buffer);
		pack32(dump_job_ptr->array_recs->tot_comp_tasks, buffer);
	} else {
		/* NO_VAL size marks a job with no array record */
		tmp_32 = NO_VAL;
		pack32(tmp_32, buffer);
	}

	pack32(dump_job_ptr->assoc_id, buffer);
	packstr(dump_job_ptr->batch_features, buffer);
	pack32(dump_job_ptr->delay_boot, buffer);
	pack32(dump_job_ptr->job_id, buffer);
	pack32(dump_job_ptr->user_id, buffer);
	pack32(dump_job_ptr->group_id, buffer);
	pack32(dump_job_ptr->time_limit, buffer);
	pack32(dump_job_ptr->time_min, buffer);
	pack32(dump_job_ptr->priority, buffer);
	pack32(dump_job_ptr->alloc_sid, buffer);
	pack32(dump_job_ptr->total_cpus, buffer);
	/* Save the node-count estimate when no allocation exists yet */
	if (dump_job_ptr->total_nodes)
		pack32(dump_job_ptr->total_nodes, buffer);
	else
		pack32(dump_job_ptr->node_cnt_wag, buffer);
	pack32(dump_job_ptr->cpu_cnt, buffer);
	pack32(dump_job_ptr->exit_code, buffer);
	pack32(dump_job_ptr->derived_ec, buffer);
	pack64(dump_job_ptr->db_index, buffer);
	pack32(dump_job_ptr->resv_id, buffer);
	pack32(dump_job_ptr->next_step_id, buffer);
	pack32(dump_job_ptr->het_job_id, buffer);
	packstr(dump_job_ptr->het_job_id_set, buffer);
	pack32(dump_job_ptr->het_job_offset, buffer);
	pack32(dump_job_ptr->qos_id, buffer);
	pack32(dump_job_ptr->req_switch, buffer);
	pack32(dump_job_ptr->wait4switch, buffer);
	pack32(dump_job_ptr->profile, buffer);
	pack32(dump_job_ptr->db_flags, buffer);

	/* Time stamps */
	pack_time(dump_job_ptr->last_sched_eval, buffer);
	pack_time(dump_job_ptr->preempt_time, buffer);
	pack_time(dump_job_ptr->start_time, buffer);
	pack_time(dump_job_ptr->end_time, buffer);
	pack_time(dump_job_ptr->end_time_exp, buffer);
	pack_time(dump_job_ptr->suspend_time, buffer);
	pack_time(dump_job_ptr->pre_sus_time, buffer);
	pack_time(dump_job_ptr->resize_time, buffer);
	pack_time(dump_job_ptr->tot_sus_time, buffer);
	pack_time(dump_job_ptr->deadline, buffer);

	pack32(dump_job_ptr->site_factor, buffer);
	pack16(dump_job_ptr->direct_set_prio, buffer);
	pack32(dump_job_ptr->job_state, buffer);
	pack16(dump_job_ptr->kill_on_node_fail, buffer);
	pack16(dump_job_ptr->batch_flag, buffer);
	pack16(dump_job_ptr->mail_type, buffer);
	pack32(dump_job_ptr->state_reason, buffer);
	pack32(dump_job_ptr->state_reason_prev_db, buffer);
	pack8(dump_job_ptr->reboot, buffer);
	pack16(dump_job_ptr->restart_cnt, buffer);
	pack16(dump_job_ptr->wait_all_nodes, buffer);
	pack16(dump_job_ptr->warn_flags, buffer);
	pack16(dump_job_ptr->warn_signal, buffer);
	pack16(dump_job_ptr->warn_time, buffer);

	_pack_acct_policy_limit(&dump_job_ptr->limit_set, buffer,
				SLURM_PROTOCOL_VERSION);

	packstr(dump_job_ptr->state_desc, buffer);
	packstr(dump_job_ptr->resp_host, buffer);

	pack16(dump_job_ptr->alloc_resp_port, buffer);
	pack16(dump_job_ptr->other_port, buffer);
	pack8(dump_job_ptr->power_flags, buffer);
	pack16(dump_job_ptr->start_protocol_ver, buffer);
	packdouble(dump_job_ptr->billable_tres, buffer);

	/*
	 * nodes_completing is packed only for completing jobs; the loader
	 * conditions its unpack on JOB_COMPLETING in job_state.
	 */
	if (IS_JOB_COMPLETING(dump_job_ptr)) {
		if (dump_job_ptr->nodes_completing == NULL) {
			dump_job_ptr->nodes_completing =
				bitmap2node_name(dump_job_ptr->node_bitmap);
		}
		packstr(dump_job_ptr->nodes_completing, buffer);
	}
	packstr(dump_job_ptr->nodes, buffer);
	packstr(dump_job_ptr->partition, buffer);
	packstr(dump_job_ptr->name, buffer);
	packstr(dump_job_ptr->user_name, buffer);
	packstr(dump_job_ptr->wckey, buffer);
	packstr(dump_job_ptr->alloc_node, buffer);
	packstr(dump_job_ptr->account, buffer);
	packstr(dump_job_ptr->admin_comment, buffer);
	packstr(dump_job_ptr->comment, buffer);
	packstr(dump_job_ptr->gres_alloc, buffer);
	packstr(dump_job_ptr->gres_req, buffer);
	packstr(dump_job_ptr->gres_used, buffer);
	packstr(dump_job_ptr->network, buffer);
	packstr(dump_job_ptr->licenses, buffer);
	packstr(dump_job_ptr->mail_user, buffer);
	packstr(dump_job_ptr->mcs_label, buffer);
	packstr(dump_job_ptr->resv_name, buffer);
	packstr(dump_job_ptr->batch_host, buffer);
	packstr(dump_job_ptr->burst_buffer, buffer);
	packstr(dump_job_ptr->burst_buffer_state, buffer);
	packstr(dump_job_ptr->system_comment, buffer);

	/* Plugin-managed state */
	select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
				     buffer, SLURM_PROTOCOL_VERSION);
	pack_job_resources(dump_job_ptr->job_resrcs, buffer,
			   SLURM_PROTOCOL_VERSION);

	packstr_array(dump_job_ptr->spank_job_env,
		      dump_job_ptr->spank_job_env_size, buffer);

	(void) gres_plugin_job_state_pack(dump_job_ptr->gres_list, buffer,
					  dump_job_ptr->job_id, true,
					  SLURM_PROTOCOL_VERSION);

	/* Dump job details, if available */
	detail_ptr = dump_job_ptr->details;
	if (detail_ptr) {
		xassert (detail_ptr->magic == DETAILS_MAGIC);
		pack16((uint16_t) DETAILS_FLAG, buffer);
		_dump_job_details(detail_ptr, buffer);
	} else
		pack16((uint16_t) 0, buffer);	/* no details flag */

	/* Dump job steps */
	list_for_each(dump_job_ptr->step_list, dump_job_step_state, buffer);

	pack16((uint16_t) 0, buffer);	/* no step flag */
	pack32(dump_job_ptr->bit_flags, buffer);
	packstr(dump_job_ptr->tres_alloc_str, buffer);
	packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
	packstr(dump_job_ptr->tres_req_str, buffer);
	packstr(dump_job_ptr->tres_fmt_req_str, buffer);

	packstr(dump_job_ptr->clusters, buffer);
	_dump_job_fed_details(dump_job_ptr->fed_details, buffer);

	packstr(dump_job_ptr->origin_cluster, buffer);

	/* Per-TRES request strings */
	packstr(dump_job_ptr->cpus_per_tres, buffer);
	packstr(dump_job_ptr->mem_per_tres, buffer);
	packstr(dump_job_ptr->tres_bind, buffer);
	packstr(dump_job_ptr->tres_freq, buffer);
	packstr(dump_job_ptr->tres_per_job, buffer);
	packstr(dump_job_ptr->tres_per_node, buffer);
	packstr(dump_job_ptr->tres_per_socket, buffer);
	packstr(dump_job_ptr->tres_per_task, buffer);
}
1474
1475 /* Unpack a job's state information from a buffer */
1476 /* NOTE: assoc_mgr qos, tres and assoc read lock must be unlocked before
1477 * calling */
_load_job_state(Buf buffer,uint16_t protocol_version)1478 static int _load_job_state(Buf buffer, uint16_t protocol_version)
1479 {
1480 uint64_t db_index;
1481 uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid;
1482 uint32_t exit_code, assoc_id, name_len, time_min;
1483 uint32_t next_step_id, total_cpus, total_nodes = 0, cpu_cnt;
1484 uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0;
1485 uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0;
1486 uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET, db_flags = 0;
1487 uint32_t job_state, delay_boot = 0, site_factor = NICE_OFFSET;
1488 time_t start_time, end_time, end_time_exp, suspend_time,
1489 pre_sus_time, tot_sus_time;
1490 time_t preempt_time = 0, deadline = 0;
1491 time_t last_sched_eval = 0;
1492 time_t resize_time = 0, now = time(NULL);
1493 uint8_t reboot = 0, power_flags = 0;
1494 uint32_t array_task_id = NO_VAL, state_reason_prev_db = 0;
1495 uint32_t array_flags = 0, max_run_tasks = 0, tot_run_tasks = 0;
1496 uint32_t min_exit_code = 0, max_exit_code = 0, tot_comp_tasks = 0;
1497 uint32_t het_job_id = 0, het_job_offset = 0, state_reason;
1498 uint16_t details, batch_flag, step_flag;
1499 uint16_t kill_on_node_fail, direct_set_prio;
1500 uint16_t alloc_resp_port, other_port, mail_type, tmp16;
1501 uint16_t restart_cnt;
1502 uint16_t wait_all_nodes, warn_flags = 0, warn_signal, warn_time;
1503 acct_policy_limit_set_t limit_set;
1504 uint16_t start_protocol_ver = SLURM_MIN_PROTOCOL_VERSION;
1505 char *nodes = NULL, *partition = NULL, *name = NULL, *resp_host = NULL;
1506 char *account = NULL, *network = NULL, *mail_user = NULL;
1507 char *comment = NULL, *nodes_completing = NULL, *alloc_node = NULL;
1508 char *licenses = NULL, *state_desc = NULL, *wckey = NULL;
1509 char *resv_name = NULL, *batch_host = NULL;
1510 char *gres_alloc = NULL, *gres_req = NULL, *gres_used = NULL;
1511 char *burst_buffer = NULL, *burst_buffer_state = NULL;
1512 char *admin_comment = NULL, *task_id_str = NULL, *mcs_label = NULL;
1513 char *clusters = NULL, *het_job_id_set = NULL, *user_name = NULL;
1514 char *batch_features = NULL, *system_comment = NULL;
1515 uint32_t task_id_size = NO_VAL;
1516 char **spank_job_env = (char **) NULL;
1517 List gres_list = NULL, part_ptr_list = NULL;
1518 job_record_t *job_ptr = NULL;
1519 part_record_t *part_ptr;
1520 int error_code, i, qos_error, rc;
1521 dynamic_plugin_data_t *select_jobinfo = NULL;
1522 job_resources_t *job_resources = NULL;
1523 slurmdb_assoc_rec_t assoc_rec;
1524 slurmdb_qos_rec_t qos_rec;
1525 bool job_finished = false;
1526 double billable_tres = (double)NO_VAL;
1527 char *tres_alloc_str = NULL, *tres_fmt_alloc_str = NULL,
1528 *tres_req_str = NULL, *tres_fmt_req_str = NULL;
1529 uint32_t pelog_env_size = 0;
1530 char **pelog_env = (char **) NULL;
1531 job_fed_details_t *job_fed_details = NULL;
1532 assoc_mgr_lock_t locks = { .assoc = READ_LOCK,
1533 .qos = READ_LOCK,
1534 .tres = READ_LOCK,
1535 .user = READ_LOCK };
1536
1537 memset(&limit_set, 0, sizeof(limit_set));
1538 limit_set.tres = xcalloc(slurmctld_tres_cnt, sizeof(uint16_t));
1539
1540 if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
1541 safe_unpack32(&array_job_id, buffer);
1542 safe_unpack32(&array_task_id, buffer);
1543
1544 /* Job Array record */
1545 safe_unpack32(&task_id_size, buffer);
1546 if (task_id_size != NO_VAL) {
1547 if (task_id_size) {
1548 safe_unpackstr_xmalloc(&task_id_str, &name_len,
1549 buffer);
1550 }
1551 safe_unpack32(&array_flags, buffer);
1552 safe_unpack32(&max_run_tasks, buffer);
1553 safe_unpack32(&tot_run_tasks, buffer);
1554 safe_unpack32(&min_exit_code, buffer);
1555 safe_unpack32(&max_exit_code, buffer);
1556 safe_unpack32(&tot_comp_tasks, buffer);
1557 }
1558
1559 safe_unpack32(&assoc_id, buffer);
1560 safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
1561 safe_unpack32(&delay_boot, buffer);
1562 safe_unpack32(&job_id, buffer);
1563
1564 /* validity test as possible */
1565 if (job_id == 0) {
1566 verbose("Invalid job_id %u", job_id);
1567 goto unpack_error;
1568 }
1569
1570 job_ptr = find_job_record(job_id);
1571 if (job_ptr == NULL) {
1572 job_ptr = _create_job_record(1);
1573 if (!job_ptr) {
1574 error("Create job entry failed for JobId=%u",
1575 job_id);
1576 goto unpack_error;
1577 }
1578 job_ptr->job_id = job_id;
1579 job_ptr->array_job_id = array_job_id;
1580 job_ptr->array_task_id = array_task_id;
1581 }
1582
1583 safe_unpack32(&user_id, buffer);
1584 safe_unpack32(&group_id, buffer);
1585 safe_unpack32(&time_limit, buffer);
1586 safe_unpack32(&time_min, buffer);
1587 safe_unpack32(&priority, buffer);
1588 safe_unpack32(&alloc_sid, buffer);
1589 safe_unpack32(&total_cpus, buffer);
1590 safe_unpack32(&total_nodes, buffer);
1591 safe_unpack32(&cpu_cnt, buffer);
1592 safe_unpack32(&exit_code, buffer);
1593 safe_unpack32(&derived_ec, buffer);
1594 safe_unpack64(&db_index, buffer);
1595 safe_unpack32(&resv_id, buffer);
1596 safe_unpack32(&next_step_id, buffer);
1597 safe_unpack32(&het_job_id, buffer);
1598 safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
1599 safe_unpack32(&het_job_offset, buffer);
1600 safe_unpack32(&qos_id, buffer);
1601 safe_unpack32(&req_switch, buffer);
1602 safe_unpack32(&wait4switch, buffer);
1603 safe_unpack32(&profile, buffer);
1604 safe_unpack32(&db_flags, buffer);
1605
1606 safe_unpack_time(&last_sched_eval, buffer);
1607 safe_unpack_time(&preempt_time, buffer);
1608 safe_unpack_time(&start_time, buffer);
1609 safe_unpack_time(&end_time, buffer);
1610 safe_unpack_time(&end_time_exp, buffer);
1611 safe_unpack_time(&suspend_time, buffer);
1612 safe_unpack_time(&pre_sus_time, buffer);
1613 safe_unpack_time(&resize_time, buffer);
1614 safe_unpack_time(&tot_sus_time, buffer);
1615 safe_unpack_time(&deadline, buffer);
1616
1617 safe_unpack32(&site_factor, buffer);
1618 safe_unpack16(&direct_set_prio, buffer);
1619 safe_unpack32(&job_state, buffer);
1620 safe_unpack16(&kill_on_node_fail, buffer);
1621 safe_unpack16(&batch_flag, buffer);
1622 safe_unpack16(&mail_type, buffer);
1623 safe_unpack32(&state_reason, buffer);
1624 safe_unpack32(&state_reason_prev_db, buffer);
1625 safe_unpack8 (&reboot, buffer);
1626 safe_unpack16(&restart_cnt, buffer);
1627 safe_unpack16(&wait_all_nodes, buffer);
1628 safe_unpack16(&warn_flags, buffer);
1629 safe_unpack16(&warn_signal, buffer);
1630 safe_unpack16(&warn_time, buffer);
1631
1632 _unpack_acct_policy_limit_members(&limit_set, buffer,
1633 protocol_version);
1634
1635 safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
1636 safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
1637
1638 safe_unpack16(&alloc_resp_port, buffer);
1639 safe_unpack16(&other_port, buffer);
1640 safe_unpack8(&power_flags, buffer);
1641 safe_unpack16(&start_protocol_ver, buffer);
1642 safe_unpackdouble(&billable_tres, buffer);
1643
1644 if (job_state & JOB_COMPLETING) {
1645 safe_unpackstr_xmalloc(&nodes_completing,
1646 &name_len, buffer);
1647 }
1648 safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
1649 safe_unpackstr_xmalloc(&partition, &name_len, buffer);
1650 if (partition == NULL) {
1651 error("No partition for JobId=%u", job_id);
1652 goto unpack_error;
1653 }
1654 part_ptr = find_part_record (partition);
1655 if (part_ptr == NULL) {
1656 char *err_part = NULL;
1657 part_ptr_list = get_part_list(partition, &err_part);
1658 if (part_ptr_list) {
1659 part_ptr = list_peek(part_ptr_list);
1660 if (list_count(part_ptr_list) == 1)
1661 FREE_NULL_LIST(part_ptr_list);
1662 } else {
1663 verbose("Invalid partition (%s) for JobId=%u",
1664 err_part, job_id);
1665 xfree(err_part);
1666 /* not fatal error, partition could have been
1667 * removed, reset_job_bitmaps() will clean-up
1668 * this job */
1669 }
1670 }
1671
1672 safe_unpackstr_xmalloc(&name, &name_len, buffer);
1673 safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
1674 safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
1675 safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
1676 safe_unpackstr_xmalloc(&account, &name_len, buffer);
1677 safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
1678 safe_unpackstr_xmalloc(&comment, &name_len, buffer);
1679 safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
1680 safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
1681 safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
1682 safe_unpackstr_xmalloc(&network, &name_len, buffer);
1683 safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
1684 safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
1685 safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
1686 safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
1687 safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
1688 safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
1689 safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
1690 safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
1691
1692 if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
1693 protocol_version))
1694 goto unpack_error;
1695 if (unpack_job_resources(&job_resources, buffer,
1696 protocol_version))
1697 goto unpack_error;
1698
1699 safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
1700 buffer);
1701
1702 if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
1703 protocol_version) !=
1704 SLURM_SUCCESS)
1705 goto unpack_error;
1706 gres_plugin_job_state_log(gres_list, job_id);
1707
1708 safe_unpack16(&details, buffer);
1709 if ((details == DETAILS_FLAG) &&
1710 (_load_job_details(job_ptr, buffer, protocol_version))) {
1711 job_ptr->job_state = JOB_FAILED;
1712 job_ptr->exit_code = 1;
1713 job_ptr->state_reason = FAIL_SYSTEM;
1714 xfree(job_ptr->state_desc);
1715 job_ptr->end_time = now;
1716 goto unpack_error;
1717 }
1718 safe_unpack16(&step_flag, buffer);
1719
1720 while (step_flag == STEP_FLAG) {
1721 /*
1722 * No need to put these into accounting if they
1723 * haven't been since all information will be
1724 * put in when the job is finished.
1725 */
1726 if ((error_code = load_step_state(job_ptr, buffer,
1727 protocol_version)))
1728 goto unpack_error;
1729 safe_unpack16(&step_flag, buffer);
1730 }
1731 safe_unpack32(&job_ptr->bit_flags, buffer);
1732 job_ptr->bit_flags &= ~BACKFILL_TEST;
1733 job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
1734 safe_unpackstr_xmalloc(&tres_alloc_str,
1735 &name_len, buffer);
1736 safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
1737 &name_len, buffer);
1738 safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
1739 safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
1740 safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
1741 if ((error_code = _load_job_fed_details(&job_fed_details,
1742 buffer,
1743 protocol_version)))
1744 goto unpack_error;
1745
1746 safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
1747 buffer);
1748
1749 safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
1750 buffer);
1751 safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
1752 buffer);
1753 safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
1754 buffer);
1755 safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
1756 buffer);
1757 safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
1758 buffer);
1759 safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
1760 buffer);
1761 safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
1762 buffer);
1763 safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
1764 buffer);
1765 } else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
1766 uint16_t uint16_tmp;
1767 safe_unpack32(&array_job_id, buffer);
1768 safe_unpack32(&array_task_id, buffer);
1769
1770 /* Job Array record */
1771 safe_unpack32(&task_id_size, buffer);
1772 if (task_id_size != NO_VAL) {
1773 if (task_id_size) {
1774 safe_unpackstr_xmalloc(&task_id_str, &name_len,
1775 buffer);
1776 }
1777 safe_unpack32(&array_flags, buffer);
1778 safe_unpack32(&max_run_tasks, buffer);
1779 safe_unpack32(&tot_run_tasks, buffer);
1780 safe_unpack32(&min_exit_code, buffer);
1781 safe_unpack32(&max_exit_code, buffer);
1782 safe_unpack32(&tot_comp_tasks, buffer);
1783 }
1784
1785 safe_unpack32(&assoc_id, buffer);
1786 safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
1787 safe_unpack32(&delay_boot, buffer);
1788 safe_unpack32(&job_id, buffer);
1789
1790 /* validity test as possible */
1791 if (job_id == 0) {
1792 verbose("Invalid job_id %u", job_id);
1793 goto unpack_error;
1794 }
1795
1796 job_ptr = find_job_record(job_id);
1797 if (job_ptr == NULL) {
1798 job_ptr = _create_job_record(1);
1799 if (!job_ptr) {
1800 error("Create job entry failed for JobId=%u",
1801 job_id);
1802 goto unpack_error;
1803 }
1804 job_ptr->job_id = job_id;
1805 job_ptr->array_job_id = array_job_id;
1806 job_ptr->array_task_id = array_task_id;
1807 }
1808
1809 safe_unpack32(&user_id, buffer);
1810 safe_unpack32(&group_id, buffer);
1811 safe_unpack32(&time_limit, buffer);
1812 safe_unpack32(&time_min, buffer);
1813 safe_unpack32(&priority, buffer);
1814 safe_unpack32(&alloc_sid, buffer);
1815 safe_unpack32(&total_cpus, buffer);
1816 safe_unpack32(&total_nodes, buffer);
1817 safe_unpack32(&cpu_cnt, buffer);
1818 safe_unpack32(&exit_code, buffer);
1819 safe_unpack32(&derived_ec, buffer);
1820 safe_unpack64(&db_index, buffer);
1821 safe_unpack32(&resv_id, buffer);
1822 safe_unpack32(&next_step_id, buffer);
1823 safe_unpack32(&het_job_id, buffer);
1824 safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
1825 safe_unpack32(&het_job_offset, buffer);
1826 safe_unpack32(&qos_id, buffer);
1827 safe_unpack32(&req_switch, buffer);
1828 safe_unpack32(&wait4switch, buffer);
1829 safe_unpack32(&profile, buffer);
1830 safe_unpack32(&db_flags, buffer);
1831
1832 safe_unpack_time(&last_sched_eval, buffer);
1833 safe_unpack_time(&preempt_time, buffer);
1834 safe_unpack_time(&start_time, buffer);
1835 safe_unpack_time(&end_time, buffer);
1836 safe_unpack_time(&end_time_exp, buffer);
1837 safe_unpack_time(&suspend_time, buffer);
1838 safe_unpack_time(&pre_sus_time, buffer);
1839 safe_unpack_time(&resize_time, buffer);
1840 safe_unpack_time(&tot_sus_time, buffer);
1841 safe_unpack_time(&deadline, buffer);
1842
1843 safe_unpack32(&site_factor, buffer);
1844 safe_unpack16(&direct_set_prio, buffer);
1845 safe_unpack32(&job_state, buffer);
1846 safe_unpack16(&kill_on_node_fail, buffer);
1847 safe_unpack16(&batch_flag, buffer);
1848 safe_unpack16(&mail_type, buffer);
1849 safe_unpack32(&state_reason, buffer);
1850 safe_unpack32(&state_reason_prev_db, buffer);
1851 safe_unpack8 (&reboot, buffer);
1852 safe_unpack16(&restart_cnt, buffer);
1853 safe_unpack16(&wait_all_nodes, buffer);
1854 safe_unpack16(&warn_flags, buffer);
1855 safe_unpack16(&warn_signal, buffer);
1856 safe_unpack16(&warn_time, buffer);
1857
1858 _unpack_acct_policy_limit_members(&limit_set, buffer,
1859 protocol_version);
1860
1861 safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
1862 safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
1863
1864 safe_unpack16(&alloc_resp_port, buffer);
1865 safe_unpack16(&other_port, buffer);
1866 safe_unpack8(&power_flags, buffer);
1867 safe_unpack16(&start_protocol_ver, buffer);
1868 safe_unpackdouble(&billable_tres, buffer);
1869
1870 if (job_state & JOB_COMPLETING) {
1871 safe_unpackstr_xmalloc(&nodes_completing,
1872 &name_len, buffer);
1873 }
1874 safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
1875 safe_unpackstr_xmalloc(&partition, &name_len, buffer);
1876 if (partition == NULL) {
1877 error("No partition for JobId=%u", job_id);
1878 goto unpack_error;
1879 }
1880 part_ptr = find_part_record (partition);
1881 if (part_ptr == NULL) {
1882 char *err_part = NULL;
1883 part_ptr_list = get_part_list(partition, &err_part);
1884 if (part_ptr_list) {
1885 part_ptr = list_peek(part_ptr_list);
1886 if (list_count(part_ptr_list) == 1)
1887 FREE_NULL_LIST(part_ptr_list);
1888 } else {
1889 verbose("Invalid partition (%s) for JobId=%u",
1890 err_part, job_id);
1891 xfree(err_part);
1892 /* not fatal error, partition could have been
1893 * removed, reset_job_bitmaps() will clean-up
1894 * this job */
1895 }
1896 }
1897
1898 safe_unpackstr_xmalloc(&name, &name_len, buffer);
1899 safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
1900 safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
1901 safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
1902 safe_unpackstr_xmalloc(&account, &name_len, buffer);
1903 safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
1904 safe_unpackstr_xmalloc(&comment, &name_len, buffer);
1905 safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
1906 safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
1907 safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
1908 safe_unpackstr_xmalloc(&network, &name_len, buffer);
1909 safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
1910 safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
1911 safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
1912 safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
1913 safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
1914 safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
1915 safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
1916 safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
1917
1918 if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
1919 protocol_version))
1920 goto unpack_error;
1921 if (unpack_job_resources(&job_resources, buffer,
1922 protocol_version))
1923 goto unpack_error;
1924
1925 safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */
1926 /* fake out the former checkpoint plugin */
1927 {
1928 uint16_t id;
1929 uint32_t size;
1930 safe_unpack16(&id, buffer);
1931 safe_unpack32(&size, buffer);
1932 /* skip past any checkpoint plugin info */
1933 size += get_buf_offset(buffer);
1934 set_buf_offset(buffer, size);
1935 }
1936
1937 safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
1938 buffer);
1939
1940 if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
1941 protocol_version) !=
1942 SLURM_SUCCESS)
1943 goto unpack_error;
1944 gres_plugin_job_state_log(gres_list, job_id);
1945
1946 safe_unpack16(&details, buffer);
1947 if ((details == DETAILS_FLAG) &&
1948 (_load_job_details(job_ptr, buffer, protocol_version))) {
1949 job_ptr->job_state = JOB_FAILED;
1950 job_ptr->exit_code = 1;
1951 job_ptr->state_reason = FAIL_SYSTEM;
1952 xfree(job_ptr->state_desc);
1953 job_ptr->end_time = now;
1954 goto unpack_error;
1955 }
1956 safe_unpack16(&step_flag, buffer);
1957 /*
1958 * The batch_host is needed to create a step_layout for the
1959 * batch step since that wasn't packed until 20.02.
1960 */
1961 job_ptr->batch_host = batch_host;
1962 while (step_flag == STEP_FLAG) {
1963 /*
1964 * No need to put these into accounting if they
1965 * haven't been since all information will be
1966 * put in when the job is finished.
1967 */
1968 if ((error_code = load_step_state(job_ptr, buffer,
1969 protocol_version)))
1970 goto unpack_error;
1971 safe_unpack16(&step_flag, buffer);
1972 }
1973 job_ptr->batch_host = NULL;
1974 safe_unpack32(&job_ptr->bit_flags, buffer);
1975 job_ptr->bit_flags &= ~BACKFILL_TEST;
1976 job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
1977 safe_unpackstr_xmalloc(&tres_alloc_str,
1978 &name_len, buffer);
1979 safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
1980 &name_len, buffer);
1981 safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
1982 safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
1983 safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
1984 if ((error_code = _load_job_fed_details(&job_fed_details,
1985 buffer,
1986 protocol_version)))
1987 goto unpack_error;
1988
1989 safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
1990 buffer);
1991
1992 safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
1993 buffer);
1994 safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
1995 buffer);
1996 safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
1997 buffer);
1998 safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
1999 buffer);
2000 safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
2001 buffer);
2002 safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
2003 buffer);
2004 safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
2005 buffer);
2006 safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
2007 buffer);
2008 } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
2009 uint16_t uint16_tmp;
2010 safe_unpack32(&array_job_id, buffer);
2011 safe_unpack32(&array_task_id, buffer);
2012
2013 /* Job Array record */
2014 safe_unpack32(&task_id_size, buffer);
2015 if (task_id_size != NO_VAL) {
2016 if (task_id_size) {
2017 safe_unpackstr_xmalloc(&task_id_str, &name_len,
2018 buffer);
2019 }
2020 safe_unpack32(&array_flags, buffer);
2021 safe_unpack32(&max_run_tasks, buffer);
2022 safe_unpack32(&tot_run_tasks, buffer);
2023 safe_unpack32(&min_exit_code, buffer);
2024 safe_unpack32(&max_exit_code, buffer);
2025 safe_unpack32(&tot_comp_tasks, buffer);
2026 }
2027
2028 safe_unpack32(&assoc_id, buffer);
2029 safe_unpackstr_xmalloc(&batch_features, &name_len, buffer);
2030 safe_unpack32(&delay_boot, buffer);
2031 safe_unpack32(&job_id, buffer);
2032
2033 /* validity test as possible */
2034 if (job_id == 0) {
2035 verbose("Invalid job_id %u", job_id);
2036 goto unpack_error;
2037 }
2038
2039 job_ptr = find_job_record(job_id);
2040 if (job_ptr == NULL) {
2041 job_ptr = _create_job_record(1);
2042 if (!job_ptr) {
2043 error("Create job entry failed for JobId=%u",
2044 job_id);
2045 goto unpack_error;
2046 }
2047 job_ptr->job_id = job_id;
2048 job_ptr->array_job_id = array_job_id;
2049 job_ptr->array_task_id = array_task_id;
2050 }
2051
2052 safe_unpack32(&user_id, buffer);
2053 safe_unpack32(&group_id, buffer);
2054 safe_unpack32(&time_limit, buffer);
2055 safe_unpack32(&time_min, buffer);
2056 safe_unpack32(&priority, buffer);
2057 safe_unpack32(&alloc_sid, buffer);
2058 safe_unpack32(&total_cpus, buffer);
2059 safe_unpack32(&total_nodes, buffer);
2060 safe_unpack32(&cpu_cnt, buffer);
2061 safe_unpack32(&exit_code, buffer);
2062 safe_unpack32(&derived_ec, buffer);
2063 safe_unpack64(&db_index, buffer);
2064 safe_unpack32(&resv_id, buffer);
2065 safe_unpack32(&next_step_id, buffer);
2066 safe_unpack32(&het_job_id, buffer);
2067 safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer);
2068 safe_unpack32(&het_job_offset, buffer);
2069 safe_unpack32(&qos_id, buffer);
2070 safe_unpack32(&req_switch, buffer);
2071 safe_unpack32(&wait4switch, buffer);
2072 safe_unpack32(&profile, buffer);
2073
2074 safe_unpack_time(&last_sched_eval, buffer);
2075 safe_unpack_time(&preempt_time, buffer);
2076 safe_unpack_time(&start_time, buffer);
2077 safe_unpack_time(&end_time, buffer);
2078 safe_unpack_time(&end_time_exp, buffer);
2079 safe_unpack_time(&suspend_time, buffer);
2080 safe_unpack_time(&pre_sus_time, buffer);
2081 safe_unpack_time(&resize_time, buffer);
2082 safe_unpack_time(&tot_sus_time, buffer);
2083 safe_unpack_time(&deadline, buffer);
2084
2085 safe_unpack16(&direct_set_prio, buffer);
2086 safe_unpack32(&job_state, buffer);
2087 safe_unpack16(&kill_on_node_fail, buffer);
2088 safe_unpack16(&batch_flag, buffer);
2089 safe_unpack16(&mail_type, buffer);
2090 safe_unpack16(&tmp16, buffer);
2091 state_reason = tmp16;
2092 safe_unpack8 (&reboot, buffer);
2093 safe_unpack16(&restart_cnt, buffer);
2094 safe_unpack16(&wait_all_nodes, buffer);
2095 safe_unpack16(&warn_flags, buffer);
2096 safe_unpack16(&warn_signal, buffer);
2097 safe_unpack16(&warn_time, buffer);
2098
2099 _unpack_acct_policy_limit_members(&limit_set, buffer,
2100 protocol_version);
2101
2102 safe_unpackstr_xmalloc(&state_desc, &name_len, buffer);
2103 safe_unpackstr_xmalloc(&resp_host, &name_len, buffer);
2104
2105 safe_unpack16(&alloc_resp_port, buffer);
2106 safe_unpack16(&other_port, buffer);
2107 safe_unpack8(&power_flags, buffer);
2108 safe_unpack16(&start_protocol_ver, buffer);
2109 safe_unpackdouble(&billable_tres, buffer);
2110
2111 if (job_state & JOB_COMPLETING) {
2112 safe_unpackstr_xmalloc(&nodes_completing,
2113 &name_len, buffer);
2114 }
2115 safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
2116 safe_unpackstr_xmalloc(&partition, &name_len, buffer);
2117 if (partition == NULL) {
2118 error("No partition for JobId=%u", job_id);
2119 goto unpack_error;
2120 }
2121 part_ptr = find_part_record (partition);
2122 if (part_ptr == NULL) {
2123 char *err_part = NULL;
2124 part_ptr_list = get_part_list(partition, &err_part);
2125 if (part_ptr_list) {
2126 part_ptr = list_peek(part_ptr_list);
2127 if (list_count(part_ptr_list) == 1)
2128 FREE_NULL_LIST(part_ptr_list);
2129 } else {
2130 verbose("Invalid partition (%s) for JobId=%u",
2131 err_part, job_id);
2132 xfree(err_part);
2133 /* not fatal error, partition could have been
2134 * removed, reset_job_bitmaps() will clean-up
2135 * this job */
2136 }
2137 }
2138
2139 safe_unpackstr_xmalloc(&name, &name_len, buffer);
2140 safe_unpackstr_xmalloc(&user_name, &name_len, buffer);
2141 safe_unpackstr_xmalloc(&wckey, &name_len, buffer);
2142 safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
2143 safe_unpackstr_xmalloc(&account, &name_len, buffer);
2144 safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer);
2145 safe_unpackstr_xmalloc(&comment, &name_len, buffer);
2146 safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer);
2147 safe_unpackstr_xmalloc(&gres_req, &name_len, buffer);
2148 safe_unpackstr_xmalloc(&gres_used, &name_len, buffer);
2149 safe_unpackstr_xmalloc(&network, &name_len, buffer);
2150 safe_unpackstr_xmalloc(&licenses, &name_len, buffer);
2151 safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
2152 safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer);
2153 safe_unpackstr_xmalloc(&resv_name, &name_len, buffer);
2154 safe_unpackstr_xmalloc(&batch_host, &name_len, buffer);
2155 safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer);
2156 safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer);
2157 safe_unpackstr_xmalloc(&system_comment, &name_len, buffer);
2158
2159 if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer,
2160 protocol_version))
2161 goto unpack_error;
2162 if (unpack_job_resources(&job_resources, buffer,
2163 protocol_version))
2164 goto unpack_error;
2165
2166 safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */
2167 /* fake out the former checkpoint plugin */
2168 {
2169 uint16_t id;
2170 uint32_t size;
2171 safe_unpack16(&id, buffer);
2172 safe_unpack32(&size, buffer);
2173 /* skip past any checkpoint plugin info */
2174 size += get_buf_offset(buffer);
2175 set_buf_offset(buffer, size);
2176 }
2177
2178 safe_unpackstr_array(&spank_job_env, &spank_job_env_size,
2179 buffer);
2180
2181 if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id,
2182 protocol_version) !=
2183 SLURM_SUCCESS)
2184 goto unpack_error;
2185 gres_plugin_job_state_log(gres_list, job_id);
2186
2187 safe_unpack16(&details, buffer);
2188 if ((details == DETAILS_FLAG) &&
2189 (_load_job_details(job_ptr, buffer, protocol_version))) {
2190 job_ptr->job_state = JOB_FAILED;
2191 job_ptr->exit_code = 1;
2192 job_ptr->state_reason = FAIL_SYSTEM;
2193 xfree(job_ptr->state_desc);
2194 job_ptr->end_time = now;
2195 goto unpack_error;
2196 }
2197 safe_unpack16(&step_flag, buffer);
2198
2199 while (step_flag == STEP_FLAG) {
2200 /*
2201 * No need to put these into accounting if they
2202 * haven't been since all information will be
2203 * put in when the job is finished.
2204 */
2205 if ((error_code = load_step_state(job_ptr, buffer,
2206 protocol_version)))
2207 goto unpack_error;
2208 safe_unpack16(&step_flag, buffer);
2209 }
2210 safe_unpack32(&job_ptr->bit_flags, buffer);
2211 job_ptr->bit_flags &= ~BACKFILL_TEST;
2212 job_ptr->bit_flags |= JOB_MEM_SET;
2213 safe_unpackstr_xmalloc(&tres_alloc_str,
2214 &name_len, buffer);
2215 safe_unpackstr_xmalloc(&tres_fmt_alloc_str,
2216 &name_len, buffer);
2217 safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer);
2218 safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer);
2219 safe_unpackstr_xmalloc(&clusters, &name_len, buffer);
2220 if ((error_code = _load_job_fed_details(&job_fed_details,
2221 buffer,
2222 protocol_version)))
2223 goto unpack_error;
2224
2225 safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len,
2226 buffer);
2227
2228 safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len,
2229 buffer);
2230 safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len,
2231 buffer);
2232 safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len,
2233 buffer);
2234 safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len,
2235 buffer);
2236 safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len,
2237 buffer);
2238 safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len,
2239 buffer);
2240 safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len,
2241 buffer);
2242 safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len,
2243 buffer);
2244 } else {
2245 error("%s: protocol_version %hu not supported",
2246 __func__, protocol_version);
2247 goto unpack_error;
2248 }
2249
2250 	/* Don't load "unlinked" job. */
2251 if (job_ptr->job_id == NO_VAL) {
2252 debug("skipping unlinked job");
2253 rc = SLURM_SUCCESS;
2254 goto free_it;
2255 }
2256
2257 if (((job_state & JOB_STATE_BASE) >= JOB_END) ||
2258 (batch_flag > MAX_BATCH_REQUEUE)) {
2259 error("Invalid data for JobId=%u: job_state=%u batch_flag=%u",
2260 job_id, job_state, batch_flag);
2261 goto unpack_error;
2262 }
2263 if (kill_on_node_fail > 1) {
2264 error("Invalid data for JobId=%u: kill_on_node_fail=%u",
2265 job_id, kill_on_node_fail);
2266 goto unpack_error;
2267 }
2268
2269 if ((priority > 1) && (direct_set_prio == 0)) {
2270 highest_prio = MAX(highest_prio, priority);
2271 lowest_prio = MIN(lowest_prio, priority);
2272 }
2273
2274 #if 0
2275 /*
2276 * This is not necessary since the job_id_sequence is checkpointed and
2277 * the jobid will be checked if it's in use in get_next_job_id().
2278 */
2279
2280 /* Base job_id_sequence off of local job id but only if the job
2281 * originated from this cluster -- so that the local job id of a
2282 * different cluster isn't restored here. */
2283 if (!job_fed_details ||
2284 !xstrcmp(job_fed_details->origin_str, slurmctld_conf.cluster_name))
2285 local_job_id = fed_mgr_get_local_id(job_id);
2286 if (job_id_sequence <= local_job_id)
2287 job_id_sequence = local_job_id + 1;
2288 #endif
2289
2290 xfree(job_ptr->tres_alloc_str);
2291 job_ptr->tres_alloc_str = tres_alloc_str;
2292 tres_alloc_str = NULL;
2293
2294 xfree(job_ptr->tres_req_str);
2295 job_ptr->tres_req_str = tres_req_str;
2296 tres_req_str = NULL;
2297
2298 xfree(job_ptr->tres_fmt_alloc_str);
2299 job_ptr->tres_fmt_alloc_str = tres_fmt_alloc_str;
2300 tres_fmt_alloc_str = NULL;
2301
2302 xfree(job_ptr->tres_fmt_req_str);
2303 job_ptr->tres_fmt_req_str = tres_fmt_req_str;
2304 tres_fmt_req_str = NULL;
2305
2306 xfree(job_ptr->account);
2307 job_ptr->account = account;
2308 xstrtolower(job_ptr->account);
2309 account = NULL; /* reused, nothing left to free */
2310 xfree(job_ptr->alloc_node);
2311 job_ptr->alloc_node = alloc_node;
2312 alloc_node = NULL; /* reused, nothing left to free */
2313 job_ptr->alloc_resp_port = alloc_resp_port;
2314 job_ptr->alloc_sid = alloc_sid;
2315 job_ptr->assoc_id = assoc_id;
2316 job_ptr->delay_boot = delay_boot;
2317 xfree(job_ptr->admin_comment);
2318 job_ptr->admin_comment = admin_comment;
2319 admin_comment = NULL; /* reused, nothing left to free */
2320 xfree(job_ptr->system_comment);
2321 job_ptr->system_comment = system_comment;
2322 system_comment = NULL; /* reused, nothing left to free */
2323 xfree(job_ptr->batch_features);
2324 job_ptr->batch_features = batch_features;
2325 batch_features = NULL; /* reused, nothing left to free */
2326 job_ptr->batch_flag = batch_flag;
2327 xfree(job_ptr->batch_host);
2328 job_ptr->batch_host = batch_host;
2329 batch_host = NULL; /* reused, nothing left to free */
2330 xfree(job_ptr->burst_buffer);
2331 job_ptr->burst_buffer = burst_buffer;
2332 burst_buffer = NULL; /* reused, nothing left to free */
2333 xfree(job_ptr->burst_buffer_state);
2334 job_ptr->burst_buffer_state = burst_buffer_state;
2335 burst_buffer_state = NULL; /* reused, nothing left to free */
2336 xfree(job_ptr->comment);
2337 job_ptr->comment = comment;
2338 comment = NULL; /* reused, nothing left to free */
2339 job_ptr->billable_tres = billable_tres;
2340 xfree(job_ptr->gres_alloc);
2341 job_ptr->gres_alloc = gres_alloc;
2342 gres_alloc = NULL; /* reused, nothing left to free */
2343 xfree(job_ptr->gres_req);
2344 job_ptr->gres_req = gres_req;
2345 gres_req = NULL; /* reused, nothing left to free */
2346 xfree(job_ptr->gres_used);
2347 job_ptr->gres_used = gres_used;
2348 gres_used = NULL; /* reused, nothing left to free */
2349 job_ptr->gres_list = gres_list;
2350 job_ptr->site_factor = site_factor;
2351 job_ptr->direct_set_prio = direct_set_prio;
2352 job_ptr->db_index = db_index;
2353 job_ptr->derived_ec = derived_ec;
2354 job_ptr->end_time_exp = end_time_exp;
2355 job_ptr->end_time = end_time;
2356 job_ptr->exit_code = exit_code;
2357 job_ptr->group_id = group_id;
2358 job_ptr->job_state = job_state;
2359 job_ptr->kill_on_node_fail = kill_on_node_fail;
2360 xfree(job_ptr->licenses);
2361 job_ptr->licenses = licenses;
2362 licenses = NULL; /* reused, nothing left to free */
2363 job_ptr->mail_type = mail_type;
2364 xfree(job_ptr->mail_user);
2365 if (mail_user)
2366 job_ptr->mail_user = mail_user;
2367 else
2368 job_ptr->mail_user = _get_mail_user(NULL, user_id);
2369 mail_user = NULL; /* reused, nothing left to free */
2370 xfree(job_ptr->mcs_label);
2371 job_ptr->mcs_label = mcs_label;
2372 mcs_label = NULL; /* reused, nothing left to free */
2373 xfree(job_ptr->name); /* in case duplicate record */
2374 job_ptr->name = name;
2375 name = NULL; /* reused, nothing left to free */
2376 xfree(job_ptr->user_name);
2377 job_ptr->user_name = user_name;
2378 user_name = NULL; /* reused, nothing left to free */
2379 xfree(job_ptr->wckey); /* in case duplicate record */
2380 job_ptr->wckey = wckey;
2381 xstrtolower(job_ptr->wckey);
2382 wckey = NULL; /* reused, nothing left to free */
2383 xfree(job_ptr->network);
2384 job_ptr->network = network;
2385 network = NULL; /* reused, nothing left to free */
2386 job_ptr->next_step_id = next_step_id;
2387 xfree(job_ptr->nodes); /* in case duplicate record */
2388 job_ptr->nodes = nodes;
2389 nodes = NULL; /* reused, nothing left to free */
2390 if (nodes_completing) {
2391 xfree(job_ptr->nodes_completing);
2392 job_ptr->nodes_completing = nodes_completing;
2393 nodes_completing = NULL; /* reused, nothing left to free */
2394 }
2395 job_ptr->other_port = other_port;
2396 job_ptr->power_flags = power_flags;
2397 job_ptr->het_job_id = het_job_id;
2398 xfree(job_ptr->het_job_id_set);
2399 job_ptr->het_job_id_set = het_job_id_set;
2400 het_job_id_set = NULL; /* reused, nothing left to free */
2401 job_ptr->het_job_offset = het_job_offset;
2402 xfree(job_ptr->partition);
2403 job_ptr->partition = partition;
2404 partition = NULL; /* reused, nothing left to free */
2405 job_ptr->part_ptr = part_ptr;
2406 job_ptr->part_ptr_list = part_ptr_list;
2407 job_ptr->pre_sus_time = pre_sus_time;
2408 job_ptr->priority = priority;
2409 job_ptr->qos_id = qos_id;
2410 job_ptr->reboot = reboot;
2411 xfree(job_ptr->resp_host);
2412 job_ptr->resp_host = resp_host;
2413 resp_host = NULL; /* reused, nothing left to free */
2414 job_ptr->resize_time = resize_time;
2415 job_ptr->restart_cnt = restart_cnt;
2416 job_ptr->resv_id = resv_id;
2417 job_ptr->resv_name = resv_name;
2418 resv_name = NULL; /* reused, nothing left to free */
2419 job_ptr->select_jobinfo = select_jobinfo;
2420 job_ptr->job_resrcs = job_resources;
2421 job_ptr->spank_job_env = spank_job_env;
2422 job_ptr->spank_job_env_size = spank_job_env_size;
2423 job_ptr->start_time = start_time;
2424 job_ptr->state_reason = state_reason;
2425 job_ptr->state_reason_prev_db = state_reason_prev_db;
2426 job_ptr->state_desc = state_desc;
2427 state_desc = NULL; /* reused, nothing left to free */
2428 job_ptr->suspend_time = suspend_time;
2429 job_ptr->deadline = deadline;
2430 if (task_id_size != NO_VAL) {
2431 if (!job_ptr->array_recs)
2432 job_ptr->array_recs=xmalloc(sizeof(job_array_struct_t));
2433 FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
2434 xfree(job_ptr->array_recs->task_id_str);
2435 if (task_id_size) {
2436 job_ptr->array_recs->task_id_bitmap =
2437 bit_alloc(task_id_size);
2438 if (task_id_str) {
2439 bit_unfmt_hexmask(
2440 job_ptr->array_recs->task_id_bitmap,
2441 task_id_str);
2442 job_ptr->array_recs->task_id_str = task_id_str;
2443 task_id_str = NULL;
2444 }
2445 job_ptr->array_recs->task_cnt =
2446 bit_set_count(job_ptr->array_recs->
2447 task_id_bitmap);
2448
2449 if (job_ptr->array_recs->task_cnt > 1)
2450 job_count += (job_ptr->array_recs->task_cnt-1);
2451 } else
2452 xfree(task_id_str);
2453 job_ptr->array_recs->array_flags = array_flags;
2454 job_ptr->array_recs->max_run_tasks = max_run_tasks;
2455 job_ptr->array_recs->tot_run_tasks = tot_run_tasks;
2456 job_ptr->array_recs->min_exit_code = min_exit_code;
2457 job_ptr->array_recs->max_exit_code = max_exit_code;
2458 job_ptr->array_recs->tot_comp_tasks = tot_comp_tasks;
2459 }
2460 job_ptr->time_last_active = now;
2461 job_ptr->time_limit = time_limit;
2462 job_ptr->time_min = time_min;
2463 job_ptr->total_cpus = total_cpus;
2464
2465 if (IS_JOB_PENDING(job_ptr))
2466 job_ptr->node_cnt_wag = total_nodes;
2467 else
2468 job_ptr->total_nodes = total_nodes;
2469
2470 job_ptr->cpu_cnt = cpu_cnt;
2471 job_ptr->tot_sus_time = tot_sus_time;
2472 job_ptr->last_sched_eval = last_sched_eval;
2473 job_ptr->preempt_time = preempt_time;
2474 job_ptr->user_id = user_id;
2475 job_ptr->wait_all_nodes = wait_all_nodes;
2476 job_ptr->warn_flags = warn_flags;
2477 job_ptr->warn_signal = warn_signal;
2478 job_ptr->warn_time = warn_time;
2479
2480 memcpy(&job_ptr->limit_set, &limit_set,
2481 sizeof(acct_policy_limit_set_t));
2482 limit_set.tres = NULL;
2483
2484 job_ptr->req_switch = req_switch;
2485 job_ptr->wait4switch = wait4switch;
2486 job_ptr->profile = profile;
2487 job_ptr->db_flags = db_flags;
2488 /*
2489 	 * This always needs to be initialized to "true". The select
2490 * plugin will deal with it every time it goes through the
2491 * logic if req_switch or wait4switch are set.
2492 */
2493 job_ptr->best_switch = true;
2494 job_ptr->start_protocol_ver = start_protocol_ver;
2495
2496 _add_job_hash(job_ptr);
2497 _add_job_array_hash(job_ptr);
2498
2499 memset(&assoc_rec, 0, sizeof(assoc_rec));
2500
2501 /*
2502 	 * For speed and accuracy we will first see if we once had an
2503 	 * association record. If not look for it by
2504 	 * account, partition, user_id.
2505 */
2506 if (job_ptr->assoc_id)
2507 assoc_rec.id = job_ptr->assoc_id;
2508 else {
2509 assoc_rec.acct = job_ptr->account;
2510 if (job_ptr->part_ptr)
2511 assoc_rec.partition = job_ptr->part_ptr->name;
2512 assoc_rec.uid = job_ptr->user_id;
2513 }
2514
2515 assoc_mgr_lock(&locks);
2516 if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
2517 accounting_enforce,
2518 &job_ptr->assoc_ptr, true) &&
2519 (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
2520 && (!IS_JOB_FINISHED(job_ptr))) {
2521 _job_fail_account(job_ptr, __func__);
2522 } else {
2523 job_ptr->assoc_id = assoc_rec.id;
2524 info("Recovered %pJ Assoc=%u", job_ptr, job_ptr->assoc_id);
2525
2526 if (job_ptr->state_reason == FAIL_ACCOUNT) {
2527 job_ptr->state_reason = WAIT_NO_REASON;
2528 xfree(job_ptr->state_desc);
2529 }
2530
2531 /* make sure we have started this job in accounting */
2532 if (!job_ptr->db_index) {
2533 debug("starting %pJ in accounting", job_ptr);
2534 if (!with_slurmdbd)
2535 jobacct_storage_g_job_start(
2536 acct_db_conn, job_ptr);
2537 if (slurmctld_init_db
2538 && IS_JOB_SUSPENDED(job_ptr)) {
2539 jobacct_storage_g_job_suspend(acct_db_conn,
2540 job_ptr);
2541 }
2542 }
2543 /* make sure we have this job completed in the database */
2544 if (IS_JOB_FINISHED(job_ptr)) {
2545 if (slurmctld_init_db &&
2546 !(job_ptr->bit_flags & TRES_STR_CALC) &&
2547 job_ptr->tres_alloc_cnt &&
2548 (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
2549 set_job_tres_alloc_str(job_ptr, false);
2550 jobacct_storage_g_job_complete(
2551 acct_db_conn, job_ptr);
2552 job_finished = 1;
2553 }
2554 }
2555
2556 if (!job_finished && job_ptr->qos_id &&
2557 (job_ptr->state_reason != FAIL_ACCOUNT)) {
2558 memset(&qos_rec, 0, sizeof(qos_rec));
2559 qos_rec.id = job_ptr->qos_id;
2560 job_ptr->qos_ptr = _determine_and_validate_qos(
2561 job_ptr->resv_name, job_ptr->assoc_ptr,
2562 job_ptr->limit_set.qos, &qos_rec,
2563 &qos_error, true, LOG_LEVEL_ERROR);
2564 if ((qos_error != SLURM_SUCCESS) && !job_ptr->limit_set.qos) {
2565 job_fail_qos(job_ptr, __func__);
2566 } else {
2567 job_ptr->qos_id = qos_rec.id;
2568 if (job_ptr->state_reason == FAIL_QOS) {
2569 job_ptr->state_reason = WAIT_NO_REASON;
2570 xfree(job_ptr->state_desc);
2571 }
2572 }
2573 }
2574
2575 /*
2576 * do this after the format string just in case for some
2577 * reason the tres_alloc_str is NULL but not the fmt_str
2578 */
2579 if (job_ptr->tres_alloc_str)
2580 assoc_mgr_set_tres_cnt_array(
2581 &job_ptr->tres_alloc_cnt, job_ptr->tres_alloc_str,
2582 0, true);
2583 else
2584 job_set_alloc_tres(job_ptr, true);
2585
2586 if (job_ptr->tres_req_str)
2587 assoc_mgr_set_tres_cnt_array(
2588 &job_ptr->tres_req_cnt, job_ptr->tres_req_str, 0, true);
2589 else
2590 job_set_req_tres(job_ptr, true);
2591 assoc_mgr_unlock(&locks);
2592
2593 build_node_details(job_ptr, false); /* set node_addr */
2594 gres_build_job_details(job_ptr->gres_list,
2595 &job_ptr->gres_detail_cnt,
2596 &job_ptr->gres_detail_str,
2597 &job_ptr->gres_used);
2598 job_ptr->clusters = clusters;
2599 job_ptr->fed_details = job_fed_details;
2600 return SLURM_SUCCESS;
2601
2602 unpack_error:
2603 error("Incomplete job record");
2604 rc = SLURM_ERROR;
2605
2606 free_it:
2607 xfree(alloc_node);
2608 xfree(account);
2609 xfree(admin_comment);
2610 xfree(batch_features);
2611 xfree(batch_host);
2612 xfree(burst_buffer);
2613 xfree(clusters);
2614 xfree(comment);
2615 xfree(gres_alloc);
2616 xfree(gres_req);
2617 xfree(gres_used);
2618 xfree(het_job_id_set);
2619 free_job_fed_details(&job_fed_details);
2620 free_job_resources(&job_resources);
2621 xfree(resp_host);
2622 xfree(licenses);
2623 xfree(limit_set.tres);
2624 xfree(mail_user);
2625 xfree(mcs_label);
2626 xfree(name);
2627 xfree(nodes);
2628 xfree(nodes_completing);
2629 xfree(partition);
2630 FREE_NULL_LIST(part_ptr_list);
2631 xfree(resv_name);
2632 for (i = 0; i < spank_job_env_size; i++)
2633 xfree(spank_job_env[i]);
2634 xfree(spank_job_env);
2635 xfree(state_desc);
2636 xfree(system_comment);
2637 xfree(task_id_str);
2638 xfree(tres_alloc_str);
2639 xfree(tres_fmt_alloc_str);
2640 xfree(tres_fmt_req_str);
2641 xfree(tres_req_str);
2642 xfree(user_name);
2643 xfree(wckey);
2644 select_g_select_jobinfo_free(select_jobinfo);
2645 if (job_ptr) {
2646 if (job_ptr->job_id == 0)
2647 job_ptr->job_id = NO_VAL;
2648 purge_job_record(job_ptr->job_id);
2649 }
2650 for (i = 0; i < pelog_env_size; i++)
2651 xfree(pelog_env[i]);
2652 xfree(pelog_env);
2653
2654 return rc;
2655 }
2656
2657 /*
2658 * _dump_job_details - dump the state of a specific job details to
2659 * a buffer
2660 * IN detail_ptr - pointer to job details for which information is requested
2661 * IN/OUT buffer - location to store data, pointers automatically advanced
2662 */
void _dump_job_details(struct job_details *detail_ptr, Buf buffer)
{
	/*
	 * Some job fields can change in the course of scheduling, so we
	 * report the original values supplied by the user rather than
	 * an intermediate value that might be set by our scheduling
	 * logic (e.g. to enforce a partition, association or QOS limit).
	 *
	 * Fields subject to change and their original values are as follows:
	 * min_cpus		orig_min_cpus
	 * max_cpus		orig_max_cpus
	 * cpus_per_task 	orig_cpus_per_task
	 * pn_min_cpus		orig_pn_min_cpus
	 * pn_min_memory	orig_pn_min_memory
	 * dependency		orig_dependency
	 */
	/*
	 * NOTE: The pack order below IS the on-disk state file format.
	 * _load_job_details() must unpack fields in exactly this order;
	 * any change here requires a matching change there plus a
	 * protocol version bump.
	 */
	pack32(detail_ptr->orig_min_cpus, buffer);	/* subject to change */
	pack32(detail_ptr->orig_max_cpus, buffer);	/* subject to change */
	pack32(detail_ptr->min_nodes, buffer);
	pack32(detail_ptr->max_nodes, buffer);
	pack32(detail_ptr->num_tasks, buffer);

	packstr(detail_ptr->acctg_freq, buffer);
	pack16(detail_ptr->contiguous, buffer);
	pack16(detail_ptr->core_spec, buffer);
	pack16(detail_ptr->orig_cpus_per_task, buffer);	/* subject to change */
	pack32(detail_ptr->nice, buffer);
	pack16(detail_ptr->ntasks_per_node, buffer);
	pack16(detail_ptr->requeue, buffer);
	pack32(detail_ptr->task_dist, buffer);

	pack8(detail_ptr->share_res, buffer);
	pack8(detail_ptr->whole_node, buffer);

	packstr(detail_ptr->cpu_bind, buffer);
	pack16(detail_ptr->cpu_bind_type, buffer);
	packstr(detail_ptr->mem_bind, buffer);
	pack16(detail_ptr->mem_bind_type, buffer);
	pack16(detail_ptr->plane_size, buffer);

	pack8(detail_ptr->open_mode, buffer);
	pack8(detail_ptr->overcommit, buffer);
	pack8(detail_ptr->prolog_running, buffer);

	pack32(detail_ptr->orig_pn_min_cpus, buffer);	/* subject to change */
	pack64(detail_ptr->orig_pn_min_memory, buffer);	/* subject to change */
	pack32(detail_ptr->pn_min_tmp_disk, buffer);
	pack32(detail_ptr->cpu_freq_min, buffer);
	pack32(detail_ptr->cpu_freq_max, buffer);
	pack32(detail_ptr->cpu_freq_gov, buffer);
	pack_time(detail_ptr->begin_time, buffer);
	pack_time(detail_ptr->accrue_time, buffer);
	pack_time(detail_ptr->submit_time, buffer);

	packstr(detail_ptr->req_nodes, buffer);
	packstr(detail_ptr->exc_nodes, buffer);
	packstr(detail_ptr->features, buffer);
	packstr(detail_ptr->cluster_features, buffer);
	pack_dep_list(detail_ptr->depend_list, buffer, SLURM_PROTOCOL_VERSION);
	packstr(detail_ptr->dependency, buffer);
	packstr(detail_ptr->orig_dependency, buffer);	/* subject to change */

	packstr(detail_ptr->std_err, buffer);
	packstr(detail_ptr->std_in, buffer);
	packstr(detail_ptr->std_out, buffer);
	packstr(detail_ptr->work_dir, buffer);

	pack_multi_core_data(detail_ptr->mc_ptr, buffer,
			     SLURM_PROTOCOL_VERSION);
	/* argv/env_sup carry their counts; NULL arrays pack as count 0 */
	packstr_array(detail_ptr->argv, detail_ptr->argc, buffer);
	packstr_array(detail_ptr->env_sup, detail_ptr->env_cnt, buffer);
}
2735
2736 /* _load_job_details - Unpack a job details information from buffer */
static int _load_job_details(job_record_t *job_ptr, Buf buffer,
			     uint16_t protocol_version)
{
	/*
	 * Unpack a job's details from buffer (state file or RPC) into
	 * job_ptr->details, which must already be allocated. Fields are
	 * read in the exact order written by _dump_job_details(). On any
	 * unpack failure all locally-allocated strings are released and
	 * SLURM_ERROR is returned; job_ptr->details is left untouched in
	 * that case. On success all previous detail strings/lists are
	 * freed and replaced, and SLURM_SUCCESS is returned (ownership of
	 * every unpacked allocation transfers to job_ptr->details).
	 */
	char *acctg_freq = NULL, *req_nodes = NULL, *exc_nodes = NULL;
	char *features = NULL, *cpu_bind = NULL, *dependency = NULL;
	/*
	 * BUG FIX: mem_bind was previously declared without an
	 * initializer. An unpack failure before mem_bind is read jumps to
	 * unpack_error, where xfree(mem_bind) would free an indeterminate
	 * pointer (undefined behavior). Initialize it to NULL like every
	 * other string here.
	 */
	char *orig_dependency = NULL, *mem_bind = NULL;
	char *cluster_features = NULL;
	char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL;
	char **argv = (char **) NULL, **env_sup = (char **) NULL;
	uint32_t min_nodes, max_nodes;
	uint32_t min_cpus = 1, max_cpus = NO_VAL;
	uint32_t pn_min_cpus, pn_min_tmp_disk;
	uint64_t pn_min_memory;
	uint32_t cpu_freq_min = NO_VAL;
	uint32_t cpu_freq_max = NO_VAL;
	uint32_t cpu_freq_gov = NO_VAL, nice = 0;
	uint32_t num_tasks, name_len, argc = 0, env_cnt = 0, task_dist;
	uint16_t contiguous, core_spec = NO_VAL16;
	uint16_t ntasks_per_node, cpus_per_task, requeue;
	uint16_t cpu_bind_type, mem_bind_type, plane_size;
	uint8_t open_mode, overcommit, prolog_running;
	uint8_t share_res, whole_node;
	time_t begin_time, accrue_time = 0, submit_time;
	int i;
	List depend_list = NULL;
	/* Defensive NULL init; set only by unpack_multi_core_data() */
	multi_core_data_t *mc_ptr = NULL;

	/* unpack the job's details from the buffer */
	if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
		safe_unpack32(&min_cpus, buffer);
		safe_unpack32(&max_cpus, buffer);
		safe_unpack32(&min_nodes, buffer);
		safe_unpack32(&max_nodes, buffer);
		safe_unpack32(&num_tasks, buffer);

		safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer);
		safe_unpack16(&contiguous, buffer);
		safe_unpack16(&core_spec, buffer);
		safe_unpack16(&cpus_per_task, buffer);
		safe_unpack32(&nice, buffer);
		safe_unpack16(&ntasks_per_node, buffer);
		safe_unpack16(&requeue, buffer);
		safe_unpack32(&task_dist, buffer);

		safe_unpack8(&share_res, buffer);
		safe_unpack8(&whole_node, buffer);

		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
		safe_unpack16(&cpu_bind_type, buffer);
		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
		safe_unpack16(&mem_bind_type, buffer);
		safe_unpack16(&plane_size, buffer);

		safe_unpack8(&open_mode, buffer);
		safe_unpack8(&overcommit, buffer);
		safe_unpack8(&prolog_running, buffer);

		safe_unpack32(&pn_min_cpus, buffer);
		safe_unpack64(&pn_min_memory, buffer);
		safe_unpack32(&pn_min_tmp_disk, buffer);
		safe_unpack32(&cpu_freq_min, buffer);
		safe_unpack32(&cpu_freq_max, buffer);
		safe_unpack32(&cpu_freq_gov, buffer);
		safe_unpack_time(&begin_time, buffer);
		safe_unpack_time(&accrue_time, buffer);
		safe_unpack_time(&submit_time, buffer);

		safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&features, &name_len, buffer);
		safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer);
		unpack_dep_list(&depend_list, buffer, protocol_version);
		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
		safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer);

		safe_unpackstr_xmalloc(&err, &name_len, buffer);
		safe_unpackstr_xmalloc(&in, &name_len, buffer);
		safe_unpackstr_xmalloc(&out, &name_len, buffer);
		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);

		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
			goto unpack_error;
		safe_unpackstr_array(&argv, &argc, buffer);
		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/*
		 * Older format: no dependency list; carries two obsolete
		 * checkpoint/restart directory strings that are discarded.
		 */
		char *temp_str;

		safe_unpack32(&min_cpus, buffer);
		safe_unpack32(&max_cpus, buffer);
		safe_unpack32(&min_nodes, buffer);
		safe_unpack32(&max_nodes, buffer);
		safe_unpack32(&num_tasks, buffer);

		safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer);
		safe_unpack16(&contiguous, buffer);
		safe_unpack16(&core_spec, buffer);
		safe_unpack16(&cpus_per_task, buffer);
		safe_unpack32(&nice, buffer);
		safe_unpack16(&ntasks_per_node, buffer);
		safe_unpack16(&requeue, buffer);
		safe_unpack32(&task_dist, buffer);

		safe_unpack8(&share_res, buffer);
		safe_unpack8(&whole_node, buffer);

		safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer);
		safe_unpack16(&cpu_bind_type, buffer);
		safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer);
		safe_unpack16(&mem_bind_type, buffer);
		safe_unpack16(&plane_size, buffer);

		safe_unpack8(&open_mode, buffer);
		safe_unpack8(&overcommit, buffer);
		safe_unpack8(&prolog_running, buffer);

		safe_unpack32(&pn_min_cpus, buffer);
		safe_unpack64(&pn_min_memory, buffer);
		safe_unpack32(&pn_min_tmp_disk, buffer);
		safe_unpack32(&cpu_freq_min, buffer);
		safe_unpack32(&cpu_freq_max, buffer);
		safe_unpack32(&cpu_freq_gov, buffer);
		safe_unpack_time(&begin_time, buffer);
		safe_unpack_time(&accrue_time, buffer);
		safe_unpack_time(&submit_time, buffer);

		safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
		safe_unpackstr_xmalloc(&features, &name_len, buffer);
		safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer);
		safe_unpackstr_xmalloc(&dependency, &name_len, buffer);
		safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer);

		safe_unpackstr_xmalloc(&err, &name_len, buffer);
		safe_unpackstr_xmalloc(&in, &name_len, buffer);
		safe_unpackstr_xmalloc(&out, &name_len, buffer);
		safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
		safe_unpackstr_xmalloc(&temp_str, &name_len, buffer);
		xfree(temp_str); /* was ckpt_dir */
		safe_unpackstr_xmalloc(&temp_str, &name_len, buffer);
		xfree(temp_str); /* was restart_dir */

		if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version))
			goto unpack_error;
		safe_unpackstr_array(&argv, &argc, buffer);
		safe_unpackstr_array(&env_sup, &env_cnt, buffer);
	} else {
		error("_load_job_details: protocol_version "
		      "%hu not supported", protocol_version);
		goto unpack_error;
	}

	/* validity test as possible */
	if (contiguous > 1) {
		error("Invalid data for %pJ: contiguous=%u",
		      job_ptr, contiguous);
		goto unpack_error;
	}
	if ((requeue > 1) || (overcommit > 1)) {
		error("Invalid data for %pJ: requeue=%u overcommit=%u",
		      job_ptr, requeue, overcommit);
		goto unpack_error;
	}
	if (prolog_running > 4) {
		error("Invalid data for %pJ: prolog_running=%u",
		      job_ptr, prolog_running);
		goto unpack_error;
	}

	/* free any left-over detail data */
	xfree(job_ptr->details->acctg_freq);
	for (i = 0; i < job_ptr->details->argc; i++)
		xfree(job_ptr->details->argv[i]);
	xfree(job_ptr->details->argv);
	xfree(job_ptr->details->cpu_bind);
	FREE_NULL_LIST(job_ptr->details->depend_list);
	xfree(job_ptr->details->dependency);
	xfree(job_ptr->details->orig_dependency);
	xfree(job_ptr->details->std_err);
	for (i = 0; i < job_ptr->details->env_cnt; i++)
		xfree(job_ptr->details->env_sup[i]);
	xfree(job_ptr->details->env_sup);
	xfree(job_ptr->details->exc_nodes);
	xfree(job_ptr->details->features);
	xfree(job_ptr->details->cluster_features);
	xfree(job_ptr->details->std_in);
	xfree(job_ptr->details->mem_bind);
	xfree(job_ptr->details->std_out);
	xfree(job_ptr->details->req_nodes);
	xfree(job_ptr->details->work_dir);

	/* now put the details into the job record */
	job_ptr->details->acctg_freq = acctg_freq;
	job_ptr->details->argc = argc;
	job_ptr->details->argv = argv;
	job_ptr->details->accrue_time = accrue_time;
	job_ptr->details->begin_time = begin_time;
	job_ptr->details->contiguous = contiguous;
	job_ptr->details->core_spec = core_spec;
	job_ptr->details->cpu_bind = cpu_bind;
	job_ptr->details->cpu_bind_type = cpu_bind_type;
	job_ptr->details->cpu_freq_min = cpu_freq_min;
	job_ptr->details->cpu_freq_max = cpu_freq_max;
	job_ptr->details->cpu_freq_gov = cpu_freq_gov;
	/* NO_VAL16 means "not specified" on the wire; default to 1 */
	if (cpus_per_task != NO_VAL16)
		job_ptr->details->cpus_per_task = cpus_per_task;
	else
		job_ptr->details->cpus_per_task = 1;
	job_ptr->details->orig_cpus_per_task = cpus_per_task;
	job_ptr->details->depend_list = depend_list;
	job_ptr->details->dependency = dependency;
	job_ptr->details->orig_dependency = orig_dependency;
	job_ptr->details->env_cnt = env_cnt;
	job_ptr->details->env_sup = env_sup;
	job_ptr->details->std_err = err;
	job_ptr->details->exc_nodes = exc_nodes;
	job_ptr->details->features = features;
	job_ptr->details->cluster_features = cluster_features;
	job_ptr->details->std_in = in;
	job_ptr->details->pn_min_cpus = pn_min_cpus;
	job_ptr->details->orig_pn_min_cpus = pn_min_cpus;
	job_ptr->details->pn_min_memory = pn_min_memory;
	job_ptr->details->orig_pn_min_memory = pn_min_memory;
	job_ptr->details->pn_min_tmp_disk = pn_min_tmp_disk;
	job_ptr->details->max_cpus = max_cpus;
	job_ptr->details->orig_max_cpus = max_cpus;
	job_ptr->details->max_nodes = max_nodes;
	job_ptr->details->mc_ptr = mc_ptr;
	job_ptr->details->mem_bind = mem_bind;
	job_ptr->details->mem_bind_type = mem_bind_type;
	job_ptr->details->min_cpus = min_cpus;
	job_ptr->details->orig_min_cpus = min_cpus;
	job_ptr->details->min_nodes = min_nodes;
	job_ptr->details->nice = nice;
	job_ptr->details->ntasks_per_node = ntasks_per_node;
	job_ptr->details->num_tasks = num_tasks;
	job_ptr->details->open_mode = open_mode;
	job_ptr->details->std_out = out;
	job_ptr->details->overcommit = overcommit;
	job_ptr->details->plane_size = plane_size;
	job_ptr->details->prolog_running = prolog_running;
	job_ptr->details->req_nodes = req_nodes;
	job_ptr->details->requeue = requeue;
	job_ptr->details->share_res = share_res;
	job_ptr->details->submit_time = submit_time;
	job_ptr->details->task_dist = task_dist;
	job_ptr->details->whole_node = whole_node;
	job_ptr->details->work_dir = work_dir;

	return SLURM_SUCCESS;

unpack_error:

/*	for (i=0; i<argc; i++)
	xfree(argv[i]);  Don't trust this on unpack error */
	xfree(acctg_freq);
	xfree(argv);
	xfree(cpu_bind);
	xfree(dependency);
	xfree(orig_dependency);
/*	for (i=0; i<env_cnt; i++)
	xfree(env_sup[i]);  Don't trust this on unpack error */
	xfree(env_sup);
	xfree(err);
	xfree(exc_nodes);
	xfree(features);
	xfree(cluster_features);
	xfree(in);
	xfree(mem_bind);
	xfree(out);
	xfree(req_nodes);
	xfree(work_dir);
	/*
	 * NOTE(review): mc_ptr and depend_list are not released here; if
	 * an unpack fails after they are populated they leak. Confirm
	 * whether a destructor is available before adding cleanup.
	 */
	return SLURM_ERROR;
}
3009
3010 /* _add_job_hash - add a job hash entry for given job record, job_id must
3011 * already be set
3012 * IN job_ptr - pointer to job record
3013 * Globals: hash table updated
3014 */
static void _add_job_hash(job_record_t *job_ptr)
{
	/* Insert at the head of this bucket's singly-linked chain. */
	int idx = JOB_HASH_INX(job_ptr->job_id);

	job_ptr->job_next = job_hash[idx];
	job_hash[idx] = job_ptr;
}
3023
3024 /* _remove_job_hash - remove a job hash entry for given job record, job_id must
3025 * already be set
3026 * IN job_ptr - pointer to job record
3027 * IN type - which hash to work with
3028 * Globals: hash table updated
3029 */
_remove_job_hash(job_record_t * job_entry,job_hash_type_t type)3030 static void _remove_job_hash(job_record_t *job_entry, job_hash_type_t type)
3031 {
3032 job_record_t *job_ptr, **job_pptr;
3033
3034 xassert(job_entry);
3035
3036 switch (type) {
3037 case JOB_HASH_JOB:
3038 job_pptr = &job_hash[JOB_HASH_INX(job_entry->job_id)];
3039 break;
3040 case JOB_HASH_ARRAY_JOB:
3041 job_pptr = &job_array_hash_j[
3042 JOB_HASH_INX(job_entry->array_job_id)];
3043 break;
3044 case JOB_HASH_ARRAY_TASK:
3045 job_pptr = &job_array_hash_t[
3046 JOB_ARRAY_HASH_INX(job_entry->array_job_id,
3047 job_entry->array_task_id)];
3048 break;
3049 default:
3050 fatal("%s: unknown job_hash_type_t %d", __func__, type);
3051 return;
3052 }
3053
3054 while ((job_pptr != NULL) && (*job_pptr != NULL) &&
3055 ((job_ptr = *job_pptr) != job_entry)) {
3056 xassert(job_ptr->magic == JOB_MAGIC);
3057 switch (type) {
3058 case JOB_HASH_JOB:
3059 job_pptr = &job_ptr->job_next;
3060 break;
3061 case JOB_HASH_ARRAY_JOB:
3062 job_pptr = &job_ptr->job_array_next_j;
3063 break;
3064 case JOB_HASH_ARRAY_TASK:
3065 job_pptr = &job_ptr->job_array_next_t;
3066 break;
3067 }
3068 }
3069
3070 if (job_pptr == NULL || *job_pptr == NULL) {
3071 if (job_entry->job_id == NO_VAL)
3072 return;
3073
3074 switch (type) {
3075 case JOB_HASH_JOB:
3076 error("%s: Could not find hash entry for JobId=%u",
3077 __func__, job_entry->job_id);
3078 break;
3079 case JOB_HASH_ARRAY_JOB:
3080 error("%s: job array hash error %u", __func__,
3081 job_entry->array_job_id);
3082 break;
3083 case JOB_HASH_ARRAY_TASK:
3084 error("%s: job array, task ID hash error %u_%u",
3085 __func__,
3086 job_entry->array_job_id,
3087 job_entry->array_task_id);
3088 break;
3089 }
3090 return;
3091 }
3092
3093 switch (type) {
3094 case JOB_HASH_JOB:
3095 *job_pptr = job_entry->job_next;
3096 job_entry->job_next = NULL;
3097 break;
3098 case JOB_HASH_ARRAY_JOB:
3099 *job_pptr = job_entry->job_array_next_j;
3100 job_entry->job_array_next_j = NULL;
3101 break;
3102 case JOB_HASH_ARRAY_TASK:
3103 *job_pptr = job_entry->job_array_next_t;
3104 job_entry->job_array_next_t = NULL;
3105 break;
3106 }
3107 }
3108
3109 /* _add_job_array_hash - add a job hash entry for given job record,
3110 * array_job_id and array_task_id must already be set
3111 * IN job_ptr - pointer to job record
3112 * Globals: hash table updated
3113 */
void _add_job_array_hash(job_record_t *job_ptr)
{
	int idx;

	if (job_ptr->array_task_id == NO_VAL)
		return;		/* Not a job array */

	/* Chain keyed on array_job_id alone */
	idx = JOB_HASH_INX(job_ptr->array_job_id);
	job_ptr->job_array_next_j = job_array_hash_j[idx];
	job_array_hash_j[idx] = job_ptr;

	/* Chain keyed on (array_job_id, array_task_id) */
	idx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,
				 job_ptr->array_task_id);
	job_ptr->job_array_next_t = job_array_hash_t[idx];
	job_array_hash_t[idx] = job_ptr;
}
3129
3130 /* For the job array data structure, build the string representation of the
3131 * bitmap.
3132 * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */
extern void build_array_str(job_record_t *job_ptr)
{
	job_array_struct_t *array_recs = job_ptr->array_recs;

	/* Nothing to build unless this is a meta record with pending
	 * tasks and no string representation yet. */
	if (!array_recs || array_recs->task_id_str)
		return;
	if (!array_recs->task_id_bitmap)
		return;
	if (job_ptr->array_task_id != NO_VAL)
		return;
	if (bit_ffs(array_recs->task_id_bitmap) == -1)
		return;		/* no bits set: nothing pending */

	array_recs->task_id_str = bit_fmt_hexmask(array_recs->task_id_bitmap);

	/* While it is efficient to set the db_index to 0 here
	 * to get the database to update the record for
	 * pending tasks it also creates a window in which if
	 * the association id is changed (different account or
	 * partition) instead of returning the previous
	 * db_index (expected) it would create a new one
	 * leaving the other orphaned. Setting the job_state
	 * sets things up so the db_index isn't lost but the
	 * start message is still sent to get the desired behavior. */

	/* Here we set the JOB_UPDATE_DB flag so we resend the start of the
	 * job updating the array task string and count of pending
	 * jobs. This is faster than sending the start again since
	 * this could happen many times (like lots of array elements
	 * starting at once) instead of just ever so often.
	 */
	if (job_ptr->db_index)
		job_ptr->job_state |= JOB_UPDATE_DB;
}
3166
3167 /* Return true if ALL tasks of specific array job ID are complete */
test_job_array_complete(uint32_t array_job_id)3168 extern bool test_job_array_complete(uint32_t array_job_id)
3169 {
3170 job_record_t *job_ptr;
3171 int inx;
3172
3173 job_ptr = find_job_record(array_job_id);
3174 if (job_ptr) {
3175 if (!IS_JOB_COMPLETE(job_ptr))
3176 return false;
3177 if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code)
3178 return false;
3179 }
3180
3181 /* Need to test individual job array records */
3182 inx = JOB_HASH_INX(array_job_id);
3183 job_ptr = job_array_hash_j[inx];
3184 while (job_ptr) {
3185 if (job_ptr->array_job_id == array_job_id) {
3186 if (!IS_JOB_COMPLETE(job_ptr))
3187 return false;
3188 }
3189 job_ptr = job_ptr->job_array_next_j;
3190 }
3191 return true;
3192 }
3193
3194 /* Return true if ALL tasks of specific array job ID are completed */
test_job_array_completed(uint32_t array_job_id)3195 extern bool test_job_array_completed(uint32_t array_job_id)
3196 {
3197 job_record_t *job_ptr;
3198 int inx;
3199
3200 job_ptr = find_job_record(array_job_id);
3201 if (job_ptr) {
3202 if (!IS_JOB_COMPLETED(job_ptr))
3203 return false;
3204 }
3205
3206 /* Need to test individual job array records */
3207 inx = JOB_HASH_INX(array_job_id);
3208 job_ptr = job_array_hash_j[inx];
3209 while (job_ptr) {
3210 if (job_ptr->array_job_id == array_job_id) {
3211 if (!IS_JOB_COMPLETED(job_ptr))
3212 return false;
3213 }
3214 job_ptr = job_ptr->job_array_next_j;
3215 }
3216 return true;
3217 }
3218
3219 /*
3220 * Return true if ALL tasks of specific array job ID are completed AND
3221 * all except for the head job have been purged.
3222 */
_test_job_array_purged(uint32_t array_job_id)3223 extern bool _test_job_array_purged(uint32_t array_job_id)
3224 {
3225 job_record_t *job_ptr, *head_job_ptr;
3226 int inx;
3227
3228 head_job_ptr = find_job_record(array_job_id);
3229 if (head_job_ptr) {
3230 if (!IS_JOB_COMPLETED(head_job_ptr))
3231 return false;
3232 }
3233
3234 /* Need to test individual job array records */
3235 inx = JOB_HASH_INX(array_job_id);
3236 job_ptr = job_array_hash_j[inx];
3237 while (job_ptr) {
3238 if ((job_ptr->array_job_id == array_job_id) &&
3239 (job_ptr != head_job_ptr)) {
3240 return false;
3241 }
3242 job_ptr = job_ptr->job_array_next_j;
3243 }
3244 return true;
3245 }
3246
3247 /* Return true if ALL tasks of specific array job ID are finished */
test_job_array_finished(uint32_t array_job_id)3248 extern bool test_job_array_finished(uint32_t array_job_id)
3249 {
3250 job_record_t *job_ptr;
3251 int inx;
3252
3253 job_ptr = find_job_record(array_job_id);
3254 if (job_ptr) {
3255 if (!IS_JOB_FINISHED(job_ptr))
3256 return false;
3257 }
3258
3259 /* Need to test individual job array records */
3260 inx = JOB_HASH_INX(array_job_id);
3261 job_ptr = job_array_hash_j[inx];
3262 while (job_ptr) {
3263 if (job_ptr->array_job_id == array_job_id) {
3264 if (!IS_JOB_FINISHED(job_ptr))
3265 return false;
3266 }
3267 job_ptr = job_ptr->job_array_next_j;
3268 }
3269
3270 return true;
3271 }
3272
3273 /* Return true if ANY tasks of specific array job ID are pending */
test_job_array_pending(uint32_t array_job_id)3274 extern bool test_job_array_pending(uint32_t array_job_id)
3275 {
3276 job_record_t *job_ptr;
3277 int inx;
3278
3279 job_ptr = find_job_record(array_job_id);
3280 if (job_ptr) {
3281 if (IS_JOB_PENDING(job_ptr))
3282 return true;
3283 if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
3284 return true;
3285 }
3286
3287 /* Need to test individual job array records */
3288 inx = JOB_HASH_INX(array_job_id);
3289 job_ptr = job_array_hash_j[inx];
3290 while (job_ptr) {
3291 if (job_ptr->array_job_id == array_job_id) {
3292 if (IS_JOB_PENDING(job_ptr))
3293 return true;
3294 }
3295 job_ptr = job_ptr->job_array_next_j;
3296 }
3297 return false;
3298 }
3299
3300 /* For a given job ID return the number of PENDING tasks which have their
3301 * own separate job_record (do not count tasks in pending META job record) */
extern int num_pending_job_array_tasks(uint32_t array_job_id)
{
	job_record_t *job_ptr;
	int count = 0;

	/* Count only split-off records; the meta record is excluded */
	for (job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)];
	     job_ptr; job_ptr = job_ptr->job_array_next_j) {
		if ((job_ptr->array_job_id == array_job_id) &&
		    IS_JOB_PENDING(job_ptr))
			count++;
	}

	return count;
}
3318
3319 /*
3320 * find_job_array_rec - return a pointer to the job record with the given
3321 * array_job_id/array_task_id
3322 * IN job_id - requested job's id
3323 * IN array_task_id - requested job's task id,
3324 * NO_VAL if none specified (i.e. not a job array)
3325 * INFINITE return any task for specified job id
3326 * RET pointer to the job's record, NULL on error
3327 */
find_job_array_rec(uint32_t array_job_id,uint32_t array_task_id)3328 extern job_record_t *find_job_array_rec(uint32_t array_job_id,
3329 uint32_t array_task_id)
3330 {
3331 job_record_t *job_ptr, *match_job_ptr = NULL;
3332 int inx;
3333
3334 if (array_task_id == NO_VAL)
3335 return find_job_record(array_job_id);
3336
3337 if (array_task_id == INFINITE) { /* find by job ID */
3338 /* Look for job record with all of the pending tasks */
3339 job_ptr = find_job_record(array_job_id);
3340 if (job_ptr && job_ptr->array_recs &&
3341 (job_ptr->array_job_id == array_job_id))
3342 return job_ptr;
3343
3344 inx = JOB_HASH_INX(array_job_id);
3345 job_ptr = job_array_hash_j[inx];
3346 while (job_ptr) {
3347 if (job_ptr->array_job_id == array_job_id) {
3348 match_job_ptr = job_ptr;
3349 if (!IS_JOB_FINISHED(job_ptr)) {
3350 return job_ptr;
3351 }
3352 }
3353 job_ptr = job_ptr->job_array_next_j;
3354 }
3355 return match_job_ptr;
3356 } else { /* Find specific task ID */
3357 inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id);
3358 job_ptr = job_array_hash_t[inx];
3359 while (job_ptr) {
3360 if ((job_ptr->array_job_id == array_job_id) &&
3361 (job_ptr->array_task_id == array_task_id)) {
3362 return job_ptr;
3363 }
3364 job_ptr = job_ptr->job_array_next_t;
3365 }
3366 /* Look for job record with all of the pending tasks */
3367 job_ptr = find_job_record(array_job_id);
3368 if (job_ptr && job_ptr->array_recs &&
3369 job_ptr->array_recs->task_id_bitmap) {
3370 inx = bit_size(job_ptr->array_recs->task_id_bitmap);
3371 if ((array_task_id < inx) &&
3372 bit_test(job_ptr->array_recs->task_id_bitmap,
3373 array_task_id)) {
3374 return job_ptr;
3375 }
3376 }
3377 return NULL; /* None found */
3378 }
3379 }
3380
3381 /*
3382 * find_het_job_record - return a pointer to the job record with the given ID
3383 * IN job_id - requested job's ID
3384 * in het_job_id - hetjob component ID
3385 * RET pointer to the job's record, NULL on error
3386 */
find_het_job_record(uint32_t job_id,uint32_t het_job_id)3387 extern job_record_t *find_het_job_record(uint32_t job_id, uint32_t het_job_id)
3388 {
3389 job_record_t *het_job_leader, *het_job;
3390 ListIterator iter;
3391
3392 het_job_leader = job_hash[JOB_HASH_INX(job_id)];
3393 while (het_job_leader) {
3394 if (het_job_leader->job_id == job_id)
3395 break;
3396 het_job_leader = het_job_leader->job_next;
3397 }
3398 if (!het_job_leader)
3399 return NULL;
3400 if (het_job_leader->het_job_offset == het_job_id)
3401 return het_job_leader;
3402
3403 if (!het_job_leader->het_job_list)
3404 return NULL;
3405 iter = list_iterator_create(het_job_leader->het_job_list);
3406 while ((het_job = list_next(iter))) {
3407 if (het_job_leader->het_job_id != het_job->het_job_id) {
3408 error("%s: Bad het_job_list for %pJ",
3409 __func__, het_job_leader);
3410 continue;
3411 }
3412 if (het_job->het_job_offset == het_job_id)
3413 break;
3414 }
3415 list_iterator_destroy(iter);
3416
3417 return het_job;
3418 }
3419
3420 /*
3421 * find_job_record - return a pointer to the job record with the given job_id
3422 * IN job_id - requested job's id
3423 * RET pointer to the job's record, NULL on error
3424 */
find_job_record(uint32_t job_id)3425 extern job_record_t *find_job_record(uint32_t job_id)
3426 {
3427 job_record_t *job_ptr;
3428
3429 job_ptr = job_hash[JOB_HASH_INX(job_id)];
3430 while (job_ptr) {
3431 if (job_ptr->job_id == job_id)
3432 return job_ptr;
3433 job_ptr = job_ptr->job_next;
3434 }
3435
3436 return NULL;
3437 }
3438
3439 /* rebuild a job's partition name list based upon the contents of its
3440 * part_ptr_list */
static void _rebuild_part_name_list(job_record_t *job_ptr)
{
	bool active = false, pending = false;
	part_record_t *part;
	ListIterator iter;

	xfree(job_ptr->partition);

	/* Single-partition job: just copy that partition's name */
	if (!job_ptr->part_ptr_list) {
		job_ptr->partition = xstrdup(job_ptr->part_ptr->name);
		last_job_update = time(NULL);
		return;
	}

	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
		/* Active job: list the partition in use first */
		active = true;
		job_ptr->partition = xstrdup(job_ptr->part_ptr->name);
	} else if (IS_JOB_PENDING(job_ptr)) {
		pending = true;
	}

	iter = list_iterator_create(job_ptr->part_ptr_list);
	while ((part = list_next(iter))) {
		if (pending) {
			/* Reset job's one partition to a valid one */
			job_ptr->part_ptr = part;
			pending = false;
		}
		if (active && (part == job_ptr->part_ptr))
			continue;	/* already listed first */
		if (job_ptr->partition)
			xstrcat(job_ptr->partition, ",");
		xstrcat(job_ptr->partition, part->name);
	}
	list_iterator_destroy(iter);
	last_job_update = time(NULL);
}
3477
3478 /*
3479 * Kill job or job step
3480 *
3481 * IN job_step_kill_msg - msg with specs on which job/step to cancel.
3482 * IN uid - uid of user requesting job/step cancel.
3483 */
static int _kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid)
{
	/*
	 * Signal (or cancel, for SIGKILL) either a whole job or one of
	 * its steps on behalf of uid. Takes the job-write lock set for
	 * the duration of the signal RPC, releases it before logging,
	 * and on success queues an asynchronous job state save.
	 * Returns SLURM_SUCCESS or an error code from the signal call.
	 */
	DEF_TIMERS;
	/* Locks: Read config, write job, write node, read fed */
	slurmctld_lock_t job_write_lock = {
		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
	job_record_t *job_ptr;
	int error_code = SLURM_SUCCESS;

	START_TIMER;
	lock_slurmctld(job_write_lock);
	/* Looked up only for logging/tracing; may be NULL */
	job_ptr = find_job_record(job_step_kill_msg->job_id);
	trace_job(job_ptr, __func__, "enter");

	/* do RPC call */
	if (job_step_kill_msg->job_step_id == SLURM_BATCH_SCRIPT) {
		/* NOTE: SLURM_BATCH_SCRIPT == NO_VAL */
		/* Whole-job signal path */
		error_code = job_signal_id(job_step_kill_msg->job_id,
					   job_step_kill_msg->signal,
					   job_step_kill_msg->flags, uid,
					   false);
		unlock_slurmctld(job_write_lock);
		END_TIMER2(__func__);

		/* return result */
		if (error_code) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS)
				info("Signal %u %pJ by UID=%u: %s",
				     job_step_kill_msg->signal, job_ptr, uid,
				     slurm_strerror(error_code));
		} else {
			if (job_step_kill_msg->signal == SIGKILL) {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_STEPS)
					info("%s: Cancel of %pJ by UID=%u, %s",
					     __func__, job_ptr, uid, TIME_STR);
				slurmctld_diag_stats.jobs_canceled++;
			} else {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_STEPS)
					info("%s: Signal %u of %pJ by UID=%u, %s",
					     __func__,
					     job_step_kill_msg->signal,
					     job_ptr, uid, TIME_STR);
			}

			/* Below function provides its own locking */
			schedule_job_save();
		}
	} else {
		/* Single-step signal path */
		error_code = job_step_signal(job_step_kill_msg->job_id,
					     job_step_kill_msg->job_step_id,
					     job_step_kill_msg->signal,
					     job_step_kill_msg->flags,
					     uid);
		unlock_slurmctld(job_write_lock);
		END_TIMER2(__func__);

		/* return result */
		if (error_code) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS)
				info("Signal %u of JobId=%u StepId=%u by UID=%u: %s",
				     job_step_kill_msg->signal,
				     job_step_kill_msg->job_id,
				     job_step_kill_msg->job_step_id, uid,
				     slurm_strerror(error_code));
		} else {
			if (job_step_kill_msg->signal == SIGKILL) {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_STEPS)
					info("%s: Cancel of JobId=%u StepId=%u by UID=%u %s",
					     __func__,
					     job_step_kill_msg->job_id,
					     job_step_kill_msg->job_step_id,
					     uid, TIME_STR);
			} else {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_STEPS)
					info("%s: Signal %u of JobId=%u StepId=%u by UID=%u %s",
					     __func__,
					     job_step_kill_msg->signal,
					     job_step_kill_msg->job_id,
					     job_step_kill_msg->job_step_id,
					     uid, TIME_STR);
			}

			/* Below function provides its own locking */
			schedule_job_save();
		}
	}

	/* NOTE(review): job_ptr is dereferenced by trace_job/%pJ after
	 * the lock is dropped; presumably safe because purge happens on
	 * the same thread model — confirm against trace_job(). */
	trace_job(job_ptr, __func__, "return");
	return error_code;
}
3578
3579 /*
3580 * Kill job or job step
3581 *
3582 * IN job_step_kill_msg - msg with specs on which job/step to cancel.
3583 * IN uid - uid of user requesting job/step cancel.
3584 */
kill_job_step(job_step_kill_msg_t * job_step_kill_msg,uint32_t uid)3585 extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid)
3586 {
3587 /* Locks: Read job */
3588 slurmctld_lock_t job_read_lock = {
3589 NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
3590 job_record_t *job_ptr, *het_job_ptr;
3591 uint32_t *het_job_ids = NULL;
3592 int cnt = 0, i, rc;
3593 int error_code = SLURM_SUCCESS;
3594 ListIterator iter;
3595
3596 lock_slurmctld(job_read_lock);
3597 job_ptr = find_job_record(job_step_kill_msg->job_id);
3598 if (job_ptr && job_ptr->het_job_list &&
3599 (job_step_kill_msg->signal == SIGKILL) &&
3600 (job_step_kill_msg->job_step_id != SLURM_BATCH_SCRIPT)) {
3601 cnt = list_count(job_ptr->het_job_list);
3602 het_job_ids = xcalloc(cnt, sizeof(uint32_t));
3603 i = 0;
3604 iter = list_iterator_create(job_ptr->het_job_list);
3605 while ((het_job_ptr = list_next(iter))) {
3606 het_job_ids[i++] = het_job_ptr->job_id;
3607 }
3608 list_iterator_destroy(iter);
3609 }
3610 unlock_slurmctld(job_read_lock);
3611
3612 if (!job_ptr) {
3613 info("%s: invalid JobId=%u",
3614 __func__, job_step_kill_msg->job_id);
3615 error_code = ESLURM_INVALID_JOB_ID;
3616 } else if (het_job_ids) {
3617 for (i = 0; i < cnt; i++) {
3618 job_step_kill_msg->job_id = het_job_ids[i];
3619 rc = _kill_job_step(job_step_kill_msg, uid);
3620 if (rc != SLURM_SUCCESS)
3621 error_code = rc;
3622 }
3623 xfree(het_job_ids);
3624 } else {
3625 error_code = _kill_job_step(job_step_kill_msg, uid);
3626 }
3627
3628 return error_code;
3629 }
3630
/*
 * kill_job_by_part_name - Given a partition name, deallocate resource for
 *	its jobs and kill them. All jobs associated with this partition
 *	will have their partition pointer cleared.
 * IN part_name - name of a partition
 * RET number of jobs associated with this partition
 */
extern int kill_job_by_part_name(char *part_name)
{
	ListIterator job_iterator, part_iterator;
	job_record_t *job_ptr;
	part_record_t *part_ptr, *part2_ptr;
	int kill_job_cnt = 0;
	time_t now = time(NULL);

	part_ptr = find_part_record (part_name);
	if (part_ptr == NULL)	/* No such partition */
		return 0;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool pending = false, suspended = false;

		pending = IS_JOB_PENDING(job_ptr);
		if (job_ptr->part_ptr_list) {
			/* Remove partition if candidate for a job */
			bool rebuild_name_list = false;
			part_iterator = list_iterator_create(job_ptr->
							     part_ptr_list);
			while ((part2_ptr = list_next(part_iterator))) {
				if (part2_ptr != part_ptr)
					continue;
				list_remove(part_iterator);
				rebuild_name_list = true;
			}
			list_iterator_destroy(part_iterator);
			if (rebuild_name_list) {
				if (list_count(job_ptr->part_ptr_list) > 0) {
					/*
					 * Other candidate partitions remain;
					 * refresh the name list and fall back
					 * to the first remaining partition.
					 */
					_rebuild_part_name_list(job_ptr);
					job_ptr->part_ptr =
						list_peek(job_ptr->
							  part_ptr_list);
				} else {
					FREE_NULL_LIST(job_ptr->part_ptr_list);
				}
			}
		}

		/* Only jobs currently assigned to this partition remain */
		if (job_ptr->part_ptr != part_ptr)
			continue;

		if (IS_JOB_SUSPENDED(job_ptr)) {
			uint32_t suspend_job_state = job_ptr->job_state;
			/* we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}
		if (IS_JOB_RUNNING(job_ptr) || suspended) {
			/* Active job: fail it and release its nodes */
			kill_job_cnt++;
			info("Killing %pJ on defunct partition %s",
			     job_ptr, part_name);
			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
			build_cg_bitmap(job_ptr);
			job_ptr->state_reason = FAIL_DOWN_PARTITION;
			xfree(job_ptr->state_desc);
			if (suspended) {
				/* Do not charge time spent suspended */
				job_ptr->end_time = job_ptr->suspend_time;
				job_ptr->tot_sus_time +=
					difftime(now, job_ptr->suspend_time);
			} else
				job_ptr->end_time = now;
			job_completion_logger(job_ptr, false);
			if (!pending)
				deallocate_nodes(job_ptr, false, suspended,
						 false);
		} else if (pending) {
			/* Pending job: cancel outright, nothing to deallocate */
			kill_job_cnt++;
			info("Killing %pJ on defunct partition %s",
			     job_ptr, part_name);
			job_ptr->job_state	= JOB_CANCELLED;
			job_ptr->start_time	= now;
			job_ptr->end_time	= now;
			job_ptr->exit_code	= 1;
			job_completion_logger(job_ptr, false);
			fed_mgr_job_complete(job_ptr, 0, now);
		}
		/* Detach the job from the defunct partition in all cases */
		job_ptr->part_ptr = NULL;
		FREE_NULL_LIST(job_ptr->part_ptr_list);
	}
	list_iterator_destroy(job_iterator);

	if (kill_job_cnt)
		last_job_update = now;
	return kill_job_cnt;
}
3730
/*
 * kill_job_by_front_end_name - Given a front end node name, deallocate
 *	resource for its jobs and kill them.
 * IN node_name - name of a front end node
 * RET number of jobs associated with this front end node
 * NOTE: Patterned after kill_running_job_by_node_name()
 * NOTE: Compiled to a no-op (returns 0) without HAVE_FRONT_END.
 */
extern int kill_job_by_front_end_name(char *node_name)
{
#ifdef HAVE_FRONT_END
	ListIterator job_iterator;
	job_record_t *job_ptr, *het_job_leader;
	node_record_t *node_ptr;
	time_t now = time(NULL);
	int i, kill_job_cnt = 0;

	if (node_name == NULL)
		fatal("kill_job_by_front_end_name: node_name is NULL");

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool suspended = false;

		/* Only active jobs can be impacted by a front end failure */
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) &&
		    !IS_JOB_COMPLETING(job_ptr))
			continue;
		/*
		 * Hetjob components are matched through their leader's
		 * batch_host; fall back to the job's own record if the
		 * leader cannot be found.
		 */
		het_job_leader = NULL;
		if (job_ptr->het_job_id)
			het_job_leader = find_job_record(job_ptr->het_job_id);
		if (!het_job_leader)
			het_job_leader = job_ptr;
		if ((het_job_leader->batch_host == NULL) ||
		    xstrcmp(het_job_leader->batch_host, node_name))
			continue;	/* no match on node name */

		if (IS_JOB_SUSPENDED(job_ptr)) {
			uint32_t suspend_job_state = job_ptr->job_state;
			/*
			 * we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}
		if (IS_JOB_COMPLETING(job_ptr)) {
			/* Strip every still-completing node from the job */
			kill_job_cnt++;
			while ((i = bit_ffs(job_ptr->node_bitmap_cg)) >= 0) {
				bit_clear(job_ptr->node_bitmap_cg, i);
				if (job_ptr->node_cnt)
					(job_ptr->node_cnt)--;
				else {
					error("node_cnt underflow on %pJ",
					      job_ptr);
				}
				job_update_tres_cnt(job_ptr, i);
				if (job_ptr->node_cnt == 0) {
					cleanup_completing(job_ptr);
				}
				node_ptr = &node_record_table_ptr[i];
				if (node_ptr->comp_job_cnt)
					(node_ptr->comp_job_cnt)--;
				else {
					error("Node %s comp_job_cnt underflow, %pJ",
					      node_ptr->name, job_ptr);
				}
			}
		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
			kill_job_cnt++;
			if (job_ptr->batch_flag && job_ptr->details &&
			    slurmctld_conf.job_requeue &&
			    (job_ptr->details->requeue > 0)) {
				/* Requeue the batch job on other resources */
				char requeue_msg[128];

				srun_node_fail(job_ptr, node_name);
				info("requeue %pJ due to failure of node %s",
				     job_ptr, node_name);
				set_job_prio(job_ptr);
				/*
				 * NOTE(review): requeue_msg is built here but
				 * never consumed below -- appears to be dead
				 * code; confirm before removing.
				 */
				snprintf(requeue_msg, sizeof(requeue_msg),
					 "Job requeued due to failure "
					 "of node %s",
					 node_name);
				job_ptr->time_last_active = now;
				if (suspended) {
					/* Do not charge suspended time */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->
							 suspend_time);
				} else
					job_ptr->end_time = now;

				/*
				 * We want this job to look like it
				 * was terminated in the accounting logs.
				 * Set a new submit time so the restarted
				 * job looks like a new job.
				 */
				job_ptr->job_state = JOB_NODE_FAIL;
				build_cg_bitmap(job_ptr);
				job_completion_logger(job_ptr, true);
				deallocate_nodes(job_ptr, false, suspended,
						 false);

				/* do this after the epilog complete,
				 * setting it here is too early */
				//job_ptr->db_index = 0;
				//job_ptr->details->submit_time = now;

				job_ptr->job_state = JOB_PENDING;
				if (job_ptr->node_cnt)
					job_ptr->job_state |= JOB_COMPLETING;

				job_ptr->restart_cnt++;

				/* clear signal sent flag on requeue */
				job_ptr->warn_flags &= ~WARN_SENT;

				/* Since the job completion logger
				 * removes the submit we need to add it
				 * again. */
				acct_policy_add_job_submit(job_ptr);

				if (!job_ptr->node_bitmap_cg ||
				    bit_set_count(job_ptr->node_bitmap_cg) == 0)
					batch_requeue_fini(job_ptr);
			} else {
				/* Not requeue-able: fail the job outright */
				info("Killing %pJ on failed node %s",
				     job_ptr, node_name);
				srun_node_fail(job_ptr, node_name);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->state_reason = FAIL_DOWN_NODE;
				xfree(job_ptr->state_desc);
				if (suspended) {
					/* Do not charge suspended time */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->suspend_time);
				} else
					job_ptr->end_time = now;
				job_completion_logger(job_ptr, false);
				deallocate_nodes(job_ptr, false, suspended,
						 false);
			}
		}
	}
	list_iterator_destroy(job_iterator);

	if (kill_job_cnt)
		last_job_update = now;
	return kill_job_cnt;
#else
	return 0;
#endif
}
3891
3892 /*
3893 * partition_in_use - determine whether a partition is in use by a RUNNING
3894 * PENDING or SUSPENDED job or reservations
3895 * IN part_name - name of a partition
3896 * RET true if the partition is in use, else false
3897 */
partition_in_use(char * part_name)3898 extern bool partition_in_use(char *part_name)
3899 {
3900 ListIterator job_iterator;
3901 job_record_t *job_ptr;
3902 part_record_t *part_ptr;
3903
3904 part_ptr = find_part_record (part_name);
3905 if (part_ptr == NULL) /* No such partition */
3906 return false;
3907
3908 /* check jobs */
3909 job_iterator = list_iterator_create(job_list);
3910 while ((job_ptr = list_next(job_iterator))) {
3911 if (job_ptr->part_ptr == part_ptr) {
3912 if (!IS_JOB_FINISHED(job_ptr)) {
3913 list_iterator_destroy(job_iterator);
3914 return true;
3915 }
3916 }
3917 }
3918 list_iterator_destroy(job_iterator);
3919
3920 /* check reservations */
3921 if (list_find_first(resv_list, _find_resv_part, part_ptr))
3922 return true;
3923
3924 return false;
3925 }
3926
3927 /* Clear a job's GRES details per node strings, rebuilt later on demand */
_clear_job_gres_details(job_record_t * job_ptr)3928 static void _clear_job_gres_details(job_record_t *job_ptr)
3929 {
3930 int i;
3931
3932 xfree(job_ptr->gres_used);
3933 for (i = 0; i < job_ptr->gres_detail_cnt; i++)
3934 xfree(job_ptr->gres_detail_str[i]);
3935 xfree(job_ptr->gres_detail_str);
3936 job_ptr->gres_detail_cnt = 0;
3937 }
3938
3939
/*
 * Return true if this job's allocation bitmap exists and includes the
 * node at node_inx.
 */
static bool _job_node_test(job_record_t *job_ptr, int node_inx)
{
	return (job_ptr->node_bitmap &&
		bit_test(job_ptr->node_bitmap, node_inx));
}
3947
/*
 * Return true if the job -- or, for a hetjob, any of its components --
 * has the node at node_inx in its allocation bitmap, or if any hetjob
 * component is in NODE_FAILED state (see note in the loop below).
 */
static bool _het_job_on_node(job_record_t *job_ptr, int node_inx)
{
	job_record_t *het_job_leader, *het_job;
	ListIterator iter;
	/*
	 * FIX: this variable was previously declared "static", giving it
	 * static storage duration. That let a "true" left over from an
	 * earlier call be returned whenever the leader's het_job_list was
	 * empty (the loop body never executed to reset it), and it also
	 * made the function unsafe to call concurrently. An automatic
	 * variable initialized per call is correct.
	 */
	bool result = false;

	if (!job_ptr->het_job_id)
		return _job_node_test(job_ptr, node_inx);

	het_job_leader = find_job_record(job_ptr->het_job_id);
	if (!het_job_leader) {
		error("%s: Hetjob leader %pJ not found",
		      __func__, job_ptr);
		return _job_node_test(job_ptr, node_inx);
	}
	if (!het_job_leader->het_job_list) {
		error("%s: Hetjob leader %pJ job list is NULL",
		      __func__, job_ptr);
		return _job_node_test(job_ptr, node_inx);
	}

	iter = list_iterator_create(het_job_leader->het_job_list);
	while ((het_job = list_next(iter))) {
		if ((result = _job_node_test(het_job, node_inx)))
			break;
		/*
		 * After a DOWN node is removed from another job component,
		 * we have no way to identify other hetjob components with
		 * the same node, so assume if one component is in NODE_FAILED
		 * state, they all should be.
		 */
		if (IS_JOB_NODE_FAILED(het_job)) {
			result = true;
			break;
		}
	}
	list_iterator_destroy(iter);

	return result;
}
3988
/*
 * kill_running_job_by_node_name - Given a node name, deallocate RUNNING
 *	or COMPLETING jobs from the node or kill them
 * IN node_name - name of a node
 * RET number of killed jobs
 * NOTE: Caller must hold the job and node write locks (asserted below).
 */
extern int kill_running_job_by_node_name(char *node_name)
{
	ListIterator job_iterator;
	job_record_t *job_ptr;
	node_record_t *node_ptr;
	int node_inx;
	int kill_job_cnt = 0;
	time_t now = time(NULL);

	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));

	node_ptr = find_node_record(node_name);
	if (node_ptr == NULL)	/* No such node */
		return 0;
	/* Index of the node within the global node table */
	node_inx = node_ptr - node_record_table_ptr;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool suspended = false;
		if (!_het_job_on_node(job_ptr, node_inx))
			continue;	/* job not on this node */
		if (nonstop_ops.node_fail)
			(nonstop_ops.node_fail)(job_ptr, node_ptr);
		if (IS_JOB_SUSPENDED(job_ptr)) {
			uint32_t suspend_job_state = job_ptr->job_state;
			/*
			 * we can't have it as suspended when we call the
			 * accounting stuff.
			 */
			job_ptr->job_state = JOB_CANCELLED;
			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
			job_ptr->job_state = suspend_job_state;
			suspended = true;
		}

		if (IS_JOB_COMPLETING(job_ptr)) {
			/* Remove just this node from the completing set */
			if (!bit_test(job_ptr->node_bitmap_cg, node_inx))
				continue;
			kill_job_cnt++;
			bit_clear(job_ptr->node_bitmap_cg, node_inx);
			job_update_tres_cnt(job_ptr, node_inx);
			if (job_ptr->node_cnt)
				(job_ptr->node_cnt)--;
			else {
				error("node_cnt underflow on %pJ", job_ptr);
			}
			if (job_ptr->node_cnt == 0)
				cleanup_completing(job_ptr);

			if (node_ptr->comp_job_cnt)
				(node_ptr->comp_job_cnt)--;
			else {
				error("Node %s comp_job_cnt underflow, %pJ",
				      node_ptr->name, job_ptr);
			}
		} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
			kill_job_cnt++;
			if ((job_ptr->details) &&
			    (job_ptr->kill_on_node_fail == 0) &&
			    (job_ptr->node_cnt > 1) &&
			    !IS_JOB_CONFIGURING(job_ptr)) {
				/* keep job running on remaining nodes */
				srun_node_fail(job_ptr, node_name);
				error("Removing failed node %s from %pJ",
				      node_name, job_ptr);
				job_pre_resize_acctg(job_ptr);
				kill_step_on_node(job_ptr, node_ptr, true);
				excise_node_from_job(job_ptr, node_ptr);
				(void) gs_job_start(job_ptr);
				gres_build_job_details(job_ptr->gres_list,
						       &job_ptr->gres_detail_cnt,
						       &job_ptr->gres_detail_str,
						       &job_ptr->gres_used);
				job_post_resize_acctg(job_ptr);
			} else if (job_ptr->batch_flag && job_ptr->details &&
				   job_ptr->details->requeue) {
				/* Requeue the batch job on other resources */
				char requeue_msg[128];

				srun_node_fail(job_ptr, node_name);
				info("requeue job %pJ due to failure of node %s",
				     job_ptr, node_name);
				/*
				 * NOTE(review): requeue_msg is built here but
				 * never consumed below -- appears to be dead
				 * code; confirm before removing.
				 */
				snprintf(requeue_msg, sizeof(requeue_msg),
					 "Job requeued due to failure "
					 "of node %s",
					 node_name);
				job_ptr->time_last_active = now;
				if (suspended) {
					/* Do not charge suspended time */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->
							 suspend_time);
				} else
					job_ptr->end_time = now;

				/*
				 * We want this job to look like it
				 * was terminated in the accounting logs.
				 * Set a new submit time so the restarted
				 * job looks like a new job.
				 */
				job_ptr->job_state = JOB_NODE_FAIL;
				build_cg_bitmap(job_ptr);
				job_completion_logger(job_ptr, true);
				deallocate_nodes(job_ptr, false, suspended,
						 false);

				/* do this after the epilog complete,
				 * setting it here is too early */
				//job_ptr->db_index = 0;
				//job_ptr->details->submit_time = now;

				job_ptr->job_state = JOB_PENDING;
				if (job_ptr->node_cnt)
					job_ptr->job_state |= JOB_COMPLETING;

				job_ptr->restart_cnt++;

				/* clear signal sent flag on requeue */
				job_ptr->warn_flags &= ~WARN_SENT;

				/*
				 * Since the job completion logger
				 * removes the submit we need to add it
				 * again.
				 */
				acct_policy_add_job_submit(job_ptr);

				if (!job_ptr->node_bitmap_cg ||
				    bit_set_count(job_ptr->node_bitmap_cg) == 0)
					batch_requeue_fini(job_ptr);
			} else {
				/* Not requeue-able: fail the job outright */
				info("Killing %pJ on failed node %s",
				     job_ptr, node_name);
				srun_node_fail(job_ptr, node_name);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->state_reason = FAIL_DOWN_NODE;
				xfree(job_ptr->state_desc);
				if (suspended) {
					/* Do not charge suspended time */
					job_ptr->end_time =
						job_ptr->suspend_time;
					job_ptr->tot_sus_time +=
						difftime(now,
							 job_ptr->suspend_time);
				} else
					job_ptr->end_time = now;
				job_completion_logger(job_ptr, false);
				deallocate_nodes(job_ptr, false, suspended,
						 false);
			}
		}

	}
	list_iterator_destroy(job_iterator);
	if (kill_job_cnt)
		last_job_update = now;

	return kill_job_cnt;
}
4158
/*
 * Remove one node from a job's allocation.
 * IN job_ptr  - job to shrink
 * IN node_ptr - node to remove
 * Updates the job's node bitmap, node name string, node_addr array and
 * node counts, then notifies the select plugin of the resize.
 */
extern void excise_node_from_job(job_record_t *job_ptr,
				 node_record_t *node_ptr)
{
	int i, i_first, i_last, orig_pos = -1, new_pos = -1;
	bitstr_t *orig_bitmap;

	/* Snapshot the allocation before make_node_idle() alters it */
	orig_bitmap = bit_copy(job_ptr->node_bitmap);
	make_node_idle(node_ptr, job_ptr);	/* updates bitmap */
	xfree(job_ptr->nodes);
	job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap);
	i_first = bit_ffs(orig_bitmap);
	if (i_first >= 0)
		i_last = bit_fls(orig_bitmap);
	else
		i_last = -2;	/* empty bitmap: skip the loop entirely */
	/*
	 * Compact node_addr[] in place: walk the original allocation and
	 * shift each surviving node's address down over the removed slot.
	 */
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(orig_bitmap,i))
			continue;
		orig_pos++;	/* index within the original allocation */
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;	/* this is the excised node */
		new_pos++;	/* index within the shrunken allocation */
		if (orig_pos == new_pos)
			continue;	/* nothing shifted yet */
		memcpy(&job_ptr->node_addr[new_pos],
		       &job_ptr->node_addr[orig_pos], sizeof(slurm_addr_t));
		/*
		 * NOTE: The job's allocation in the job_ptr->job_resrcs
		 * data structure is unchanged even after a node allocated
		 * to the job goes DOWN.
		 */
	}

	job_ptr->total_nodes = job_ptr->node_cnt = new_pos + 1;

	FREE_NULL_BITMAP(orig_bitmap);
	(void) select_g_job_resized(job_ptr, node_ptr);
}
4198
4199 /*
4200 * dump_job_desc - dump the incoming job submit request message
4201 * IN job_specs - job specification from RPC
4202 */
dump_job_desc(job_desc_msg_t * job_specs)4203 void dump_job_desc(job_desc_msg_t * job_specs)
4204 {
4205 long pn_min_cpus, pn_min_tmp_disk, min_cpus;
4206 uint64_t pn_min_memory;
4207 long time_limit, priority, contiguous, nice, time_min;
4208 long kill_on_node_fail, shared, immediate, wait_all_nodes;
4209 long cpus_per_task, requeue, num_tasks, overcommit;
4210 long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
4211 int spec_count;
4212 char *mem_type, buf[100], *signal_flags, *spec_type, *job_id;
4213
4214 if (get_log_level() < LOG_LEVEL_DEBUG3)
4215 return;
4216
4217 if (job_specs == NULL)
4218 return;
4219
4220 if (job_specs->job_id_str)
4221 job_id = job_specs->job_id_str;
4222 else if (job_specs->job_id == NO_VAL)
4223 job_id = "N/A";
4224 else {
4225 snprintf(buf, sizeof(buf), "%u", job_specs->job_id);
4226 job_id = buf;
4227 }
4228 debug3("JobDesc: user_id=%u JobId=%s partition=%s name=%s",
4229 job_specs->user_id, job_id,
4230 job_specs->partition, job_specs->name);
4231
4232 min_cpus = (job_specs->min_cpus != NO_VAL) ?
4233 (long) job_specs->min_cpus : -1L;
4234 pn_min_cpus = (job_specs->pn_min_cpus != NO_VAL16) ?
4235 (long) job_specs->pn_min_cpus : -1L;
4236 if (job_specs->core_spec == NO_VAL16) {
4237 spec_type = "core";
4238 spec_count = -1;
4239 } else if (job_specs->core_spec & CORE_SPEC_THREAD) {
4240 spec_type = "thread";
4241 spec_count = job_specs->core_spec & (~CORE_SPEC_THREAD);
4242 } else {
4243 spec_type = "core";
4244 spec_count = job_specs->core_spec;
4245 }
4246 debug3(" cpus=%ld-%u pn_min_cpus=%ld %s_spec=%d",
4247 min_cpus, job_specs->max_cpus, pn_min_cpus,
4248 spec_type, spec_count);
4249
4250 debug3(" Nodes=%u-[%u] Sock/Node=%u Core/Sock=%u Thread/Core=%u",
4251 job_specs->min_nodes, job_specs->max_nodes,
4252 job_specs->sockets_per_node, job_specs->cores_per_socket,
4253 job_specs->threads_per_core);
4254
4255 if (job_specs->pn_min_memory == NO_VAL64) {
4256 pn_min_memory = -1L;
4257 mem_type = "job";
4258 } else if (job_specs->pn_min_memory & MEM_PER_CPU) {
4259 pn_min_memory = job_specs->pn_min_memory & (~MEM_PER_CPU);
4260 mem_type = "cpu";
4261 } else {
4262 pn_min_memory = job_specs->pn_min_memory;
4263 mem_type = "job";
4264 }
4265 pn_min_tmp_disk = (job_specs->pn_min_tmp_disk != NO_VAL) ?
4266 (long) job_specs->pn_min_tmp_disk : -1L;
4267 debug3(" pn_min_memory_%s=%"PRIu64" pn_min_tmp_disk=%ld",
4268 mem_type, pn_min_memory, pn_min_tmp_disk);
4269 immediate = (job_specs->immediate == 0) ? 0L : 1L;
4270 debug3(" immediate=%ld reservation=%s",
4271 immediate, job_specs->reservation);
4272 debug3(" features=%s batch_features=%s cluster_features=%s",
4273 job_specs->features, job_specs->batch_features,
4274 job_specs->cluster_features);
4275
4276 debug3(" req_nodes=%s exc_nodes=%s",
4277 job_specs->req_nodes, job_specs->exc_nodes);
4278
4279 time_limit = (job_specs->time_limit != NO_VAL) ?
4280 (long) job_specs->time_limit : -1L;
4281 time_min = (job_specs->time_min != NO_VAL) ?
4282 (long) job_specs->time_min : time_limit;
4283 priority = (job_specs->priority != NO_VAL) ?
4284 (long) job_specs->priority : -1L;
4285 contiguous = (job_specs->contiguous != NO_VAL16) ?
4286 (long) job_specs->contiguous : -1L;
4287 shared = (job_specs->shared != NO_VAL16) ?
4288 (long) job_specs->shared : -1L;
4289 debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld",
4290 time_min, time_limit, priority, contiguous, shared);
4291
4292 kill_on_node_fail = (job_specs->kill_on_node_fail !=
4293 NO_VAL16) ?
4294 (long) job_specs->kill_on_node_fail : -1L;
4295 if (job_specs->script) /* log has problem with string len & null */
4296 debug3(" kill_on_node_fail=%ld script=%.40s...",
4297 kill_on_node_fail, job_specs->script);
4298 else
4299 debug3(" kill_on_node_fail=%ld script=(null)",
4300 kill_on_node_fail);
4301
4302 if (job_specs->argc == 1)
4303 debug3(" argv=\"%s\"",
4304 job_specs->argv[0]);
4305 else if (job_specs->argc == 2)
4306 debug3(" argv=%s,%s",
4307 job_specs->argv[0],
4308 job_specs->argv[1]);
4309 else if (job_specs->argc > 2)
4310 debug3(" argv=%s,%s,%s,...",
4311 job_specs->argv[0],
4312 job_specs->argv[1],
4313 job_specs->argv[2]);
4314
4315 if (job_specs->env_size == 1)
4316 debug3(" environment=\"%s\"",
4317 job_specs->environment[0]);
4318 else if (job_specs->env_size == 2)
4319 debug3(" environment=%s,%s",
4320 job_specs->environment[0],
4321 job_specs->environment[1]);
4322 else if (job_specs->env_size > 2)
4323 debug3(" environment=%s,%s,%s,...",
4324 job_specs->environment[0],
4325 job_specs->environment[1],
4326 job_specs->environment[2]);
4327
4328 if (job_specs->spank_job_env_size == 1)
4329 debug3(" spank_job_env=\"%s\"",
4330 job_specs->spank_job_env[0]);
4331 else if (job_specs->spank_job_env_size == 2)
4332 debug3(" spank_job_env=%s,%s",
4333 job_specs->spank_job_env[0],
4334 job_specs->spank_job_env[1]);
4335 else if (job_specs->spank_job_env_size > 2)
4336 debug3(" spank_job_env=%s,%s,%s,...",
4337 job_specs->spank_job_env[0],
4338 job_specs->spank_job_env[1],
4339 job_specs->spank_job_env[2]);
4340
4341 debug3(" stdin=%s stdout=%s stderr=%s",
4342 job_specs->std_in, job_specs->std_out, job_specs->std_err);
4343
4344 debug3(" work_dir=%s alloc_node:sid=%s:%u",
4345 job_specs->work_dir,
4346 job_specs->alloc_node, job_specs->alloc_sid);
4347
4348 debug3(" power_flags=%s",
4349 power_flags_str(job_specs->power_flags));
4350
4351 debug3(" resp_host=%s alloc_resp_port=%u other_port=%u",
4352 job_specs->resp_host,
4353 job_specs->alloc_resp_port, job_specs->other_port);
4354 debug3(" dependency=%s account=%s qos=%s comment=%s",
4355 job_specs->dependency, job_specs->account,
4356 job_specs->qos, job_specs->comment);
4357
4358 num_tasks = (job_specs->num_tasks != NO_VAL) ?
4359 (long) job_specs->num_tasks : -1L;
4360 overcommit = (job_specs->overcommit != NO_VAL8) ?
4361 (long) job_specs->overcommit : -1L;
4362 nice = (job_specs->nice != NO_VAL) ?
4363 ((int64_t)job_specs->nice - NICE_OFFSET) : 0;
4364 debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld "
4365 "open_mode=%u overcommit=%ld acctg_freq=%s",
4366 job_specs->mail_type, job_specs->mail_user, nice, num_tasks,
4367 job_specs->open_mode, overcommit, job_specs->acctg_freq);
4368
4369 slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf));
4370 cpus_per_task = (job_specs->cpus_per_task != NO_VAL16) ?
4371 (long) job_specs->cpus_per_task : -1L;
4372 requeue = (job_specs->requeue != NO_VAL16) ?
4373 (long) job_specs->requeue : -1L;
4374 debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld "
4375 "licenses=%s",
4376 job_specs->network, buf, cpus_per_task, requeue,
4377 job_specs->licenses);
4378
4379 slurm_make_time_str(&job_specs->end_time, buf, sizeof(buf));
4380 wait_all_nodes = (job_specs->wait_all_nodes != NO_VAL16) ?
4381 (long) job_specs->wait_all_nodes : -1L;
4382 if (job_specs->warn_flags & KILL_JOB_BATCH)
4383 signal_flags = "B:";
4384 else
4385 signal_flags = "";
4386 cpu_freq_debug(NULL, NULL, buf, sizeof(buf), job_specs->cpu_freq_gov,
4387 job_specs->cpu_freq_min, job_specs->cpu_freq_max,
4388 NO_VAL);
4389 debug3(" end_time=%s signal=%s%u@%u wait_all_nodes=%ld cpu_freq=%s",
4390 buf, signal_flags, job_specs->warn_signal, job_specs->warn_time,
4391 wait_all_nodes, buf);
4392
4393 ntasks_per_node = (job_specs->ntasks_per_node != NO_VAL16) ?
4394 (long) job_specs->ntasks_per_node : -1L;
4395 ntasks_per_socket = (job_specs->ntasks_per_socket !=
4396 NO_VAL16) ?
4397 (long) job_specs->ntasks_per_socket : -1L;
4398 ntasks_per_core = (job_specs->ntasks_per_core != NO_VAL16) ?
4399 (long) job_specs->ntasks_per_core : -1L;
4400 debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld "
4401 "ntasks_per_core=%ld",
4402 ntasks_per_node, ntasks_per_socket, ntasks_per_core);
4403
4404 debug3(" mem_bind=%u:%s plane_size:%u",
4405 job_specs->mem_bind_type, job_specs->mem_bind,
4406 job_specs->plane_size);
4407 debug3(" array_inx=%s", job_specs->array_inx);
4408 debug3(" burst_buffer=%s", job_specs->burst_buffer);
4409 debug3(" mcs_label=%s", job_specs->mcs_label);
4410 slurm_make_time_str(&job_specs->deadline, buf, sizeof(buf));
4411 debug3(" deadline=%s", buf);
4412 debug3(" bitflags=%u delay_boot=%u", job_specs->bitflags,
4413 job_specs->delay_boot);
4414
4415 if (job_specs->cpus_per_tres)
4416 debug3(" CPUs_per_TRES=%s", job_specs->cpus_per_tres);
4417 if (job_specs->mem_per_tres)
4418 debug3(" Mem_per_TRES=%s", job_specs->mem_per_tres);
4419 if (job_specs->tres_bind)
4420 debug3(" TRES_bind=%s", job_specs->tres_bind);
4421 if (job_specs->tres_freq)
4422 debug3(" TRES_freq=%s", job_specs->tres_freq);
4423 if (job_specs->tres_per_job)
4424 debug3(" TRES_per_job=%s", job_specs->tres_per_job);
4425 if (job_specs->tres_per_node)
4426 debug3(" TRES_per_node=%s", job_specs->tres_per_node);
4427 if (job_specs->tres_per_socket)
4428 debug3(" TRES_per_socket=%s", job_specs->tres_per_socket);
4429 if (job_specs->tres_per_task)
4430 debug3(" TRES_per_task=%s", job_specs->tres_per_task);
4431
4432 select_g_select_jobinfo_sprint(job_specs->select_jobinfo,
4433 buf, sizeof(buf), SELECT_PRINT_MIXED);
4434 if (buf[0] != '\0')
4435 debug3(" %s", buf);
4436 }
4437
4438 /*
4439 * init_job_conf - initialize the job configuration tables and values.
4440 * this should be called after creating node information, but
4441 * before creating any job entries. Pre-existing job entries are
4442 * left unchanged.
4443 * NOTE: The job hash table size does not change after initial creation.
4444 * RET 0 if no error, otherwise an error code
4445 * global: last_job_update - time of last job table update
4446 * job_list - pointer to global job list
4447 */
init_job_conf(void)4448 int init_job_conf(void)
4449 {
4450 if (job_list == NULL) {
4451 job_count = 0;
4452 job_list = list_create(_list_delete_job);
4453 }
4454
4455 last_job_update = time(NULL);
4456
4457 if (!purge_files_list) {
4458 purge_files_list = list_create(xfree_ptr);
4459 }
4460
4461 return SLURM_SUCCESS;
4462 }
4463
4464 /*
4465 * rehash_jobs - Create or rebuild the job hash table.
4466 */
rehash_jobs(void)4467 extern void rehash_jobs(void)
4468 {
4469 xassert(verify_lock(CONF_LOCK, READ_LOCK));
4470 xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
4471
4472 if (job_hash == NULL) {
4473 hash_table_size = slurmctld_conf.max_job_cnt;
4474 job_hash = xcalloc(hash_table_size, sizeof(job_record_t *));
4475 job_array_hash_j = xcalloc(hash_table_size,
4476 sizeof(job_record_t *));
4477 job_array_hash_t = xcalloc(hash_table_size,
4478 sizeof(job_record_t *));
4479 } else if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) {
4480 /* If the MaxJobCount grows by too much, the hash table will
4481 * be ineffective without rebuilding. We don't presently bother
4482 * to rebuild the hash table, but cut MaxJobCount back as
4483 * needed. */
4484 error ("MaxJobCount reset too high, restart slurmctld");
4485 slurmctld_conf.max_job_cnt = hash_table_size;
4486 }
4487 }
4488
4489 /* Create an exact copy of an existing job record for a job array.
4490 * IN job_ptr - META job record for a job array, which is to become an
4491 * individial task of the job array.
4492 * Set the job's array_task_id to the task to be split out.
4493 * RET - The new job record, which is the new META job record. */
extern job_record_t *job_array_split(job_record_t *job_ptr)
{
	job_record_t *job_ptr_pend = NULL, *save_job_next;
	struct job_details *job_details, *details_new, *save_details;
	uint32_t save_job_id;
	uint64_t save_db_index = job_ptr->db_index;
	priority_factors_object_t *save_prio_factors;
	List save_step_list;
	int i;

	/* Allocate a bare record; it will become the new META record for
	 * the remaining pending tasks, while job_ptr becomes the single
	 * split-out task. */
	job_ptr_pend = _create_job_record(0);
	if (!job_ptr_pend)
		return NULL;

	/* The new META record inherits the original job id; job_ptr is
	 * re-hashed under a freshly assigned job id. */
	_remove_job_hash(job_ptr, JOB_HASH_JOB);
	job_ptr_pend->job_id = job_ptr->job_id;
	if (_set_job_id(job_ptr) != SLURM_SUCCESS)
		fatal("%s: _set_job_id error", __func__);
	if (!job_ptr->array_recs) {
		fatal_abort("%s: %pJ record lacks array structure",
			    __func__, job_ptr);
	}

	/*
	 * Copy most of original job data.
	 * This could be done in parallel, but performance was worse.
	 */
	save_job_id = job_ptr_pend->job_id;
	save_job_next = job_ptr_pend->job_next;
	save_details = job_ptr_pend->details;
	save_prio_factors = job_ptr_pend->prio_factors;
	save_step_list = job_ptr_pend->step_list;
	memcpy(job_ptr_pend, job_ptr, sizeof(job_record_t));

	/* Restore the fields that must not be cloned by the memcpy() */
	job_ptr_pend->job_id = save_job_id;
	job_ptr_pend->job_next = save_job_next;
	job_ptr_pend->details = save_details;
	job_ptr_pend->db_flags = 0;
	job_ptr_pend->step_list = save_step_list;
	job_ptr_pend->db_index = save_db_index;

	job_ptr_pend->prio_factors = save_prio_factors;
	slurm_copy_priority_factors_object(job_ptr_pend->prio_factors,
					   job_ptr->prio_factors);

	/* Deep-copy the string fields so the two records own separate
	 * allocations (the memcpy above copied raw pointers). */
	job_ptr_pend->account = xstrdup(job_ptr->account);
	job_ptr_pend->admin_comment = xstrdup(job_ptr->admin_comment);
	job_ptr_pend->alias_list = xstrdup(job_ptr->alias_list);
	job_ptr_pend->alloc_node = xstrdup(job_ptr->alloc_node);

	/* Transfer ownership of the array bookkeeping to the new META
	 * record; job_ptr now represents one individual task. */
	job_ptr_pend->array_recs = job_ptr->array_recs;
	job_ptr->array_recs = NULL;

	/* Remove the split-out task from the pending-task bitmap and
	 * decrement the pending-task count. */
	if (job_ptr_pend->array_recs &&
	    job_ptr_pend->array_recs->task_id_bitmap) {
		bit_clear(job_ptr_pend->array_recs->task_id_bitmap,
			  job_ptr_pend->array_task_id);
	}
	/* array_recs is guaranteed non-NULL here by the fatal_abort()
	 * check above; the cached string form is now stale. */
	xfree(job_ptr_pend->array_recs->task_id_str);
	if (job_ptr_pend->array_recs->task_cnt) {
		job_ptr_pend->array_recs->task_cnt--;
	} else {
		error("%pJ array_recs->task_cnt underflow",
		      job_ptr);
	}
	/* The META record is not itself a task */
	job_ptr_pend->array_task_id = NO_VAL;

	job_ptr_pend->batch_host = NULL;
	job_ptr_pend->burst_buffer = xstrdup(job_ptr->burst_buffer);
	job_ptr_pend->burst_buffer_state = xstrdup(job_ptr->burst_buffer_state);
	job_ptr_pend->clusters = xstrdup(job_ptr->clusters);
	job_ptr_pend->comment = xstrdup(job_ptr->comment);

	job_ptr_pend->fed_details = _dup_job_fed_details(job_ptr->fed_details);

	job_ptr_pend->front_end_ptr = NULL;
	/* struct job_details *details;		*** NOTE: Copied below */
	if (job_ptr->gres_list) {
		job_ptr_pend->gres_list =
			gres_plugin_job_state_dup(job_ptr->gres_list);
	}
	/* Allocation-specific GRES state is not inherited by the pending
	 * record */
	job_ptr_pend->gres_detail_cnt = 0;
	job_ptr_pend->gres_detail_str = NULL;
	job_ptr_pend->gres_alloc = NULL;
	job_ptr_pend->gres_req = NULL;
	job_ptr_pend->gres_used = NULL;

	job_ptr_pend->limit_set.tres = xcalloc(slurmctld_tres_cnt,
					       sizeof(uint16_t));
	memcpy(job_ptr_pend->limit_set.tres, job_ptr->limit_set.tres,
	       sizeof(uint16_t) * slurmctld_tres_cnt);

	/* Re-insert both records into the hash tables under their
	 * (possibly new) job ids */
	_add_job_hash(job_ptr);		/* Sets job_next */
	_add_job_hash(job_ptr_pend);	/* Sets job_next */
	_add_job_array_hash(job_ptr);
	job_ptr_pend->job_resrcs = NULL;

	job_ptr_pend->licenses = xstrdup(job_ptr->licenses);
	job_ptr_pend->license_list = license_job_copy(job_ptr->license_list);
	job_ptr_pend->mail_user = xstrdup(job_ptr->mail_user);
	job_ptr_pend->mcs_label = xstrdup(job_ptr->mcs_label);
	job_ptr_pend->name = xstrdup(job_ptr->name);
	job_ptr_pend->network = xstrdup(job_ptr->network);
	/* Node assignment state belongs only to the running task */
	job_ptr_pend->node_addr = NULL;
	job_ptr_pend->node_bitmap = NULL;
	job_ptr_pend->node_bitmap_cg = NULL;
	job_ptr_pend->nodes = NULL;
	job_ptr_pend->nodes_completing = NULL;
	job_ptr_pend->origin_cluster = xstrdup(job_ptr->origin_cluster);
	job_ptr_pend->partition = xstrdup(job_ptr->partition);
	job_ptr_pend->part_ptr_list = part_list_copy(job_ptr->part_ptr_list);
	/* On jobs that are held the priority_array isn't set up yet,
	 * so check to see if it exists before copying. */
	if (job_ptr->part_ptr_list && job_ptr->priority_array) {
		i = list_count(job_ptr->part_ptr_list) * sizeof(uint32_t);
		job_ptr_pend->priority_array = xmalloc(i);
		memcpy(job_ptr_pend->priority_array,
		       job_ptr->priority_array, i);
	}
	job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name);
	job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host);
	if (job_ptr->select_jobinfo) {
		job_ptr_pend->select_jobinfo =
			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
	}
	job_ptr_pend->sched_nodes = NULL;
	if (job_ptr->spank_job_env_size) {
		/* NULL-terminated vector of strings, hence the +1 */
		job_ptr_pend->spank_job_env =
			xcalloc((job_ptr->spank_job_env_size + 1),
				sizeof(char *));
		for (i = 0; i < job_ptr->spank_job_env_size; i++) {
			job_ptr_pend->spank_job_env[i] =
				xstrdup(job_ptr->spank_job_env[i]);
		}
	}
	job_ptr_pend->state_desc = xstrdup(job_ptr->state_desc);

	job_ptr_pend->system_comment = xstrdup(job_ptr->system_comment);

	i = sizeof(uint64_t) * slurmctld_tres_cnt;
	job_ptr_pend->tres_req_cnt = xmalloc(i);
	memcpy(job_ptr_pend->tres_req_cnt, job_ptr->tres_req_cnt, i);
	job_ptr_pend->tres_req_str = xstrdup(job_ptr->tres_req_str);
	job_ptr_pend->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_req_str);
	/* Allocated-TRES state belongs only to the running task */
	job_ptr_pend->tres_alloc_str = NULL;
	job_ptr_pend->tres_fmt_alloc_str = NULL;
	job_ptr_pend->tres_alloc_cnt = NULL;

	job_ptr_pend->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres);
	job_ptr_pend->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
	job_ptr_pend->tres_bind = xstrdup(job_ptr->tres_bind);
	job_ptr_pend->tres_freq = xstrdup(job_ptr->tres_freq);
	job_ptr_pend->tres_per_job = xstrdup(job_ptr->tres_per_job);
	job_ptr_pend->tres_per_node = xstrdup(job_ptr->tres_per_node);
	job_ptr_pend->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
	job_ptr_pend->tres_per_task = xstrdup(job_ptr->tres_per_task);

	job_ptr_pend->user_name = xstrdup(job_ptr->user_name);
	job_ptr_pend->wckey = xstrdup(job_ptr->wckey);
	job_ptr_pend->deadline = job_ptr->deadline;

	/* Now deep-copy the details structure the same way: bulk memcpy
	 * followed by re-allocation of every pointer field. */
	job_details = job_ptr->details;
	details_new = job_ptr_pend->details;
	memcpy(details_new, job_details, sizeof(struct job_details));

	/*
	 * Reset the preempt_start_time or high priority array jobs will hang
	 * for a period before preempting more jobs.
	 */
	details_new->preempt_start_time = 0;

	details_new->acctg_freq = xstrdup(job_details->acctg_freq);
	if (job_details->argc) {
		details_new->argv =
			xcalloc((job_details->argc + 1), sizeof(char *));
		for (i = 0; i < job_details->argc; i++) {
			details_new->argv[i] = xstrdup(job_details->argv[i]);
		}
	}
	details_new->cpu_bind = xstrdup(job_details->cpu_bind);
	details_new->cpu_bind_type = job_details->cpu_bind_type;
	details_new->cpu_freq_min = job_details->cpu_freq_min;
	details_new->cpu_freq_max = job_details->cpu_freq_max;
	details_new->cpu_freq_gov = job_details->cpu_freq_gov;
	details_new->depend_list = depended_list_copy(job_details->depend_list);
	details_new->dependency = xstrdup(job_details->dependency);
	details_new->orig_dependency = xstrdup(job_details->orig_dependency);
	if (job_details->env_cnt) {
		details_new->env_sup =
			xcalloc((job_details->env_cnt + 1), sizeof(char *));
		for (i = 0; i < job_details->env_cnt; i++) {
			details_new->env_sup[i] =
				xstrdup(job_details->env_sup[i]);
		}
	}
	if (job_details->exc_node_bitmap) {
		details_new->exc_node_bitmap =
			bit_copy(job_details->exc_node_bitmap);
	}
	details_new->exc_nodes = xstrdup(job_details->exc_nodes);
	details_new->feature_list =
		feature_list_copy(job_details->feature_list);
	details_new->features = xstrdup(job_details->features);
	details_new->cluster_features = xstrdup(job_details->cluster_features);
	if (job_details->mc_ptr) {
		i = sizeof(multi_core_data_t);
		details_new->mc_ptr = xmalloc(i);
		memcpy(details_new->mc_ptr, job_details->mc_ptr, i);
	}
	details_new->mem_bind = xstrdup(job_details->mem_bind);
	details_new->mem_bind_type = job_details->mem_bind_type;
	if (job_details->req_node_bitmap) {
		details_new->req_node_bitmap =
			bit_copy(job_details->req_node_bitmap);
	}
	details_new->req_nodes = xstrdup(job_details->req_nodes);
	details_new->std_err = xstrdup(job_details->std_err);
	details_new->std_in = xstrdup(job_details->std_in);
	details_new->std_out = xstrdup(job_details->std_out);
	details_new->work_dir = xstrdup(job_details->work_dir);
	details_new->x11_magic_cookie = xstrdup(job_details->x11_magic_cookie);

	if (job_ptr->fed_details) {
		add_fed_job_info(job_ptr);
		/*
		 * The new (split) job needs its remote dependencies tested
		 * separately from just the meta job, so send remote
		 * dependencies to siblings if needed.
		 */
		if (job_ptr->details->dependency &&
		    job_ptr->details->depend_list)
			fed_mgr_submit_remote_dependencies(job_ptr, false,
							   false);
	}

	return job_ptr_pend;
}
4731
/* Add job array data structure to the job record */
static void _create_job_array(job_record_t *job_ptr, job_desc_msg_t *job_specs)
{
	struct job_details *detail_ptr;
	char *pos = NULL;
	int run_limit, first_task, last_task, task_step = 1, n_tasks;
	uint32_t bitmap_size;

	if (!job_specs->array_bitmap)
		return;

	bitmap_size = bit_set_count(job_specs->array_bitmap);
	if (bitmap_size == 0) {
		info("%s: %pJ array_bitmap is empty", __func__, job_ptr);
		return;
	}

	job_ptr->array_job_id = job_ptr->job_id;
	job_ptr->array_recs = xmalloc(sizeof(job_array_struct_t));
	first_task = bit_ffs(job_specs->array_bitmap);
	last_task = bit_fls(job_specs->array_bitmap);
	n_tasks = bit_set_count(job_specs->array_bitmap);

	/* Trim the bitmap to the highest task id, then transfer its
	 * ownership from the request message to the job record. */
	bitmap_size = last_task + 1;
	job_specs->array_bitmap = bit_realloc(job_specs->array_bitmap,
					      bitmap_size);
	job_ptr->array_recs->task_id_bitmap = job_specs->array_bitmap;
	job_specs->array_bitmap = NULL;
	job_ptr->array_recs->task_cnt =
		bit_set_count(job_ptr->array_recs->task_id_bitmap);
	if (job_ptr->array_recs->task_cnt > 1)
		job_count += (job_ptr->array_recs->task_cnt - 1);

	/* A "%N" suffix in the array spec caps concurrently running tasks */
	if (job_specs->array_inx)
		pos = strchr(job_specs->array_inx, '%');
	if (pos) {
		run_limit = atoi(pos + 1);
		if (run_limit > 0)
			job_ptr->array_recs->max_run_tasks = run_limit;
	}

	detail_ptr = job_ptr->details;
	if (detail_ptr) {
		/* A ":N" in the array spec gives the task id step size */
		if (job_specs->array_inx) {
			pos = strchr(job_specs->array_inx, ':');
			if (pos)
				task_step = atoi(pos + 1);
		}
		/* Export the array geometry via the job environment */
		detail_ptr->env_sup = xrealloc(detail_ptr->env_sup,
					       (sizeof(char *) *
						(detail_ptr->env_cnt + 4)));
		xstrfmtcat(detail_ptr->env_sup[detail_ptr->env_cnt++],
			   "SLURM_ARRAY_TASK_COUNT=%d", n_tasks);
		xstrfmtcat(detail_ptr->env_sup[detail_ptr->env_cnt++],
			   "SLURM_ARRAY_TASK_MIN=%d", first_task);
		xstrfmtcat(detail_ptr->env_sup[detail_ptr->env_cnt++],
			   "SLURM_ARRAY_TASK_MAX=%d", last_task);
		xstrfmtcat(detail_ptr->env_sup[detail_ptr->env_cnt++],
			   "SLURM_ARRAY_TASK_STEP=%d", task_step);
	}
}
4791
4792 /*
4793 * Wrapper for select_nodes() function that will test all valid partitions
4794 * for a new job
4795 * IN job_ptr - pointer to the job record
4796 * IN test_only - if set do not allocate nodes, just confirm they
4797 * could be allocated now
4798 * IN select_node_bitmap - bitmap of nodes to be used for the
4799 * job's resource allocation (not returned if NULL), caller
4800 * must free
4801 * OUT err_msg - error message for job, caller must xfree
4802 */
static int _select_nodes_parts(job_record_t *job_ptr, bool test_only,
			       bitstr_t **select_node_bitmap, char **err_msg)
{
	part_record_t *part_ptr;
	ListIterator iter;
	int rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
	int best_rc = -1, part_limits_rc = WAIT_NO_REASON;
	bitstr_t *save_avail_node_bitmap = NULL;

	/* Temporarily mark the nodes in rs_node_bitmap as available for
	 * the duration of this call; the original availability bitmap is
	 * restored before returning. NOTE(review): rs_node_bitmap appears
	 * to track rebooting/resuming nodes -- confirm. */
	save_avail_node_bitmap = bit_copy(avail_node_bitmap);
	bit_or(avail_node_bitmap, rs_node_bitmap);

	if (job_ptr->part_ptr_list) {
		/* Multi-partition job: try each partition in priority/tier
		 * order until one can run the job (or all are rejected). */
		list_sort(job_ptr->part_ptr_list, priority_sort_part_tier);
		iter = list_iterator_create(job_ptr->part_ptr_list);
		while ((part_ptr = list_next(iter))) {
			job_ptr->part_ptr = part_ptr;
			debug2("Try %pJ on next partition %s",
			       job_ptr, part_ptr->name);

			part_limits_rc = job_limits_check(&job_ptr, false);

			/* ENFORCE_ANY: a partition failing its limits is
			 * simply skipped */
			if ((part_limits_rc != WAIT_NO_REASON) &&
			    (slurmctld_conf.enforce_part_limits ==
			     PARTITION_ENFORCE_ANY))
				continue;
			/* ENFORCE_ALL: any partition failing its limits
			 * rejects the whole job (WAIT_PART_DOWN records the
			 * error but keeps scanning) */
			if ((part_limits_rc != WAIT_NO_REASON) &&
			    (slurmctld_conf.enforce_part_limits ==
			     PARTITION_ENFORCE_ALL)) {
				if (part_limits_rc != WAIT_PART_DOWN) {
					best_rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
					break;
				} else {
					best_rc = ESLURM_PARTITION_DOWN;
				}
			}

			if (part_limits_rc == WAIT_NO_REASON) {
				rc = select_nodes(job_ptr, test_only,
						  select_node_bitmap, err_msg,
						  true,
						  SLURMDB_JOB_FLAG_SUBMIT);
			} else {
				/* Limits failed: only test-run, never
				 * allocate, and report the partition as
				 * down if the test would have succeeded */
				rc = select_nodes(job_ptr, true,
						  select_node_bitmap, err_msg,
						  true,
						  SLURMDB_JOB_FLAG_SUBMIT);
				if ((rc == SLURM_SUCCESS) &&
				    (part_limits_rc == WAIT_PART_DOWN))
					rc = ESLURM_PARTITION_DOWN;
			}
			if ((rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
			    (slurmctld_conf.enforce_part_limits ==
			     PARTITION_ENFORCE_ALL)) {
				best_rc = rc;	/* Job can not run */
				break;
			}
			/* Any result other than a transient "busy"/"config"
			 * error means the job could run now */
			if ((rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
			    (rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
			    (rc != ESLURM_RESERVATION_BUSY) &&
			    (rc != ESLURM_NODES_BUSY)) {
				best_rc = rc;	/* Job can run now */
				if ((slurmctld_conf.enforce_part_limits ==
				     PARTITION_ENFORCE_ANY) ||
				    (slurmctld_conf.enforce_part_limits ==
				     PARTITION_ENFORCE_NONE) ||
				    (!test_only &&
				     (part_limits_rc == WAIT_NO_REASON))) {
					break;
				}
			}
			if (((rc == ESLURM_NODES_BUSY) ||
			     (rc == ESLURM_RESERVATION_BUSY)) &&
			    (best_rc == -1) &&
			    ((slurmctld_conf.enforce_part_limits ==
			      PARTITION_ENFORCE_ANY) ||
			     (slurmctld_conf.enforce_part_limits ==
			      PARTITION_ENFORCE_NONE))) {
				if (test_only)
					break;
				best_rc = rc;	/* Keep looking for partition
						 * where job can start now */
			}
			if ((job_ptr->preempt_in_progress) &&
			    (rc != ESLURM_NODES_BUSY)) {
				/* Already started preempting jobs, don't
				 * consider starting this job in another
				 * partition as we iterator over others. */
				test_only = true;
			}
		}
		list_iterator_destroy(iter);
		if (best_rc != -1)
			rc = best_rc;
		else if (part_limits_rc == WAIT_PART_DOWN)
			rc = ESLURM_PARTITION_DOWN;
	} else {
		/* Single-partition job: one limits check, one attempt */
		part_limits_rc = job_limits_check(&job_ptr, false);
		if (part_limits_rc == WAIT_NO_REASON) {
			rc = select_nodes(job_ptr, test_only,
					  select_node_bitmap, err_msg, true,
					  SLURMDB_JOB_FLAG_SUBMIT);
		} else if (part_limits_rc == WAIT_PART_DOWN) {
			rc = select_nodes(job_ptr, true,
					  select_node_bitmap, err_msg, true,
					  SLURMDB_JOB_FLAG_SUBMIT);
			if (rc == SLURM_SUCCESS)
				rc = ESLURM_PARTITION_DOWN;
		}
	}

	/* Map the selection result to the job's pending-state reason */
	if (rc == ESLURM_NODES_BUSY)
		job_ptr->state_reason = WAIT_RESOURCES;
	else if ((rc == ESLURM_RESERVATION_BUSY) ||
		 (rc == ESLURM_RESERVATION_NOT_USABLE))
		job_ptr->state_reason = WAIT_RESERVATION;
	else if (rc == ESLURM_JOB_HELD)
		/* Do not reset the state_reason field here. select_nodes()
		 * already set the state_reason field, and this error code
		 * does not distinguish between user and admin holds. */
		;
	else if (rc == ESLURM_NODE_NOT_AVAIL)
		job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
	else if (rc == ESLURM_QOS_THRES)
		job_ptr->state_reason = WAIT_QOS_THRES;
	else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)
		job_ptr->state_reason = WAIT_PART_CONFIG;
	else if (rc == ESLURM_POWER_NOT_AVAIL)
		job_ptr->state_reason = WAIT_POWER_NOT_AVAIL;
	else if (rc == ESLURM_BURST_BUFFER_WAIT)
		job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE;
	else if (rc == ESLURM_POWER_RESERVED)
		job_ptr->state_reason = WAIT_POWER_RESERVED;
	else if (rc == ESLURM_PARTITION_DOWN)
		job_ptr->state_reason = WAIT_PART_DOWN;
	else if (rc == ESLURM_INVALID_QOS)
		job_ptr->state_reason = FAIL_QOS;
	else if (rc == ESLURM_INVALID_ACCOUNT)
		job_ptr->state_reason = FAIL_ACCOUNT;

	/* Restore the original node availability bitmap */
	FREE_NULL_BITMAP(avail_node_bitmap);
	avail_node_bitmap = save_avail_node_bitmap;

	return rc;
}
4948
_has_deadline(job_record_t * job_ptr)4949 static inline bool _has_deadline(job_record_t *job_ptr)
4950 {
4951 if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
4952 queue_job_scheduler();
4953 return true;
4954 }
4955 return false;
4956 }
4957
4958 /*
4959 * job_allocate - create job_records for the supplied job specification and
4960 * allocate nodes for it.
4961 * IN job_specs - job specifications
4962 * IN immediate - if set then either initiate the job immediately or fail
4963 * IN will_run - don't initiate the job if set, just test if it could run
4964 * now or later
4965 * OUT resp - will run response (includes start location, time, etc.)
4966 * IN allocate - resource allocation request only if set, batch job if zero
 * IN submit_uid - uid of user issuing the request
4968 * OUT job_pptr - set to pointer to job record
4969 * OUT err_msg - Custom error message to the user, caller to xfree results
4970 * IN protocol_version - version of the code the caller is using
4971 * RET 0 or an error code. If the job would only be able to execute with
4972 * some change in partition configuration then
4973 * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
4974 * globals: job_list - pointer to global job list
4975 * list_part - global list of partition info
4976 * default_part_loc - pointer to default partition
4977 */
extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
			int will_run, will_run_response_msg_t **resp,
			int allocate, uid_t submit_uid,
			job_record_t **job_pptr, char **err_msg,
			uint16_t protocol_version)
{
	static time_t sched_update = 0;
	static int defer_sched = 0;
	char *sched_params, *tmp_ptr;
	int error_code, i;
	bool no_alloc, top_prio, test_only, too_fragmented, independent;
	job_record_t *job_ptr;
	time_t now = time(NULL);
	bool held_user = false;

	xassert(verify_lock(CONF_LOCK, READ_LOCK));
	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
	xassert(verify_lock(PART_LOCK, READ_LOCK));

	/* Re-parse SchedulerParameters only when the config changed */
	if (sched_update != slurmctld_conf.last_update) {
		sched_update = slurmctld_conf.last_update;
		sched_params = slurm_get_sched_params();
		if (xstrcasestr(sched_params, "defer"))
			defer_sched = 1;
		else
			defer_sched = 0;
		if ((tmp_ptr = xstrcasestr(sched_params, "delay_boot="))) {
			char *tmp_comma;
			/* Temporarily terminate at the next comma so
			 * time_str2secs() sees only this option's value */
			if ((tmp_comma = xstrstr(tmp_ptr, ",")))
				*tmp_comma = '\0';
			i = time_str2secs(tmp_ptr + 11);
			if (i != NO_VAL)
				delay_boot = i;
			if (tmp_comma)
				*tmp_comma = ',';
		}
		bf_min_age_reserve = 0;
		if ((tmp_ptr = xstrcasestr(sched_params,
					   "bf_min_age_reserve="))) {
			int min_age = atoi(tmp_ptr + 19);
			if (min_age > 0)
				bf_min_age_reserve = min_age;
		}

		if (xstrcasestr(sched_params, "allow_zero_lic"))
			validate_cfgd_licenses = false;

		xfree(sched_params);
	}

	/* Count how many job records this request would create (a job
	 * array creates one per task) and enforce MaxJobCount */
	if (job_specs->array_bitmap)
		i = bit_set_count(job_specs->array_bitmap);
	else
		i = 1;

	if ((job_count + i) >= slurmctld_conf.max_job_cnt) {
		error("%s: MaxJobCount limit from slurm.conf reached (%u)",
		      __func__, slurmctld_conf.max_job_cnt);
		return EAGAIN;
	}

	error_code = _job_create(job_specs, allocate, will_run,
				 &job_ptr, submit_uid, err_msg,
				 protocol_version);
	*job_pptr = job_ptr;
	if (error_code) {
		if (job_ptr && (immediate || will_run)) {
			/* this should never really happen here */
			job_ptr->job_state = JOB_FAILED;
			job_ptr->exit_code = 1;
			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
			xfree(job_ptr->state_desc);
			job_ptr->start_time = job_ptr->end_time = now;
			job_completion_logger(job_ptr, false);
			error("%s: setting %pJ to \"%s\"",
			      __func__, job_ptr,
			      job_reason_string(job_ptr->state_reason));
		}
		return error_code;
	}
	xassert(job_ptr);
	/* Array jobs are never treated as independent at submit time */
	if (job_specs->array_bitmap)
		independent = false;
	else
		independent = job_independent(job_ptr);
	/*
	 * priority needs to be calculated after this since we set a
	 * begin time in job_independent and that lets us know if the
	 * job is eligible.
	 */
	if (job_ptr->priority == NO_VAL)
		set_job_prio(job_ptr);

	/* Remember a user hold so it can be restored after
	 * _select_nodes_parts() overwrites state_reason below */
	if (job_ptr->state_reason == WAIT_HELD_USER)
		held_user = true;

	if (independent &&
	    (license_job_test(job_ptr, time(NULL), true) != SLURM_SUCCESS))
		independent = false;

	/* Avoid resource fragmentation if important */
	if ((submit_uid || (job_specs->req_nodes == NULL)) &&
	    independent && job_is_completing(NULL))
		too_fragmented = true;	/* Don't pick nodes for job now */
	/*
	 * FIXME: Ideally we only want to refuse the request if the
	 * required node list is insufficient to satisfy the job's
	 * processor or node count requirements, but the overhead is
	 * rather high to do that right here.  We let requests from
	 * user root proceed if a node list is specified, for
	 * meta-schedulers (e.g. Maui, Moab, etc.).
	 */
	else
		too_fragmented = false;

	if (independent && (!too_fragmented) && !defer_sched)
		top_prio = _top_priority(job_ptr, job_specs->het_job_offset);
	else
		top_prio = true;	/* don't bother testing,
					 * it is not runable anyway */

	/* An "immediate" request that cannot start right now is failed
	 * outright with the most specific reason available */
	if (immediate &&
	    (too_fragmented || (!top_prio) || (!independent) || defer_sched)) {
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
		if (!independent) {
			debug2("%s: setting %pJ to \"%s\" due to dependency (%s)",
			       __func__, job_ptr,
			       job_reason_string(job_ptr->state_reason),
			       slurm_strerror(ESLURM_DEPENDENCY));
			return ESLURM_DEPENDENCY;
		}
		else if (too_fragmented) {
			debug2("%s: setting %pJ to \"%s\" due to fragmentation (%s)",
			       __func__, job_ptr,
			       job_reason_string(job_ptr->state_reason),
			       slurm_strerror(ESLURM_FRAGMENTATION));
			return ESLURM_FRAGMENTATION;
		}
		else if (!top_prio) {
			debug2("%s: setting %pJ to \"%s\" because it's not top priority (%s)",
			       __func__, job_ptr,
			       job_reason_string(job_ptr->state_reason),
			       slurm_strerror(ESLURM_NOT_TOP_PRIORITY));
			return ESLURM_NOT_TOP_PRIORITY;
		} else {
			job_ptr->state_reason = FAIL_DEFER;
			debug2("%s: setting %pJ to \"%s\" due to SchedulerParameters=defer (%s)",
			       __func__, job_ptr,
			       job_reason_string(job_ptr->state_reason),
			       slurm_strerror(ESLURM_DEFER));
			return ESLURM_DEFER;
		}
	}

	/* "will run" query with a response buffer: test, report, and
	 * destroy the temporary job record */
	if (will_run && resp) {
		job_desc_msg_t job_desc_msg;
		int rc;
		slurm_init_job_desc_msg(&job_desc_msg);
		job_desc_msg.job_id = job_ptr->job_id;
		rc = job_start_data(&job_desc_msg, resp);
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->start_time = job_ptr->end_time = now;
		purge_job_record(job_ptr->job_id);
		return rc;
	}

	/*
	 * fed jobs need to go to the siblings first so don't attempt to
	 * schedule the job now.
	 */
	test_only = will_run || job_ptr->deadline || (allocate == 0) ||
		    job_ptr->fed_details;

	no_alloc = test_only || too_fragmented || _has_deadline(job_ptr) ||
		   (!top_prio) || (!independent) || !avail_front_end(job_ptr) ||
		   (job_specs->het_job_offset != NO_VAL) || defer_sched;

	/* Burst buffer stage-in must also be complete before allocating */
	no_alloc = no_alloc || (bb_g_job_test_stage_in(job_ptr, no_alloc) != 1);

	error_code = _select_nodes_parts(job_ptr, no_alloc, NULL, err_msg);
	if (!test_only) {
		last_job_update = now;
	}

	/* Restore a user hold clobbered by _select_nodes_parts() */
	if (held_user)
		job_ptr->state_reason = WAIT_HELD_USER;
	/*
	 * Moved this (_create_job_array) here to handle when a job
	 * array is submitted since we
	 * want to know the array task count when we check the job against
	 * QOS/Assoc limits
	 */
	_create_job_array(job_ptr, job_specs);

	slurmctld_diag_stats.jobs_submitted +=
		(job_ptr->array_recs && job_ptr->array_recs->task_cnt) ?
		job_ptr->array_recs->task_cnt : 1;

	acct_policy_add_job_submit(job_ptr);

	/* Classify the selection result: hard reject, transient (job
	 * stays queued), fundamental flaw, or success */
	if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
	    (slurmctld_conf.enforce_part_limits != PARTITION_ENFORCE_NONE))
		;	/* Reject job submission */
	else if ((error_code == ESLURM_NODES_BUSY) ||
		 (error_code == ESLURM_RESERVATION_BUSY) ||
		 (error_code == ESLURM_JOB_HELD) ||
		 (error_code == ESLURM_NODE_NOT_AVAIL) ||
		 (error_code == ESLURM_QOS_THRES) ||
		 (error_code == ESLURM_ACCOUNTING_POLICY) ||
		 (error_code == ESLURM_RESERVATION_NOT_USABLE) ||
		 (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) ||
		 (error_code == ESLURM_POWER_NOT_AVAIL) ||
		 (error_code == ESLURM_BURST_BUFFER_WAIT) ||
		 (error_code == ESLURM_POWER_RESERVED) ||
		 (error_code == ESLURM_PARTITION_DOWN)) {
		/* Not fatal error, but job can't be scheduled right now */
		if (immediate) {
			job_ptr->job_state  = JOB_FAILED;
			job_ptr->exit_code  = 1;
			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
			xfree(job_ptr->state_desc);
			job_ptr->start_time = job_ptr->end_time = now;
			job_completion_logger(job_ptr, false);
			debug2("%s: setting %pJ to \"%s\" because it cannot be immediately allocated (%s)",
			       __func__, job_ptr,
			       job_reason_string(job_ptr->state_reason),
			       slurm_strerror(error_code));
		} else {	/* job remains queued */
			if ((error_code == ESLURM_NODES_BUSY) ||
			    (error_code == ESLURM_BURST_BUFFER_WAIT) ||
			    (error_code == ESLURM_RESERVATION_BUSY) ||
			    (error_code == ESLURM_ACCOUNTING_POLICY) ||
			    ((error_code == ESLURM_PARTITION_DOWN) &&
			     (job_ptr->batch_flag))) {
				error_code = SLURM_SUCCESS;
			}
		}
		return error_code;
	}

	if (error_code) {	/* fundamental flaw in job request */
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
		xfree(job_ptr->state_desc);
		job_ptr->start_time = job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
		debug2("%s: setting %pJ to \"%s\" due to a flaw in the job request (%s)",
		       __func__, job_ptr,
		       job_reason_string(job_ptr->state_reason),
		       slurm_strerror(error_code));
		return error_code;
	}

	if (will_run) {		/* job would run, flag job destruction */
		job_ptr->job_state  = JOB_FAILED;
		job_ptr->exit_code  = 1;
		job_ptr->start_time = job_ptr->end_time = now;
		purge_job_record(job_ptr->job_id);
	} else if (!with_slurmdbd)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	if (!will_run) {
		sched_debug2("%pJ allocated resources: NodeList=%s",
			     job_ptr, job_ptr->nodes);
		rebuild_job_part_list(job_ptr);
	}

	return SLURM_SUCCESS;
}
5255
5256 /*
 * _job_fail - terminate a job due to initiation failure
5258 * IN job_ptr - Pointer to job to be killed
5259 * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
5260 * RET 0 on success, otherwise ESLURM error code
5261 */
static int _job_fail(job_record_t *job_ptr, uint32_t job_state)
{
	time_t now = time(NULL);
	bool was_suspended = false;

	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (IS_JOB_SUSPENDED(job_ptr)) {
		uint32_t saved_state = job_ptr->job_state;
		/*
		 * We can't have the job marked as suspended when we call
		 * the accounting layer, so flip the state around the call.
		 */
		job_ptr->job_state = JOB_CANCELLED;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = saved_state;
		was_suspended = true;
	}

	if (!IS_JOB_CONFIGURING(job_ptr) && !IS_JOB_RUNNING(job_ptr) &&
	    !was_suspended) {
		/* All other states: nothing to kill */
		verbose("job_fail: %pJ can't be killed from state=%s",
			job_ptr, job_state_string(job_ptr->job_state));
		return ESLURM_TRANSITION_STATE_NO_UPDATE;
	}

	/* No need to signal steps, deallocate kills them */
	job_ptr->time_last_active = now;
	if (was_suspended) {
		job_ptr->end_time = job_ptr->suspend_time;
		job_ptr->tot_sus_time +=
			difftime(now, job_ptr->suspend_time);
	} else
		job_ptr->end_time = now;
	last_job_update = now;
	job_ptr->job_state = job_state | JOB_COMPLETING;
	job_ptr->exit_code = 1;
	job_ptr->state_reason = FAIL_LAUNCH;
	xfree(job_ptr->state_desc);
	job_completion_logger(job_ptr, false);
	if (job_ptr->node_bitmap) {
		build_cg_bitmap(job_ptr);
		deallocate_nodes(job_ptr, false, was_suspended, false);
	}
	return SLURM_SUCCESS;
}
5310
5311 /*
5312 * job_fail - terminate a job due to initiation failure
5313 * IN job_id - ID of the job to be killed
5314 * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
5315 * RET 0 on success, otherwise ESLURM error code
5316 */
extern int job_fail(uint32_t job_id, uint32_t job_state)
{
	job_record_t *job_ptr, *leader, *component;
	ListIterator it;
	int rc = SLURM_SUCCESS;

	job_ptr = find_job_record(job_id);
	if (!job_ptr) {
		error("job_fail: invalid JobId=%u", job_id);
		return ESLURM_INVALID_JOB_ID;
	}

	/* Not part of a heterogeneous job: fail it directly */
	if (job_ptr->het_job_id == 0)
		return _job_fail(job_ptr, job_state);

	/* Heterogeneous job: fail every component via the leader's list,
	 * falling back to failing just this record if the leader or its
	 * component list is missing. */
	leader = find_job_record(job_ptr->het_job_id);
	if (!leader) {
		error("%s: Hetjob leader %pJ not found",
		      __func__, job_ptr);
		return _job_fail(job_ptr, job_state);
	}
	if (!leader->het_job_list) {
		error("%s: Hetjob leader %pJ job list is NULL",
		      __func__, job_ptr);
		return _job_fail(job_ptr, job_state);
	}

	it = list_iterator_create(leader->het_job_list);
	while ((component = list_next(it))) {
		int comp_rc;
		if (leader->het_job_id != component->het_job_id) {
			error("%s: Bad het_job_list for %pJ",
			      __func__, leader);
			continue;
		}
		comp_rc = _job_fail(component, job_state);
		if (comp_rc != SLURM_SUCCESS)
			rc = comp_rc;	/* report the last failure seen */
	}
	list_iterator_destroy(it);

	return rc;
}
5359
5360 /*
5361 * Signal a job based upon job pointer.
5362 * Authentication and authorization checks must be performed before calling.
5363 */
job_signal(job_record_t * job_ptr,uint16_t signal,uint16_t flags,uid_t uid,bool preempt)5364 extern int job_signal(job_record_t *job_ptr, uint16_t signal,
5365 uint16_t flags, uid_t uid, bool preempt)
5366 {
5367 uint16_t job_term_state;
5368 time_t now = time(NULL);
5369
5370 trace_job(job_ptr, __func__, "enter");
5371
5372 if (IS_JOB_STAGE_OUT(job_ptr) && (flags & KILL_HURRY)) {
5373 job_ptr->bit_flags |= JOB_KILL_HURRY;
5374 return bb_g_job_cancel(job_ptr);
5375 }
5376
5377 if (IS_JOB_FINISHED(job_ptr))
5378 return ESLURM_ALREADY_DONE;
5379
5380 /*
5381 * If is origin job then cancel siblings -- if they exist.
5382 * origin job = because it knows where the siblings are
5383 * If the job is running locally then just do the normal signaling
5384 */
5385 if (!(flags & KILL_NO_SIBS) && !IS_JOB_RUNNING(job_ptr) &&
5386 job_ptr->fed_details && fed_mgr_fed_rec) {
5387 uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
5388 slurmdb_cluster_rec_t *origin =
5389 fed_mgr_get_cluster_by_id(origin_id);
5390
5391 if (origin && (origin == fed_mgr_cluster_rec) &&
5392 fed_mgr_job_started_on_sib(job_ptr)) {
5393 /*
5394 * If the job is running on a remote cluster then wait
5395 * for the job to report back that it's completed,
5396 * otherwise just signal the pending siblings and itself
5397 * (by not returning).
5398 */
5399 return fed_mgr_job_cancel(job_ptr, signal, flags, uid,
5400 false);
5401 } else if (origin && (origin == fed_mgr_cluster_rec)) {
5402 /* cancel origin job and revoke sibling jobs */
5403 fed_mgr_job_revoke_sibs(job_ptr);
5404 fed_mgr_remove_remote_dependencies(job_ptr);
5405 } else if (!origin ||
5406 !origin->fed.send ||
5407 (((slurm_persist_conn_t *)origin->fed.send)->fd
5408 == -1)) {
5409 /*
5410 * The origin is down just signal all of the viable
5411 * sibling jobs
5412 */
5413 fed_mgr_job_cancel(job_ptr, signal, flags, uid, true);
5414 }
5415 }
5416
5417 /* let node select plugin do any state-dependent signaling actions */
5418 select_g_job_signal(job_ptr, signal);
5419 last_job_update = now;
5420
5421 /* save user ID of the one who requested the job be cancelled */
5422 if (signal == SIGKILL)
5423 job_ptr->requid = uid;
5424 if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) &&
5425 (signal == SIGKILL)) {
5426 /* Prevent job requeue, otherwise preserve state */
5427 job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
5428
5429 /* build_cg_bitmap() not needed, job already completing */
5430 verbose("%s: %u of requeuing %pJ successful",
5431 __func__, signal, job_ptr);
5432 return SLURM_SUCCESS;
5433 }
5434
5435 if (flags & KILL_HURRY)
5436 job_ptr->bit_flags |= JOB_KILL_HURRY;
5437
5438 if (IS_JOB_CONFIGURING(job_ptr) && (signal == SIGKILL)) {
5439 last_job_update = now;
5440 job_ptr->end_time = now;
5441 job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
5442 if (flags & KILL_FED_REQUEUE)
5443 job_ptr->job_state |= JOB_REQUEUE;
5444 build_cg_bitmap(job_ptr);
5445 job_completion_logger(job_ptr, false);
5446 deallocate_nodes(job_ptr, false, false, false);
5447 if (flags & KILL_FED_REQUEUE) {
5448 job_ptr->job_state &= (~JOB_REQUEUE);
5449 }
5450 verbose("%s: %u of configuring %pJ successful",
5451 __func__, signal, job_ptr);
5452 return SLURM_SUCCESS;
5453 }
5454
5455 if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) {
5456 job_ptr->job_state = JOB_CANCELLED;
5457 if (flags & KILL_FED_REQUEUE)
5458 job_ptr->job_state |= JOB_REQUEUE;
5459 job_ptr->start_time = now;
5460 job_ptr->end_time = now;
5461 srun_allocate_abort(job_ptr);
5462 job_completion_logger(job_ptr, false);
5463 if (flags & KILL_FED_REQUEUE) {
5464 job_ptr->job_state &= (~JOB_REQUEUE);
5465 }
5466 /*
5467 * Send back a response to the origin cluster, in other cases
5468 * where the job is running the job will send back a response
5469 * after the job is is completed. This can happen when the
5470 * pending origin job is put into a hold state and the siblings
5471 * are removed or when the job is canceled from the origin.
5472 */
5473 fed_mgr_job_complete(job_ptr, 0, now);
5474 verbose("%s: %u of pending %pJ successful",
5475 __func__, signal, job_ptr);
5476 return SLURM_SUCCESS;
5477 }
5478
5479 if (preempt)
5480 job_term_state = JOB_PREEMPTED;
5481 else
5482 job_term_state = JOB_CANCELLED;
5483 if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) {
5484 last_job_update = now;
5485 job_ptr->end_time = job_ptr->suspend_time;
5486 job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time);
5487 job_ptr->job_state = job_term_state | JOB_COMPLETING;
5488 if (flags & KILL_FED_REQUEUE)
5489 job_ptr->job_state |= JOB_REQUEUE;
5490 build_cg_bitmap(job_ptr);
5491 jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
5492 job_completion_logger(job_ptr, false);
5493 if (flags & KILL_FED_REQUEUE)
5494 job_ptr->job_state &= (~JOB_REQUEUE);
5495 deallocate_nodes(job_ptr, false, true, preempt);
5496 verbose("%s: %u of suspended %pJ successful",
5497 __func__, signal, job_ptr);
5498 return SLURM_SUCCESS;
5499 }
5500
5501 if (IS_JOB_RUNNING(job_ptr)) {
5502
5503 if ((signal == SIGSTOP) || (signal == SIGCONT)) {
5504 if (IS_JOB_SIGNALING(job_ptr)) {
5505 verbose("%s: %u not send to %pJ 0x%x",
5506 __func__, signal, job_ptr,
5507 job_ptr->job_state);
5508 return ESLURM_TRANSITION_STATE_NO_UPDATE;
5509 }
5510 job_ptr->job_state |= JOB_SIGNALING;
5511 }
5512
5513 if ((signal == SIGKILL)
5514 && !(flags & KILL_STEPS_ONLY)
5515 && !(flags & KILL_JOB_BATCH)) {
5516 /* No need to signal steps, deallocate kills them
5517 */
5518 job_ptr->time_last_active = now;
5519 job_ptr->end_time = now;
5520 last_job_update = now;
5521 job_ptr->job_state = job_term_state | JOB_COMPLETING;
5522 if (flags & KILL_FED_REQUEUE)
5523 job_ptr->job_state |= JOB_REQUEUE;
5524 build_cg_bitmap(job_ptr);
5525 job_completion_logger(job_ptr, false);
5526 deallocate_nodes(job_ptr, false, false, preempt);
5527 if (flags & KILL_FED_REQUEUE)
5528 job_ptr->job_state &= (~JOB_REQUEUE);
5529 } else if (job_ptr->batch_flag && (flags & KILL_JOB_BATCH)) {
5530 _signal_batch_job(job_ptr, signal, flags);
5531 } else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) {
5532 if ((signal == SIGSTOP) || (signal == SIGCONT))
5533 job_ptr->job_state &= ~JOB_SIGNALING;
5534 return ESLURM_JOB_SCRIPT_MISSING;
5535 } else {
5536 _signal_job(job_ptr, signal, flags);
5537 }
5538 verbose("%s: %u of running %pJ successful 0x%x",
5539 __func__, signal, job_ptr, job_ptr->job_state);
5540 return SLURM_SUCCESS;
5541 }
5542
5543 verbose("%s: %pJ can't be sent signal %u from state=%s",
5544 __func__, job_ptr, signal,
5545 job_state_string(job_ptr->job_state));
5546
5547 trace_job(job_ptr, __func__, "return");
5548
5549 return ESLURM_TRANSITION_STATE_NO_UPDATE;
5550 }
5551
5552 /*
5553 * job_signal_id - signal the specified job
5554 * IN job_id - id of the job to be signaled
5555 * IN signal - signal to send, SIGKILL == cancel the job
5556 * IN flags - see KILL_JOB_* flags in slurm.h
5557 * IN uid - uid of requesting user
5558 * IN preempt - true if job being preempted
5559 * RET 0 on success, otherwise ESLURM error code
5560 */
job_signal_id(uint32_t job_id,uint16_t signal,uint16_t flags,uid_t uid,bool preempt)5561 extern int job_signal_id(uint32_t job_id, uint16_t signal, uint16_t flags,
5562 uid_t uid, bool preempt)
5563 {
5564 job_record_t *job_ptr;
5565
5566 job_ptr = find_job_record(job_id);
5567 if (job_ptr == NULL) {
5568 info("%s: invalid JobId=%u", __func__, job_id);
5569 return ESLURM_INVALID_JOB_ID;
5570 }
5571
5572 if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5573 !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5574 job_ptr->account)) {
5575 error("Security violation, JOB_CANCEL RPC for %pJ from uid %u",
5576 job_ptr, uid);
5577 return ESLURM_ACCESS_DENIED;
5578 }
5579
5580 return job_signal(job_ptr, signal, flags, uid, preempt);
5581 }
5582
5583 /* Signal all components of a hetjob */
het_job_signal(job_record_t * het_job_leader,uint16_t signal,uint16_t flags,uid_t uid,bool preempt)5584 extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal,
5585 uint16_t flags, uid_t uid, bool preempt)
5586 {
5587 ListIterator iter;
5588 int rc = SLURM_SUCCESS, rc1;
5589 job_record_t *het_job;
5590
5591 iter = list_iterator_create(het_job_leader->het_job_list);
5592 while ((het_job = list_next(iter))) {
5593 if (het_job_leader->het_job_id != het_job->het_job_id) {
5594 error("%s: Bad het_job_list for %pJ",
5595 __func__, het_job_leader);
5596 continue;
5597 }
5598 rc1 = job_signal(het_job, signal, flags, uid, preempt);
5599 if (rc1 != SLURM_SUCCESS)
5600 rc = rc1;
5601 }
5602 list_iterator_destroy(iter);
5603
5604 return rc;
5605 }
5606
_get_whole_hetjob(void)5607 static bool _get_whole_hetjob(void)
5608 {
5609 static time_t sched_update = 0;
5610 static bool whole_hetjob = false;
5611 char *sched_params = NULL;
5612
5613 if (sched_update != slurmctld_conf.last_update) {
5614 sched_update = slurmctld_conf.last_update;
5615 sched_params = slurm_get_sched_params();
5616 if (xstrcasestr(sched_params, "whole_hetjob") ||
5617 xstrcasestr(sched_params, "whole_pack"))
5618 whole_hetjob = true;
5619 else
5620 whole_hetjob = false;
5621 xfree(sched_params);
5622 }
5623
5624 return whole_hetjob;
5625 }
5626
5627 /*
5628 * job_str_signal - signal the specified job
5629 * IN job_id_str - id of the job to be signaled, valid formats include "#"
5630 * "#_#" and "#_[expr]"
5631 * IN signal - signal to send, SIGKILL == cancel the job
5632 * IN flags - see KILL_JOB_* flags in slurm.h
5633 * IN uid - uid of requesting user
5634 * IN preempt - true if job being preempted
5635 * RET 0 on success, otherwise ESLURM error code
5636 */
job_str_signal(char * job_id_str,uint16_t signal,uint16_t flags,uid_t uid,bool preempt)5637 extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags,
5638 uid_t uid, bool preempt)
5639 {
5640 job_record_t *job_ptr;
5641 uint32_t job_id;
5642 time_t now = time(NULL);
5643 char *end_ptr = NULL, *tok, *tmp;
5644 long int long_id;
5645 bitstr_t *array_bitmap = NULL;
5646 bool valid = true;
5647 int32_t i, i_first, i_last;
5648 int rc = SLURM_SUCCESS, rc2, len;
5649
5650 if (max_array_size == NO_VAL) {
5651 max_array_size = slurmctld_conf.max_array_sz;
5652 }
5653
5654 long_id = strtol(job_id_str, &end_ptr, 10);
5655 if ((long_id <= 0) || (long_id == LONG_MAX) ||
5656 ((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
5657 (end_ptr[0] != '+'))) {
5658 info("%s(1): invalid JobId=%s", __func__, job_id_str);
5659 return ESLURM_INVALID_JOB_ID;
5660 }
5661 if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
5662 end_ptr += 2; /* Defaults to full job array */
5663
5664 if (end_ptr[0] == '+') { /* Signal hetjob element */
5665 job_id = (uint32_t) long_id;
5666 long_id = strtol(end_ptr + 1, &end_ptr, 10);
5667 if ((long_id < 0) || (long_id == LONG_MAX) ||
5668 (end_ptr[0] != '\0')) {
5669 info("%s(2): invalid JobId=%s", __func__, job_id_str);
5670 return ESLURM_INVALID_JOB_ID;
5671 }
5672 job_ptr = find_het_job_record(job_id, (uint32_t) long_id);
5673 if (!job_ptr)
5674 return ESLURM_ALREADY_DONE;
5675 if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5676 !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5677 job_ptr->account)) {
5678 error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
5679 job_ptr, uid);
5680 return ESLURM_ACCESS_DENIED;
5681 }
5682 if (IS_JOB_PENDING(job_ptr))
5683 return ESLURM_NOT_WHOLE_HET_JOB;
5684 return job_signal(job_ptr, signal, flags, uid,preempt);
5685 }
5686
5687 last_job_update = now;
5688 job_id = (uint32_t) long_id;
5689 if (end_ptr[0] == '\0') { /* Single job (or full job array) */
5690 int jobs_done = 0, jobs_signaled = 0;
5691 job_record_t *job_ptr_done = NULL;
5692 job_ptr = find_job_record(job_id);
5693 if (job_ptr && (job_ptr->user_id != uid) &&
5694 !validate_operator(uid) &&
5695 !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5696 job_ptr->account)) {
5697 error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
5698 job_ptr, uid);
5699 return ESLURM_ACCESS_DENIED;
5700 }
5701 if (job_ptr && job_ptr->het_job_list) { /* Hetjob leader */
5702 return het_job_signal(job_ptr, signal, flags, uid,
5703 preempt);
5704 }
5705 if (job_ptr && job_ptr->het_job_id && _get_whole_hetjob()) {
5706 job_record_t *het_job_leader;
5707 het_job_leader = find_job_record(job_ptr->het_job_id);
5708 if (het_job_leader && het_job_leader->het_job_list) {
5709 return het_job_signal(het_job_leader, signal,
5710 flags, uid, preempt);
5711 }
5712 error("%s: Hetjob leader %pJ not found",
5713 __func__, job_ptr);
5714 }
5715 if (job_ptr && job_ptr->het_job_id && IS_JOB_PENDING(job_ptr))
5716 return ESLURM_NOT_WHOLE_HET_JOB;/* Hetjob child */
5717 if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
5718 (job_ptr->array_recs == NULL)) {
5719 /* This is a regular job, not a job array */
5720 return job_signal_id(job_id, signal, flags, uid, preempt);
5721 }
5722
5723 /*
5724 * This will kill the meta record that holds all
5725 * pending jobs. We want to kill this first so we
5726 * don't start jobs just to kill them as we are
5727 * killing other elements of the array.
5728 */
5729 if (job_ptr && job_ptr->array_recs) {
5730 /* This is a job array */
5731 job_ptr_done = job_ptr;
5732 rc = job_signal(job_ptr, signal, flags, uid, preempt);
5733 if (rc == ESLURM_ACCESS_DENIED)
5734 return rc;
5735 jobs_signaled++;
5736 if (rc == ESLURM_ALREADY_DONE) {
5737 jobs_done++;
5738 rc = SLURM_SUCCESS;
5739 }
5740 }
5741
5742 /* Signal all tasks of this job array */
5743 job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
5744 if (!job_ptr && !job_ptr_done) {
5745 info("%s(3): invalid JobId=%u", __func__, job_id);
5746 return ESLURM_INVALID_JOB_ID;
5747 }
5748 while (job_ptr) {
5749 if (job_ptr->array_job_id == job_id)
5750 break;
5751 job_ptr = job_ptr->job_array_next_j;
5752 }
5753 while (job_ptr) {
5754 if ((job_ptr->array_job_id == job_id) &&
5755 (job_ptr != job_ptr_done)) {
5756 rc2 = job_signal(job_ptr, signal, flags, uid,
5757 preempt);
5758 jobs_signaled++;
5759 if (rc2 == ESLURM_ALREADY_DONE) {
5760 jobs_done++;
5761 } else {
5762 rc = MAX(rc, rc2);
5763 }
5764 }
5765 job_ptr = job_ptr->job_array_next_j;
5766 }
5767 if ((rc == SLURM_SUCCESS) && (jobs_done == jobs_signaled))
5768 return ESLURM_ALREADY_DONE;
5769 return rc;
5770
5771 }
5772
5773 array_bitmap = bit_alloc(max_array_size);
5774 tmp = xstrdup(end_ptr + 1);
5775 tok = strtok_r(tmp, ",", &end_ptr);
5776 while (tok && valid) {
5777 valid = _parse_array_tok(tok, array_bitmap,
5778 max_array_size);
5779 tok = strtok_r(NULL, ",", &end_ptr);
5780 }
5781 xfree(tmp);
5782 if (valid) {
5783 i_last = bit_fls(array_bitmap);
5784 if (i_last < 0)
5785 valid = false;
5786 }
5787 if (!valid) {
5788 info("%s(4): invalid JobId=%s", __func__, job_id_str);
5789 rc = ESLURM_INVALID_JOB_ID;
5790 goto endit;
5791 }
5792
5793 /* Find some job record and validate the user signaling the job */
5794 job_ptr = find_job_record(job_id);
5795 if (job_ptr == NULL) {
5796 job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
5797 while (job_ptr) {
5798 if (job_ptr->array_job_id == job_id)
5799 break;
5800 job_ptr = job_ptr->job_array_next_j;
5801 }
5802 }
5803 if ((job_ptr == NULL) ||
5804 ((job_ptr->array_task_id == NO_VAL) &&
5805 (job_ptr->array_recs == NULL))) {
5806 info("%s(5): invalid JobId=%s", __func__, job_id_str);
5807 rc = ESLURM_INVALID_JOB_ID;
5808 goto endit;
5809 }
5810
5811 if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
5812 !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
5813 job_ptr->account)) {
5814 error("%s: Security violation JOB_CANCEL RPC for %pJ from uid %u",
5815 __func__, job_ptr, uid);
5816 rc = ESLURM_ACCESS_DENIED;
5817 goto endit;
5818 }
5819
5820 if (IS_JOB_PENDING(job_ptr) &&
5821 job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
5822 /* Ensure bitmap sizes match for AND operations */
5823 len = bit_size(job_ptr->array_recs->task_id_bitmap);
5824 i_last++;
5825 if (i_last < len) {
5826 array_bitmap = bit_realloc(array_bitmap, len);
5827 } else {
5828 array_bitmap = bit_realloc(array_bitmap, i_last);
5829 job_ptr->array_recs->task_id_bitmap = bit_realloc(
5830 job_ptr->array_recs->task_id_bitmap, i_last);
5831 }
5832 if (signal == SIGKILL) {
5833 uint32_t orig_task_cnt, new_task_count;
5834 /* task_id_bitmap changes, so we need a copy of it */
5835 bitstr_t *task_id_bitmap_orig =
5836 bit_copy(job_ptr->array_recs->task_id_bitmap);
5837
5838 bit_and_not(job_ptr->array_recs->task_id_bitmap,
5839 array_bitmap);
5840 xfree(job_ptr->array_recs->task_id_str);
5841 orig_task_cnt = job_ptr->array_recs->task_cnt;
5842 new_task_count = bit_set_count(job_ptr->array_recs->
5843 task_id_bitmap);
5844 if (!new_task_count) {
5845 last_job_update = now;
5846 job_ptr->job_state = JOB_CANCELLED;
5847 job_ptr->start_time = now;
5848 job_ptr->end_time = now;
5849 job_ptr->requid = uid;
5850 srun_allocate_abort(job_ptr);
5851 job_completion_logger(job_ptr, false);
5852 /*
5853 * Master job record, even wihtout tasks,
5854 * counts as one job record
5855 */
5856 job_count -= (orig_task_cnt - 1);
5857 } else {
5858 _job_array_comp(job_ptr, false, false);
5859 job_count -= (orig_task_cnt - new_task_count);
5860 /*
5861 * Since we are altering the job array's
5862 * task_cnt we must go alter this count in the
5863 * acct_policy code as if they are finishing
5864 * (accrue_cnt/job_submit etc...).
5865 */
5866 if (job_ptr->array_recs->task_cnt >
5867 new_task_count) {
5868 uint32_t tmp_state = job_ptr->job_state;
5869 job_ptr->job_state = JOB_CANCELLED;
5870
5871 job_ptr->array_recs->task_cnt -=
5872 new_task_count;
5873 acct_policy_remove_job_submit(job_ptr);
5874 job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
5875 job_ptr->job_state = tmp_state;
5876 }
5877 }
5878
5879 /*
5880 * Set the task_cnt here since
5881 * job_completion_logger needs the total
5882 * pending count to handle the acct_policy
5883 * limit for submitted jobs correctly.
5884 */
5885 job_ptr->array_recs->task_cnt = new_task_count;
5886 bit_and_not(array_bitmap, task_id_bitmap_orig);
5887 FREE_NULL_BITMAP(task_id_bitmap_orig);
5888 } else {
5889 bit_and_not(array_bitmap,
5890 job_ptr->array_recs->task_id_bitmap);
5891 rc = ESLURM_TRANSITION_STATE_NO_UPDATE;
5892 }
5893 }
5894
5895 i_first = bit_ffs(array_bitmap);
5896 if (i_first >= 0)
5897 i_last = bit_fls(array_bitmap);
5898 else
5899 i_last = -2;
5900 for (i = i_first; i <= i_last; i++) {
5901 if (!bit_test(array_bitmap, i))
5902 continue;
5903 job_ptr = find_job_array_rec(job_id, i);
5904 if (job_ptr == NULL) {
5905 info("%s(6): invalid JobId=%u_%d",
5906 __func__, job_id, i);
5907 rc = ESLURM_INVALID_JOB_ID;
5908 continue;
5909 }
5910
5911 rc2 = job_signal(job_ptr, signal, flags, uid, preempt);
5912 rc = MAX(rc, rc2);
5913 }
5914 endit:
5915 FREE_NULL_BITMAP(array_bitmap);
5916
5917 return rc;
5918 }
5919
/*
 * Queue an RPC to deliver a signal to the batch script of a job.
 * The request is sent only to the job's batch host.
 */
static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
			      uint16_t flags)
{
	signal_tasks_msg_t *msg;
	agent_arg_t *args;

	xassert(job_ptr);
	xassert(job_ptr->batch_host);

	/* A job with no allocated nodes cannot be signaled */
	if (bit_ffs(job_ptr->node_bitmap) < 0) {
		error("%s: %pJ lacks assigned nodes", __func__, job_ptr);
		return;
	}

	args = xmalloc(sizeof(agent_arg_t));
	args->msg_type = REQUEST_SIGNAL_TASKS;
	args->retry = 1;
	args->node_count = 1;
#ifdef HAVE_FRONT_END
	if (job_ptr->front_end_ptr)
		args->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
#else
	node_record_t *node_ptr;
	if ((node_ptr = find_node_record(job_ptr->batch_host)))
		args->protocol_version = node_ptr->protocol_version;
#endif
	args->hostlist = hostlist_create(job_ptr->batch_host);

	msg = xmalloc(sizeof(signal_tasks_msg_t));
	msg->job_id = job_ptr->job_id;
	/* NO_VAL step id targets the batch script, not a step */
	msg->job_step_id = NO_VAL;
	msg->flags = flags;
	msg->signal = signal;

	args->msg_args = msg;
	agent_queue_request(args);
}
5960
5961 /*
5962 * prolog_complete - note the normal termination of the prolog
5963 * IN job_id - id of the job which completed
5964 * IN prolog_return_code - prolog's return code,
5965 * if set then set job state to FAILED
5966 * RET - 0 on success, otherwise ESLURM error code
5967 * global: job_list - pointer global job list
5968 * last_job_update - time of last job table update
5969 */
prolog_complete(uint32_t job_id,uint32_t prolog_return_code)5970 extern int prolog_complete(uint32_t job_id,
5971 uint32_t prolog_return_code)
5972 {
5973 job_record_t *job_ptr;
5974
5975 job_ptr = find_job_record(job_id);
5976 if (job_ptr == NULL) {
5977 info("prolog_complete: invalid JobId=%u", job_id);
5978 return ESLURM_INVALID_JOB_ID;
5979 }
5980
5981 if (IS_JOB_COMPLETING(job_ptr))
5982 return SLURM_SUCCESS;
5983
5984 if (prolog_return_code)
5985 error("Prolog launch failure, %pJ", job_ptr);
5986
5987 job_ptr->state_reason = WAIT_NO_REASON;
5988
5989 return SLURM_SUCCESS;
5990 }
5991
/*
 * Record the completion of a single job record: determine its terminal
 * state (or requeue it), log completion, and release its allocation.
 * IN job_ptr - job that completed
 * IN uid - user id of user issuing the RPC (recorded in requid when the
 *	job is cancelled or fails due to node failure)
 * IN requeue - job should be run again if possible
 * IN node_fail - true if job terminated due to node failure
 * IN job_return_code - job's wait()-style return code; NO_VAL indicates
 *	cancellation
 * RET SLURM_SUCCESS or ESLURM_ALREADY_DONE
 */
static int _job_complete(job_record_t *job_ptr, uid_t uid, bool requeue,
			 bool node_fail, uint32_t job_return_code)
{
	node_record_t *node_ptr;
	time_t now = time(NULL);
	uint32_t job_comp_flag = 0;
	bool suspended = false;
	int i;
	int use_cloud = false;
	uint16_t over_time_limit;

	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
	xassert(verify_lock(FED_LOCK, READ_LOCK));

	if (IS_JOB_FINISHED(job_ptr)) {
		/* Preserve the first meaningful exit code */
		if (job_ptr->exit_code == 0)
			job_ptr->exit_code = job_return_code;
		return ESLURM_ALREADY_DONE;
	}

	if (IS_JOB_COMPLETING(job_ptr))
		return SLURM_SUCCESS;	/* avoid replay */

	/* Log how the job terminated: OOM, signal, or exit status */
	if ((job_return_code & 0xff) == SIG_OOM) {
		info("%s: %pJ OOM failure", __func__, job_ptr);
	} else if (WIFSIGNALED(job_return_code)) {
		info("%s: %pJ WTERMSIG %d",
		     __func__, job_ptr, WTERMSIG(job_return_code));
	} else if (WIFEXITED(job_return_code)) {
		info("%s: %pJ WEXITSTATUS %d",
		     __func__, job_ptr, WEXITSTATUS(job_return_code));
	}

	if (IS_JOB_RUNNING(job_ptr))
		job_comp_flag = JOB_COMPLETING;
	else if (IS_JOB_PENDING(job_ptr)) {
		/* Completion of a pending job == cancellation */
		job_return_code = NO_VAL;
		job_ptr->start_time = now;
		fed_mgr_job_revoke_sibs(job_ptr);
	}

	if ((job_return_code == NO_VAL) &&
	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) {
		if (node_fail) {
			info("%s: %pJ cancelled by node failure",
			     __func__, job_ptr);
		} else {
			info("%s: %pJ cancelled by interactive user",
			     __func__, job_ptr);
		}
	}

	if (IS_JOB_SUSPENDED(job_ptr)) {
		uint32_t suspend_job_state = job_ptr->job_state;
		/*
		 * we can't have it as suspended when we call the
		 * accounting stuff.
		 */
		job_ptr->job_state = JOB_CANCELLED;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = suspend_job_state;
		job_comp_flag = JOB_COMPLETING;
		suspended = true;
	}

	if (job_comp_flag && (job_ptr->node_cnt == 0)) {
		/*
		 * Job has no resources left (used to expand another job).
		 * Avoid duplicate run of epilog and underflow in CPU count.
		 */
		job_comp_flag = 0;
	}

	if (requeue && job_ptr->details && job_ptr->batch_flag) {
		/*
		 * We want this job to look like it was terminated in the
		 * accounting logs. Set a new submit time so the restarted
		 * job looks like a new job.
		 */
		job_ptr->end_time = now;
		job_ptr->job_state  = JOB_NODE_FAIL;
		job_completion_logger(job_ptr, true);
		/*
		 * Do this after the epilog complete.
		 * Setting it here is too early.
		 */
		//job_ptr->db_index = 0;
		//job_ptr->details->submit_time = now + 1;
		if (job_ptr->node_bitmap) {
			i = bit_ffs(job_ptr->node_bitmap);
			if (i >= 0) {
				node_ptr = node_record_table_ptr + i;
				if (IS_NODE_CLOUD(node_ptr))
					use_cloud = true;
			}
		}
		/* Cloud-node jobs are exempt from the single-retry limit */
		if (!use_cloud)
			job_ptr->batch_flag++;	/* only one retry */
		job_ptr->restart_cnt++;

		/* clear signal sent flag on requeue */
		job_ptr->warn_flags &= ~WARN_SENT;

		job_ptr->job_state = JOB_PENDING | job_comp_flag;
		/*
		 * Since the job completion logger removes the job submit
		 * information, we need to add it again.
		 */
		acct_policy_add_job_submit(job_ptr);
		if (node_fail) {
			info("%s: requeue %pJ due to node failure",
			     __func__, job_ptr);
		} else {
			info("%s: requeue %pJ per user/system request",
			     __func__, job_ptr);
		}
		/*
		 * We have reached the maximum number of requeue
		 * attempts hold the job with HoldMaxRequeue reason.
		 */
		if (job_ptr->batch_flag > MAX_BATCH_REQUEUE) {
			job_ptr->job_state |= JOB_REQUEUE_HOLD;
			job_ptr->state_reason = WAIT_MAX_REQUEUE;
			job_ptr->batch_flag = 1;
			debug("%s: Holding %pJ, repeated requeue failures",
			      __func__, job_ptr);
			job_ptr->priority = 0;
		}
	} else if (IS_JOB_PENDING(job_ptr) && job_ptr->details &&
		   job_ptr->batch_flag) {
		/*
		 * Possible failure mode with DOWN node and job requeue.
		 * The DOWN node might actually respond to the cancel and
		 * take us here.  Don't run job_completion_logger here since
		 * this is here to catch duplicate cancels from slowly
		 * responding slurmds
		 */
		return SLURM_SUCCESS;
	} else {
		/* Partition setting overrides the cluster-wide default */
		if (job_ptr->part_ptr &&
		    (job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
			over_time_limit = job_ptr->part_ptr->over_time_limit;
		} else {
			over_time_limit = slurmctld_conf.over_time_limit;
		}

		/* Pick the terminal state based on how the job ended */
		if (node_fail) {
			job_ptr->job_state = JOB_NODE_FAIL | job_comp_flag;
			job_ptr->requid = uid;
		} else if (job_return_code == NO_VAL) {
			job_ptr->job_state = JOB_CANCELLED | job_comp_flag;
			job_ptr->requid = uid;
		} else if ((job_return_code & 0xff) == SIG_OOM) {
			job_ptr->job_state = JOB_OOM | job_comp_flag;
			job_ptr->exit_code = job_return_code;
			job_ptr->state_reason = FAIL_OOM;
			xfree(job_ptr->state_desc);
		} else if (WIFEXITED(job_return_code) &&
			   WEXITSTATUS(job_return_code)) {
			job_ptr->job_state = JOB_FAILED   | job_comp_flag;
			job_ptr->exit_code = job_return_code;
			job_ptr->state_reason = FAIL_EXIT_CODE;
			xfree(job_ptr->state_desc);
		} else if (WIFSIGNALED(job_return_code)) {
			job_ptr->job_state = JOB_FAILED | job_comp_flag;
			job_ptr->exit_code = job_return_code;
			job_ptr->state_reason = FAIL_LAUNCH;
		} else if (job_comp_flag
			   && ((job_ptr->end_time
				+ over_time_limit * 60) < now)) {
			/*
			 * Test if the job has finished before its allowed
			 * over time has expired.
			 */
			job_ptr->job_state = JOB_TIMEOUT  | job_comp_flag;
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
		} else {
			job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
			job_ptr->exit_code = job_return_code;
			if (nonstop_ops.job_fini)
				(nonstop_ops.job_fini)(job_ptr);
		}

		if (suspended) {
			/* end_time reflects the moment the job stopped
			 * running, i.e. when it was suspended */
			job_ptr->end_time = job_ptr->suspend_time;
			job_ptr->tot_sus_time +=
				difftime(now, job_ptr->suspend_time);
		} else
			job_ptr->end_time = now;
		job_completion_logger(job_ptr, false);
	}

	last_job_update = now;
	job_ptr->time_last_active = now;   /* Timer for resending kill RPC */
	if (job_comp_flag) {	/* job was running */
		build_cg_bitmap(job_ptr);
		deallocate_nodes(job_ptr, false, suspended, false);
	}

	/* Check for and cleanup stuck scripts */
	if (job_ptr->details && job_ptr->details->prolog_running)
		track_script_flush_job(job_ptr->job_id);

	info("%s: %pJ done", __func__, job_ptr);
	return SLURM_SUCCESS;
}
6199
6200
6201 /*
6202 * job_complete - note the normal termination the specified job
6203 * IN job_id - id of the job which completed
6204 * IN uid - user id of user issuing the RPC
6205 * IN requeue - job should be run again if possible
6206 * IN node_fail - true if job terminated due to node failure
6207 * IN job_return_code - job's return code, if set then set state to FAILED
6208 * RET - 0 on success, otherwise ESLURM error code
6209 * global: job_list - pointer global job list
6210 * last_job_update - time of last job table update
6211 */
job_complete(uint32_t job_id,uid_t uid,bool requeue,bool node_fail,uint32_t job_return_code)6212 extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
6213 bool node_fail, uint32_t job_return_code)
6214 {
6215 job_record_t *job_ptr, *het_job_ptr;
6216 ListIterator iter;
6217 int rc, rc1;
6218
6219 xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
6220 xassert(verify_lock(FED_LOCK, READ_LOCK));
6221
6222 job_ptr = find_job_record(job_id);
6223 if (job_ptr == NULL) {
6224 info("%s: invalid JobId=%u", __func__, job_id);
6225 return ESLURM_INVALID_JOB_ID;
6226 }
6227
6228 if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) {
6229 error("%s: Security violation, JOB_COMPLETE RPC for %pJ from uid %u",
6230 __func__, job_ptr, uid);
6231 return ESLURM_USER_ID_MISSING;
6232 }
6233
6234 if (job_ptr->het_job_list) {
6235 rc = SLURM_SUCCESS;
6236 iter = list_iterator_create(job_ptr->het_job_list);
6237 while ((het_job_ptr = list_next(iter))) {
6238 if (job_ptr->het_job_id != het_job_ptr->het_job_id) {
6239 error("%s: Bad het_job_list for %pJ",
6240 __func__, job_ptr);
6241 continue;
6242 }
6243 rc1 = _job_complete(het_job_ptr, uid, requeue,
6244 node_fail, job_return_code);
6245 if (rc1 != SLURM_SUCCESS)
6246 rc = rc1;
6247 }
6248 list_iterator_destroy(iter);
6249 } else {
6250 rc = _job_complete(job_ptr, uid, requeue, node_fail,
6251 job_return_code);
6252 }
6253
6254 return rc;
6255 }
6256
/*
 * If the given partition cannot accept job submissions, walk its chain
 * of alternate partitions looking for one that can.
 * IN part_ptr - partition originally requested
 * OUT part_ptr_new - usable alternate partition, or NULL if the original
 *	partition is usable itself
 * RET SLURM_SUCCESS, ESLURM_INVALID_PARTITION_NAME or
 *	ESLURM_PARTITION_NOT_AVAIL
 *
 * Fix: the original walk only detected a cycle that returned to the
 * requested partition; a cycle among alternates (e.g. B->C->B) looped
 * forever. The walk is now bounded.
 */
static int _alt_part_test(part_record_t *part_ptr, part_record_t **part_ptr_new)
{
	/* Upper bound on alternate-chain length; any sane configuration
	 * is far shorter, so hitting it implies a cycle */
	enum { ALT_PART_MAX_DEPTH = 64 };
	part_record_t *alt_part_ptr = NULL;
	char *alt_name;
	int depth = 0;

	*part_ptr_new = NULL;
	if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) {
		info("_alt_part_test: original partition is not available "
		     "(drain or inactive): %s", part_ptr->name);
		alt_name = part_ptr->alternate;
		while (alt_name) {
			if (++depth > ALT_PART_MAX_DEPTH) {
				/* Alternate chain loops without reaching
				 * the original partition */
				info("_alt_part_test: no valid alternate "
				     "partition is available");
				return ESLURM_PARTITION_NOT_AVAIL;
			}
			alt_part_ptr = find_part_record(alt_name);
			if (alt_part_ptr == NULL) {
				info("_alt_part_test: invalid alternate "
				     "partition name specified: %s", alt_name);
				return ESLURM_INVALID_PARTITION_NAME;
			}
			if (alt_part_ptr == part_ptr) {
				/* Chain returned to the (unavailable)
				 * original partition */
				info("_alt_part_test: no valid alternate "
				     "partition is available");
				return ESLURM_PARTITION_NOT_AVAIL;
			}
			if (alt_part_ptr->state_up & PARTITION_SUBMIT)
				break;	/* found a usable alternate */
			/* Try next alternate in the sequence */
			alt_name = alt_part_ptr->alternate;
		}
		if (alt_name == NULL) {
			info("_alt_part_test: no valid alternate partition is "
			     "available");
			return ESLURM_PARTITION_NOT_AVAIL;
		}
		*part_ptr_new = alt_part_ptr;
	}
	return SLURM_SUCCESS;
}
6293
6294 /*
6295 * Test if this job can use this partition
6296 *
6297 * NOTE: This function is also called with a dummy job_desc_msg_t from
6298 * job_limits_check() if there is any new check added here you may also have to
6299 * add that parameter to the job_desc_msg_t in that function.
6300 */
_part_access_check(part_record_t * part_ptr,job_desc_msg_t * job_desc,bitstr_t * req_bitmap,uid_t submit_uid,slurmdb_qos_rec_t * qos_ptr,char * acct)6301 static int _part_access_check(part_record_t *part_ptr, job_desc_msg_t *job_desc,
6302 bitstr_t *req_bitmap, uid_t submit_uid,
6303 slurmdb_qos_rec_t *qos_ptr, char *acct)
6304 {
6305 uint32_t total_nodes, min_nodes_tmp, max_nodes_tmp;
6306 uint32_t job_min_nodes, job_max_nodes;
6307 int rc = SLURM_SUCCESS;
6308
6309 if ((part_ptr->flags & PART_FLAG_REQ_RESV) &&
6310 (!job_desc->reservation || job_desc->reservation[0] == '\0')) {
6311 debug2("%s: uid %u access to partition %s "
6312 "denied, requires reservation", __func__,
6313 (unsigned int) submit_uid, part_ptr->name);
6314 return ESLURM_ACCESS_DENIED;
6315 }
6316
6317 if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) &&
6318 (submit_uid != slurmctld_conf.slurm_user_id)) {
6319 debug2("%s: uid %u access to partition %s "
6320 "denied, not root", __func__,
6321 (unsigned int) submit_uid, part_ptr->name);
6322 return ESLURM_ACCESS_DENIED;
6323 }
6324
6325 if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) {
6326 error("%s: Security violation, SUBMIT_JOB for "
6327 "user root disabled", __func__);
6328 return ESLURM_USER_ID_MISSING;
6329 }
6330
6331 if (validate_group(part_ptr, job_desc->user_id) == 0) {
6332 debug2("%s: uid %u access to partition %s "
6333 "denied, bad group", __func__,
6334 (unsigned int) job_desc->user_id, part_ptr->name);
6335 return ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP;
6336 }
6337
6338 if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) {
6339 debug2("%s: uid %u access to partition %s "
6340 "denied, bad allocating node: %s", __func__,
6341 (unsigned int) job_desc->user_id, part_ptr->name,
6342 job_desc->alloc_node);
6343 return ESLURM_ACCESS_DENIED;
6344 }
6345
6346 if ((part_ptr->state_up & PARTITION_SCHED) &&
6347 (job_desc->min_cpus != NO_VAL)) {
6348 if (job_desc->min_cpus > part_ptr->total_cpus) {
6349 debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
6350 __func__, job_desc->min_cpus, part_ptr->name,
6351 part_ptr->total_cpus);
6352 return ESLURM_TOO_MANY_REQUESTED_CPUS;
6353 } else if (job_desc->min_cpus >
6354 (part_ptr->max_cpus_per_node *
6355 part_ptr->total_nodes)) {
6356 debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
6357 __func__, job_desc->min_cpus, part_ptr->name,
6358 (part_ptr->max_cpus_per_node *
6359 part_ptr->total_nodes));
6360 return ESLURM_TOO_MANY_REQUESTED_CPUS;
6361 }
6362 }
6363
6364 /* Check against total nodes on the partition */
6365 total_nodes = part_ptr->total_nodes;
6366 if ((part_ptr->state_up & PARTITION_SCHED) &&
6367 (job_desc->min_nodes != NO_VAL) &&
6368 (job_desc->min_nodes > total_nodes)) {
6369 debug2("%s: Job requested too many nodes (%u) "
6370 "of partition %s(%u)", __func__,
6371 job_desc->min_nodes, part_ptr->name, total_nodes);
6372 return ESLURM_INVALID_NODE_COUNT;
6373 }
6374
6375 if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) {
6376 debug2("%s: requested nodes %s not in partition %s", __func__,
6377 job_desc->req_nodes, part_ptr->name);
6378 return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
6379 }
6380
6381 /* The node counts have not been altered yet, so do not figure them out
6382 * by using the cpu counts. The partitions have already been altered
6383 * so we have to use the original values.
6384 */
6385 job_min_nodes = job_desc->min_nodes;
6386 job_max_nodes = job_desc->max_nodes;
6387 min_nodes_tmp = part_ptr->min_nodes;
6388 max_nodes_tmp = part_ptr->max_nodes;
6389
6390 /* Check against min/max node limits in the partition */
6391
6392 if ((part_ptr->state_up & PARTITION_SCHED) &&
6393 (job_min_nodes != NO_VAL) &&
6394 (job_min_nodes < min_nodes_tmp) &&
6395 (!qos_ptr || (qos_ptr && !(qos_ptr->flags
6396 & QOS_FLAG_PART_MIN_NODE)))) {
6397 debug2("%s: Job requested for nodes (%u) "
6398 "smaller than partition %s(%u) min nodes", __func__,
6399 job_min_nodes, part_ptr->name, min_nodes_tmp);
6400 return ESLURM_INVALID_NODE_COUNT;
6401 }
6402
6403 if ((part_ptr->state_up & PARTITION_SCHED) &&
6404 (job_max_nodes != NO_VAL) &&
6405 (job_max_nodes > max_nodes_tmp) &&
6406 (!qos_ptr || (qos_ptr && !(qos_ptr->flags
6407 & QOS_FLAG_PART_MAX_NODE)))) {
6408 debug2("%s: Job requested for nodes (%u) greater than partition"
6409 " %s(%u) max nodes", __func__, job_max_nodes,
6410 part_ptr->name, max_nodes_tmp);
6411 return ESLURM_INVALID_NODE_COUNT;
6412 }
6413
6414 if ((part_ptr->state_up & PARTITION_SCHED) &&
6415 (job_desc->time_limit != NO_VAL) &&
6416 (job_desc->time_limit > part_ptr->max_time) &&
6417 (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT))) {
6418 debug2("%s: Job time limit (%u) exceeds limit of partition "
6419 "%s(%u)", __func__, job_desc->time_limit, part_ptr->name,
6420 part_ptr->max_time);
6421 return ESLURM_INVALID_TIME_LIMIT;
6422 }
6423
6424 if (slurmctld_conf.enforce_part_limits) {
6425 if ((rc = part_policy_valid_acct(part_ptr, acct, NULL))
6426 != SLURM_SUCCESS)
6427 goto fini;
6428
6429 if ((rc = part_policy_valid_qos(part_ptr, qos_ptr, NULL))
6430 != SLURM_SUCCESS)
6431 goto fini;
6432 }
6433
6434 fini:
6435 return rc;
6436 }
6437
/*
 * _get_job_parts - identify the partition(s) a job may use
 * IN/OUT job_desc - job request; its partition string may be set or rebuilt
 *	here (default partition, reservation's partition, pruned list)
 * OUT part_pptr - set to the primary (first usable) partition
 * OUT part_pptr_list - if non-NULL and the job named multiple partitions,
 *	set to the List of part_record_t (ownership transfers to caller),
 *	otherwise left NULL
 * OUT err_msg - error message for the user, previous value xfree'd; may be
 *	NULL if the caller does not want a message
 * RET SLURM_SUCCESS or an ESLURM error code
 */
static int _get_job_parts(job_desc_msg_t *job_desc, part_record_t **part_pptr,
			  List *part_pptr_list, char **err_msg)
{
	part_record_t *part_ptr = NULL, *part_ptr_new = NULL;
	List part_ptr_list = NULL;
	int rc = SLURM_SUCCESS;

	/* Identify partition(s) and set pointer(s) to their struct */
	if (job_desc->partition) {
		char *err_part = NULL;
		part_ptr = find_part_record(job_desc->partition);
		if (part_ptr == NULL) {
			/* May be a comma-separated list of partition names */
			part_ptr_list = get_part_list(job_desc->partition,
						      &err_part);
			if (part_ptr_list) {
				part_ptr = list_peek(part_ptr_list);
				if (list_count(part_ptr_list) == 1)
					FREE_NULL_LIST(part_ptr_list);
			}
		}
		if (part_ptr == NULL) {
			info("%s: invalid partition specified: %s",
			     __func__, job_desc->partition);
			if (err_msg) {
				xfree(*err_msg);
				xstrfmtcat(*err_msg,
					   "invalid partition specified: %s",
					   err_part);
			}
			/*
			 * Always release err_part; it previously leaked when
			 * the caller passed err_msg == NULL.
			 */
			xfree(err_part);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		xfree(err_part);	/* defensive; xfree(NULL) is a no-op */
	} else if (job_desc->reservation && job_desc->reservation[0] != '\0' ) {
		/* Use the reservation's partition when none was requested */
		slurmctld_resv_t *resv_ptr = NULL;
		resv_ptr = find_resv_name(job_desc->reservation);
		if (resv_ptr)
			part_ptr = resv_ptr->part_ptr;
		if (part_ptr)
			job_desc->partition = xstrdup(part_ptr->name);
	}

	/* Fall back to the cluster's default partition */
	if (!part_ptr) {
		if (default_part_loc == NULL) {
			error("%s: default partition not set", __func__);
			return ESLURM_DEFAULT_PARTITION_NOT_SET;
		}
		part_ptr = default_part_loc;
		job_desc->partition = xstrdup(part_ptr->name);
	}

	/* Change partition pointer(s) to alternates as needed */
	if (part_ptr_list) {
		int fail_rc = SLURM_SUCCESS;
		part_record_t *part_ptr_tmp;
		bool rebuild_name_list = false;
		ListIterator iter = list_iterator_create(part_ptr_list);

		while ((part_ptr_tmp = list_next(iter))) {
			rc = _alt_part_test(part_ptr_tmp, &part_ptr_new);
			if (rc != SLURM_SUCCESS) {
				/* Drop unusable partition from the list */
				fail_rc = rc;
				list_remove(iter);
				rebuild_name_list = true;
				continue;
			}
			if (part_ptr_new) {
				/* Substitute the alternate partition */
				list_insert(iter, part_ptr_new);
				list_remove(iter);
				rebuild_name_list = true;
			}
		}
		list_iterator_destroy(iter);
		if (list_is_empty(part_ptr_list)) {
			if (fail_rc != SLURM_SUCCESS)
				rc = fail_rc;
			else
				rc = ESLURM_PARTITION_NOT_AVAIL;
			goto fini;
		}
		rc = SLURM_SUCCESS;	/* At least some partition usable */
		if (rebuild_name_list) {
			/* Regenerate the comma-separated partition string
			 * from the (possibly pruned/substituted) list */
			part_ptr = NULL;
			xfree(job_desc->partition);
			iter = list_iterator_create(part_ptr_list);
			while ((part_ptr_tmp = list_next(iter))) {
				if (job_desc->partition)
					xstrcat(job_desc->partition, ",");
				else
					part_ptr = part_ptr_tmp;
				xstrcat(job_desc->partition,
					part_ptr_tmp->name);
			}
			list_iterator_destroy(iter);
			if (!part_ptr) {
				rc = ESLURM_PARTITION_NOT_AVAIL;
				goto fini;
			}
		}
	} else {
		rc = _alt_part_test(part_ptr, &part_ptr_new);
		if (rc != SLURM_SUCCESS)
			goto fini;
		if (part_ptr_new) {
			part_ptr = part_ptr_new;
			xfree(job_desc->partition);
			job_desc->partition = xstrdup(part_ptr->name);
		}
	}

	*part_pptr = part_ptr;
	if (part_pptr_list) {
		*part_pptr_list = part_ptr_list;
		part_ptr_list = NULL;
	} else
		FREE_NULL_LIST(part_ptr_list);

fini:
	return rc;
}
6557
/*
 * _valid_job_part - validate that the job can run in its partition(s) and
 *	reconcile the job's node counts and time limits against partition
 *	limits (using the most permissive limits when multiple partitions
 *	are requested).
 * IN/OUT job_desc - job request; min_nodes/time_limit may be adjusted here
 * IN submit_uid - uid of the user submitting the request
 * IN req_bitmap - bitmap of the job's explicitly required nodes, or NULL
 * IN part_ptr - primary partition
 * IN part_ptr_list - list of partitions if more than one requested, or NULL
 * IN assoc_ptr - job's association, or NULL
 * IN qos_ptr - job's QOS (flags can override partition limits), or NULL
 * RET SLURM_SUCCESS or an ESLURM error code
 */
static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
			   bitstr_t *req_bitmap, part_record_t *part_ptr,
			   List part_ptr_list,
			   slurmdb_assoc_rec_t *assoc_ptr,
			   slurmdb_qos_rec_t *qos_ptr)
{
	int rc = SLURM_SUCCESS;
	part_record_t *part_ptr_tmp;
	slurmdb_assoc_rec_t assoc_rec;
	/* Track the most permissive limits across all listed partitions */
	uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1;
	uint32_t max_time = 0;
	bool any_check = false;

	/* Check access to each requested partition */
	if (part_ptr_list) {
		int fail_rc = SLURM_SUCCESS;
		ListIterator iter = list_iterator_create(part_ptr_list);

		while ((part_ptr_tmp = list_next(iter))) {
			/*
			 * FIXME: When dealing with multiple partitions we
			 * currently can't deal with partition based
			 * associations.
			 */
			memset(&assoc_rec, 0, sizeof(assoc_rec));
			if (assoc_ptr) {
				assoc_rec.acct      = assoc_ptr->acct;
				assoc_rec.partition = part_ptr_tmp->name;
				assoc_rec.uid       = job_desc->user_id;
				(void) assoc_mgr_fill_in_assoc(
					acct_db_conn, &assoc_rec,
					accounting_enforce, NULL, false);
			}

			/* A different assoc id here means a partition-based
			 * association exists, which is unsupported with
			 * multiple partitions (see FIXME above) */
			if (assoc_ptr && assoc_rec.id != assoc_ptr->id) {
				info("%s: can't check multiple "
				     "partitions with partition based "
				     "associations", __func__);
				rc = SLURM_ERROR;
			} else {
				rc = _part_access_check(part_ptr_tmp, job_desc,
							req_bitmap, submit_uid,
							qos_ptr, assoc_ptr ?
							assoc_ptr->acct : NULL);
			}
			/* Credential failures, or any failure when
			 * EnforcePartLimits=ALL, are fatal immediately */
			if ((rc != SLURM_SUCCESS) &&
			    ((rc == ESLURM_ACCESS_DENIED) ||
			     (rc == ESLURM_USER_ID_MISSING) ||
			     (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) ||
			     (slurmctld_conf.enforce_part_limits ==
			      PARTITION_ENFORCE_ALL))) {
				fail_rc = rc;
				break;
			} else if (rc != SLURM_SUCCESS) {
				fail_rc = rc;
			} else {
				any_check = true;
			}

			/* Set to success since we found a usable partition */
			if (any_check && slurmctld_conf.enforce_part_limits ==
			    PARTITION_ENFORCE_ANY)
				fail_rc = SLURM_SUCCESS;

			min_nodes_orig = MIN(min_nodes_orig,
					     part_ptr_tmp->min_nodes_orig);
			max_nodes_orig = MAX(max_nodes_orig,
					     part_ptr_tmp->max_nodes_orig);
			max_time = MAX(max_time, part_ptr_tmp->max_time);
		}
		list_iterator_destroy(iter);

		if (list_is_empty(part_ptr_list) ||
		    (slurmctld_conf.enforce_part_limits &&
		     (fail_rc != SLURM_SUCCESS))) {
			if (slurmctld_conf.enforce_part_limits ==
			    PARTITION_ENFORCE_ALL)
				rc = fail_rc;
			else if (slurmctld_conf.enforce_part_limits ==
				 PARTITION_ENFORCE_ANY && !any_check)
				rc = fail_rc;
			else {
				rc = ESLURM_PARTITION_NOT_AVAIL;
			}
			goto fini;
		}
		rc = SLURM_SUCCESS;	/* At least some partition usable */
	} else {
		/* Single partition: use its limits directly */
		min_nodes_orig = part_ptr->min_nodes_orig;
		max_nodes_orig = part_ptr->max_nodes_orig;
		max_time = part_ptr->max_time;
		rc = _part_access_check(part_ptr, job_desc, req_bitmap,
					submit_uid, qos_ptr,
					assoc_ptr ? assoc_ptr->acct : NULL);
		if ((rc != SLURM_SUCCESS) &&
		    ((rc == ESLURM_ACCESS_DENIED) ||
		     (rc == ESLURM_USER_ID_MISSING) ||
		     (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) ||
		     slurmctld_conf.enforce_part_limits))
			goto fini;
		/* Enforce Part Limit = no */
		rc = SLURM_SUCCESS;
	}

	/* Validate job limits against partition limits */

	/* Check Partition with the highest limits when there are muliple */
	if (job_desc->min_nodes == NO_VAL) {
		/* Avoid setting the job request to 0 nodes unless requested */
		if (!min_nodes_orig)
			job_desc->min_nodes = 1;
		else
			job_desc->min_nodes = min_nodes_orig;
	} else if ((job_desc->min_nodes > max_nodes_orig) &&
		   slurmctld_conf.enforce_part_limits &&
		   (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
					      QOS_FLAG_PART_MAX_NODE)))) {
		info("%s: job's min nodes greater than "
		     "partition's max nodes (%u > %u)",
		     __func__, job_desc->min_nodes, max_nodes_orig);
		rc = ESLURM_INVALID_NODE_COUNT;
		goto fini;
	} else if ((job_desc->min_nodes < min_nodes_orig) &&
		   ((job_desc->max_nodes == NO_VAL) ||
		    (job_desc->max_nodes >= min_nodes_orig))) {
		/* Raise the job's floor to the partition minimum when the
		 * job's max still permits it */
		job_desc->min_nodes = min_nodes_orig;
	}

	if ((job_desc->max_nodes != NO_VAL) &&
	    slurmctld_conf.enforce_part_limits &&
	    (job_desc->max_nodes < min_nodes_orig) &&
	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags
				       & QOS_FLAG_PART_MIN_NODE)))) {
		info("%s: job's max nodes less than partition's "
		     "min nodes (%u < %u)",
		     __func__, job_desc->max_nodes, min_nodes_orig);
		rc = ESLURM_INVALID_NODE_COUNT;
		goto fini;
	}
#ifndef HAVE_FRONT_END
	/* Zero node count OK for persistent burst buffer create or destroy */
	if ((job_desc->min_nodes == 0) &&
	    (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
	     (!job_desc->burst_buffer && !job_desc->script))) {
		info("%s: min_nodes is zero", __func__);
		rc = ESLURM_INVALID_NODE_COUNT;
		goto fini;
	}
#endif

	/* NOTE: default_time checks use the primary partition only */
	if ((job_desc->time_limit == NO_VAL) &&
	    (part_ptr->default_time == 0)) {
		info("%s: job's default time is 0", __func__);
		rc = ESLURM_INVALID_TIME_LIMIT;
		goto fini;
	}

	if ((job_desc->time_limit == NO_VAL) &&
	    (part_ptr->default_time != NO_VAL))
		job_desc->time_limit = part_ptr->default_time;

	/* Time checks below may be overridden by QOS_FLAG_PART_TIME_LIMIT */
	if ((job_desc->time_min != NO_VAL) &&
	    (job_desc->time_min >  max_time) &&
	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
				       QOS_FLAG_PART_TIME_LIMIT)))) {
		info("%s: job's min time greater than "
		     "partition's (%u > %u)",
		     __func__, job_desc->time_min, max_time);
		rc = ESLURM_INVALID_TIME_MIN_LIMIT;
		goto fini;
	}
	if ((job_desc->time_limit != NO_VAL) &&
	    (job_desc->time_limit >  max_time) &&
	    (job_desc->time_min   == NO_VAL) &&
	    slurmctld_conf.enforce_part_limits &&
	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
				       QOS_FLAG_PART_TIME_LIMIT)))) {
		info("%s: job's time limit greater than "
		     "partition's (%u > %u)",
		     __func__, job_desc->time_limit, max_time);
		rc = ESLURM_INVALID_TIME_LIMIT;
		goto fini;
	}
	if ((job_desc->time_min != NO_VAL) &&
	    (job_desc->time_min >  job_desc->time_limit) &&
	    (!qos_ptr || (qos_ptr && !(qos_ptr->flags &
				       QOS_FLAG_PART_TIME_LIMIT)))) {
		info("%s: job's min_time greater time limit "
		     "(%u > %u)",
		     __func__, job_desc->time_min, job_desc->time_limit);
		rc = ESLURM_INVALID_TIME_MIN_LIMIT;
		goto fini;
	}
	/* Deadline must allow at least time_min (or time_limit) from now */
	if ((job_desc->deadline) && (job_desc->deadline != NO_VAL)) {
		char time_str_now[32];
		char time_str_deadline[32];
		time_t now = time(NULL);
		slurm_make_time_str(&job_desc->deadline, time_str_deadline,
				    sizeof(time_str_deadline));
		slurm_make_time_str(&now, time_str_now, sizeof(time_str_now));
		if (job_desc->deadline < now) {
			info("%s: job's deadline smaller than now (%s < %s)",
			     __func__, time_str_deadline, time_str_now);
			rc = ESLURM_INVALID_TIME_LIMIT;
			goto fini;
		}
		if ((job_desc->time_min) && (job_desc->time_min != NO_VAL) &&
		    (job_desc->deadline < (now + job_desc->time_min * 60))) {
			info("%s: job's min_time greater than deadline (%u > %s)",
			     __func__, job_desc->time_min, time_str_deadline);
			rc = ESLURM_INVALID_TIME_MIN_LIMIT;
			goto fini;
		}
		if ((job_desc->time_min == 0) && (job_desc->time_limit) &&
		    (job_desc->time_limit != NO_VAL) &&
		    (job_desc->deadline < (now + job_desc->time_limit * 60))) {
			info("%s: job's time_limit greater than deadline (%u > %s)",
			     __func__, job_desc->time_limit, time_str_deadline);
			rc = ESLURM_INVALID_TIME_LIMIT;
			goto fini;
		}
	}

fini:
	return rc;
}
6784
6785 /*
6786 * job_limits_check - check the limits specified for the job.
6787 * IN job_ptr - pointer to job table entry.
6788 * IN check_min_time - if true test job's minimum time limit,
6789 * otherwise test maximum time limit
6790 * RET WAIT_NO_REASON on success, fail status otherwise.
6791 */
job_limits_check(job_record_t ** job_pptr,bool check_min_time)6792 extern int job_limits_check(job_record_t **job_pptr, bool check_min_time)
6793 {
6794 struct job_details *detail_ptr;
6795 enum job_state_reason fail_reason;
6796 part_record_t *part_ptr = NULL;
6797 job_record_t *job_ptr = NULL;
6798 slurmdb_qos_rec_t *qos_ptr;
6799 slurmdb_assoc_rec_t *assoc_ptr;
6800 job_desc_msg_t job_desc;
6801 int rc;
6802
6803 job_ptr = *job_pptr;
6804 detail_ptr = job_ptr->details;
6805 part_ptr = job_ptr->part_ptr;
6806 qos_ptr = job_ptr->qos_ptr;
6807 assoc_ptr = job_ptr->assoc_ptr;
6808 if (!detail_ptr || !part_ptr) {
6809 fatal_abort("%pJ has NULL details_ptr and/or part_ptr",
6810 job_ptr);
6811 return WAIT_NO_REASON; /* To prevent CLANG error */
6812 }
6813
6814 fail_reason = WAIT_NO_REASON;
6815
6816 /*
6817 * Here we need to pretend we are just submitting the job so we can
6818 * utilize the already existing function _part_access_check. If any
6819 * additional fields in that function are ever checked, the fields set
6820 * below will need to be modified.
6821 */
6822 slurm_init_job_desc_msg(&job_desc);
6823 job_desc.reservation = job_ptr->resv_name;
6824 job_desc.user_id = job_ptr->user_id;
6825 job_desc.alloc_node = job_ptr->alloc_node;
6826 job_desc.min_cpus = detail_ptr->orig_min_cpus;
6827 job_desc.min_nodes = detail_ptr->min_nodes;
6828 /* _part_access_check looks for NO_VAL instead of 0 */
6829 job_desc.max_nodes = detail_ptr->max_nodes ?
6830 detail_ptr->max_nodes : NO_VAL;;
6831 if (check_min_time && job_ptr->time_min)
6832 job_desc.time_limit = job_ptr->time_min;
6833 else
6834 job_desc.time_limit = job_ptr->time_limit;
6835
6836 if ((rc = _part_access_check(part_ptr, &job_desc, NULL,
6837 job_ptr->user_id, qos_ptr,
6838 job_ptr->account))) {
6839 debug2("%pJ can't run in partition %s: %s",
6840 job_ptr, part_ptr->name, slurm_strerror(rc));
6841 switch (rc) {
6842 case ESLURM_INVALID_TIME_LIMIT:
6843 case ESLURM_INVALID_TIME_MIN_LIMIT:
6844 if (job_ptr->limit_set.time != ADMIN_SET_LIMIT)
6845 fail_reason = WAIT_PART_TIME_LIMIT;
6846 break;
6847 case ESLURM_INVALID_NODE_COUNT:
6848 fail_reason = WAIT_PART_NODE_LIMIT;
6849 break;
6850 /* FIXME */
6851 /* case ESLURM_TOO_MANY_REQUESTED_CPUS: */
6852 /* failt_reason = NON_EXISTANT_WAIT_PART_CPU_LIMIT; */
6853 /* break; */
6854 default:
6855 fail_reason = WAIT_PART_CONFIG;
6856 break;
6857 }
6858 } else if (part_ptr->state_up == PARTITION_DOWN) {
6859 debug2("%pJ requested down partition %s",
6860 job_ptr, part_ptr->name);
6861 fail_reason = WAIT_PART_DOWN;
6862 } else if (part_ptr->state_up == PARTITION_INACTIVE) {
6863 debug2("%pJ requested inactive partition %s",
6864 job_ptr, part_ptr->name);
6865 fail_reason = WAIT_PART_INACTIVE;
6866 } else if (qos_ptr && assoc_ptr &&
6867 (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) &&
6868 (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) {
6869 if (!job_ptr->prio_factors) {
6870 job_ptr->prio_factors =
6871 xmalloc(sizeof(priority_factors_object_t));
6872 }
6873 if (!job_ptr->prio_factors->priority_fs) {
6874 if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL))
6875 priority_g_set_assoc_usage(assoc_ptr);
6876 job_ptr->prio_factors->priority_fs =
6877 priority_g_calc_fs_factor(
6878 assoc_ptr->usage->usage_efctv,
6879 (long double)assoc_ptr->usage->
6880 shares_norm);
6881 }
6882 if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){
6883 debug2("%pJ exceeds usage threshold", job_ptr);
6884 fail_reason = WAIT_QOS_THRES;
6885 }
6886 } else if (fail_reason == WAIT_NO_REASON) {
6887 /*
6888 * Here we need to pretend we are just submitting the job so we
6889 * can utilize the already existing function _valid_pn_min_mem.
6890 * If anything else is ever checked in that function this will
6891 * most likely have to be updated. Some of the needed members
6892 * were already initialized above to call _part_access_check, as
6893 * well as the memset for job_desc.
6894 */
6895 if (job_ptr->bit_flags & JOB_MEM_SET)
6896 job_desc.pn_min_memory = detail_ptr->orig_pn_min_memory;
6897 else if (part_ptr->def_mem_per_cpu)
6898 job_desc.pn_min_memory = part_ptr->def_mem_per_cpu;
6899 else
6900 job_desc.pn_min_memory = slurmctld_conf.def_mem_per_cpu;
6901 if (detail_ptr->orig_cpus_per_task == NO_VAL16)
6902 job_desc.cpus_per_task = 1;
6903 else
6904 job_desc.cpus_per_task = detail_ptr->orig_cpus_per_task;
6905 if (detail_ptr->num_tasks)
6906 job_desc.num_tasks = detail_ptr->num_tasks;
6907 else {
6908 job_desc.num_tasks = job_desc.min_nodes;
6909 if (detail_ptr->ntasks_per_node != NO_VAL16)
6910 job_desc.num_tasks *=
6911 detail_ptr->ntasks_per_node;
6912 }
6913 //job_desc.min_cpus = detail_ptr->min_cpus; /* init'ed above */
6914 job_desc.max_cpus = detail_ptr->orig_max_cpus;
6915 job_desc.shared = (uint16_t)detail_ptr->share_res;
6916 job_desc.ntasks_per_node = detail_ptr->ntasks_per_node;
6917 job_desc.pn_min_cpus = detail_ptr->orig_pn_min_cpus;
6918 job_desc.job_id = job_ptr->job_id;
6919 if (!_valid_pn_min_mem(&job_desc, part_ptr)) {
6920 /* debug2 message already logged inside the function. */
6921 fail_reason = WAIT_PN_MEM_LIMIT;
6922 } else {
6923 /* Copy back to job_record adjusted members */
6924 detail_ptr->pn_min_memory = job_desc.pn_min_memory;
6925 detail_ptr->cpus_per_task = job_desc.cpus_per_task;
6926 detail_ptr->min_cpus = job_desc.min_cpus;
6927 detail_ptr->max_cpus = job_desc.max_cpus;
6928 detail_ptr->pn_min_cpus = job_desc.pn_min_cpus;
6929 }
6930 }
6931
6932 return (fail_reason);
6933 }
6934
6935 /*
6936 * _job_create - create a job table record for the supplied specifications.
6937 * This performs only basic tests for request validity (access to
6938 * partition, nodes count in partition, and sufficient processors in
6939 * partition).
6940 * IN job_specs - job specifications
6941 * IN allocate - resource allocation request if set rather than job submit
6942 * IN will_run - job is not to be created, test of validity only
6943 * OUT job_pptr - pointer to the job (NULL on error)
6944 * OUT err_msg - Error message for user
6945 * RET 0 on success, otherwise ESLURM error code. If the job would only be
6946 * able to execute with some change in partition configuration then
6947 * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
6948 */
6949
_job_create(job_desc_msg_t * job_desc,int allocate,int will_run,job_record_t ** job_pptr,uid_t submit_uid,char ** err_msg,uint16_t protocol_version)6950 static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run,
6951 job_record_t **job_pptr, uid_t submit_uid,
6952 char **err_msg, uint16_t protocol_version)
6953 {
6954 int error_code = SLURM_SUCCESS, i, qos_error;
6955 part_record_t *part_ptr = NULL;
6956 List part_ptr_list = NULL;
6957 bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
6958 job_record_t *job_ptr = NULL;
6959 slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;
6960 List license_list = NULL, gres_list = NULL;
6961 bool valid;
6962 slurmdb_qos_rec_t qos_rec, *qos_ptr;
6963 uint32_t user_submit_priority, acct_reason = 0;
6964 acct_policy_limit_set_t acct_policy_limit_set;
6965
6966 memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
6967 acct_policy_limit_set.tres = xcalloc(slurmctld_tres_cnt,
6968 sizeof(uint16_t));
6969
6970 *job_pptr = NULL;
6971
6972 user_submit_priority = job_desc->priority;
6973
6974 /*
6975 * Reject X11 forwarding requests from 18.08 clients since the
6976 * implementation has changed, and support for setting up tunnels in
6977 * the older style was removed with no backwards compatibility.
6978 * Remove this two versions after 19.05 is released.
6979 */
6980 if (job_desc->x11 && (protocol_version < SLURM_19_05_PROTOCOL_VERSION)) {
6981 info("%s: cannot support X11 tunnelling from older salloc/srun",
6982 __func__);
6983 error_code = ESLURM_X11_NOT_AVAIL;
6984 goto cleanup_fail;
6985 }
6986
6987 /* ensure that selected nodes are in this partition */
6988 if (job_desc->req_nodes) {
6989 error_code = node_name2bitmap(job_desc->req_nodes, false,
6990 &req_bitmap);
6991 if (error_code) {
6992 error_code = ESLURM_INVALID_NODE_NAME;
6993 goto cleanup_fail;
6994 }
6995 if ((job_desc->contiguous != NO_VAL16) &&
6996 (job_desc->contiguous))
6997 bit_fill_gaps(req_bitmap);
6998 i = bit_set_count(req_bitmap);
6999 if (i > job_desc->min_nodes)
7000 job_desc->min_nodes = i;
7001 if (i > job_desc->min_cpus)
7002 job_desc->min_cpus = i;
7003 if (job_desc->max_nodes &&
7004 (job_desc->min_nodes > job_desc->max_nodes)) {
7005 #if 0
7006 info("%s: max node count less than required hostlist "
7007 "size for user %u", __func__, job_desc->user_id);
7008 job_desc->max_nodes = job_desc->min_nodes;
7009 #else
7010 error_code = ESLURM_INVALID_NODE_COUNT;
7011 goto cleanup_fail;
7012 #endif
7013 }
7014 }
7015
7016 /* Zero node count OK for persistent burst buffer create or destroy */
7017 if ((job_desc->max_nodes == 0) &&
7018 (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
7019 (!job_desc->burst_buffer && !job_desc->script))) {
7020 info("%s: max_nodes is zero", __func__);
7021 error_code = ESLURM_INVALID_NODE_COUNT;
7022 goto cleanup_fail;
7023 }
7024
7025 error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list,
7026 err_msg);
7027 if (error_code != SLURM_SUCCESS)
7028 goto cleanup_fail;
7029
7030 memset(&assoc_rec, 0, sizeof(assoc_rec));
7031 assoc_rec.acct = job_desc->account;
7032 assoc_rec.partition = part_ptr->name;
7033 assoc_rec.uid = job_desc->user_id;
7034 /*
7035 * Checks are done later to validate assoc_ptr, so we don't
7036 * need to lock outside of fill_in_assoc.
7037 */
7038 if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
7039 accounting_enforce, &assoc_ptr, false)) {
7040 info("%s: invalid account or partition for user %u, "
7041 "account '%s', and partition '%s'", __func__,
7042 job_desc->user_id, assoc_rec.acct, assoc_rec.partition);
7043 error_code = ESLURM_INVALID_ACCOUNT;
7044 goto cleanup_fail;
7045 } else if (association_based_accounting &&
7046 !assoc_ptr &&
7047 !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
7048 /*
7049 * If not enforcing associations we want to look for the
7050 * default account and use it to avoid getting trash in the
7051 * accounting records.
7052 */
7053 assoc_rec.acct = NULL;
7054 (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
7055 accounting_enforce, &assoc_ptr,
7056 false);
7057 if (assoc_ptr) {
7058 info("%s: account '%s' has no association for user %u "
7059 "using default account '%s'",
7060 __func__, job_desc->account, job_desc->user_id,
7061 assoc_rec.acct);
7062 xfree(job_desc->account);
7063 }
7064 }
7065
7066 if (job_desc->account == NULL)
7067 job_desc->account = xstrdup(assoc_rec.acct);
7068
7069 /* This must be done after we have the assoc_ptr set */
7070 memset(&qos_rec, 0, sizeof(qos_rec));
7071 qos_rec.name = job_desc->qos;
7072
7073 qos_ptr = _determine_and_validate_qos(
7074 job_desc->reservation, assoc_ptr, false, &qos_rec, &qos_error,
7075 false, LOG_LEVEL_ERROR);
7076
7077 if (qos_error != SLURM_SUCCESS) {
7078 error_code = qos_error;
7079 goto cleanup_fail;
7080 }
7081
7082 error_code = _valid_job_part(job_desc, submit_uid, req_bitmap,
7083 part_ptr, part_ptr_list,
7084 assoc_ptr, qos_ptr);
7085 if (error_code != SLURM_SUCCESS)
7086 goto cleanup_fail;
7087
7088 if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid,
7089 part_ptr, part_ptr_list))) {
7090 goto cleanup_fail;
7091 }
7092
7093 job_desc->tres_req_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));
7094 job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes;
7095 job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus;
7096 job_desc->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(NULL,
7097 job_desc->pn_min_memory,
7098 job_desc->tres_req_cnt[TRES_ARRAY_CPU],
7099 job_desc->min_nodes);
7100
7101 license_list = license_validate(job_desc->licenses,
7102 validate_cfgd_licenses, true,
7103 job_desc->tres_req_cnt, &valid);
7104 if (!valid) {
7105 info("Job's requested licenses are invalid: %s",
7106 job_desc->licenses);
7107 error_code = ESLURM_INVALID_LICENSES;
7108 goto cleanup_fail;
7109 }
7110
7111 if ((error_code = gres_plugin_job_state_validate(
7112 job_desc->cpus_per_tres,
7113 job_desc->tres_freq,
7114 job_desc->tres_per_job,
7115 job_desc->tres_per_node,
7116 job_desc->tres_per_socket,
7117 job_desc->tres_per_task,
7118 job_desc->mem_per_tres,
7119 &job_desc->num_tasks,
7120 &job_desc->min_nodes,
7121 &job_desc->max_nodes,
7122 &job_desc->ntasks_per_node,
7123 &job_desc->ntasks_per_socket,
7124 &job_desc->sockets_per_node,
7125 &job_desc->cpus_per_task,
7126 &gres_list)))
7127 goto cleanup_fail;
7128
7129 if (!valid_tres_cnt(job_desc->cpus_per_tres) ||
7130 !valid_tres_cnt(job_desc->mem_per_tres) ||
7131 tres_bind_verify_cmdline(job_desc->tres_bind) ||
7132 tres_freq_verify_cmdline(job_desc->tres_freq) ||
7133 !valid_tres_cnt(job_desc->mem_per_tres) ||
7134 !valid_tres_cnt(job_desc->tres_per_job) ||
7135 !valid_tres_cnt(job_desc->tres_per_node) ||
7136 !valid_tres_cnt(job_desc->tres_per_socket) ||
7137 !valid_tres_cnt(job_desc->tres_per_task)) {
7138 error_code = ESLURM_INVALID_TRES;
7139 goto cleanup_fail;
7140 }
7141
7142 gres_set_job_tres_cnt(gres_list,
7143 job_desc->min_nodes,
7144 job_desc->tres_req_cnt,
7145 false);
7146
7147 /*
7148 * Do this last,after other TRES' have been set as it uses the other
7149 * values to calculate the billing value.
7150 */
7151 job_desc->tres_req_cnt[TRES_ARRAY_BILLING] =
7152 assoc_mgr_tres_weighted(job_desc->tres_req_cnt,
7153 part_ptr->billing_weights,
7154 slurmctld_conf.priority_flags, false);
7155
7156 if ((error_code = bb_g_job_validate(job_desc, submit_uid))
7157 != SLURM_SUCCESS)
7158 goto cleanup_fail;
7159
7160 if (job_desc->deadline && (job_desc->time_limit == NO_VAL) &&
7161 (job_desc->time_min == NO_VAL))
7162 job_desc->time_min = 1;
7163 if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) &&
7164 (!acct_policy_validate(job_desc, part_ptr,
7165 assoc_ptr, qos_ptr, &acct_reason,
7166 &acct_policy_limit_set, 0))) {
7167 if (err_msg) {
7168 xfree(*err_msg);
7169 *err_msg = xstrdup(job_reason_string(acct_reason));
7170 }
7171 info("%s: exceeded association/QOS limit for user %u: %s",
7172 __func__, job_desc->user_id,
7173 err_msg ? *err_msg : job_reason_string(acct_reason));
7174 error_code = ESLURM_ACCOUNTING_POLICY;
7175 goto cleanup_fail;
7176 }
7177
7178 if (job_desc->exc_nodes) {
7179 error_code = node_name2bitmap(job_desc->exc_nodes, false,
7180 &exc_bitmap);
7181 if (error_code) {
7182 error_code = ESLURM_INVALID_NODE_NAME;
7183 goto cleanup_fail;
7184 }
7185 }
7186 if (exc_bitmap && req_bitmap) {
7187 bitstr_t *tmp_bitmap = NULL;
7188 bitoff_t first_set;
7189 tmp_bitmap = bit_copy(exc_bitmap);
7190 bit_and(tmp_bitmap, req_bitmap);
7191 first_set = bit_ffs(tmp_bitmap);
7192 FREE_NULL_BITMAP(tmp_bitmap);
7193 if (first_set != -1) {
7194 info("Job's required and excluded node lists overlap");
7195 error_code = ESLURM_INVALID_NODE_NAME;
7196 goto cleanup_fail;
7197 }
7198 }
7199
7200 if (job_desc->min_nodes == NO_VAL)
7201 job_desc->min_nodes = 1;
7202
7203 if (job_desc->max_nodes == NO_VAL)
7204 job_desc->max_nodes = 0;
7205
7206 if (job_desc->max_nodes &&
7207 (job_desc->max_nodes < job_desc->min_nodes)) {
7208 info("%s: Job's max_nodes(%u) < min_nodes(%u)",
7209 __func__, job_desc->max_nodes, job_desc->min_nodes);
7210 error_code = ESLURM_INVALID_NODE_COUNT;
7211 goto cleanup_fail;
7212 }
7213
7214 if ((error_code = _copy_job_desc_to_job_record(job_desc,
7215 job_pptr,
7216 &req_bitmap,
7217 &exc_bitmap))) {
7218 if (error_code == SLURM_ERROR)
7219 error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY;
7220 job_ptr = *job_pptr;
7221 goto cleanup_fail;
7222 }
7223
7224 job_ptr = *job_pptr;
7225 job_ptr->start_protocol_ver = protocol_version;
7226 job_ptr->part_ptr = part_ptr;
7227 job_ptr->part_ptr_list = part_ptr_list;
7228 job_ptr->bit_flags |= JOB_DEPENDENT;
7229 job_ptr->last_sched_eval = time(NULL);
7230
7231 part_ptr_list = NULL;
7232
7233 memcpy(&job_ptr->limit_set, &acct_policy_limit_set,
7234 sizeof(acct_policy_limit_set_t));
7235 acct_policy_limit_set.tres = NULL;
7236
7237 job_ptr->assoc_id = assoc_rec.id;
7238 job_ptr->assoc_ptr = (void *) assoc_ptr;
7239 job_ptr->qos_ptr = (void *) qos_ptr;
7240 job_ptr->qos_id = qos_rec.id;
7241
7242 if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label) != 0 ) {
7243 if (job_desc->mcs_label == NULL) {
7244 error("Failed to create job: No valid mcs_label found");
7245 } else {
7246 error("Failed to create job: Invalid mcs-label: %s",
7247 job_desc->mcs_label);
7248 }
7249 error_code = ESLURM_INVALID_MCS_LABEL;
7250 goto cleanup_fail;
7251 }
7252
7253 /*
7254 * Permission for altering priority was confirmed above. The job_submit
7255 * plugin may have set the priority directly or put the job on hold. If
7256 * the priority is not given, we will figure it out later after we see
7257 * if the job is eligible or not. So we want NO_VAL if not set.
7258 */
7259 job_ptr->priority = job_desc->priority;
7260 if (job_ptr->priority == 0) {
7261 if (user_submit_priority == 0)
7262 job_ptr->state_reason = WAIT_HELD_USER;
7263 else
7264 job_ptr->state_reason = WAIT_HELD;
7265 } else if (job_ptr->priority != NO_VAL) {
7266 job_ptr->direct_set_prio = 1;
7267 }
7268
7269 /*
7270 * The job submit plugin sets site_factor to NO_VAL so that it can
7271 * only be set the by the job submit plugin at submission.
7272 */
7273 if (job_desc->site_factor != NO_VAL)
7274 job_ptr->site_factor = job_desc->site_factor;
7275
7276 error_code = update_job_dependency(job_ptr, job_desc->dependency);
7277 if (error_code != SLURM_SUCCESS)
7278 goto cleanup_fail;
7279 job_ptr->details->orig_dependency = xstrdup(job_ptr->details->
7280 dependency);
7281
7282 if ((error_code = build_feature_list(job_ptr)))
7283 goto cleanup_fail;
7284
7285 /*
7286 * NOTE: If this job is being used to expand another job, this job's
7287 * gres_list has already been filled in with a copy of gres_list job
7288 * to be expanded by update_job_dependency()
7289 */
7290 if (!job_ptr->details->expanding_jobid) {
7291 job_ptr->gres_list = gres_list;
7292 gres_list = NULL;
7293 }
7294
7295 job_ptr->gres_detail_cnt = 0;
7296 job_ptr->gres_detail_str = NULL;
7297 gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);
7298
7299 if ((error_code = validate_job_resv(job_ptr)))
7300 goto cleanup_fail;
7301
7302 if (job_desc->script
7303 && (!will_run)) { /* don't bother with copy if just a test */
7304 if ((error_code = _copy_job_desc_to_file(job_desc,
7305 job_ptr->job_id))) {
7306 error_code = ESLURM_WRITING_TO_FILE;
7307 goto cleanup_fail;
7308 }
7309 job_ptr->batch_flag = 1;
7310 } else
7311 job_ptr->batch_flag = 0;
7312 if (!will_run &&
7313 (error_code = bb_g_job_validate2(job_ptr, err_msg)))
7314 goto cleanup_fail;
7315
7316 job_ptr->license_list = license_list;
7317 license_list = NULL;
7318
7319 if (job_desc->req_switch != NO_VAL) { /* Max # of switches */
7320 job_ptr->req_switch = job_desc->req_switch;
7321 if (job_desc->wait4switch != NO_VAL) {
7322 job_ptr->wait4switch =
7323 _max_switch_wait(job_desc->wait4switch);
7324 } else
7325 job_ptr->wait4switch = _max_switch_wait(INFINITE);
7326 }
7327 job_ptr->best_switch = true;
7328
7329 FREE_NULL_LIST(license_list);
7330 FREE_NULL_LIST(gres_list);
7331 FREE_NULL_BITMAP(req_bitmap);
7332 FREE_NULL_BITMAP(exc_bitmap);
7333 return error_code;
7334
7335 cleanup_fail:
7336 if (job_ptr) {
7337 job_ptr->job_state = JOB_FAILED;
7338 job_ptr->exit_code = 1;
7339 job_ptr->state_reason = FAIL_SYSTEM;
7340 xfree(job_ptr->state_desc);
7341 job_ptr->start_time = job_ptr->end_time = time(NULL);
7342 purge_job_record(job_ptr->job_id);
7343 *job_pptr = NULL;
7344 }
7345 FREE_NULL_LIST(license_list);
7346 xfree(acct_policy_limit_set.tres);
7347 FREE_NULL_LIST(gres_list);
7348 FREE_NULL_LIST(part_ptr_list);
7349 FREE_NULL_BITMAP(req_bitmap);
7350 FREE_NULL_BITMAP(exc_bitmap);
7351 return error_code;
7352 }
7353
_test_strlen(char * test_str,char * str_name,int max_str_len)7354 static int _test_strlen(char *test_str, char *str_name, int max_str_len)
7355 {
7356 int i = 0;
7357
7358 if (test_str)
7359 i = strlen(test_str);
7360 if (i > max_str_len) {
7361 info("job_create_request: strlen(%s) too big (%d > %d)",
7362 str_name, i, max_str_len);
7363 return ESLURM_PATHNAME_TOO_LONG;
7364 }
7365 return SLURM_SUCCESS;
7366 }
7367
7368 /* For each token in a comma delimited job array expression set the matching
7369 * bitmap entry */
_parse_array_tok(char * tok,bitstr_t * array_bitmap,uint32_t max)7370 static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max)
7371 {
7372 char *end_ptr = NULL;
7373 int i, first, last, step = 1;
7374
7375 if (tok[0] == '[') /* Strip leading "[" */
7376 tok++;
7377 first = strtol(tok, &end_ptr, 10);
7378 if (end_ptr[0] == ']') /* Strip trailing "]" */
7379 end_ptr++;
7380 if (first < 0)
7381 return false;
7382 if (end_ptr[0] == '-') {
7383 last = strtol(end_ptr + 1, &end_ptr, 10);
7384 if (end_ptr[0] == ']') /* Strip trailing "]" */
7385 end_ptr++;
7386 if (end_ptr[0] == ':') {
7387 step = strtol(end_ptr + 1, &end_ptr, 10);
7388 if (end_ptr[0] == ']') /* Strip trailing "]" */
7389 end_ptr++;
7390 if ((end_ptr[0] != '\0') && (end_ptr[0] != '%'))
7391 return false;
7392 if (step <= 0)
7393 return false;
7394 } else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) {
7395 return false;
7396 }
7397 if (last < first)
7398 return false;
7399 } else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) {
7400 return false;
7401 } else {
7402 last = first;
7403 }
7404
7405 if (last >= max)
7406 return false;
7407
7408 for (i = first; i <= last; i += step) {
7409 bit_set(array_bitmap, i);
7410 }
7411
7412 return true;
7413 }
7414
7415 /* Translate a job array expression into the equivalent bitmap */
_valid_array_inx(job_desc_msg_t * job_desc)7416 static bool _valid_array_inx(job_desc_msg_t *job_desc)
7417 {
7418 static time_t sched_update = 0;
7419 static uint32_t max_task_cnt = NO_VAL;
7420 uint32_t task_cnt;
7421 bool valid = true;
7422 char *tmp, *tok, *last = NULL;
7423
7424 FREE_NULL_BITMAP(job_desc->array_bitmap);
7425 if (!job_desc->array_inx || !job_desc->array_inx[0])
7426 return true;
7427 if (!job_desc->script || !job_desc->script[0])
7428 return false;
7429
7430 if (max_array_size == NO_VAL) {
7431 max_array_size = slurmctld_conf.max_array_sz;
7432 }
7433 if (max_array_size == 0) {
7434 verbose("Job arrays disabled, MaxArraySize=0");
7435 return false;
7436 }
7437
7438 if (sched_update != slurmctld_conf.last_update) {
7439 char *sched_params = slurm_get_sched_params();
7440 char *key;
7441 max_task_cnt = max_array_size;
7442 sched_update = slurmctld_conf.last_update;
7443 if ((key = xstrcasestr(sched_params, "max_array_tasks="))) {
7444 key += 16;
7445 max_task_cnt = atoi(key);
7446 }
7447 xfree(sched_params);
7448 }
7449
7450 /* We have a job array request */
7451 job_desc->immediate = 0; /* Disable immediate option */
7452 job_desc->array_bitmap = bit_alloc(max_array_size);
7453
7454 tmp = xstrdup(job_desc->array_inx);
7455 tok = strtok_r(tmp, ",", &last);
7456 while (tok && valid) {
7457 valid = _parse_array_tok(tok, job_desc->array_bitmap,
7458 max_array_size);
7459 tok = strtok_r(NULL, ",", &last);
7460 }
7461 xfree(tmp);
7462
7463 if (valid && (max_task_cnt < max_array_size)) {
7464 task_cnt = bit_set_count(job_desc->array_bitmap);
7465 if (task_cnt > max_task_cnt) {
7466 debug("max_array_tasks exceeded (%u > %u)",
7467 task_cnt, max_task_cnt);
7468 valid = false;
7469 }
7470 }
7471
7472 return valid;
7473 }
7474
7475 /* Make sure a job descriptor's strings are not huge, which could result in
7476 * a denial of service attack due to memory demands by the slurmctld */
_test_job_desc_fields(job_desc_msg_t * job_desc)7477 static int _test_job_desc_fields(job_desc_msg_t * job_desc)
7478 {
7479 static int max_script = -1;
7480
7481 if (max_script == -1) {
7482 char *sched_params = slurm_get_sched_params();
7483 char *tmp_ptr;
7484 max_script = 4 * 1024 * 1024;
7485 if ((tmp_ptr = xstrcasestr(sched_params, "max_script_size="))) {
7486 max_script = atoi(tmp_ptr + 16);
7487 }
7488 xfree(sched_params);
7489 }
7490
7491 if (_test_strlen(job_desc->account, "account", 1024) ||
7492 _test_strlen(job_desc->alloc_node, "alloc_node", 1024) ||
7493 _test_strlen(job_desc->array_inx, "array_inx", 1024 * 4) ||
7494 _test_strlen(job_desc->burst_buffer, "burst_buffer",1024*8) ||
7495 _test_strlen(job_desc->comment, "comment", 1024) ||
7496 _test_strlen(job_desc->cpu_bind, "cpu-bind", 1024 * 128) ||
7497 _test_strlen(job_desc->cpus_per_tres, "cpus_per_tres", 1024)||
7498 _test_strlen(job_desc->dependency, "dependency", 1024*128) ||
7499 _test_strlen(job_desc->features, "features", 1024) ||
7500 _test_strlen(
7501 job_desc->cluster_features, "cluster_features", 1024) ||
7502 _test_strlen(job_desc->licenses, "licenses", 1024) ||
7503 _test_strlen(job_desc->mail_user, "mail_user", 1024) ||
7504 _test_strlen(job_desc->mcs_label, "mcs_label", 1024) ||
7505 _test_strlen(job_desc->mem_bind, "mem-bind", 1024 * 128) ||
7506 _test_strlen(job_desc->mem_per_tres, "mem_per_tres", 1024) ||
7507 _test_strlen(job_desc->name, "name", 1024) ||
7508 _test_strlen(job_desc->network, "network", 1024) ||
7509 _test_strlen(job_desc->partition, "partition", 1024) ||
7510 _test_strlen(job_desc->qos, "qos", 1024) ||
7511 _test_strlen(job_desc->reservation, "reservation", 1024) ||
7512 _test_strlen(job_desc->script, "script", max_script) ||
7513 _test_strlen(job_desc->std_err, "std_err", MAXPATHLEN) ||
7514 _test_strlen(job_desc->std_in, "std_in", MAXPATHLEN) ||
7515 _test_strlen(job_desc->std_out, "std_out", MAXPATHLEN) ||
7516 _test_strlen(job_desc->tres_bind, "tres_bind", 1024) ||
7517 _test_strlen(job_desc->tres_freq, "tres_freq", 1024) ||
7518 _test_strlen(job_desc->tres_per_job, "tres_per_job", 1024) ||
7519 _test_strlen(job_desc->tres_per_node, "tres_per_node", 1024)||
7520 _test_strlen(job_desc->tres_per_socket, "tres_per_socket", 1024) ||
7521 _test_strlen(job_desc->tres_per_task, "tres_per_task", 1024)||
7522 _test_strlen(job_desc->wckey, "wckey", 1024) ||
7523 _test_strlen(job_desc->work_dir, "work_dir", MAXPATHLEN))
7524 return ESLURM_PATHNAME_TOO_LONG;
7525
7526 return SLURM_SUCCESS;
7527 }
7528
7529 /* Perform some size checks on strings we store to prevent
7530 * malicious user filling slurmctld's memory
7531 * IN job_desc - user job submit request
7532 * IN submit_uid - UID making job submit request
7533 * OUT err_msg - custom error message to return
7534 * RET 0 or error code */
validate_job_create_req(job_desc_msg_t * job_desc,uid_t submit_uid,char ** err_msg)7535 extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid,
7536 char **err_msg)
7537 {
7538 int rc;
7539
7540 /*
7541 * Check user permission for negative 'nice' and non-0 priority values
7542 * (restricted to root, SlurmUser, or SLURMDB_ADMIN_OPERATOR) _before_
7543 * running the job_submit plugin.
7544 */
7545 if (!validate_operator(submit_uid)) {
7546 if (job_desc->priority != 0)
7547 job_desc->priority = NO_VAL;
7548 if (job_desc->nice < NICE_OFFSET)
7549 return ESLURM_INVALID_NICE;
7550 }
7551
7552 if (!validate_super_user(submit_uid)) {
7553 /* AdminComment can only be set by an Admin. */
7554 if (job_desc->admin_comment)
7555 return ESLURM_ACCESS_DENIED;
7556
7557 if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) {
7558 *err_msg = xstrdup("rebooting of nodes is only allowed for admins");
7559 return ESLURM_ACCESS_DENIED;
7560 }
7561 }
7562
7563 rc = job_submit_plugin_submit(job_desc, (uint32_t) submit_uid, err_msg);
7564 if (rc != SLURM_SUCCESS)
7565 return rc;
7566 rc = node_features_g_job_valid(job_desc->features);
7567 if (rc != SLURM_SUCCESS)
7568 return rc;
7569
7570 rc = _test_job_desc_fields(job_desc);
7571 if (rc != SLURM_SUCCESS)
7572 return rc;
7573
7574 if (!_valid_array_inx(job_desc))
7575 return ESLURM_INVALID_ARRAY;
7576
7577 if (job_desc->x11 && !(slurmctld_conf.prolog_flags & PROLOG_FLAG_X11))
7578 return ESLURM_X11_NOT_AVAIL;
7579
7580 /* Make sure anything that may be put in the database will be
7581 * lower case */
7582 xstrtolower(job_desc->account);
7583 xstrtolower(job_desc->wckey);
7584
7585 /* Basic validation of some parameters */
7586 if (job_desc->req_nodes) {
7587 hostlist_t hl;
7588 uint32_t host_cnt;
7589 hl = hostlist_create(job_desc->req_nodes);
7590 if (hl == NULL) {
7591 /* likely a badly formatted hostlist */
7592 error("validate_job_create_req: bad hostlist");
7593 return ESLURM_INVALID_NODE_NAME;
7594 }
7595 host_cnt = hostlist_count(hl);
7596 hostlist_destroy(hl);
7597 if ((job_desc->min_nodes == NO_VAL) ||
7598 (job_desc->min_nodes < host_cnt))
7599 job_desc->min_nodes = host_cnt;
7600 }
7601
7602 /* If max nodes is different than min nodes don't set tasks or
7603 * it will hard code the range.
7604 */
7605 if ((job_desc->ntasks_per_node != NO_VAL16) &&
7606 (job_desc->min_nodes != NO_VAL) &&
7607 (job_desc->num_tasks == NO_VAL)) {
7608 job_desc->num_tasks =
7609 job_desc->ntasks_per_node * job_desc->min_nodes;
7610 }
7611
7612 /* Only set min and max cpus if overcommit isn't set */
7613 if ((job_desc->overcommit == NO_VAL8) &&
7614 (job_desc->min_cpus != NO_VAL) &&
7615 (job_desc->num_tasks != NO_VAL) &&
7616 (job_desc->num_tasks > job_desc->min_cpus)) {
7617 if (job_desc->num_tasks != NO_VAL)
7618 job_desc->min_cpus = job_desc->num_tasks;
7619 else if (job_desc->min_nodes != NO_VAL)
7620 job_desc->min_cpus = job_desc->min_nodes;
7621 else
7622 job_desc->min_cpus = 1;
7623
7624 if (job_desc->cpus_per_task != NO_VAL16)
7625 job_desc->min_cpus *= job_desc->cpus_per_task;
7626 /* This is just a sanity check as we wouldn't ever have a
7627 * max_cpus if we didn't have a min_cpus.
7628 */
7629 if ((job_desc->max_cpus != NO_VAL) &&
7630 (job_desc->max_cpus < job_desc->min_cpus))
7631 job_desc->max_cpus = job_desc->min_cpus;
7632 }
7633
7634 if (job_desc->reboot && (job_desc->reboot != NO_VAL16))
7635 job_desc->shared = 0;
7636
7637 return SLURM_SUCCESS;
7638 }
7639
7640 /* _copy_job_desc_to_file - copy the job script and environment from the RPC
7641 * structure into a file */
7642 static int
_copy_job_desc_to_file(job_desc_msg_t * job_desc,uint32_t job_id)7643 _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id)
7644 {
7645 int error_code = 0, hash;
7646 char *dir_name, *file_name;
7647 DEF_TIMERS;
7648
7649 START_TIMER;
7650
7651 if (!job_desc->environment || job_desc->env_size == 0) {
7652 error("%s: batch job cannot run without an environment",
7653 __func__);
7654 return ESLURM_ENVIRONMENT_MISSING;
7655 }
7656
7657 /* Create directory based upon job ID due to limitations on the number
7658 * of files possible in a directory on some file system types (e.g.
7659 * up to 64k files on a FAT32 file system). */
7660 hash = job_id % 10;
7661 dir_name = xstrdup_printf("%s/hash.%d",
7662 slurmctld_conf.state_save_location, hash);
7663 (void) mkdir(dir_name, 0700);
7664
7665 /* Create job_id specific directory */
7666 xstrfmtcat(dir_name, "/job.%u", job_id);
7667 if (mkdir(dir_name, 0700)) {
7668 if (!slurmctld_primary && (errno == EEXIST)) {
7669 error("Apparent duplicate JobId=%u. Two primary slurmctld daemons might currently be active",
7670 job_id);
7671 }
7672 error("mkdir(%s) error %m", dir_name);
7673 xfree(dir_name);
7674 return ESLURM_WRITING_TO_FILE;
7675 }
7676
7677 /* Create environment file, and write data to it */
7678 file_name = xstrdup_printf("%s/environment", dir_name);
7679 error_code = _write_data_array_to_file(file_name,
7680 job_desc->environment,
7681 job_desc->env_size);
7682 xfree(file_name);
7683
7684 if (error_code == 0) {
7685 /* Create script file */
7686 file_name = xstrdup_printf("%s/script", dir_name);
7687 error_code = _write_data_to_file(file_name, job_desc->script);
7688 xfree(file_name);
7689 }
7690
7691 xfree(dir_name);
7692 END_TIMER2("_copy_job_desc_to_file");
7693 return error_code;
7694 }
7695
7696 /* Return true of the specified job ID already has a batch directory so
7697 * that a different job ID can be created. This is to help limit damage from
7698 * split-brain, where two slurmctld daemons are running as primary. */
_dup_job_file_test(uint32_t job_id)7699 static bool _dup_job_file_test(uint32_t job_id)
7700 {
7701 char *dir_name_src;
7702 struct stat buf;
7703 int rc, hash = job_id % 10;
7704
7705 dir_name_src = xstrdup_printf("%s/hash.%d/job.%u",
7706 slurmctld_conf.state_save_location,
7707 hash, job_id);
7708 rc = stat(dir_name_src, &buf);
7709 xfree(dir_name_src);
7710 if (rc == 0) {
7711 error("Vestigial state files for JobId=%u, but no job record. This may be the result of two slurmctld running in primary mode",
7712 job_id);
7713 return true;
7714 }
7715 return false;
7716 }
7717
7718 /*
7719 * Create file with specified name and write the supplied data array to it
7720 * IN file_name - file to create and write to
7721 * IN data - array of pointers to strings (e.g. env)
7722 * IN size - number of elements in data
7723 */
7724 static int
_write_data_array_to_file(char * file_name,char ** data,uint32_t size)7725 _write_data_array_to_file(char *file_name, char **data, uint32_t size)
7726 {
7727 int fd, i, pos, nwrite, amount;
7728
7729 fd = creat(file_name, 0600);
7730 if (fd < 0) {
7731 error("Error creating file %s, %m", file_name);
7732 return ESLURM_WRITING_TO_FILE;
7733 }
7734
7735 amount = write(fd, &size, sizeof(uint32_t));
7736 if (amount < sizeof(uint32_t)) {
7737 error("Error writing file %s, %m", file_name);
7738 close(fd);
7739 return ESLURM_WRITING_TO_FILE;
7740 }
7741
7742 if (data == NULL) {
7743 close(fd);
7744 return SLURM_SUCCESS;
7745 }
7746
7747 for (i = 0; i < size; i++) {
7748 nwrite = strlen(data[i]) + 1;
7749 pos = 0;
7750 while (nwrite > 0) {
7751 amount = write(fd, &data[i][pos], nwrite);
7752 if ((amount < 0) && (errno != EINTR)) {
7753 error("Error writing file %s, %m",
7754 file_name);
7755 close(fd);
7756 return ESLURM_WRITING_TO_FILE;
7757 }
7758 nwrite -= amount;
7759 pos += amount;
7760 }
7761 }
7762
7763 close(fd);
7764 return SLURM_SUCCESS;
7765 }
7766
7767 /*
7768 * Create file with specified name and write the supplied data array to it
7769 * IN file_name - file to create and write to
7770 * IN data - pointer to string
7771 */
_write_data_to_file(char * file_name,char * data)7772 static int _write_data_to_file(char *file_name, char *data)
7773 {
7774 int fd, pos, nwrite, amount;
7775
7776 if (data == NULL) {
7777 (void) unlink(file_name);
7778 return SLURM_SUCCESS;
7779 }
7780
7781 fd = creat(file_name, 0700);
7782 if (fd < 0) {
7783 error("Error creating file %s, %m", file_name);
7784 return ESLURM_WRITING_TO_FILE;
7785 }
7786
7787 nwrite = strlen(data) + 1;
7788 pos = 0;
7789 while (nwrite > 0) {
7790 amount = write(fd, &data[pos], nwrite);
7791 if ((amount < 0) && (errno != EINTR)) {
7792 error("Error writing file %s, %m", file_name);
7793 close(fd);
7794 return ESLURM_WRITING_TO_FILE;
7795 }
7796 nwrite -= amount;
7797 pos += amount;
7798 }
7799 close(fd);
7800 return SLURM_SUCCESS;
7801 }
7802
7803 /*
7804 * get_job_env - return the environment variables and their count for a
7805 * given job
7806 * IN job_ptr - pointer to job for which data is required
7807 * OUT env_size - number of elements to read
7808 * RET point to array of string pointers containing environment variables
7809 */
get_job_env(job_record_t * job_ptr,uint32_t * env_size)7810 char **get_job_env(job_record_t *job_ptr, uint32_t *env_size)
7811 {
7812 char *file_name = NULL, **environment = NULL;
7813 int cc, fd = -1, hash;
7814 uint32_t use_id;
7815
7816 use_id = (job_ptr->array_task_id != NO_VAL) ?
7817 job_ptr->array_job_id : job_ptr->job_id;
7818 hash = use_id % 10;
7819 file_name = xstrdup_printf("%s/hash.%d/job.%u/environment",
7820 slurmctld_conf.state_save_location,
7821 hash, use_id);
7822 fd = open(file_name, 0);
7823
7824 if (fd >= 0) {
7825 cc = _read_data_array_from_file(fd, file_name, &environment,
7826 env_size, job_ptr);
7827 if (cc < 0)
7828 environment = NULL;
7829 close(fd);
7830 } else {
7831 error("Could not open environment file for %pJ", job_ptr);
7832 }
7833
7834 xfree(file_name);
7835 return environment;
7836 }
7837
7838 /*
7839 * get_job_script - return the script for a given job
7840 * IN job_ptr - pointer to job for which data is required
7841 * RET Buf containing job script
7842 */
get_job_script(const job_record_t * job_ptr)7843 Buf get_job_script(const job_record_t *job_ptr)
7844 {
7845 char *file_name = NULL;
7846 int hash;
7847 uint32_t use_id;
7848 Buf buf;
7849
7850 if (!job_ptr->batch_flag)
7851 return NULL;
7852
7853 use_id = (job_ptr->array_task_id != NO_VAL) ?
7854 job_ptr->array_job_id : job_ptr->job_id;
7855 hash = use_id % 10;
7856 file_name = xstrdup_printf("%s/hash.%d/job.%u/script",
7857 slurmctld_conf.state_save_location,
7858 hash, use_id);
7859
7860 if (!(buf = create_mmap_buf(file_name)))
7861 error("Could not open script file for %pJ", job_ptr);
7862 xfree(file_name);
7863
7864 return buf;
7865 }
7866
7867 /*
7868 * Read a collection of strings from a file
7869 * IN fd - file descriptor
7870 * IN file_name - file to read from
7871 * OUT data - pointer to array of pointers to strings (e.g. env),
7872 * must be xfreed when no longer needed
7873 * OUT size - number of elements in data
7874 * IN job_ptr - job
7875 * RET 0 on success, -1 on error
7876 * NOTE: The output format of this must be identical with _xduparray2()
7877 */
_read_data_array_from_file(int fd,char * file_name,char *** data,uint32_t * size,job_record_t * job_ptr)7878 static int _read_data_array_from_file(int fd, char *file_name, char ***data,
7879 uint32_t *size, job_record_t *job_ptr)
7880 {
7881 int pos, buf_size, amount, i, j;
7882 char *buffer, **array_ptr;
7883 uint32_t rec_cnt;
7884
7885 xassert(file_name);
7886 xassert(data);
7887 xassert(size);
7888 *data = NULL;
7889 *size = 0;
7890
7891 amount = read(fd, &rec_cnt, sizeof(uint32_t));
7892 if (amount < sizeof(uint32_t)) {
7893 if (amount != 0) /* incomplete write */
7894 error("Error reading file %s, %m", file_name);
7895 else
7896 verbose("File %s has zero size", file_name);
7897 return -1;
7898 }
7899
7900 if (rec_cnt >= INT_MAX) {
7901 error("%s: unreasonable record counter %d in file %s",
7902 __func__, rec_cnt, file_name);
7903 return -1;
7904 }
7905
7906 if (rec_cnt == 0) {
7907 *data = NULL;
7908 *size = 0;
7909 return 0;
7910 }
7911
7912 pos = 0;
7913 buf_size = BUF_SIZE;
7914 buffer = xmalloc(buf_size + 1);
7915 while (1) {
7916 amount = read(fd, &buffer[pos], BUF_SIZE);
7917 if (amount < 0) {
7918 error("Error reading file %s, %m", file_name);
7919 xfree(buffer);
7920 return -1;
7921 }
7922 buffer[pos + amount] = '\0';
7923 pos += amount;
7924 if (amount < BUF_SIZE) /* end of file */
7925 break;
7926 buf_size += amount;
7927 xrealloc(buffer, buf_size + 1);
7928 }
7929
7930 /* Allocate extra space for supplemental environment variables */
7931 if (job_ptr->details->env_cnt) {
7932 for (j = 0; j < job_ptr->details->env_cnt; j++)
7933 pos += (strlen(job_ptr->details->env_sup[j]) + 1);
7934 xrealloc(buffer, pos);
7935 }
7936
7937 /* We have all the data, now let's compute the pointers */
7938 array_ptr = xcalloc((rec_cnt + job_ptr->details->env_cnt),
7939 sizeof(char *));
7940 for (i = 0, pos = 0; i < rec_cnt; i++) {
7941 array_ptr[i] = &buffer[pos];
7942 pos += strlen(&buffer[pos]) + 1;
7943 if ((pos > buf_size) && ((i + 1) < rec_cnt)) {
7944 error("Bad environment file %s", file_name);
7945 rec_cnt = i;
7946 break;
7947 }
7948 }
7949
7950 /* Add supplemental environment variables */
7951 if (job_ptr->details->env_cnt) {
7952 char *tmp_chr;
7953 int env_len, name_len;
7954 for (j = 0; j < job_ptr->details->env_cnt; j++) {
7955 tmp_chr = strchr(job_ptr->details->env_sup[j], '=');
7956 if (tmp_chr == NULL) {
7957 error("Invalid supplemental environment "
7958 "variable: %s",
7959 job_ptr->details->env_sup[j]);
7960 continue;
7961 }
7962 env_len = strlen(job_ptr->details->env_sup[j]) + 1;
7963 name_len = tmp_chr - job_ptr->details->env_sup[j] + 1;
7964 /* search for duplicate */
7965 for (i = 0; i < rec_cnt; i++) {
7966 if (xstrncmp(array_ptr[i],
7967 job_ptr->details->env_sup[j],
7968 name_len)) {
7969 continue;
7970 }
7971
7972 /*
7973 * If we are are the front we can not overwrite
7974 * that spot, we can clear it an then add to the
7975 * end of the array.
7976 */
7977 if (i == 0) {
7978 array_ptr[0][0] = '\0';
7979 i = rec_cnt;
7980 break;
7981 }
7982 /* over-write duplicate */
7983 memcpy(&buffer[pos],
7984 job_ptr->details->env_sup[j], env_len);
7985 array_ptr[i] = &buffer[pos];
7986 pos += env_len;
7987 break;
7988 }
7989 if (i >= rec_cnt) { /* add env to array end */
7990 memcpy(&buffer[pos],
7991 job_ptr->details->env_sup[j], env_len);
7992 array_ptr[rec_cnt++] = &buffer[pos];
7993 pos += env_len;
7994 }
7995 }
7996 }
7997
7998 *size = rec_cnt;
7999 *data = array_ptr;
8000 return 0;
8001 }
8002
8003 /* Given a job request, return a multi_core_data struct.
8004 * Returns NULL if no values set in the job/step request */
8005 static multi_core_data_t *
_set_multi_core_data(job_desc_msg_t * job_desc)8006 _set_multi_core_data(job_desc_msg_t * job_desc)
8007 {
8008 multi_core_data_t * mc_ptr;
8009
8010 if ((job_desc->sockets_per_node == NO_VAL16) &&
8011 (job_desc->cores_per_socket == NO_VAL16) &&
8012 (job_desc->threads_per_core == NO_VAL16) &&
8013 (job_desc->ntasks_per_socket == NO_VAL16) &&
8014 (job_desc->ntasks_per_core == NO_VAL16) &&
8015 (job_desc->plane_size == NO_VAL16))
8016 return NULL;
8017
8018 mc_ptr = xmalloc(sizeof(multi_core_data_t));
8019 mc_ptr->sockets_per_node = job_desc->sockets_per_node;
8020 mc_ptr->cores_per_socket = job_desc->cores_per_socket;
8021 mc_ptr->threads_per_core = job_desc->threads_per_core;
8022 if (job_desc->ntasks_per_socket != NO_VAL16)
8023 mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket;
8024 else
8025 mc_ptr->ntasks_per_socket = INFINITE16;
8026 if (job_desc->ntasks_per_core != NO_VAL16)
8027 mc_ptr->ntasks_per_core = job_desc->ntasks_per_core;
8028 else if (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE)
8029 mc_ptr->ntasks_per_core = 1;
8030 else
8031 mc_ptr->ntasks_per_core = INFINITE16;
8032 if (job_desc->plane_size != NO_VAL16)
8033 mc_ptr->plane_size = job_desc->plane_size;
8034 else
8035 mc_ptr->plane_size = 0;
8036
8037 return mc_ptr;
8038 }
8039
8040 /* Return default "wait_all_nodes" option for a new job */
_default_wait_all_nodes(job_desc_msg_t * job_desc)8041 static uint16_t _default_wait_all_nodes(job_desc_msg_t *job_desc)
8042 {
8043 static uint16_t default_batch_wait = NO_VAL16;
8044 static time_t sched_update = 0;
8045 char *sched_params;
8046
8047 if (!job_desc->script)
8048 return 0;
8049
8050 if ((default_batch_wait != NO_VAL16) &&
8051 (sched_update == slurmctld_conf.last_update))
8052 return default_batch_wait;
8053
8054 sched_params = slurm_get_sched_params();
8055 if (xstrcasestr(sched_params, "sbatch_wait_nodes"))
8056 default_batch_wait = 1;
8057 else
8058 default_batch_wait = 0;
8059 xfree(sched_params);
8060 sched_update = slurmctld_conf.last_update;
8061
8062 return default_batch_wait;
8063 }
8064
8065 /* _copy_job_desc_to_job_record - copy the job descriptor from the RPC
8066 * structure into the actual slurmctld job record */
_copy_job_desc_to_job_record(job_desc_msg_t * job_desc,job_record_t ** job_rec_ptr,bitstr_t ** req_bitmap,bitstr_t ** exc_bitmap)8067 static int _copy_job_desc_to_job_record(job_desc_msg_t *job_desc,
8068 job_record_t **job_rec_ptr,
8069 bitstr_t **req_bitmap,
8070 bitstr_t **exc_bitmap)
8071 {
8072 int error_code;
8073 struct job_details *detail_ptr;
8074 job_record_t *job_ptr;
8075
8076 if (slurm_get_track_wckey()) {
8077 if (!job_desc->wckey) {
8078 /* get the default wckey for this user since none was
8079 * given */
8080 slurmdb_user_rec_t user_rec;
8081 memset(&user_rec, 0, sizeof(user_rec));
8082 user_rec.uid = job_desc->user_id;
8083 assoc_mgr_fill_in_user(acct_db_conn, &user_rec,
8084 accounting_enforce, NULL, false);
8085 if (user_rec.default_wckey)
8086 job_desc->wckey = xstrdup_printf(
8087 "*%s", user_rec.default_wckey);
8088 else if (!(accounting_enforce &
8089 ACCOUNTING_ENFORCE_WCKEYS))
8090 job_desc->wckey = xstrdup("*");
8091 else {
8092 error("Job didn't specify wckey and user "
8093 "%d has no default.", job_desc->user_id);
8094 return ESLURM_INVALID_WCKEY;
8095 }
8096 } else if (job_desc->wckey) {
8097 slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL;
8098
8099 memset(&wckey_rec, 0, sizeof(wckey_rec));
8100 wckey_rec.uid = job_desc->user_id;
8101 wckey_rec.name = job_desc->wckey;
8102
8103 if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
8104 accounting_enforce,
8105 &wckey_ptr, false)) {
8106 if (accounting_enforce &
8107 ACCOUNTING_ENFORCE_WCKEYS) {
8108 error("%s: invalid wckey '%s' for "
8109 "user %u.",
8110 __func__, wckey_rec.name,
8111 job_desc->user_id);
8112 return ESLURM_INVALID_WCKEY;
8113 }
8114 }
8115 } else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) {
8116 /* This should never happen */
8117 info("%s: no wckey was given for job submit", __func__);
8118 return ESLURM_INVALID_WCKEY;
8119 }
8120 }
8121
8122 job_ptr = _create_job_record(1);
8123 if (!job_ptr)
8124 return SLURM_ERROR;
8125
8126 *job_rec_ptr = job_ptr;
8127 job_ptr->partition = xstrdup(job_desc->partition);
8128 if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET)
8129 job_ptr->profile = job_desc->profile;
8130
8131 if (job_desc->job_id != NO_VAL) { /* already confirmed unique */
8132 job_ptr->job_id = job_desc->job_id;
8133 } else {
8134 error_code = _set_job_id(job_ptr);
8135 if (error_code)
8136 return error_code;
8137 }
8138
8139 job_ptr->name = xstrdup(job_desc->name);
8140 job_ptr->wckey = xstrdup(job_desc->wckey);
8141
8142 /* Since this is only used in the slurmctld, copy it now. */
8143 job_ptr->tres_req_cnt = job_desc->tres_req_cnt;
8144 job_desc->tres_req_cnt = NULL;
8145 set_job_tres_req_str(job_ptr, false);
8146 _add_job_hash(job_ptr);
8147
8148 job_ptr->user_id = (uid_t) job_desc->user_id;
8149 job_ptr->group_id = (gid_t) job_desc->group_id;
8150 job_ptr->job_state = JOB_PENDING;
8151 job_ptr->time_limit = job_desc->time_limit;
8152 job_ptr->deadline = job_desc->deadline;
8153 if (job_desc->delay_boot == NO_VAL)
8154 job_ptr->delay_boot = delay_boot;
8155 else
8156 job_ptr->delay_boot = job_desc->delay_boot;
8157 if (job_desc->time_min != NO_VAL)
8158 job_ptr->time_min = job_desc->time_min;
8159 job_ptr->alloc_sid = job_desc->alloc_sid;
8160 job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
8161 job_ptr->account = xstrdup(job_desc->account);
8162 job_ptr->batch_features = xstrdup(job_desc->batch_features);
8163 job_ptr->burst_buffer = xstrdup(job_desc->burst_buffer);
8164 job_ptr->network = xstrdup(job_desc->network);
8165 job_ptr->resv_name = xstrdup(job_desc->reservation);
8166 job_ptr->restart_cnt = job_desc->restart_cnt;
8167 job_ptr->comment = xstrdup(job_desc->comment);
8168 job_ptr->admin_comment = xstrdup(job_desc->admin_comment);
8169
8170 if (job_desc->kill_on_node_fail != NO_VAL16)
8171 job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;
8172
8173 job_ptr->resp_host = xstrdup(job_desc->resp_host);
8174 job_ptr->alloc_resp_port = job_desc->alloc_resp_port;
8175 job_ptr->other_port = job_desc->other_port;
8176 job_ptr->power_flags = job_desc->power_flags;
8177 job_ptr->time_last_active = time(NULL);
8178 job_ptr->cr_enabled = 0;
8179 job_ptr->derived_ec = 0;
8180
8181 job_ptr->licenses = xstrdup(job_desc->licenses);
8182 job_ptr->mail_user = _get_mail_user(job_desc->mail_user,
8183 job_ptr->user_id);
8184 if (job_desc->mail_type &&
8185 (job_desc->mail_type != NO_VAL16)) {
8186 job_ptr->mail_type = job_desc->mail_type;
8187 }
8188
8189 job_ptr->bit_flags = job_desc->bitflags;
8190 job_ptr->bit_flags &= ~BACKFILL_TEST;
8191 job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
8192 job_ptr->spank_job_env = job_desc->spank_job_env;
8193 job_ptr->spank_job_env_size = job_desc->spank_job_env_size;
8194 job_desc->spank_job_env = (char **) NULL; /* nothing left to free */
8195 job_desc->spank_job_env_size = 0; /* nothing left to free */
8196 job_ptr->mcs_label = xstrdup(job_desc->mcs_label);
8197 job_ptr->origin_cluster = xstrdup(job_desc->origin_cluster);
8198
8199 job_ptr->cpus_per_tres = xstrdup(job_desc->cpus_per_tres);
8200 job_ptr->mem_per_tres = xstrdup(job_desc->mem_per_tres);
8201 job_ptr->tres_bind = xstrdup(job_desc->tres_bind);
8202 job_ptr->tres_freq = xstrdup(job_desc->tres_freq);
8203 job_ptr->tres_per_job = xstrdup(job_desc->tres_per_job);
8204 job_ptr->tres_per_node = xstrdup(job_desc->tres_per_node);
8205 job_ptr->tres_per_socket = xstrdup(job_desc->tres_per_socket);
8206 job_ptr->tres_per_task = xstrdup(job_desc->tres_per_task);
8207
8208 if (job_desc->wait_all_nodes == NO_VAL16)
8209 job_ptr->wait_all_nodes = _default_wait_all_nodes(job_desc);
8210 else
8211 job_ptr->wait_all_nodes = job_desc->wait_all_nodes;
8212 job_ptr->warn_flags = job_desc->warn_flags;
8213 job_ptr->warn_signal = job_desc->warn_signal;
8214 job_ptr->warn_time = job_desc->warn_time;
8215
8216 detail_ptr = job_ptr->details;
8217 detail_ptr->argc = job_desc->argc;
8218 detail_ptr->argv = job_desc->argv;
8219 job_desc->argv = (char **) NULL; /* nothing left to free */
8220 job_desc->argc = 0; /* nothing left to free */
8221 detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq);
8222 detail_ptr->cpu_bind_type = job_desc->cpu_bind_type;
8223 detail_ptr->cpu_bind = xstrdup(job_desc->cpu_bind);
8224 detail_ptr->cpu_freq_gov = job_desc->cpu_freq_gov;
8225 detail_ptr->cpu_freq_max = job_desc->cpu_freq_max;
8226 detail_ptr->cpu_freq_min = job_desc->cpu_freq_min;
8227 detail_ptr->extra = job_desc->extra;
8228 detail_ptr->nice = job_desc->nice;
8229 detail_ptr->open_mode = job_desc->open_mode;
8230 detail_ptr->min_cpus = job_desc->min_cpus;
8231 detail_ptr->orig_min_cpus = job_desc->min_cpus;
8232 detail_ptr->max_cpus = job_desc->max_cpus;
8233 detail_ptr->orig_max_cpus = job_desc->max_cpus;
8234 detail_ptr->min_nodes = job_desc->min_nodes;
8235 detail_ptr->max_nodes = job_desc->max_nodes;
8236 detail_ptr->x11 = job_desc->x11;
8237 detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
8238 detail_ptr->x11_target = xstrdup(job_desc->x11_target);
8239 detail_ptr->x11_target_port = job_desc->x11_target_port;
8240 if (job_desc->req_nodes) {
8241 detail_ptr->req_nodes =
8242 _copy_nodelist_no_dup(job_desc->req_nodes);
8243 detail_ptr->req_node_bitmap = *req_bitmap;
8244 *req_bitmap = NULL; /* Reused nothing left to free */
8245 }
8246 if (job_desc->exc_nodes) {
8247 detail_ptr->exc_nodes =
8248 _copy_nodelist_no_dup(job_desc->exc_nodes);
8249 detail_ptr->exc_node_bitmap = *exc_bitmap;
8250 *exc_bitmap = NULL; /* Reused nothing left to free */
8251 }
8252 detail_ptr->features = xstrdup(job_desc->features);
8253 detail_ptr->cluster_features = xstrdup(job_desc->cluster_features);
8254 if (job_desc->fed_siblings_viable) {
8255 job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t));
8256 job_ptr->fed_details->siblings_viable =
8257 job_desc->fed_siblings_viable;
8258 update_job_fed_details(job_ptr);
8259 }
8260 if (job_desc->shared == JOB_SHARED_NONE) {
8261 detail_ptr->share_res = 0;
8262 detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
8263 } else if (job_desc->shared == JOB_SHARED_OK) {
8264 detail_ptr->share_res = 1;
8265 detail_ptr->whole_node = 0;
8266 } else if (job_desc->shared == JOB_SHARED_USER) {
8267 detail_ptr->share_res = NO_VAL8;
8268 detail_ptr->whole_node = WHOLE_NODE_USER;
8269 } else if (job_desc->shared == JOB_SHARED_MCS) {
8270 detail_ptr->share_res = NO_VAL8;
8271 detail_ptr->whole_node = WHOLE_NODE_MCS;
8272 } else {
8273 detail_ptr->share_res = NO_VAL8;
8274 detail_ptr->whole_node = 0;
8275 }
8276 if (job_desc->contiguous != NO_VAL16)
8277 detail_ptr->contiguous = job_desc->contiguous;
8278 if (slurmctld_conf.conf_flags & CTL_CONF_ASRU)
8279 detail_ptr->core_spec = job_desc->core_spec;
8280 else
8281 detail_ptr->core_spec = NO_VAL16;
8282 if (detail_ptr->core_spec != NO_VAL16)
8283 detail_ptr->whole_node = 1;
8284 if (job_desc->task_dist != NO_VAL)
8285 detail_ptr->task_dist = job_desc->task_dist;
8286 if (job_desc->cpus_per_task == NO_VAL16) {
8287 detail_ptr->cpus_per_task = 1;
8288 detail_ptr->orig_cpus_per_task = NO_VAL16;
8289 } else {
8290 detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1);
8291 detail_ptr->orig_cpus_per_task = detail_ptr->cpus_per_task;
8292 }
8293 if (job_desc->pn_min_cpus != NO_VAL16)
8294 detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
8295 if (job_desc->overcommit != NO_VAL8)
8296 detail_ptr->overcommit = job_desc->overcommit;
8297 if (job_desc->num_tasks != NO_VAL)
8298 detail_ptr->num_tasks = job_desc->num_tasks;
8299 if (job_desc->ntasks_per_node != NO_VAL16) {
8300 detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
8301 if ((detail_ptr->overcommit == 0) &&
8302 (detail_ptr->num_tasks > 1)) {
8303 detail_ptr->pn_min_cpus =
8304 MAX(detail_ptr->pn_min_cpus,
8305 (detail_ptr->cpus_per_task *
8306 detail_ptr->ntasks_per_node));
8307 }
8308 }
8309 detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus,
8310 detail_ptr->cpus_per_task);
8311 detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus;
8312 if (job_desc->reboot != NO_VAL16)
8313 job_ptr->reboot = MIN(job_desc->reboot, 1);
8314 else
8315 job_ptr->reboot = 0;
8316 if (job_desc->requeue != NO_VAL16)
8317 detail_ptr->requeue = MIN(job_desc->requeue, 1);
8318 else
8319 detail_ptr->requeue = slurmctld_conf.job_requeue;
8320 if (job_desc->pn_min_memory != NO_VAL64)
8321 detail_ptr->pn_min_memory = job_desc->pn_min_memory;
8322 detail_ptr->orig_pn_min_memory = detail_ptr->pn_min_memory;
8323 if (job_desc->pn_min_tmp_disk != NO_VAL)
8324 detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk;
8325 detail_ptr->std_err = xstrdup(job_desc->std_err);
8326 detail_ptr->std_in = xstrdup(job_desc->std_in);
8327 detail_ptr->std_out = xstrdup(job_desc->std_out);
8328 detail_ptr->work_dir = xstrdup(job_desc->work_dir);
8329 if (job_desc->begin_time > time(NULL))
8330 detail_ptr->begin_time = job_desc->begin_time;
8331 job_ptr->select_jobinfo =
8332 select_g_select_jobinfo_copy(job_desc->select_jobinfo);
8333
8334 select_g_select_jobinfo_set(job_ptr->select_jobinfo,
8335 SELECT_JOBDATA_NETWORK,
8336 job_ptr->network);
8337
8338 job_ptr->clusters = xstrdup(job_desc->clusters);
8339
8340 /*
8341 * The priority needs to be set after this since we don't have
8342 * an association rec yet
8343 */
8344 detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
8345
8346 if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) &&
8347 (detail_ptr->num_tasks != 0)) {
8348 if (detail_ptr->min_nodes == 0)
8349 detail_ptr->min_nodes = 1;
8350 detail_ptr->max_nodes =
8351 MIN(node_record_count, detail_ptr->num_tasks);
8352 }
8353
8354 return SLURM_SUCCESS;
8355 }
8356
/*
 * _copy_nodelist_no_dup - Take a node_list string and convert it to an
 *	expression without duplicate names. For example, we want to convert
 *	a user's request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]"
 *	node_list IN - string describing a list of nodes
 *	RET a compact node expression, must be xfreed by the user
 */
_copy_nodelist_no_dup(char * node_list)8364 static char *_copy_nodelist_no_dup(char *node_list)
8365 {
8366 char *buf;
8367
8368 hostlist_t hl = hostlist_create(node_list);
8369 if (hl == NULL)
8370 return NULL;
8371 hostlist_uniq(hl);
8372 buf = hostlist_ranged_string_xmalloc(hl);
8373 hostlist_destroy(hl);
8374
8375 return buf;
8376 }
8377
8378 /* Return the number of CPUs on the first node in the identified partition */
_cpus_per_node_part(part_record_t * part_ptr)8379 static uint16_t _cpus_per_node_part(part_record_t *part_ptr)
8380 {
8381 int node_inx = -1;
8382 node_record_t *node_ptr;
8383
8384 if (part_ptr->node_bitmap)
8385 node_inx = bit_ffs(part_ptr->node_bitmap);
8386 if (node_inx >= 0) {
8387 node_ptr = node_record_table_ptr + node_inx;
8388 return node_ptr->config_ptr->cpus;
8389 }
8390 return 0;
8391 }
8392
8393 /*
8394 * Test if this job exceeds any of MaxMemPer[CPU|Node] limits and potentially
8395 * adjust mem / cpu ratios.
8396 *
8397 * NOTE: This function is also called with a dummy job_desc_msg_t from
8398 * job_limits_check(), if there is any new check added here you may also have to
8399 * add that parameter to the job_desc_msg_t in that function.
8400 */
static bool _valid_pn_min_mem(job_desc_msg_t *job_desc_msg,
			      part_record_t *part_ptr)
{
	uint64_t job_mem_limit = job_desc_msg->pn_min_memory;
	uint64_t sys_mem_limit;
	uint16_t cpus_per_node;

	/* Partition-level limit overrides the cluster-wide setting */
	if (part_ptr && part_ptr->max_mem_per_cpu)
		sys_mem_limit = part_ptr->max_mem_per_cpu;
	else
		sys_mem_limit = slurmctld_conf.max_mem_per_cpu;

	/* No configured limit (0) or a bare per-CPU flag => nothing to check */
	if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU))
		return true;

	/* MEM_PER_CPU is a flag bit marking a per-CPU (vs per-node) value */
	if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) {
		/* Both values are per-CPU: comparable directly */
		uint64_t mem_ratio;
		job_mem_limit &= (~MEM_PER_CPU);
		sys_mem_limit &= (~MEM_PER_CPU);
		if (job_mem_limit <= sys_mem_limit)
			return true;
		/* mem_ratio = ceil(job_mem_limit / sys_mem_limit) */
		mem_ratio = (job_mem_limit + sys_mem_limit - 1);
		mem_ratio /= sys_mem_limit;
		debug("increasing cpus_per_task and decreasing mem_per_cpu by "
		      "factor of %"PRIu64" based upon mem_per_cpu limits",
		      mem_ratio);
		if (job_desc_msg->cpus_per_task == NO_VAL16)
			job_desc_msg->cpus_per_task = mem_ratio;
		else
			job_desc_msg->cpus_per_task *= mem_ratio;
		/* Scale memory down by the same (rounded-up) factor so the
		 * per-CPU request falls within the limit */
		job_desc_msg->pn_min_memory = ((job_mem_limit + mem_ratio - 1) /
					       mem_ratio) | MEM_PER_CPU;
		if ((job_desc_msg->num_tasks != NO_VAL) &&
		    (job_desc_msg->num_tasks != 0) &&
		    (job_desc_msg->min_cpus != NO_VAL)) {
			/* Keep min/max CPU counts consistent with the
			 * inflated cpus_per_task */
			job_desc_msg->min_cpus =
				job_desc_msg->num_tasks *
				job_desc_msg->cpus_per_task;

			if ((job_desc_msg->max_cpus != NO_VAL) &&
			    (job_desc_msg->max_cpus < job_desc_msg->min_cpus)) {
				job_desc_msg->max_cpus = job_desc_msg->min_cpus;
			}
		}
		return true;
	}

	if (((job_mem_limit & MEM_PER_CPU) == 0) &&
	    ((sys_mem_limit & MEM_PER_CPU) == 0)) {
		/* Both values are per-node: comparable directly */
		if (job_mem_limit <= sys_mem_limit)
			return true;
		debug2("JobId=%u mem=%"PRIu64"M > MaxMemPerNode=%"PRIu64"M in partition %s",
		       job_desc_msg->job_id, job_mem_limit, sys_mem_limit,
		       (part_ptr && part_ptr->name) ? part_ptr->name : "N/A");
		return false;
	}

	/* Job and system have different memory limit forms (i.e. one is a
	 * per-job and the other is per-node). Convert them both to per-node
	 * values for comparison. */
	if (part_ptr && (!part_ptr->max_share || !job_desc_msg->shared)) {
		/* Whole node allocation */
		cpus_per_node = _cpus_per_node_part(part_ptr);
	} else {
		/* Estimate CPUs used per node from the requested task layout */
		if ((job_desc_msg->ntasks_per_node != NO_VAL16) &&
		    (job_desc_msg->ntasks_per_node != 0))
			cpus_per_node = job_desc_msg->ntasks_per_node;
		else
			cpus_per_node = 1;

		if ((job_desc_msg->num_tasks != NO_VAL) &&
		    (job_desc_msg->num_tasks != 0) &&
		    (job_desc_msg->max_nodes != NO_VAL) &&
		    (job_desc_msg->max_nodes != 0)) {
			/* At least ceil(num_tasks / max_nodes) tasks/node */
			cpus_per_node = MAX(cpus_per_node,
					    ((job_desc_msg->num_tasks +
					      job_desc_msg->max_nodes - 1) /
					     job_desc_msg->max_nodes));
		}

		if ((job_desc_msg->cpus_per_task != NO_VAL16) &&
		    (job_desc_msg->cpus_per_task != 0))
			cpus_per_node *= job_desc_msg->cpus_per_task;

		if ((job_desc_msg->pn_min_cpus != NO_VAL16) &&
		    (job_desc_msg->pn_min_cpus > cpus_per_node))
			cpus_per_node = job_desc_msg->pn_min_cpus;
	}

	if (job_mem_limit & MEM_PER_CPU) {
		/* Job has per-CPU memory limit, system has per-node limit */
		job_mem_limit &= (~MEM_PER_CPU);
		job_mem_limit *= cpus_per_node;
	} else {
		/* Job has per-node memory limit, system has per-CPU limit */
		uint32_t min_cpus;
		sys_mem_limit &= (~MEM_PER_CPU);
		/* min_cpus = ceil(job_mem_limit / sys_mem_limit) */
		min_cpus = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit;

		if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
		    (job_desc_msg->pn_min_cpus < min_cpus)) {
			debug("Setting job's pn_min_cpus to %u due to memory "
			      "limit", min_cpus);
			job_desc_msg->pn_min_cpus = min_cpus;
			cpus_per_node = MAX(cpus_per_node, min_cpus);
		}
		sys_mem_limit *= cpus_per_node;
	}

	if (job_mem_limit <= sys_mem_limit)
		return true;

	/*
	 * NOTE(review): by this point MEM_PER_CPU has been cleared from
	 * job_mem_limit (or was never set on it), so the ternary below
	 * appears to always report "Node" — confirm intent.
	 */
	debug2("JobId=%u mem=%"PRIu64"M > MaxMemPer%s=%"PRIu64"M in partition:%s",
	       job_desc_msg->job_id, job_mem_limit,
	       (job_mem_limit & MEM_PER_CPU) ? "CPU" : "Node", sys_mem_limit,
	       (part_ptr && part_ptr->name) ? part_ptr->name : "N/A");

	return false;
}
8520
8521 /*
8522 * Validate TRES specification of the form:
8523 * "name=[type:]#[,[type:]#][;name=[type:]#]"
8524 * For example: "gpu:kepler:2,craynetwork=1"
8525 */
extern bool valid_tres_cnt(char *tres)
{
	char *end_ptr = NULL, *colon, *save_ptr = NULL, *sep, *tok, *tmp;
	bool rc = true;
	long long int val;

	/* An empty specification is trivially valid */
	if (!tres || (tres[0] == '\0'))
		return true;

	/* Work on a private copy; strtok_r() modifies its input */
	tmp = xstrdup(tres);
	tok = strtok_r(tmp, ",", &save_ptr);
	while (tok) {
		bool valid_name = false;
		/* Split "name[:rest]" at the first colon */
		sep = strchr(tok, ':');
		if (sep) {
			sep[0] = '\0';
			sep++;
		}
		if (valid_tres_name(tok))
			valid_name = true;
		if (!sep) {	/* No model or count. Implicit count of 1 */
			if (!valid_name) {
				rc = false;
				break;
			}
		} else if ((colon = strchr(sep, ':'))) {
			/* Includes explicit "name:type:count" */
			sep = colon + 1;	/* Points to count */
			val = strtoll(sep, &end_ptr, 10);
			/* First only check numeric component for validity */
			if (((val < 0) ||
			     (val == LLONG_MAX)) ||
			    (!valid_name && (val != 0))) {
				rc = false;
				break;
			}

			/*
			 * Now check that any count suffix is valid.
			 */
			if (suffix_mult(end_ptr) == NO_VAL64) {
				rc = false;
				break;
			}
		} else {
			/*
			 * Includes "name:type" or "name:count"
			 * Since we don't know if there is a count,
			 * we can not do more now.
			 */
		}
		tok = strtok_r(NULL, ",", &save_ptr);
	}
	xfree(tmp);

	return rc;
}
8583
8584 /*
8585 * Validate the named TRES is valid for scheduling parameters.
8586 * Returns FALSE if the name is invalid or the GRES count is zero.
8587 */
valid_tres_name(char * name)8588 extern bool valid_tres_name(char *name)
8589 {
8590 if (!name || (name[0] == '\0'))
8591 return false;
8592 if (gres_get_system_cnt(name) != NO_VAL64)
8593 return true;
8594
8595 return false;
8596 }
8597
/*
 * Increment time limit of one job record for node configuration.
 */
static void _job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id)
{
	time_t delta_t, now = time(NULL);

	/* Seconds elapsed since the job's recorded start time */
	delta_t = difftime(now, job_ptr->start_time);
	/* Credit the configuration delay as suspend time, except for the job
	 * that triggered the node boot (boot_job_id) */
	if ((job_ptr->job_id != boot_job_id) && !IS_JOB_CONFIGURING(job_ptr))
		job_ptr->tot_sus_time = delta_t;

	if ((job_ptr->time_limit != INFINITE) &&
	    ((job_ptr->job_id == boot_job_id) || (delta_t != 0))) {
		if (delta_t && !IS_JOB_CONFIGURING(job_ptr)) {
			verbose("Extending %pJ time limit by %u secs for configuration",
				job_ptr, (uint32_t) delta_t);
		}
		/* Restart the time-limit clock from "now" */
		job_ptr->end_time = now + (job_ptr->time_limit * 60);
		job_ptr->end_time_exp = job_ptr->end_time;
	}
}
8619
/*
 * Increment time limit for all components of a hetjob for node configuration.
 * job_ptr IN - pointer to job record for which configuration is complete
 * boot_job_id - job ID of record with newly powered up node or 0
 */
static void _het_job_time_limit_incr(job_record_t *job_ptr,
				     uint32_t boot_job_id)
{
	job_record_t *leader, *component;
	ListIterator comp_iter;

	/* Not a hetjob component: just handle this one record */
	if (!job_ptr->het_job_id) {
		_job_time_limit_incr(job_ptr, boot_job_id);
		return;
	}

	/* Locate the leader; on any inconsistency fall back to updating
	 * only this component */
	leader = find_job_record(job_ptr->het_job_id);
	if (!leader) {
		error("%s: Hetjob leader %pJ not found",
		      __func__, job_ptr);
		_job_time_limit_incr(job_ptr, boot_job_id);
		return;
	}
	if (!leader->het_job_list) {
		error("%s: Hetjob leader %pJ job list is NULL",
		      __func__, job_ptr);
		_job_time_limit_incr(job_ptr, boot_job_id);
		return;
	}

	/* Extend the limit of every component of the hetjob */
	comp_iter = list_iterator_create(leader->het_job_list);
	while ((component = list_next(comp_iter)))
		_job_time_limit_incr(component, boot_job_id);
	list_iterator_destroy(comp_iter);
}
8656
8657 /* Clear job's CONFIGURING flag and advance end time as needed */
/* Clear job's CONFIGURING flag and advance end time as needed */
extern void job_config_fini(job_record_t *job_ptr)
{
	bool powered_up = IS_JOB_POWER_UP_NODE(job_ptr);
	time_t now = time(NULL);

	last_job_update = now;
	job_ptr->job_state &= ~JOB_CONFIGURING;

	if (powered_up) {
		/* Node power-up delayed the start; restart the clock */
		info("Resetting %pJ start time for node power up", job_ptr);
		job_ptr->job_state &= ~JOB_POWER_UP_NODE;
		job_ptr->start_time = now;
	}
	_het_job_time_limit_incr(job_ptr, powered_up ? job_ptr->job_id : 0);
	if (powered_up)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/*
	 * Request asynchronous launch of a prolog for a non-batch job.
	 * PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC.
	 */
	if (slurmctld_conf.prolog_flags & PROLOG_FLAG_ALLOC)
		launch_prolog(job_ptr);
}
8681
/*
 * Determine if the nodes are ready to run a job
 * RET true if ready
 */
extern bool test_job_nodes_ready(job_record_t *job_ptr)
{
	if (IS_JOB_PENDING(job_ptr))
		return false;
	if (!job_ptr->node_bitmap)	/* Revoked allocation */
		return true;
	/* Any allocated node still powering up => not ready */
	if (bit_overlap_any(job_ptr->node_bitmap, power_node_bitmap))
		return false;

	if (!job_ptr->batch_flag ||
	    job_ptr->batch_features ||
	    job_ptr->wait_all_nodes || job_ptr->burst_buffer) {
		/* Make sure all nodes ready to start job */
		if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0)
			return false;
	} else if (job_ptr->batch_flag) {
		/* Batch job without wait_all_nodes: only the launch node
		 * (batch_host) needs to be ready */
#ifdef HAVE_FRONT_END
		/* Make sure frontend node is ready to start batch job */
		front_end_record_t *front_end_ptr =
			find_front_end_record(job_ptr->batch_host);
		if (!front_end_ptr ||
		    IS_NODE_POWER_SAVE(front_end_ptr) ||
		    IS_NODE_POWER_UP(front_end_ptr)) {
			return false;
		}
#else
		/* Make sure first node is ready to start batch job */
		node_record_t *node_ptr =
			find_node_record(job_ptr->batch_host);
		if (!node_ptr ||
		    IS_NODE_POWER_SAVE(node_ptr) ||
		    IS_NODE_POWER_UP(node_ptr)) {
			return false;
		}
#endif
	}

	return true;
}
8726
8727 /*
8728 * For non-hetjob, return true if this job is configuring.
8729 * For hetjob, return true if any component of the job is configuring.
8730 */
_het_job_configuring_test(job_record_t * job_ptr)8731 static bool _het_job_configuring_test(job_record_t *job_ptr)
8732 {
8733 job_record_t *het_job_leader, *het_job;
8734 ListIterator iter;
8735 bool result = false;
8736
8737 if (IS_JOB_CONFIGURING(job_ptr))
8738 return true;
8739 if (!job_ptr->het_job_id)
8740 return false;
8741
8742 het_job_leader = find_job_record(job_ptr->het_job_id);
8743 if (!het_job_leader) {
8744 error("%s: Hetjob leader %pJ not found", __func__, job_ptr);
8745 return false;
8746 }
8747 if (!het_job_leader->het_job_list) {
8748 error("%s: Hetjob leader %pJ job list is NULL",
8749 __func__, job_ptr);
8750 return false;
8751 }
8752
8753 iter = list_iterator_create(het_job_leader->het_job_list);
8754 while ((het_job = list_next(iter))) {
8755 if (IS_JOB_CONFIGURING(het_job)) {
8756 result = true;
8757 break;
8758 }
8759 }
8760 list_iterator_destroy(iter);
8761
8762 return result;
8763 }
8764
8765 /*
8766 * job_time_limit - terminate jobs which have exceeded their time limit
8767 * global: job_list - pointer global job list
8768 * last_job_update - time of last job table update
8769 */
void job_time_limit(void)
{
	ListIterator job_iterator;
	job_record_t *job_ptr;
	time_t now = time(NULL);
	/* Jobs not heard from since "old" are considered inactive */
	time_t old = now - ((slurmctld_conf.inactive_limit * 4 / 3) +
			    slurmctld_conf.msg_timeout + 1);
	time_t over_run;
	uint16_t over_time_limit;
	uint8_t prolog;
	int job_test_count = 0;
	uint32_t resv_over_run = slurmctld_conf.resv_over_run;

	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));

	/* Convert ResvOverRun from minutes to seconds;
	 * INFINITE16 maps to one year */
	if (resv_over_run == INFINITE16)
		resv_over_run = YEAR_SECONDS;
	else
		resv_over_run *= 60;

	/*
	 * locks same as in _slurmctld_background() (The only current place this
	 * is called).
	 */
	slurmctld_lock_t job_write_lock = {
		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	DEF_TIMERS;

	job_iterator = list_iterator_create(job_list);
	START_TIMER;
	while ((job_ptr = list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);
		job_test_count++;

		if (job_ptr->details)
			prolog = job_ptr->details->prolog_running;
		else
			prolog = 0;
		/* Prolog finished and nodes ready: complete configuration */
		if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) &&
		    test_job_nodes_ready(job_ptr)) {
			info("%s: Configuration for %pJ complete",
			     __func__, job_ptr);
			job_config_fini(job_ptr);
			if (job_ptr->bit_flags & NODE_REBOOT) {
				job_ptr->bit_flags &= (~NODE_REBOOT);
				if (job_ptr->batch_flag)
					launch_job(job_ptr);
			}
		}

		/*
		 * Features have been changed on some node, make job eligible
		 * to run and test to see if it can run now
		 */
		if (node_features_updated &&
		    (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) &&
		    IS_JOB_PENDING(job_ptr) && (job_ptr->priority == 0)) {
			job_ptr->state_reason = WAIT_NO_REASON;
			set_job_prio(job_ptr);
			last_job_update = now;
		}

		/* Don't enforce time limits for configuring hetjobs */
		if (_het_job_configuring_test(job_ptr))
			continue;

		/*
		 * Only running jobs can be killed due to timeout. Do not kill
		 * suspended jobs due to timeout.
		 */
		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		/*
		 * everything above here is considered "quick", and skips the
		 * timeout at the bottom of the loop by using a continue.
		 * everything below is considered "slow", and needs to jump to
		 * time_check before the next job is tested
		 */
		if (job_ptr->preempt_time) {
			(void)slurm_job_preempt(job_ptr, NULL,
						slurm_job_preempt_mode(job_ptr),
						false);
			goto time_check;
		}

		/* InactiveLimit: interactive (non-batch) job whose srun has
		 * gone silent past the allowed window */
		if (slurmctld_conf.inactive_limit &&
		    (job_ptr->batch_flag == 0) &&
		    (job_ptr->time_last_active <= old) &&
		    (job_ptr->other_port) &&
		    (job_ptr->part_ptr) &&
		    (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) {
			/* job inactive, kill it */
			info("%s: inactivity time limit reached for %pJ",
			     __func__, job_ptr);
			_job_timed_out(job_ptr, false);
			job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
			xfree(job_ptr->state_desc);
			goto time_check;
		}
		if (job_ptr->time_limit != INFINITE) {
			send_job_warn_signal(job_ptr, false);
			/* Send "percentage of time limit used" mail
			 * notifications; each flag is cleared once sent */
			if ((job_ptr->mail_type & MAIL_JOB_TIME100) &&
			    (now >= job_ptr->end_time)) {
				job_ptr->mail_type &= (~MAIL_JOB_TIME100);
				mail_job_info(job_ptr, MAIL_JOB_TIME100);
			}
			if ((job_ptr->mail_type & MAIL_JOB_TIME90) &&
			    (now + (job_ptr->time_limit * 60 * 0.1) >=
			     job_ptr->end_time)) {
				job_ptr->mail_type &= (~MAIL_JOB_TIME90);
				mail_job_info(job_ptr, MAIL_JOB_TIME90);
			}
			if ((job_ptr->mail_type & MAIL_JOB_TIME80) &&
			    (now + (job_ptr->time_limit * 60 * 0.2) >=
			     job_ptr->end_time)) {
				job_ptr->mail_type &= (~MAIL_JOB_TIME80);
				mail_job_info(job_ptr, MAIL_JOB_TIME80);
			}
			if ((job_ptr->mail_type & MAIL_JOB_TIME50) &&
			    (now + (job_ptr->time_limit * 60 * 0.5) >=
			     job_ptr->end_time)) {
				job_ptr->mail_type &= (~MAIL_JOB_TIME50);
				mail_job_info(job_ptr, MAIL_JOB_TIME50);
			}

			/* OverTimeLimit grace: partition value overrides the
			 * cluster-wide value */
			if (job_ptr->part_ptr &&
			    (job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
				over_time_limit =
					job_ptr->part_ptr->over_time_limit;
			} else {
				over_time_limit =
					slurmctld_conf.over_time_limit;
			}
			if (over_time_limit == INFINITE16)
				over_run = now - YEAR_SECONDS;
			else
				over_run = now - (over_time_limit * 60);
			if (job_ptr->end_time <= over_run) {
				last_job_update = now;
				info("Time limit exhausted for %pJ", job_ptr);
				_job_timed_out(job_ptr, false);
				job_ptr->state_reason = FAIL_TIMEOUT;
				xfree(job_ptr->state_desc);
				goto time_check;
			}
		}

		/* Kill non-FLEX reservation jobs once the reservation (plus
		 * ResvOverRun grace) has ended */
		if (job_ptr->resv_ptr &&
		    !(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) &&
		    (job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)){
			last_job_update = now;
			info("Reservation ended for %pJ", job_ptr);
			_job_timed_out(job_ptr, false);
			job_ptr->state_reason = FAIL_TIMEOUT;
			xfree(job_ptr->state_desc);
			goto time_check;
		}

		/*
		 * check if any individual job steps have exceeded
		 * their time limit
		 */
		if (job_ptr->step_list &&
		    (list_count(job_ptr->step_list) > 0))
			check_job_step_time_limit(job_ptr, now);

		acct_policy_job_time_out(job_ptr);

		if (job_ptr->state_reason == FAIL_TIMEOUT) {
			last_job_update = now;
			_job_timed_out(job_ptr, false);
			xfree(job_ptr->state_desc);
			goto time_check;
		}

		/* Give srun command warning message about pending timeout */
		if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
			srun_timeout (job_ptr);

		/*
		 * _job_timed_out() and other calls can take a long time on
		 * some platforms. This loop is holding the job_write lock;
		 * if a lot of jobs need to be timed out within the same cycle
		 * this stalls other threads from running and causes
		 * communication issues within the cluster.
		 *
		 * This test happens last, as job_ptr may be pointing to a job
		 * that would be deleted by a separate thread when the job_write
		 * lock is released. However, list_next itself is thread safe,
		 * and can be used again once the locks are reacquired.
		 * list_peek_next is used in the unlikely event the timer has
		 * expired just as the end of the job_list is reached.
		 */
	time_check:
		/* Use a hard-coded 3 second timeout, with a 1 second sleep. */
		if (slurm_delta_tv(&tv1) >= 3000000 && list_peek_next(job_iterator) ) {
			END_TIMER;
			debug("%s: yielding locks after testing"
			      " %d jobs, %s",
			      __func__, job_test_count, TIME_STR);
			unlock_slurmctld(job_write_lock);
			usleep(1000000);
			lock_slurmctld(job_write_lock);
			START_TIMER;
			job_test_count = 0;
		}
	}
	list_iterator_destroy(job_iterator);
	node_features_updated = false;
}
8981
/* Rebuild job_ptr->tres_req_cnt and the corresponding request strings from
 * the job's current details/allocation. Caller must hold the job write lock;
 * pass assoc_mgr_locked=true if the assoc_mgr TRES read lock is already held. */
extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
{
	uint32_t cpu_cnt = 0, node_cnt = 0;
	uint64_t mem_cnt = 0;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));

	/* Rebuild from scratch: drop any previously computed values */
	xfree(job_ptr->tres_req_str);
	xfree(job_ptr->tres_fmt_req_str);
	xfree(job_ptr->tres_req_cnt);

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	job_ptr->tres_req_cnt = xcalloc(g_tres_count, sizeof(uint64_t));

	if (job_ptr->details) {
		node_cnt = job_ptr->details->min_nodes;
		cpu_cnt = job_ptr->details->min_cpus;
		if (job_ptr->details->pn_min_memory)
			mem_cnt = job_ptr->details->pn_min_memory;
	}

	/* if this is set just override */
	if (job_ptr->total_cpus)
		cpu_cnt = job_ptr->total_cpus;

	if (job_ptr->node_cnt)
		node_cnt = job_ptr->node_cnt;

	job_ptr->tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)node_cnt;
	job_ptr->tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)cpu_cnt;
	job_ptr->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
		job_ptr->job_resrcs,
		mem_cnt, cpu_cnt,
		node_cnt);

	/* Fold license, GRES and burst-buffer requests into the array */
	license_set_job_tres_cnt(job_ptr->license_list,
				 job_ptr->tres_req_cnt,
				 true);

	/* FIXME: this assumes that all nodes have equal TRES */
	gres_set_job_tres_cnt(job_ptr->gres_list,
			      node_cnt,
			      job_ptr->tres_req_cnt,
			      true);

	bb_g_job_set_tres_cnt(job_ptr,
			      job_ptr->tres_req_cnt,
			      true);

	/*
	 * Do this last as it calculates off of everything else.
	 * Don't use calc_job_billable_tres() as it relies on allocated tres
	 * If the partition was destroyed the part_ptr will be NULL. As this
	 * could be run on already finished jobs running in the assoc mgr
	 * cache.
	 */
	if (job_ptr->part_ptr)
		job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
			assoc_mgr_tres_weighted(
				job_ptr->tres_req_cnt,
				job_ptr->part_ptr->billing_weights,
				slurmctld_conf.priority_flags, true);

	/* now that the array is filled lets make the string from it */
	set_job_tres_req_str(job_ptr, true);

	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);
}
9054
/* Rebuild job_ptr->tres_alloc_cnt and the corresponding allocation strings.
 * No-op for pending (and not completing) jobs. Pass assoc_mgr_locked=true if
 * the assoc_mgr TRES read lock is already held by the caller. */
extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
{
	uint32_t alloc_nodes = 0;
	uint64_t pn_min_memory = 0;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	/* Rebuild from scratch: drop any previously computed values */
	xfree(job_ptr->tres_alloc_str);
	xfree(job_ptr->tres_alloc_cnt);
	xfree(job_ptr->tres_fmt_alloc_str);

	/*
	 * We only need to do this on non-pending jobs.
	 * Requeued jobs are marked as PENDING|COMPLETING until the epilog is
	 * finished so we still need the alloc tres until then.
	 */
	if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
		return;

	if (!assoc_mgr_locked)
		assoc_mgr_lock(&locks);

	job_ptr->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));

	job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU] = (uint64_t)job_ptr->total_cpus;

	alloc_nodes = job_ptr->node_cnt;
	job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE] = (uint64_t)alloc_nodes;

	/*
	 * Guard against NULL details, matching the check job_set_req_tres()
	 * makes; previously job_ptr->details was dereferenced unconditionally.
	 */
	if (job_ptr->details)
		pn_min_memory = job_ptr->details->pn_min_memory;
	job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] =
		job_get_tres_mem(
			job_ptr->job_resrcs,
			pn_min_memory,
			job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU],
			job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE]);

	job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = NO_VAL64;

	/* Fold license, GRES and burst-buffer allocations into the array */
	license_set_job_tres_cnt(job_ptr->license_list,
				 job_ptr->tres_alloc_cnt,
				 true);

	gres_set_job_tres_cnt(job_ptr->gres_list,
			      alloc_nodes,
			      job_ptr->tres_alloc_cnt,
			      true);

	bb_g_job_set_tres_cnt(job_ptr,
			      job_ptr->tres_alloc_cnt,
			      true);

	/* Do this last as it calculates off of everything else. */
	job_ptr->tres_alloc_cnt[TRES_ARRAY_BILLING] =
		calc_job_billable_tres(job_ptr, job_ptr->start_time, true);

	/* now that the array is filled lets make the string from it */
	set_job_tres_alloc_str(job_ptr, true);

	if (!assoc_mgr_locked)
		assoc_mgr_unlock(&locks);

	return;
}
9115
9116 /*
9117 * job_update_tres_cnt - when job is completing remove allocated tres
9118 * from count.
9119 * IN/OUT job_ptr - job structure to be updated
9120 * IN node_inx - node bit that is finished with job.
 * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow
9122 */
extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx)
{
	int rc = SLURM_SUCCESS;
	int cpus_on_node, cpu_inx = -1;

	xassert(job_ptr);

	if (job_ptr->details->whole_node == 1) {
		/*
		 * Whole-node allocation: use the node's configured CPU
		 * count instead of job_resrcs, which may be smaller when
		 * only one thread per core was used.
		 */
		node_record_t *node_ptr =
			node_record_table_ptr + node_inx;
		cpus_on_node = node_ptr->config_ptr->cpus;
	} else {
		cpu_inx = job_resources_node_inx_to_cpu_inx(
			job_ptr->job_resrcs, node_inx);
		if (cpu_inx < 0) {
			error("%s: problem getting offset of %pJ",
			      __func__, job_ptr);
			job_ptr->cpu_cnt = 0;
			return SLURM_ERROR;
		}
		cpus_on_node = job_ptr->job_resrcs->cpus[cpu_inx];
	}

	/* Subtract this node's CPUs, guarding against underflow */
	if (cpus_on_node > job_ptr->cpu_cnt) {
		error("%s: cpu_cnt underflow (%d > %u) on %pJ", __func__,
		      cpus_on_node, job_ptr->cpu_cnt, job_ptr);
		job_ptr->cpu_cnt = 0;
		rc = SLURM_ERROR;
	} else
		job_ptr->cpu_cnt -= cpus_on_node;

	if (IS_JOB_RESIZING(job_ptr)) {
		if (cpus_on_node > job_ptr->total_cpus) {
			error("%s: total_cpus underflow on %pJ",
			      __func__, job_ptr);
			job_ptr->total_cpus = 0;
			rc = SLURM_ERROR;
		} else
			job_ptr->total_cpus -= cpus_on_node;

		/* Resized job: rebuild its allocation TRES */
		job_set_alloc_tres(job_ptr, false);
	}
	return rc;
}
9170
9171 /* Terminate a job that has exhausted its time limit */
static void _job_timed_out(job_record_t *job_ptr, bool preempted)
{
	time_t now;

	xassert(job_ptr);

	srun_timeout(job_ptr);

	/* Without job details we can only send a kill signal */
	if (!job_ptr->details) {
		job_signal(job_ptr, SIGKILL, 0, 0, false);
		return;
	}

	now = time(NULL);
	job_ptr->end_time = now;
	job_ptr->time_last_active = now;
	if (!job_ptr->preempt_time)
		job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
	build_cg_bitmap(job_ptr);
	job_completion_logger(job_ptr, false);
	deallocate_nodes(job_ptr, !preempted, false, preempted);
}
9190
9191 /* _validate_job_desc - validate that a job descriptor for job submit or
9192 * allocate has valid data, set values to defaults as required
9193 * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
9194 * IN allocate - if clear job to be queued, if set allocate for user now
 * IN submit_uid - uid from whom the request originated
9196 */
static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
			      uid_t submit_uid, part_record_t *part_ptr,
			      List part_list)
{
	/* The request must size the job somehow: cpus, nodes or node list */
	if ((job_desc_msg->min_cpus == NO_VAL) &&
	    (job_desc_msg->min_nodes == NO_VAL) &&
	    (job_desc_msg->req_nodes == NULL)) {
		info("%s: job specified no min_cpus, min_nodes or req_nodes",
		     __func__);
		return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
	}
	/* Queued (batch) submissions require a script */
	if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
	    (job_desc_msg->script == NULL)) {
		info("%s: job failed to specify Script", __func__);
		return ESLURM_JOB_SCRIPT_MISSING;
	}
	/* X11 forwarding is only meaningful for interactive allocations */
	if (job_desc_msg->script && job_desc_msg->x11) {
		info("%s: batch job cannot use X11 forwarding", __func__);
		return ESLURM_X11_NOT_AVAIL;
	}
	if (job_desc_msg->user_id == NO_VAL) {
		info("%s: job failed to specify User", __func__);
		return ESLURM_USER_ID_MISSING;
	}
	if ( job_desc_msg->group_id == NO_VAL ) {
		debug("%s: job failed to specify group", __func__);
		return ESLURM_GROUP_ID_MISSING;
	}

	/* Fill in defaults for options the submitter left unset */
	if (job_desc_msg->contiguous == NO_VAL16)
		job_desc_msg->contiguous = 0;

	if (job_desc_msg->task_dist == NO_VAL) {
		/* not typically set by salloc or sbatch */
		job_desc_msg->task_dist = SLURM_DIST_CYCLIC;
	}
	if (job_desc_msg->plane_size == NO_VAL16)
		job_desc_msg->plane_size = 0;

	if (job_desc_msg->kill_on_node_fail == NO_VAL16)
		job_desc_msg->kill_on_node_fail = 1;

	/*
	 * An explicit JobId may only be requested by root, SlurmUser, or
	 * a federated submission; it must be non-zero and not in use.
	 */
	if (job_desc_msg->job_id != NO_VAL) {
		job_record_t *dup_job_ptr;
		if (!fed_mgr_fed_rec &&
		    (submit_uid != 0) &&
		    (submit_uid != slurmctld_conf.slurm_user_id)) {
			info("attempt by uid %u to set JobId=%u",
			     submit_uid, job_desc_msg->job_id);
			return ESLURM_INVALID_JOB_ID;
		}
		if (job_desc_msg->job_id == 0) {
			info("attempt by uid %u to set JobId=0",
			     submit_uid);
			return ESLURM_INVALID_JOB_ID;
		}
		dup_job_ptr = find_job_record(job_desc_msg->job_id);
		if (dup_job_ptr) {
			info("attempt to re-use active %pJ", dup_job_ptr);
			return ESLURM_DUPLICATE_JOB_ID;
		}
	}

	if (job_desc_msg->nice == NO_VAL)
		job_desc_msg->nice = NICE_OFFSET;

	if (job_desc_msg->pn_min_memory == NO_VAL64) {
		/* Default memory limit is DefMemPerCPU (if set) or no limit */
		if (part_ptr && part_ptr->def_mem_per_cpu) {
			job_desc_msg->pn_min_memory =
				part_ptr->def_mem_per_cpu;
		} else {
			job_desc_msg->pn_min_memory =
				slurmctld_conf.def_mem_per_cpu;
		}
	} else if (!_validate_min_mem_partition(job_desc_msg, part_ptr,
						part_list)) {
		return ESLURM_INVALID_TASK_MEMORY;
	} else {
		/* Memory limit explicitly set by user */
		job_desc_msg->bitflags |= JOB_MEM_SET;
	}

	if (job_desc_msg->pn_min_memory == MEM_PER_CPU) {
		/* Map --mem-per-cpu=0 to --mem=0 for simpler logic */
		job_desc_msg->pn_min_memory = 0;
	}

	/* Validate a job's accounting frequency, if specified */
	if (acct_gather_check_acct_freq_task(
		    job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq))
		return ESLURMD_INVALID_ACCT_FREQ;

	if (job_desc_msg->min_nodes == NO_VAL)
		job_desc_msg->min_nodes = 1;	/* default node count of 1 */
	if (job_desc_msg->min_cpus == NO_VAL)
		job_desc_msg->min_cpus = job_desc_msg->min_nodes;

	if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
	    (job_desc_msg->pn_min_cpus == 0))
		job_desc_msg->pn_min_cpus = 1;	/* default 1 cpu per node */
	if (job_desc_msg->pn_min_tmp_disk == NO_VAL)
		job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */

	return SLURM_SUCCESS;
}
9302
9303 /*
9304 * Traverse the list of partitions and invoke the
9305 * function validating the job memory specification.
9306 */
static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
					part_record_t *part_ptr, List part_list)
{
	ListIterator iter;
	part_record_t *part;
	/* saved copies: _valid_pn_min_mem() may rewrite these fields */
	uint64_t tmp_pn_min_memory;
	uint16_t tmp_cpus_per_task;
	uint32_t tmp_min_cpus;
	uint32_t tmp_max_cpus;
	uint32_t tmp_pn_min_cpus;
	bool cc = false;

	/* no reason to check them here as we aren't enforcing them */
	if (!slurmctld_conf.enforce_part_limits)
		return true;

	tmp_pn_min_memory = job_desc_msg->pn_min_memory;
	tmp_cpus_per_task = job_desc_msg->cpus_per_task;
	tmp_min_cpus = job_desc_msg->min_cpus;
	tmp_max_cpus = job_desc_msg->max_cpus;
	tmp_pn_min_cpus = job_desc_msg->pn_min_cpus;

	if (part_list == NULL) {
		/* single-partition request */
		cc = _valid_pn_min_mem(job_desc_msg, part_ptr);
	} else {
		iter = list_iterator_create(part_list);
		while ((part = list_next(iter))) {
			cc = _valid_pn_min_mem(job_desc_msg, part);

			/* for ALL we have to test them all */
			if (slurmctld_conf.enforce_part_limits ==
			    PARTITION_ENFORCE_ALL) {
				/* ALL: first failing partition decides */
				if (!cc)
					break;
			} else if (cc) /* break, we found one! */
				break;
			else if (slurmctld_conf.enforce_part_limits ==
				 PARTITION_ENFORCE_ANY) {
				debug("%s: Job requested for (%"PRIu64")MB is invalid"
				      " for partition %s",
				      __func__, job_desc_msg->pn_min_memory,
				      part->name);
			}

			/* undo per-partition rewrites before the next test */
			job_desc_msg->pn_min_memory = tmp_pn_min_memory;
			job_desc_msg->cpus_per_task = tmp_cpus_per_task;
			job_desc_msg->min_cpus = tmp_min_cpus;
			job_desc_msg->max_cpus = tmp_max_cpus;
			job_desc_msg->pn_min_cpus = tmp_pn_min_cpus;
		}
		list_iterator_destroy(iter);
	}

	/*
	 * Restoring original values, if it is necessary,
	 * these will be modified in job_limits_check()
	 */
	job_desc_msg->pn_min_memory = tmp_pn_min_memory;
	job_desc_msg->cpus_per_task = tmp_cpus_per_task;
	job_desc_msg->min_cpus = tmp_min_cpus;
	job_desc_msg->max_cpus = tmp_max_cpus;
	job_desc_msg->pn_min_cpus = tmp_pn_min_cpus;

	return cc;
}
9372
/* Release a job's array_recs structure and everything it owns (no-op
 * when job_ptr is NULL or has no array_recs). */
extern void free_null_array_recs(job_record_t *job_ptr)
{
	if (!job_ptr)
		return;
	if (!job_ptr->array_recs)
		return;

	FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
	xfree(job_ptr->array_recs->task_id_str);
	xfree(job_ptr->array_recs);
}
9382
/* Unlink a job record from the federation info and all hash tables. */
static void _delete_job_common(job_record_t *job_ptr)
{
	bool is_array_task = (job_ptr->array_task_id != NO_VAL);

	/* Drop the federation's record of this job */
	fed_mgr_remove_fed_job_info(job_ptr->job_id);

	/* Unlink from the job-id hash table */
	_remove_job_hash(job_ptr, JOB_HASH_JOB);

	/* Unlink from both job-array hash tables when applicable */
	if (is_array_task) {
		_remove_job_hash(job_ptr, JOB_HASH_ARRAY_JOB);
		_remove_job_hash(job_ptr, JOB_HASH_ARRAY_TASK);
	}
}
9397
9398 /*
9399 * _list_delete_job - delete a job record and its corresponding job_details,
9400 * see common/list.h for documentation
9401 * IN job_entry - pointer to job_record to delete
9402 */
_list_delete_job(void * job_entry)9403 static void _list_delete_job(void *job_entry)
9404 {
9405 job_record_t *job_ptr = (job_record_t *) job_entry;
9406 int job_array_size, i;
9407
9408 xassert(job_entry);
9409 xassert (job_ptr->magic == JOB_MAGIC);
9410 job_ptr->magic = 0; /* make sure we don't delete record twice */
9411
9412 _delete_job_common(job_ptr);
9413
9414 if (job_ptr->array_recs) {
9415 job_array_size = MAX(1, job_ptr->array_recs->task_cnt);
9416 } else {
9417 job_array_size = 1;
9418 }
9419
9420 _delete_job_details(job_ptr);
9421 xfree(job_ptr->account);
9422 xfree(job_ptr->admin_comment);
9423 xfree(job_ptr->alias_list);
9424 xfree(job_ptr->alloc_node);
9425 free_null_array_recs(job_ptr);
9426 if (job_ptr->array_recs) {
9427 FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
9428 xfree(job_ptr->array_recs->task_id_str);
9429 xfree(job_ptr->array_recs);
9430 }
9431 xfree(job_ptr->batch_features);
9432 xfree(job_ptr->batch_host);
9433 xfree(job_ptr->burst_buffer);
9434 xfree(job_ptr->comment);
9435 xfree(job_ptr->clusters);
9436 xfree(job_ptr->cpus_per_tres);
9437 free_job_fed_details(&job_ptr->fed_details);
9438 free_job_resources(&job_ptr->job_resrcs);
9439 xfree(job_ptr->gres_alloc);
9440 _clear_job_gres_details(job_ptr);
9441 xfree(job_ptr->gres_req);
9442 xfree(job_ptr->gres_used);
9443 FREE_NULL_LIST(job_ptr->gres_list);
9444 xfree(job_ptr->licenses);
9445 FREE_NULL_LIST(job_ptr->license_list);
9446 xfree(job_ptr->limit_set.tres);
9447 xfree(job_ptr->mail_user);
9448 xfree(job_ptr->mcs_label);
9449 xfree(job_ptr->mem_per_tres);
9450 xfree(job_ptr->name);
9451 xfree(job_ptr->network);
9452 xfree(job_ptr->node_addr);
9453 FREE_NULL_BITMAP(job_ptr->node_bitmap);
9454 FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
9455 xfree(job_ptr->nodes);
9456 xfree(job_ptr->nodes_completing);
9457 xfree(job_ptr->origin_cluster);
9458 if (job_ptr->het_details && job_ptr->het_job_id) {
9459 /* xfree struct if hetjob leader and NULL ptr otherwise. */
9460 if (job_ptr->het_job_offset == 0)
9461 xfree(job_ptr->het_details);
9462 else
9463 job_ptr->het_details = NULL;
9464 }
9465 xfree(job_ptr->het_job_id_set);
9466 FREE_NULL_LIST(job_ptr->het_job_list);
9467 xfree(job_ptr->partition);
9468 FREE_NULL_LIST(job_ptr->part_ptr_list);
9469 xfree(job_ptr->priority_array);
9470 slurm_destroy_priority_factors_object(job_ptr->prio_factors);
9471 xfree(job_ptr->resp_host);
9472 xfree(job_ptr->resv_name);
9473 xfree(job_ptr->sched_nodes);
9474 for (i = 0; i < job_ptr->spank_job_env_size; i++)
9475 xfree(job_ptr->spank_job_env[i]);
9476 xfree(job_ptr->spank_job_env);
9477 xfree(job_ptr->state_desc);
9478 xfree(job_ptr->system_comment);
9479 xfree(job_ptr->tres_alloc_cnt);
9480 xfree(job_ptr->tres_alloc_str);
9481 xfree(job_ptr->tres_bind);
9482 xfree(job_ptr->tres_freq);
9483 xfree(job_ptr->tres_fmt_alloc_str);
9484 xfree(job_ptr->tres_per_job);
9485 xfree(job_ptr->tres_per_node);
9486 xfree(job_ptr->tres_per_socket);
9487 xfree(job_ptr->tres_per_task);
9488 xfree(job_ptr->tres_req_cnt);
9489 xfree(job_ptr->tres_req_str);
9490 xfree(job_ptr->tres_fmt_req_str);
9491 step_list_purge(job_ptr);
9492 select_g_select_jobinfo_free(job_ptr->select_jobinfo);
9493 xfree(job_ptr->user_name);
9494 xfree(job_ptr->wckey);
9495 if (job_array_size > job_count) {
9496 error("job_count underflow");
9497 job_count = 0;
9498 } else {
9499 job_count -= job_array_size;
9500 }
9501 job_ptr->job_id = 0;
9502 xfree(job_ptr);
9503 }
9504
9505
9506 /*
9507 * find specific job_id entry in the job list, key is job_id_ptr
9508 */
_list_find_job_id(void * job_entry,void * key)9509 static int _list_find_job_id(void *job_entry, void *key)
9510 {
9511 job_record_t *job_ptr = (job_record_t *) job_entry;
9512 uint32_t *job_id_ptr = (uint32_t *) key;
9513
9514 if (job_ptr->job_id == *job_id_ptr)
9515 return 1;
9516
9517 return 0;
9518 }
9519
9520 /*
9521 * _list_find_job_old - find old entries in the job list,
9522 * see common/list.h for documentation, key is ignored
9523 * job_entry IN - job pointer
9524 * key IN - if not NULL, then skip hetjobs
9525 */
static int _list_find_job_old(void *job_entry, void *key)
{
	time_t kill_age, min_age, now = time(NULL);
	job_record_t *job_ptr = (job_record_t *) job_entry;
	uint16_t cleaning = 0;

	/* A revoked record with no valid job_id can be purged at once */
	if ((job_ptr->job_id == NO_VAL) && IS_JOB_REVOKED(job_ptr))
		return 1;

	/* key != NULL means keep (skip) hetjob records */
	if (key && job_ptr->het_job_id)
		return 0;

	if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) {
		/* Re-issue the kill if node cleanup appears stalled */
		kill_age = now - (slurmctld_conf.kill_wait +
				  2 * slurm_get_msg_timeout());
		if (job_ptr->time_last_active < kill_age) {
			job_ptr->time_last_active = now;
			re_kill_job(job_ptr);
		}
		return 0;	/* Job still completing */
	}

	if (job_ptr->epilog_running)
		return 0;	/* EpilogSlurmctld still running */

	if (slurmctld_conf.min_job_age == 0)
		return 0;	/* No job record purging */

	if (fed_mgr_fed_rec && job_ptr->fed_details &&
	    !fed_mgr_is_origin_job(job_ptr)) {
		uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
		slurmdb_cluster_rec_t *origin =
			fed_mgr_get_cluster_by_id(origin_id);

		/* keep job around until origin comes back and is synced */
		if (origin &&
		    (!origin->fed.send ||
		     (((slurm_persist_conn_t *)origin->fed.send)->fd == -1) ||
		     !origin->fed.sync_sent))
			return 0;
	}

	min_age = now - slurmctld_conf.min_job_age;
	if (job_ptr->end_time > min_age)
		return 0;	/* Too new to purge */

	if (!(IS_JOB_COMPLETED(job_ptr)))
		return 0;	/* Job still active */

	if (job_ptr->step_list && list_count(job_ptr->step_list)) {
		debug("%pJ still has %d active steps",
		      job_ptr, list_count(job_ptr->step_list));
		/*
		 * If the job has been around more than 30 days the steps are
		 * bogus. Blow the job away. This was witnessed <= 16.05 but
		 * hasn't been seen since. This is here just to clear them
		 * out if this ever shows up again.
		 */
		min_age = now - PURGE_OLD_JOB_IN_SEC;
		if (job_ptr->end_time <= min_age) {
			info("Force purge of %pJ. It ended over 30 days ago, the slurmctld thinks there are still steps running but they are most likely bogus. In any case you might want to check nodes %s to make sure nothing remains of the job.",
			     job_ptr, job_ptr->nodes);
			goto end_it;
		} else
			return 0;	/* steps are still active */
	}

	if (job_ptr->array_recs) {
		if (job_ptr->array_recs->tot_run_tasks ||
		    !_test_job_array_purged(job_ptr->array_job_id)) {
			/* Some tasks from this job array still active */
			return 0;
		}
	}

	/* Ask the select plugin whether node cleanup is still pending */
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_CLEANING,
				    &cleaning);
	if (cleaning)
		return 0;	/* Job hasn't finished yet */

	if (bb_g_job_test_stage_out(job_ptr) != 1)
		return 0;	/* Stage out in progress */

	/* If we don't have a db_index by now and we are running with
	 * the slurmdbd, lets put it on the list to be handled later
	 * when slurmdbd comes back up since we won't get another chance.
	 * job_start won't pend for job_db_inx when the job is finished.
	 */
end_it:
	if (with_slurmdbd && !job_ptr->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	return 1;	/* Purge the job */
}
9621
9622 /* Determine if ALL partitions associated with a job are hidden */
_all_parts_hidden(job_record_t * job_ptr,uid_t uid)9623 static bool _all_parts_hidden(job_record_t *job_ptr, uid_t uid)
9624 {
9625 bool rc;
9626 ListIterator part_iterator;
9627 part_record_t *part_ptr;
9628
9629 if (job_ptr->part_ptr_list) {
9630 rc = true;
9631 part_iterator = list_iterator_create(job_ptr->part_ptr_list);
9632 while ((part_ptr = list_next(part_iterator))) {
9633 if (part_is_visible(part_ptr, uid)) {
9634 rc = false;
9635 break;
9636 }
9637 }
9638 list_iterator_destroy(part_iterator);
9639 return rc;
9640 }
9641
9642 if (job_ptr->part_ptr && part_is_visible(job_ptr->part_ptr, uid))
9643 return false;
9644 return true;
9645 }
9646
9647 /* Determine if a given job should be seen by a specific user */
_hide_job(job_record_t * job_ptr,uid_t uid,uint16_t show_flags)9648 static bool _hide_job(job_record_t *job_ptr, uid_t uid, uint16_t show_flags)
9649 {
9650 if (!(show_flags & SHOW_ALL) && IS_JOB_REVOKED(job_ptr))
9651 return true;
9652
9653 if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
9654 (job_ptr->user_id != uid) && !validate_operator(uid) &&
9655 (((slurm_mcs_get_privatedata() == 0) &&
9656 !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
9657 job_ptr->account)) ||
9658 ((slurm_mcs_get_privatedata() == 1) &&
9659 (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0))))
9660 return true;
9661 return false;
9662 }
9663
/* Pack one job into pack_info->buffer if it passes the caller's
 * uid filter and visibility checks; bumps *jobs_packed on success. */
static void _pack_job(job_record_t *job_ptr,
		      _foreach_pack_job_info_t *pack_info)
{
	xassert (job_ptr->magic == JOB_MAGIC);

	/* honor the optional per-user filter */
	if ((pack_info->filter_uid != NO_VAL) &&
	    (pack_info->filter_uid != job_ptr->user_id))
		return;

	/* skip jobs whose partitions are all hidden from this user */
	if (!(pack_info->show_flags & SHOW_ALL) &&
	    (pack_info->uid != 0) &&
	    _all_parts_hidden(job_ptr, pack_info->uid))
		return;

	if (_hide_job(job_ptr, pack_info->uid, pack_info->show_flags))
		return;

	pack_job(job_ptr, pack_info->show_flags, pack_info->buffer,
		 pack_info->protocol_version, pack_info->uid);
	(*pack_info->jobs_packed)++;
}
9686
_foreach_pack_jobid(void * object,void * arg)9687 static int _foreach_pack_jobid(void *object, void *arg)
9688 {
9689 job_record_t *job_ptr;
9690 uint32_t job_id = *(uint32_t *)object;
9691 _foreach_pack_job_info_t *info = (_foreach_pack_job_info_t *)arg;
9692
9693 if (!(job_ptr = find_job_record(job_id)))
9694 return SLURM_SUCCESS;
9695
9696 _pack_job(job_ptr, info);
9697
9698 return SLURM_SUCCESS;
9699 }
9700
9701 /*
9702 * pack_all_jobs - dump all job information for all jobs in
9703 * machine independent form (for network transmission)
9704 * OUT buffer_ptr - the pointer is set to the allocated buffer.
9705 * OUT buffer_size - set to size of the buffer in bytes
9706 * IN show_flags - job filtering options
9707 * IN uid - uid of user making request (for partition filtering)
9708 * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
9709 * global: job_list - global list of job records
9710 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9711 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9712 * whenever the data format changes
9713 */
extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
			  uint16_t show_flags, uid_t uid, uint32_t filter_uid,
			  uint16_t protocol_version)
{
	uint32_t jobs_packed = 0, tmp_offset;
	Buf buffer;
	ListIterator itr;
	job_record_t *job_ptr;
	_foreach_pack_job_info_t pack_info;

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	buffer = init_buf(BUF_SIZE);

	/* header: placeholder record count (rewritten below) + timestamp */
	pack32(jobs_packed, buffer);
	pack_time(time(NULL), buffer);

	pack_info = (_foreach_pack_job_info_t) {
		.buffer = buffer,
		.filter_uid = filter_uid,
		.jobs_packed = &jobs_packed,
		.protocol_version = protocol_version,
		.show_flags = show_flags,
		.uid = uid,
	};

	/* pack every job this user is allowed to see */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr)))
		_pack_job(job_ptr, &pack_info);
	list_iterator_destroy(itr);

	/* rewind and store the real record count */
	tmp_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, 0);
	pack32(jobs_packed, buffer);
	set_buf_offset(buffer, tmp_offset);

	*buffer_size = get_buf_offset(buffer);
	buffer_ptr[0] = xfer_buf_data(buffer);
}
9757
9758 /*
9759 * pack_spec_jobs - dump job information for specified jobs in
9760 * machine independent form (for network transmission)
9761 * OUT buffer_ptr - the pointer is set to the allocated buffer.
9762 * OUT buffer_size - set to size of the buffer in bytes
9763 * IN show_flags - job filtering options
9764 * IN job_ids - list of job_ids to pack
9765 * IN uid - uid of user making request (for partition filtering)
9766 * IN filter_uid - pack only jobs belonging to this user if not NO_VAL
9767 * global: job_list - global list of job records
9768 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9769 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9770 * whenever the data format changes
9771 */
extern void pack_spec_jobs(char **buffer_ptr, int *buffer_size, List job_ids,
			   uint16_t show_flags, uid_t uid, uint32_t filter_uid,
			   uint16_t protocol_version)
{
	uint32_t jobs_packed = 0, tmp_offset;
	Buf buffer;
	_foreach_pack_job_info_t pack_info;

	xassert(job_ids);

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	buffer = init_buf(BUF_SIZE);

	/* header: placeholder record count (rewritten below) + timestamp */
	pack32(jobs_packed, buffer);
	pack_time(time(NULL), buffer);

	pack_info = (_foreach_pack_job_info_t) {
		.buffer = buffer,
		.filter_uid = filter_uid,
		.jobs_packed = &jobs_packed,
		.protocol_version = protocol_version,
		.show_flags = show_flags,
		.uid = uid,
	};

	/* pack each requested job the caller is allowed to see */
	list_for_each(job_ids, _foreach_pack_jobid, &pack_info);

	/* rewind and store the real record count */
	tmp_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, 0);
	pack32(jobs_packed, buffer);
	set_buf_offset(buffer, tmp_offset);

	*buffer_size = get_buf_offset(buffer);
	buffer_ptr[0] = xfer_buf_data(buffer);
}
9811
/* Pack every component of a heterogeneous job; returns the number of
 * components actually packed. */
static int _pack_het_job(job_record_t *job_ptr, uint16_t show_flags,
			 Buf buffer, uint16_t protocol_version, uid_t uid)
{
	int packed = 0;
	job_record_t *component;
	ListIterator iter = list_iterator_create(job_ptr->het_job_list);

	while ((component = list_next(iter))) {
		if (component->het_job_id != job_ptr->het_job_id) {
			/* list entry belongs to a different hetjob */
			error("%s: Bad het_job_list for %pJ",
			      __func__, job_ptr);
			continue;
		}
		pack_job(component, show_flags, buffer, protocol_version,
			 uid);
		packed++;
	}
	list_iterator_destroy(iter);

	return packed;
}
9834
9835 /*
9836 * pack_one_job - dump information for one jobs in
9837 * machine independent form (for network transmission)
9838 * OUT buffer_ptr - the pointer is set to the allocated buffer.
9839 * OUT buffer_size - set to size of the buffer in bytes
9840 * IN job_id - ID of job that we want info for
9841 * IN show_flags - job filtering options
9842 * IN uid - uid of user making request (for partition filtering)
9843 * NOTE: the buffer at *buffer_ptr must be xfreed by the caller
9844 * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
9845 * whenever the data format changes
9846 */
extern int pack_one_job(char **buffer_ptr, int *buffer_size,
			uint32_t job_id, uint16_t show_flags, uid_t uid,
			uint16_t protocol_version)
{
	job_record_t *job_ptr;
	uint32_t jobs_packed = 0, tmp_offset;
	Buf buffer;

	buffer_ptr[0] = NULL;
	*buffer_size = 0;

	buffer = init_buf(BUF_SIZE);

	/* write message body header : size and time */
	/* put in a place holder job record count of 0 for now */
	pack32(jobs_packed, buffer);
	pack_time(time(NULL), buffer);

	job_ptr = find_job_record(job_id);
	if (job_ptr && job_ptr->het_job_list) {
		/* Pack heterogeneous job components */
		if (!_hide_job(job_ptr, uid, show_flags)) {
			jobs_packed = _pack_het_job(job_ptr, show_flags,
						    buffer, protocol_version,
						    uid);
		}
	} else if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
		   !job_ptr->array_recs) {
		/* Pack regular (not array) job */
		if (!_hide_job(job_ptr, uid, show_flags)) {
			pack_job(job_ptr, show_flags, buffer, protocol_version,
				 uid);
			jobs_packed++;
		}
	} else {
		bool packed_head = false;

		/* Either the job is not found or it is a job array */
		if (job_ptr) {
			packed_head = true;
			if (!_hide_job(job_ptr, uid, show_flags)) {
				pack_job(job_ptr, show_flags, buffer,
					 protocol_version, uid);
				jobs_packed++;
			}
		}

		/*
		 * Walk the array-job hash chain to pick up every task
		 * sharing this array_job_id.
		 * NOTE(review): a hidden task stops the whole scan
		 * (break, not continue) — confirm this is intended.
		 */
		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
		while (job_ptr) {
			if ((job_ptr->job_id == job_id) && packed_head) {
				;	/* Already packed */
			} else if (job_ptr->array_job_id == job_id) {
				if (_hide_job(job_ptr, uid, show_flags))
					break;
				pack_job(job_ptr, show_flags, buffer,
					 protocol_version, uid);
				jobs_packed++;
			}
			job_ptr = job_ptr->job_array_next_j;
		}
	}

	/* Nothing visible was packed: report an invalid job id */
	if (jobs_packed == 0) {
		free_buf(buffer);
		return ESLURM_INVALID_JOB_ID;
	}

	/* put the real record count in the message body header */
	tmp_offset = get_buf_offset(buffer);
	set_buf_offset(buffer, 0);
	pack32(jobs_packed, buffer);
	set_buf_offset(buffer, tmp_offset);

	*buffer_size = get_buf_offset(buffer);
	buffer_ptr[0] = xfer_buf_data(buffer);

	return SLURM_SUCCESS;
}
9925
/* Pack a job's GRES detail strings; packs an empty array unless the
 * job is currently running with a GRES allocation. */
static void _pack_job_gres(job_record_t *dump_job_ptr, Buf buffer,
			   uint16_t protocol_version)
{
	bool started = IS_JOB_STARTED(dump_job_ptr);
	bool finished = IS_JOB_FINISHED(dump_job_ptr);

	if (!started || finished || !dump_job_ptr->gres_list) {
		packstr_array(NULL, 0, buffer);
		return;
	}

	packstr_array(dump_job_ptr->gres_detail_str,
		      dump_job_ptr->gres_detail_cnt, buffer);
}
9938
9939 /*
9940 * pack_job - dump all configuration information about a specific job in
9941 * machine independent form (for network transmission)
9942 * IN dump_job_ptr - pointer to job for which information is requested
9943 * IN show_flags - job filtering options
9944 * IN/OUT buffer - buffer in which data is placed, pointers automatically
9945 * updated
9946 * IN uid - user requesting the data
9947 * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c
9948 * whenever the data format changes
9949 */
pack_job(job_record_t * dump_job_ptr,uint16_t show_flags,Buf buffer,uint16_t protocol_version,uid_t uid)9950 void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, Buf buffer,
9951 uint16_t protocol_version, uid_t uid)
9952 {
9953 struct job_details *detail_ptr;
9954 time_t accrue_time = 0, begin_time = 0, start_time = 0, end_time = 0;
9955 uint32_t time_limit;
9956 char *nodelist = NULL;
9957 assoc_mgr_lock_t locks = { .qos = READ_LOCK };
9958
9959 if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
9960 detail_ptr = dump_job_ptr->details;
9961 pack32(dump_job_ptr->array_job_id, buffer);
9962 pack32(dump_job_ptr->array_task_id, buffer);
9963 if (dump_job_ptr->array_recs) {
9964 build_array_str(dump_job_ptr);
9965 packstr(dump_job_ptr->array_recs->task_id_str, buffer);
9966 pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
9967 } else {
9968 job_record_t *array_head = NULL;
9969 packnull(buffer);
9970 if (dump_job_ptr->array_job_id) {
9971 array_head = find_job_record(
9972 dump_job_ptr->array_job_id);
9973 }
9974 if (array_head && array_head->array_recs) {
9975 pack32(array_head->array_recs->max_run_tasks,
9976 buffer);
9977 } else {
9978 pack32((uint32_t) 0, buffer);
9979 }
9980 }
9981
9982 pack32(dump_job_ptr->assoc_id, buffer);
9983 pack32(dump_job_ptr->delay_boot, buffer);
9984 pack32(dump_job_ptr->job_id, buffer);
9985 pack32(dump_job_ptr->user_id, buffer);
9986 pack32(dump_job_ptr->group_id, buffer);
9987 pack32(dump_job_ptr->het_job_id, buffer);
9988 packstr(dump_job_ptr->het_job_id_set, buffer);
9989 pack32(dump_job_ptr->het_job_offset, buffer);
9990 pack32(dump_job_ptr->profile, buffer);
9991
9992 pack32(dump_job_ptr->job_state, buffer);
9993 pack16(dump_job_ptr->batch_flag, buffer);
9994 pack16(dump_job_ptr->state_reason, buffer);
9995 pack8(dump_job_ptr->power_flags, buffer);
9996 pack8(dump_job_ptr->reboot, buffer);
9997 pack16(dump_job_ptr->restart_cnt, buffer);
9998 pack16(show_flags, buffer);
9999 pack_time(dump_job_ptr->deadline, buffer);
10000
10001 pack32(dump_job_ptr->alloc_sid, buffer);
10002 if ((dump_job_ptr->time_limit == NO_VAL)
10003 && dump_job_ptr->part_ptr)
10004 time_limit = dump_job_ptr->part_ptr->max_time;
10005 else
10006 time_limit = dump_job_ptr->time_limit;
10007
10008 pack32(time_limit, buffer);
10009 pack32(dump_job_ptr->time_min, buffer);
10010
10011 if (dump_job_ptr->details) {
10012 pack32(dump_job_ptr->details->nice, buffer);
10013 pack_time(dump_job_ptr->details->submit_time, buffer);
10014 /* Earliest possible begin time */
10015 begin_time = dump_job_ptr->details->begin_time;
10016 /* When we started accruing time for priority */
10017 accrue_time = dump_job_ptr->details->accrue_time;
10018 } else { /* Some job details may be purged after completion */
10019 pack32(NICE_OFFSET, buffer); /* Best guess */
10020 pack_time((time_t) 0, buffer);
10021 }
10022
10023 pack_time(begin_time, buffer);
10024 pack_time(accrue_time, buffer);
10025
10026 if (IS_JOB_STARTED(dump_job_ptr)) {
10027 /* Report actual start time, in past */
10028 start_time = dump_job_ptr->start_time;
10029 end_time = dump_job_ptr->end_time;
10030 } else if (dump_job_ptr->start_time != 0) {
10031 /* Report expected start time,
10032 * making sure that time is not in the past */
10033 start_time = MAX(dump_job_ptr->start_time, time(NULL));
10034 if (time_limit != NO_VAL) {
10035 end_time = MAX(dump_job_ptr->end_time,
10036 (start_time + time_limit * 60));
10037 }
10038 } else if (begin_time > time(NULL)) {
10039 /* earliest start time in the future */
10040 start_time = begin_time;
10041 if (time_limit != NO_VAL) {
10042 end_time = MAX(dump_job_ptr->end_time,
10043 (start_time + time_limit * 60));
10044 }
10045 }
10046 pack_time(start_time, buffer);
10047 pack_time(end_time, buffer);
10048
10049 pack_time(dump_job_ptr->suspend_time, buffer);
10050 pack_time(dump_job_ptr->pre_sus_time, buffer);
10051 pack_time(dump_job_ptr->resize_time, buffer);
10052 pack_time(dump_job_ptr->last_sched_eval, buffer);
10053 pack_time(dump_job_ptr->preempt_time, buffer);
10054 pack32(dump_job_ptr->priority, buffer);
10055 packdouble(dump_job_ptr->billable_tres, buffer);
10056
10057 packstr(slurmctld_conf.cluster_name, buffer);
10058 /* Only send the allocated nodelist since we are only sending
10059 * the number of cpus and nodes that are currently allocated. */
10060 if (!IS_JOB_COMPLETING(dump_job_ptr))
10061 packstr(dump_job_ptr->nodes, buffer);
10062 else {
10063 nodelist =
10064 bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10065 packstr(nodelist, buffer);
10066 xfree(nodelist);
10067 }
10068
10069 packstr(dump_job_ptr->sched_nodes, buffer);
10070
10071 if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10072 packstr(dump_job_ptr->part_ptr->name, buffer);
10073 else
10074 packstr(dump_job_ptr->partition, buffer);
10075 packstr(dump_job_ptr->account, buffer);
10076 packstr(dump_job_ptr->admin_comment, buffer);
10077 pack32(dump_job_ptr->site_factor, buffer);
10078 packstr(dump_job_ptr->network, buffer);
10079 packstr(dump_job_ptr->comment, buffer);
10080 packstr(dump_job_ptr->batch_features, buffer);
10081 packstr(dump_job_ptr->batch_host, buffer);
10082 packstr(dump_job_ptr->burst_buffer, buffer);
10083 packstr(dump_job_ptr->burst_buffer_state, buffer);
10084 packstr(dump_job_ptr->system_comment, buffer);
10085
10086 assoc_mgr_lock(&locks);
10087 if (dump_job_ptr->qos_ptr)
10088 packstr(dump_job_ptr->qos_ptr->name, buffer);
10089 else {
10090 if (assoc_mgr_qos_list) {
10091 packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10092 dump_job_ptr->qos_id),
10093 buffer);
10094 } else
10095 packnull(buffer);
10096 }
10097
10098 if (IS_JOB_STARTED(dump_job_ptr) &&
10099 (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) &&
10100 (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) {
10101 time_t preemptable = acct_policy_get_preemptable_time(
10102 dump_job_ptr);
10103 pack_time(preemptable, buffer);
10104 } else {
10105 pack_time(0, buffer);
10106 }
10107 assoc_mgr_unlock(&locks);
10108
10109 packstr(dump_job_ptr->licenses, buffer);
10110 packstr(dump_job_ptr->state_desc, buffer);
10111 packstr(dump_job_ptr->resv_name, buffer);
10112 packstr(dump_job_ptr->mcs_label, buffer);
10113
10114 pack32(dump_job_ptr->exit_code, buffer);
10115 pack32(dump_job_ptr->derived_ec, buffer);
10116
10117 packstr(dump_job_ptr->gres_used, buffer);
10118 if (show_flags & SHOW_DETAIL) {
10119 pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10120 protocol_version);
10121 _pack_job_gres(dump_job_ptr, buffer, protocol_version);
10122 } else {
10123 pack32(NO_VAL, buffer);
10124 pack32((uint32_t) 0, buffer);
10125 }
10126
10127 packstr(dump_job_ptr->name, buffer);
10128 packstr(dump_job_ptr->user_name, buffer);
10129 packstr(dump_job_ptr->wckey, buffer);
10130 pack32(dump_job_ptr->req_switch, buffer);
10131 pack32(dump_job_ptr->wait4switch, buffer);
10132
10133 packstr(dump_job_ptr->alloc_node, buffer);
10134 if (!IS_JOB_COMPLETING(dump_job_ptr))
10135 pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10136 else
10137 pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10138
10139 select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10140 buffer, protocol_version);
10141
10142 /* A few details are always dumped here */
10143 _pack_default_job_details(dump_job_ptr, buffer,
10144 protocol_version);
10145
10146 /* other job details are only dumped until the job starts
10147 * running (at which time they become meaningless) */
10148 if (detail_ptr)
10149 _pack_pending_job_details(detail_ptr, buffer,
10150 protocol_version);
10151 else
10152 _pack_pending_job_details(NULL, buffer,
10153 protocol_version);
10154 pack32(dump_job_ptr->bit_flags, buffer);
10155 packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10156 packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10157 pack16(dump_job_ptr->start_protocol_ver, buffer);
10158
10159 if (dump_job_ptr->fed_details) {
10160 packstr(dump_job_ptr->fed_details->origin_str, buffer);
10161 pack64(dump_job_ptr->fed_details->siblings_active,
10162 buffer);
10163 packstr(dump_job_ptr->fed_details->siblings_active_str,
10164 buffer);
10165 pack64(dump_job_ptr->fed_details->siblings_viable,
10166 buffer);
10167 packstr(dump_job_ptr->fed_details->siblings_viable_str,
10168 buffer);
10169 } else {
10170 packnull(buffer);
10171 pack64((uint64_t)0, buffer);
10172 packnull(buffer);
10173 pack64((uint64_t)0, buffer);
10174 packnull(buffer);
10175 }
10176
10177 packstr(dump_job_ptr->cpus_per_tres, buffer);
10178 packstr(dump_job_ptr->mem_per_tres, buffer);
10179 packstr(dump_job_ptr->tres_bind, buffer);
10180 packstr(dump_job_ptr->tres_freq, buffer);
10181 packstr(dump_job_ptr->tres_per_job, buffer);
10182 packstr(dump_job_ptr->tres_per_node, buffer);
10183 packstr(dump_job_ptr->tres_per_socket, buffer);
10184 packstr(dump_job_ptr->tres_per_task, buffer);
10185
10186 pack16(dump_job_ptr->mail_type, buffer);
10187 packstr(dump_job_ptr->mail_user, buffer);
10188 } else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
10189 detail_ptr = dump_job_ptr->details;
10190 pack32(dump_job_ptr->array_job_id, buffer);
10191 pack32(dump_job_ptr->array_task_id, buffer);
10192 if (dump_job_ptr->array_recs) {
10193 build_array_str(dump_job_ptr);
10194 packstr(dump_job_ptr->array_recs->task_id_str, buffer);
10195 pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
10196 } else {
10197 job_record_t *array_head = NULL;
10198 packnull(buffer);
10199 if (dump_job_ptr->array_job_id) {
10200 array_head = find_job_record(
10201 dump_job_ptr->array_job_id);
10202 }
10203 if (array_head && array_head->array_recs) {
10204 pack32(array_head->array_recs->max_run_tasks,
10205 buffer);
10206 } else {
10207 pack32((uint32_t) 0, buffer);
10208 }
10209 }
10210
10211 pack32(dump_job_ptr->assoc_id, buffer);
10212 pack32(dump_job_ptr->delay_boot, buffer);
10213 pack32(dump_job_ptr->job_id, buffer);
10214 pack32(dump_job_ptr->user_id, buffer);
10215 pack32(dump_job_ptr->group_id, buffer);
10216 pack32(dump_job_ptr->het_job_id, buffer);
10217 packstr(dump_job_ptr->het_job_id_set, buffer);
10218 pack32(dump_job_ptr->het_job_offset, buffer);
10219 pack32(dump_job_ptr->profile, buffer);
10220
10221 pack32(dump_job_ptr->job_state, buffer);
10222 pack16(dump_job_ptr->batch_flag, buffer);
10223 pack16(dump_job_ptr->state_reason, buffer);
10224 pack8(dump_job_ptr->power_flags, buffer);
10225 pack8(dump_job_ptr->reboot, buffer);
10226 pack16(dump_job_ptr->restart_cnt, buffer);
10227 pack16(show_flags, buffer);
10228 pack_time(dump_job_ptr->deadline, buffer);
10229
10230 pack32(dump_job_ptr->alloc_sid, buffer);
10231 if ((dump_job_ptr->time_limit == NO_VAL)
10232 && dump_job_ptr->part_ptr)
10233 time_limit = dump_job_ptr->part_ptr->max_time;
10234 else
10235 time_limit = dump_job_ptr->time_limit;
10236
10237 pack32(time_limit, buffer);
10238 pack32(dump_job_ptr->time_min, buffer);
10239
10240 if (dump_job_ptr->details) {
10241 pack32(dump_job_ptr->details->nice, buffer);
10242 pack_time(dump_job_ptr->details->submit_time, buffer);
10243 /* Earliest possible begin time */
10244 begin_time = dump_job_ptr->details->begin_time;
10245 /* When we started accruing time for priority */
10246 accrue_time = dump_job_ptr->details->accrue_time;
10247 } else { /* Some job details may be purged after completion */
10248 pack32(NICE_OFFSET, buffer); /* Best guess */
10249 pack_time((time_t) 0, buffer);
10250 }
10251
10252 pack_time(begin_time, buffer);
10253 pack_time(accrue_time, buffer);
10254
10255 if (IS_JOB_STARTED(dump_job_ptr)) {
10256 /* Report actual start time, in past */
10257 start_time = dump_job_ptr->start_time;
10258 end_time = dump_job_ptr->end_time;
10259 } else if (dump_job_ptr->start_time != 0) {
10260 /* Report expected start time,
10261 * making sure that time is not in the past */
10262 start_time = MAX(dump_job_ptr->start_time, time(NULL));
10263 if (time_limit != NO_VAL) {
10264 end_time = MAX(dump_job_ptr->end_time,
10265 (start_time + time_limit * 60));
10266 }
10267 } else if (begin_time > time(NULL)) {
10268 /* earliest start time in the future */
10269 start_time = begin_time;
10270 if (time_limit != NO_VAL) {
10271 end_time = MAX(dump_job_ptr->end_time,
10272 (start_time + time_limit * 60));
10273 }
10274 }
10275 pack_time(start_time, buffer);
10276 pack_time(end_time, buffer);
10277
10278 pack_time(dump_job_ptr->suspend_time, buffer);
10279 pack_time(dump_job_ptr->pre_sus_time, buffer);
10280 pack_time(dump_job_ptr->resize_time, buffer);
10281 pack_time(dump_job_ptr->last_sched_eval, buffer);
10282 pack_time(dump_job_ptr->preempt_time, buffer);
10283 pack32(dump_job_ptr->priority, buffer);
10284 packdouble(dump_job_ptr->billable_tres, buffer);
10285
10286 packstr(slurmctld_conf.cluster_name, buffer);
10287 /* Only send the allocated nodelist since we are only sending
10288 * the number of cpus and nodes that are currently allocated. */
10289 if (!IS_JOB_COMPLETING(dump_job_ptr))
10290 packstr(dump_job_ptr->nodes, buffer);
10291 else {
10292 nodelist =
10293 bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10294 packstr(nodelist, buffer);
10295 xfree(nodelist);
10296 }
10297
10298 packstr(dump_job_ptr->sched_nodes, buffer);
10299
10300 if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10301 packstr(dump_job_ptr->part_ptr->name, buffer);
10302 else
10303 packstr(dump_job_ptr->partition, buffer);
10304 packstr(dump_job_ptr->account, buffer);
10305 packstr(dump_job_ptr->admin_comment, buffer);
10306 pack32(dump_job_ptr->site_factor, buffer);
10307 packstr(dump_job_ptr->network, buffer);
10308 packstr(dump_job_ptr->comment, buffer);
10309 packstr(dump_job_ptr->batch_features, buffer);
10310 packstr(dump_job_ptr->batch_host, buffer);
10311 packstr(dump_job_ptr->burst_buffer, buffer);
10312 packstr(dump_job_ptr->burst_buffer_state, buffer);
10313 packstr(dump_job_ptr->system_comment, buffer);
10314
10315 assoc_mgr_lock(&locks);
10316 if (dump_job_ptr->qos_ptr)
10317 packstr(dump_job_ptr->qos_ptr->name, buffer);
10318 else {
10319 if (assoc_mgr_qos_list) {
10320 packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10321 dump_job_ptr->qos_id),
10322 buffer);
10323 } else
10324 packnull(buffer);
10325 }
10326
10327 if (IS_JOB_STARTED(dump_job_ptr) &&
10328 (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) &&
10329 (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) {
10330 time_t preemptable = acct_policy_get_preemptable_time(
10331 dump_job_ptr);
10332 pack_time(preemptable, buffer);
10333 } else {
10334 pack_time(0, buffer);
10335 }
10336 assoc_mgr_unlock(&locks);
10337
10338 packstr(dump_job_ptr->licenses, buffer);
10339 packstr(dump_job_ptr->state_desc, buffer);
10340 packstr(dump_job_ptr->resv_name, buffer);
10341 packstr(dump_job_ptr->mcs_label, buffer);
10342
10343 pack32(dump_job_ptr->exit_code, buffer);
10344 pack32(dump_job_ptr->derived_ec, buffer);
10345
10346 if (show_flags & SHOW_DETAIL) {
10347 pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10348 protocol_version);
10349 _pack_job_gres(dump_job_ptr, buffer, protocol_version);
10350 } else {
10351 pack32(NO_VAL, buffer);
10352 pack32((uint32_t) 0, buffer);
10353 }
10354
10355 packstr(dump_job_ptr->name, buffer);
10356 packstr(dump_job_ptr->user_name, buffer);
10357 packstr(dump_job_ptr->wckey, buffer);
10358 pack32(dump_job_ptr->req_switch, buffer);
10359 pack32(dump_job_ptr->wait4switch, buffer);
10360
10361 packstr(dump_job_ptr->alloc_node, buffer);
10362 if (!IS_JOB_COMPLETING(dump_job_ptr))
10363 pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10364 else
10365 pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10366
10367 select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10368 buffer, protocol_version);
10369
10370 /* A few details are always dumped here */
10371 _pack_default_job_details(dump_job_ptr, buffer,
10372 protocol_version);
10373
10374 /* other job details are only dumped until the job starts
10375 * running (at which time they become meaningless) */
10376 if (detail_ptr)
10377 _pack_pending_job_details(detail_ptr, buffer,
10378 protocol_version);
10379 else
10380 _pack_pending_job_details(NULL, buffer,
10381 protocol_version);
10382 pack32(dump_job_ptr->bit_flags, buffer);
10383 packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10384 packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10385 pack16(dump_job_ptr->start_protocol_ver, buffer);
10386
10387 if (dump_job_ptr->fed_details) {
10388 packstr(dump_job_ptr->fed_details->origin_str, buffer);
10389 pack64(dump_job_ptr->fed_details->siblings_active,
10390 buffer);
10391 packstr(dump_job_ptr->fed_details->siblings_active_str,
10392 buffer);
10393 pack64(dump_job_ptr->fed_details->siblings_viable,
10394 buffer);
10395 packstr(dump_job_ptr->fed_details->siblings_viable_str,
10396 buffer);
10397 } else {
10398 packnull(buffer);
10399 pack64((uint64_t)0, buffer);
10400 packnull(buffer);
10401 pack64((uint64_t)0, buffer);
10402 packnull(buffer);
10403 }
10404
10405 packstr(dump_job_ptr->cpus_per_tres, buffer);
10406 packstr(dump_job_ptr->mem_per_tres, buffer);
10407 packstr(dump_job_ptr->tres_bind, buffer);
10408 packstr(dump_job_ptr->tres_freq, buffer);
10409 packstr(dump_job_ptr->tres_per_job, buffer);
10410 packstr(dump_job_ptr->tres_per_node, buffer);
10411 packstr(dump_job_ptr->tres_per_socket, buffer);
10412 packstr(dump_job_ptr->tres_per_task, buffer);
10413 } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
10414 detail_ptr = dump_job_ptr->details;
10415 pack32(dump_job_ptr->array_job_id, buffer);
10416 pack32(dump_job_ptr->array_task_id, buffer);
10417 if (dump_job_ptr->array_recs) {
10418 build_array_str(dump_job_ptr);
10419 packstr(dump_job_ptr->array_recs->task_id_str, buffer);
10420 pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
10421 } else {
10422 packnull(buffer);
10423 pack32((uint32_t) 0, buffer);
10424 }
10425
10426 pack32(dump_job_ptr->assoc_id, buffer);
10427 pack32(dump_job_ptr->delay_boot, buffer);
10428 pack32(dump_job_ptr->job_id, buffer);
10429 pack32(dump_job_ptr->user_id, buffer);
10430 pack32(dump_job_ptr->group_id, buffer);
10431 pack32(dump_job_ptr->het_job_id, buffer);
10432 packstr(dump_job_ptr->het_job_id_set, buffer);
10433 pack32(dump_job_ptr->het_job_offset, buffer);
10434 pack32(dump_job_ptr->profile, buffer);
10435
10436 pack32(dump_job_ptr->job_state, buffer);
10437 pack16(dump_job_ptr->batch_flag, buffer);
10438 pack16(dump_job_ptr->state_reason, buffer);
10439 pack8(dump_job_ptr->power_flags, buffer);
10440 pack8(dump_job_ptr->reboot, buffer);
10441 pack16(dump_job_ptr->restart_cnt, buffer);
10442 pack16(show_flags, buffer);
10443 pack_time(dump_job_ptr->deadline, buffer);
10444
10445 pack32(dump_job_ptr->alloc_sid, buffer);
10446 if ((dump_job_ptr->time_limit == NO_VAL)
10447 && dump_job_ptr->part_ptr)
10448 time_limit = dump_job_ptr->part_ptr->max_time;
10449 else
10450 time_limit = dump_job_ptr->time_limit;
10451
10452 pack32(time_limit, buffer);
10453 pack32(dump_job_ptr->time_min, buffer);
10454
10455 if (dump_job_ptr->details) {
10456 pack32(dump_job_ptr->details->nice, buffer);
10457 pack_time(dump_job_ptr->details->submit_time, buffer);
10458 /* Earliest possible begin time */
10459 begin_time = dump_job_ptr->details->begin_time;
10460 /* When we started accruing time for priority */
10461 accrue_time = dump_job_ptr->details->accrue_time;
10462 } else { /* Some job details may be purged after completion */
10463 pack32(NICE_OFFSET, buffer); /* Best guess */
10464 pack_time((time_t) 0, buffer);
10465 }
10466
10467 pack_time(begin_time, buffer);
10468 pack_time(accrue_time, buffer);
10469
10470 if (IS_JOB_STARTED(dump_job_ptr)) {
10471 /* Report actual start time, in past */
10472 start_time = dump_job_ptr->start_time;
10473 end_time = dump_job_ptr->end_time;
10474 } else if (dump_job_ptr->start_time != 0) {
10475 /* Report expected start time,
10476 * making sure that time is not in the past */
10477 start_time = MAX(dump_job_ptr->start_time, time(NULL));
10478 if (time_limit != NO_VAL) {
10479 end_time = MAX(dump_job_ptr->end_time,
10480 (start_time + time_limit * 60));
10481 }
10482 } else if (begin_time > time(NULL)) {
10483 /* earliest start time in the future */
10484 start_time = begin_time;
10485 if (time_limit != NO_VAL) {
10486 end_time = MAX(dump_job_ptr->end_time,
10487 (start_time + time_limit * 60));
10488 }
10489 }
10490 pack_time(start_time, buffer);
10491 pack_time(end_time, buffer);
10492
10493 pack_time(dump_job_ptr->suspend_time, buffer);
10494 pack_time(dump_job_ptr->pre_sus_time, buffer);
10495 pack_time(dump_job_ptr->resize_time, buffer);
10496 pack_time(dump_job_ptr->last_sched_eval, buffer);
10497 pack_time(dump_job_ptr->preempt_time, buffer);
10498 pack32(dump_job_ptr->priority, buffer);
10499 packdouble(dump_job_ptr->billable_tres, buffer);
10500
10501 packstr(slurmctld_conf.cluster_name, buffer);
10502 /* Only send the allocated nodelist since we are only sending
10503 * the number of cpus and nodes that are currently allocated. */
10504 if (!IS_JOB_COMPLETING(dump_job_ptr))
10505 packstr(dump_job_ptr->nodes, buffer);
10506 else {
10507 nodelist =
10508 bitmap2node_name(dump_job_ptr->node_bitmap_cg);
10509 packstr(nodelist, buffer);
10510 xfree(nodelist);
10511 }
10512
10513 packstr(dump_job_ptr->sched_nodes, buffer);
10514
10515 if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
10516 packstr(dump_job_ptr->part_ptr->name, buffer);
10517 else
10518 packstr(dump_job_ptr->partition, buffer);
10519 packstr(dump_job_ptr->account, buffer);
10520 packstr(dump_job_ptr->admin_comment, buffer);
10521 packstr(dump_job_ptr->network, buffer);
10522 packstr(dump_job_ptr->comment, buffer);
10523 packstr(dump_job_ptr->batch_features, buffer);
10524 packstr(dump_job_ptr->batch_host, buffer);
10525 packstr(dump_job_ptr->burst_buffer, buffer);
10526 packstr(dump_job_ptr->burst_buffer_state, buffer);
10527 packstr(dump_job_ptr->system_comment, buffer);
10528
10529 assoc_mgr_lock(&locks);
10530 if (dump_job_ptr->qos_ptr)
10531 packstr(dump_job_ptr->qos_ptr->name, buffer);
10532 else {
10533 if (assoc_mgr_qos_list) {
10534 packstr(slurmdb_qos_str(assoc_mgr_qos_list,
10535 dump_job_ptr->qos_id),
10536 buffer);
10537 } else
10538 packnull(buffer);
10539 }
10540 assoc_mgr_unlock(&locks);
10541
10542 packstr(dump_job_ptr->licenses, buffer);
10543 packstr(dump_job_ptr->state_desc, buffer);
10544 packstr(dump_job_ptr->resv_name, buffer);
10545 packstr(dump_job_ptr->mcs_label, buffer);
10546
10547 pack32(dump_job_ptr->exit_code, buffer);
10548 pack32(dump_job_ptr->derived_ec, buffer);
10549
10550 if (show_flags & SHOW_DETAIL) {
10551 pack_job_resources(dump_job_ptr->job_resrcs, buffer,
10552 protocol_version);
10553 _pack_job_gres(dump_job_ptr, buffer, protocol_version);
10554 } else {
10555 pack32(NO_VAL, buffer);
10556 pack32((uint32_t) 0, buffer);
10557 }
10558
10559 packstr(dump_job_ptr->name, buffer);
10560 packstr(dump_job_ptr->user_name, buffer);
10561 packstr(dump_job_ptr->wckey, buffer);
10562 pack32(dump_job_ptr->req_switch, buffer);
10563 pack32(dump_job_ptr->wait4switch, buffer);
10564
10565 packstr(dump_job_ptr->alloc_node, buffer);
10566 if (!IS_JOB_COMPLETING(dump_job_ptr))
10567 pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
10568 else
10569 pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
10570
10571 select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo,
10572 buffer, protocol_version);
10573
10574 /* A few details are always dumped here */
10575 _pack_default_job_details(dump_job_ptr, buffer,
10576 protocol_version);
10577
10578 /* other job details are only dumped until the job starts
10579 * running (at which time they become meaningless) */
10580 if (detail_ptr)
10581 _pack_pending_job_details(detail_ptr, buffer,
10582 protocol_version);
10583 else
10584 _pack_pending_job_details(NULL, buffer,
10585 protocol_version);
10586 pack32(dump_job_ptr->bit_flags, buffer);
10587 packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
10588 packstr(dump_job_ptr->tres_fmt_req_str, buffer);
10589 pack16(dump_job_ptr->start_protocol_ver, buffer);
10590
10591 if (dump_job_ptr->fed_details) {
10592 packstr(dump_job_ptr->fed_details->origin_str, buffer);
10593 pack64(dump_job_ptr->fed_details->siblings_active,
10594 buffer);
10595 packstr(dump_job_ptr->fed_details->siblings_active_str,
10596 buffer);
10597 pack64(dump_job_ptr->fed_details->siblings_viable,
10598 buffer);
10599 packstr(dump_job_ptr->fed_details->siblings_viable_str,
10600 buffer);
10601 } else {
10602 packnull(buffer);
10603 pack64((uint64_t)0, buffer);
10604 packnull(buffer);
10605 pack64((uint64_t)0, buffer);
10606 packnull(buffer);
10607 }
10608
10609 packstr(dump_job_ptr->cpus_per_tres, buffer);
10610 packstr(dump_job_ptr->mem_per_tres, buffer);
10611 packstr(dump_job_ptr->tres_bind, buffer);
10612 packstr(dump_job_ptr->tres_freq, buffer);
10613 packstr(dump_job_ptr->tres_per_job, buffer);
10614 packstr(dump_job_ptr->tres_per_node, buffer);
10615 packstr(dump_job_ptr->tres_per_socket, buffer);
10616 packstr(dump_job_ptr->tres_per_task, buffer);
10617 } else {
10618 error("pack_job: protocol_version "
10619 "%hu not supported", protocol_version);
10620 }
10621 }
10622
_find_node_config(int * cpu_cnt_ptr,int * core_cnt_ptr)10623 static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr)
10624 {
10625 static int max_cpu_cnt = -1, max_core_cnt = -1;
10626 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
10627 int i;
10628 node_record_t *node_ptr = node_record_table_ptr;
10629
10630 slurm_mutex_lock(&lock);
10631 if (max_cpu_cnt == -1) {
10632 for (i = 0; i < node_record_count; i++, node_ptr++) {
10633 /* Only data from config_record used for scheduling */
10634 max_cpu_cnt = MAX(max_cpu_cnt,
10635 node_ptr->config_ptr->cpus);
10636 max_core_cnt = MAX(max_core_cnt,
10637 node_ptr->config_ptr->cores);
10638 }
10639 }
10640 slurm_mutex_unlock(&lock);
10641
10642 *cpu_cnt_ptr = max_cpu_cnt;
10643 *core_cnt_ptr = max_core_cnt;
10644
10645 return;
10646
10647 }
10648
10649 /* pack default job details for "get_job_info" RPC */
/*
 * Pack the subset of job details that is always sent for the
 * "get_job_info" RPC.
 *
 * IN job_ptr - job record to pack (job_ptr->details may be NULL if the
 *	details were purged after job completion)
 * IN/OUT buffer - destination pack buffer
 * IN protocol_version - receiver's RPC protocol version; the field order
 *	written here must exactly match the corresponding unpack logic
 */
static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer,
				      uint16_t protocol_version)
{
	int max_cpu_cnt = -1, max_core_cnt = -1;
	int i;
	struct job_details *detail_ptr = job_ptr->details;
	uint16_t shared = 0;

	/* Derive the "shared" value reported to the user: the job's own
	 * request (share_res / whole_node) takes precedence; otherwise
	 * fall back to the current partition configuration. */
	if (!detail_ptr)
		shared = NO_VAL16;
	else if (detail_ptr->share_res == 1)	/* User --share */
		shared = 1;
	else if ((detail_ptr->share_res == 0) ||
		 (detail_ptr->whole_node == 1))
		shared = 0;			/* User --exclusive */
	else if (detail_ptr->whole_node == WHOLE_NODE_USER)
		shared = JOB_SHARED_USER;	/* User --exclusive=user */
	else if (detail_ptr->whole_node == WHOLE_NODE_MCS)
		shared = JOB_SHARED_MCS;	/* User --exclusive=mcs */
	else if (job_ptr->part_ptr) {
		/* Report shared status based upon latest partition info */
		if (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)
			shared = JOB_SHARED_USER;
		else if ((job_ptr->part_ptr->max_share & SHARED_FORCE) &&
			 ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1))
			shared = 1;		/* Partition Shared=force */
		else if (job_ptr->part_ptr->max_share == 0)
			shared = 0;		/* Partition Shared=exclusive */
		else
			shared = NO_VAL16;	/* Part Shared=yes or no */
	} else
		shared = NO_VAL16;	/* No user or partition info */

	/* Prefer the partition's cached per-node maximums; otherwise scan
	 * the node table (cached inside _find_node_config). */
	if (job_ptr->part_ptr && job_ptr->part_ptr->max_cpu_cnt) {
		max_cpu_cnt  = job_ptr->part_ptr->max_cpu_cnt;
		max_core_cnt = job_ptr->part_ptr->max_core_cnt;
	} else
		_find_node_config(&max_cpu_cnt, &max_core_cnt);

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		if (detail_ptr) {
			packstr(detail_ptr->features, buffer);
			packstr(detail_ptr->cluster_features, buffer);
			packstr(detail_ptr->work_dir, buffer);
			packstr(detail_ptr->dependency, buffer);

			/* Rebuild a single command line string from argv */
			if (detail_ptr->argv) {
				char *cmd_line = NULL, *pos = NULL;
				for (i = 0; detail_ptr->argv[i]; i++) {
					xstrfmtcatat(cmd_line, &pos, "%s%s",
						     (i ? " " : ""),
					             detail_ptr->argv[i]);
				}
				packstr(cmd_line, buffer);
				xfree(cmd_line);
			} else
				packnull(buffer);

			/* CPU counts are packed as a (min, max) pair */
			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) {
				pack32(job_ptr->cpu_cnt, buffer);
				pack32((uint32_t) 0, buffer);
			} else if (job_ptr->total_cpus &&
				   !IS_JOB_PENDING(job_ptr)) {
				/* If job is PENDING ignore total_cpus,
				 * which may have been set by previous run
				 * followed by job requeue. */
				pack32(job_ptr->total_cpus, buffer);
				pack32((uint32_t) 0, buffer);
			} else {
				pack32(detail_ptr->min_cpus, buffer);
				if (detail_ptr->max_cpus != NO_VAL)
					pack32(detail_ptr->max_cpus, buffer);
				else
					pack32((uint32_t) 0, buffer);
			}

			/* Node counts are also packed as a (min, max) pair;
			 * for pending jobs min_nodes is estimated from the
			 * task count and per-node resource limits. */
			if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) {
				pack32(job_ptr->node_cnt, buffer);
				pack32((uint32_t) 0, buffer);
			} else if (job_ptr->total_nodes) {
				pack32(job_ptr->total_nodes, buffer);
				pack32((uint32_t) 0, buffer);
			} else if (job_ptr->node_cnt_wag) {
				/* This should catch everything else, but
				 * just in case this is 0 (startup or
				 * whatever) we will keep the rest of
				 * this if statement around.
				 */
				pack32(job_ptr->node_cnt_wag, buffer);
				pack32((uint32_t) detail_ptr->max_nodes,
				       buffer);
			} else if (detail_ptr->ntasks_per_node) {
				/* min_nodes based upon task count and ntasks
				 * per node */
				uint32_t min_nodes;
				min_nodes = detail_ptr->num_tasks /
					detail_ptr->ntasks_per_node;
				min_nodes = MAX(min_nodes,
						detail_ptr->min_nodes);
				pack32(min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			} else if (detail_ptr->cpus_per_task > 1) {
				/* min_nodes based upon task count and cpus
				 * per task */
				uint32_t ntasks_per_node, min_nodes;
				ntasks_per_node = max_cpu_cnt /
					detail_ptr->cpus_per_task;
				ntasks_per_node = MAX(ntasks_per_node, 1);
				min_nodes = detail_ptr->num_tasks /
					ntasks_per_node;
				min_nodes = MAX(min_nodes,
						detail_ptr->min_nodes);
				/* Round up for any remainder tasks */
				if (detail_ptr->num_tasks % ntasks_per_node)
					min_nodes++;
				pack32(min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			} else if (detail_ptr->mc_ptr &&
				   detail_ptr->mc_ptr->ntasks_per_core &&
				   (detail_ptr->mc_ptr->ntasks_per_core
				    != INFINITE16)) {
				/* min_nodes based upon task count and ntasks
				 * per core */
				uint32_t min_cores, min_nodes;
				min_cores = detail_ptr->num_tasks +
					detail_ptr->mc_ptr->ntasks_per_core
					- 1;
				min_cores /= detail_ptr->mc_ptr->ntasks_per_core;

				min_nodes = min_cores + max_core_cnt - 1;
				min_nodes /= max_core_cnt;
				min_nodes = MAX(min_nodes,
						detail_ptr->min_nodes);
				pack32(min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			} else {
				/* min_nodes based upon task count only */
				uint32_t min_nodes;
				min_nodes = detail_ptr->num_tasks +
					max_cpu_cnt - 1;
				min_nodes /= max_cpu_cnt;
				min_nodes = MAX(min_nodes,
						detail_ptr->min_nodes);
				pack32(min_nodes, buffer);
				pack32(detail_ptr->max_nodes, buffer);
			}

			pack16(detail_ptr->requeue,   buffer);
			pack16(detail_ptr->ntasks_per_node, buffer);
			if (detail_ptr->num_tasks)
				pack32(detail_ptr->num_tasks, buffer);
			else if (IS_JOB_PENDING(job_ptr))
				pack32(detail_ptr->min_nodes, buffer);
			else
				pack32(job_ptr->node_cnt, buffer);
			pack16(shared, buffer);
			pack32(detail_ptr->cpu_freq_min, buffer);
			pack32(detail_ptr->cpu_freq_max, buffer);
			pack32(detail_ptr->cpu_freq_gov, buffer);
		} else {
			/* Job details were purged (e.g. after completion):
			 * pack best-guess placeholders in the same order as
			 * the populated case above. */
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);
			packnull(buffer);

			if (job_ptr->total_cpus)
				pack32(job_ptr->total_cpus, buffer);
			else
				pack32(job_ptr->cpu_cnt, buffer);
			pack32((uint32_t) 0, buffer);

			pack32(job_ptr->node_cnt, buffer);
			pack32((uint32_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
			pack16((uint16_t) 0, buffer);
			pack32((uint32_t) 0, buffer);
			pack32((uint32_t) 0, buffer);
			pack32((uint32_t) 0, buffer);
		}
	} else {
		error("_pack_default_job_details: protocol_version "
		      "%hu not supported", protocol_version);
	}
}
10834
10835 /* pack pending job details for "get_job_info" RPC */
/*
 * Pack the pending-job-only details for the "get_job_info" RPC.
 *
 * IN detail_ptr - the job's details, or NULL if already purged
 * IN/OUT buffer - destination pack buffer
 * IN protocol_version - receiver's RPC version; field order written here
 *	must mirror the corresponding unpack routine
 */
static void _pack_pending_job_details(struct job_details *detail_ptr,
				      Buf buffer, uint16_t protocol_version)
{
	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("%s: protocol_version %hu not supported", __func__,
		      protocol_version);
		return;
	}

	if (!detail_ptr) {
		/* Details purged: send zero/NULL placeholders in the same
		 * field order as the populated case below. */
		pack16((uint16_t) 0, buffer);	/* contiguous */
		pack16((uint16_t) 0, buffer);	/* core_spec */
		pack16((uint16_t) 0, buffer);	/* cpus_per_task */
		pack16((uint16_t) 0, buffer);	/* pn_min_cpus */

		pack64((uint64_t) 0, buffer);	/* pn_min_memory */
		pack32((uint32_t) 0, buffer);	/* pn_min_tmp_disk */

		packnull(buffer);		/* req_nodes */
		packnull(buffer);		/* req_node_bitmap */
		packnull(buffer);		/* exc_nodes */
		packnull(buffer);		/* exc_node_bitmap */

		packnull(buffer);		/* std_err */
		packnull(buffer);		/* std_in */
		packnull(buffer);		/* std_out */

		pack_multi_core_data(NULL, buffer, protocol_version);
		return;
	}

	pack16(detail_ptr->contiguous, buffer);
	pack16(detail_ptr->core_spec, buffer);
	pack16(detail_ptr->cpus_per_task, buffer);
	pack16(detail_ptr->pn_min_cpus, buffer);

	pack64(detail_ptr->pn_min_memory, buffer);
	pack32(detail_ptr->pn_min_tmp_disk, buffer);

	packstr(detail_ptr->req_nodes, buffer);
	pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer);
	packstr(detail_ptr->exc_nodes, buffer);
	pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer);

	packstr(detail_ptr->std_err, buffer);
	packstr(detail_ptr->std_in, buffer);
	packstr(detail_ptr->std_out, buffer);

	pack_multi_core_data(detail_ptr->mc_ptr, buffer, protocol_version);
}
10885
_purge_het_job_filter(void * x,void * key)10886 static int _purge_het_job_filter(void *x, void *key)
10887 {
10888 job_record_t *job_ptr = (job_record_t *) x;
10889 job_record_t *job_filter = (job_record_t *) key;
10890 if (job_ptr->het_job_id == job_filter->het_job_id)
10891 return 1;
10892 return 0;
10893 }
10894
/* If this is a hetjob leader and all components are complete,
 * then purge all job of its hetjob records
 * RET true if this record purged */
static inline bool _purge_complete_het_job(job_record_t *het_job_leader)
{
	job_record_t purge_job_rec;	/* stack record used only as filter key */
	job_record_t *het_job;
	ListIterator iter;
	bool incomplete_job = false;
	int i;

	if (!het_job_leader->het_job_list)
		return false;	/* Not hetjob leader */
	if (!IS_JOB_FINISHED(het_job_leader))
		return false;	/* Hetjob leader incomplete */

	/* Every component must itself be old enough to purge
	 * (_list_find_job_old) before we delete any of them */
	iter = list_iterator_create(het_job_leader->het_job_list);
	while ((het_job = list_next(iter))) {
		if (het_job_leader->het_job_id != het_job->het_job_id) {
			/* Should not happen; skip the inconsistent entry */
			error("%s: Bad het_job_list for %pJ",
			      __func__, het_job_leader);
			continue;
		}
		if (!_list_find_job_old(het_job, NULL)) {
			incomplete_job = true;
			break;
		}
	}
	list_iterator_destroy(iter);

	if (incomplete_job)
		return false;

	/* Delete every record sharing this het_job_id in one pass */
	purge_job_rec.het_job_id = het_job_leader->het_job_id;
	i = list_delete_all(job_list, &_purge_het_job_filter, &purge_job_rec);
	if (i) {
		debug2("%s: purged %d old job records", __func__, i);
		last_job_update = time(NULL);
		/* Wake the purge thread so job state files get removed */
		slurm_mutex_lock(&purge_thread_lock);
		slurm_cond_signal(&purge_thread_cond);
		slurm_mutex_unlock(&purge_thread_lock);
	}
	return true;
}
10939
10940 /*
10941 * If the job or slurm.conf requests to not kill on invalid dependency,
10942 * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the
10943 * job.
10944 */
handle_invalid_dependency(job_record_t * job_ptr)10945 void handle_invalid_dependency(job_record_t *job_ptr)
10946 {
10947 job_ptr->state_reason = WAIT_DEP_INVALID;
10948 xfree(job_ptr->state_desc);
10949 if (job_ptr->bit_flags & KILL_INV_DEP) {
10950 _kill_dependent(job_ptr);
10951 } else if (job_ptr->bit_flags & NO_KILL_INV_DEP) {
10952 debug("%s: %pJ job dependency never satisfied",
10953 __func__, job_ptr);
10954 } else if (kill_invalid_dep) {
10955 _kill_dependent(job_ptr);
10956 } else {
10957 debug("%s: %pJ job dependency never satisfied",
10958 __func__, job_ptr);
10959 job_ptr->state_reason = WAIT_DEP_INVALID;
10960 }
10961 fed_mgr_remove_remote_dependencies(job_ptr);
10962 }
10963
/*
 * purge_old_job - purge old job records.
 *	The jobs must have completed at least MIN_JOB_AGE minutes ago.
 *	Test job dependencies, handle after_ok, after_not_ok before
 *	purging any jobs.
 */
void purge_old_job(void)
{
	ListIterator job_iterator;
	job_record_t *job_ptr;
	int i, purge_job_count;

	/* Caller must hold: conf read, job write, node write, fed read */
	xassert(verify_lock(CONF_LOCK, READ_LOCK));
	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
	xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
	xassert(verify_lock(FED_LOCK, READ_LOCK));

	/* Warn if the purge thread has a backlog of state files */
	if ((purge_job_count = list_count(purge_files_list)))
		debug("%s: job file deletion is falling behind, "
		      "%d left to remove", __func__, purge_job_count);

	/* First pass: purge complete hetjobs and flag pending jobs whose
	 * dependencies can never be satisfied */
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		if (_purge_complete_het_job(job_ptr))
			continue;	/* whole hetjob purged */
		if (!IS_JOB_PENDING(job_ptr))
			continue;
		/*
		 * If the dependency is already invalid there's no reason to
		 * keep checking it.
		 */
		if (job_ptr->state_reason == WAIT_DEP_INVALID)
			continue;
		if (test_job_dependency(job_ptr, NULL) == FAIL_DEPEND) {
			/* Check what the job disposition is
			 * to deal with invalid dependencies
			 */
			handle_invalid_dependency(job_ptr);
		}
	}
	list_iterator_destroy(job_iterator);
	fed_mgr_test_remote_dependencies();

	/* Second pass: delete all sufficiently old job records, then wake
	 * the purge thread to remove their state files */
	i = list_delete_all(job_list, &_list_find_job_old, "");
	if (i) {
		debug2("purge_old_job: purged %d old job records", i);
		last_job_update = time(NULL);
		slurm_mutex_lock(&purge_thread_lock);
		slurm_cond_signal(&purge_thread_cond);
		slurm_mutex_unlock(&purge_thread_lock);
	}
}
11016
11017
11018 /*
11019 * purge_job_record - purge specific job record. No testing is performed to
11020 * ensure the job records has no active references. Use only for job
11021 * records that were never fully operational (e.g. WILL_RUN test, failed
11022 * job load, failed job create, etc.).
11023 * IN job_id - job_id of job record to be purged
11024 * RET int - count of job's purged
11025 * global: job_list - global job table
11026 */
purge_job_record(uint32_t job_id)11027 extern int purge_job_record(uint32_t job_id)
11028 {
11029 int count = 0;
11030 count = list_delete_all(job_list, _list_find_job_id, (void *)&job_id);
11031 if (count) {
11032 last_job_update = time(NULL);
11033 slurm_mutex_lock(&purge_thread_lock);
11034 slurm_cond_signal(&purge_thread_cond);
11035 slurm_mutex_unlock(&purge_thread_lock);
11036 }
11037
11038 return count;
11039 }
11040
/*
 * Detach a job record from the active job structures and queue its job id
 * for state-file removal by the purge thread. The record's job_id is set
 * to NO_VAL afterwards so it can no longer be found by id.
 */
extern void unlink_job_record(job_record_t *job_ptr)
{
	uint32_t *saved_id;

	xassert(job_ptr->magic == JOB_MAGIC);

	_delete_job_common(job_ptr);

	/* Hand the old id to the purge thread (it owns and frees it) */
	saved_id = xmalloc(sizeof(uint32_t));
	*saved_id = job_ptr->job_id;
	list_enqueue(purge_files_list, saved_id);

	job_ptr->job_id = NO_VAL;

	last_job_update = time(NULL);
	slurm_mutex_lock(&purge_thread_lock);
	slurm_cond_signal(&purge_thread_cond);
	slurm_mutex_unlock(&purge_thread_lock);
}
11060
/*
 * reset_job_bitmaps - reestablish bitmaps for existing jobs.
 *	this should be called after rebuilding node information,
 *	but before using any job entries.
 * global: last_job_update - time of last job table update
 *	job_list - pointer to global job list
 */
void reset_job_bitmaps(void)
{
	ListIterator job_iterator;
	job_record_t *job_ptr;
	part_record_t *part_ptr;
	List part_ptr_list = NULL;
	bool job_fail = false;
	time_t now = time(NULL);
	bool gang_flag = false;
	static uint32_t cr_flag = NO_VAL;

	xassert(job_list);

	/* Query the select plugin once; cr_flag doubles as "not yet
	 * queried / query error" sentinel via NO_VAL */
	if (cr_flag == NO_VAL) {
		cr_flag = 0;  /* call is no-op for select/linear and others */
		if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
						  NULL, &cr_flag)) {
			cr_flag = NO_VAL;	/* error */
		}

	}
	if (slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG)
		gang_flag = true;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);
		job_fail = false;

		/* Re-resolve the job's partition pointer(s) from the saved
		 * partition name string */
		if (job_ptr->partition == NULL) {
			error("No partition for %pJ", job_ptr);
			part_ptr = NULL;
			job_fail = true;
		} else {
			char *err_part = NULL;
			part_ptr = find_part_record(job_ptr->partition);
			if (part_ptr == NULL) {
				/* Name may be a comma-separated list of
				 * partitions */
				part_ptr_list = get_part_list(
					job_ptr->partition,
					&err_part);
				if (part_ptr_list) {
					part_ptr = list_peek(part_ptr_list);
					if (list_count(part_ptr_list) == 1)
						FREE_NULL_LIST(part_ptr_list);
				}
			}
			if (part_ptr == NULL) {
				error("Invalid partition (%s) for %pJ",
				      err_part, job_ptr);
				xfree(err_part);
				job_fail = true;
			}
		}
		job_ptr->part_ptr = part_ptr;
		FREE_NULL_LIST(job_ptr->part_ptr_list);
		if (part_ptr_list) {
			job_ptr->part_ptr_list = part_ptr_list;
			part_ptr_list = NULL;	/* clear for next job */
		}

		/* Rebuild node bitmaps from the saved node name strings */
		FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
		if (job_ptr->nodes_completing &&
		    node_name2bitmap(job_ptr->nodes_completing,
				     false,  &job_ptr->node_bitmap_cg)) {
			error("Invalid nodes (%s) for %pJ",
			      job_ptr->nodes_completing, job_ptr);
			job_fail = true;
		}
		FREE_NULL_BITMAP(job_ptr->node_bitmap);
		if (job_ptr->nodes &&
		    node_name2bitmap(job_ptr->nodes, false,
				     &job_ptr->node_bitmap) && !job_fail) {
			error("Invalid nodes (%s) for %pJ",
			      job_ptr->nodes, job_ptr);
			job_fail = true;
		}
		if (reset_node_bitmap(job_ptr))
			job_fail = true;
		/* Validate saved resource allocation against the (possibly
		 * changed) current node configuration */
		if (!job_fail && !IS_JOB_FINISHED(job_ptr) &&
		    job_ptr->job_resrcs && (cr_flag || gang_flag) &&
		    valid_job_resources(job_ptr->job_resrcs,
					node_record_table_ptr)) {
			error("Aborting %pJ due to change in socket/core configuration of allocated nodes",
			      job_ptr);
			job_fail = true;
		}
		if (!job_fail && !IS_JOB_FINISHED(job_ptr) &&
		    gres_plugin_job_revalidate(job_ptr->gres_list)) {
			error("Aborting %pJ due to use of unsupported GRES options",
			      job_ptr);
			job_fail = true;
		}

		if (!job_fail && job_ptr->job_resrcs &&
		    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) &&
		    gres_plugin_job_revalidate2(job_ptr->job_id,
						job_ptr->gres_list,
						job_ptr->job_resrcs->node_bitmap)) {
			/*
			 * This can be due to the job being allocated GRES
			 * which no longer exist (i.e. the GRES count on some
			 * allocated node changed since when the job started).
			 */
			error("Aborting %pJ due to use of invalid GRES configuration",
			      job_ptr);
			job_fail = true;
		}

		_reset_step_bitmaps(job_ptr);

		/* Do not increase the job->node_cnt for completed jobs */
		if (! IS_JOB_COMPLETED(job_ptr))
			build_node_details(job_ptr, false); /* set node_addr */

		if (_reset_detail_bitmaps(job_ptr))
			job_fail = true;

		/* Any failure above aborts the job with a node-failure
		 * state appropriate to its current state */
		if (job_fail) {
			if (IS_JOB_PENDING(job_ptr)) {
				job_ptr->start_time =
					job_ptr->end_time = time(NULL);
				job_ptr->job_state = JOB_NODE_FAIL;
			} else if (IS_JOB_RUNNING(job_ptr)) {
				job_ptr->end_time = time(NULL);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
			} else if (IS_JOB_SUSPENDED(job_ptr)) {
				job_ptr->end_time = job_ptr->suspend_time;
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
				build_cg_bitmap(job_ptr);
				job_ptr->tot_sus_time +=
					difftime(now, job_ptr->suspend_time);
				jobacct_storage_g_job_suspend(acct_db_conn,
							      job_ptr);
			}
			job_ptr->state_reason = FAIL_DOWN_NODE;
			xfree(job_ptr->state_desc);
			job_completion_logger(job_ptr, false);
			if (job_ptr->job_state == JOB_NODE_FAIL) {
				/* build_cg_bitmap() may clear JOB_COMPLETING */
				epilog_slurmctld(job_ptr);
			}
		}
	}

	list_iterator_reset(job_iterator);
	/* This will reinitialize the select plugin database, which
	 * we can only do after ALL job's states and bitmaps are set
	 * (i.e. it needs to be in this second loop) */
	while ((job_ptr = list_next(job_iterator))) {
		if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) {
			error("select_g_select_nodeinfo_set(%pJ): %m",
			      job_ptr);
		}
	}
	list_iterator_destroy(job_iterator);

	last_job_update = now;
}
11229
/*
 * Rebuild the required/excluded node bitmaps in the job's details record
 * from their saved node-name strings.
 * RET SLURM_SUCCESS, or SLURM_ERROR if either node list is invalid.
 */
static int _reset_detail_bitmaps(job_record_t *job_ptr)
{
	struct job_details *detail_ptr = job_ptr->details;

	if (!detail_ptr)
		return SLURM_SUCCESS;

	FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
	if (detail_ptr->req_nodes &&
	    node_name2bitmap(detail_ptr->req_nodes, false,
			     &detail_ptr->req_node_bitmap)) {
		error("Invalid req_nodes (%s) for %pJ",
		      detail_ptr->req_nodes, job_ptr);
		return SLURM_ERROR;
	}

	FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
	if (detail_ptr->exc_nodes &&
	    node_name2bitmap(detail_ptr->exc_nodes, true,
			     &detail_ptr->exc_node_bitmap)) {
		error("Invalid exc_nodes (%s) for %pJ",
		      detail_ptr->exc_nodes, job_ptr);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
11256
/* Rebuild each step's node bitmap from its saved step layout node list;
 * delete any step whose node list is invalid or missing. */
static void _reset_step_bitmaps(job_record_t *job_ptr)
{
	ListIterator step_iterator;
	step_record_t *step_ptr;

	step_iterator = list_iterator_create (job_ptr->step_list);
	while ((step_ptr = list_next(step_iterator))) {
		/* Steps that never started have no node bitmap to rebuild */
		if (step_ptr->state < JOB_RUNNING)
			continue;
		FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
		if (step_ptr->step_layout &&
		    step_ptr->step_layout->node_list &&
		    (node_name2bitmap(step_ptr->step_layout->node_list, false,
				      &step_ptr->step_node_bitmap))) {
			error("Invalid step_node_list (%s) for %pS",
			      step_ptr->step_layout->node_list, step_ptr);
			delete_step_record (job_ptr, step_ptr->step_id);
		} else if (step_ptr->step_node_bitmap == NULL) {
			error("Missing node_list for %pS", step_ptr);
			delete_step_record (job_ptr, step_ptr->step_id);
		}
	}

	list_iterator_destroy (step_iterator);
	return;
}
11283
11284 /* update first assigned job id as needed on reconfigure */
reset_first_job_id(void)11285 void reset_first_job_id(void)
11286 {
11287 xassert(verify_lock(CONF_LOCK, READ_LOCK));
11288 job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
11289 }
11290
11291 /*
11292 * Return the next available job_id to be used.
11293 *
11294 * IN test_only - if true, doesn't advance the job_id sequence, just returns
11295 * what the next job id will be.
11296 * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
11297 */
get_next_job_id(bool test_only)11298 extern uint32_t get_next_job_id(bool test_only)
11299 {
11300 int i;
11301 uint32_t new_id, max_jobs, tmp_id_sequence;
11302
11303 xassert(verify_lock(JOB_LOCK, READ_LOCK));
11304 xassert(test_only || verify_lock(JOB_LOCK, WRITE_LOCK));
11305 xassert(verify_lock(FED_LOCK, READ_LOCK));
11306
11307 max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id;
11308 tmp_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
11309
11310 /* Ensure no conflict in job id if we roll over 32 bits */
11311 for (i = 0; i < max_jobs; i++) {
11312 if (++tmp_id_sequence >= slurmctld_conf.max_job_id)
11313 tmp_id_sequence = slurmctld_conf.first_job_id;
11314
11315 new_id = fed_mgr_get_job_id(tmp_id_sequence);
11316
11317 if (find_job_record(new_id))
11318 continue;
11319 if (_dup_job_file_test(new_id))
11320 continue;
11321
11322 if (!test_only)
11323 job_id_sequence = tmp_id_sequence;
11324
11325 return new_id;
11326 }
11327
11328 error("We have exhausted our supply of valid job id values. "
11329 "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id,
11330 slurmctld_conf.max_job_id);
11331 return SLURM_ERROR;
11332 }
11333
11334 /*
11335 * _set_job_id - set a default job_id, ensure that it is unique
11336 * IN job_ptr - pointer to the job_record
11337 */
_set_job_id(job_record_t * job_ptr)11338 static int _set_job_id(job_record_t *job_ptr)
11339 {
11340 uint32_t new_id;
11341
11342 xassert(job_ptr);
11343 xassert (job_ptr->magic == JOB_MAGIC);
11344
11345 if ((new_id = get_next_job_id(false)) != SLURM_ERROR) {
11346 job_ptr->job_id = new_id;
11347 /* When we get a new job id might as well make sure
11348 * the db_index is 0 since there is no way it will be
11349 * correct otherwise :). */
11350 job_ptr->db_index = 0;
11351 return SLURM_SUCCESS;
11352 }
11353
11354 job_ptr->job_id = NO_VAL;
11355 return EAGAIN;
11356 }
11357
11358
11359 /*
11360 * set_job_prio - set a default job priority
11361 * IN job_ptr - pointer to the job_record
11362 */
set_job_prio(job_record_t * job_ptr)11363 extern void set_job_prio(job_record_t *job_ptr)
11364 {
11365 uint32_t relative_prio;
11366
11367 xassert(job_ptr);
11368 xassert (job_ptr->magic == JOB_MAGIC);
11369
11370 if (IS_JOB_FINISHED(job_ptr))
11371 return;
11372 job_ptr->priority = slurm_sched_g_initial_priority(lowest_prio,
11373 job_ptr);
11374 if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio))
11375 return;
11376
11377 relative_prio = job_ptr->priority;
11378 if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) {
11379 int64_t offset = job_ptr->details->nice;
11380 offset -= NICE_OFFSET;
11381 relative_prio += offset;
11382 }
11383 lowest_prio = MIN(relative_prio, lowest_prio);
11384 }
11385
11386 /* After recovering job state, if using priority/basic then we increment the
11387 * priorities of all jobs to avoid decrementing the base down to zero */
sync_job_priorities(void)11388 extern void sync_job_priorities(void)
11389 {
11390 ListIterator job_iterator;
11391 job_record_t *job_ptr;
11392 uint32_t prio_boost = 0;
11393
11394 if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY))
11395 prio_boost = TOP_PRIORITY - highest_prio;
11396 if (xstrcmp(slurmctld_conf.priority_type, "priority/basic") ||
11397 (prio_boost < 1000000))
11398 return;
11399
11400 job_iterator = list_iterator_create(job_list);
11401 while ((job_ptr = list_next(job_iterator))) {
11402 if ((job_ptr->priority) && (job_ptr->direct_set_prio == 0))
11403 job_ptr->priority += prio_boost;
11404 }
11405 list_iterator_destroy(job_iterator);
11406 lowest_prio += prio_boost;
11407 }
11408
/*
 * _top_priority - determine if any other job has a higher priority than the
 *	specified job
 * IN job_ptr - pointer to selected job
 * IN het_job_offset - offset of this component within its hetjob, or NO_VAL
 * RET true if selected job has highest priority
 */
static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset)
{
	struct job_details *detail_ptr = job_ptr->details;
	time_t now = time(NULL);
	int pend_time;
	bool top;

	if (job_ptr->priority == 0)	/* user held */
		top = false;
	else {
		ListIterator job_iterator;
		job_record_t *job_ptr2;

		top = true;	/* assume top priority until found otherwise */
		job_iterator = list_iterator_create(job_list);
		while ((job_ptr2 = list_next(job_iterator))) {
			if (job_ptr2 == job_ptr)
				continue;
			/* Skip other components of the same hetjob */
			if ((het_job_offset != NO_VAL) && (job_ptr->job_id ==
			    (job_ptr2->job_id + het_job_offset)))
				continue;
			if (!IS_JOB_PENDING(job_ptr2))
				continue;
			if (IS_JOB_COMPLETING(job_ptr2)) {
				/* Job is hung in pending & completing state,
				 * indicative of job requeue */
				continue;
			}

			/* NOTE(review): jobs pending less than
			 * bf_min_age_reserve are ignored here, presumably to
			 * match backfill's reservation policy -- confirm */
			if (bf_min_age_reserve) {
				if (job_ptr2->details->begin_time == 0)
					continue;
				pend_time = difftime(now, job_ptr2->
						     details->begin_time);
				if (pend_time < bf_min_age_reserve)
					continue;
			}
			/* Only jobs currently able to run can block us */
			if (!acct_policy_job_runnable_state(job_ptr2) ||
			    !misc_policy_job_runnable_state(job_ptr2) ||
			    !part_policy_job_runnable_state(job_ptr2) ||
			    !job_independent(job_ptr2))
				continue;

			if (!xstrcmp(job_ptr2->resv_name, job_ptr->resv_name) ||
			    (job_ptr2->resv_ptr &&
			     (job_ptr->warn_time <=
			      job_ptr2->resv_ptr->max_start_delay) &&
			     (job_ptr->warn_flags & KILL_JOB_RESV))) {
				/* same reservation */
				if (job_ptr2->priority <= job_ptr->priority)
					continue;
				top = false;
				break;
			} else if ((job_ptr2->resv_name &&
				    (!job_ptr->resv_name)) ||
				   ((!job_ptr2->resv_name) &&
				    job_ptr->resv_name))
				continue;	/* different reservation */


			if (bb_g_job_test_stage_in(job_ptr2, true) != 1)
				continue;	/* Waiting for buffer */

			if (job_ptr2->part_ptr == job_ptr->part_ptr) {
				/* same partition */
				if (job_ptr2->priority <= job_ptr->priority)
					continue;
				top = false;
				break;
			}
			if (bit_overlap_any(job_ptr->part_ptr->node_bitmap,
					    job_ptr2->part_ptr->node_bitmap) == 0)
				continue;	/* no node overlap in partitions */
			/* Overlapping partitions: compare partition priority
			 * tier first, then job priority */
			if ((job_ptr2->part_ptr->priority_tier >
			     job_ptr ->part_ptr->priority_tier) ||
			    ((job_ptr2->part_ptr->priority_tier ==
			      job_ptr ->part_ptr->priority_tier) &&
			     (job_ptr2->priority >  job_ptr->priority))) {
				top = false;
				break;
			}
		}
		list_iterator_destroy(job_iterator);
	}

	if ((!top) && detail_ptr) {	/* not top prio */
		if (job_ptr->priority == 0) {		/* user/admin hold */
			/* Preserve more specific hold/failure reasons */
			if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS
			    && (job_ptr->state_reason != WAIT_RESV_DELETED)
			    && (job_ptr->state_reason != FAIL_BURST_BUFFER_OP)
			    && (job_ptr->state_reason != FAIL_ACCOUNT)
			    && (job_ptr->state_reason != FAIL_QOS)
			    && (job_ptr->state_reason != WAIT_HELD)
			    && (job_ptr->state_reason != WAIT_HELD_USER)
			    && job_ptr->state_reason != WAIT_MAX_REQUEUE) {
				job_ptr->state_reason = WAIT_HELD;
				xfree(job_ptr->state_desc);
			}
		} else if (job_ptr->state_reason == WAIT_NO_REASON &&
			   het_job_offset == NO_VAL) {
			job_ptr->state_reason = WAIT_PRIORITY;
			xfree(job_ptr->state_desc);
		}
	}
	return top;
}
11521
/*
 * Move or merge the shrinking job's license requests into the expanding
 * job, rebuilding the expanding job's license list as needed.
 */
static void _merge_job_licenses(job_record_t *shrink_job_ptr,
				job_record_t *expand_job_ptr)
{
	xassert(shrink_job_ptr);
	xassert(expand_job_ptr);

	/* FIXME: do we really need to update accounting here? It
	 * might already happen */

	if (!shrink_job_ptr->licenses)	/* No licenses to add */
		return;

	if (!expand_job_ptr->licenses) {
		/* Expanding job has none: transfer license state wholesale */
		expand_job_ptr->licenses = shrink_job_ptr->licenses;
		shrink_job_ptr->licenses = NULL;
		FREE_NULL_LIST(expand_job_ptr->license_list);
		expand_job_ptr->license_list = shrink_job_ptr->license_list;
		shrink_job_ptr->license_list = NULL;
		return;
	}

	/* Both hold licenses: concatenate the strings and rebuild the
	 * expanding job's license list from the merged string */
	xstrcat(expand_job_ptr->licenses, ",");
	xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses);
	xfree(shrink_job_ptr->licenses);
	FREE_NULL_LIST(expand_job_ptr->license_list);
	FREE_NULL_LIST(shrink_job_ptr->license_list);
	license_job_merge(expand_job_ptr);
}
11552
/* Hold a single job record: zero its priority in every partition and
 * stop accrue-time accounting while it is pending. */
static void _hold_job_rec(job_record_t *job_ptr, uid_t uid)
{
	job_ptr->direct_set_prio = 1;
	job_ptr->priority = 0;

	if (IS_JOB_PENDING(job_ptr))
		acct_policy_remove_accrue_time(job_ptr, false);

	if (job_ptr->part_ptr_list && job_ptr->priority_array) {
		int idx, part_cnt = list_count(job_ptr->part_ptr_list);
		/* Zero the per-partition priorities as well */
		for (idx = 0; idx < part_cnt; idx++)
			job_ptr->priority_array[idx] = 0;
	}
	sched_info("%s: hold on %pJ by uid %u", __func__, job_ptr, uid);
}
11571
/* Hold a job; if whole-hetjob operations are enabled and this job belongs
 * to a heterogeneous job, hold every component of that hetjob. */
static void _hold_job(job_record_t *job_ptr, uid_t uid)
{
	job_record_t *leader = NULL;

	if (job_ptr->het_job_id && _get_whole_hetjob())
		leader = find_job_record(job_ptr->het_job_id);

	if (leader && leader->het_job_list) {
		ListIterator iter =
			list_iterator_create(leader->het_job_list);
		job_record_t *component;

		while ((component = list_next(iter)))
			_hold_job_rec(component, uid);
		list_iterator_destroy(iter);
	} else {
		_hold_job_rec(job_ptr, uid);
	}
}
11588
/* Release the hold on a single job record: restore its priority, clear
 * hold-related state, and resubmit any federation sibling jobs. */
static void _release_job_rec(job_record_t *job_ptr, uid_t uid)
{
	time_t now = time(NULL);

	/* A begin time already in the past is stale; clear it so it is
	 * recomputed when the job is next considered */
	if (job_ptr->details && (job_ptr->details->begin_time < now))
		job_ptr->details->begin_time = 0;

	job_ptr->direct_set_prio = 0;
	set_job_prio(job_ptr);

	job_ptr->state_reason = WAIT_NO_REASON;
	job_ptr->state_reason_prev = WAIT_NO_REASON;
	job_ptr->job_state &= ~JOB_SPECIAL_EXIT;
	xfree(job_ptr->state_desc);
	job_ptr->exit_code = 0;

	fed_mgr_job_requeue(job_ptr);	/* submit sibling jobs */
	sched_info("%s: release hold on %pJ by uid %u",
		   __func__, job_ptr, uid);
}
11605
/* Release a job; if whole-hetjob operations are enabled and this job
 * belongs to a heterogeneous job, release every component of it. */
static void _release_job(job_record_t *job_ptr, uid_t uid)
{
	job_record_t *leader = NULL;

	if (job_ptr->het_job_id && _get_whole_hetjob())
		leader = find_job_record(job_ptr->het_job_id);

	if (leader && leader->het_job_list) {
		ListIterator iter =
			list_iterator_create(leader->het_job_list);
		job_record_t *component;

		while ((component = list_next(iter)))
			_release_job_rec(component, uid);
		list_iterator_destroy(iter);
	} else {
		_release_job_rec(job_ptr, uid);
	}
}
11622
/*
 * Gets a new association giving priority to the given parameters in job_desc,
 * and if not possible using the job_ptr ones.
 * IN job_desc: The new job description to use for getting the assoc_ptr.
 * IN job_ptr: The original job_ptr to use when parameters are not in job_desc.
 * RET assoc_rec, the new association combining the most updated information
 * from job_desc; NULL on failure with errno set to the Slurm error code.
 */
static slurmdb_assoc_rec_t *_retrieve_new_assoc(job_desc_msg_t *job_desc,
						job_record_t *job_ptr)
{
	slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;

	memset(&assoc_rec, 0, sizeof(assoc_rec));

	/* Partition: prefer the one being requested, else the job's own */
	if (job_desc->partition) {
		part_record_t *part_ptr = NULL;
		int error_code =
			_get_job_parts(job_desc, &part_ptr, NULL, NULL);
		/* We don't need this we only care about part_ptr */
		if (error_code != SLURM_SUCCESS) {
			errno = error_code;
			return NULL;
		} else if (!(part_ptr->state_up & PARTITION_SUBMIT)) {
			errno = ESLURM_PARTITION_NOT_AVAIL;
			return NULL;
		}

		assoc_rec.partition = part_ptr->name;
	} else if (job_ptr->part_ptr)
		assoc_rec.partition = job_ptr->part_ptr->name;

	/* Account: prefer the requested one, else the job's current one */
	if (job_desc->account)
		assoc_rec.acct = job_desc->account;
	else
		assoc_rec.acct = job_ptr->account;

	assoc_rec.uid = job_ptr->user_id;

	if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
				    accounting_enforce,
				    &assoc_ptr, false)) {
		info("%s: invalid account %s for %pJ",
		     __func__, assoc_rec.acct, job_ptr);
		errno = ESLURM_INVALID_ACCOUNT;
		return NULL;
	} else if (association_based_accounting &&
		   !assoc_ptr &&
		   !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) &&
		   assoc_rec.acct) {
		/* if not enforcing associations we want to look for
		 * the default account and use it to avoid getting
		 * trash in the accounting records.
		 */
		assoc_rec.acct = NULL;
		(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
					       accounting_enforce,
					       &assoc_ptr, false);
	}

	return assoc_ptr;
}
11685
11686 /* Allocate nodes to new job. Old job info will be cleared at epilog complete */
_realloc_nodes(job_record_t * job_ptr,bitstr_t * orig_node_bitmap)11687 static void _realloc_nodes(job_record_t *job_ptr, bitstr_t *orig_node_bitmap)
11688 {
11689 int i, i_first, i_last;
11690 node_record_t *node_ptr;
11691
11692 xassert(job_ptr);
11693 xassert(orig_node_bitmap);
11694 if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap)
11695 return;
11696 i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
11697 if (i_first >= 0)
11698 i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
11699 else
11700 i_last = -1;
11701 for (i = i_first; i <= i_last; i++) {
11702 if (!bit_test(job_ptr->job_resrcs->node_bitmap, i) ||
11703 bit_test(orig_node_bitmap, i))
11704 continue;
11705 node_ptr = node_record_table_ptr + i;
11706 make_node_alloc(node_ptr, job_ptr);
11707 }
11708 }
11709
permit_job_expansion(void)11710 extern bool permit_job_expansion(void)
11711 {
11712 static time_t sched_update = 0;
11713 static bool permit_job_expansion = false;
11714
11715 if (sched_update != slurmctld_conf.last_update) {
11716 char *sched_params = slurm_get_sched_params();
11717 sched_update = slurmctld_conf.last_update;
11718 if (xstrcasestr(sched_params, "permit_job_expansion"))
11719 permit_job_expansion = true;
11720 else
11721 permit_job_expansion = false;
11722 xfree(sched_params);
11723 }
11724
11725 return permit_job_expansion;
11726 }
11727
permit_job_shrink(void)11728 extern bool permit_job_shrink(void)
11729 {
11730 static time_t sched_update = 0;
11731 static bool permit_job_shrink = false;
11732
11733 if (sched_update != slurmctld_conf.last_update) {
11734 char *sched_params = slurm_get_sched_params();
11735 sched_update = slurmctld_conf.last_update;
11736 if (xstrcasestr(sched_params, "disable_job_shrink"))
11737 permit_job_shrink = false;
11738 else
11739 permit_job_shrink = true;
11740 xfree(sched_params);
11741 }
11742
11743 return permit_job_shrink;
11744 }
11745
_update_job(job_record_t * job_ptr,job_desc_msg_t * job_specs,uid_t uid)11746 static int _update_job(job_record_t *job_ptr, job_desc_msg_t *job_specs,
11747 uid_t uid)
11748 {
11749 int error_code = SLURM_SUCCESS;
11750 enum job_state_reason fail_reason;
11751 bool operator = false;
11752 bool is_coord_oldacc = false, is_coord_newacc = false;
11753 uint32_t save_min_nodes = 0, save_max_nodes = 0;
11754 uint32_t save_min_cpus = 0, save_max_cpus = 0;
11755 struct job_details *detail_ptr;
11756 part_record_t *new_part_ptr = NULL, *use_part_ptr = NULL;
11757 bitstr_t *exc_bitmap = NULL, *new_req_bitmap = NULL;
11758 time_t now = time(NULL);
11759 multi_core_data_t *mc_ptr = NULL;
11760 bool update_accounting = false, new_req_bitmap_given = false;
11761 acct_policy_limit_set_t acct_policy_limit_set;
11762 uint16_t tres[slurmctld_tres_cnt];
11763 bool acct_limit_already_exceeded;
11764 bool tres_changed = false;
11765 int tres_pos;
11766 uint64_t tres_req_cnt[slurmctld_tres_cnt];
11767 bool tres_req_cnt_set = false, valid_licenses = false;
11768 List gres_list = NULL;
11769 List license_list = NULL;
11770 List part_ptr_list = NULL;
11771 uint32_t orig_time_limit;
11772 bool gres_update = false;
11773 slurmdb_assoc_rec_t *new_assoc_ptr = NULL, *use_assoc_ptr = NULL;
11774 slurmdb_qos_rec_t *new_qos_ptr = NULL, *use_qos_ptr = NULL;
11775 slurmctld_resv_t *new_resv_ptr = NULL;
11776 uint32_t user_site_factor;
11777
11778 assoc_mgr_lock_t locks = { .tres = READ_LOCK };
11779
11780 /*
11781 * This means we are in the middle of requesting the db_inx from the
11782 * database. So we can't update right now. You should try again outside
11783 * the job_write lock in a second or so.
11784 */
11785 if (job_ptr->db_index == NO_VAL64)
11786 return ESLURM_JOB_SETTING_DB_INX;
11787
11788 operator = validate_operator(uid);
11789 if (job_specs->burst_buffer) {
11790 /*
11791 * burst_buffer contents are validated at job submit time and
11792 * data is possibly being staged at later times. It can not
11793 * be changed except to clear the value on a completed job and
11794 * purge the record in order to recover from a failure mode
11795 */
11796 if (IS_JOB_COMPLETED(job_ptr) && operator &&
11797 (job_specs->burst_buffer[0] == '\0')) {
11798 xfree(job_ptr->burst_buffer);
11799 last_job_update = now;
11800 } else {
11801 error_code = ESLURM_NOT_SUPPORTED;
11802 }
11803 }
11804 if (error_code != SLURM_SUCCESS)
11805 goto fini;
11806
11807 if (job_specs->array_inx && job_ptr->array_recs) {
11808 int throttle;
11809 throttle = strtoll(job_specs->array_inx, (char **) NULL, 10);
11810 if (throttle >= 0) {
11811 info("%s: set max_run_tasks to %d for job array %pJ",
11812 __func__, throttle, job_ptr);
11813 job_ptr->array_recs->max_run_tasks = throttle;
11814 } else {
11815 info("%s: invalid max_run_tasks of %d for job array %pJ, ignored",
11816 __func__, throttle, job_ptr);
11817 error_code = ESLURM_BAD_TASK_COUNT;
11818 }
11819 /*
11820 * Even if the job is complete, permit changing
11821 * ArrayTaskThrottle for other elements of the task array
11822 */
11823 if (IS_JOB_FINISHED(job_ptr))
11824 goto fini;
11825 }
11826
11827 if (IS_JOB_FINISHED(job_ptr)) {
11828 error_code = ESLURM_JOB_FINISHED;
11829 goto fini;
11830 }
11831
11832 /*
11833 * Validate before job_submit_plugin_modify() so that the job_submit
11834 * plugin can make changes to the field without triggering an auth
11835 * issue.
11836 */
11837 if (job_specs->admin_comment && !validate_super_user(uid)) {
11838 error("Attempt to change admin_comment for %pJ", job_ptr);
11839 error_code = ESLURM_ACCESS_DENIED;
11840 goto fini;
11841 }
11842
11843 /* Save before submit plugin potentially modifies it. */
11844 user_site_factor = job_specs->site_factor;
11845
11846 if (job_specs->user_id == NO_VAL) {
11847 /*
11848 * Used by job_submit/lua to find default partition and
11849 * access control logic below to validate partition change
11850 */
11851 job_specs->user_id = job_ptr->user_id;
11852 }
11853 error_code = job_submit_plugin_modify(job_specs, job_ptr,
11854 (uint32_t) uid);
11855 if (error_code != SLURM_SUCCESS)
11856 return error_code;
11857 error_code = node_features_g_job_valid(job_specs->features);
11858 if (error_code != SLURM_SUCCESS)
11859 return error_code;
11860
11861 error_code = _test_job_desc_fields(job_specs);
11862 if (error_code != SLURM_SUCCESS)
11863 return error_code;
11864
11865 memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
11866 acct_policy_limit_set.tres = tres;
11867
11868 if (operator) {
11869 /* set up the acct_policy if we are at least an operator */
11870 for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++)
11871 acct_policy_limit_set.tres[tres_pos] = ADMIN_SET_LIMIT;
11872 acct_policy_limit_set.time = ADMIN_SET_LIMIT;
11873 acct_policy_limit_set.qos = ADMIN_SET_LIMIT;
11874 } else
11875 memset(tres, 0, sizeof(tres));
11876
11877 /* Check authorization for modifying this job */
11878 is_coord_oldacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
11879 job_ptr->account);
11880 is_coord_newacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
11881 job_specs->account);
11882 if ((job_ptr->user_id != uid) && !operator) {
11883 /*
11884 * Fail if we are not coordinators of the current account or
11885 * if we are changing an account and we are not coordinators
11886 * of both src and dest accounts.
11887 */
11888 if (!is_coord_oldacc ||
11889 (!is_coord_newacc && job_specs->account)) {
11890 error("Security violation, JOB_UPDATE RPC from uid %d",
11891 uid);
11892 return ESLURM_USER_ID_MISSING;
11893 }
11894 }
11895
11896 detail_ptr = job_ptr->details;
11897 if (detail_ptr)
11898 mc_ptr = detail_ptr->mc_ptr;
11899 last_job_update = now;
11900
11901 /*
11902 * Check to see if the new requested job_specs exceeds any
11903 * existing limit. If it passes, cool, we will check the new
11904 * association/qos/part later in the code and fail if it is wrong.
11905 *
11906 * If it doesn't pass this mean some limit was exceededed before the
11907 * update request so let's keep the user continue screwing up herself
11908 * with the limit if it is what she wants. We do this by not exiting
11909 * on the later call to acct_policy_validate() if it fails.
11910 *
	 * We will also prevent the update from returning an error code that is
11912 * confusing since many things could successfully update and we are now
11913 * just already violating a limit. The job won't be allowed to run,
11914 * but it will allow the update to happen which is most likely what
11915 * was desired.
11916 *
11917 * Changes in between this check and the next acct_policy_validate()
11918 * will not be constrained to accounting enforce limits.
11919 */
11920 orig_time_limit = job_specs->time_limit;
11921
11922 memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt));
11923 job_specs->tres_req_cnt = tres_req_cnt;
11924 tres_req_cnt_set = true;
11925
11926 acct_limit_already_exceeded = false;
11927
11928 if (!operator && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
11929 if (!acct_policy_validate(job_specs, job_ptr->part_ptr,
11930 job_ptr->assoc_ptr, job_ptr->qos_ptr,
11931 NULL, &acct_policy_limit_set,
11932 true)) {
11933 debug("%s: already exceeded association's cpu, node, "
11934 "memory or time limit for user %u",
11935 __func__, job_specs->user_id);
11936 acct_limit_already_exceeded = true;
11937 }
11938 job_specs->time_limit = orig_time_limit;
11939 }
11940
11941 /*
11942 * The partition, assoc, qos, reservation, and req_node_bitmap all have
11943 * to be set before checking later. So here we set them into temporary
	 * variables that are applied to the job much later.
11945 */
11946 if (job_specs->partition &&
11947 !xstrcmp(job_specs->partition, job_ptr->partition)) {
11948 sched_debug("%s: new partition identical to old partition %pJ",
11949 __func__, job_ptr);
11950 } else if (job_specs->partition) {
11951 if (!IS_JOB_PENDING(job_ptr)) {
11952 error_code = ESLURM_JOB_NOT_PENDING;
11953 goto fini;
11954 }
11955
11956 error_code = _get_job_parts(job_specs,
11957 &new_part_ptr,
11958 &part_ptr_list, NULL);
11959
11960 if (error_code != SLURM_SUCCESS)
11961 ;
11962 else if ((new_part_ptr->state_up & PARTITION_SUBMIT) == 0)
11963 error_code = ESLURM_PARTITION_NOT_AVAIL;
11964 else if (!part_ptr_list &&
11965 !xstrcmp(new_part_ptr->name, job_ptr->partition)) {
11966 sched_debug("%s: 2 new partition identical to old partition %pJ",
11967 __func__, job_ptr);
11968 new_part_ptr = NULL;
11969 }
11970 if (error_code != SLURM_SUCCESS)
11971 goto fini;
11972 }
11973
11974 use_part_ptr = new_part_ptr ? new_part_ptr : job_ptr->part_ptr;
11975
11976 /* Check the account and the partition as both affect the association */
11977 if (job_specs->account || new_part_ptr) {
11978 if (!IS_JOB_PENDING(job_ptr))
11979 error_code = ESLURM_JOB_NOT_PENDING;
11980 else {
11981 new_assoc_ptr = _retrieve_new_assoc(job_specs, job_ptr);
11982
11983 if (!new_assoc_ptr)
11984 error_code = errno;
11985 else if (new_assoc_ptr == job_ptr->assoc_ptr) {
11986 new_assoc_ptr = NULL;
11987 sched_debug("%s: new association identical to old association %u",
11988 __func__, job_ptr->job_id);
11989 }
11990
11991 /*
11992 * Clear errno that may have been set by
11993 * _retrieve_new_assoc.
11994 */
11995 errno = 0;
11996 }
11997
11998 if (error_code != SLURM_SUCCESS)
11999 goto fini;
12000 }
12001
12002 use_assoc_ptr = new_assoc_ptr ? new_assoc_ptr : job_ptr->assoc_ptr;
12003
12004 if (job_specs->qos) {
12005 slurmdb_qos_rec_t qos_rec;
12006 char *resv_name;
12007
12008 if (job_specs->reservation
12009 && job_specs->reservation[0] != '\0')
12010 resv_name = job_specs->reservation;
12011 else
12012 resv_name = job_ptr->resv_name;
12013
12014 memset(&qos_rec, 0, sizeof(qos_rec));
12015
12016 /* If the qos is blank that means we want the default */
12017 if (job_specs->qos[0])
12018 qos_rec.name = job_specs->qos;
12019
12020 new_qos_ptr = _determine_and_validate_qos(
12021 resv_name, use_assoc_ptr,
12022 operator, &qos_rec, &error_code, false,
12023 LOG_LEVEL_ERROR);
12024 if ((error_code == SLURM_SUCCESS) && new_qos_ptr) {
12025 if (job_ptr->qos_ptr == new_qos_ptr) {
12026 sched_debug("%s: new QOS identical to old QOS %pJ",
12027 __func__, job_ptr);
12028 new_qos_ptr = NULL;
12029 } else if (!IS_JOB_PENDING(job_ptr)) {
12030 error_code = ESLURM_JOB_NOT_PENDING;
12031 new_qos_ptr = NULL;
12032 }
12033 }
12034
12035 if (error_code != SLURM_SUCCESS)
12036 goto fini;
12037 }
12038
12039 use_qos_ptr = new_qos_ptr ? new_qos_ptr : job_ptr->qos_ptr;
12040
12041 if (job_specs->bitflags & RESET_ACCRUE_TIME) {
12042 if (!IS_JOB_PENDING(job_ptr) || !detail_ptr) {
12043 error_code = ESLURM_JOB_NOT_PENDING;
12044 goto fini;
12045 } else
12046 acct_policy_remove_accrue_time(job_ptr, false);
12047 }
12048
12049 /*
12050 * Must check req_nodes to set the job_ptr->details->req_node_bitmap
12051 * before we validate it later.
12052 */
12053 if (job_specs->req_nodes &&
12054 (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
12055 /*
		 * Use req_nodes to change the nodes associated with a running
		 * job, for lack of another field in the job request to use
12058 */
12059 if (!permit_job_shrink()) {
12060 error("%s: request to shrink %pJ denied by configuration",
12061 __func__, job_ptr);
12062 error_code = ESLURM_NOT_SUPPORTED;
12063 goto fini;
12064 } else if ((job_specs->req_nodes[0] == '\0') ||
12065 node_name2bitmap(job_specs->req_nodes,
12066 false, &new_req_bitmap) ||
12067 !bit_super_set(new_req_bitmap, job_ptr->node_bitmap) ||
12068 (job_ptr->details && job_ptr->details->expanding_jobid)) {
12069 sched_info("%s: Invalid node list (%s) for %pJ update",
12070 __func__, job_specs->req_nodes, job_ptr);
12071 error_code = ESLURM_INVALID_NODE_NAME;
12072 goto fini;
12073 } else if (new_req_bitmap) {
12074 int i, i_first, i_last;
12075 node_record_t *node_ptr;
12076 bitstr_t *rem_nodes;
12077
12078 /*
12079 * They requested a new list of nodes for the job. If
12080 * the batch host isn't in this list, then deny this
12081 * request.
12082 */
12083 if (job_ptr->batch_flag) {
12084 bitstr_t *batch_host_bitmap;
12085 if (node_name2bitmap(job_ptr->batch_host, false,
12086 &batch_host_bitmap))
12087 error("%s: Invalid batch host %s for %pJ; this should never happen",
12088 __func__, job_ptr->batch_host,
12089 job_ptr);
12090 else if (!bit_overlap_any(batch_host_bitmap,
12091 new_req_bitmap)) {
12092 error("%s: Batch host %s for %pJ is not in the requested node list %s. You cannot remove the batch host from a job when resizing.",
12093 __func__, job_ptr->batch_host,
12094 job_ptr, job_specs->req_nodes);
12095 error_code = ESLURM_INVALID_NODE_NAME;
12096 bit_free(batch_host_bitmap);
12097 goto fini;
12098 } else
12099 bit_free(batch_host_bitmap);
12100 }
12101
12102 sched_info("%s: setting nodes to %s for %pJ",
12103 __func__, job_specs->req_nodes, job_ptr);
12104 job_pre_resize_acctg(job_ptr);
12105 i_first = bit_ffs(job_ptr->node_bitmap);
12106 if (i_first >= 0)
12107 i_last = bit_fls(job_ptr->node_bitmap);
12108 else
12109 i_last = -2;
12110 rem_nodes = bit_alloc(bit_size(job_ptr->node_bitmap));
12111 for (i = i_first; i <= i_last; i++) {
12112 if (bit_test(new_req_bitmap, i) ||
12113 !bit_test(job_ptr->node_bitmap, i))
12114 continue;
12115 bit_set(rem_nodes, i);
12116 }
12117 #ifndef HAVE_FRONT_END
12118 abort_job_on_nodes(job_ptr, rem_nodes);
12119 #endif
12120 for (i = i_first; i <= i_last; i++) {
12121 if (!bit_test(rem_nodes, i))
12122 continue;
12123 node_ptr = node_record_table_ptr + i;
12124 kill_step_on_node(job_ptr, node_ptr, false);
12125 excise_node_from_job(job_ptr, node_ptr);
12126 }
12127 bit_free(rem_nodes);
12128 (void) gs_job_start(job_ptr);
12129 gres_build_job_details(job_ptr->gres_list,
12130 &job_ptr->gres_detail_cnt,
12131 &job_ptr->gres_detail_str,
12132 &job_ptr->gres_used);
12133 job_post_resize_acctg(job_ptr);
12134 /*
12135 * Since job_post_resize_acctg will restart
12136 * things, don't do it again.
12137 */
12138 update_accounting = false;
12139 } else {
12140 update_accounting = true;
12141 }
12142 FREE_NULL_BITMAP(new_req_bitmap);
12143 } else if (job_specs->req_nodes) {
12144 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12145 error_code = ESLURM_JOB_NOT_PENDING;
12146 else if (job_specs->req_nodes[0] == '\0')
12147 new_req_bitmap_given = true;
12148 else {
12149 if (node_name2bitmap(job_specs->req_nodes, false,
12150 &new_req_bitmap)) {
12151 sched_info("%s: Invalid node list for job_update: %s",
12152 __func__, job_specs->req_nodes);
12153 FREE_NULL_BITMAP(new_req_bitmap);
12154 error_code = ESLURM_INVALID_NODE_NAME;
12155 } else
12156 new_req_bitmap_given = true;
12157 }
12158 }
12159
12160 if (error_code != SLURM_SUCCESS)
12161 goto fini;
12162
12163 /* this needs to be after partition and QOS checks */
12164 if (job_specs->reservation
12165 && (!xstrcmp(job_specs->reservation, job_ptr->resv_name) ||
12166 (!job_ptr->resv_name && job_specs->reservation[0] == '\0'))) {
12167 sched_debug("%s: new reservation identical to old reservation %pJ",
12168 __func__, job_ptr);
12169 } else if (job_specs->reservation) {
12170 if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
12171 error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
12172 } else {
12173 job_record_t tmp_job_rec;
12174
12175 memcpy(&tmp_job_rec, job_ptr, sizeof(job_record_t));
12176 tmp_job_rec.resv_name = xstrdup(job_specs->reservation);
12177 tmp_job_rec.resv_ptr = NULL;
12178 tmp_job_rec.part_ptr = use_part_ptr;
12179 tmp_job_rec.qos_ptr = use_qos_ptr;
12180 tmp_job_rec.assoc_ptr = use_assoc_ptr;
12181
12182 error_code = validate_job_resv(&tmp_job_rec);
12183
12184 /*
12185 * It doesn't matter what this is, just set it as
12186 * failure will be NULL.
12187 */
12188 new_resv_ptr = tmp_job_rec.resv_ptr;
12189
12190 /*
12191 * Make sure this job isn't using a partition or QOS
12192 * that requires it to be in a reservation.
12193 */
12194 if ((error_code == SLURM_SUCCESS) && !new_resv_ptr) {
12195 if (use_part_ptr
12196 && use_part_ptr->flags & PART_FLAG_REQ_RESV)
12197 error_code = ESLURM_ACCESS_DENIED;
12198
12199 if (use_qos_ptr
12200 && use_qos_ptr->flags & QOS_FLAG_REQ_RESV)
12201 error_code = ESLURM_INVALID_QOS;
12202 }
12203
12204 xfree(tmp_job_rec.resv_name);
12205 }
12206 if (error_code != SLURM_SUCCESS)
12207 goto fini;
12208 }
12209
12210 if (job_specs->cpus_per_tres || job_specs->tres_per_job ||
12211 job_specs->tres_per_node || job_specs->tres_per_socket ||
12212 job_specs->tres_per_task || job_specs->mem_per_tres)
12213 gres_update = true;
12214 if (gres_update) {
12215 uint16_t orig_ntasks_per_socket = NO_VAL16;
12216 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
12217 (detail_ptr->expanding_jobid != 0)) {
12218 error_code = ESLURM_JOB_NOT_PENDING;
12219 goto fini;
12220 }
12221
12222 if (job_specs->num_tasks == NO_VAL)
12223 job_specs->num_tasks = detail_ptr->num_tasks;
12224 if (job_specs->min_nodes == NO_VAL)
12225 job_specs->min_nodes = detail_ptr->min_nodes;
12226 if (job_specs->max_nodes == NO_VAL)
12227 job_specs->max_nodes = detail_ptr->max_nodes;
12228 if (job_specs->ntasks_per_node == NO_VAL16)
12229 job_specs->ntasks_per_node = detail_ptr->ntasks_per_node;
12230 if ((job_specs->ntasks_per_socket == NO_VAL16) &&
12231 (detail_ptr->mc_ptr) &&
12232 (detail_ptr->mc_ptr->ntasks_per_socket != INFINITE16)) {
12233 job_specs->ntasks_per_socket =
12234 mc_ptr->ntasks_per_socket;
12235 orig_ntasks_per_socket = job_specs->ntasks_per_socket;
12236 }
12237 if (job_specs->cpus_per_task == NO_VAL16)
12238 job_specs->cpus_per_task = detail_ptr->cpus_per_task;
12239 gres_list = gres_plugin_job_state_dup(job_ptr->gres_list);
12240 if ((error_code = gres_plugin_job_state_validate(
12241 job_specs->cpus_per_tres,
12242 job_specs->tres_freq,
12243 job_specs->tres_per_job,
12244 job_specs->tres_per_node,
12245 job_specs->tres_per_socket,
12246 job_specs->tres_per_task,
12247 job_specs->mem_per_tres,
12248 &job_specs->num_tasks,
12249 &job_specs->min_nodes,
12250 &job_specs->max_nodes,
12251 &job_specs->ntasks_per_node,
12252 &job_specs->ntasks_per_socket,
12253 &job_specs->sockets_per_node,
12254 &job_specs->cpus_per_task,
12255 &gres_list))) {
12256 sched_info("%s: invalid GRES for %pJ",
12257 __func__, job_ptr);
12258 goto fini;
12259 }
12260 if (job_specs->num_tasks == detail_ptr->num_tasks)
12261 job_specs->num_tasks = NO_VAL; /* Unchanged */
12262 if (job_specs->min_nodes == detail_ptr->min_nodes)
12263 job_specs->min_nodes = NO_VAL; /* Unchanged */
12264 if (job_specs->max_nodes == detail_ptr->max_nodes)
12265 job_specs->max_nodes = NO_VAL; /* Unchanged */
12266 if (job_specs->ntasks_per_node == detail_ptr->ntasks_per_node)
12267 job_specs->ntasks_per_node = NO_VAL16; /* Unchanged */
12268 if (job_specs->ntasks_per_socket == orig_ntasks_per_socket)
12269 job_specs->ntasks_per_socket = NO_VAL16; /* Unchanged */
12270 if (job_specs->cpus_per_task == detail_ptr->cpus_per_task)
12271 job_specs->cpus_per_task = NO_VAL16; /* Unchanged */
12272 }
12273 if (gres_update) {
12274 gres_set_job_tres_cnt(gres_list, detail_ptr->min_nodes,
12275 job_specs->tres_req_cnt, false);
12276 }
12277
12278 if ((job_specs->min_nodes != NO_VAL) &&
12279 (job_specs->min_nodes != INFINITE)) {
12280 uint32_t min_cpus = (job_specs->pn_min_cpus != NO_VAL16 ?
12281 job_specs->pn_min_cpus : detail_ptr->pn_min_cpus) *
12282 job_specs->min_nodes;
12283 uint32_t num_cpus = job_specs->min_cpus != NO_VAL ?
12284 job_specs->min_cpus :
12285 job_ptr->tres_req_cnt[TRES_ARRAY_CPU];
12286 uint32_t num_tasks = job_specs->num_tasks != NO_VAL ?
12287 job_specs->num_tasks : detail_ptr->num_tasks;
12288
12289 if (!num_tasks) {
12290 num_tasks = detail_ptr->min_nodes;
12291
12292 } else if (num_tasks < job_specs->min_nodes) {
12293 info("%s: adjusting num_tasks (prev: %u) to be at least min_nodes: %u",
12294 __func__, num_tasks, job_specs->min_nodes);
12295 num_tasks = job_specs->min_nodes;
12296 if (IS_JOB_PENDING(job_ptr))
12297 job_specs->num_tasks = num_tasks;
12298 }
12299
12300 num_tasks *= job_specs->cpus_per_task != NO_VAL16 ?
12301 job_specs->cpus_per_task : detail_ptr->cpus_per_task;
12302 num_tasks = MAX(num_tasks, min_cpus);
12303 if (num_tasks > num_cpus) {
12304 info("%s: adjusting min_cpus (prev: %u) to be at least : %u",
12305 __func__, num_cpus, num_tasks);
12306 job_specs->min_cpus = num_tasks;
12307
12308 job_specs->pn_min_memory =
12309 job_specs->pn_min_memory != NO_VAL64 ?
12310 job_specs->pn_min_memory :
12311 detail_ptr->pn_min_memory;
12312 }
12313
12314 assoc_mgr_lock(&locks);
12315
12316 if (!job_specs->licenses) {
12317 license_set_job_tres_cnt(job_ptr->license_list,
12318 job_specs->tres_req_cnt,
12319 true);
12320 }
12321 assoc_mgr_unlock(&locks);
12322
12323
12324 job_specs->tres_req_cnt[TRES_ARRAY_NODE] = job_specs->min_nodes;
12325 }
12326
12327 if (job_specs->min_cpus != NO_VAL)
12328 job_specs->tres_req_cnt[TRES_ARRAY_CPU] = job_specs->min_cpus;
12329 else if ((job_specs->pn_min_cpus != NO_VAL16) &&
12330 (job_specs->pn_min_cpus != 0)) {
12331 job_specs->tres_req_cnt[TRES_ARRAY_CPU] =
12332 job_specs->pn_min_cpus *
12333 (job_specs->min_nodes != NO_VAL ?
12334 job_specs->min_nodes :
12335 detail_ptr ? detail_ptr->min_nodes : 1);
12336 job_specs->min_cpus = job_specs->tres_req_cnt[TRES_ARRAY_CPU];
12337 }
12338
12339 job_specs->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(NULL,
12340 job_specs->pn_min_memory,
12341 job_specs->tres_req_cnt[TRES_ARRAY_CPU] ?
12342 job_specs->tres_req_cnt[TRES_ARRAY_CPU] :
12343 job_ptr->tres_req_cnt[TRES_ARRAY_CPU],
12344 job_specs->min_nodes != NO_VAL ?
12345 job_specs->min_nodes :
12346 detail_ptr ? detail_ptr->min_nodes : 1);
12347
12348 if (job_specs->licenses && !xstrcmp(job_specs->licenses,
12349 job_ptr->licenses)) {
12350 sched_debug("%s: new licenses identical to old licenses \"%s\"",
12351 __func__, job_ptr->licenses);
12352 } else if (job_specs->licenses) {
12353 bool pending = IS_JOB_PENDING(job_ptr);
12354 license_list = license_validate(job_specs->licenses, true, true,
12355 pending ?
12356 job_specs->tres_req_cnt : NULL,
12357 &valid_licenses);
12358
12359 if (!valid_licenses) {
12360 sched_info("%s: invalid licenses: %s",
12361 __func__, job_specs->licenses);
12362 error_code = ESLURM_INVALID_LICENSES;
12363 }
12364 }
12365
12366 if (error_code != SLURM_SUCCESS)
12367 goto fini;
12368
12369 if (job_specs->exc_nodes && detail_ptr &&
12370 !xstrcmp(job_specs->exc_nodes, detail_ptr->exc_nodes)) {
12371 sched_debug("%s: new exc_nodes identical to old exc_nodes %s",
12372 __func__, job_specs->exc_nodes);
12373 } else if (job_specs->exc_nodes) {
12374 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12375 error_code = ESLURM_JOB_NOT_PENDING;
12376 else if (job_specs->exc_nodes[0] == '\0') {
12377 xfree(detail_ptr->exc_nodes);
12378 FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
12379 } else {
12380 if (node_name2bitmap(job_specs->exc_nodes, false,
12381 &exc_bitmap)) {
12382 sched_error("%s: Invalid node list for update of %pJ: %s",
12383 __func__, job_ptr,
12384 job_specs->exc_nodes);
12385 FREE_NULL_BITMAP(exc_bitmap);
12386 error_code = ESLURM_INVALID_NODE_NAME;
12387 }
12388 if (exc_bitmap) {
12389 xfree(detail_ptr->exc_nodes);
12390 detail_ptr->exc_nodes =
12391 xstrdup(job_specs->exc_nodes);
12392 FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
12393 detail_ptr->exc_node_bitmap = exc_bitmap;
12394 sched_info("%s: setting exc_nodes to %s for %pJ",
12395 __func__, job_specs->exc_nodes, job_ptr);
12396 }
12397 }
12398 }
12399 if (error_code != SLURM_SUCCESS)
12400 goto fini;
12401
12402 if (job_specs->min_nodes == INFINITE) {
12403 /* Used by scontrol just to get current configuration info */
12404 job_specs->min_nodes = NO_VAL;
12405 }
12406 if ((job_specs->min_nodes != NO_VAL) &&
12407 (job_specs->min_nodes > job_ptr->node_cnt) &&
12408 !permit_job_expansion() &&
12409 (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
12410 info("%s: Change of size for %pJ not supported", __func__,
12411 job_ptr);
12412 error_code = ESLURM_NOT_SUPPORTED;
12413 goto fini;
12414 }
12415
12416 if (job_specs->req_switch != NO_VAL) {
12417 job_ptr->req_switch = job_specs->req_switch;
12418 info("%s: Change of switches to %u %pJ",
12419 __func__, job_specs->req_switch, job_ptr);
12420 }
12421 if (job_specs->wait4switch != NO_VAL) {
12422 job_ptr->wait4switch = _max_switch_wait(job_specs->wait4switch);
12423 info("%s: Change of switch wait to %u secs %pJ",
12424 __func__, job_ptr->wait4switch, job_ptr);
12425 }
12426
12427 if (job_specs->admin_comment) {
12428 if (!validate_super_user(uid)) {
12429 error("%s: Attempt to change admin_comment for %pJ",
12430 __func__, job_ptr);
12431 error_code = ESLURM_ACCESS_DENIED;
12432 } else {
12433 xfree(job_ptr->admin_comment);
12434 job_ptr->admin_comment =
12435 xstrdup(job_specs->admin_comment);
12436 info("%s: setting admin_comment to %s for %pJ",
12437 __func__, job_ptr->admin_comment, job_ptr);
12438 }
12439 }
12440
12441 if (job_specs->comment) {
12442 xfree(job_ptr->comment);
12443 job_ptr->comment = xstrdup(job_specs->comment);
12444 info("%s: setting comment to %s for %pJ",
12445 __func__, job_ptr->comment, job_ptr);
12446 }
12447
12448 if (error_code != SLURM_SUCCESS)
12449 goto fini;
12450
12451 /*
12452 * Now that we know what the new part, qos, and association are going
12453 * to be lets check the limits.
12454 * If a limit was already exceeded before this update
12455 * request, let's assume it is expected and allow the change to happen.
12456 */
12457 if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
12458 if (!operator &&
12459 (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
12460 uint32_t acct_reason = 0;
12461 char *resv_orig = NULL;
12462 bool resv_reset = false, min_reset = false,
12463 max_reset = false,
12464 time_min_reset = false;
12465 if (!acct_policy_validate(job_specs, use_part_ptr,
12466 use_assoc_ptr, use_qos_ptr,
12467 &acct_reason,
12468 &acct_policy_limit_set,
12469 true)
12470 && !acct_limit_already_exceeded) {
12471 info("%s: exceeded association/QOS limit for user %u: %s",
12472 __func__, job_specs->user_id,
12473 job_reason_string(acct_reason));
12474 error_code = ESLURM_ACCOUNTING_POLICY;
12475 goto fini;
12476 }
12477 /*
12478 * We need to set the various parts of job_specs below
12479 * to something since _valid_job_part() will validate
12480 * them. Note the reservation part is validated in the
12481 * sub call to _part_access_check().
12482 */
12483 if (job_specs->min_nodes == NO_VAL) {
12484 job_specs->min_nodes = detail_ptr->min_nodes;
12485 min_reset = true;
12486 }
12487 if ((job_specs->max_nodes == NO_VAL) &&
12488 (detail_ptr->max_nodes != 0)) {
12489 job_specs->max_nodes = detail_ptr->max_nodes;
12490 max_reset = true;
12491 }
12492
12493 if ((job_specs->time_min == NO_VAL) &&
12494 (job_ptr->time_min != 0)) {
12495 job_specs->time_min = job_ptr->time_min;
12496 time_min_reset = true;
12497 }
12498
12499 /*
12500 * This always gets reset, so don't worry about tracking
12501 * it.
12502 */
12503 if (job_specs->time_limit == NO_VAL)
12504 job_specs->time_limit = job_ptr->time_limit;
12505
12506 if (!job_specs->reservation
12507 || job_specs->reservation[0] == '\0') {
12508 resv_reset = true;
12509 resv_orig = job_specs->reservation;
12510 job_specs->reservation = job_ptr->resv_name;
12511 }
12512
12513 if ((error_code = _valid_job_part(
12514 job_specs, uid,
12515 new_req_bitmap_given ?
12516 new_req_bitmap :
12517 job_ptr->details->req_node_bitmap,
12518 use_part_ptr,
12519 new_part_ptr ?
12520 part_ptr_list : job_ptr->part_ptr_list,
12521 use_assoc_ptr, use_qos_ptr)))
12522 goto fini;
12523
12524 if (min_reset)
12525 job_specs->min_nodes = NO_VAL;
12526 if (max_reset)
12527 job_specs->max_nodes = NO_VAL;
12528 if (time_min_reset)
12529 job_specs->time_min = NO_VAL;
12530 if (resv_reset)
12531 job_specs->reservation = resv_orig;
12532
12533 job_specs->time_limit = orig_time_limit;
12534 }
12535
12536 /*
12537 * Since we are successful to this point remove the job from the
12538 * old qos/assoc's
12539 */
12540 acct_policy_remove_job_submit(job_ptr);
12541 acct_policy_remove_accrue_time(job_ptr, false);
12542 }
12543
12544 if (new_qos_ptr) {
12545 /* Change QOS */
12546 job_ptr->qos_id = new_qos_ptr->id;
12547 job_ptr->qos_ptr = new_qos_ptr;
12548 job_ptr->limit_set.qos = acct_policy_limit_set.qos;
12549
12550 if (job_ptr->state_reason == FAIL_QOS) {
12551 job_ptr->state_reason = WAIT_NO_REASON;
12552 xfree(job_ptr->state_desc);
12553 }
12554
12555 info("%s: setting QOS to %s for %pJ",
12556 __func__, new_qos_ptr->name, job_ptr);
12557 }
12558
12559 if (new_assoc_ptr) {
12560 /* Change account/association */
12561 xfree(job_ptr->account);
12562 job_ptr->account = xstrdup(new_assoc_ptr->acct);
12563 job_ptr->assoc_id = new_assoc_ptr->id;
12564 job_ptr->assoc_ptr = new_assoc_ptr;
12565
12566 if (job_ptr->state_reason == FAIL_ACCOUNT) {
12567 job_ptr->state_reason = WAIT_NO_REASON;
12568 xfree(job_ptr->state_desc);
12569 }
12570
12571 info("%s: setting account to %s for %pJ",
12572 __func__, job_ptr->account, job_ptr);
12573 }
12574
12575 if (new_part_ptr) {
12576 /* Change partition */
12577 job_ptr->part_ptr = new_part_ptr;
12578 FREE_NULL_LIST(job_ptr->part_ptr_list);
12579 job_ptr->part_ptr_list = part_ptr_list;
12580 part_ptr_list = NULL; /* nothing to free */
12581
12582 _rebuild_part_name_list(job_ptr);
12583
12584 /* Rebuilt in priority/multifactor plugin */
12585 xfree(job_ptr->priority_array);
12586
12587 info("%s: setting partition to %s for %pJ",
12588 __func__, job_specs->partition, job_ptr);
12589 }
12590
12591 /* Now add the job to the new qos/assoc's */
12592 if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
12593 update_accounting = true;
12594 acct_policy_add_job_submit(job_ptr);
12595 }
12596
12597 if (new_req_bitmap_given) {
12598 xfree(detail_ptr->req_nodes);
12599 if (job_specs->req_nodes[0] != '\0')
12600 detail_ptr->req_nodes = xstrdup(job_specs->req_nodes);
12601 FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
12602 detail_ptr->req_node_bitmap = new_req_bitmap;
12603 new_req_bitmap = NULL;
12604 sched_info("%s: setting req_nodes to %s for %pJ",
12605 __func__, job_specs->req_nodes, job_ptr);
12606 }
12607
12608 if (new_resv_ptr) {
12609 job_ptr->resv_name = xstrdup(new_resv_ptr->name);
12610 job_ptr->resv_ptr = new_resv_ptr;
12611 sched_info("%s: setting reservation to %s for %pJ", __func__,
12612 job_ptr->resv_name, job_ptr);
12613 update_accounting = true;
12614 } else if (job_specs->reservation &&
12615 job_specs->reservation[0] == '\0' &&
12616 job_ptr->resv_name) {
12617 xfree(job_ptr->resv_name);
12618 job_ptr->resv_id = 0;
12619 job_ptr->resv_ptr = NULL;
12620 sched_info("%s: setting reservation to '' for %pJ",
12621 __func__, job_ptr);
12622 update_accounting = true;
12623 }
12624
12625 /* Reset min and max cpu counts as needed, ensure consistency */
12626 if (job_specs->min_cpus != NO_VAL) {
12627 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12628 error_code = ESLURM_JOB_NOT_PENDING;
12629 else if (job_specs->min_cpus < 1)
12630 error_code = ESLURM_INVALID_CPU_COUNT;
12631 else {
12632 save_min_cpus = detail_ptr->min_cpus;
12633 detail_ptr->min_cpus = job_specs->min_cpus;
12634 }
12635 }
12636 if (job_specs->max_cpus != NO_VAL) {
12637 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12638 error_code = ESLURM_JOB_NOT_PENDING;
12639 else {
12640 save_max_cpus = detail_ptr->max_cpus;
12641 detail_ptr->max_cpus = job_specs->max_cpus;
12642 }
12643 }
12644 if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus &&
12645 (detail_ptr->max_cpus < detail_ptr->min_cpus)) {
12646 error_code = ESLURM_INVALID_CPU_COUNT;
12647 if (save_min_cpus) {
12648 detail_ptr->min_cpus = save_min_cpus;
12649 save_min_cpus = 0;
12650 }
12651 if (save_max_cpus) {
12652 detail_ptr->max_cpus = save_max_cpus;
12653 save_max_cpus = 0;
12654 }
12655 }
12656
12657 if (error_code != SLURM_SUCCESS)
12658 goto fini;
12659
12660 if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) {
12661 info("%s: setting min_cpus from %u to %u for %pJ",
12662 __func__, save_min_cpus, detail_ptr->min_cpus, job_ptr);
12663 job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
12664 acct_policy_limit_set.tres[TRES_ARRAY_CPU];
12665 detail_ptr->orig_min_cpus = job_specs->min_cpus;
12666 update_accounting = true;
12667 }
12668 if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) {
12669 info("%s: setting max_cpus from %u to %u for %pJ",
12670 __func__, save_max_cpus, detail_ptr->max_cpus, job_ptr);
12671 /*
		 * Always use the acct_policy_limit_set.* since if set by a
		 * super user it will be set correctly
12674 */
12675 job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
12676 acct_policy_limit_set.tres[TRES_ARRAY_CPU];
12677 detail_ptr->orig_max_cpus = job_specs->max_cpus;
12678 update_accounting = true;
12679 }
12680
12681 if ((job_specs->pn_min_cpus != NO_VAL16) &&
12682 (job_specs->pn_min_cpus != 0)) {
12683
12684 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
12685 error_code = ESLURM_JOB_NOT_PENDING;
12686 } else {
12687 detail_ptr->pn_min_cpus = job_specs->pn_min_cpus;
12688 detail_ptr->orig_pn_min_cpus = job_specs->pn_min_cpus;
12689 info("%s: setting pn_min_cpus to %u for %pJ",
12690 __func__, job_specs->pn_min_cpus, job_ptr);
12691 }
12692 }
12693
12694 if (error_code != SLURM_SUCCESS)
12695 goto fini;
12696
12697 if (job_specs->cpus_per_task != NO_VAL16) {
12698 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
12699 error_code = ESLURM_JOB_NOT_PENDING;
12700 } else if (detail_ptr->cpus_per_task !=
12701 job_specs->cpus_per_task) {
12702 info("%s: setting cpus_per_task from %u to %u for %pJ",
12703 __func__, detail_ptr->cpus_per_task,
12704 job_specs->cpus_per_task, job_ptr);
12705 detail_ptr->cpus_per_task = job_specs->cpus_per_task;
12706 detail_ptr->orig_cpus_per_task =
12707 job_specs->cpus_per_task;
12708 }
12709 }
12710
12711 if (error_code != SLURM_SUCCESS)
12712 goto fini;
12713
12714 /* Reset min and max node counts as needed, ensure consistency */
12715 if (job_specs->min_nodes != NO_VAL) {
12716 if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
12717 ; /* shrink running job, processed later */
12718 else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12719 error_code = ESLURM_JOB_NOT_PENDING;
12720 else if (job_specs->min_nodes < 1) {
12721 info("%s: min_nodes < 1 for %pJ", __func__, job_ptr);
12722 error_code = ESLURM_INVALID_NODE_COUNT;
12723 } else {
12724 /* Resize of pending job */
12725 save_min_nodes = detail_ptr->min_nodes;
12726 detail_ptr->min_nodes = job_specs->min_nodes;
12727 }
12728 }
12729 if (job_specs->max_nodes != NO_VAL) {
12730 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
12731 error_code = ESLURM_JOB_NOT_PENDING;
12732 else {
12733 save_max_nodes = detail_ptr->max_nodes;
12734 detail_ptr->max_nodes = job_specs->max_nodes;
12735 }
12736 }
12737 if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes &&
12738 (detail_ptr->max_nodes < detail_ptr->min_nodes)) {
12739 info("%s: max_nodes < min_nodes (%u < %u) for %pJ", __func__,
12740 detail_ptr->max_nodes, detail_ptr->min_nodes,
12741 job_ptr);
12742 error_code = ESLURM_INVALID_NODE_COUNT;
12743 if (save_min_nodes) {
12744 detail_ptr->min_nodes = save_min_nodes;
12745 save_min_nodes = 0;
12746 }
12747 if (save_max_nodes) {
12748 detail_ptr->max_nodes = save_max_nodes;
12749 save_max_nodes = 0;
12750 }
12751 }
12752 if (error_code != SLURM_SUCCESS)
12753 goto fini;
12754
12755 if (save_min_nodes && (save_min_nodes!= detail_ptr->min_nodes)) {
12756 info("%s: setting min_nodes from %u to %u for %pJ", __func__,
12757 save_min_nodes, detail_ptr->min_nodes, job_ptr);
12758 job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
12759 acct_policy_limit_set.tres[TRES_ARRAY_NODE];
12760 update_accounting = true;
12761 }
12762 if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) {
12763 info("%s: setting max_nodes from %u to %u for %pJ", __func__,
12764 save_max_nodes, detail_ptr->max_nodes, job_ptr);
12765 /*
12766 * Always use the acct_policy_limit_set.* since if set by a
12767 * super user it will be set correctly
12768 */
12769 job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
12770 acct_policy_limit_set.tres[TRES_ARRAY_NODE];
12771 update_accounting = true;
12772 }
12773
12774 if (job_specs->num_tasks != NO_VAL) {
12775 if (!IS_JOB_PENDING(job_ptr))
12776 error_code = ESLURM_JOB_NOT_PENDING;
12777 else if (job_specs->num_tasks < 1)
12778 error_code = ESLURM_BAD_TASK_COUNT;
12779 else {
12780 detail_ptr->num_tasks = job_specs->num_tasks;
12781 info("%s: setting num_tasks to %u for %pJ",
12782 __func__, job_specs->num_tasks, job_ptr);
12783 }
12784 }
12785 if (error_code != SLURM_SUCCESS)
12786 goto fini;
12787
12788 if (job_specs->time_limit != NO_VAL) {
12789 if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time)
12790 error_code = ESLURM_JOB_FINISHED;
12791 else if (job_ptr->time_limit == job_specs->time_limit) {
12792 sched_debug("%s: new time limit identical to old time limit %pJ",
12793 __func__, job_ptr);
12794 } else if (operator ||
12795 (job_ptr->time_limit > job_specs->time_limit)) {
12796 time_t old_time = job_ptr->time_limit;
12797 uint32_t use_time_min = job_specs->time_min != NO_VAL ?
12798 job_specs->time_min : job_ptr->time_min;
12799 if (old_time == INFINITE) /* one year in mins */
12800 old_time = (365 * 24 * 60);
12801 if (job_specs->time_limit < use_time_min) {
12802 sched_info("%s: attempt to set time_limit < time_min (%u < %u)",
12803 __func__,
12804 job_specs->time_limit,
12805 use_time_min);
12806 error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
12807 goto fini;
12808 }
12809 acct_policy_alter_job(job_ptr, job_specs->time_limit);
12810 job_ptr->time_limit = job_specs->time_limit;
12811 if (IS_JOB_RUNNING(job_ptr) ||
12812 IS_JOB_SUSPENDED(job_ptr)) {
12813 if (job_ptr->preempt_time) {
12814 ; /* Preemption in progress */
12815 } else if (job_ptr->time_limit == INFINITE) {
12816 /* Set end time in one year */
12817 job_ptr->end_time = now +
12818 (365 * 24 * 60 * 60);
12819 } else {
12820 /*
12821 * Update end_time based upon change
12822 * to preserve suspend time info
12823 */
12824 job_ptr->end_time = job_ptr->end_time +
12825 ((job_ptr->time_limit -
12826 old_time) * 60);
12827 }
12828 if (job_ptr->end_time < now)
12829 job_ptr->end_time = now;
12830 if (IS_JOB_RUNNING(job_ptr) &&
12831 (list_is_empty(job_ptr->step_list) == 0)) {
12832 _xmit_new_end_time(job_ptr);
12833 }
12834 job_ptr->end_time_exp = job_ptr->end_time;
12835 }
12836 sched_info("%s: setting time_limit to %u for %pJ",
12837 __func__, job_specs->time_limit, job_ptr);
12838 /*
12839 * Always use the acct_policy_limit_set.*
12840 * since if set by a super user it will be set correctly
12841 */
12842 job_ptr->limit_set.time = acct_policy_limit_set.time;
12843 update_accounting = true;
12844 } else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr &&
12845 (job_ptr->part_ptr->max_time >=
12846 job_specs->time_limit)) {
12847 job_ptr->time_limit = job_specs->time_limit;
12848 sched_info("%s: setting time_limit to %u for %pJ",
12849 __func__, job_specs->time_limit, job_ptr);
12850 /*
12851 * Always use the acct_policy_limit_set.*
12852 * since if set by a super user it will be set correctly
12853 */
12854 job_ptr->limit_set.time = acct_policy_limit_set.time;
12855 update_accounting = true;
12856 } else {
12857 sched_info("%s: Attempt to increase time limit for %pJ",
12858 __func__, job_ptr);
12859 error_code = ESLURM_ACCESS_DENIED;
12860 }
12861 }
12862 if (error_code != SLURM_SUCCESS)
12863 goto fini;
12864
12865 if ((job_specs->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) {
12866 if (job_specs->time_min > job_ptr->time_limit) {
12867 info("%s: attempt to set TimeMin > TimeLimit (%u > %u)",
12868 __func__, job_specs->time_min, job_ptr->time_limit);
12869 error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
12870 } else if (job_ptr->time_min != job_specs->time_min) {
12871 job_ptr->time_min = job_specs->time_min;
12872 info("%s: setting TimeMin to %u for %pJ",
12873 __func__, job_specs->time_min, job_ptr);
12874 }
12875 }
12876 if (error_code != SLURM_SUCCESS)
12877 goto fini;
12878
12879 if (job_specs->end_time) {
12880 if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) {
12881 /*
12882 * We may want to use this for deadline scheduling
12883 * at some point in the future. For now only reset
12884 * the time limit of running jobs.
12885 */
12886 error_code = ESLURM_JOB_NOT_RUNNING;
12887 } else if (job_specs->end_time < now) {
12888 error_code = ESLURM_INVALID_TIME_VALUE;
12889 } else if (operator ||
12890 (job_ptr->end_time > job_specs->end_time)) {
12891 int delta_t = job_specs->end_time - job_ptr->end_time;
12892 job_ptr->end_time = job_specs->end_time;
12893 job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */
12894 sched_info("%s: setting time_limit to %u for %pJ",
12895 __func__, job_ptr->time_limit, job_ptr);
12896 /* Always use the acct_policy_limit_set.*
12897 * since if set by a super user it will be set correctly */
12898 job_ptr->limit_set.time = acct_policy_limit_set.time;
12899 update_accounting = true;
12900 } else {
12901 sched_info("%s: Attempt to extend end time for %pJ",
12902 __func__, job_ptr);
12903 error_code = ESLURM_ACCESS_DENIED;
12904 }
12905 }
12906
12907 if ((job_specs->deadline) && (!IS_JOB_RUNNING(job_ptr))) {
12908 char time_str[32];
12909 slurm_make_time_str(&job_ptr->deadline, time_str,
12910 sizeof(time_str));
12911 if (job_specs->deadline < now) {
12912 error_code = ESLURM_INVALID_TIME_VALUE;
12913 } else if (operator) {
12914 /* update deadline */
12915 job_ptr->deadline = job_specs->deadline;
12916 sched_info("%s: setting deadline to %s for %pJ",
12917 __func__, time_str, job_ptr);
12918 /*
12919 * Always use the acct_policy_limit_set.*
12920 * since if set by a super user it will be set correctly
12921 */
12922 job_ptr->limit_set.time = acct_policy_limit_set.time;
12923 update_accounting = true;
12924 } else {
12925 sched_info("%s: Attempt to extend end time for %pJ",
12926 __func__, job_ptr);
12927 error_code = ESLURM_ACCESS_DENIED;
12928 }
12929 }
12930 if (error_code != SLURM_SUCCESS)
12931 goto fini;
12932
12933 if (job_specs->delay_boot != NO_VAL) {
12934 job_ptr->delay_boot = job_specs->delay_boot;
12935 sched_info("%s: setting delay_boot to %u for %pJ",
12936 __func__, job_specs->delay_boot, job_ptr);
12937 }
12938
12939 if ((job_specs->requeue != NO_VAL16) && detail_ptr) {
12940 detail_ptr->requeue = MIN(job_specs->requeue, 1);
12941 sched_info("%s: setting requeue to %u for %pJ",
12942 __func__, job_specs->requeue, job_ptr);
12943 }
12944
12945 if (job_specs->priority != NO_VAL) {
12946 /*
12947 * If we are doing time slicing we could update the
12948 * priority of the job while running to give better
12949 * position (larger time slices) than competing jobs
12950 */
12951 if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL))
12952 error_code = ESLURM_JOB_FINISHED;
12953 else if (job_ptr->priority == job_specs->priority) {
12954 debug("%s: setting priority to current value",__func__);
12955 if ((job_ptr->priority == 0) && operator) {
12956 /*
12957 * Authorized user can change from user hold
12958 * to admin hold or admin hold to user hold
12959 */
12960 if (job_specs->alloc_sid == ALLOC_SID_USER_HOLD)
12961 job_ptr->state_reason = WAIT_HELD_USER;
12962 else
12963 job_ptr->state_reason = WAIT_HELD;
12964 }
12965 } else if ((job_ptr->priority == 0) &&
12966 (job_specs->priority == INFINITE) &&
12967 (operator ||
12968 (job_ptr->state_reason == WAIT_RESV_DELETED) ||
12969 (job_ptr->state_reason == WAIT_HELD_USER))) {
12970 _release_job(job_ptr, uid);
12971 } else if ((job_ptr->priority == 0) &&
12972 (job_specs->priority != INFINITE)) {
12973 info("%s: ignore priority reset request on held %pJ",
12974 __func__, job_ptr);
12975 error_code = ESLURM_JOB_HELD;
12976 } else if (operator ||
12977 (job_ptr->priority > job_specs->priority)) {
12978 if (job_specs->priority != 0)
12979 job_ptr->details->nice = NICE_OFFSET;
12980 if (job_specs->priority == INFINITE) {
12981 job_ptr->direct_set_prio = 0;
12982 set_job_prio(job_ptr);
12983 } else if (job_specs->priority == 0) {
12984 _hold_job(job_ptr, uid);
12985 } else {
12986 if (operator) {
12987 /*
12988 * Only administrator can make
12989 * persistent change to a job's
12990 * priority, except holding a job
12991 */
12992 job_ptr->direct_set_prio = 1;
12993 } else
12994 error_code = ESLURM_PRIO_RESET_FAIL;
12995 job_ptr->priority = job_specs->priority;
12996 if (job_ptr->part_ptr_list &&
12997 job_ptr->priority_array) {
12998 int i, j = list_count(
12999 job_ptr->part_ptr_list);
13000 for (i = 0; i < j; i++) {
13001 job_ptr->priority_array[i] =
13002 job_specs->priority;
13003 }
13004 }
13005 }
13006 sched_info("%s: set priority to %u for %pJ",
13007 __func__, job_ptr->priority, job_ptr);
13008 update_accounting = true;
13009 if (job_ptr->priority == 0) {
13010 if (!operator ||
13011 (job_specs->alloc_sid ==
13012 ALLOC_SID_USER_HOLD)) {
13013 job_ptr->state_reason = WAIT_HELD_USER;
13014 } else
13015 job_ptr->state_reason = WAIT_HELD;
13016 xfree(job_ptr->state_desc);
13017
13018 /* remove pending remote sibling jobs */
13019 if (IS_JOB_PENDING(job_ptr) &&
13020 !IS_JOB_REVOKED(job_ptr)) {
13021 fed_mgr_job_revoke_sibs(job_ptr);
13022 }
13023 }
13024 } else if ((job_ptr->priority != 0) &&
13025 (job_specs->priority == INFINITE)) {
13026 /*
13027 * If the job was already released, ignore another
13028 * release request.
13029 */
13030 debug("%s: %pJ already released, ignoring request",
13031 __func__, job_ptr);
13032 } else {
13033 sched_error("Attempt to modify priority for %pJ",
13034 job_ptr);
13035 error_code = ESLURM_ACCESS_DENIED;
13036 }
13037 } else if (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) {
13038 /*
13039 * We need to check if the state is BadConstraints here since we
13040 * are altering the job; the bad constraint might have gone
13041 * away. If it did, the priority (0) wouldn't get reset, so the
13042 * job would just go into JobAdminHeld otherwise.
13043 */
13044 job_ptr->direct_set_prio = 0;
13045 set_job_prio(job_ptr);
13046 sched_debug("%s: job request changed somehow, removing the bad constraints to reevaluate %pJ uid %u",
13047 __func__, job_ptr, uid);
13048 job_ptr->state_reason = WAIT_NO_REASON;
13049 }
13050
13051 if (error_code != SLURM_SUCCESS)
13052 goto fini;
13053
13054 if (job_specs->nice != NO_VAL) {
13055 if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL))
13056 error_code = ESLURM_JOB_FINISHED;
13057 else if (job_ptr->details &&
13058 (job_ptr->details->nice == job_specs->nice))
13059 sched_debug("%s: new nice identical to old nice %pJ",
13060 __func__, job_ptr);
13061 else if (job_ptr->direct_set_prio && job_ptr->priority != 0)
13062 info("%s: ignore nice set request on %pJ",
13063 __func__, job_ptr);
13064 else if (operator || (job_specs->nice >= NICE_OFFSET)) {
13065 if (!xstrcmp(slurmctld_conf.priority_type,
13066 "priority/basic")) {
13067 int64_t new_prio = job_ptr->priority;
13068 new_prio += job_ptr->details->nice;
13069 new_prio -= job_specs->nice;
13070 job_ptr->priority = MAX(new_prio, 2);
13071 sched_info("%s: nice changed from %u to %u, setting priority to %u for %pJ",
13072 __func__, job_ptr->details->nice,
13073 job_specs->nice,
13074 job_ptr->priority, job_ptr);
13075 }
13076 job_ptr->details->nice = job_specs->nice;
13077 update_accounting = true;
13078 } else {
13079 sched_error("%s: Attempt to modify nice for %pJ",
13080 __func__, job_ptr);
13081 error_code = ESLURM_ACCESS_DENIED;
13082 }
13083 }
13084 if (error_code != SLURM_SUCCESS)
13085 goto fini;
13086
13087 if (job_specs->pn_min_memory != NO_VAL64) {
13088 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13089 error_code = ESLURM_JOB_NOT_PENDING;
13090 } else if (job_specs->pn_min_memory
13091 == detail_ptr->pn_min_memory) {
13092 sched_debug("%s: new memory limit identical to old limit for %pJ",
13093 __func__, job_ptr);
13094 } else {
13095 char *entity;
13096 if (job_specs->pn_min_memory == MEM_PER_CPU) {
13097 /* Map --mem-per-cpu=0 to --mem=0 */
13098 job_specs->pn_min_memory = 0;
13099 }
13100 if (job_specs->pn_min_memory & MEM_PER_CPU)
13101 entity = "cpu";
13102 else
13103 entity = "job";
13104
13105 detail_ptr->pn_min_memory = job_specs->pn_min_memory;
13106 detail_ptr->orig_pn_min_memory =
13107 job_specs->pn_min_memory;
13108 job_ptr->bit_flags |= JOB_MEM_SET;
13109 sched_info("%s: setting min_memory_%s to %"PRIu64" for %pJ",
13110 __func__, entity,
13111 (job_specs->pn_min_memory & (~MEM_PER_CPU)),
13112 job_ptr);
13113 /*
13114 * Always use the acct_policy_limit_set.*
13115 * since if set by a super user it will be set correctly
13116 */
13117 job_ptr->limit_set.tres[TRES_ARRAY_MEM] =
13118 acct_policy_limit_set.tres[TRES_ARRAY_MEM];
13119 }
13120 }
13121 if (error_code != SLURM_SUCCESS)
13122 goto fini;
13123
13124 if (job_specs->pn_min_tmp_disk != NO_VAL) {
13125 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13126 error_code = ESLURM_JOB_NOT_PENDING;
13127 } else {
13128 detail_ptr->pn_min_tmp_disk =
13129 job_specs->pn_min_tmp_disk;
13130
13131 sched_info("%s: setting job_min_tmp_disk to %u for %pJ",
13132 __func__, job_specs->pn_min_tmp_disk,
13133 job_ptr);
13134 }
13135 }
13136 if (error_code != SLURM_SUCCESS)
13137 goto fini;
13138
13139 if (job_specs->sockets_per_node != NO_VAL16) {
13140 if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13141 error_code = ESLURM_JOB_NOT_PENDING;
13142 goto fini;
13143 } else {
13144 mc_ptr->sockets_per_node = job_specs->sockets_per_node;
13145 sched_info("%s: setting sockets_per_node to %u for %pJ",
13146 __func__, job_specs->sockets_per_node,
13147 job_ptr);
13148 }
13149 }
13150
13151 if (job_specs->cores_per_socket != NO_VAL16) {
13152 if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13153 error_code = ESLURM_JOB_NOT_PENDING;
13154 goto fini;
13155 } else {
13156 mc_ptr->cores_per_socket = job_specs->cores_per_socket;
13157 sched_info("%s: setting cores_per_socket to %u for %pJ",
13158 __func__, job_specs->cores_per_socket,
13159 job_ptr);
13160 }
13161 }
13162
13163 if ((job_specs->threads_per_core != NO_VAL16)) {
13164 if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
13165 error_code = ESLURM_JOB_NOT_PENDING;
13166 goto fini;
13167 } else {
13168 mc_ptr->threads_per_core = job_specs->threads_per_core;
13169 sched_info("%s: setting threads_per_core to %u for %pJ",
13170 __func__, job_specs->threads_per_core,
13171 job_ptr);
13172 }
13173 }
13174
13175 if (job_specs->shared != NO_VAL16) {
13176 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
13177 error_code = ESLURM_JOB_NOT_PENDING;
13178 } else if (!operator) {
13179 sched_error("%s: Attempt to change sharing for %pJ",
13180 __func__, job_ptr);
13181 error_code = ESLURM_ACCESS_DENIED;
13182 } else {
13183 if (job_specs->shared) {
13184 detail_ptr->share_res = 1;
13185 detail_ptr->whole_node = 0;
13186 } else {
13187 detail_ptr->share_res = 0;
13188 }
13189 sched_info("%s: setting shared to %u for %pJ",
13190 __func__, job_specs->shared, job_ptr);
13191 }
13192 }
13193 if (error_code != SLURM_SUCCESS)
13194 goto fini;
13195
13196 if (job_specs->contiguous != NO_VAL16) {
13197 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13198 error_code = ESLURM_JOB_NOT_PENDING;
13199 else if (operator
13200 || (detail_ptr->contiguous > job_specs->contiguous)) {
13201 detail_ptr->contiguous = job_specs->contiguous;
13202 sched_info("%s: setting contiguous to %u for %pJ",
13203 __func__, job_specs->contiguous, job_ptr);
13204 } else {
13205 sched_error("%s: Attempt to add contiguous for %pJ",
13206 __func__, job_ptr);
13207 error_code = ESLURM_ACCESS_DENIED;
13208 }
13209 }
13210 if (error_code != SLURM_SUCCESS)
13211 goto fini;
13212
13213 if (job_specs->core_spec != NO_VAL16) {
13214 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13215 error_code = ESLURM_JOB_NOT_PENDING;
13216 else if (operator &&
13217 (slurmctld_conf.conf_flags & CTL_CONF_ASRU)) {
13218 if (job_specs->core_spec == INFINITE16)
13219 detail_ptr->core_spec = NO_VAL16;
13220 else
13221 detail_ptr->core_spec = job_specs->core_spec;
13222 sched_info("%s: setting core_spec to %u for %pJ",
13223 __func__, detail_ptr->core_spec, job_ptr);
13224 if (detail_ptr->core_spec != NO_VAL16)
13225 detail_ptr->whole_node = 1;
13226 } else {
13227 sched_error("%s Attempt to modify core_spec for %pJ",
13228 __func__, job_ptr);
13229 error_code = ESLURM_ACCESS_DENIED;
13230 }
13231 }
13232 if (error_code != SLURM_SUCCESS)
13233 goto fini;
13234
13235 if (job_specs->features && detail_ptr &&
13236 !xstrcmp(job_specs->features, detail_ptr->features)) {
13237 sched_debug("%s: new features identical to old features %s",
13238 __func__, job_specs->features);
13239 } else if (job_specs->features) {
13240 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13241 error_code = ESLURM_JOB_NOT_PENDING;
13242 else if (job_specs->features[0] != '\0') {
13243 char *old_features = detail_ptr->features;
13244 List old_list = detail_ptr->feature_list;
13245 detail_ptr->features = xstrdup(job_specs->features);
13246 detail_ptr->feature_list = NULL;
13247 if (build_feature_list(job_ptr)) {
13248 sched_info("%s: invalid features(%s) for %pJ",
13249 __func__, job_specs->features,
13250 job_ptr);
13251 FREE_NULL_LIST(detail_ptr->feature_list);
13252 detail_ptr->features = old_features;
13253 detail_ptr->feature_list = old_list;
13254 error_code = ESLURM_INVALID_FEATURE;
13255 } else {
13256 sched_info("%s: setting features to %s for %pJ",
13257 __func__, job_specs->features,
13258 job_ptr);
13259 xfree(old_features);
13260 FREE_NULL_LIST(old_list);
13261 }
13262 } else {
13263 sched_info("%s: cleared features for %pJ", __func__,
13264 job_ptr);
13265 xfree(detail_ptr->features);
13266 FREE_NULL_LIST(detail_ptr->feature_list);
13267 }
13268 }
13269 if (error_code != SLURM_SUCCESS)
13270 goto fini;
13271
13272 if (job_specs->cluster_features &&
13273 (error_code = fed_mgr_update_job_cluster_features(
13274 job_ptr, job_specs->cluster_features)))
13275 goto fini;
13276
13277 if (job_specs->clusters &&
13278 (error_code = fed_mgr_update_job_clusters(job_ptr,
13279 job_specs->clusters)))
13280 goto fini;
13281
13282 if (gres_list) {
13283 char *tmp = NULL;
13284 if (job_specs->cpus_per_tres) {
13285 xstrfmtcat(tmp, "cpus_per_tres:%s ",
13286 job_specs->cpus_per_tres);
13287 xfree(job_ptr->cpus_per_tres);
13288 job_ptr->cpus_per_tres = job_specs->cpus_per_tres;
13289 job_specs->cpus_per_tres = NULL;
13290 }
13291 if (job_specs->tres_per_job) {
13292 xstrfmtcat(tmp, "tres_per_job:%s ",
13293 job_specs->tres_per_job);
13294 xfree(job_ptr->tres_per_job);
13295 job_ptr->tres_per_job = job_specs->tres_per_job;
13296 job_specs->tres_per_job = NULL;
13297 }
13298 if (job_specs->tres_per_node) {
13299 xstrfmtcat(tmp, "tres_per_node:%s ",
13300 job_specs->tres_per_node);
13301 xfree(job_ptr->tres_per_node);
13302 job_ptr->tres_per_node = job_specs->tres_per_node;
13303 job_specs->tres_per_node = NULL;
13304 }
13305 if (job_specs->tres_per_socket) {
13306 xstrfmtcat(tmp, "tres_per_socket:%s ",
13307 job_specs->tres_per_socket);
13308 xfree(job_ptr->tres_per_socket);
13309 job_ptr->tres_per_socket = job_specs->tres_per_socket;
13310 job_specs->tres_per_socket = NULL;
13311 }
13312 if (job_specs->tres_per_task) {
13313 xstrfmtcat(tmp, "tres_per_task:%s ",
13314 job_specs->tres_per_task);
13315 xfree(job_ptr->tres_per_task);
13316 job_ptr->tres_per_task = job_specs->tres_per_task;
13317 job_specs->tres_per_task = NULL;
13318 }
13319 if (job_specs->mem_per_tres) {
13320 xstrfmtcat(tmp, "mem_per_tres:%s ",
13321 job_specs->mem_per_tres);
13322 xfree(job_ptr->mem_per_tres);
13323 job_ptr->mem_per_tres = job_specs->mem_per_tres;
13324 job_specs->mem_per_tres = NULL;
13325 }
13326 sched_info("%s: setting %sfor %pJ", __func__, tmp, job_ptr);
13327 xfree(tmp);
13328 FREE_NULL_LIST(job_ptr->gres_list);
13329 job_ptr->gres_list = gres_list;
13330 gres_build_job_details(job_ptr->gres_list,
13331 &job_ptr->gres_detail_cnt,
13332 &job_ptr->gres_detail_str,
13333 &job_ptr->gres_used);
13334 gres_list = NULL;
13335 }
13336
13337 if (job_specs->name) {
13338 if (IS_JOB_FINISHED(job_ptr)) {
13339 error_code = ESLURM_JOB_FINISHED;
13340 goto fini;
13341 } else if (!xstrcmp(job_specs->name, job_ptr->name)) {
13342 sched_debug("%s: new name identical to old name %pJ",
13343 __func__, job_ptr);
13344 } else {
13345 xfree(job_ptr->name);
13346 job_ptr->name = xstrdup(job_specs->name);
13347
13348 sched_info("%s: setting name to %s for %pJ",
13349 __func__, job_ptr->name, job_ptr);
13350 update_accounting = true;
13351 }
13352 }
13353
13354 if (job_specs->work_dir && detail_ptr &&
13355 !xstrcmp(job_specs->work_dir, detail_ptr->work_dir)) {
13356 sched_debug("%s: new work_dir identical to old work_dir %s",
13357 __func__, job_specs->work_dir);
13358 } else if (job_specs->work_dir) {
13359 if (!IS_JOB_PENDING(job_ptr)) {
13360 error_code = ESLURM_JOB_NOT_PENDING;
13361 goto fini;
13362 } else if (detail_ptr) {
13363 xfree(detail_ptr->work_dir);
13364 detail_ptr->work_dir = xstrdup(job_specs->work_dir);
13365 sched_info("%s: setting work_dir to %s for %pJ",
13366 __func__, detail_ptr->work_dir, job_ptr);
13367 update_accounting = true;
13368 }
13369 }
13370
13371 if (job_specs->std_out && detail_ptr &&
13372 !xstrcmp(job_specs->std_out, detail_ptr->std_out)) {
13373 sched_debug("%s: new std_out identical to old std_out %s",
13374 __func__, job_specs->std_out);
13375 } else if (job_specs->std_out) {
13376 if (!IS_JOB_PENDING(job_ptr))
13377 error_code = ESLURM_JOB_NOT_PENDING;
13378 else if (detail_ptr) {
13379 xfree(detail_ptr->std_out);
13380 detail_ptr->std_out = xstrdup(job_specs->std_out);
13381 }
13382 }
13383 if (error_code != SLURM_SUCCESS)
13384 goto fini;
13385
13386 if (job_specs->wckey
13387 && !xstrcmp(job_specs->wckey, job_ptr->wckey)) {
13388 sched_debug("%s: new wckey identical to old wckey %pJ",
13389 __func__, job_ptr);
13390 } else if (job_specs->wckey) {
13391 if (!IS_JOB_PENDING(job_ptr))
13392 error_code = ESLURM_JOB_NOT_PENDING;
13393 else {
13394 int rc = update_job_wckey((char *) __func__,
13395 job_ptr, job_specs->wckey);
13396 if (rc != SLURM_SUCCESS)
13397 error_code = rc;
13398 else
13399 update_accounting = true;
13400 }
13401 }
13402 if (error_code != SLURM_SUCCESS)
13403 goto fini;
13404
13405 if ((job_specs->min_nodes != NO_VAL) &&
13406 (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
13407 /*
13408 * Use req_nodes to change the nodes associated with a running
13409 * job, for lack of another field in the job request to use
13410 */
13411 if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) &&
13412 job_ptr->details && job_ptr->details->expanding_jobid) {
13413 job_record_t *expand_job_ptr;
13414 bitstr_t *orig_job_node_bitmap, *orig_jobx_node_bitmap;
13415
13416 expand_job_ptr = find_job_record(job_ptr->details->
13417 expanding_jobid);
13418 if (expand_job_ptr == NULL) {
13419 info("%s: Invalid node count (%u) for %pJ update, JobId=%u to expand not found",
13420 __func__, job_specs->min_nodes, job_ptr,
13421 job_ptr->details->expanding_jobid);
13422 error_code = ESLURM_INVALID_JOB_ID;
13423 goto fini;
13424 }
13425 if (IS_JOB_SUSPENDED(job_ptr) ||
13426 IS_JOB_SUSPENDED(expand_job_ptr)) {
13427 info("%s: Can not expand %pJ from %pJ, job is suspended",
13428 __func__, expand_job_ptr, job_ptr);
13429 error_code = ESLURM_JOB_SUSPENDED;
13430 goto fini;
13431 }
13432 if ((job_ptr->step_list != NULL) &&
13433 (list_count(job_ptr->step_list) != 0)) {
13434 info("%s: Attempt to merge %pJ with active steps into %pJ",
13435 __func__, job_ptr, expand_job_ptr);
13436 error_code = ESLURMD_STEP_EXISTS;
13437 goto fini;
13438 }
13439 sched_info("%s: killing %pJ and moving all resources to %pJ",
13440 __func__, job_ptr, expand_job_ptr);
13441 job_pre_resize_acctg(job_ptr);
13442 job_pre_resize_acctg(expand_job_ptr);
13443 _send_job_kill(job_ptr);
13444
13445 xassert(job_ptr->job_resrcs);
13446 xassert(job_ptr->job_resrcs->node_bitmap);
13447 xassert(expand_job_ptr->job_resrcs->node_bitmap);
13448 orig_job_node_bitmap = bit_copy(job_ptr->node_bitmap);
13449 orig_jobx_node_bitmap = bit_copy(expand_job_ptr->
13450 job_resrcs->
13451 node_bitmap);
13452 error_code = select_g_job_expand(job_ptr,
13453 expand_job_ptr);
13454 if (error_code == SLURM_SUCCESS) {
13455 _merge_job_licenses(job_ptr, expand_job_ptr);
13456 FREE_NULL_BITMAP(job_ptr->node_bitmap);
13457 job_ptr->node_bitmap = orig_job_node_bitmap;
13458 orig_job_node_bitmap = NULL;
13459 deallocate_nodes(job_ptr, false, false, false);
13460 bit_clear_all(job_ptr->node_bitmap);
13461 job_ptr->job_state &= JOB_STATE_FLAGS;
13462 job_ptr->job_state |= JOB_COMPLETE;
13463 _realloc_nodes(expand_job_ptr,
13464 orig_jobx_node_bitmap);
13465 rebuild_step_bitmaps(expand_job_ptr,
13466 orig_jobx_node_bitmap);
13467 (void) gs_job_fini(job_ptr);
13468 (void) gs_job_start(expand_job_ptr);
13469 }
13470 FREE_NULL_BITMAP(orig_job_node_bitmap);
13471 FREE_NULL_BITMAP(orig_jobx_node_bitmap);
13472 job_post_resize_acctg(job_ptr);
13473 job_post_resize_acctg(expand_job_ptr);
13474 /*
13475 * Since job_post_resize_acctg will restart things,
13476 * don't do it again.
13477 */
13478 update_accounting = false;
13479 if (error_code)
13480 goto fini;
13481 } else if ((job_specs->min_nodes == 0) ||
13482 (job_specs->min_nodes > job_ptr->node_cnt) ||
13483 job_ptr->details->expanding_jobid) {
13484 sched_info("%s: Invalid node count (%u) for %pJ update",
13485 __func__, job_specs->min_nodes, job_ptr);
13486 error_code = ESLURM_INVALID_NODE_COUNT;
13487 goto fini;
13488 } else if (job_specs->min_nodes == job_ptr->node_cnt) {
13489 debug2("%s: No change in node count update for %pJ",
13490 __func__, job_ptr);
13491 } else if (!permit_job_shrink()) {
13492 error("%s: request to shrink %pJ denied by configuration",
13493 __func__, job_ptr);
13494 error_code = ESLURM_NOT_SUPPORTED;
13495 goto fini;
13496 } else {
13497 int i, i_first, i_last, total = 0;
13498 node_record_t *node_ptr;
13499 bitstr_t *rem_nodes, *tmp_nodes;
13500 sched_info("%s: set node count to %u for %pJ", __func__,
13501 job_specs->min_nodes, job_ptr);
13502 job_pre_resize_acctg(job_ptr);
13503
13504 /*
13505 * Don't remove the batch host from the job. The batch
13506 * host isn't guaranteed to be the first bit set in
13507 * job_ptr->node_bitmap because the batch host can be
13508 * selected with the --batch and --constraint sbatch
13509 * flags.
13510 */
13511 tmp_nodes = bit_copy(job_ptr->node_bitmap);
13512 if (job_ptr->batch_host) {
13513 bitstr_t *batch_host_bitmap;
13514 if (node_name2bitmap(job_ptr->batch_host, false,
13515 &batch_host_bitmap))
13516 error("%s: Invalid batch host %s for %pJ; this should never happen",
13517 __func__, job_ptr->batch_host,
13518 job_ptr);
13519 else {
13520 bit_and_not(tmp_nodes,
13521 batch_host_bitmap);
13522 bit_free(batch_host_bitmap);
13523 /*
13524 * Set total to 1 since we're
13525 * guaranteeing that we won't remove the
13526 * batch host.
13527 */
13528 total = 1;
13529 }
13530 }
13531
13532 i_first = bit_ffs(tmp_nodes);
13533 if (i_first >= 0)
13534 i_last = bit_fls(tmp_nodes);
13535 else
13536 i_last = -2;
13537 rem_nodes = bit_alloc(bit_size(tmp_nodes));
13538 for (i = i_first; i <= i_last; i++) {
13539 if (!bit_test(tmp_nodes, i))
13540 continue;
13541 if (++total <= job_specs->min_nodes)
13542 continue;
13543 bit_set(rem_nodes, i);
13544 }
13545 #ifndef HAVE_FRONT_END
13546 abort_job_on_nodes(job_ptr, rem_nodes);
13547 #endif
13548 for (i = i_first, total = 0; i <= i_last; i++) {
13549 if (!bit_test(rem_nodes, i))
13550 continue;
13551 node_ptr = node_record_table_ptr + i;
13552 kill_step_on_node(job_ptr, node_ptr, false);
13553 excise_node_from_job(job_ptr, node_ptr);
13554 }
13555 bit_free(rem_nodes);
13556 bit_free(tmp_nodes);
13557 (void) gs_job_start(job_ptr);
13558 job_post_resize_acctg(job_ptr);
13559 sched_info("%s: set nodes to %s for %pJ",
13560 __func__, job_ptr->nodes, job_ptr);
13561 /*
13562 * Since job_post_resize_acctg() will restart
13563 * things don't do it again.
13564 */
13565 update_accounting = false;
13566 }
13567 gres_build_job_details(job_ptr->gres_list,
13568 &job_ptr->gres_detail_cnt,
13569 &job_ptr->gres_detail_str,
13570 &job_ptr->gres_used);
13571 }
13572
13573 if (job_specs->ntasks_per_node != NO_VAL16) {
13574 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
13575 error_code = ESLURM_JOB_NOT_PENDING;
13576 else if (operator) {
13577 detail_ptr->ntasks_per_node =
13578 job_specs->ntasks_per_node;
13579 sched_info("%s: setting ntasks_per_node to %u for %pJ",
13580 __func__, job_specs->ntasks_per_node, job_ptr);
13581 } else {
13582 sched_error("%s: Not super user: ignore ntasks_per_node change for job %pJ",
13583 __func__, job_ptr);
13584 error_code = ESLURM_ACCESS_DENIED;
13585 }
13586 }
13587 if (error_code != SLURM_SUCCESS)
13588 goto fini;
13589
13590 if (job_specs->ntasks_per_socket != NO_VAL16) {
13591 if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
13592 (detail_ptr->mc_ptr == NULL)) {
13593 error_code = ESLURM_JOB_NOT_PENDING;
13594 } else if (operator) {
13595 detail_ptr->mc_ptr->ntasks_per_socket =
13596 job_specs->ntasks_per_socket;
13597 sched_info("%s: setting ntasks_per_socket to %u for %pJ",
13598 __func__, job_specs->ntasks_per_socket,
13599 job_ptr);
13600 } else {
13601 sched_error("%s: Not super user: ignore ntasks_per_socket change for %pJ",
13602 __func__, job_ptr);
13603 error_code = ESLURM_ACCESS_DENIED;
13604 }
13605 }
13606 if (error_code != SLURM_SUCCESS)
13607 goto fini;
13608
13609 if (job_specs->dependency) {
13610 /* Can't update dependency of revoked job */
13611 if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL) ||
13612 IS_JOB_REVOKED(job_ptr))
13613 error_code = ESLURM_JOB_NOT_PENDING;
13614 else if (!fed_mgr_is_origin_job(job_ptr)) {
13615 /*
13616 * If the job became independent because of a dependency
13617 * update, that job gets requeued on siblings and then
13618 * the dependency update gets sent to siblings. So we
13619 * silently ignore this update on the sibling.
13620 */
13621 } else {
13622 int rc;
13623 rc = update_job_dependency(job_ptr,
13624 job_specs->dependency);
13625 if (rc != SLURM_SUCCESS)
13626 error_code = rc;
13627 /*
13628 * Because dependencies updated and we don't know where
13629 * they used to be, send dependencies to all siblings
13630 * so the siblings can update their dependency list.
13631 */
13632 else {
13633 rc = fed_mgr_submit_remote_dependencies(job_ptr,
13634 true,
13635 false);
13636 if (rc) {
13637 error("%s: %pJ Failed to send remote dependencies to some or all siblings.",
13638 __func__, job_ptr);
13639 error_code = rc;
13640 }
13641 /*
13642 * Even if we fail to send remote dependencies,
13643 * we already succeeded in updating the job's
13644 * dependency locally, so we still need to
13645 * do these things.
13646 */
13647 job_ptr->details->orig_dependency =
13648 xstrdup(job_ptr->details->dependency);
13649 sched_info("%s: setting dependency to %s for %pJ",
13650 __func__,
13651 job_ptr->details->dependency,
13652 job_ptr);
13653 /*
13654 * If the job isn't independent, remove pending
13655 * remote sibling jobs
13656 */
13657 if (!job_independent(job_ptr))
13658 fed_mgr_job_revoke_sibs(job_ptr);
13659 }
13660 }
13661 }
13662 if (error_code != SLURM_SUCCESS)
13663 goto fini;
13664
13665 if (job_specs->begin_time) {
13666 if (IS_JOB_PENDING(job_ptr) && detail_ptr) {
13667 char time_str[32];
13668 /*
13669 * Make sure this time is current, it does no good for
13670 * accounting to say this job could have started before
13671 * now
13672 */
13673 if (job_specs->begin_time < now)
13674 job_specs->begin_time = now;
13675
13676 if (detail_ptr->begin_time != job_specs->begin_time) {
13677 detail_ptr->begin_time = job_specs->begin_time;
13678 update_accounting = true;
13679 slurm_make_time_str(&detail_ptr->begin_time,
13680 time_str, sizeof(time_str));
13681 sched_info("%s: setting begin to %s for %pJ",
13682 __func__, time_str, job_ptr);
13683 acct_policy_remove_accrue_time(job_ptr, false);
13684 } else
13685 sched_debug("%s: new begin time identical to old begin time %pJ",
13686 __func__, job_ptr);
13687 } else {
13688 error_code = ESLURM_JOB_NOT_PENDING;
13689 goto fini;
13690 }
13691 }
13692
13693 if (valid_licenses) {
13694 if (IS_JOB_PENDING(job_ptr)) {
13695 FREE_NULL_LIST(job_ptr->license_list);
13696 job_ptr->license_list = license_list;
13697 license_list = NULL;
13698 sched_info("%s: changing licenses from '%s' to '%s' for pending %pJ",
13699 __func__, job_ptr->licenses,
13700 job_specs->licenses, job_ptr);
13701 xfree(job_ptr->licenses);
13702 job_ptr->licenses = xstrdup(job_specs->licenses);
13703 } else if (IS_JOB_RUNNING(job_ptr)) {
13704 /*
13705 * Operators can modify license counts on running jobs,
13706 * regular users can only completely remove license
13707 * counts on running jobs.
13708 */
13709 if (!operator && license_list) {
13710 sched_error("%s: Not operator user: ignore licenses change for %pJ",
13711 __func__, job_ptr);
13712 error_code = ESLURM_ACCESS_DENIED;
13713 goto fini;
13714 }
13715
13716 /*
13717 * NOTE: This can result in oversubscription of
13718 * licenses
13719 */
13720 license_job_return(job_ptr);
13721 FREE_NULL_LIST(job_ptr->license_list);
13722 job_ptr->license_list = license_list;
13723 license_list = NULL;
13724 sched_info("%s: changing licenses from '%s' to '%s' for running %pJ",
13725 __func__, job_ptr->licenses,
13726 job_specs->licenses, job_ptr);
13727 xfree(job_ptr->licenses);
13728 job_ptr->licenses = xstrdup(job_specs->licenses);
13729 license_job_get(job_ptr);
13730 } else {
13731 /*
13732 * licenses are valid, but job state or user not
13733 * allowed to make changes
13734 */
13735 sched_info("%s: could not change licenses for %pJ",
13736 __func__, job_ptr);
13737 error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
13738 FREE_NULL_LIST(license_list);
13739 }
13740
13741 update_accounting = true;
13742 }
13743 if (error_code != SLURM_SUCCESS)
13744 goto fini;
13745
13746 fail_reason = job_limits_check(&job_ptr, false);
13747 if (fail_reason != WAIT_NO_REASON) {
13748 if (fail_reason == WAIT_QOS_THRES)
13749 error_code = ESLURM_QOS_THRES;
13750 else if ((fail_reason == WAIT_PART_TIME_LIMIT) ||
13751 (fail_reason == WAIT_PART_NODE_LIMIT) ||
13752 (fail_reason == WAIT_PART_DOWN) ||
13753 (fail_reason == WAIT_HELD))
13754 error_code = SLURM_SUCCESS;
13755 else
13756 error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
13757
13758 if (error_code != SLURM_SUCCESS) {
13759 if ((job_ptr->state_reason != WAIT_HELD) &&
13760 (job_ptr->state_reason != WAIT_HELD_USER) &&
13761 (job_ptr->state_reason != WAIT_RESV_DELETED)) {
13762 job_ptr->state_reason = fail_reason;
13763 xfree(job_ptr->state_desc);
13764 }
13765 goto fini;
13766 }
13767 } else if ((job_ptr->state_reason != WAIT_HELD)
13768 && (job_ptr->state_reason != WAIT_HELD_USER)
13769 && (job_ptr->state_reason != WAIT_RESV_DELETED)
13770 /*
13771 * A job update can come while the prolog is running.
13772 * Don't change state_reason if the prolog is running.
13773 * _is_prolog_finished() relies on state_reason==WAIT_PROLOG
13774 * to know if the prolog is running. If we change it here,
13775 * then slurmctld will think that the prolog isn't running
13776 * anymore and _slurm_rpc_job_ready will tell srun that the
13777 * prolog is done even if it isn't. Then srun can launch a
13778 * job step before the prolog is done, which breaks the
13779 * behavior of PrologFlags=alloc and means that the job step
13780 * could launch before the extern step sets up x11.
13781 */
13782 && (job_ptr->state_reason != WAIT_PROLOG)
13783 && (job_ptr->state_reason != WAIT_MAX_REQUEUE)) {
13784 job_ptr->state_reason = WAIT_NO_REASON;
13785 xfree(job_ptr->state_desc);
13786 }
13787
13788 if (job_specs->reboot != NO_VAL16) {
13789 if (!validate_super_user(uid)) {
13790 error("%s: Attempt to change reboot for %pJ",
13791 __func__, job_ptr);
13792 error_code = ESLURM_ACCESS_DENIED;
13793 } else if (!IS_JOB_PENDING(job_ptr)) {
13794 error_code = ESLURM_JOB_NOT_PENDING;
13795 goto fini;
13796 } else {
13797 sched_info("%s: setting reboot to %u for %pJ",
13798 __func__, job_specs->reboot, job_ptr);
13799 if (job_specs->reboot == 0)
13800 job_ptr->reboot = 0;
13801 else
13802 job_ptr->reboot = MAX(1, job_specs->reboot);
13803 }
13804 }
13805
13806 if (job_specs->network && !xstrcmp(job_specs->network,
13807 job_ptr->network)) {
13808 sched_debug("%s: new network identical to old network %s",
13809 __func__, job_ptr->network);
13810 } else if (job_specs->network) {
13811 xfree(job_ptr->network);
13812 if (!strlen(job_specs->network)
13813 || !xstrcmp(job_specs->network, "none")) {
13814 sched_info("%s: clearing Network option for %pJ",
13815 __func__, job_ptr);
13816 } else {
13817 job_ptr->network = xstrdup(job_specs->network);
13818 sched_info("%s: setting Network to %s for %pJ",
13819 __func__, job_ptr->network, job_ptr);
13820 select_g_select_jobinfo_set(
13821 job_ptr->select_jobinfo,
13822 SELECT_JOBDATA_NETWORK,
13823 job_ptr->network);
13824 }
13825 }
13826
13827 if (job_specs->fed_siblings_viable) {
13828 if (!job_ptr->fed_details) {
13829 error_code = ESLURM_JOB_NOT_FEDERATED;
13830 goto fini;
13831 }
13832
13833 info("%s: setting fed_siblings from %"PRIu64" to %"PRIu64" for %pJ",
13834 __func__, job_ptr->fed_details->siblings_viable,
13835 job_specs->fed_siblings_viable, job_ptr);
13836
13837 job_ptr->fed_details->siblings_viable =
13838 job_specs->fed_siblings_viable;
13839 update_job_fed_details(job_ptr);
13840 }
13841
13842 if (job_specs->cpus_per_tres) {
13843 if (!valid_tres_cnt(job_specs->cpus_per_tres)) {
13844 error_code = ESLURM_INVALID_TRES;
13845 goto fini;
13846 }
13847 xfree(job_ptr->cpus_per_tres);
13848 if (!strlen(job_specs->cpus_per_tres)) {
13849 sched_info("%s: clearing CpusPerTres option for %pJ",
13850 __func__, job_ptr);
13851 } else {
13852 job_ptr->cpus_per_tres =
13853 xstrdup(job_specs->cpus_per_tres);
13854 sched_info("%s: setting CpusPerTres to %s for %pJ",
13855 __func__, job_ptr->cpus_per_tres, job_ptr);
13856 }
13857 }
13858
13859 if (job_specs->mem_per_tres) {
13860 if (!valid_tres_cnt(job_specs->mem_per_tres)) {
13861 error_code = ESLURM_INVALID_TRES;
13862 goto fini;
13863 }
13864 xfree(job_ptr->mem_per_tres);
13865 if (!strlen(job_specs->mem_per_tres)) {
13866 sched_info("%s: clearing MemPerTres option for %pJ",
13867 __func__, job_ptr);
13868 } else {
13869 job_ptr->mem_per_tres =
13870 xstrdup(job_specs->mem_per_tres);
13871 sched_info("%s: setting MemPerTres to %s for %pJ",
13872 __func__, job_ptr->mem_per_tres, job_ptr);
13873 }
13874 }
13875
13876 if (job_specs->tres_bind) {
13877 if (tres_bind_verify_cmdline(job_specs->tres_bind)) {
13878 error_code = ESLURM_INVALID_TRES;
13879 goto fini;
13880 }
13881 xfree(job_ptr->tres_bind);
13882 if (!strlen(job_specs->tres_bind)) {
13883 sched_info("%s: clearing TresBind option for %pJ",
13884 __func__, job_ptr);
13885 } else {
13886 job_ptr->tres_bind = xstrdup(job_specs->tres_bind);
13887 sched_info("%s: setting TresBind to %s for %pJ",
13888 __func__, job_ptr->tres_bind, job_ptr);
13889 }
13890 }
13891
13892 if (job_specs->tres_freq) {
13893 if (tres_freq_verify_cmdline(job_specs->tres_freq)) {
13894 error_code = ESLURM_INVALID_TRES;
13895 goto fini;
13896 }
13897 xfree(job_ptr->tres_freq);
13898 if (!strlen(job_specs->tres_freq)) {
13899 sched_info("%s: clearing TresFreq option for %pJ",
13900 __func__, job_ptr);
13901 } else {
13902 job_ptr->tres_freq = xstrdup(job_specs->tres_freq);
13903 sched_info("%s: setting TresFreq to %s for %pJ",
13904 __func__, job_ptr->tres_freq, job_ptr);
13905 }
13906 }
13907
13908 if (job_specs->tres_per_job) {
13909 if (!valid_tres_cnt(job_specs->tres_per_job)) {
13910 error_code = ESLURM_INVALID_TRES;
13911 goto fini;
13912 }
13913 xfree(job_ptr->tres_per_job);
13914 if (!strlen(job_specs->tres_per_job)) {
13915 sched_info("%s: clearing TresPerJob option for %pJ",
13916 __func__, job_ptr);
13917 } else {
13918 job_ptr->tres_per_job =
13919 xstrdup(job_specs->tres_per_job);
13920 sched_info("%s: setting TresPerJob to %s for %pJ",
13921 __func__, job_ptr->tres_per_job, job_ptr);
13922 }
13923 }
13924 if (job_specs->tres_per_node) {
13925 if (!valid_tres_cnt(job_specs->tres_per_node)) {
13926 error_code = ESLURM_INVALID_TRES;
13927 goto fini;
13928 }
13929 xfree(job_ptr->tres_per_node);
13930 if (!strlen(job_specs->tres_per_node)) {
13931 sched_info("%s: clearing TresPerNode option for %pJ",
13932 __func__, job_ptr);
13933 } else {
13934 job_ptr->tres_per_node =
13935 xstrdup(job_specs->tres_per_node);
13936 sched_info("%s: setting TresPerNode to %s for %pJ",
13937 __func__, job_ptr->tres_per_node, job_ptr);
13938 }
13939 }
13940
13941 if (job_specs->tres_per_socket) {
13942 if (!valid_tres_cnt(job_specs->tres_per_socket)) {
13943 error_code = ESLURM_INVALID_TRES;
13944 goto fini;
13945 }
13946 xfree(job_ptr->tres_per_socket);
13947 if (!strlen(job_specs->tres_per_socket)) {
13948 sched_info("%s: clearing TresPerSocket option for %pJ",
13949 __func__, job_ptr);
13950 } else {
13951 job_ptr->tres_per_socket =
13952 xstrdup(job_specs->tres_per_socket);
13953 sched_info("%s: setting TresPerSocket to %s for %pJ",
13954 __func__, job_ptr->tres_per_socket, job_ptr);
13955 }
13956 }
13957
13958 if (job_specs->tres_per_task) {
13959 if (!valid_tres_cnt(job_specs->tres_per_task)) {
13960 error_code = ESLURM_INVALID_TRES;
13961 goto fini;
13962 }
13963 xfree(job_ptr->tres_per_task);
13964 if (!strlen(job_specs->tres_per_task)) {
13965 sched_info("%s: clearing TresPerTask option for %pJ",
13966 __func__, job_ptr);
13967 } else {
13968 job_ptr->tres_per_task =
13969 xstrdup(job_specs->tres_per_task);
13970 sched_info("%s: setting TresPerTask to %s for %pJ",
13971 __func__, job_ptr->tres_per_task, job_ptr);
13972 }
13973 }
13974
13975 if (job_specs->mail_type != NO_VAL16) {
13976 job_ptr->mail_type = job_specs->mail_type;
13977 sched_info("%s: setting mail_type to %u for %pJ",
13978 __func__, job_ptr->mail_type, job_ptr);
13979 }
13980
13981 if (job_specs->mail_user) {
13982 xfree(job_ptr->mail_user);
13983 job_ptr->mail_user = _get_mail_user(job_specs->mail_user,
13984 job_ptr->user_id);
13985 sched_info("%s: setting mail_user to %s for %pJ",
13986 __func__, job_ptr->mail_user, job_ptr);
13987 }
13988
13989 /*
13990 * The job submit plugin sets site_factor to NO_VAL before calling
13991 * the plugin to prevent the user from specifying it.
13992 */
13993 if (user_site_factor != NO_VAL) {
13994 if (!operator) {
13995 error("%s: Attempt to change SiteFactor for %pJ",
13996 __func__, job_ptr);
13997 error_code = ESLURM_ACCESS_DENIED;
13998 job_specs->site_factor = NO_VAL;
13999 } else
14000 job_specs->site_factor = user_site_factor;
14001 }
14002 if (job_specs->site_factor != NO_VAL) {
14003 sched_info("%s: setting AdinPrioFactor to %u for %pJ",
14004 __func__, job_specs->site_factor, job_ptr);
14005 job_ptr->site_factor = job_specs->site_factor;
14006 }
14007
14008 fini:
14009 FREE_NULL_BITMAP(new_req_bitmap);
14010 FREE_NULL_LIST(part_ptr_list);
14011
14012 if ((error_code == SLURM_SUCCESS) && tres_req_cnt_set) {
14013 for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) {
14014 if (!tres_req_cnt[tres_pos] ||
14015 (tres_req_cnt[tres_pos] ==
14016 job_ptr->tres_req_cnt[tres_pos]))
14017 continue;
14018
14019 job_ptr->tres_req_cnt[tres_pos] =
14020 tres_req_cnt[tres_pos];
14021 tres_changed = true;
14022 }
14023 if (tres_changed) {
14024 job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
14025 assoc_mgr_tres_weighted(
14026 job_ptr->tres_req_cnt,
14027 job_ptr->part_ptr->billing_weights,
14028 slurmctld_conf.priority_flags,
14029 false);
14030 set_job_tres_req_str(job_ptr, false);
14031 update_accounting = true;
14032 job_ptr->node_cnt_wag = 0;
14033 }
14034 }
14035
14036 /* This was a local variable, so set it back to NULL */
14037 job_specs->tres_req_cnt = NULL;
14038
14039 FREE_NULL_LIST(gres_list);
14040 FREE_NULL_LIST(license_list);
14041 if (update_accounting) {
14042 info("%s: updating accounting", __func__);
14043 /* Update job record in accounting to reflect changes */
14044 jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
14045 }
14046
14047 /*
14048 * If job isn't held recalculate the priority when not using
14049 * priority/basic. Since many factors of an update may affect priority
14050 * considerations. Do this whether or not the update was successful or
14051 * not.
14052 */
14053 if ((job_ptr->priority != 0) &&
14054 xstrcmp(slurmctld_conf.priority_type, "priority/basic"))
14055 set_job_prio(job_ptr);
14056
14057 if ((error_code == SLURM_SUCCESS) &&
14058 fed_mgr_fed_rec &&
14059 job_ptr->fed_details && fed_mgr_is_origin_job(job_ptr)) {
14060 /* Send updates to sibling jobs */
14061 /* Add the siblings_active to be updated. They could have been
14062 * updated if the job's ClusterFeatures were updated. */
14063 job_specs->fed_siblings_viable =
14064 job_ptr->fed_details->siblings_viable;
14065 fed_mgr_update_job(job_ptr->job_id, job_specs,
14066 job_ptr->fed_details->siblings_active, uid);
14067 }
14068
14069 return error_code;
14070 }
14071
14072 /*
14073 * update_job - update a job's parameters per the supplied specifications
14074 * IN msg - RPC to update job, including change specification
14075 * IN uid - uid of user issuing RPC
14076 * IN send_msg - whether to send msg back or not
14077 * RET returns an error code from slurm_errno.h
14078 * global: job_list - global list of job entries
14079 * last_job_update - time of last job table update
14080 */
extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg)
{
	job_desc_msg_t *job_desc = (job_desc_msg_t *) msg->data;
	char *auth_host = g_slurm_auth_get_host(msg->auth_cred);
	job_record_t *job_ptr = NULL;
	int error_code;

	/* Rebuild the string form of the job id from the numeric id */
	xfree(job_desc->job_id_str);
	xstrfmtcat(job_desc->job_id_str, "%u", job_desc->job_id);

	if (auth_host) {
		/* Use the authenticated origin host as the allocating node */
		xfree(job_desc->alloc_node);
		job_desc->alloc_node = auth_host;
	}

	job_ptr = find_job_record(job_desc->job_id);
	if (!job_ptr) {
		info("%s: JobId=%u does not exist",
		     __func__, job_desc->job_id);
		error_code = ESLURM_INVALID_JOB_ID;
	} else {
		/*
		 * For a job array "meta" record, hand _update_job() a copy of
		 * the pending-task bitmap so it can apply array-wide changes.
		 */
		if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap)
			job_desc->array_bitmap =
				bit_copy(job_ptr->array_recs->task_id_bitmap);

		error_code = _update_job(job_ptr, job_desc, uid);
	}

	/* DB index updates are resolved asynchronously; don't reply yet */
	if (send_msg && (error_code != ESLURM_JOB_SETTING_DB_INX))
		slurm_send_rc_msg(msg, error_code);
	xfree(job_desc->job_id_str);

	return error_code;
}
14114
14115 /*
14116 * IN msg - RPC to update job, including change specification
14117 * IN job_specs - a job's specification
14118 * IN uid - uid of user issuing RPC
14119 * RET returns an error code from slurm_errno.h
14120 * global: job_list - global list of job entries
14121 * last_job_update - time of last job table update
14122 */
extern int update_job_str(slurm_msg_t *msg, uid_t uid)
{

	slurm_msg_t resp_msg;
	job_desc_msg_t *job_specs = (job_desc_msg_t *) msg->data;
	job_record_t *job_ptr, *new_job_ptr, *het_job;
	char *hostname = g_slurm_auth_get_host(msg->auth_cred);
	ListIterator iter;
	long int long_id;
	uint32_t job_id = 0, het_job_offset;
	bitstr_t *array_bitmap = NULL, *tmp_bitmap;
	bool valid = true;
	int32_t i, i_first, i_last;
	int len, rc = SLURM_SUCCESS, rc2;
	char *end_ptr, *tok, *tmp = NULL;
	char *job_id_str;
	resp_array_struct_t *resp_array = NULL;
	job_array_resp_msg_t *resp_array_msg = NULL;
	return_code_msg_t rc_msg;

	job_id_str = job_specs->job_id_str;

	if (hostname) {
		/* Use the authenticated origin host as the allocating node */
		xfree(job_specs->alloc_node);
		job_specs->alloc_node = hostname;

	}

	if (max_array_size == NO_VAL)
		max_array_size = slurmctld_conf.max_array_sz;

	/*
	 * Parse the leading numeric job id. Accepted suffixes: none (plain
	 * job or whole array), "_<tasks>" (array task list), or
	 * "+<offset>" (heterogeneous job component).
	 */
	long_id = strtol(job_id_str, &end_ptr, 10);
	if ((long_id <= 0) || (long_id == LONG_MAX) ||
	    ((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
	     (end_ptr[0] != '+'))) {
		info("%s: invalid JobId=%s", __func__, job_id_str);
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}
	job_id = (uint32_t) long_id;
	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
		job_record_t *job_ptr_done = NULL;
		job_ptr = find_job_record(job_id);
		if (job_ptr && job_ptr->het_job_list) {
			/* Hetjob leader: apply the update to every component */
			iter = list_iterator_create(job_ptr->het_job_list);
			while ((het_job = list_next(iter))) {
				if (job_ptr->het_job_id !=
				    het_job->het_job_id) {
					error("%s: Bad het_job_list for %pJ",
					      __func__, job_ptr);
					continue;
				}
				rc = _update_job(het_job, job_specs, uid);
			}
			list_iterator_destroy(iter);
			goto reply;
		}
		if (job_ptr &&
		    (((job_ptr->array_task_id == NO_VAL) &&
		      (job_ptr->array_recs == NULL)) ||
		     ((job_ptr->array_task_id != NO_VAL) &&
		      (job_ptr->array_job_id != job_id)))) {
			/* This is a regular job or single task of job array */
			rc = _update_job(job_ptr, job_specs, uid);
			goto reply;
		}

		if (job_ptr && job_ptr->array_recs) {
			/* This is a job array */
			job_ptr_done = job_ptr;
			if (job_ptr->array_recs->task_id_bitmap)
				job_specs->array_bitmap = bit_copy(
					job_ptr->array_recs->task_id_bitmap);
			rc2 = _update_job(job_ptr, job_specs, uid);
			if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
				rc = rc2;
				goto reply;
			}
			_resp_array_add(&resp_array, job_ptr, rc2);
		}

		/* Update all tasks of this job array */
		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
		if (!job_ptr && !job_ptr_done) {
			info("%s: invalid JobId=%u", __func__, job_id);
			rc = ESLURM_INVALID_JOB_ID;
			goto reply;
		}
		/* Walk the hash chain of already-split array task records */
		while (job_ptr) {
			if ((job_ptr->array_job_id == job_id) &&
			    (job_ptr != job_ptr_done)) {
				rc2 = _update_job(job_ptr, job_specs, uid);
				if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
					rc = rc2;
					goto reply;
				}
				_resp_array_add(&resp_array, job_ptr, rc2);
			}
			job_ptr = job_ptr->job_array_next_j;
		}
		goto reply;
	} else if (end_ptr[0] == '+') {	/* Hetjob element */
		long_id = strtol(end_ptr+1, &tmp, 10);
		if ((long_id < 0) || (long_id == LONG_MAX) ||
		    (tmp[0] != '\0')) {
			info("%s: invalid JobId=%s", __func__, job_id_str);
			rc = ESLURM_INVALID_JOB_ID;
			goto reply;
		}
		het_job_offset = (uint32_t) long_id;
		job_ptr = find_het_job_record(job_id, het_job_offset);
		if (!job_ptr) {
			info("%s: invalid JobId=%u", __func__, job_id);
			rc = ESLURM_INVALID_JOB_ID;
			goto reply;
		}
		rc = _update_job(job_ptr, job_specs, uid);
		goto reply;
	}

	/*
	 * "jobid_taskspec" form: build a bitmap of the requested array
	 * task ids from the comma-separated token list after the '_'.
	 */
	array_bitmap = bit_alloc(max_array_size);
	tmp = xstrdup(end_ptr + 1);
	tok = strtok_r(tmp, ",", &end_ptr);
	while (tok && valid) {
		valid = _parse_array_tok(tok, array_bitmap,
					 max_array_size);
		tok = strtok_r(NULL, ",", &end_ptr);
	}
	xfree(tmp);
	if (valid) {
		i_last = bit_fls(array_bitmap);
		if (i_last < 0)
			valid = false;
	}
	if (!valid) {
		info("%s: invalid JobId=%s", __func__, job_id_str);
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}

	job_ptr = find_job_record(job_id);
	if (job_ptr && IS_JOB_PENDING(job_ptr) &&
	    job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
		/* Ensure bitmap sizes match for AND operations */
		len = bit_size(job_ptr->array_recs->task_id_bitmap);
		i_last++;
		if (i_last < len) {
			array_bitmap = bit_realloc(array_bitmap, len);
		} else {
			array_bitmap = bit_realloc(array_bitmap, i_last);
			job_ptr->array_recs->task_id_bitmap = bit_realloc(
				job_ptr->array_recs->task_id_bitmap, i_last);
		}
		if (!bit_overlap_any(job_ptr->array_recs->task_id_bitmap,
				     array_bitmap)) {
			/* Nothing to do with this job record */
		} else if (bit_super_set(job_ptr->array_recs->task_id_bitmap,
					 array_bitmap)) {
			/* Update the record with all pending tasks */
			job_specs->array_bitmap =
				bit_copy(job_ptr->array_recs->task_id_bitmap);
			rc2 = _update_job(job_ptr, job_specs, uid);
			if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
				rc = rc2;
				goto reply;
			}
			_resp_array_add(&resp_array, job_ptr, rc2);
			/* Those tasks are handled; don't touch them below */
			bit_and_not(array_bitmap, job_specs->array_bitmap);
		} else {
			/* Need to split out tasks to separate job records */
			tmp_bitmap = bit_copy(job_ptr->array_recs->
					      task_id_bitmap);
			bit_and(tmp_bitmap, array_bitmap);
			i_first = bit_ffs(tmp_bitmap);
			if (i_first >= 0)
				i_last = bit_fls(tmp_bitmap);
			else
				i_last = -2;
			for (i = i_first; i <= i_last; i++) {
				if (!bit_test(tmp_bitmap, i))
					continue;
				job_ptr->array_task_id = i;
				new_job_ptr = job_array_split(job_ptr);
				if (!new_job_ptr) {
					error("%s: Unable to copy record for %pJ",
					      __func__, job_ptr);
				} else {
					/* The array_recs structure is moved
					 * to the new job record copy */
					bb_g_job_validate2(job_ptr, NULL);
					job_ptr = new_job_ptr;
				}
			}
			FREE_NULL_BITMAP(tmp_bitmap);
		}
	}

	/* Update each remaining requested task that has its own record */
	i_first = bit_ffs(array_bitmap);
	if (i_first >= 0)
		i_last = bit_fls(array_bitmap);
	else
		i_last = -2;
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(array_bitmap, i))
			continue;
		job_ptr = find_job_array_rec(job_id, i);
		if (job_ptr == NULL) {
			info("%s: invalid JobId=%u_%d", __func__, job_id, i);
			_resp_array_add_id(&resp_array, job_id, i,
					   ESLURM_INVALID_JOB_ID);
			continue;
		}

		rc2 = _update_job(job_ptr, job_specs, uid);
		if (rc2 == ESLURM_JOB_SETTING_DB_INX) {
			rc = rc2;
			goto reply;
		}
		_resp_array_add(&resp_array, job_ptr, rc2);
	}

reply:
	/*
	 * Send either a per-task error array (job array update) or a plain
	 * return code. ESLURM_JOB_SETTING_DB_INX means "retry later" and is
	 * answered after the DB index is resolved, not here.
	 */
	if ((rc != ESLURM_JOB_SETTING_DB_INX) && (msg->conn_fd >= 0)) {
		slurm_msg_t_init(&resp_msg);
		resp_msg.protocol_version = msg->protocol_version;
		if (resp_array) {
			resp_array_msg = _resp_array_xlate(resp_array, job_id);
			resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS;
			resp_msg.data = resp_array_msg;
		} else {
			resp_msg.msg_type = RESPONSE_SLURM_RC;
			rc_msg.return_code = rc;
			resp_msg.data = &rc_msg;
		}
		resp_msg.conn = msg->conn;
		slurm_send_node_msg(msg->conn_fd, &resp_msg);

		if (resp_array_msg) {
			slurm_free_job_array_resp(resp_array_msg);
			resp_msg.data = NULL;
		}
	}
	_resp_array_free(resp_array);

	FREE_NULL_BITMAP(array_bitmap);

	return rc;
}
14371
/*
 * Queue a REQUEST_TERMINATE_JOB message to every node still allocated to
 * the job (or to its front-end node when built with HAVE_FRONT_END).
 *
 * Fix: the original allocated and populated kill_job_msg_t before checking
 * whether any node would receive it; on the zero-node early return it freed
 * only kill_job->nodes and kill_job itself, leaking job_gres_info, the
 * select_jobinfo copy and the duplicated spank_job_env array. Build the
 * message only after a recipient is known.
 */
static void _send_job_kill(job_record_t *job_ptr)
{
	kill_job_msg_t *kill_job = NULL;
	agent_arg_t *agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	int i;
	node_record_t *node_ptr;
#endif

	xassert(job_ptr);
	xassert(job_ptr->details);

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_TERMINATE_JOB;
	agent_args->retry = 0;	/* re_kill_job() resends as needed */
	agent_args->hostlist = hostlist_create(NULL);
	last_node_update = time(NULL);

	/* First collect the recipients so we know if a message is needed */
#ifdef HAVE_FRONT_END
	if (job_ptr->batch_host &&
	    (front_end_ptr = job_ptr->front_end_ptr)) {
		agent_args->protocol_version = front_end_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
		agent_args->node_count++;
	}
#else
	if (!job_ptr->node_bitmap_cg)
		build_cg_bitmap(job_ptr);
	/* Use the lowest protocol version of any node in the allocation */
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (!bit_test(job_ptr->node_bitmap_cg, i))
			continue;
		if (agent_args->protocol_version > node_ptr->protocol_version)
			agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}
#endif
	if (agent_args->node_count == 0) {
		/* Expanding jobs can legitimately have no nodes to kill */
		if (job_ptr->details->expanding_jobid == 0) {
			error("%s: %pJ allocated no nodes to be killed on",
			      __func__, job_ptr);
		}
		hostlist_destroy(agent_args->hostlist);
		xfree(agent_args);
		return;
	}

	/* Build the terminate message only now that we have recipients */
	kill_job = xmalloc(sizeof(kill_job_msg_t));
	kill_job->job_gres_info =
		gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes);
	kill_job->job_id = job_ptr->job_id;
	kill_job->het_job_id = job_ptr->het_job_id;
	kill_job->step_id = NO_VAL;	/* NO_VAL: kill all steps of the job */
	kill_job->job_state = job_ptr->job_state;
	kill_job->job_uid = job_ptr->user_id;
	kill_job->job_gid = job_ptr->group_id;
	kill_job->nodes = xstrdup(job_ptr->nodes);
	kill_job->time = time(NULL);
	kill_job->start_time = job_ptr->start_time;
	kill_job->select_jobinfo = select_g_select_jobinfo_copy(
		job_ptr->select_jobinfo);
	kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size,
					    job_ptr->spank_job_env);
	kill_job->spank_job_env_size = job_ptr->spank_job_env_size;

	/* agent takes ownership of agent_args and kill_job */
	agent_args->msg_args = kill_job;
	agent_queue_request(agent_args);
	return;
}
14447
14448 /* Record accounting information for a job immediately before changing size */
/* Record accounting information for a job immediately before changing size */
extern void job_pre_resize_acctg(job_record_t *job_ptr)
{
	/*
	 * If we don't have a db_index, go ahead and start this job in the
	 * database now, since when running with slurmDBD the job record
	 * may not have been started there yet.
	 */

	if ((!job_ptr->db_index || job_ptr->db_index == NO_VAL64)
	    && !job_ptr->resize_time)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/* Mark as resizing so the completion below is logged as a resize,
	 * not a true job termination */
	job_ptr->job_state |= JOB_RESIZING;
	/* NOTE: job_completion_logger() calls
	 * acct_policy_remove_job_submit() */
	job_completion_logger(job_ptr, false);

	/* This doesn't happen in job_completion_logger, but gets
	 * added back in with job_post_resize_acctg so remove it here. */
	acct_policy_job_fini(job_ptr);

	/* NOTE: The RESIZING flag set above must be cleared by a matching
	 * call to job_post_resize_acctg() */
}
14471
14472 /* Record accounting information for a job immediately after changing size */
/* Record accounting information for a job immediately after changing size */
extern void job_post_resize_acctg(job_record_t *job_ptr)
{
	/* Preserve the real submit time; it is temporarily overwritten
	 * below so accounting records the resize boundary, then restored. */
	time_t org_submit = job_ptr->details->submit_time;

	/*
	 * NOTE: The RESIZING FLAG needed to be set with job_pre_resize_acctg()
	 * the assert is here to make sure we code it that way.
	 */
	xassert(IS_JOB_RESIZING(job_ptr));
	acct_policy_add_job_submit(job_ptr);
	/* job_set_alloc_tres() must be called before acct_policy_job_begin() */
	job_set_alloc_tres(job_ptr, false);
	acct_policy_job_begin(job_ptr);
	job_claim_resv(job_ptr);

	/* For a previously-resized job, account the new record from the
	 * time of the prior resize rather than the original submit time */
	if (job_ptr->resize_time)
		job_ptr->details->submit_time = job_ptr->resize_time;

	job_ptr->resize_time = time(NULL);

	/* FIXME: see if this can be changed to job_start_direct() */
	jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/* Restore the true submit time and drop the transient resize state */
	job_ptr->details->submit_time = org_submit;
	job_ptr->job_state &= (~JOB_RESIZING);

	/*
	 * Reset the end_time_exp that was probably set to NO_VAL when
	 * ending the job on the resize. If using the
	 * priority/multifactor plugin if the end_time_exp is NO_VAL
	 * it will not run again for the job.
	 */
	job_ptr->end_time_exp = job_ptr->end_time;

	/*
	 * If a job is resized, the core bitmap will differ in the step.
	 * See rebuild_step_bitmaps(). The problem will go away when we have
	 * per-node core bitmaps. For now just set a flag that the job was
	 * resized while there were active job steps.
	 */
	if (job_ptr->step_list && (list_count(job_ptr->step_list) > 0))
		job_ptr->bit_flags |= JOB_RESIZED;
}
14516
_build_step_id(char * buf,int buf_len,uint32_t step_id)14517 static char *_build_step_id(char *buf, int buf_len, uint32_t step_id)
14518 {
14519 if (step_id == SLURM_BATCH_SCRIPT)
14520 snprintf(buf, buf_len, "StepId=Batch");
14521 else
14522 snprintf(buf, buf_len, "StepId=%u", step_id);
14523 return buf;
14524 }
14525
/*
 * validate_jobs_on_node - validate that any jobs that should be on the node
 *	are actually running, if not clean up the job records and/or node
 *	records.
 * IN reg_msg - node registration message
 */
extern void
validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
{
	int i, node_inx, jobs_on_node;
	node_record_t *node_ptr;
	job_record_t *job_ptr;
	step_record_t *step_ptr;
	char step_str[64];
	time_t now = time(NULL);

	node_ptr = find_node_record(reg_msg->node_name);
	if (node_ptr == NULL) {
		error("slurmd registered on unknown node %s",
		      reg_msg->node_name);
		return;
	}

	/* Capture the node's latest energy accounting data, if reported */
	if (reg_msg->energy)
		memcpy(node_ptr->energy, reg_msg->energy,
		       sizeof(acct_gather_energy_t));

	/* A smaller up_time than previously recorded implies a reboot */
	if (node_ptr->up_time > reg_msg->up_time) {
		verbose("Node %s rebooted %u secs ago",
			reg_msg->node_name, reg_msg->up_time);
	}

	/* Record boot/start times, rejecting an up_time in the future */
	if (reg_msg->up_time <= now) {
		node_ptr->up_time = reg_msg->up_time;
		node_ptr->boot_time = now - reg_msg->up_time;
		node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
	} else {
		error("Node up_time is invalid: %u>%u", reg_msg->up_time,
		      (uint32_t) now);
	}

	/* Defer job validation until the node finishes rebooting */
	if (waiting_for_node_boot(node_ptr))
		return;

	node_inx = node_ptr - node_record_table_ptr;

	/* Check that jobs running are really supposed to be there */
	for (i = 0; i < reg_msg->job_count; i++) {
		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
			/* NoAllocate jobs have no slurmctld job record */
			info("NoAllocate JobId=%u %s reported on node %s",
			     reg_msg->job_id[i],
			     _build_step_id(step_str, sizeof(step_str),
					    reg_msg->step_id[i]),
			     reg_msg->node_name);
			continue;
		}

		job_ptr = find_job_record(reg_msg->job_id[i]);
		if (job_ptr == NULL) {
			/* No job record: likely stale state on the node
			 * (e.g. after a slurmctld cold start); queue an
			 * abort request for it (NULL job_ptr is OK here) */
			error("Orphan JobId=%u %s reported on node %s",
			      reg_msg->job_id[i],
			      _build_step_id(step_str, sizeof(step_str),
					     reg_msg->step_id[i]),
			      reg_msg->node_name);
			abort_job_on_node(reg_msg->job_id[i],
					  job_ptr, node_ptr->name);
		}

		else if (IS_JOB_RUNNING(job_ptr) ||
			 IS_JOB_SUSPENDED(job_ptr)) {
			if (bit_test(job_ptr->node_bitmap, node_inx)) {
				/* Job legitimately on this node; refresh
				 * activity timestamps */
				if ((job_ptr->batch_flag) &&
				    (node_inx == bit_ffs(
						job_ptr->node_bitmap))) {
					/* NOTE: Used for purging defunct
					 * batch jobs */
					job_ptr->time_last_active = now;
				}
				step_ptr = find_step_record(job_ptr,
							    reg_msg->
							    step_id[i]);
				if (step_ptr)
					step_ptr->time_last_active = now;
				debug3("Registered %pS on node %s",
				       step_ptr, reg_msg->node_name);
			} else {
				/* Typically indicates a job requeue and
				 * restart on another nodes. A node from the
				 * original allocation just responded here. */
				error("Registered %pJ %s on wrong node %s",
				      job_ptr,
				      _build_step_id(step_str,
						     sizeof(step_str),
						     reg_msg->step_id[i]),
				      reg_msg->node_name);
				info("%s: job nodes %s count %d inx %d",
				     __func__, job_ptr->nodes,
				     job_ptr->node_cnt, node_inx);
				abort_job_on_node(reg_msg->job_id[i], job_ptr,
						  node_ptr->name);
			}
		}

		else if (IS_JOB_COMPLETING(job_ptr)) {
			/*
			 * Re-send kill request as needed,
			 * not necessarily an error
			 */
			kill_job_on_node(job_ptr, node_ptr);
		}

		else if (IS_JOB_PENDING(job_ptr)) {
			/* Typically indicates a job requeue and the hung
			 * slurmd that went DOWN is now responding */
			error("Registered PENDING %pJ %s on node %s",
			      job_ptr,
			      _build_step_id(step_str, sizeof(step_str),
					     reg_msg->step_id[i]),
			      reg_msg->node_name);
			abort_job_on_node(reg_msg->job_id[i],
					  job_ptr, node_ptr->name);
		}

		else if (difftime(now, job_ptr->end_time) <
			 slurm_get_msg_timeout()) {	/* Race condition */
			/* Job just completed; registration crossed paths
			 * with the completion RPC */
			debug("Registered newly completed %pJ %s on %s",
			      job_ptr,
			      _build_step_id(step_str, sizeof(step_str),
					     reg_msg->step_id[i]),
			      node_ptr->name);
		}

		else {		/* else job is supposed to be done */
			error("Registered %pJ %s in state %s on node %s",
			      job_ptr,
			      _build_step_id(step_str, sizeof(step_str),
					     reg_msg->step_id[i]),
			      job_state_string(job_ptr->job_state),
			      reg_msg->node_name);
			kill_job_on_node(job_ptr, node_ptr);
		}
	}

	/* Look for jobs that should be on this node but were not reported */
	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
	if (jobs_on_node)
		_purge_missing_jobs(node_inx, now);

	if (jobs_on_node != reg_msg->job_count) {
		/* slurmd will not know of a job unless the job has
		 * steps active at registration time, so this is not
		 * an error condition, slurmd is also reporting steps
		 * rather than jobs */
		debug3("resetting job_count on node %s from %u to %d",
		       reg_msg->node_name, reg_msg->job_count, jobs_on_node);
		reg_msg->job_count = jobs_on_node;
	}

	return;
}
14687
/* Purge any batch job that should have its script running on node
 * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds
 * for startup.
 *
 * Purge all job steps that were started before the node was last booted.
 *
 * Also notify srun if any job steps should be active on this node
 * but are not found. */
static void _purge_missing_jobs(int node_inx, time_t now)
{
	ListIterator job_iterator;
	job_record_t *job_ptr;
	node_record_t *node_ptr = node_record_table_ptr + node_inx;
	uint16_t batch_start_timeout = slurm_get_batch_start_timeout();
	uint16_t msg_timeout = slurm_get_msg_timeout();
	uint16_t resume_timeout = slurm_get_resume_timeout();
	uint32_t suspend_time = slurm_get_suspend_time();
	time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time;

	if (node_ptr->boot_time > (msg_timeout + 5)) {
		/* allow for message timeout and other delays */
		node_boot_time = node_ptr->boot_time - (msg_timeout + 5);
	}
	/* Latest start time a batch job may have while still being
	 * expected to have its script running on the node by now */
	batch_startup_time = now - batch_start_timeout;
	batch_startup_time -= MIN(DEFAULT_MSG_TIMEOUT, msg_timeout);

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		/* Only consider RUNNING/SUSPENDED (and not still
		 * configuring) jobs allocated to this node */
		if ((IS_JOB_CONFIGURING(job_ptr) ||
		    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) ||
		    (!bit_test(job_ptr->node_bitmap, node_inx)))
			continue;
		if ((job_ptr->batch_flag != 0) &&
		    (suspend_time != 0) /* power mgmt on */ &&
		    (job_ptr->start_time < node_boot_time)) {
			/* Node may have been powered up for this job;
			 * grant extra ResumeTimeout before declaring the
			 * batch script missing */
			startup_time = batch_startup_time - resume_timeout;
		} else
			startup_time = batch_startup_time;

		if ((job_ptr->batch_flag != 0) &&
		    (job_ptr->het_job_offset == 0) &&
		    (job_ptr->time_last_active < startup_time) &&
		    (job_ptr->start_time < startup_time) &&
		    (node_ptr == find_node_record(job_ptr->batch_host))) {
			/* Batch script vanished from its launch node;
			 * complete (and possibly requeue) the job */
			bool requeue = false;
			char *requeue_msg = "";
			if (job_ptr->details && job_ptr->details->requeue) {
				requeue = true;
				requeue_msg = ", Requeuing job";
			}
			info("Batch %pJ missing from batch node %s (not found BatchStartTime after startup)%s",
			     job_ptr, job_ptr->batch_host, requeue_msg);
			job_ptr->exit_code = 1;
			job_complete(job_ptr->job_id,
				     slurmctld_conf.slurm_user_id,
				     requeue, true, NO_VAL);
		} else {
			_notify_srun_missing_step(job_ptr, node_inx,
						  now, node_boot_time);
		}
	}
	list_iterator_destroy(job_iterator);
}
14751
/*
 * Handle job steps expected on node node_inx but not reported by its
 * slurmd registration: notify srun so it can verify/kill the step, or
 * kill steps that predate the node's last reboot.
 * IN job_ptr - job allocated to this node
 * IN node_inx - index of the registering node
 * IN now - current time
 * IN node_boot_time - node boot time, adjusted for message delays
 */
static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx,
				      time_t now, time_t node_boot_time)
{
	ListIterator step_iterator;
	step_record_t *step_ptr;
	char *node_name = node_record_table_ptr[node_inx].name;

	xassert(job_ptr);
	step_iterator = list_iterator_create (job_ptr->step_list);
	while ((step_ptr = list_next(step_iterator))) {
		/* Skip special (extern/batch) steps and steps that are
		 * not currently running */
		if ((step_ptr->step_id == SLURM_EXTERN_CONT) ||
		    (step_ptr->step_id == SLURM_BATCH_SCRIPT) ||
		    (step_ptr->state != JOB_RUNNING))
			continue;
		if (!bit_test(step_ptr->step_node_bitmap, node_inx))
			continue;
		if (step_ptr->time_last_active >= now) {
			/* Back up timer in case more than one node
			 * registration happens at this same time.
			 * We don't want this node's registration
			 * to count toward a different node's
			 * registration message. */
			step_ptr->time_last_active = now - 1;
		} else if (step_ptr->host && step_ptr->port) {
			/* srun may be able to verify step exists on
			 * this node using I/O sockets and kill the
			 * job as needed */
			srun_step_missing(step_ptr, node_name);
		} else if ((step_ptr->start_time < node_boot_time) &&
			   (step_ptr->no_kill == 0)) {
			/* There is a risk that the job step's tasks completed
			 * on this node before its reboot, but that should be
			 * very rare and there is no srun to work with (POE) */
			info("Node %s rebooted, killing missing step %u.%u",
			     node_name, job_ptr->job_id, step_ptr->step_id);
			signal_step_tasks_on_node(node_name, step_ptr, SIGKILL,
						  REQUEST_TERMINATE_TASKS);
		}
	}
	list_iterator_destroy (step_iterator);
}
14793
/*
 * abort_job_on_node - Kill the specific job_id on a specific node,
 *	the request is not processed immediately, but queued.
 *	This is to prevent a flood of pthreads if slurmctld restarts
 *	without saved state and slurmd daemons register with a
 *	multitude of running jobs. Slurmctld will not recognize
 *	these jobs and use this function to kill them - one
 *	agent request per node as they register.
 * IN job_id - id of the job to be killed
 * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. job reported
 *		by slurmd on some node, but job records already purged from
 *		slurmctld)
 * IN node_name - name of the node on which the job resides
 */
extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr,
			      char *node_name)
{
	agent_arg_t *agent_info;
	kill_job_msg_t *kill_req;

	/* Build the REQUEST_ABORT_JOB message payload */
	kill_req = xmalloc(sizeof(kill_job_msg_t));
	kill_req->job_id	= job_id;
	kill_req->step_id	= NO_VAL;
	kill_req->time          = time(NULL);
	kill_req->nodes	        = xstrdup(node_name);
	if (job_ptr) {  /* NULL if unknown */
		/* Fill in job-specific fields (GRES epilog env, hetjob
		 * ID, spank env, ...) only when the record still exists */
		kill_req->job_gres_info	=
			gres_plugin_epilog_build_env(job_ptr->gres_list,
						     job_ptr->nodes);
		kill_req->het_job_id	= job_ptr->het_job_id;
		kill_req->start_time = job_ptr->start_time;
		kill_req->select_jobinfo =
			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
		kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
						    job_ptr->spank_job_env);
		kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
	} else {
		/* kill_req->start_time = 0;  Default value */
	}

	/* Queue a single-node agent request; no retry, since the node
	 * will re-report the job at its next registration if needed */
	agent_info = xmalloc(sizeof(agent_arg_t));
	agent_info->node_count	= 1;
	agent_info->retry	= 0;
	agent_info->hostlist	= hostlist_create(node_name);
#ifdef HAVE_FRONT_END
	if (job_ptr && job_ptr->front_end_ptr)
		agent_info->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
	if (job_ptr) {
		debug("Aborting %pJ on front end node %s", job_ptr, node_name);
	} else {
		debug("Aborting JobId=%u on front end node %s", job_id,
		      node_name);
	}
#else
	node_record_t *node_ptr;
	if ((node_ptr = find_node_record(node_name)))
		agent_info->protocol_version = node_ptr->protocol_version;
	if (job_ptr)
		debug("Aborting %pJ on node %s", job_ptr, node_name);
	else
		debug("Aborting JobId=%u on node %s", job_id, node_name);
#endif
	agent_info->msg_type	= REQUEST_ABORT_JOB;
	agent_info->msg_args	= kill_req;

	agent_queue_request(agent_info);
}
14862
/*
 * abort_job_on_nodes - Kill the specific job_on the specific nodes,
 *	the request is not processed immediately, but queued.
 *	This is to prevent a flood of pthreads if slurmctld restarts
 *	without saved state and slurmd daemons register with a
 *	multitude of running jobs. Slurmctld will not recognize
 *	these jobs and use this function to kill them - one
 *	agent request per node as they register.
 * IN job_ptr - pointer to terminating job
 * IN node_bitmap - bitmap of nodes on which the job resides
 */
extern void abort_job_on_nodes(job_record_t *job_ptr,
			       bitstr_t *node_bitmap)
{
	bitstr_t *full_node_bitmap, *tmp_node_bitmap;
	node_record_t *node_ptr;
	int i, i_first, i_last;
	agent_arg_t *agent_info;
	kill_job_msg_t *kill_req;
	uint16_t protocol_version;

#ifdef HAVE_FRONT_END
	fatal("%s: front-end mode not supported", __func__);
#endif
	xassert(node_bitmap);
	/* Send a separate message for nodes at different protocol_versions */
	full_node_bitmap = bit_copy(node_bitmap);
	while ((i_first = bit_ffs(full_node_bitmap)) >= 0) {
		/* Group all remaining nodes that share the protocol
		 * version of the first unprocessed node into
		 * tmp_node_bitmap, clearing them from full_node_bitmap */
		i_last = bit_fls(full_node_bitmap);
		node_ptr = node_record_table_ptr + i_first;
		protocol_version = node_ptr->protocol_version;
		tmp_node_bitmap = bit_alloc(bit_size(node_bitmap));
		for (i = i_first; i <= i_last; i++) {
			if (!bit_test(full_node_bitmap, i))
				continue;
			node_ptr = node_record_table_ptr + i;
			if (node_ptr->protocol_version != protocol_version)
				continue;
			bit_clear(full_node_bitmap, i);
			bit_set(tmp_node_bitmap, i);
		}
		/* Build and queue one REQUEST_ABORT_JOB for this group */
		kill_req = xmalloc(sizeof(kill_job_msg_t));
		kill_req->job_gres_info	=
			gres_plugin_epilog_build_env(job_ptr->gres_list,
						     job_ptr->nodes);
		kill_req->job_id	= job_ptr->job_id;
		kill_req->step_id	= NO_VAL;
		kill_req->time          = time(NULL);
		kill_req->nodes	        = bitmap2node_name(tmp_node_bitmap);
		kill_req->het_job_id	= job_ptr->het_job_id;
		kill_req->start_time	= job_ptr->start_time;
		kill_req->select_jobinfo =
			select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
		kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
						    job_ptr->spank_job_env);
		kill_req->spank_job_env_size = job_ptr->spank_job_env_size;
		agent_info = xmalloc(sizeof(agent_arg_t));
		agent_info->node_count	= bit_set_count(tmp_node_bitmap);
		agent_info->retry	= 1;
		agent_info->hostlist	= hostlist_create(kill_req->nodes);
		debug("Aborting %pJ on nodes %s", job_ptr, kill_req->nodes);
		agent_info->msg_type	= REQUEST_ABORT_JOB;
		agent_info->msg_args	= kill_req;
		agent_info->protocol_version = protocol_version;
		agent_queue_request(agent_info);
		bit_free(tmp_node_bitmap);
	}
	bit_free(full_node_bitmap);
}
14932
/*
 * kill_job_on_node - Kill the specific job on a specific node.
 * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
 * IN node_ptr - pointer to the node on which the job resides
 * NOTE(review): despite the comment above, job_ptr is dereferenced
 *	unconditionally below, so callers must pass a valid pointer —
 *	confirm intended contract (contrast with abort_job_on_node()).
 */
extern void kill_job_on_node(job_record_t *job_ptr,
			     node_record_t *node_ptr)
{
	agent_arg_t *agent_info;
	kill_job_msg_t *kill_req;

	/* Build the REQUEST_TERMINATE_JOB message payload */
	kill_req = xmalloc(sizeof(kill_job_msg_t));
	kill_req->job_gres_info	=
		gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes);
	kill_req->het_job_id	= job_ptr->het_job_id;
	kill_req->job_id	= job_ptr->job_id;
	kill_req->step_id	= NO_VAL;
	kill_req->time          = time(NULL);
	kill_req->start_time	= job_ptr->start_time;
	kill_req->nodes	        = xstrdup(node_ptr->name);
	kill_req->select_jobinfo =
		select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
	kill_req->job_state	= job_ptr->job_state;
	kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size,
					    job_ptr->spank_job_env);
	kill_req->spank_job_env_size = job_ptr->spank_job_env_size;

	/* Queue a single-node agent request, no retry */
	agent_info = xmalloc(sizeof(agent_arg_t));
	agent_info->node_count	= 1;
	agent_info->retry	= 0;
#ifdef HAVE_FRONT_END
	xassert(job_ptr->batch_host);
	if (job_ptr->front_end_ptr)
		agent_info->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
	agent_info->hostlist	= hostlist_create(job_ptr->batch_host);
	debug("Killing %pJ on front end node %s",
	      job_ptr, job_ptr->batch_host);
#else
	agent_info->protocol_version = node_ptr->protocol_version;
	agent_info->hostlist	= hostlist_create(node_ptr->name);
	debug("Killing %pJ on node %s", job_ptr, node_ptr->name);
#endif
	agent_info->msg_type	= REQUEST_TERMINATE_JOB;
	agent_info->msg_args	= kill_req;

	agent_queue_request(agent_info);
}
14981
14982 /*
14983 * Return true if this job is complete (including all elements of a hetjob)
14984 */
_job_all_finished(job_record_t * job_ptr)14985 static bool _job_all_finished(job_record_t *job_ptr)
14986 {
14987 job_record_t *het_job;
14988 ListIterator iter;
14989 bool finished = true;
14990
14991 if (!IS_JOB_FINISHED(job_ptr))
14992 return false;
14993
14994 if (!job_ptr->het_job_list)
14995 return true;
14996
14997 iter = list_iterator_create(job_ptr->het_job_list);
14998 while ((het_job = list_next(iter))) {
14999 if (!IS_JOB_FINISHED(het_job)) {
15000 finished = false;
15001 break;
15002 }
15003 }
15004 list_iterator_destroy(iter);
15005
15006 return finished;
15007 }
15008
/*
 * job_alloc_info_ptr - get details about an existing job allocation
 * IN uid - job issuing the code
 * IN job_ptr - pointer to job record
 * RET SLURM_SUCCESS, or ESLURM_* on denial / pending / finished job
 * NOTE: See job_alloc_info() if job pointer not known
 */
extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr)
{
	uint8_t prolog = 0;

	/* With PrivateData=jobs, access is limited to the job's owner,
	 * operators, account coordinators (non-MCS mode), or users
	 * passing the MCS label check (MCS private-data mode) */
	if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
	    (job_ptr->user_id != uid) && !validate_operator(uid) &&
	    (((slurm_mcs_get_privatedata() == 0) &&
	      !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
					    job_ptr->account)) ||
	     ((slurm_mcs_get_privatedata() == 1) &&
	      (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0))))
		return ESLURM_ACCESS_DENIED;
	if (IS_JOB_PENDING(job_ptr))
		return ESLURM_JOB_PENDING;
	if (_job_all_finished(job_ptr))
		return ESLURM_ALREADY_DONE;
	if (job_ptr->details)
		prolog = job_ptr->details->prolog_running;

	/* Resolve a deferred ("TBD") alias list once the prolog is done
	 * and no allocated node overlaps the powering-up set */
	if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
	    (prolog == 0) && job_ptr->node_bitmap &&
	    (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
		last_job_update = time(NULL);
		set_job_alias_list(job_ptr);
	}

	return SLURM_SUCCESS;
}
15043
15044 /*
15045 * job_alloc_info - get details about an existing job allocation
15046 * IN uid - job issuing the code
15047 * IN job_id - ID of job for which info is requested
15048 * OUT job_pptr - set to pointer to job record
15049 * NOTE: See job_alloc_info_ptr() if job pointer is known
15050 */
job_alloc_info(uint32_t uid,uint32_t job_id,job_record_t ** job_pptr)15051 extern int job_alloc_info(uint32_t uid, uint32_t job_id,
15052 job_record_t **job_pptr)
15053 {
15054 job_record_t *job_ptr;
15055
15056 job_ptr = find_job_record(job_id);
15057 if (job_ptr == NULL)
15058 return ESLURM_INVALID_JOB_ID;
15059 if (job_pptr)
15060 *job_pptr = job_ptr;
15061 return job_alloc_info_ptr(uid, job_ptr);
15062 }
15063
/*
 * Synchronize the batch job in the system with their files.
 * All pending batch jobs must have script and environment files
 * No other jobs should have such files
 * RET SLURM_SUCCESS always
 */
int sync_job_files(void)
{
	List batch_dirs;

	xassert(verify_lock(CONF_LOCK, READ_LOCK));
	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));

	if (!slurmctld_primary)	/* Don't purge files from backup slurmctld */
		return SLURM_SUCCESS;

	/* Collect on-disk job directories, reconcile them against job
	 * records, then delete whatever no live job claims */
	batch_dirs = list_create(xfree_ptr);
	_get_batch_job_dir_ids(batch_dirs);
	_validate_job_files(batch_dirs);
	_remove_defunct_batch_dirs(batch_dirs);
	FREE_NULL_LIST(batch_dirs);
	return SLURM_SUCCESS;
}
15086
/* Append to the batch_dirs list the job_id's associated with
 * every batch job directory in existence
 * Scans StateSaveLocation/hash.<n>/job.<id> directories.
 */
static void _get_batch_job_dir_ids(List batch_dirs)
{
	DIR *f_dir, *h_dir;
	struct dirent *dir_ent, *hash_ent;
	long long_job_id;
	uint32_t *job_id_ptr;
	char *endptr;

	xassert(verify_lock(CONF_LOCK, READ_LOCK));

	xassert(slurmctld_conf.state_save_location);
	f_dir = opendir(slurmctld_conf.state_save_location);
	if (!f_dir) {
		error("opendir(%s): %m",
		      slurmctld_conf.state_save_location);
		return;
	}

	while ((dir_ent = readdir(f_dir))) {
		/* Only the first 5 chars ("hash.") are compared; the '#'
		 * in the pattern stands for the varying hash digit */
		if (!xstrncmp("hash.#", dir_ent->d_name, 5)) {
			char *h_path = NULL;
			xstrfmtcat(h_path, "%s/%s",
				   slurmctld_conf.state_save_location,
				   dir_ent->d_name);
			h_dir = opendir(h_path);
			xfree(h_path);
			if (!h_dir)
				continue;
			while ((hash_ent = readdir(h_dir))) {
				/* Likewise, only "job." (4 chars) compared */
				if (xstrncmp("job.#", hash_ent->d_name, 4))
					continue;
				/* Parse the job ID after "job."; the full
				 * remainder must be numeric and non-zero */
				long_job_id = strtol(&hash_ent->d_name[4],
						     &endptr, 10);
				if ((long_job_id == 0) || (endptr[0] != '\0'))
					continue;
				debug3("Found batch directory for JobId=%ld",
				       long_job_id);
				job_id_ptr = xmalloc(sizeof(uint32_t));
				*job_id_ptr = long_job_id;
				list_append(batch_dirs, job_id_ptr);
			}
			closedir(h_dir);
		}
	}

	closedir(f_dir);
}
15137
_clear_state_dir_flag(void * x,void * arg)15138 static int _clear_state_dir_flag(void *x, void *arg)
15139 {
15140 job_record_t *job_ptr = (job_record_t *) x;
15141 job_ptr->bit_flags &= ~HAS_STATE_DIR;
15142 return 0;
15143 }
15144
/* list_for_each() callback: fail any pending batch job whose state
 * files were not found on disk (HAS_STATE_DIR left clear by the
 * reconciliation pass) */
static int _test_state_dir_flag(void *x, void *arg)
{
	job_record_t *job_ptr = (job_record_t *) x;

	if (job_ptr->bit_flags & HAS_STATE_DIR) {
		/* Files found; clear the scratch flag and move on */
		job_ptr->bit_flags &= ~HAS_STATE_DIR;
		return 0;
	}

	/* Only pending batch jobs with het_job_offset == 0 are expected
	 * to have state files */
	if (!job_ptr->batch_flag || !IS_JOB_PENDING(job_ptr) ||
	    (job_ptr->het_job_offset > 0))
		return 0;	/* No files expected */

	/* Script lost: mark the job FAILED and log its completion */
	error("Script for %pJ lost, state set to FAILED", job_ptr);
	job_ptr->job_state = JOB_FAILED;
	job_ptr->exit_code = 1;
	job_ptr->state_reason = FAIL_SYSTEM;
	xfree(job_ptr->state_desc);
	job_ptr->start_time = job_ptr->end_time = time(NULL);
	job_completion_logger(job_ptr, false);
	return 0;
}
15167
/* All pending batch jobs must have a batch_dir entry,
 * otherwise we flag it as FAILED and don't schedule
 * If the batch_dir entry exists for a PENDING or RUNNING batch job,
 * remove it the list (of directories to be deleted) */
static void _validate_job_files(List batch_dirs)
{
	job_record_t *job_ptr;
	ListIterator batch_dir_iter;
	uint32_t *job_id_ptr, array_job_id;

	/* Reset the HAS_STATE_DIR scratch flag on all job records */
	list_for_each(job_list, _clear_state_dir_flag, NULL);

	batch_dir_iter = list_iterator_create(batch_dirs);
	while ((job_id_ptr = list_next(batch_dir_iter))) {
		job_ptr = find_job_record(*job_id_ptr);
		if (job_ptr) {
			/* Directory belongs to a live job: mark it and
			 * remove it from the to-be-deleted list */
			job_ptr->bit_flags |= HAS_STATE_DIR;
			list_delete_item(batch_dir_iter);
		}
		if (job_ptr && job_ptr->array_recs) { /* Update all tasks */
			/* Mark every task of the job array via the
			 * array-job hash chain */
			array_job_id = job_ptr->array_job_id;
			job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)];
			while (job_ptr) {
				if (job_ptr->array_job_id == array_job_id)
					job_ptr->bit_flags |= HAS_STATE_DIR;
				job_ptr = job_ptr->job_array_next_j;
			}
		}
	}
	list_iterator_destroy(batch_dir_iter);

	/* Fail pending batch jobs whose files were not found */
	list_for_each(job_list, _test_state_dir_flag, NULL);
}
15201
15202 /* Remove all batch_dir entries in the list */
_remove_defunct_batch_dirs(List batch_dirs)15203 static void _remove_defunct_batch_dirs(List batch_dirs)
15204 {
15205 ListIterator batch_dir_inx;
15206 uint32_t *job_id_ptr;
15207
15208 xassert(verify_lock(CONF_LOCK, READ_LOCK));
15209
15210 batch_dir_inx = list_iterator_create(batch_dirs);
15211 while ((job_id_ptr = list_next(batch_dir_inx))) {
15212 info("Purged files for defunct batch JobId=%u",
15213 *job_id_ptr);
15214 delete_job_desc_files(*job_id_ptr);
15215 }
15216 list_iterator_destroy(batch_dir_inx);
15217 }
15218
/*
 * _xmit_new_end_time
 *	Tell all slurmd's associated with a job of its new end time
 * IN job_ptr - pointer to terminating job
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
static void _xmit_new_end_time(job_record_t *job_ptr)
{
#ifndef HAVE_FRONT_END
	int i;
#endif
	job_time_msg_t *job_time_msg_ptr;
	agent_arg_t *agent_args;

	/* Build the REQUEST_UPDATE_JOB_TIME message */
	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_UPDATE_JOB_TIME;
	agent_args->retry = 1;
	agent_args->hostlist = hostlist_create(NULL);
	job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t));
	job_time_msg_ptr->job_id = job_ptr->job_id;
	job_time_msg_ptr->expiration_time = job_ptr->end_time;

#ifdef HAVE_FRONT_END
	/* Front-end system: single message to the front-end node */
	xassert(job_ptr->batch_host);
	if (job_ptr->front_end_ptr)
		agent_args->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
	agent_args->node_count = 1;
#else
	/* Send to every allocated node, using the lowest protocol
	 * version among them so all recipients can unpack the message */
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		if (agent_args->protocol_version >
		    node_record_table_ptr[i].protocol_version)
			agent_args->protocol_version =
				node_record_table_ptr[i].protocol_version;
		hostlist_push_host(agent_args->hostlist,
				   node_record_table_ptr[i].name);
		agent_args->node_count++;
	}
#endif

	agent_args->msg_args = job_time_msg_ptr;
	agent_queue_request(agent_args);
	return;
}
15268
15269 /*
15270 * Return total amount of memory allocated to a job. This can be based upon
15271 * a GRES specification with various GRES/memory allocations on each node.
15272 * If current allocation information is not available, estimate memory based
15273 * upon pn_min_memory and either CPU or node count.
15274 */
job_get_tres_mem(struct job_resources * job_res,uint64_t pn_min_memory,uint32_t cpu_cnt,uint32_t node_cnt)15275 extern uint64_t job_get_tres_mem(struct job_resources *job_res,
15276 uint64_t pn_min_memory, uint32_t cpu_cnt,
15277 uint32_t node_cnt)
15278 {
15279 uint64_t mem_total = 0;
15280 int i;
15281
15282 if (job_res) {
15283 for (i = 0; i < job_res->nhosts; i++) {
15284 mem_total += job_res->memory_allocated[i];
15285 }
15286 return mem_total;
15287 }
15288
15289 if (pn_min_memory == NO_VAL64)
15290 return mem_total;
15291
15292 if (pn_min_memory & MEM_PER_CPU) {
15293 if (cpu_cnt != NO_VAL) {
15294 mem_total = pn_min_memory & (~MEM_PER_CPU);
15295 mem_total *= cpu_cnt;
15296 }
15297 } else if (node_cnt != NO_VAL)
15298 mem_total = pn_min_memory * node_cnt;
15299
15300 return mem_total;
15301 }
15302
/*
 * job_epilog_complete - Note the completion of the epilog script for a
 *	given job
 * IN job_id      - id of the job for which the epilog was executed
 * IN node_name   - name of the node on which the epilog was executed
 * IN return_code - return code from epilog script
 * RET true if job is COMPLETED, otherwise false
 */
extern bool job_epilog_complete(uint32_t job_id, char *node_name,
				uint32_t return_code)
{
#ifdef HAVE_FRONT_END
	int i;
#endif
	job_record_t *job_ptr = find_job_record(job_id);
	node_record_t *node_ptr;

	if (job_ptr == NULL) {
		/* Job record already purged; nothing left to clean up */
		debug("%s: unable to find JobId=%u for node=%s with return_code=%u.",
		      __func__, job_id, node_name, return_code);
		return true;
	}

	trace_job(job_ptr, __func__, "enter");

	/*
	 * There is a potential race condition this handles.
	 * If slurmctld cold-starts while slurmd keeps running, slurmd could
	 * notify slurmctld of a job epilog completion before getting synced
	 * up with slurmctld state. If a new job arrives and the job_id is
	 * reused, we could try to note the termination of a job that hasn't
	 * really started. Very rare obviously.
	 */
	if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) ||
	    (job_ptr->node_bitmap == NULL)) {
#ifndef HAVE_FRONT_END
		/* Classify the unexpected response for logging purposes */
		uint32_t base_state = NODE_STATE_UNKNOWN;
		node_ptr = find_node_record(node_name);
		if (node_ptr)
			base_state = node_ptr->node_state & NODE_STATE_BASE;
		if (base_state == NODE_STATE_DOWN) {
			debug("%s: %pJ complete response from DOWN node %s",
			      __func__, job_ptr, node_name);
		} else if (job_ptr->restart_cnt) {
			/*
			 * Duplicate epilog complete can be due to race
			 */
			debug("%s: %pJ duplicate epilog complete response",
			      __func__, job_ptr);
		} else {
			error("%s: %pJ is non-running slurmctld and slurmd out of sync",
			      __func__, job_ptr);
		}
#endif
		return false;
	}

#ifdef HAVE_FRONT_END
	xassert(job_ptr->batch_host);
	/*
	 * If there is a bad epilog error don't down the frontend node.
	 * If needed the nodes in use by the job will be downed below.
	 */
	if (return_code)
		error("%s: %pJ epilog error on %s",
		      __func__, job_ptr, job_ptr->batch_host);

	/* Maintain the front end's count of completing jobs */
	if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) {
		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
		if (front_end_ptr->job_cnt_comp)
			front_end_ptr->job_cnt_comp--;
		else {
			error("%s: %pJ job_cnt_comp underflow on front end %s",
			      __func__, job_ptr, front_end_ptr->name);
		}
		if (front_end_ptr->job_cnt_comp == 0)
			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
	}

	if ((job_ptr->total_nodes == 0) && IS_JOB_COMPLETING(job_ptr)) {
		/*
		 * Job resources moved into another job and
		 * tasks already killed
		 */
		front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
		if (front_end_ptr)
			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
	} else {
		for (i = 0; i < node_record_count; i++) {
			if (!bit_test(job_ptr->node_bitmap, i))
				continue;
			node_ptr = &node_record_table_ptr[i];
			if (return_code) {
				drain_nodes(node_ptr->name, "Epilog error",
					    slurmctld_conf.slurm_user_id);
			}
			/* Change job from completing to completed */
			make_node_idle(node_ptr, job_ptr);
		}
	}
#else
	if (return_code) {
		error("%s: %pJ epilog error on %s, draining the node",
		      __func__, job_ptr, node_name);
		drain_nodes(node_name, "Epilog error",
			    slurmctld_conf.slurm_user_id);
	}
	/* Change job from completing to completed */
	node_ptr = find_node_record(node_name);
	if (node_ptr)
		make_node_idle(node_ptr, job_ptr);
#endif

	step_epilog_complete(job_ptr, node_name);
	/* nodes_completing is out of date, rebuild when next saved */
	xfree(job_ptr->nodes_completing);
	if (!IS_JOB_COMPLETING(job_ptr)) {	/* COMPLETED */
		batch_requeue_fini(job_ptr);
		return true;
	} else
		return false;
}
15425
/* Complete a batch job requeue logic after all steps complete so that
 * subsequent jobs appear in a separate accounting record.
 * job_ptr IN - batch job being requeued; must already be in PENDING state
 *              (and no longer COMPLETING) or this is a no-op. */
void batch_requeue_fini(job_record_t *job_ptr)
{
	/* Only act once the job has fully transitioned back to a pending
	 * batch job; completing or non-batch jobs are left untouched. */
	if (IS_JOB_COMPLETING(job_ptr) ||
	    !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag)
		return;

	info("Requeuing %pJ", job_ptr);

	/* Clear everything so this appears to be a new job and then restart
	 * it in accounting. */
	job_ptr->start_time = 0;
	job_ptr->end_time_exp = job_ptr->end_time = 0;
	job_ptr->total_cpus = 0;
	job_ptr->pre_sus_time = 0;
	job_ptr->preempt_time = 0;
	job_ptr->suspend_time = 0;
	job_ptr->tot_sus_time = 0;
	/* Current code (<= 2.1) has it so we start the new job with the next
	 * step id. This could be used when restarting to figure out which
	 * step the previous run of this job stopped on. */
	//job_ptr->next_step_id = 0;

	/* Release the old node allocation records entirely. */
	job_ptr->node_cnt = 0;
	xfree(job_ptr->nodes);
	xfree(job_ptr->nodes_completing);
	FREE_NULL_BITMAP(job_ptr->node_bitmap);
	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);

	job_resv_clear_promiscous_flag(job_ptr);

	if (job_ptr->details) {
		time_t now = time(NULL);
		/* The time stamp on the new batch launch credential must be
		 * larger than the time stamp on the revoke request. Also the
		 * I/O must be all cleared out, the named socket purged and
		 * the job credential purged by slurmd. */
		if (job_ptr->details->begin_time <= now) {
			/* See src/common/slurm_cred.c
			 * #define DEFAULT_EXPIRATION_WINDOW 1200 */
			int cred_lifetime = 1200;
			(void) slurm_cred_ctx_get(slurmctld_config.cred_ctx,
						  SLURM_CRED_OPT_EXPIRY_WINDOW,
						  &cred_lifetime);
			/* Defer eligibility until the old credential expires */
			job_ptr->details->begin_time = now + cred_lifetime + 1;
		}

		/* Since this could happen on a launch we need to make sure the
		 * submit isn't the same as the last submit so put now + 1 so
		 * we get different records in the database */
		if (now == job_ptr->details->submit_time)
			now++;
		job_ptr->details->submit_time = now;

		/* clear the accrue flag */
		job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
		job_ptr->details->accrue_time = 0;

		if ((job_ptr->details->whole_node == 1) && job_ptr->gres_list) {
			/*
			 * We need to reset the gres_list to what was requested
			 * instead of what was given exclusively.
			 */
			FREE_NULL_LIST(job_ptr->gres_list);
			(void)gres_plugin_job_state_validate(
				job_ptr->cpus_per_tres,
				job_ptr->tres_freq,
				job_ptr->tres_per_job,
				job_ptr->tres_per_node,
				job_ptr->tres_per_socket,
				job_ptr->tres_per_task,
				job_ptr->mem_per_tres,
				&job_ptr->details->num_tasks,
				&job_ptr->details->min_nodes,
				&job_ptr->details->max_nodes,
				&job_ptr->details->ntasks_per_node,
				&job_ptr->details->mc_ptr->ntasks_per_socket,
				&job_ptr->details->mc_ptr->sockets_per_node,
				&job_ptr->details->cpus_per_task,
				&job_ptr->gres_list);
		}
	}

	/*
	 * If a reservation ended and was a repeated (e.g., daily, weekly)
	 * reservation, its ID will be different; make sure
	 * job->resv_id matches the reservation id.
	 */
	if (job_ptr->resv_ptr)
		job_ptr->resv_id = job_ptr->resv_ptr->resv_id;

	/* Reset this after the batch step has finished or the batch step
	 * information will be attributed to the next run of the job. */
	job_ptr->db_index = 0;
	if (!with_slurmdbd)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	/* Submit new sibling jobs for fed jobs */
	if (fed_mgr_is_origin_job(job_ptr)) {
		if (fed_mgr_job_requeue(job_ptr)) {
			error("failed to submit requeued sibling jobs for fed %pJ",
			      job_ptr);
		}
	}
}
15532
15533
15534 /* job_fini - free all memory associated with job records */
job_fini(void)15535 void job_fini (void)
15536 {
15537 FREE_NULL_LIST(job_list);
15538 xfree(job_hash);
15539 xfree(job_array_hash_j);
15540 xfree(job_array_hash_t);
15541 FREE_NULL_LIST(purge_files_list);
15542 FREE_NULL_BITMAP(requeue_exit);
15543 FREE_NULL_BITMAP(requeue_exit_hold);
15544 }
15545
15546 /* Record the start of one job array task */
job_array_start(job_record_t * job_ptr)15547 extern void job_array_start(job_record_t *job_ptr)
15548 {
15549 job_record_t *base_job_ptr;
15550
15551 if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15552 base_job_ptr = find_job_record(job_ptr->array_job_id);
15553 if (base_job_ptr && base_job_ptr->array_recs) {
15554 base_job_ptr->array_recs->tot_run_tasks++;
15555 }
15556 }
15557 }
15558
15559 /* Return true if a job array task can be started */
job_array_start_test(job_record_t * job_ptr)15560 extern bool job_array_start_test(job_record_t *job_ptr)
15561 {
15562 job_record_t *base_job_ptr;
15563 time_t now = time(NULL);
15564
15565 if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15566 base_job_ptr = find_job_record(job_ptr->array_job_id);
15567 if (base_job_ptr && base_job_ptr->array_recs &&
15568 (base_job_ptr->array_recs->max_run_tasks != 0) &&
15569 (base_job_ptr->array_recs->tot_run_tasks >=
15570 base_job_ptr->array_recs->max_run_tasks)) {
15571 if (job_ptr->details &&
15572 (job_ptr->details->begin_time <= now))
15573 job_ptr->details->begin_time = (time_t) 0;
15574 xfree(job_ptr->state_desc);
15575 job_ptr->state_reason = WAIT_ARRAY_TASK_LIMIT;
15576 return false;
15577 }
15578 }
15579
15580 return true;
15581 }
15582
_job_array_comp(job_record_t * job_ptr,bool was_running,bool requeue)15583 static void _job_array_comp(job_record_t *job_ptr, bool was_running,
15584 bool requeue)
15585 {
15586 job_record_t *base_job_ptr;
15587 uint32_t status;
15588
15589 if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
15590 status = job_ptr->exit_code;
15591 if ((status == 0) && !IS_JOB_COMPLETE(job_ptr)) {
15592 /* Avoid max_exit_code == 0 if task did not run to
15593 * successful completion (e.g. Cancelled, NodeFail) */
15594 status = 9;
15595 }
15596 base_job_ptr = find_job_record(job_ptr->array_job_id);
15597 if (base_job_ptr && base_job_ptr->array_recs) {
15598 if (requeue) {
15599 base_job_ptr->array_recs->array_flags |=
15600 ARRAY_TASK_REQUEUED;
15601 } else if (!base_job_ptr->array_recs->tot_comp_tasks) {
15602 base_job_ptr->array_recs->min_exit_code =
15603 status;
15604 base_job_ptr->array_recs->max_exit_code =
15605 status;
15606 } else {
15607 base_job_ptr->array_recs->min_exit_code =
15608 MIN(status, base_job_ptr->
15609 array_recs->min_exit_code);
15610 base_job_ptr->array_recs->max_exit_code =
15611 MAX(status, base_job_ptr->
15612 array_recs->max_exit_code);
15613 }
15614 if (was_running &&
15615 base_job_ptr->array_recs->tot_run_tasks)
15616 base_job_ptr->array_recs->tot_run_tasks--;
15617 base_job_ptr->array_recs->tot_comp_tasks++;
15618 }
15619 }
15620 }
15621
/* log the completion of the specified job
 * job_ptr IN - job that completed (or is being requeued/resized)
 * requeue IN - true if the job is being requeued rather than finishing;
 *              affects array bookkeeping and which e-mail is sent */
extern void job_completion_logger(job_record_t *job_ptr, bool requeue)
{
	int base_state;
	bool arr_finished = false, task_failed = false, task_requeued = false;
	bool was_running = false;
	job_record_t *master_job = NULL;
	uint32_t max_exit_code = 0;

	xassert(job_ptr);

	acct_policy_remove_job_submit(job_ptr);
	/* Burst buffer handling: stage-out if the job ran normally,
	 * skip on resize, cancel if nodes were never allocated. */
	if (job_ptr->nodes && ((job_ptr->bit_flags & JOB_KILL_HURRY) == 0)
	    && !IS_JOB_RESIZING(job_ptr)) {
		(void) bb_g_job_start_stage_out(job_ptr);
	} else if (job_ptr->nodes && IS_JOB_RESIZING(job_ptr)){
		debug("%s: %pJ resizing, skipping bb stage_out",
		      __func__, job_ptr);
	} else {
		/*
		 * Never allocated compute nodes.
		 * Unless job ran, there is no data to stage-out
		 */
		(void) bb_g_job_cancel(job_ptr);
	}
	/* Consume the JOB_WAS_RUNNING flag; _job_array_comp() uses it to
	 * decrement the array's running-task counter exactly once. */
	if (job_ptr->bit_flags & JOB_WAS_RUNNING) {
		job_ptr->bit_flags &= ~JOB_WAS_RUNNING;
		was_running = true;
	}

	_job_array_comp(job_ptr, was_running, requeue);

	/* Notify srun and send mail unless this is a resize/revoke, the job
	 * is still pending, or it is an array task whose array has not yet
	 * finished (and per-task mail was not requested). */
	if (!IS_JOB_RESIZING(job_ptr) &&
	    !IS_JOB_PENDING(job_ptr) &&
	    !IS_JOB_REVOKED(job_ptr) &&
	    ((job_ptr->array_task_id == NO_VAL) ||
	     (job_ptr->mail_type & MAIL_ARRAY_TASKS) ||
	     (arr_finished = test_job_array_finished(job_ptr->array_job_id)))) {
		/* Remove configuring state just to make sure it isn't there
		 * since it will throw off displays of the job. */
		job_ptr->job_state &= ~JOB_CONFIGURING;

		/* make sure all parts of the job are notified
		 * Fed Jobs: only signal the srun from where the job is running
		 * or from the origin if the job wasn't running. */
		if (!job_ptr->fed_details ||
		    fed_mgr_job_is_self_owned(job_ptr) ||
		    (fed_mgr_is_origin_job(job_ptr) &&
		     !fed_mgr_job_is_locked(job_ptr)))
			srun_job_complete(job_ptr);

		/* mail out notifications of completion */
		if (arr_finished) {
			/* We need to summarize different tasks states. */
			master_job = find_job_record(job_ptr->array_job_id);
			if (master_job && master_job->array_recs) {
				task_requeued =
					(master_job->array_recs->array_flags &
					 ARRAY_TASK_REQUEUED);
				if (task_requeued &&
				    (job_ptr->mail_type & MAIL_JOB_REQUEUE)) {
					/*
					 * At least 1 task requeued and job
					 * req. to be notified on requeues.
					 */
					mail_job_info(master_job,
						      MAIL_JOB_REQUEUE);
				}

				max_exit_code =
					master_job->array_recs->max_exit_code;
				task_failed = (WIFEXITED(max_exit_code) &&
					       WEXITSTATUS(max_exit_code));
				if (task_failed &&
				    (job_ptr->mail_type & MAIL_JOB_FAIL)) {
					/*
					 * At least 1 task failed and job
					 * req. to be notified on failures.
					 */
					mail_job_info(master_job,
						      MAIL_JOB_FAIL);
				} else if (job_ptr->mail_type & MAIL_JOB_END) {
					/*
					 * Job req. to be notified on END.
					 */
					mail_job_info(job_ptr, MAIL_JOB_END);
				}
			}
		} else {
			base_state = job_ptr->job_state & JOB_STATE_BASE;
			if ((base_state == JOB_COMPLETE) ||
			    (base_state == JOB_CANCELLED)) {
				if (requeue &&
				    (job_ptr->mail_type & MAIL_JOB_REQUEUE)) {
					mail_job_info(job_ptr,
						      MAIL_JOB_REQUEUE);
				} else if (job_ptr->mail_type & MAIL_JOB_END) {
					mail_job_info(job_ptr, MAIL_JOB_END);
				}
			} else {	/* JOB_FAILED, JOB_TIMEOUT, etc. */
				if (job_ptr->mail_type & MAIL_JOB_FAIL)
					mail_job_info(job_ptr, MAIL_JOB_FAIL);
				else if (job_ptr->mail_type & MAIL_JOB_END)
					mail_job_info(job_ptr, MAIL_JOB_END);
			}
		}
	}

	g_slurm_jobcomp_write(job_ptr);

	/* When starting the resized job everything is taken care of
	 * elsewhere, so don't call it here. */
	if (IS_JOB_RESIZING(job_ptr))
		return;

	/* Ensure an accounting start record exists before completion */
	if (!with_slurmdbd && !job_ptr->db_index)
		jobacct_storage_g_job_start(acct_db_conn, job_ptr);

	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
	    job_ptr->tres_alloc_cnt &&
	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
		set_job_tres_alloc_str(job_ptr, false);

	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
}
15747
15748 /*
15749 * job_independent - determine if this job has a dependent job pending
15750 * or if the job's scheduled begin time is in the future
15751 * IN job_ptr - pointer to job being tested
15752 * RET - true if job no longer must be deferred for another job
15753 */
job_independent(job_record_t * job_ptr)15754 extern bool job_independent(job_record_t *job_ptr)
15755 {
15756 struct job_details *detail_ptr = job_ptr->details;
15757 time_t now = time(NULL);
15758 int depend_rc;
15759
15760 if ((job_ptr->state_reason == FAIL_BURST_BUFFER_OP) ||
15761 (job_ptr->state_reason == FAIL_ACCOUNT) ||
15762 (job_ptr->state_reason == FAIL_QOS) ||
15763 (job_ptr->state_reason == WAIT_HELD) ||
15764 (job_ptr->state_reason == WAIT_HELD_USER) ||
15765 (job_ptr->state_reason == WAIT_MAX_REQUEUE) ||
15766 (job_ptr->state_reason == WAIT_RESV_DELETED) ||
15767 (job_ptr->state_reason == WAIT_DEP_INVALID))
15768 return false;
15769
15770 /* Test dependencies first so we can cancel jobs before dependent
15771 * job records get purged (e.g. afterok, afternotok) */
15772 depend_rc = test_job_dependency(job_ptr, NULL);
15773 if ((depend_rc == LOCAL_DEPEND) || (depend_rc == REMOTE_DEPEND)) {
15774 /* start_time has passed but still has dependency which
15775 * makes it ineligible */
15776 if (detail_ptr->begin_time < now)
15777 detail_ptr->begin_time = 0;
15778 job_ptr->state_reason = WAIT_DEPENDENCY;
15779 xfree(job_ptr->state_desc);
15780 return false;
15781 } else if (depend_rc == FAIL_DEPEND) {
15782 handle_invalid_dependency(job_ptr);
15783 return false;
15784 }
15785 /* Job is eligible to start now */
15786 if (job_ptr->state_reason == WAIT_DEPENDENCY) {
15787 job_ptr->state_reason = WAIT_NO_REASON;
15788 xfree(job_ptr->state_desc);
15789 /* Submit the job to its siblings. */
15790 if (job_ptr->details) {
15791 fed_mgr_job_requeue(job_ptr);
15792 }
15793 }
15794
15795 /* Check for maximum number of running tasks in a job array */
15796 if (!job_array_start_test(job_ptr))
15797 return false;
15798
15799 if (detail_ptr && (detail_ptr->begin_time > now)) {
15800 job_ptr->state_reason = WAIT_TIME;
15801 xfree(job_ptr->state_desc);
15802 return false; /* not yet time */
15803 }
15804
15805 if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) {
15806 job_ptr->state_reason = WAIT_RESERVATION;
15807 xfree(job_ptr->state_desc);
15808 return false; /* not yet time */
15809 }
15810
15811 if ((detail_ptr && (detail_ptr->begin_time == 0) &&
15812 (job_ptr->priority != 0))) {
15813 detail_ptr->begin_time = now;
15814 /*
15815 * Send begin time to the database if it is already there, or it
15816 * won't get there until the job starts.
15817 */
15818 jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
15819 } else if (job_ptr->state_reason == WAIT_TIME) {
15820 job_ptr->state_reason = WAIT_NO_REASON;
15821 xfree(job_ptr->state_desc);
15822 }
15823 return true;
15824 }
15825
15826 /*
15827 * determine if job is ready to execute per the node select plugin
15828 * IN job_id - job to test
15829 * OUT ready - 1 if job is ready to execute 0 otherwise
15830 * RET Slurm error code
15831 */
job_node_ready(uint32_t job_id,int * ready)15832 extern int job_node_ready(uint32_t job_id, int *ready)
15833 {
15834 int rc;
15835 job_record_t *job_ptr;
15836 xassert(ready);
15837
15838 *ready = 0;
15839 job_ptr = find_job_record(job_id);
15840 if (job_ptr == NULL)
15841 return ESLURM_INVALID_JOB_ID;
15842
15843 /* Always call select_g_job_ready() so that select/bluegene can
15844 * test and update block state information. */
15845 rc = select_g_job_ready(job_ptr);
15846 if (rc == READY_JOB_FATAL)
15847 return ESLURM_INVALID_PARTITION_NAME;
15848 if (rc == READY_JOB_ERROR)
15849 return EAGAIN;
15850 if (rc)
15851 rc = READY_NODE_STATE;
15852
15853 if (job_ptr->details && job_ptr->details->prolog_running)
15854 rc &= (~READY_NODE_STATE);
15855
15856 if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
15857 rc |= READY_JOB_STATE;
15858 if ((rc == (READY_NODE_STATE | READY_JOB_STATE)) &&
15859 job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
15860 job_ptr->node_bitmap &&
15861 (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
15862 last_job_update = time(NULL);
15863 set_job_alias_list(job_ptr);
15864 }
15865
15866 *ready = rc;
15867 return SLURM_SUCCESS;
15868 }
15869
/* Send specified signal to all steps associated with a job
 * job_ptr IN - job whose steps are signaled
 * signal IN - signal number to deliver
 * flags IN - KILL_* flags controlling which steps the slurmstepd signals */
static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags)
{
#ifndef HAVE_FRONT_END
	int i;
#endif
	agent_arg_t *agent_args = NULL;
	signal_tasks_msg_t *signal_job_msg = NULL;
	/* Cached once per daemon lifetime; depends only on LaunchType */
	static int notify_srun_static = -1;
	int notify_srun = 0;

	if (notify_srun_static == -1) {
		/* do this for all but slurm (poe, aprun, etc...) */
		if (xstrcmp(slurmctld_conf.launch_type, "launch/slurm"))
			notify_srun_static = 1;
		else
			notify_srun_static = 0;
	}

#ifdef HAVE_FRONT_END
	/* On a front end system always notify_srun instead of slurmd */
	if (notify_srun_static)
		notify_srun = 1;
#else
	/* For launch/poe all signals are forwarded by srun to poe to tasks
	 * except SIGSTOP/SIGCONT, which are used for job preemption. In that
	 * case the slurmd must directly suspend tasks and switch resources. */
	if (notify_srun_static && (signal != SIGSTOP) && (signal != SIGCONT))
		notify_srun = 1;
#endif

	if (notify_srun) {
		/* Deliver the signal via each step's srun rather than via
		 * an RPC to the slurmd daemons. */
		ListIterator step_iterator;
		step_record_t *step_ptr;
		step_iterator = list_iterator_create(job_ptr->step_list);
		while ((step_ptr = list_next(step_iterator))) {
			/* Since we have already checked the uid,
			 * we can send this signal as uid 0. */
			job_step_signal(job_ptr->job_id, step_ptr->step_id,
					signal, 0, 0);
		}
		list_iterator_destroy (step_iterator);

		return;
	}

	/* Otherwise build a REQUEST_SIGNAL_TASKS RPC for the slurmd(s) */
	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_SIGNAL_TASKS;
	agent_args->retry = 1;
	agent_args->hostlist = hostlist_create(NULL);
	signal_job_msg = xmalloc(sizeof(signal_tasks_msg_t));
	signal_job_msg->job_id = job_ptr->job_id;

	/*
	 * We don't ever want to kill a step with this message. The flags below
	 * will make sure that does happen. Just in case though, set the
	 * step_id to an impossible number.
	 */
	signal_job_msg->job_step_id = slurmctld_conf.max_step_cnt + 1;

	/*
	 * Encode the flags for slurm stepd to know what steps get signaled
	 * Here if we aren't signaling the full job we always only want to
	 * signal all other steps.
	 */
	if ((flags & KILL_FULL_JOB) ||
	    (flags & KILL_JOB_BATCH) ||
	    (flags & KILL_STEPS_ONLY))
		signal_job_msg->flags = flags;
	else
		signal_job_msg->flags = KILL_STEPS_ONLY;

	signal_job_msg->signal = signal;

#ifdef HAVE_FRONT_END
	xassert(job_ptr->batch_host);
	if (job_ptr->front_end_ptr)
		agent_args->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
	agent_args->node_count = 1;
#else
	/* Target every allocated node; use the lowest protocol version
	 * among them so the message can be understood by all. */
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		if (agent_args->protocol_version >
		    node_record_table_ptr[i].protocol_version)
			agent_args->protocol_version =
				node_record_table_ptr[i].protocol_version;
		hostlist_push_host(agent_args->hostlist,
				   node_record_table_ptr[i].name);
		agent_args->node_count++;
	}
#endif

	if (agent_args->node_count == 0) {
		/* No nodes to contact; release the unsent message */
		xfree(signal_job_msg);
		xfree(agent_args);
		return;
	}

	agent_args->msg_args = signal_job_msg;
	agent_queue_request(agent_args);
	return;
}
15976
_switch_suspend_info(job_record_t * job_ptr)15977 static void *_switch_suspend_info(job_record_t *job_ptr)
15978 {
15979 ListIterator step_iterator;
15980 step_record_t *step_ptr;
15981 void *switch_suspend_info = NULL;
15982
15983 step_iterator = list_iterator_create (job_ptr->step_list);
15984 while ((step_ptr = list_next(step_iterator))) {
15985 if (step_ptr->state != JOB_RUNNING)
15986 continue;
15987 switch_g_job_suspend_info_get(step_ptr->switch_job,
15988 &switch_suspend_info);
15989 }
15990 list_iterator_destroy (step_iterator);
15991
15992 return switch_suspend_info;
15993 }
15994
/* Send suspend request to slurmd of all nodes associated with a job
 * job_ptr IN - job to be suspended or resumed
 * op IN - SUSPEND_JOB or RESUME_JOB
 * indf_susp IN - set if job is being suspended indefinitely by user
 *                or admin, otherwise suspended for gang scheduling
 */
static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp)
{
#ifndef HAVE_FRONT_END
	int i;
#endif
	agent_arg_t *agent_args;
	suspend_int_msg_t *sus_ptr;

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_SUSPEND_INT;
	agent_args->retry = 0;	/* don't resend, gang scheduler can
				 * quickly induce huge backlog
				 * of agent.c RPCs */
	agent_args->hostlist = hostlist_create(NULL);
	sus_ptr = xmalloc(sizeof(suspend_int_msg_t));
	sus_ptr->job_core_spec = job_ptr->details->core_spec;
	sus_ptr->job_id = job_ptr->job_id;
	sus_ptr->op = op;
	sus_ptr->indf_susp = indf_susp;
	/* Opaque switch plugin state gathered from all running steps */
	sus_ptr->switch_info = _switch_suspend_info(job_ptr);

#ifdef HAVE_FRONT_END
	xassert(job_ptr->batch_host);
	if (job_ptr->front_end_ptr) {
		agent_args->protocol_version =
			job_ptr->front_end_ptr->protocol_version;
	}
	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
	agent_args->node_count = 1;
#else
	/* Target every allocated node; use the lowest protocol version
	 * among them so the message can be understood by all. */
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		if (agent_args->protocol_version >
		    node_record_table_ptr[i].protocol_version)
			agent_args->protocol_version =
				node_record_table_ptr[i].protocol_version;
		hostlist_push_host(agent_args->hostlist,
				   node_record_table_ptr[i].name);
		agent_args->node_count++;
	}
#endif

	if (agent_args->node_count == 0) {
		/* No nodes to contact; free the unsent message (this also
		 * releases the switch_info blob). */
		slurm_free_suspend_int_msg(sus_ptr);
		xfree(agent_args);
		return;
	}

	agent_args->msg_args = sus_ptr;
	agent_queue_request(agent_args);
	return;
}
16055
16056 /*
16057 * Specified job is being suspended, release allocated nodes
16058 * job_ptr IN - job to be suspended
16059 * indf_susp IN - set if job is being suspended indefinitely by user
16060 * or admin, otherwise suspended for gang scheduling
16061 */
_suspend_job_nodes(job_record_t * job_ptr,bool indf_susp)16062 static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp)
16063 {
16064 int i, i_first, i_last, rc = SLURM_SUCCESS;
16065 node_record_t *node_ptr;
16066 uint32_t node_flags;
16067 time_t now = time(NULL);
16068
16069 if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS)
16070 return rc;
16071
16072 i_first = bit_ffs(job_ptr->node_bitmap);
16073 if (i_first >= 0)
16074 i_last = bit_fls(job_ptr->node_bitmap);
16075 else
16076 i_last = -2;
16077 node_ptr = node_record_table_ptr + i_first;
16078 for (i = i_first; i <= i_last; i++, node_ptr++) {
16079 if (!bit_test(job_ptr->node_bitmap, i))
16080 continue;
16081 node_ptr->sus_job_cnt++;
16082 if (node_ptr->run_job_cnt)
16083 (node_ptr->run_job_cnt)--;
16084 else {
16085 error("%s: %pJ node %s run_job_cnt underflow",
16086 __func__, job_ptr, node_ptr->name);
16087 }
16088 if (job_ptr->details && (job_ptr->details->share_res == 0)) {
16089 if (node_ptr->no_share_job_cnt)
16090 (node_ptr->no_share_job_cnt)--;
16091 else {
16092 error("%s: %pJ node %s no_share_job_cnt underflow",
16093 __func__, job_ptr, node_ptr->name);
16094 }
16095 if (node_ptr->no_share_job_cnt == 0)
16096 bit_set(share_node_bitmap, i);
16097 }
16098 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
16099 if ((node_ptr->run_job_cnt == 0) &&
16100 (node_ptr->comp_job_cnt == 0)) {
16101 bit_set(idle_node_bitmap, i);
16102 }
16103 if (IS_NODE_DOWN(node_ptr)) {
16104 debug3("%s: %pJ node %s left DOWN",
16105 __func__, job_ptr, node_ptr->name);
16106 } else if (node_ptr->run_job_cnt) {
16107 node_ptr->node_state = NODE_STATE_ALLOCATED |
16108 node_flags;
16109 } else {
16110 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
16111 node_ptr->last_idle = now;
16112 }
16113 }
16114 last_job_update = last_node_update = now;
16115 return rc;
16116 }
16117
16118 /*
16119 * Specified job is being resumed, re-allocate the nodes
16120 * job_ptr IN - job to be resumed
16121 * indf_susp IN - set i f job is being resumed from indefinite suspend by user
16122 * or admin, otherwise resume from gang scheduling
16123 */
_resume_job_nodes(job_record_t * job_ptr,bool indf_susp)16124 static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp)
16125 {
16126 int i, i_first, i_last, rc = SLURM_SUCCESS;
16127 node_record_t *node_ptr;
16128 uint32_t node_flags;
16129
16130 if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS)
16131 return rc;
16132
16133 i_first = bit_ffs(job_ptr->node_bitmap);
16134 if (i_first >= 0)
16135 i_last = bit_fls(job_ptr->node_bitmap);
16136 else
16137 i_last = -2;
16138 node_ptr = node_record_table_ptr + i_first;
16139 for (i = i_first; i <= i_last; i++, node_ptr++) {
16140 if (!bit_test(job_ptr->node_bitmap, i))
16141 continue;
16142 if (IS_NODE_DOWN(node_ptr))
16143 return SLURM_ERROR;
16144 }
16145
16146 node_ptr = node_record_table_ptr + i_first;
16147 for (i = i_first; i <= i_last; i++, node_ptr++) {
16148 if (!bit_test(job_ptr->node_bitmap, i))
16149 continue;
16150
16151 if (node_ptr->sus_job_cnt)
16152 (node_ptr->sus_job_cnt)--;
16153 else {
16154 error("Node %s sus_job_cnt underflow",
16155 node_ptr->name);
16156 }
16157 node_ptr->run_job_cnt++;
16158 if (job_ptr->details &&
16159 (job_ptr->details->share_res == 0)) {
16160 node_ptr->no_share_job_cnt++;
16161 if (node_ptr->no_share_job_cnt)
16162 bit_clear(share_node_bitmap, i);
16163 }
16164
16165 if (slurm_mcs_get_select(job_ptr) == 1) {
16166 xfree(node_ptr->mcs_label);
16167 node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
16168 }
16169
16170 bit_clear(idle_node_bitmap, i);
16171 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
16172 node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
16173 }
16174 last_job_update = last_node_update = time(NULL);
16175 return rc;
16176 }
16177
_job_suspend_switch_test(job_record_t * job_ptr)16178 static int _job_suspend_switch_test(job_record_t *job_ptr)
16179 {
16180 int rc = SLURM_SUCCESS;
16181 ListIterator step_iterator;
16182 step_record_t *step_ptr;
16183
16184 step_iterator = list_iterator_create(job_ptr->step_list);
16185 while ((step_ptr = list_next(step_iterator))) {
16186 if (step_ptr->state != JOB_RUNNING)
16187 continue;
16188 rc = switch_g_job_suspend_test(step_ptr->switch_job);
16189 if (rc != SLURM_SUCCESS)
16190 break;
16191 }
16192 list_iterator_destroy (step_iterator);
16193
16194 return rc;
16195 }
16196
16197 /*
16198 * Determine if a job can be resumed.
16199 * Check for multiple jobs on the same nodes with core specialization.
16200 * RET 0 on success, otherwise ESLURM error code
16201 */
_job_resume_test(job_record_t * job_ptr)16202 static int _job_resume_test(job_record_t *job_ptr)
16203 {
16204 int rc = SLURM_SUCCESS;
16205 ListIterator job_iterator;
16206 job_record_t *test_job_ptr;
16207
16208 if ((job_ptr->details == NULL) ||
16209 (job_ptr->details->core_spec == NO_VAL16) ||
16210 (job_ptr->node_bitmap == NULL))
16211 return rc;
16212
16213 job_iterator = list_iterator_create(job_list);
16214 while ((test_job_ptr = list_next(job_iterator))) {
16215 if (test_job_ptr->details &&
16216 (test_job_ptr->details->core_spec != NO_VAL16) &&
16217 IS_JOB_RUNNING(test_job_ptr) &&
16218 test_job_ptr->node_bitmap &&
16219 bit_overlap_any(test_job_ptr->node_bitmap,
16220 job_ptr->node_bitmap)) {
16221 rc = ESLURM_NODES_BUSY;
16222 break;
16223 }
16224 /* FIXME: Also test for ESLURM_INTERCONNECT_BUSY */
16225 }
16226 list_iterator_destroy(job_iterator);
16227
16228 return rc;
16229 }
16230
16231 /*
16232 * _job_suspend_op - perform some suspend/resume operation on a job
16233 * op IN - operation: suspend/resume
16234 * indf_susp IN - set if job is being suspended indefinitely by user or admin
16235 * and we should clear it's priority, otherwise suspended
16236 * temporarily for gang scheduling
16237 * RET 0 on success, otherwise ESLURM error code
16238 */
_job_suspend_op(job_record_t * job_ptr,uint16_t op,bool indf_susp)16239 static int _job_suspend_op(job_record_t *job_ptr, uint16_t op, bool indf_susp)
16240 {
16241 int rc = SLURM_SUCCESS;
16242 time_t now = time(NULL);
16243
16244 if (IS_JOB_PENDING(job_ptr))
16245 return ESLURM_JOB_PENDING;
16246 if (IS_JOB_FINISHED(job_ptr))
16247 return ESLURM_ALREADY_DONE;
16248 if ((op == SUSPEND_JOB) &&
16249 (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS))
16250 return ESLURM_NOT_SUPPORTED;
16251 if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr)))
16252 return rc;
16253
16254 /* perform the operation */
16255 if (op == SUSPEND_JOB) {
16256 if (IS_JOB_SUSPENDED(job_ptr) && indf_susp) {
16257 debug("%s: Holding %pJ, re-suspend operation",
16258 __func__, job_ptr);
16259 job_ptr->priority = 0; /* Prevent gang sched resume */
16260 return SLURM_SUCCESS;
16261 }
16262 if (!IS_JOB_RUNNING(job_ptr))
16263 return ESLURM_JOB_NOT_RUNNING;
16264 rc = _suspend_job_nodes(job_ptr, indf_susp);
16265 if (rc != SLURM_SUCCESS)
16266 return rc;
16267 _suspend_job(job_ptr, op, indf_susp);
16268 job_ptr->job_state = JOB_SUSPENDED;
16269 if (indf_susp) { /* Job being manually suspended, not gang */
16270 debug("%s: Holding %pJ, suspend operation",
16271 __func__, job_ptr);
16272 job_ptr->priority = 0;
16273 (void) gs_job_fini(job_ptr);
16274 }
16275 if (job_ptr->suspend_time) {
16276 job_ptr->pre_sus_time +=
16277 difftime(now, job_ptr->suspend_time);
16278 } else {
16279 job_ptr->pre_sus_time +=
16280 difftime(now, job_ptr->start_time);
16281 }
16282 suspend_job_step(job_ptr);
16283 } else if (op == RESUME_JOB) {
16284 if (!IS_JOB_SUSPENDED(job_ptr))
16285 return ESLURM_JOB_NOT_SUSPENDED;
16286 rc = _resume_job_nodes(job_ptr, indf_susp);
16287 power_g_job_resume(job_ptr);
16288 if (rc != SLURM_SUCCESS)
16289 return rc;
16290 _suspend_job(job_ptr, op, indf_susp);
16291 if (job_ptr->priority == 0) {
16292 /* Job was manually suspended, not gang */
16293 set_job_prio(job_ptr);
16294 (void) gs_job_start(job_ptr);
16295 }
16296 job_ptr->job_state = JOB_RUNNING;
16297 job_ptr->tot_sus_time +=
16298 difftime(now, job_ptr->suspend_time);
16299
16300 if ((job_ptr->time_limit != INFINITE) &&
16301 (!job_ptr->preempt_time)) {
16302 debug3("%pJ resumed, updating end_time", job_ptr);
16303 job_ptr->end_time_exp = job_ptr->end_time =
16304 now + (job_ptr->time_limit * 60)
16305 - job_ptr->pre_sus_time;
16306 }
16307 resume_job_step(job_ptr);
16308 }
16309
16310 job_ptr->time_last_active = now;
16311 job_ptr->suspend_time = now;
16312 jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
16313
16314 return rc;
16315 }
16316
16317
16318 /*
16319 * _job_suspend - perform some suspend/resume operation, if the specified
16320 * job records is a hetjob leader, perform the operation on all
16321 * components of the hetjob
16322 * job_ptr - job to operate upon
16323 * op IN - operation: suspend/resume
16324 * indf_susp IN - set if job is being suspended indefinitely by user or admin
16325 * and we should clear it's priority, otherwise suspended
16326 * temporarily for gang scheduling
16327 * RET 0 on success, otherwise ESLURM error code
16328 */
_job_suspend(job_record_t * job_ptr,uint16_t op,bool indf_susp)16329 static int _job_suspend(job_record_t *job_ptr, uint16_t op, bool indf_susp)
16330 {
16331 job_record_t *het_job;
16332 int rc = SLURM_SUCCESS, rc1;
16333 ListIterator iter;
16334
16335 if (job_ptr->het_job_id && !job_ptr->het_job_list)
16336 return ESLURM_NOT_WHOLE_HET_JOB;
16337
16338 /* Notify salloc/srun of suspend/resume */
16339 srun_job_suspend(job_ptr, op);
16340
16341 if (job_ptr->het_job_list) {
16342 iter = list_iterator_create(job_ptr->het_job_list);
16343 while ((het_job = list_next(iter))) {
16344 if (job_ptr->het_job_id != het_job->het_job_id) {
16345 error("%s: Bad het_job_list for %pJ",
16346 __func__, job_ptr);
16347 continue;
16348 }
16349 rc1 = _job_suspend_op(het_job, op, indf_susp);
16350 if (rc1 != SLURM_SUCCESS)
16351 rc = rc1;
16352 }
16353 list_iterator_destroy(iter);
16354 } else {
16355 rc = _job_suspend_op(job_ptr, op, indf_susp);
16356 }
16357
16358 return rc;
16359 }
16360
16361 /*
16362 * job_suspend - perform some suspend/resume operation
16363 * NOTE: job_suspend - Uses the job_id field and ignores job_id_str
16364 *
16365 * IN sus_ptr - suspend/resume request message
16366 * IN uid - user id of the user issuing the RPC
16367 * IN conn_fd - file descriptor on which to send reply,
16368 * -1 if none
16369 * indf_susp IN - set if job is being suspended indefinitely by user or admin
16370 * and we should clear it's priority, otherwise suspended
16371 * temporarily for gang scheduling
16372 * IN protocol_version - slurm protocol version of client
16373 * RET 0 on success, otherwise ESLURM error code
16374 */
job_suspend(suspend_msg_t * sus_ptr,uid_t uid,int conn_fd,bool indf_susp,uint16_t protocol_version)16375 extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
16376 int conn_fd, bool indf_susp,
16377 uint16_t protocol_version)
16378 {
16379 int rc = SLURM_SUCCESS;
16380 job_record_t *job_ptr = NULL;
16381 slurm_msg_t resp_msg;
16382 return_code_msg_t rc_msg;
16383
16384 xfree(sus_ptr->job_id_str);
16385 xstrfmtcat(sus_ptr->job_id_str, "%u", sus_ptr->job_id);
16386
16387 /* validate the request */
16388 if (!validate_operator(uid)) {
16389 error("SECURITY VIOLATION: Attempt to suspend job from user %u",
16390 (int) uid);
16391 rc = ESLURM_ACCESS_DENIED;
16392 goto reply;
16393 }
16394
16395 /* find the job */
16396 job_ptr = find_job_record (sus_ptr->job_id);
16397 if (job_ptr == NULL) {
16398 rc = ESLURM_INVALID_JOB_ID;
16399 goto reply;
16400 }
16401
16402 rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16403
16404 reply:
16405
16406 /* Since we have already used it lets make sure we don't leak
16407 memory */
16408 xfree(sus_ptr->job_id_str);
16409
16410 if (conn_fd >= 0) {
16411 slurm_msg_t_init(&resp_msg);
16412 resp_msg.protocol_version = protocol_version;
16413 resp_msg.msg_type = RESPONSE_SLURM_RC;
16414 memset(&rc_msg, 0, sizeof(rc_msg));
16415 rc_msg.return_code = rc;
16416 resp_msg.data = &rc_msg;
16417 slurm_send_node_msg(conn_fd, &resp_msg);
16418 }
16419 return rc;
16420 }
16421
16422 /*
16423 * job_suspend2 - perform some suspend/resume operation
16424 * NB job_suspend2 - Ignores the job_id field and uses job_id_str
16425 *
16426 * IN sus_ptr - suspend/resume request message
16427 * IN uid - user id of the user issuing the RPC
16428 * IN conn_fd - file descriptor on which to send reply,
16429 * -1 if none
16430 * indf_susp IN - set if job is being suspended indefinitely by user or admin
16431 * and we should clear it's priority, otherwise suspended
16432 * temporarily for gang scheduling
16433 * IN protocol_version - slurm protocol version of client
16434 * RET 0 on success, otherwise ESLURM error code
16435 */
job_suspend2(suspend_msg_t * sus_ptr,uid_t uid,int conn_fd,bool indf_susp,uint16_t protocol_version)16436 extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid,
16437 int conn_fd, bool indf_susp,
16438 uint16_t protocol_version)
16439 {
16440 int rc = SLURM_SUCCESS, rc2;
16441 job_record_t *job_ptr = NULL;
16442 long int long_id;
16443 uint32_t job_id = 0;
16444 char *end_ptr = NULL, *tok, *tmp;
16445 bitstr_t *array_bitmap = NULL;
16446 bool valid = true;
16447 int32_t i, i_first, i_last;
16448 slurm_msg_t resp_msg;
16449 return_code_msg_t rc_msg;
16450 resp_array_struct_t *resp_array = NULL;
16451 job_array_resp_msg_t *resp_array_msg = NULL;
16452
16453 if (max_array_size == NO_VAL) {
16454 max_array_size = slurmctld_conf.max_array_sz;
16455 }
16456
16457 /* validate the request */
16458 if (!validate_operator(uid)) {
16459 error("SECURITY VIOLATION: Attempt to suspend job from user %u",
16460 (int) uid);
16461 rc = ESLURM_ACCESS_DENIED;
16462 goto reply;
16463 }
16464
16465 long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10);
16466 if (end_ptr[0] == '+')
16467 rc = ESLURM_NOT_WHOLE_HET_JOB;
16468 else if ((long_id <= 0) || (long_id == LONG_MAX) ||
16469 ((end_ptr[0] != '\0') && (end_ptr[0] != '_')))
16470 rc = ESLURM_INVALID_JOB_ID;
16471 if (rc != SLURM_SUCCESS) {
16472 info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
16473 goto reply;
16474 }
16475
16476 job_id = (uint32_t) long_id;
16477 if (end_ptr[0] == '\0') { /* Single job (or full job array) */
16478 job_record_t *job_ptr_done = NULL;
16479 job_ptr = find_job_record(job_id);
16480 if (job_ptr &&
16481 (((job_ptr->array_task_id == NO_VAL) &&
16482 (job_ptr->array_recs == NULL)) ||
16483 ((job_ptr->array_task_id != NO_VAL) &&
16484 (job_ptr->array_job_id != job_id)))) {
16485 /* This is a regular job or single task of job array */
16486 rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16487 goto reply;
16488 }
16489
16490 if (job_ptr && job_ptr->array_recs) {
16491 /* This is a job array */
16492 job_ptr_done = job_ptr;
16493 rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16494 _resp_array_add(&resp_array, job_ptr, rc2);
16495 }
16496
16497 /* Suspend all tasks of this job array */
16498 job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
16499 if (!job_ptr && !job_ptr_done) {
16500 rc = ESLURM_INVALID_JOB_ID;
16501 goto reply;
16502 }
16503 while (job_ptr) {
16504 if ((job_ptr->array_job_id == job_id) &&
16505 (job_ptr != job_ptr_done)) {
16506 rc2 = _job_suspend(job_ptr, sus_ptr->op,
16507 indf_susp);
16508 _resp_array_add(&resp_array, job_ptr, rc2);
16509 }
16510 job_ptr = job_ptr->job_array_next_j;
16511 }
16512 goto reply;
16513 }
16514
16515 array_bitmap = bit_alloc(max_array_size);
16516 tmp = xstrdup(end_ptr + 1);
16517 tok = strtok_r(tmp, ",", &end_ptr);
16518 while (tok && valid) {
16519 valid = _parse_array_tok(tok, array_bitmap,
16520 max_array_size);
16521 tok = strtok_r(NULL, ",", &end_ptr);
16522 }
16523 xfree(tmp);
16524 if (valid) {
16525 i_last = bit_fls(array_bitmap);
16526 if (i_last < 0)
16527 valid = false;
16528 }
16529 if (!valid) {
16530 info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
16531 rc = ESLURM_INVALID_JOB_ID;
16532 goto reply;
16533 }
16534
16535 i_first = bit_ffs(array_bitmap);
16536 if (i_first >= 0)
16537 i_last = bit_fls(array_bitmap);
16538 else
16539 i_last = -2;
16540 for (i = i_first; i <= i_last; i++) {
16541 if (!bit_test(array_bitmap, i))
16542 continue;
16543 job_ptr = find_job_array_rec(job_id, i);
16544 if (job_ptr == NULL) {
16545 info("%s: invalid JobId=%u_%d", __func__, job_id, i);
16546 _resp_array_add_id(&resp_array, job_id, i,
16547 ESLURM_INVALID_JOB_ID);
16548 continue;
16549 }
16550 rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
16551 _resp_array_add(&resp_array, job_ptr, rc2);
16552 }
16553
16554 reply:
16555 if (conn_fd >= 0) {
16556 slurm_msg_t_init(&resp_msg);
16557 resp_msg.protocol_version = protocol_version;
16558 if (resp_array) {
16559 resp_array_msg = _resp_array_xlate(resp_array, job_id);
16560 resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS;
16561 resp_msg.data = resp_array_msg;
16562 } else {
16563 resp_msg.msg_type = RESPONSE_SLURM_RC;
16564 rc_msg.return_code = rc;
16565 resp_msg.data = &rc_msg;
16566 }
16567 slurm_send_node_msg(conn_fd, &resp_msg);
16568
16569 if (resp_array_msg) {
16570 slurm_free_job_array_resp(resp_array_msg);
16571 resp_msg.data = NULL;
16572 }
16573 }
16574 _resp_array_free(resp_array);
16575
16576 FREE_NULL_BITMAP(array_bitmap);
16577
16578 return rc;
16579 }
16580
/*
 * _job_requeue_op - Requeue a running or pending batch job
 * IN uid - user id of user issuing the RPC
 * IN job_ptr - job to be requeued
 * IN preempt - true if job being preempted
 * IN flags - state flags controlling the requeue, e.g. JOB_SPECIAL_EXIT,
 *	JOB_REQUEUE_HOLD, JOB_RECONFIG_FAIL, JOB_LAUNCH_FAILED
 * RET 0 on success, otherwise ESLURM error code
 */
static int _job_requeue_op(uid_t uid, job_record_t *job_ptr, bool preempt,
			   uint32_t flags)
{
	bool is_running = false, is_suspended = false, is_completed = false;
	bool is_completing = false;
	time_t now = time(NULL);
	uint32_t completing_flags = 0;

	/* validate the request: job owner, operator, or the account's
	 * coordinator may requeue */
	if ((uid != job_ptr->user_id) && !validate_operator(uid) &&
	    !assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
					  job_ptr->account)) {
		return ESLURM_ACCESS_DENIED;
	}

	/* If the caller only wants running jobs requeued and this one is
	 * neither running nor suspended, silently do nothing */
	if (((flags & JOB_STATE_BASE) == JOB_RUNNING) &&
	    !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
		return SLURM_SUCCESS;
	}

	/* Node reconfiguration failed: refresh node feature state */
	if (flags & JOB_RECONFIG_FAIL)
		node_features_g_get_node(job_ptr->nodes);

	/*
	 * If the partition was removed don't allow the job to be
	 * requeued. If it doesn't have details then something is very
	 * wrong and if the job doesn't want to be requeued don't.
	 */
	if (!job_ptr->part_ptr || !job_ptr->details
	    || !job_ptr->details->requeue) {
		if (flags & JOB_RECONFIG_FAIL)
			(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
		return ESLURM_DISABLED;
	}

	/* Only batch jobs carry a script that can be re-run */
	if (job_ptr->batch_flag == 0) {
		debug("Job-requeue can only be done for batch jobs");
		if (flags & JOB_RECONFIG_FAIL)
			(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
		return ESLURM_BATCH_ONLY;
	}

	/*
	 * If the job is already pending, just return an error.
	 * A federated origin job can be pending and revoked with a sibling job
	 * on another cluster.
	 */
	if (IS_JOB_PENDING(job_ptr) &&
	    (!job_ptr->fed_details || !job_ptr->fed_details->cluster_lock))
		return ESLURM_JOB_PENDING;

	if ((flags & JOB_RECONFIG_FAIL) && IS_JOB_CANCELLED(job_ptr)) {
		/*
		 * Job was cancelled (likely by the user) while node
		 * reconfiguration was in progress, so don't requeue it
		 * if the node reconfiguration failed.
		 */
		return ESLURM_DISABLED;
	}

	if (job_ptr->fed_details) {
		int rc;
		if ((rc = fed_mgr_job_requeue_test(job_ptr, flags)))
			return rc;

		/* Sent requeue request to origin cluster */
		if (job_ptr->job_state & JOB_REQUEUE_FED)
			return SLURM_SUCCESS;
	}

	last_job_update = now;

	/*
	 * If the job is in the process of completing
	 * return SLURM_SUCCESS and set the status
	 * to JOB_PENDING since we support requeue
	 * of done/exit/exiting jobs.
	 */
	if (IS_JOB_COMPLETING(job_ptr)) {
		completing_flags = job_ptr->job_state & JOB_STATE_FLAGS;
		is_completing = true;
	}

	if (IS_JOB_SUSPENDED(job_ptr)) {
		uint32_t suspend_job_state = job_ptr->job_state;
		/*
		 * we can't have it as suspended when we call the
		 * accounting stuff.
		 */
		job_ptr->job_state = JOB_REQUEUE;
		jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
		job_ptr->job_state = suspend_job_state;
		is_suspended = true;
	}

	job_ptr->time_last_active = now;
	/* A suspended job "ended" when it was suspended; a completing job
	 * keeps its original end time */
	if (is_suspended)
		job_ptr->end_time = job_ptr->suspend_time;
	else if (!is_completing)
		job_ptr->end_time = now;

	/*
	 * Save the state of the job so that
	 * we deallocate the nodes if is in
	 * running state.
	 */
	if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr))
		is_running = true;
	else if (IS_JOB_COMPLETED(job_ptr))
		is_completed = true;

	/* Only change state to requeue for local jobs */
	if (fed_mgr_is_origin_job(job_ptr) &&
	    !fed_mgr_is_tracker_only_job(job_ptr)) {
		/*
		 * We want this job to have the requeued/preempted state in the
		 * accounting logs. Set a new submit time so the restarted
		 * job looks like a new job.
		 */
		if (preempt) {
			job_ptr->job_state = JOB_PREEMPTED;
			build_cg_bitmap(job_ptr);
			job_completion_logger(job_ptr, false);
			job_ptr->job_state = JOB_REQUEUE;
		} else {
			job_ptr->job_state = JOB_REQUEUE;
			build_cg_bitmap(job_ptr);
			job_completion_logger(job_ptr, true);
		}
	}

	/*
	 * Increment restart counter before completing reply so that completing
	 * jobs get counted and so that fed jobs get counted before submitting
	 * new siblings in batch_requeue_fini()
	 */
	job_ptr->restart_cnt++;

	/* A completing job keeps its completing flags; the node cleanup
	 * already underway will finish the requeue */
	if (is_completing) {
		job_ptr->job_state = JOB_PENDING | completing_flags;
		goto reply;
	}

	/*
	 * Deallocate resources only if the job has some.
	 * JOB_COMPLETING is needed to properly clean up steps.
	 */
	if (is_running) {
		job_ptr->job_state |= JOB_COMPLETING;
		deallocate_nodes(job_ptr, false, is_suspended, preempt);
		job_ptr->job_state &= (~JOB_COMPLETING);
	}

	/* do this after the epilog complete, setting it here is too early */
	//job_ptr->db_index = 0;
	//job_ptr->details->submit_time = now;

	job_ptr->job_state = JOB_PENDING;
	if (job_ptr->node_cnt)
		job_ptr->job_state |= JOB_COMPLETING;

	/*
	 * Mark the origin job as requeueing. Will finish requeueing fed job
	 * after job has completed.
	 * If it's completed, batch_requeue_fini is called below and will call
	 * fed_mgr_job_requeue() to submit new siblings.
	 * If it's not completed, batch_requeue_fini will either be called when
	 * the running origin job finishes or the running remote sibling job
	 * reports that the job is finished.
	 */
	if (job_ptr->fed_details && !is_completed) {
		job_ptr->job_state |= JOB_COMPLETING;
		job_ptr->job_state |= JOB_REQUEUE_FED;
	}

	/*
	 * If we set the time limit it means the user didn't so reset
	 * it here or we could bust some limit when we try again
	 */
	if (job_ptr->limit_set.time == 1) {
		job_ptr->time_limit = NO_VAL;
		job_ptr->limit_set.time = 0;
	}

reply:
	/* Clear all suspend bookkeeping for the restarted job */
	job_ptr->pre_sus_time = (time_t) 0;
	job_ptr->suspend_time = (time_t) 0;
	job_ptr->tot_sus_time = (time_t) 0;

	job_ptr->db_flags = 0;

	/* clear signal sent flag on requeue */
	job_ptr->warn_flags &= ~WARN_SENT;

	/*
	 * Since the job completion logger removes the submit we need
	 * to add it again.
	 */
	acct_policy_add_job_submit(job_ptr);

	acct_policy_update_pending_job(job_ptr);

	/* Hold the job (priority 0) when the special-exit or requeue-hold
	 * flags are set; the user/admin must release it */
	if (flags & JOB_SPECIAL_EXIT) {
		job_ptr->job_state |= JOB_SPECIAL_EXIT;
		job_ptr->state_reason = WAIT_HELD_USER;
		xfree(job_ptr->state_desc);
		job_ptr->state_desc =
			xstrdup("job requeued in special exit state");
		debug("%s: Holding %pJ, special exit", __func__, job_ptr);
		job_ptr->priority = 0;
	}
	if (flags & JOB_REQUEUE_HOLD) {
		job_ptr->state_reason = WAIT_HELD_USER;
		xfree(job_ptr->state_desc);
		if (flags & JOB_LAUNCH_FAILED) {
			job_ptr->state_desc
				= xstrdup("launch failed requeued held");
		} else {
			job_ptr->state_desc
				= xstrdup("job requeued in held state");
		}
		debug("%s: Holding %pJ, requeue-hold exit", __func__, job_ptr);
		job_ptr->priority = 0;
	}

	/*
	 * When jobs are requeued while running/completing batch_requeue_fini is
	 * called after the job is completely finished. If the job is already
	 * finished it needs to be called to clear out states (especially the
	 * db_index or we will just write over the last job in the database).
	 * Call batch_requeue_fini after setting priority to 0 for requeue_hold
	 * and special_exit so federation doesn't submit siblings for held job.
	 */
	if (is_completed)
		batch_requeue_fini(job_ptr);

	debug("%s: %pJ state 0x%x reason %u priority %d",
	      __func__, job_ptr, job_ptr->job_state,
	      job_ptr->state_reason, job_ptr->priority);

	return SLURM_SUCCESS;
}
16830
16831 /*
16832 * _job_requeue - Requeue a running or pending batch job, if the specified
16833 * job records is a hetjob leader, perform the operation on all
16834 * components of the hetjob
16835 * IN uid - user id of user issuing the RPC
16836 * IN job_ptr - job to be requeued
16837 * IN preempt - true if job being preempted
16838 * RET 0 on success, otherwise ESLURM error code
16839 */
_job_requeue(uid_t uid,job_record_t * job_ptr,bool preempt,uint32_t flags)16840 static int _job_requeue(uid_t uid, job_record_t *job_ptr, bool preempt,
16841 uint32_t flags)
16842 {
16843 job_record_t *het_job;
16844 int rc = SLURM_SUCCESS, rc1;
16845 ListIterator iter;
16846
16847 if (job_ptr->het_job_id && !job_ptr->het_job_list)
16848 return ESLURM_NOT_HET_JOB_LEADER;
16849
16850 if (job_ptr->het_job_list) {
16851 iter = list_iterator_create(job_ptr->het_job_list);
16852 while ((het_job = list_next(iter))) {
16853 if (job_ptr->het_job_id != het_job->het_job_id) {
16854 error("%s: Bad het_job_list for %pJ",
16855 __func__, job_ptr);
16856 continue;
16857 }
16858 rc1 = _job_requeue_op(uid, het_job, preempt, flags);
16859 if (rc1 != SLURM_SUCCESS)
16860 rc = rc1;
16861 }
16862 list_iterator_destroy(iter);
16863 } else {
16864 rc = _job_requeue_op(uid, job_ptr, preempt, flags);
16865 }
16866
16867 return rc;
16868 }
16869
16870 /*
16871 * job_requeue - Requeue a running or pending batch job
16872 * IN uid - user id of user issuing the RPC
16873 * IN job_id - id of the job to be requeued
16874 * IN msg - slurm_msg to send response back on
16875 * IN preempt - true if job being preempted
16876 * IN flags - JobExitRequeue | Hold | JobFailed | etc.
16877 * RET 0 on success, otherwise ESLURM error code
16878 */
job_requeue(uid_t uid,uint32_t job_id,slurm_msg_t * msg,bool preempt,uint32_t flags)16879 extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg,
16880 bool preempt, uint32_t flags)
16881 {
16882 int rc = SLURM_SUCCESS;
16883 job_record_t *job_ptr = NULL;
16884
16885 /* find the job */
16886 job_ptr = find_job_record(job_id);
16887 if (job_ptr == NULL) {
16888 rc = ESLURM_INVALID_JOB_ID;
16889 } else {
16890 /* _job_requeue already handles het jobs */
16891 rc = _job_requeue(uid, job_ptr, preempt, flags);
16892 }
16893
16894 if (msg) {
16895 slurm_send_rc_msg(msg, rc);
16896 }
16897
16898 return rc;
16899 }
16900
16901 /*
16902 * job_requeue2 - Requeue a running or pending batch job
16903 * IN uid - user id of user issuing the RPC
16904 * IN req_ptr - request including ID of the job to be requeued
16905 * IN msg - slurm_msg to send response back on
16906 * IN preempt - true if job being preempted
16907 * RET 0 on success, otherwise ESLURM error code
16908 */
job_requeue2(uid_t uid,requeue_msg_t * req_ptr,slurm_msg_t * msg,bool preempt)16909 extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg,
16910 bool preempt)
16911 {
16912 int rc = SLURM_SUCCESS, rc2;
16913 job_record_t *job_ptr = NULL;
16914 long int long_id;
16915 uint32_t job_id = 0;
16916 char *end_ptr = NULL, *tok, *tmp;
16917 bitstr_t *array_bitmap = NULL;
16918 bool valid = true;
16919 int32_t i, i_first, i_last;
16920 slurm_msg_t resp_msg;
16921 return_code_msg_t rc_msg;
16922 uint32_t flags = req_ptr->flags;
16923 char *job_id_str = req_ptr->job_id_str;
16924 resp_array_struct_t *resp_array = NULL;
16925 job_array_resp_msg_t *resp_array_msg = NULL;
16926
16927 if (max_array_size == NO_VAL) {
16928 max_array_size = slurmctld_conf.max_array_sz;
16929 }
16930
16931 long_id = strtol(job_id_str, &end_ptr, 10);
16932 if ((long_id <= 0) || (long_id == LONG_MAX) ||
16933 ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
16934 info("%s: invalid JobId=%s", __func__, job_id_str);
16935 rc = ESLURM_INVALID_JOB_ID;
16936 goto reply;
16937 }
16938 if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
16939 end_ptr += 2; /* Defaults to full job array */
16940
16941 job_id = (uint32_t) long_id;
16942 if (end_ptr[0] == '\0') { /* Single job (or full job array) */
16943 job_record_t *job_ptr_done = NULL;
16944 job_ptr = find_job_record(job_id);
16945 if (job_ptr &&
16946 (((job_ptr->array_task_id == NO_VAL) &&
16947 (job_ptr->array_recs == NULL)) ||
16948 ((job_ptr->array_task_id != NO_VAL) &&
16949 (job_ptr->array_job_id != job_id)))) {
16950 /* This is a regular job or single task of job array */
16951 rc = _job_requeue(uid, job_ptr, preempt, flags);
16952 goto reply;
16953 }
16954
16955 if (job_ptr && job_ptr->array_recs) {
16956 /* This is a job array */
16957 job_ptr_done = job_ptr;
16958 rc2 = _job_requeue(uid, job_ptr, preempt, flags);
16959 _resp_array_add(&resp_array, job_ptr, rc2);
16960 }
16961
16962 /* Requeue all tasks of this job array */
16963 job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
16964 if (!job_ptr && !job_ptr_done) {
16965 rc = ESLURM_INVALID_JOB_ID;
16966 goto reply;
16967 }
16968 while (job_ptr) {
16969 if ((job_ptr->array_job_id == job_id) &&
16970 (job_ptr != job_ptr_done)) {
16971 rc2 = _job_requeue(uid, job_ptr, preempt,flags);
16972 _resp_array_add(&resp_array, job_ptr, rc2);
16973 }
16974 job_ptr = job_ptr->job_array_next_j;
16975 }
16976 goto reply;
16977 }
16978
16979 array_bitmap = bit_alloc(max_array_size);
16980 tmp = xstrdup(end_ptr + 1);
16981 tok = strtok_r(tmp, ",", &end_ptr);
16982 while (tok && valid) {
16983 valid = _parse_array_tok(tok, array_bitmap,
16984 max_array_size);
16985 tok = strtok_r(NULL, ",", &end_ptr);
16986 }
16987 xfree(tmp);
16988 if (valid) {
16989 i_last = bit_fls(array_bitmap);
16990 if (i_last < 0)
16991 valid = false;
16992 }
16993 if (!valid) {
16994 info("%s: invalid JobId=%s", __func__, job_id_str);
16995 rc = ESLURM_INVALID_JOB_ID;
16996 goto reply;
16997 }
16998
16999 i_first = bit_ffs(array_bitmap);
17000 if (i_first >= 0)
17001 i_last = bit_fls(array_bitmap);
17002 else
17003 i_last = -2;
17004 for (i = i_first; i <= i_last; i++) {
17005 if (!bit_test(array_bitmap, i))
17006 continue;
17007 job_ptr = find_job_array_rec(job_id, i);
17008 if (job_ptr == NULL) {
17009 info("%s: invalid JobId=%u_%d", __func__, job_id, i);
17010 _resp_array_add_id(&resp_array, job_id, i,
17011 ESLURM_INVALID_JOB_ID);
17012 continue;
17013 }
17014
17015 rc2 = _job_requeue(uid, job_ptr, preempt, flags);
17016 _resp_array_add(&resp_array, job_ptr, rc2);
17017 }
17018
17019 reply:
17020 if (msg) {
17021 response_init(&resp_msg, msg);
17022 if (resp_array) {
17023 resp_array_msg = _resp_array_xlate(resp_array, job_id);
17024 resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS;
17025 resp_msg.data = resp_array_msg;
17026 } else {
17027 resp_msg.msg_type = RESPONSE_SLURM_RC;
17028 rc_msg.return_code = rc;
17029 resp_msg.data = &rc_msg;
17030 }
17031 slurm_send_node_msg(msg->conn_fd, &resp_msg);
17032
17033 if (resp_array_msg) {
17034 slurm_free_job_array_resp(resp_array_msg);
17035 resp_msg.data = NULL;
17036 }
17037 }
17038 _resp_array_free(resp_array);
17039
17040 FREE_NULL_BITMAP(array_bitmap);
17041
17042 return rc;
17043 }
17044
_top_job_flag_clear(void * x,void * arg)17045 static int _top_job_flag_clear(void *x, void *arg)
17046 {
17047 job_record_t *job_ptr = (job_record_t *) x;
17048 job_ptr->bit_flags &= (~TOP_PRIO_TMP);
17049 return 0;
17050 }
17051
/* List sort callback: descending order, so the highest priority values
 * come off the list first. Each list element is a pointer to a uint32_t
 * priority value. */
static int _top_job_prio_sort(void *x, void *y)
{
	uint32_t p1 = **(uint32_t **) x;
	uint32_t p2 = **(uint32_t **) y;

	if (p1 < p2)
		return 1;
	return (p1 > p2) ? -1 : 0;
}
17064
/*
 * _set_top - Raise the priority of the jobs in top_job_list above other
 *	pending jobs of the same user/partition/account/QOS. The priority
 *	"points" given to the top jobs are taken back from the other jobs
 *	by adjusting their nice values, so the total is conserved.
 * IN top_job_list - list of job_record_t pointers to be favored
 * IN uid - requesting user; 0 means administrator (any user's jobs)
 * RET SLURM_SUCCESS or an ESLURM error code
 */
static int _set_top(List top_job_list, uid_t uid)
{
	List prio_list, other_job_list;
	ListIterator iter;
	job_record_t *job_ptr, *first_job_ptr = NULL;
	int rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
	uint32_t last_prio = NO_VAL, next_prio;
	int64_t delta_prio, delta_nice, total_delta = 0;
	int other_job_cnt = 0;
	uint32_t *prio_elem;

	xassert(job_list);
	xassert(top_job_list);
	prio_list = list_create(xfree_ptr);
	/* TOP_PRIO_TMP marks jobs already accounted for in this pass */
	(void) list_for_each(job_list, _top_job_flag_clear, NULL);

	/* Validate the jobs in our "top" list */
	iter = list_iterator_create(top_job_list);
	while ((job_ptr = list_next(iter))) {
		if ((job_ptr->user_id != uid) && (uid != 0)) {
			error("Security violation: REQUEST_TOP_JOB for %pJ from uid=%u",
			      job_ptr, uid);
			rc = ESLURM_ACCESS_DENIED;
			break;
		}
		if (!IS_JOB_PENDING(job_ptr) || (job_ptr->details == NULL)) {
			debug("%s: %pJ not pending", __func__, job_ptr);
			list_remove(iter);
			rc2 = ESLURM_JOB_NOT_PENDING;
			continue;
		}
		if (job_ptr->part_ptr_list) {
			/* Multi-partition jobs cannot be reordered here */
			debug("%s: %pJ in partition list", __func__, job_ptr);
			list_remove(iter);
			rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
			break;
		}
		if (job_ptr->priority == 0) {
			debug("%s: %pJ is held", __func__, job_ptr);
			list_remove(iter);
			rc2 = ESLURM_JOB_HELD;
			continue;
		}
		if (job_ptr->bit_flags & TOP_PRIO_TMP) {
			/* Duplicate job ID */
			list_remove(iter);
			continue;
		}
		if (!first_job_ptr)
			first_job_ptr = job_ptr;
		job_ptr->bit_flags |= TOP_PRIO_TMP;
		/* Collect current priorities; they will be redistributed */
		prio_elem = xmalloc(sizeof(uint32_t));
		*prio_elem = job_ptr->priority;
		list_append(prio_list, prio_elem);
	}
	list_iterator_destroy(iter);
	if (rc != SLURM_SUCCESS) {
		FREE_NULL_LIST(prio_list);
		return rc;
	}
	if (!first_job_ptr) {
		/* Nothing usable in the request; report last soft error */
		FREE_NULL_LIST(prio_list);
		return rc2;
	}

	/* Identify other jobs which we can adjust the nice value of */
	other_job_list = list_create(NULL);
	iter = list_iterator_create(job_list);
	while ((job_ptr = list_next(iter))) {
		/*
		 * Do not select jobs with priority 0 (held), or
		 * priority 1 (would be held if we lowered the priority).
		 */
		if ((job_ptr->bit_flags & TOP_PRIO_TMP) ||
		    (job_ptr->details == NULL) ||
		    (job_ptr->part_ptr_list) ||
		    (job_ptr->priority <= 1) ||
		    (job_ptr->assoc_ptr != first_job_ptr->assoc_ptr) ||
		    (job_ptr->part_ptr != first_job_ptr->part_ptr) ||
		    (job_ptr->qos_ptr != first_job_ptr->qos_ptr) ||
		    (job_ptr->user_id != first_job_ptr->user_id) ||
		    (!IS_JOB_PENDING(job_ptr)))
			continue;
		other_job_cnt++;
		job_ptr->bit_flags |= TOP_PRIO_TMP;
		prio_elem = xmalloc(sizeof(uint32_t));
		*prio_elem = job_ptr->priority;
		list_append(prio_list, prio_elem);
		list_append(other_job_list, job_ptr);
	}
	list_iterator_destroy(iter);

	/* Now adjust nice values and priorities of the listed "top" jobs:
	 * hand out the collected priorities, highest first */
	list_sort(prio_list, _top_job_prio_sort);
	iter = list_iterator_create(top_job_list);
	while ((job_ptr = list_next(iter))) {
		prio_elem = list_pop(prio_list);
		next_prio = *prio_elem;
		xfree(prio_elem);
		if ((last_prio != NO_VAL) && (next_prio == last_prio) &&
		    (last_prio > 2))
			/*
			 * We don't want to set job priority lower than 1, so
			 * last_prio cannot be smaller than 2, since we will
			 * later use last_prio - 1 for the new job priority.
			 */
			next_prio = last_prio - 1;
		last_prio = next_prio;
		delta_prio = (int64_t) next_prio - job_ptr->priority;
		/* Consume the job's own nice before borrowing from others */
		delta_nice = MIN(job_ptr->details->nice, delta_prio);
		total_delta += delta_nice;
		job_ptr->priority = next_prio;
		job_ptr->details->nice -= delta_nice;
		job_ptr->bit_flags &= (~TOP_PRIO_TMP);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(prio_list);

	/* Now adjust nice values and priorities of remaining affected jobs,
	 * spreading the total priority delta across them */
	if (other_job_cnt) {
		iter = list_iterator_create(other_job_list);
		while ((job_ptr = list_next(iter))) {
			delta_prio = total_delta / other_job_cnt;
			next_prio = job_ptr->priority - delta_prio;
			if (next_prio >= last_prio) {
				/* Keep "top" jobs strictly on top */
				next_prio = last_prio - 1;
				delta_prio = job_ptr->priority - next_prio;
			}
			delta_nice = delta_prio;
			job_ptr->priority = next_prio;
			job_ptr->details->nice += delta_nice;
			job_ptr->bit_flags &= (~TOP_PRIO_TMP);
			total_delta -= delta_nice;
			if (--other_job_cnt == 0)
				break;	/* Count will match list size anyway */
		}
		list_iterator_destroy(iter);
	}
	FREE_NULL_LIST(other_job_list);

	last_job_update = time(NULL);

	return rc;
}
17209
/*
 * job_set_top - Move the specified jobs to the top of the queue (at least
 *	for that user ID, partition, account, and QOS).
 *
 * IN top_ptr - user request (comma-separated list of job IDs; a job array
 *	may be given as "123" or "123_*", a single task as "123_4")
 * IN uid - user id of the user issuing the RPC
 * IN conn_fd - file descriptor on which to send reply,
 *	-1 if none
 * IN protocol_version - slurm protocol version of client
 * RET 0 on success, otherwise ESLURM error code
 */
extern int job_set_top(top_job_msg_t *top_ptr, uid_t uid, int conn_fd,
		       uint16_t protocol_version)
{
	int rc = SLURM_SUCCESS;
	List top_job_list = NULL;
	char *job_str_tmp = NULL, *tok, *save_ptr = NULL, *end_ptr = NULL;
	job_record_t *job_ptr = NULL;
	long int long_id;
	uint32_t job_id = 0, task_id = 0;
	slurm_msg_t resp_msg;
	return_code_msg_t rc_msg;

	if (validate_operator(uid)) {
		/* Operators/admins may reorder any user's jobs */
		uid = 0;
	} else {
		/* Regular users only if "enable_user_top" is configured in
		 * SchedulerParameters */
		bool disable_user_top = true;
		char *sched_params = slurm_get_sched_params();
		if (xstrcasestr(sched_params, "enable_user_top"))
			disable_user_top = false;
		xfree(sched_params);
		if (disable_user_top) {
			rc = ESLURM_ACCESS_DENIED;
			goto reply;
		}
	}

	top_job_list = list_create(NULL);
	job_str_tmp = xstrdup(top_ptr->job_id_str);
	/* Parse the comma-separated job ID list */
	tok = strtok_r(job_str_tmp, ",", &save_ptr);
	while (tok) {
		long_id = strtol(tok, &end_ptr, 10);
		if ((long_id <= 0) || (long_id == LONG_MAX) ||
		    ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
			info("%s: invalid job id %s", __func__, tok);
			rc = ESLURM_INVALID_JOB_ID;
			goto reply;
		}
		job_id = (uint32_t) long_id;
		if ((end_ptr[0] == '\0') || /* Single job (or full job array) */
		    ((end_ptr[0] == '_') && (end_ptr[1] == '*') &&
		     (end_ptr[2] == '\0'))) {
			job_ptr = find_job_record(job_id);
			if (!job_ptr) {
				rc = ESLURM_INVALID_JOB_ID;
				goto reply;
			}
			list_append(top_job_list, job_ptr);
		} else if (end_ptr[0] != '_') { /* Invalid job ID spec */
			rc = ESLURM_INVALID_JOB_ID;
			goto reply;
		} else {		/* Single task of a job array */
			task_id = strtol(end_ptr + 1, &end_ptr, 10);
			if (end_ptr[0] != '\0') { /* Invalid job ID spec */
				rc = ESLURM_INVALID_JOB_ID;
				goto reply;
			}
			job_ptr = find_job_array_rec(job_id, task_id);
			if (!job_ptr) {
				rc = ESLURM_INVALID_JOB_ID;
				goto reply;
			}
			list_append(top_job_list, job_ptr);
		}
		tok = strtok_r(NULL, ",", &save_ptr);
	}

	if (list_count(top_job_list) == 0) {
		rc = ESLURM_INVALID_JOB_ID;
		goto reply;
	}
	/* All IDs resolved; perform the priority adjustment */
	rc = _set_top(top_job_list, uid);

reply:	FREE_NULL_LIST(top_job_list);
	xfree(job_str_tmp);
	if (conn_fd >= 0) {
		slurm_msg_t_init(&resp_msg);
		resp_msg.protocol_version = protocol_version;
		resp_msg.msg_type = RESPONSE_SLURM_RC;
		memset(&rc_msg, 0, sizeof(rc_msg));
		rc_msg.return_code = rc;
		resp_msg.data = &rc_msg;
		slurm_send_node_msg(conn_fd, &resp_msg);
	}

	return rc;
}
17307
17308 /*
17309 * job_end_time - Process JOB_END_TIME
17310 * IN time_req_msg - job end time request
17311 * OUT timeout_msg - job timeout response to be sent
17312 * RET SLURM_SUCCESS or an error code
17313 */
job_end_time(job_alloc_info_msg_t * time_req_msg,srun_timeout_msg_t * timeout_msg)17314 extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
17315 srun_timeout_msg_t *timeout_msg)
17316 {
17317 job_record_t *job_ptr;
17318 xassert(timeout_msg);
17319
17320 job_ptr = find_job_record(time_req_msg->job_id);
17321 if (!job_ptr)
17322 return ESLURM_INVALID_JOB_ID;
17323
17324 memset(timeout_msg, 0, sizeof(srun_timeout_msg_t));
17325 timeout_msg->job_id = time_req_msg->job_id;
17326 timeout_msg->step_id = NO_VAL;
17327 timeout_msg->timeout = job_ptr->end_time;
17328 return SLURM_SUCCESS;
17329 }
17330
17331 /* Reset nodes_completing field for all jobs. */
update_job_nodes_completing(void)17332 extern void update_job_nodes_completing(void)
17333 {
17334 ListIterator job_iterator;
17335 job_record_t *job_ptr;
17336
17337 xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
17338
17339 if (!job_list)
17340 return;
17341
17342 job_iterator = list_iterator_create(job_list);
17343 while ((job_ptr = list_next(job_iterator))) {
17344 if ((!IS_JOB_COMPLETING(job_ptr)) ||
17345 (job_ptr->node_bitmap == NULL))
17346 continue;
17347 xfree(job_ptr->nodes_completing);
17348 if (job_ptr->node_bitmap_cg) {
17349 job_ptr->nodes_completing =
17350 bitmap2node_name(job_ptr->node_bitmap_cg);
17351 } else {
17352 job_ptr->nodes_completing =
17353 bitmap2node_name(job_ptr->node_bitmap);
17354 }
17355 }
17356 list_iterator_destroy(job_iterator);
17357 }
17358
17359 /*
17360 * job_hold_by_assoc_id - Hold all pending jobs with a given
17361 * association ID. This happens when an association is deleted (e.g. when
17362 * a user is removed from the association database).
17363 * RET count of held jobs
17364 */
job_hold_by_assoc_id(uint32_t assoc_id)17365 extern int job_hold_by_assoc_id(uint32_t assoc_id)
17366 {
17367 int cnt = 0;
17368 ListIterator job_iterator;
17369 job_record_t *job_ptr;
17370 /* Write lock on jobs */
17371 slurmctld_lock_t job_write_lock =
17372 { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
17373
17374 if (!job_list)
17375 return cnt;
17376
17377 lock_slurmctld(job_write_lock);
17378 job_iterator = list_iterator_create(job_list);
17379 while ((job_ptr = list_next(job_iterator))) {
17380 if (job_ptr->assoc_id != assoc_id)
17381 continue;
17382
17383 cnt += _job_fail_account(job_ptr, __func__);
17384 }
17385 list_iterator_destroy(job_iterator);
17386 unlock_slurmctld(job_write_lock);
17387 return cnt;
17388 }
17389
17390 /*
17391 * job_hold_by_qos_id - Hold all pending jobs with a given
17392 * QOS ID. This happens when a QOS is deleted (e.g. when
17393 * a QOS is removed from the association database).
17394 * RET count of held jobs
17395 */
job_hold_by_qos_id(uint32_t qos_id)17396 extern int job_hold_by_qos_id(uint32_t qos_id)
17397 {
17398 int cnt = 0;
17399 ListIterator job_iterator;
17400 job_record_t *job_ptr;
17401 /* Write lock on jobs */
17402 slurmctld_lock_t job_write_lock =
17403 { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
17404
17405 if (!job_list)
17406 return cnt;
17407
17408 lock_slurmctld(job_write_lock);
17409 job_iterator = list_iterator_create(job_list);
17410 while ((job_ptr = list_next(job_iterator))) {
17411 if (job_ptr->qos_blocking_ptr &&
17412 ((slurmdb_qos_rec_t *)job_ptr->qos_blocking_ptr)->id
17413 == qos_id)
17414 job_ptr->qos_blocking_ptr = NULL;
17415 if (job_ptr->qos_id != qos_id)
17416 continue;
17417
17418 cnt += job_fail_qos(job_ptr, __func__);
17419 }
17420 list_iterator_destroy(job_iterator);
17421 unlock_slurmctld(job_write_lock);
17422 return cnt;
17423 }
17424
17425 /*
17426 * Modify the account associated with a pending job
17427 * IN module - where this is called from
17428 * IN job_ptr - pointer to job which should be modified
17429 * IN new_wckey - desired wckey name
17430 * RET SLURM_SUCCESS or error code
17431 */
/* Update a pending job's wckey, validating it against the accounting
 * database first.  When the wckey is unknown and enforcement is off, fall
 * back to the user's default wckey rather than recording garbage. */
extern int update_job_wckey(char *module, job_record_t *job_ptr,
			    char *new_wckey)
{
	slurmdb_wckey_rec_t wckey_rec, *wckey_ptr;

	/* Only pending jobs with a details record may be modified */
	if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) {
		/* NOTE(review): message text says "account" although this
		 * function updates the wckey — confirm before changing */
		info("%s: attempt to modify account for non-pending %pJ",
		     module, job_ptr);
		return ESLURM_JOB_NOT_PENDING;
	}

	/* Validate the requested wckey for this user */
	memset(&wckey_rec, 0, sizeof(wckey_rec));
	wckey_rec.uid = job_ptr->user_id;
	wckey_rec.name = new_wckey;
	if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
				    accounting_enforce, &wckey_ptr, false)) {
		info("%s: invalid wckey %s for %pJ",
		     module, new_wckey, job_ptr);
		return ESLURM_INVALID_WCKEY;
	} else if (association_based_accounting
		   && !wckey_ptr
		   && !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) {
		/* if not enforcing associations we want to look for
		   the default account and use it to avoid getting
		   trash in the accounting records.
		*/
		/* NULL name makes assoc_mgr return the default wckey */
		wckey_rec.name = NULL;
		assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
					accounting_enforce, &wckey_ptr, false);
		if (!wckey_ptr) {
			debug("%s: we didn't have a wckey record for wckey "
			      "'%s' and user '%u', and we can't seem to find "
			      "a default one either. Setting it anyway. "
			      "This will produce trash in accounting. "
			      "If this is not what you desire please put "
			      "AccountStorageEnforce=wckeys in your slurm.conf "
			      "file.", module, new_wckey,
			      job_ptr->user_id);
			/* No default found; record the requested name anyway */
			wckey_rec.name = new_wckey;
		}
	}

	/* Commit the (possibly substituted) wckey onto the job record */
	xfree(job_ptr->wckey);
	if (wckey_rec.name && wckey_rec.name[0] != '\0') {
		job_ptr->wckey = xstrdup(wckey_rec.name);
		info("%s: setting wckey to %s for %pJ",
		     module, wckey_rec.name, job_ptr);
	} else {
		info("%s: cleared wckey for %pJ", module, job_ptr);
	}

	last_job_update = time(NULL);

	return SLURM_SUCCESS;
}
17487
send_jobs_to_accounting(void)17488 extern int send_jobs_to_accounting(void)
17489 {
17490 ListIterator itr = NULL;
17491 job_record_t *job_ptr;
17492 slurmctld_lock_t job_write_lock = {
17493 NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
17494
17495 /* send jobs in pending or running state */
17496 lock_slurmctld(job_write_lock);
17497 itr = list_iterator_create(job_list);
17498 while ((job_ptr = list_next(itr))) {
17499 if (!job_ptr->assoc_id) {
17500 slurmdb_assoc_rec_t assoc_rec;
17501 memset(&assoc_rec, 0,
17502 sizeof(assoc_rec));
17503 assoc_rec.acct = job_ptr->account;
17504 if (job_ptr->part_ptr)
17505 assoc_rec.partition = job_ptr->part_ptr->name;
17506 assoc_rec.uid = job_ptr->user_id;
17507
17508 if (assoc_mgr_fill_in_assoc(
17509 acct_db_conn, &assoc_rec,
17510 accounting_enforce,
17511 &job_ptr->assoc_ptr, false) &&
17512 (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
17513 && (!IS_JOB_FINISHED(job_ptr))) {
17514 _job_fail_account(job_ptr, __func__);
17515 continue;
17516 } else
17517 job_ptr->assoc_id = assoc_rec.id;
17518 }
17519
17520 /* we only want active, un accounted for jobs */
17521 if (job_ptr->db_index || IS_JOB_FINISHED(job_ptr))
17522 continue;
17523
17524 debug("first reg: starting %pJ in accounting", job_ptr);
17525 jobacct_storage_g_job_start(acct_db_conn, job_ptr);
17526
17527 if (IS_JOB_SUSPENDED(job_ptr))
17528 jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
17529 }
17530 list_iterator_destroy(itr);
17531 unlock_slurmctld(job_write_lock);
17532
17533 return SLURM_SUCCESS;
17534 }
17535
17536 /*
17537 * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job.
17538 * IN job_ptr - the job record
17539 * RET the job_desc_msg_t, NULL on error
17540 */
/* Build a job_desc_msg_t mirroring an existing job record so the job can
 * be resubmitted later.  Fields tied to the live salloc/srun process are
 * deliberately omitted.  Caller owns the returned structure.
 * NOTE(review): details and details->mc_ptr are dereferenced without NULL
 * checks — presumably guaranteed by the caller; confirm. */
extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr)
{
	job_desc_msg_t *job_desc;
	struct job_details *details = job_ptr->details;
	multi_core_data_t *mc_ptr = details->mc_ptr;
	int i;

	/* construct a job_desc_msg_t from job */
	job_desc = xmalloc(sizeof(job_desc_msg_t));

	job_desc->account = xstrdup(job_ptr->account);
	job_desc->acctg_freq = xstrdup(details->acctg_freq);
	job_desc->alloc_node = xstrdup(job_ptr->alloc_node);
	/* Since the allocating salloc or srun is not expected to exist
	 * when this checkpointed job is restarted, do not save these:
	 *
	 * job_desc->alloc_resp_port = job_ptr->alloc_resp_port;
	 * job_desc->alloc_sid = job_ptr->alloc_sid;
	 */
	job_desc->argc = details->argc;
	job_desc->argv = xcalloc(job_desc->argc, sizeof(char *));
	for (i = 0; i < job_desc->argc; i ++)
		job_desc->argv[i] = xstrdup(details->argv[i]);
	job_desc->begin_time = details->begin_time;
	job_desc->bitflags = job_ptr->bit_flags;
	job_desc->clusters = xstrdup(job_ptr->clusters);
	job_desc->comment = xstrdup(job_ptr->comment);
	job_desc->contiguous = details->contiguous;
	job_desc->core_spec = details->core_spec;
	job_desc->cpu_bind = xstrdup(details->cpu_bind);
	job_desc->cpu_bind_type = details->cpu_bind_type;
	job_desc->cpu_freq_min = details->cpu_freq_min;
	job_desc->cpu_freq_max = details->cpu_freq_max;
	job_desc->cpu_freq_gov = details->cpu_freq_gov;
	job_desc->deadline = job_ptr->deadline;
	job_desc->dependency = xstrdup(details->dependency);
	job_desc->end_time = 0; /* Unused today */
	job_desc->environment = get_job_env(job_ptr,
					    &job_desc->env_size);
	job_desc->exc_nodes = xstrdup(details->exc_nodes);
	job_desc->features = xstrdup(details->features);
	job_desc->cluster_features = xstrdup(details->cluster_features);
	job_desc->group_id = job_ptr->group_id;
	job_desc->immediate = 0; /* nowhere to get this value */
	job_desc->job_id = job_ptr->job_id;
	job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail;
	job_desc->licenses = xstrdup(job_ptr->licenses);
	job_desc->mail_type = job_ptr->mail_type;
	job_desc->mail_user = xstrdup(job_ptr->mail_user);
	job_desc->mcs_label = xstrdup(job_ptr->mcs_label);
	job_desc->mem_bind = xstrdup(details->mem_bind);
	job_desc->mem_bind_type = details->mem_bind_type;
	job_desc->name = xstrdup(job_ptr->name);
	job_desc->network = xstrdup(job_ptr->network);
	job_desc->nice = details->nice;
	job_desc->num_tasks = details->num_tasks;
	job_desc->open_mode = details->open_mode;
	job_desc->origin_cluster = xstrdup(job_ptr->origin_cluster);
	job_desc->other_port = job_ptr->other_port;
	job_desc->power_flags = job_ptr->power_flags;
	job_desc->overcommit = details->overcommit;
	job_desc->partition = xstrdup(job_ptr->partition);
	job_desc->plane_size = details->plane_size;
	job_desc->priority = job_ptr->priority;
	if (job_ptr->qos_ptr)
		job_desc->qos = xstrdup(job_ptr->qos_ptr->name);
	job_desc->resp_host = xstrdup(job_ptr->resp_host);
	job_desc->req_nodes = xstrdup(details->req_nodes);
	job_desc->requeue = details->requeue;
	job_desc->reservation = xstrdup(job_ptr->resv_name);
	job_desc->restart_cnt = job_ptr->restart_cnt;
	job_desc->script_buf = get_job_script(job_ptr);
	/* Map the internal share/whole-node state onto the wire values */
	if (details->share_res == 1)
		job_desc->shared = JOB_SHARED_OK;
	else if (details->whole_node == WHOLE_NODE_REQUIRED)
		job_desc->shared = JOB_SHARED_NONE;
	else if (details->whole_node == WHOLE_NODE_USER)
		job_desc->shared = JOB_SHARED_USER;
	else if (details->whole_node == WHOLE_NODE_MCS)
		job_desc->shared = JOB_SHARED_MCS;
	else
		job_desc->shared = NO_VAL16;
	job_desc->spank_job_env_size = job_ptr->spank_job_env_size;
	job_desc->spank_job_env = xcalloc(job_desc->spank_job_env_size,
					  sizeof(char *));
	for (i = 0; i < job_desc->spank_job_env_size; i ++)
		job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]);
	job_desc->std_err = xstrdup(details->std_err);
	job_desc->std_in = xstrdup(details->std_in);
	job_desc->std_out = xstrdup(details->std_out);
	job_desc->task_dist = details->task_dist;
	job_desc->time_limit = job_ptr->time_limit;
	job_desc->time_min = job_ptr->time_min;
	job_desc->user_id = job_ptr->user_id;
	job_desc->wait_all_nodes = job_ptr->wait_all_nodes;
	job_desc->warn_flags = job_ptr->warn_flags;
	job_desc->warn_signal = job_ptr->warn_signal;
	job_desc->warn_time = job_ptr->warn_time;
	job_desc->wckey = xstrdup(job_ptr->wckey);
	job_desc->work_dir = xstrdup(details->work_dir);
	job_desc->pn_min_cpus = details->pn_min_cpus;
	job_desc->pn_min_memory = details->pn_min_memory;
	job_desc->pn_min_tmp_disk = details->pn_min_tmp_disk;
	job_desc->min_cpus = details->min_cpus;
	job_desc->max_cpus = details->max_cpus;
	job_desc->min_nodes = details->min_nodes;
	job_desc->max_nodes = details->max_nodes;
	if (job_desc->max_nodes == 0) /* set 0 in _job_create() */
		job_desc->max_nodes = NO_VAL;
	job_desc->sockets_per_node = mc_ptr->sockets_per_node;
	job_desc->cores_per_socket = mc_ptr->cores_per_socket;
	job_desc->threads_per_core = mc_ptr->threads_per_core;
	job_desc->cpus_per_task = details->cpus_per_task;
	job_desc->ntasks_per_node = details->ntasks_per_node;
	job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket;
	job_desc->ntasks_per_core = mc_ptr->ntasks_per_core;

	/* TRES request strings */
	job_desc->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres);
	job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
	job_desc->tres_bind = xstrdup(job_ptr->tres_bind);
	job_desc->tres_freq = xstrdup(job_ptr->tres_freq);
	job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job);
	job_desc->tres_per_node = xstrdup(job_ptr->tres_per_node);
	job_desc->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
	job_desc->tres_per_task = xstrdup(job_ptr->tres_per_task);

	/* Federation sibling bitmaps, when the job is federated */
	if (job_ptr->fed_details) {
		job_desc->fed_siblings_active =
			job_ptr->fed_details->siblings_active;
		job_desc->fed_siblings_viable =
			job_ptr->fed_details->siblings_viable;
	}

	return job_desc;
}
17676
17677 /* Build a bitmap of nodes completing this job */
/* Build a bitmap of nodes still completing this job; clear the
 * COMPLETING flag when no such nodes exist. */
extern void build_cg_bitmap(job_record_t *job_ptr)
{
	FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);

	if (job_ptr->node_bitmap == NULL) {
		/* Should not happen; recover with an empty bitmap */
		error("build_cg_bitmap: node_bitmap is NULL");
		job_ptr->node_bitmap_cg = bit_alloc(node_record_count);
		job_ptr->job_state &= (~JOB_COMPLETING);
		return;
	}

	job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap);
	if (bit_set_count(job_ptr->node_bitmap_cg) == 0)
		job_ptr->job_state &= (~JOB_COMPLETING);
}
17691
17692 /* job_hold_requeue()
17693 *
17694 * Requeue the job based upon its current state.
17695 * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state.
17696 * If JOB_REQUEUE_HOLD then requeue and hold.
17697 * If JOB_REQUEUE then requeue and let it run again.
17698 * The requeue can happen directly from job_requeue() or from
17699 * job_epilog_complete() after the last component has finished.
17700 *
17701 * RET returns true if the job was requeued
17702 */
extern bool job_hold_requeue(job_record_t *job_ptr)
{
	uint32_t state;
	uint32_t flags;
	job_record_t *base_job_ptr = NULL;

	xassert(job_ptr);

	/* If the job is already pending it was
	 * eventually requeued somewhere else.
	 */
	if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr))
		return false;

	/* If the job is not on the origin cluster, then don't worry about
	 * requeueing the job here. The exit code will be sent the origin
	 * cluster and the origin cluster will decide if the job should be
	 * requeued or not. */
	if (!fed_mgr_is_origin_job(job_ptr))
		return false;

	/*
	 * A job may be canceled during its epilog in which case we need to
	 * check that the job (or base job in the case of an array) was not
	 * canceled before attemping to requeue.
	 */
	if (IS_JOB_CANCELLED(job_ptr) ||
	    (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
	     (base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
	     base_job_ptr->array_recs && IS_JOB_CANCELLED(base_job_ptr)))
		return false;

	/* Check if the job exit with one of the
	 * configured requeue values. */
	_set_job_requeue_exit_value(job_ptr);

	/* Snapshot state BEFORE clearing flags below; decisions are
	 * made against this value */
	state = job_ptr->job_state;

	if (! (state & JOB_REQUEUE))
		return false;

	/* Sent event requeue to the database. */
	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
	    job_ptr->tres_alloc_cnt &&
	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
		set_job_tres_alloc_str(job_ptr, false);
	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);

	debug("%s: %pJ state 0x%x", __func__, job_ptr, state);

	/* Set the job pending */
	flags = job_ptr->job_state & JOB_STATE_FLAGS;
	job_ptr->job_state = JOB_PENDING | flags;

	job_ptr->restart_cnt++;

	/* clear signal sent flag on requeue */
	job_ptr->warn_flags &= ~WARN_SENT;

	/*
	 * Test if user wants to requeue the job
	 * in hold or with a special exit value.
	 */
	if (state & JOB_SPECIAL_EXIT) {
		/*
		 * JOB_SPECIAL_EXIT means requeue the job,
		 * put it on hold and display state as JOB_SPECIAL_EXIT.
		 */
		job_ptr->job_state |= JOB_SPECIAL_EXIT;
		job_ptr->state_reason = WAIT_HELD_USER;
		debug("%s: Holding %pJ, special exit", __func__, job_ptr);
		/* priority 0 == held */
		job_ptr->priority = 0;
	}

	job_ptr->job_state &= ~JOB_REQUEUE;

	/*
	 * Mark array as requeued. Exit codes have already been handled in
	 * _job_array_comp()
	 */
	if (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
	    (base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
	    base_job_ptr->array_recs) {
		base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED;
	}

	debug("%s: %pJ state 0x%x reason %u priority %d",
	      __func__, job_ptr, job_ptr->job_state,
	      job_ptr->state_reason, job_ptr->priority);

	return true;
}
17795
_parse_max_depend_depth(char * str)17796 static void _parse_max_depend_depth(char *str)
17797 {
17798 int i = atoi(str);
17799 if (i < 0)
17800 error("ignoring max_depend_depth value of %d", i);
17801 else
17802 max_depend_depth = i;
17803 }
17804
init_depend_policy(void)17805 extern void init_depend_policy(void)
17806 {
17807 char *depend_params = slurm_get_dependency_params();
17808 char *sched_params = slurm_get_sched_params();
17809 char *tmp_ptr;
17810
17811 disable_remote_singleton =
17812 (xstrcasestr(depend_params, "disable_remote_singleton")) ?
17813 true : false;
17814
17815 /*
17816 * kill_invalid_depend and max_depend_depth are moving from
17817 * SchedulerParameters to DependencyParameters. Support both for 20.02,
17818 * then remove them from SchedulerParameters in a future release.
17819 */
17820 if (xstrcasestr(sched_params, "kill_invalid_depend")) {
17821 info("kill_invalid_depend is deprecated in SchedulerParameters and moved to DependencyParameters");
17822 kill_invalid_dep = true;
17823 } else
17824 kill_invalid_dep =
17825 (xstrcasestr(depend_params, "kill_invalid_depend")) ?
17826 true : false;
17827
17828 /* 01234567890123456 */
17829 if ((tmp_ptr = xstrcasestr(depend_params, "max_depend_depth=")))
17830 _parse_max_depend_depth(tmp_ptr + 17);
17831 else if ((tmp_ptr = xstrcasestr(sched_params, "max_depend_depth="))) {
17832 info("max_depend_depth is deprecated in SchedulerParameters and moved to DependencyParameters");
17833 _parse_max_depend_depth(tmp_ptr + 17);
17834 } else
17835 max_depend_depth = 10;
17836
17837 xfree(depend_params);
17838 xfree(sched_params);
17839
17840 if (slurmctld_conf.debug_flags & DEBUG_FLAG_DEPENDENCY)
17841 info("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d",
17842 __func__, kill_invalid_dep, disable_remote_singleton,
17843 max_depend_depth);
17844 else
17845 debug2("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d",
17846 __func__, kill_invalid_dep, disable_remote_singleton,
17847 max_depend_depth);
17848 }
17849
17850 /* init_requeue_policy()
17851 * Initialize the requeue exit/hold bitmaps.
17852 */
init_requeue_policy(void)17853 extern void init_requeue_policy(void)
17854 {
17855 /* clean first as we can be reconfiguring */
17856 FREE_NULL_BITMAP(requeue_exit);
17857 FREE_NULL_BITMAP(requeue_exit_hold);
17858
17859 requeue_exit = _make_requeue_array(slurmctld_conf.requeue_exit);
17860 requeue_exit_hold = _make_requeue_array(
17861 slurmctld_conf.requeue_exit_hold);
17862 }
17863
17864 /* _make_requeue_array()
17865 *
17866 * Process the RequeueExit|RequeueExitHold configuration
17867 * parameters creating two bitmaps holding the exit values
17868 * of jobs for which they have to be requeued.
17869 */
_make_requeue_array(char * conf_buf)17870 static bitstr_t *_make_requeue_array(char *conf_buf)
17871 {
17872 hostset_t hs;
17873 bitstr_t *bs = NULL;
17874 char *tok = NULL, *end_ptr = NULL;
17875 long val;
17876
17877 if (conf_buf == NULL)
17878 return bs;
17879
17880 xstrfmtcat(tok, "[%s]", conf_buf);
17881 hs = hostset_create(tok);
17882 xfree(tok);
17883 if (!hs) {
17884 error("%s: exit values: %s", __func__, conf_buf);
17885 return bs;
17886 }
17887
17888 debug("%s: exit values: %s", __func__, conf_buf);
17889
17890 bs = bit_alloc(MAX_EXIT_VAL + 1);
17891 while ((tok = hostset_shift(hs))) {
17892 val = strtol(tok, &end_ptr, 10);
17893 if ((end_ptr[0] == '\0') &&
17894 (val >= 0) && (val <= MAX_EXIT_VAL)) {
17895 bit_set(bs, val);
17896 } else {
17897 error("%s: exit values: %s (%s)",
17898 __func__, conf_buf, tok);
17899 }
17900 free(tok);
17901 }
17902 hostset_destroy(hs);
17903
17904 return bs;
17905 }
17906
17907 /* _set_job_requeue_exit_value()
17908 *
17909 * Compared the job exit values with the configured
17910 * RequeueExit and RequeueHoldExit and a match is
17911 * found, set the appropriate state for job_hold_requeue()
17912 */
/* If the job's exit code matches RequeueExit or RequeueExitHold, set the
 * corresponding requeue flag(s) for job_hold_requeue() to act upon. */
static void _set_job_requeue_exit_value(job_record_t *job_ptr)
{
	int exit_code = WEXITSTATUS(job_ptr->exit_code);

	if ((exit_code < 0) || (exit_code > MAX_EXIT_VAL))
		return;

	if (requeue_exit && bit_test(requeue_exit, exit_code)) {
		debug2("%s: %pJ exit code %d state JOB_REQUEUE",
		       __func__, job_ptr, exit_code);
		job_ptr->job_state |= JOB_REQUEUE;
	} else if (requeue_exit_hold &&
		   bit_test(requeue_exit_hold, exit_code)) {
		/* Not sure if want to set special exit state in this case */
		debug2("%s: %pJ exit code %d state JOB_SPECIAL_EXIT",
		       __func__, job_ptr, exit_code);
		job_ptr->job_state |= (JOB_REQUEUE | JOB_SPECIAL_EXIT);
	}
}
17937
17938 /*
17939 * Reset a job's end_time based upon it's start_time and time_limit.
17940 * NOTE: Do not reset the end_time if already being preempted
17941 */
/* Recompute a job's end_time from start_time + time_limit; skipped
 * entirely while the job is being preempted. */
extern void job_end_time_reset(job_record_t *job_ptr)
{
	if (job_ptr->preempt_time)
		return;	/* Preemption in progress */

	if (job_ptr->time_limit == INFINITE)
		job_ptr->end_time = job_ptr->start_time +
			(365 * 24 * 60 * 60); /* secs in year */
	else
		job_ptr->end_time = job_ptr->start_time +
			(job_ptr->time_limit * 60); /* secs */

	job_ptr->end_time_exp = job_ptr->end_time;
}
17955
17956 /* trace_job() - print the job details if
17957 * the DEBUG_FLAG_TRACE_JOBS is set
17958 */
/* Emit a job trace line when DebugFlags includes TraceJobs. */
extern void trace_job(job_record_t *job_ptr, const char *func,
		      const char *extra)
{
	if (!(slurmctld_conf.debug_flags & DEBUG_FLAG_TRACE_JOBS))
		return;
	info("%s: %s %pJ", func, extra, job_ptr);
}
17966
17967 /* If this is a job array meta-job, prepare it for being scheduled */
/* If this is a job array meta-job, point array_job_id/array_task_id at
 * the first still-pending task before a scheduling attempt. */
extern void job_array_pre_sched(job_record_t *job_ptr)
{
	int32_t first_task;

	if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
		return;

	first_task = bit_ffs(job_ptr->array_recs->task_id_bitmap);
	if (first_task < 0) {
		/* This happens if the final task in a meta-job is requeued */
		if (job_ptr->restart_cnt == 0)
			error("%pJ has empty task_id_bitmap", job_ptr);
		FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
		return;
	}

	job_ptr->array_job_id = job_ptr->job_id;
	job_ptr->array_task_id = first_task;
}
17988
17989 /* If this is a job array meta-job, clean up after scheduling attempt */
/* If this is a job array meta-job, clean up after a scheduling attempt:
 * either consume the last pending task in place, or split the scheduled
 * task off into its own record.  Returns the record representing the
 * scheduled task (may be job_ptr itself), or NULL if the split failed. */
extern job_record_t *job_array_post_sched(job_record_t *job_ptr)
{
	job_record_t *new_job_ptr = NULL;

	if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
		return job_ptr;

	if (job_ptr->array_recs->task_cnt <= 1) {
		/* Preserve array_recs for min/max exit codes for job array */
		if (job_ptr->array_recs->task_cnt) {
			job_ptr->array_recs->task_cnt--;
		} else if (job_ptr->restart_cnt) {
			/* Last task of a job array has been requeued */
		} else {
			error("job %pJ array_recs task count underflow",
			      job_ptr);
		}
		xfree(job_ptr->array_recs->task_id_str);
		if (job_ptr->array_recs->task_cnt == 0)
			FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);

		/* While it is efficient to set the db_index to 0 here
		 * to get the database to update the record for
		 * pending tasks it also creates a window in which if
		 * the association id is changed (different account or
		 * partition) instead of returning the previous
		 * db_index (expected) it would create a new one
		 * leaving the other orphaned. Setting the job_state
		 * sets things up so the db_index isn't lost but the
		 * start message is still sent to get the desired behavior. */
		if (job_ptr->db_index)
			job_ptr->job_state |= JOB_UPDATE_DB;

		/* If job is requeued, it will already be in the hash table */
		if (!find_job_array_rec(job_ptr->array_job_id,
					job_ptr->array_task_id)) {
			_add_job_array_hash(job_ptr);
		}
		new_job_ptr = job_ptr;
	} else {
		/* More tasks remain: split the scheduled one off and
		 * leave the meta-job pending with the remainder */
		new_job_ptr = job_array_split(job_ptr);
		if (new_job_ptr) {
			new_job_ptr->job_state = JOB_PENDING;
			new_job_ptr->start_time = (time_t) 0;
			/* Do NOT set the JOB_UPDATE_DB flag here, it
			 * is handled when task_id_str is created elsewhere */
		} else {
			error("%s: Unable to copy record for %pJ",
			      __func__, job_ptr);
		}
	}

	return new_job_ptr;
}
18044
18045 /* _kill_dependent()
18046 *
18047 * Exterminate the job that has invalid dependency
18048 * condition.
18049 */
/* Cancel a job whose dependency can never be satisfied. */
static void _kill_dependent(job_record_t *job_ptr)
{
	time_t now = time(NULL);

	info("%s: Job dependency can't be satisfied, cancelling %pJ",
	     __func__, job_ptr);

	/* Record a zero-length run window, then log and notify srun */
	job_ptr->job_state = JOB_CANCELLED;
	job_ptr->start_time = now;
	job_ptr->end_time = now;
	job_completion_logger(job_ptr, false);
	last_job_update = now;
	srun_allocate_abort(job_ptr);
}
18063
/* Deep-copy a job's federation details; returns NULL for NULL input. */
static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src)
{
	job_fed_details_t *dup;

	if (src == NULL)
		return NULL;

	dup = xmalloc(sizeof(*dup));
	*dup = *src;	/* copy all scalar fields */
	/* then deep-copy the owned strings */
	dup->origin_str = xstrdup(src->origin_str);
	dup->siblings_active_str = xstrdup(src->siblings_active_str);
	dup->siblings_viable_str = xstrdup(src->siblings_viable_str);

	return dup;
}
18079
/* Free a job's federation details and NULL the caller's pointer. */
extern void free_job_fed_details(job_fed_details_t **fed_details_pptr)
{
	job_fed_details_t *fed = *fed_details_pptr;

	if (fed == NULL)
		return;

	xfree(fed->origin_str);
	xfree(fed->siblings_active_str);
	xfree(fed->siblings_viable_str);
	xfree(fed);
	*fed_details_pptr = NULL;
}
18092
/* Pack federation details into a state buffer.  A leading uint16 of 1/0
 * records whether a details record follows (see _load_job_fed_details). */
static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr,
				  Buf buffer)
{
	if (fed_details_ptr == NULL) {
		pack16(0, buffer);	/* no record follows */
		return;
	}

	pack16(1, buffer);		/* record follows */
	pack32(fed_details_ptr->cluster_lock, buffer);
	packstr(fed_details_ptr->origin_str, buffer);
	pack64(fed_details_ptr->siblings_active, buffer);
	packstr(fed_details_ptr->siblings_active_str, buffer);
	pack64(fed_details_ptr->siblings_viable, buffer);
	packstr(fed_details_ptr->siblings_viable_str, buffer);
}
18108
/* Unpack federation details written by _dump_job_fed_details().  On
 * success *fed_details_pptr holds a newly allocated record (or stays
 * untouched when none was packed); on unpack error it is freed and set
 * to NULL and SLURM_ERROR is returned. */
static int _load_job_fed_details(job_fed_details_t **fed_details_pptr,
				 Buf buffer,
				 uint16_t protocol_version)
{
	uint16_t tmp_uint16;
	uint32_t tmp_uint32;
	job_fed_details_t *fed_details_ptr = NULL;

	xassert(fed_details_pptr);

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* Leading flag: 1 if a details record was packed */
		safe_unpack16(&tmp_uint16, buffer);
		if (tmp_uint16) {
			*fed_details_pptr = xmalloc(sizeof(job_fed_details_t));
			fed_details_ptr = *fed_details_pptr;
			safe_unpack32(&fed_details_ptr->cluster_lock, buffer);
			safe_unpackstr_xmalloc(&fed_details_ptr->origin_str,
					       &tmp_uint32, buffer);
			safe_unpack64(&fed_details_ptr->siblings_active,
				      buffer);
			safe_unpackstr_xmalloc(
				&fed_details_ptr->siblings_active_str,
				&tmp_uint32, buffer);
			safe_unpack64(&fed_details_ptr->siblings_viable,
				      buffer);
			safe_unpackstr_xmalloc(
				&fed_details_ptr->siblings_viable_str,
				&tmp_uint32, buffer);
		}
	} else
		goto unpack_error;

	return SLURM_SUCCESS;

unpack_error:
	/* safe_unpack* macros jump here on truncated/corrupt buffers */
	free_job_fed_details(fed_details_pptr);
	*fed_details_pptr = NULL;

	return SLURM_ERROR;
}
18149
18150 /* Set federated job's sibling strings. */
/* Rebuild a federated job's sibling name strings from the sibling
 * bitmaps; the origin string is computed once and then kept. */
extern void update_job_fed_details(job_record_t *job_ptr)
{
	job_fed_details_t *fed;

	xassert(job_ptr);
	xassert(job_ptr->fed_details);

	fed = job_ptr->fed_details;

	xfree(fed->siblings_active_str);
	xfree(fed->siblings_viable_str);
	fed->siblings_active_str =
		fed_mgr_cluster_ids_to_names(fed->siblings_active);
	fed->siblings_viable_str =
		fed_mgr_cluster_ids_to_names(fed->siblings_viable);

	/* only set once */
	if (fed->origin_str == NULL)
		fed->origin_str = fed_mgr_get_cluster_name(
			fed_mgr_get_cluster_id(job_ptr->job_id));
}
18172
18173 /*
18174 * Set the allocation response with the current cluster's information and the
18175 * job's allocated node's addr's if the allocation is being filled by a cluster
18176 * other than the cluster that submitted the job
18177 *
18178 * Note: make sure that the resp's working_cluster_rec is NULL'ed out before the
18179 * resp is free'd since it points to global memory.
18180 *
18181 * IN resp - allocation response being sent back to client.
18182 * IN job_ptr - allocated job
18183 * IN req_cluster - the cluster requsting the allocation info.
18184 */
extern void set_remote_working_response(
	resource_allocation_response_msg_t *resp,
	job_record_t *job_ptr, const char *req_cluster)
{
	xassert(resp);
	xassert(job_ptr);

	/* Only fill this in when another cluster requested the allocation
	 * and we actually have allocated nodes to describe */
	if (!job_ptr->node_cnt ||
	    !req_cluster || !slurmctld_conf.cluster_name ||
	    !xstrcmp(slurmctld_conf.cluster_name, req_cluster))
		return;

	if (job_ptr->fed_details && fed_mgr_cluster_rec)
		resp->working_cluster_rec = fed_mgr_cluster_rec;
	else
		resp->working_cluster_rec = response_cluster_rec;

	/* Copy out the allocated nodes' addresses */
	resp->node_addr = xcalloc(job_ptr->node_cnt, sizeof(slurm_addr_t));
	memcpy(resp->node_addr, job_ptr->node_addr,
	       (sizeof(slurm_addr_t) * job_ptr->node_cnt));
}
18208
18209 /* Build structure with job allocation details */
build_job_info_resp(job_record_t * job_ptr)18210 extern resource_allocation_response_msg_t *build_job_info_resp(
18211 job_record_t *job_ptr)
18212 {
18213 resource_allocation_response_msg_t *job_info_resp_msg;
18214 int i, j;
18215
18216 job_info_resp_msg = xmalloc(sizeof(resource_allocation_response_msg_t));
18217
18218
18219 if (!job_ptr->job_resrcs) {
18220 ;
18221 } else if (bit_equal(job_ptr->node_bitmap,
18222 job_ptr->job_resrcs->node_bitmap)) {
18223 job_info_resp_msg->num_cpu_groups =
18224 job_ptr->job_resrcs->cpu_array_cnt;
18225 job_info_resp_msg->cpu_count_reps =
18226 xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
18227 sizeof(uint32_t));
18228 memcpy(job_info_resp_msg->cpu_count_reps,
18229 job_ptr->job_resrcs->cpu_array_reps,
18230 (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt));
18231 job_info_resp_msg->cpus_per_node =
18232 xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
18233 sizeof(uint16_t));
18234 memcpy(job_info_resp_msg->cpus_per_node,
18235 job_ptr->job_resrcs->cpu_array_value,
18236 (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt));
18237 } else {
18238 /* Job has changed size, rebuild CPU count info */
18239 job_info_resp_msg->num_cpu_groups = job_ptr->node_cnt;
18240 job_info_resp_msg->cpu_count_reps = xcalloc(job_ptr->node_cnt,
18241 sizeof(uint32_t));
18242 job_info_resp_msg->cpus_per_node = xcalloc(job_ptr->node_cnt,
18243 sizeof(uint32_t));
18244 for (i = 0, j = -1; i < job_ptr->job_resrcs->nhosts; i++) {
18245 if (job_ptr->job_resrcs->cpus[i] == 0)
18246 continue;
18247 if ((j == -1) ||
18248 (job_info_resp_msg->cpus_per_node[j] !=
18249 job_ptr->job_resrcs->cpus[i])) {
18250 j++;
18251 job_info_resp_msg->cpus_per_node[j] =
18252 job_ptr->job_resrcs->cpus[i];
18253 job_info_resp_msg->cpu_count_reps[j] = 1;
18254 } else {
18255 job_info_resp_msg->cpu_count_reps[j]++;
18256 }
18257 }
18258 job_info_resp_msg->num_cpu_groups = j + 1;
18259 }
18260 job_info_resp_msg->account = xstrdup(job_ptr->account);
18261 job_info_resp_msg->alias_list = xstrdup(job_ptr->alias_list);
18262 job_info_resp_msg->job_id = job_ptr->job_id;
18263 job_info_resp_msg->node_cnt = job_ptr->node_cnt;
18264 job_info_resp_msg->node_list = xstrdup(job_ptr->nodes);
18265 job_info_resp_msg->partition = xstrdup(job_ptr->partition);
18266 if (job_ptr->qos_ptr) {
18267 slurmdb_qos_rec_t *qos;
18268 qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr;
18269 job_info_resp_msg->qos = xstrdup(qos->name);
18270 }
18271 job_info_resp_msg->resv_name = xstrdup(job_ptr->resv_name);
18272 job_info_resp_msg->select_jobinfo =
18273 select_g_select_jobinfo_copy(job_ptr->select_jobinfo);
18274 if (job_ptr->details) {
18275 if (job_ptr->bit_flags & JOB_MEM_SET) {
18276 job_info_resp_msg->pn_min_memory =
18277 job_ptr->details->pn_min_memory;
18278 }
18279 if (job_ptr->details->mc_ptr) {
18280 job_info_resp_msg->ntasks_per_board =
18281 job_ptr->details->mc_ptr->ntasks_per_board;
18282 job_info_resp_msg->ntasks_per_core =
18283 job_ptr->details->mc_ptr->ntasks_per_core;
18284 job_info_resp_msg->ntasks_per_socket =
18285 job_ptr->details->mc_ptr->ntasks_per_socket;
18286 }
18287 } else {
18288 /* job_info_resp_msg->pn_min_memory = 0; */
18289 job_info_resp_msg->ntasks_per_board = NO_VAL16;
18290 job_info_resp_msg->ntasks_per_core = NO_VAL16;
18291 job_info_resp_msg->ntasks_per_socket = NO_VAL16;
18292 }
18293
18294 if (job_ptr->details && job_ptr->details->env_cnt) {
18295 job_info_resp_msg->env_size = job_ptr->details->env_cnt;
18296 job_info_resp_msg->environment =
18297 xcalloc(job_info_resp_msg->env_size + 1,
18298 sizeof(char *));
18299 for (i = 0; i < job_info_resp_msg->env_size; i++) {
18300 job_info_resp_msg->environment[i] =
18301 xstrdup(job_ptr->details->env_sup[i]);
18302 }
18303 job_info_resp_msg->environment[i] = NULL;
18304 }
18305
18306 return job_info_resp_msg;
18307 }
18308
18309 /*
18310 * Calculate billable TRES based on partition's defined BillingWeights. If none
18311 * is defined, return total_cpus. This is cached on job_ptr->billable_tres and
18312 * is updated if the job was resized since the last iteration.
18313 *
18314 * IN job_ptr - job to calc billable tres on
18315 * IN start_time - time the has started or been resized
18316 * IN assoc_mgr_locked - whether the tres assoc lock is set or not
18317 */
calc_job_billable_tres(job_record_t * job_ptr,time_t start_time,bool assoc_mgr_locked)18318 extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time,
18319 bool assoc_mgr_locked)
18320 {
18321 xassert(job_ptr);
18322
18323 part_record_t *part_ptr = job_ptr->part_ptr;
18324
18325 /* We don't have any resources allocated, just return 0. */
18326 if (!job_ptr->tres_alloc_cnt)
18327 return 0;
18328
18329 /* Don't recalculate unless the job is new or resized */
18330 if ((!fuzzy_equal(job_ptr->billable_tres, NO_VAL)) &&
18331 difftime(job_ptr->resize_time, start_time) < 0.0)
18332 return job_ptr->billable_tres;
18333
18334 if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
18335 info("BillingWeight: %pJ is either new or it was resized",
18336 job_ptr);
18337
18338 /* No billing weights defined. Return CPU count */
18339 if (!part_ptr || !part_ptr->billing_weights) {
18340 job_ptr->billable_tres = job_ptr->total_cpus;
18341 return job_ptr->billable_tres;
18342 }
18343
18344 if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
18345 info("BillingWeight: %pJ using \"%s\" from partition %s",
18346 job_ptr, part_ptr->billing_weights_str,
18347 job_ptr->part_ptr->name);
18348
18349 job_ptr->billable_tres =
18350 assoc_mgr_tres_weighted(job_ptr->tres_alloc_cnt,
18351 part_ptr->billing_weights,
18352 slurmctld_conf.priority_flags,
18353 assoc_mgr_locked);
18354
18355 if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO)
18356 info("BillingWeight: %pJ %s = %f",
18357 job_ptr,
18358 (slurmctld_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES) ?
18359 "MAX(node TRES) + SUM(Global TRES)" : "SUM(TRES)",
18360 job_ptr->billable_tres);
18361
18362 return job_ptr->billable_tres;
18363 }
18364
update_job_limit_set_tres(uint16_t ** limits_pptr)18365 extern void update_job_limit_set_tres(uint16_t **limits_pptr)
18366 {
18367 int i, old_pos;
18368 int new_size = sizeof(uint16_t) * slurmctld_tres_cnt;
18369
18370 xassert(limits_pptr);
18371
18372 *limits_pptr = xrealloc(*limits_pptr, new_size);
18373
18374 if (assoc_mgr_tres_pos_changed()) {
18375 uint16_t *limits_ptr, tmp_tres[slurmctld_tres_cnt];
18376 limits_ptr = *limits_pptr;
18377
18378 for (i = 0; i < slurmctld_tres_cnt; i++) {
18379 if ((old_pos = assoc_mgr_get_old_tres_pos(i)) == -1)
18380 tmp_tres[i] = 0;
18381 else
18382 tmp_tres[i] = limits_ptr[old_pos];
18383 }
18384 memcpy(limits_ptr, tmp_tres, new_size);
18385 }
18386 }
18387
18388
18389 /*
18390 * Send warning signal to job before end time.
18391 *
18392 * IN job_ptr - job to send warn signal to.
18393 * IN ignore_time - If set, ignore the warn time and just send it.
18394 */
send_job_warn_signal(job_record_t * job_ptr,bool ignore_time)18395 extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time)
18396 {
18397 if (job_ptr->warn_signal &&
18398 !(job_ptr->warn_flags & WARN_SENT) &&
18399 (ignore_time ||
18400 (job_ptr->warn_time &&
18401 ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >=
18402 job_ptr->end_time)))) {
18403 /*
18404 * If --signal B option was not specified,
18405 * signal only the steps but not the batch step.
18406 */
18407 if (!(job_ptr->warn_flags & KILL_JOB_BATCH))
18408 job_ptr->warn_flags |= KILL_STEPS_ONLY;
18409
18410 debug("%s: warning signal %u to %pJ",
18411 __func__, job_ptr->warn_signal, job_ptr);
18412
18413 job_signal(job_ptr, job_ptr->warn_signal,
18414 job_ptr->warn_flags, 0, false);
18415
18416 /* mark job as signaled */
18417 job_ptr->warn_flags |= WARN_SENT;
18418 }
18419 }
18420
_overlap_and_running_internal(void * x,void * arg)18421 static int _overlap_and_running_internal(void *x, void *arg)
18422 {
18423 job_record_t *job_ptr = (job_record_t *)x;
18424 job_overlap_args_t *overlap_args = (job_overlap_args_t *)arg;
18425
18426 /* We always break if we find something not running */
18427 if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
18428 overlap_args->rc = 0;
18429 return 1;
18430 }
18431
18432 /*
18433 * We are just looking for something overlapping. On a hetjob we need
18434 * to check everything.
18435 */
18436 if (job_ptr->node_bitmap &&
18437 bit_overlap_any(overlap_args->node_map, job_ptr->node_bitmap))
18438 overlap_args->rc = 1;
18439
18440 return 0;
18441 }
18442
/*
 * Return true if job_ptr (or, for a hetjob, every one of its components) is
 * running or suspended and at least one component overlaps node_map.
 */
extern bool job_overlap_and_running(bitstr_t *node_map, job_record_t *job_ptr)
{
	job_overlap_args_t overlap_args = {
		.node_map = node_map
	};

	if (job_ptr->het_job_list) {
		/* Examine every component of the heterogeneous job. */
		(void) list_for_each(job_ptr->het_job_list,
				     _overlap_and_running_internal,
				     &overlap_args);
	} else {
		(void) _overlap_and_running_internal(job_ptr, &overlap_args);
	}

	return overlap_args.rc;
}
18458