/*****************************************************************************\
 *  src/slurmd/slurmstepd/slurmstepd_job.c - stepd_step_rec_t routines
 *****************************************************************************
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Copyright (C) 2013      Intel, Inc.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Mark Grondona <mgrondona@llnl.gov>.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#include <grp.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include "src/common/eio.h"
#include "src/common/fd.h"
#include "src/common/gres.h"
#include "src/common/group_cache.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/node_select.h"
#include "src/common/slurm_jobacct_gather.h"
#include "src/common/slurm_acct_gather_profile.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/uid.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

#include "src/slurmd/common/fname.h"
#include "src/slurmd/common/xcpuinfo.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/slurmd/slurmstepd/io.h"
#include "src/slurmd/slurmstepd/multi_prog.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"

static char **_array_copy(int n, char **src);
static void _array_free(char ***array);
static void _job_init_task_info(stepd_step_rec_t *job, uint32_t **gtid,
				char *ifname, char *ofname, char *efname);
static void _srun_info_destructor(void *arg);
static stepd_step_task_info_t *_task_info_create(int taskid, int gtaskid,
						 char *ifname, char *ofname,
						 char *efname);
static void _task_info_destroy(stepd_step_task_info_t *t, uint16_t multi_prog);

/*
 * return the default output filename for a batch job
 */
static char *
_batchfilename(stepd_step_rec_t *job, const char *name)
{
	if (name == NULL) {
		if (job->array_task_id == NO_VAL)
			return fname_create(job, "slurm-%J.out", 0);
		else
			return fname_create(job, "slurm-%A_%a.out", 0);
	} else
		return fname_create(job, name, 0);
}

/*
 * Expand a stdio file name.
 *
 * If "filename" is NULL it means that an eio object should be created
 * for that stdio file rather than directly connecting it to a file.
 *
 * If the "filename" is a valid task number in string form and the
 * number matches "taskid", then NULL is returned so that an eio
 * object will be used.  If it is a valid number, but it does not match
 * "taskid", then the file descriptor will be connected to /dev/null.
 */
static char *
_expand_stdio_filename(char *filename, int gtaskid, stepd_step_rec_t *job)
{
	int id;

	if (filename == NULL)
		return NULL;

	id = fname_single_task_io(filename);

	if (id < 0)
		return fname_create(job, filename, gtaskid);
	if (id >= job->ntasks) {
		error("Task ID in filename is invalid");
		return NULL;
	}

	if (id == gtaskid)
		return NULL;
	else
		return xstrdup("/dev/null");
}

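/*
 * Build the per-task info array for this step: expand each task's stdio
 * file names and resolve per-task argc/argv (shared with the job for a
 * normal launch, parsed per task for a multi-prog launch).
 */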
static void
_job_init_task_info(stepd_step_rec_t *job, uint32_t **gtid,
		    char *ifname, char *ofname, char *efname)
{
	int          i, node_id = job->nodeid;
	char        *in, *out, *err;
	uint32_t     het_job_offset = 0;

	if (job->node_tasks == 0) {
		error("User requested launch of zero tasks!");
		job->task = NULL;
		return;
	}

	if (job->het_job_offset != NO_VAL)
		het_job_offset = job->het_job_offset;

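	/*
	 * On native Cray systems, flag the step as non-SMP if the global
	 * task IDs on any node are not consecutive.
	 */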
#if defined(HAVE_NATIVE_CRAY)
	for (i = 0; i < job->nnodes; i++) {
		int j;
		for (j = 1; j < job->task_cnts[i]; j++) {
			if (gtid[i][j] != gtid[i][j-1] + 1) {
				job->non_smp = 1;
				break;
			}
		}
	}
#endif

	job->task = (stepd_step_task_info_t **)
		xmalloc(job->node_tasks * sizeof(stepd_step_task_info_t *));

	for (i = 0; i < job->node_tasks; i++) {
		in  = _expand_stdio_filename(ifname,
					     gtid[node_id][i] + het_job_offset,
					     job);
		out = _expand_stdio_filename(ofname,
					     gtid[node_id][i] + het_job_offset,
					     job);
		err = _expand_stdio_filename(efname,
					     gtid[node_id][i] + het_job_offset,
					     job);
		job->task[i] = _task_info_create(i, gtid[node_id][i], in, out,
						 err);
		if ((job->flags & LAUNCH_MULTI_PROG) == 0) {
			job->task[i]->argc = job->argc;
			job->task[i]->argv = job->argv;
		}
	}

	if (job->flags & LAUNCH_MULTI_PROG) {
		char *switch_type = slurm_get_switch_type();
		if (!xstrcmp(switch_type, "switch/cray_aries"))
			multi_prog_parse(job, gtid);
		xfree(switch_type);
		for (i = 0; i < job->node_tasks; i++) {
			multi_prog_get_argv(job->argv[1], job->env,
					    gtid[node_id][i],
					    &job->task[i]->argc,
					    &job->task[i]->argv,
					    job->argc, job->argv);
		}
	}
}

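/* Return a newly allocated, NULL-terminated copy of the first n strings
 * in src. */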
static char **
_array_copy(int n, char **src)
{
	char **dst = xmalloc((n+1) * sizeof(char *));
	int i;

	for (i = 0; i < n; i++) {
		dst[i] = xstrdup(src[i]);
	}
	dst[n] = NULL;

	return dst;
}

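/* Free a NULL-terminated string array and set the caller's pointer to
 * NULL. */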
static void
_array_free(char ***array)
{
	int i = 0;
	while ((*array)[i] != NULL)
		xfree((*array)[i++]);
	xfree(*array);
	*array = NULL;
}

/* destructor for list routines */
static void
_srun_info_destructor(void *arg)
{
	srun_info_t *srun = (srun_info_t *)arg;
	srun_info_destroy(srun);
}

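/*
 * Free a task info structure.  The mutex is briefly acquired and
 * released so that any thread still holding it has finished before the
 * mutex is destroyed.  For a multi-prog launch argv was allocated per
 * task; otherwise it aliases job->argv and must not be freed here.
 */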
static void
_task_info_destroy(stepd_step_task_info_t *t, uint16_t multi_prog)
{
	slurm_mutex_lock(&t->mutex);
	slurm_mutex_unlock(&t->mutex);
	slurm_mutex_destroy(&t->mutex);
	if (multi_prog) {
		xfree(t->argv);
	} /* otherwise, t->argv is a pointer to job->argv */
	xfree(t);
}

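/*
 * Move the user and group information carried in the job credential
 * into the step record, taking ownership of the strings rather than
 * copying them.
 */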
static void _slurm_cred_to_step_rec(slurm_cred_t *cred, stepd_step_rec_t *job)
{
	slurm_cred_arg_t cred_arg;
	slurm_cred_get_args(cred, &cred_arg);

	/*
	 * This may have been filled in already from batch_job_launch_msg_t
	 * or launch_tasks_request_msg_t.
	 */
	if (!job->user_name) {
		job->user_name = cred_arg.pw_name;
		cred_arg.pw_name = NULL;
	}

	job->pw_gecos = cred_arg.pw_gecos;
	cred_arg.pw_gecos = NULL;
	job->pw_dir = cred_arg.pw_dir;
	cred_arg.pw_dir = NULL;
	job->pw_shell = cred_arg.pw_shell;
	cred_arg.pw_shell = NULL;

	job->ngids = cred_arg.ngids;
	job->gids = cred_arg.gids;
	cred_arg.gids = NULL;
	job->gr_names = cred_arg.gr_names;
	cred_arg.gr_names = NULL;

	slurm_cred_free_args(&cred_arg);
}

/* create a slurmd job structure from a launch tasks message */
extern stepd_step_rec_t *stepd_step_rec_create(launch_tasks_request_msg_t *msg,
					       uint16_t protocol_version)
{
	stepd_step_rec_t  *job = NULL;
	srun_info_t   *srun = NULL;
	slurm_addr_t     resp_addr;
	slurm_addr_t     io_addr;
	int            i, nodeid = NO_VAL;

	xassert(msg != NULL);
	xassert(msg->complete_nodelist != NULL);
	debug3("entering stepd_step_rec_create");

	if (acct_gather_check_acct_freq_task(msg->job_mem_lim, msg->acctg_freq))
		return NULL;

	job = xmalloc(sizeof(stepd_step_rec_t));
	job->msg = msg;
#ifndef HAVE_FRONT_END
	nodeid = nodelist_find(msg->complete_nodelist, conf->node_name);
	job->node_name = xstrdup(conf->node_name);
#else
	nodeid = 0;
	job->node_name = xstrdup(msg->complete_nodelist);
#endif
	if (nodeid < 0) {
		error("couldn't find node %s in %s",
		      job->node_name, msg->complete_nodelist);
		stepd_step_rec_destroy(job);
		return NULL;
	}

	job->state = SLURMSTEPD_STEP_STARTING;
	slurm_cond_init(&job->state_cond, NULL);
	slurm_mutex_init(&job->state_mutex);
	job->node_tasks	= msg->tasks_to_launch[nodeid];
	job->task_cnts  = xcalloc(msg->nnodes, sizeof(uint16_t));
	memcpy(job->task_cnts, msg->tasks_to_launch,
	       sizeof(uint16_t) * msg->nnodes);
	job->ntasks	= msg->ntasks;
	job->jobid	= msg->job_id;
	job->stepid	= msg->job_step_id;

	job->uid	= (uid_t) msg->uid;
	job->gid	= (gid_t) msg->gid;
	job->user_name	= xstrdup(msg->user_name);
	_slurm_cred_to_step_rec(msg->cred, job);
	/*
	 * Favor the group info in the launch cred if available - for 19.05+
	 * this is where it is managed, not in launch_tasks_request_msg_t.
	 * For older versions, or for when send_gids is disabled, fall back
	 * to the launch_tasks_request_msg_t info if necessary.
	 */
	if (!job->ngids) {
		job->ngids = (int) msg->ngids;
		job->gids = copy_gids(msg->ngids, msg->gids);
	}

	job->cwd	= xstrdup(msg->cwd);
	job->task_dist	= msg->task_dist;

	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->mem_bind_type = msg->mem_bind_type;
	job->mem_bind = xstrdup(msg->mem_bind);
	job->tres_bind = xstrdup(msg->tres_bind);
	job->tres_freq = xstrdup(msg->tres_freq);
	job->cpu_freq_min = msg->cpu_freq_min;
	job->cpu_freq_max = msg->cpu_freq_max;
	job->cpu_freq_gov = msg->cpu_freq_gov;
	job->cpus_per_task = msg->cpus_per_task;

	job->env     = _array_copy(msg->envc, msg->env);
	job->array_job_id  = msg->job_id;
	job->array_task_id = NO_VAL;
	/* Used for env vars */
	job->het_job_node_offset = msg->het_job_node_offset;
	job->het_job_step_cnt = msg->het_job_step_cnt;
	job->het_job_id  = msg->het_job_id;	/* Used for env vars */
	job->het_job_nnodes = msg->het_job_nnodes;	/* Used for env vars */
	if (msg->het_job_nnodes && msg->het_job_ntasks &&
	    msg->het_job_task_cnts) {
		job->het_job_ntasks = msg->het_job_ntasks;/* Used for env vars*/
		job->het_job_task_cnts = xcalloc(msg->het_job_nnodes,
					      sizeof(uint16_t));
		memcpy(job->het_job_task_cnts, msg->het_job_task_cnts,
		       sizeof(uint16_t) * msg->het_job_nnodes);
		if (msg->het_job_tids) {
			/*
			 * het_job_tids == NULL if request from pre-v19.05
			 * srun
			 */
			job->het_job_tids = xcalloc(msg->het_job_nnodes,
						    sizeof(uint32_t *));
			for (i = 0; i < msg->het_job_nnodes; i++) {
				job->het_job_tids[i] =
					xcalloc(job->het_job_task_cnts[i],
						sizeof(uint32_t));
				memcpy(job->het_job_tids[i],
				       msg->het_job_tids[i],
				       sizeof(uint32_t) *
				       job->het_job_task_cnts[i]);
			}
		}
		if (msg->het_job_tid_offsets) {
			job->het_job_tid_offsets = xcalloc(job->het_job_ntasks,
							   sizeof(uint32_t));
			memcpy(job->het_job_tid_offsets,
			       msg->het_job_tid_offsets,
			       job->het_job_ntasks * sizeof(uint32_t));
		}
	}
	/* Used for env vars & labels */
	job->het_job_offset = msg->het_job_offset;
	/* Used for env vars & labels */
	job->het_job_task_offset = msg->het_job_task_offset;
	job->het_job_node_list = xstrdup(msg->het_job_node_list);
	for (i = 0; i < msg->envc; i++) {
		/*                         1234567890123456789 */
		if (!xstrncmp(msg->env[i], "SLURM_ARRAY_JOB_ID=", 19))
			job->array_job_id = atoi(msg->env[i] + 19);
		/*                         12345678901234567890 */
		if (!xstrncmp(msg->env[i], "SLURM_ARRAY_TASK_ID=", 20))
			job->array_task_id = atoi(msg->env[i] + 20);
	}

	job->eio     = eio_handle_create(0);
	job->sruns   = list_create((ListDelF) _srun_info_destructor);

	/*
	 * Based on my testing the next 3 lists here could use the
	 * eio_obj_destroy, but if you do you can get an invalid read.  Since
	 * these stay until the end of the job it isn't that big of a deal.
	 */
	job->clients = list_create(NULL); /* FIXME! Needs destructor */
	job->stdout_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->stderr_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
	job->incoming_count = 0;
	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
	job->outgoing_count = 0;
	job->outgoing_cache = list_create(NULL); /* FIXME! Needs destructor */

	job->envtp   = xmalloc(sizeof(env_t));
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;

	job->envtp->distribution = 0;
	job->envtp->cpu_bind_type = 0;
	job->envtp->cpu_bind = NULL;
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
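	/*
	 * Select this node's response and I/O ports round-robin (by node
	 * ID) from the listening ports supplied in the launch request.
	 */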
	if (!msg->resp_port)
		msg->num_resp_port = 0;
	if (msg->num_resp_port) {
		job->envtp->comm_port =
			msg->resp_port[nodeid % msg->num_resp_port];
		memcpy(&resp_addr, &msg->orig_addr, sizeof(slurm_addr_t));
		slurm_set_addr(&resp_addr,
			       msg->resp_port[nodeid % msg->num_resp_port],
			       NULL);
	} else {
		memset(&resp_addr, 0, sizeof(slurm_addr_t));
	}
	if (!msg->io_port)
		msg->flags |= LAUNCH_USER_MANAGED_IO;
	if ((msg->flags & LAUNCH_USER_MANAGED_IO) == 0) {
		memcpy(&io_addr,   &msg->orig_addr, sizeof(slurm_addr_t));
		slurm_set_addr(&io_addr,
			       msg->io_port[nodeid % msg->num_io_port],
			       NULL);
	} else {
		memset(&io_addr, 0, sizeof(slurm_addr_t));
	}

	srun = srun_info_create(msg->cred, &resp_addr, &io_addr,
				protocol_version);

	job->profile     = msg->profile;
	job->task_prolog = xstrdup(msg->task_prolog);
	job->task_epilog = xstrdup(msg->task_epilog);

	job->argc    = msg->argc;
	job->argv    = _array_copy(job->argc, msg->argv);

	job->nnodes  = msg->nnodes;
	job->nodeid  = nodeid;
	job->debug   = msg->slurmd_debug;
	job->cpus    = msg->node_cpus;
	job->job_core_spec = msg->job_core_spec;

	/* This needs to happen before acct_gather_profile_startpoll
	   and only really looks at the profile in the job.
	*/
	acct_gather_profile_g_node_step_start(job);

	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	job->timelimit   = (time_t) -1;
	job->flags       = msg->flags;
	job->switch_job  = msg->switch_job;
	job->open_mode   = msg->open_mode;
	job->options     = msg->options;
	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);

	if (job->step_mem && conf->job_acct_oom_kill) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->step_mem);
	} else if (job->job_mem && conf->job_acct_oom_kill) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->job_mem);
	}

	/* only need these values on the extern step, don't copy otherwise */
	if ((msg->job_step_id == SLURM_EXTERN_CONT) && msg->x11) {
		job->x11 = msg->x11;
		job->x11_alloc_host = xstrdup(msg->x11_alloc_host);
		job->x11_alloc_port = msg->x11_alloc_port;
		job->x11_magic_cookie = xstrdup(msg->x11_magic_cookie);
		job->x11_target = xstrdup(msg->x11_target);
		job->x11_target_port = msg->x11_target_port;
	}

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	list_append(job->sruns, (void *) srun);

	_job_init_task_info(job, msg->global_task_ids,
			    msg->ifname, msg->ofname, msg->efname);

	return job;
}

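/* create a slurmd job structure from a batch job launch message */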
extern stepd_step_rec_t *
batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
{
	stepd_step_rec_t *job;
	srun_info_t  *srun = NULL;
	char *in_name;

	xassert(msg != NULL);

	debug3("entering batch_stepd_step_rec_create");

	if (acct_gather_check_acct_freq_task(msg->job_mem, msg->acctg_freq))
		return NULL;

	job = xmalloc(sizeof(stepd_step_rec_t));

	job->state = SLURMSTEPD_STEP_STARTING;
	slurm_cond_init(&job->state_cond, NULL);
	slurm_mutex_init(&job->state_mutex);
	if (msg->cpus_per_node)
		job->cpus    = msg->cpus_per_node[0];
	job->node_tasks  = 1;
	job->ntasks  = msg->ntasks;
	job->jobid   = msg->job_id;
	job->stepid  = msg->step_id;
	job->array_job_id  = msg->array_job_id;
	job->array_task_id = msg->array_task_id;
	job->het_job_step_cnt = NO_VAL;
	job->het_job_id  = NO_VAL;	/* Used to set env vars */
	job->het_job_nnodes = NO_VAL;	/* Used to set env vars */
	job->het_job_ntasks = NO_VAL;	/* Used to set env vars */
	job->het_job_offset = NO_VAL;	/* Used to set labels and env vars */
	job->job_core_spec = msg->job_core_spec;

	job->batch   = true;
	job->node_name  = xstrdup(conf->node_name);

	job->uid	= (uid_t) msg->uid;
	job->gid	= (gid_t) msg->gid;
	job->user_name	= xstrdup(msg->user_name);
	_slurm_cred_to_step_rec(msg->cred, job);
	/*
	 * Favor the group info in the launch cred if available - for 19.05+
	 * this is where it is managed, not in batch_job_launch_msg_t.
	 * For older versions, or for when send_gids is disabled, fall back
	 * to the batch_job_launch_msg_t info if necessary.
	 */
	if (!job->ngids) {
		job->ngids = (int) msg->ngids;
		job->gids = copy_gids(msg->ngids, msg->gids);
	}

	job->profile    = msg->profile;

	/* give them all to the 1 task */
	job->cpus_per_task = job->cpus;

	/* This needs to happen before acct_gather_profile_startpoll
	   and only really looks at the profile in the job.
	*/
	acct_gather_profile_g_node_step_start(job);
	/* needed for the jobacct_gather plugin to start */
	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	job->open_mode  = msg->open_mode;
	job->overcommit = (bool) msg->overcommit;

	job->cwd     = xstrdup(msg->work_dir);

	job->env     = _array_copy(msg->envc, msg->environment);
	job->eio     = eio_handle_create(0);
	job->sruns   = list_create((ListDelF) _srun_info_destructor);
	job->envtp   = xmalloc(sizeof(env_t));
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;

	job->envtp->distribution = 0;
	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->restart_cnt = msg->restart_cnt;

	if (msg->cpus_per_node)
		job->cpus    = msg->cpus_per_node[0];

	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);
	if (job->step_mem && conf->job_acct_oom_kill)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->step_mem);
	else if (job->job_mem && conf->job_acct_oom_kill)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->job_mem);

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	srun = srun_info_create(NULL, NULL, NULL, NO_VAL16);

	list_append(job->sruns, (void *) srun);

	if (msg->argc) {
		job->argc    = msg->argc;
		job->argv    = _array_copy(job->argc, msg->argv);
	} else {
		job->argc    = 1;
		/* job script has not yet been written out to disk --
		 * argv will be filled in later by _make_batch_script()
		 */
		job->argv    = (char **) xmalloc(2 * sizeof(char *));
	}

	job->task = xmalloc(sizeof(stepd_step_task_info_t *));
	if (msg->std_err == NULL)
		msg->std_err = xstrdup(msg->std_out);

	if (msg->std_in == NULL)
		in_name = xstrdup("/dev/null");
	else
		in_name = fname_create(job, msg->std_in, 0);

	job->task[0] = _task_info_create(0, 0, in_name,
					 _batchfilename(job, msg->std_out),
					 _batchfilename(job, msg->std_err));
	job->task[0]->argc = job->argc;
	job->task[0]->argv = job->argv;

	return job;
}

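/*
 * Destroy a step record created by stepd_step_rec_create() or
 * batch_stepd_step_rec_create(), releasing all associated storage.
 */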
extern void
stepd_step_rec_destroy(stepd_step_rec_t *job)
{
	uint16_t multi_prog = 0;
	int i;

	_array_free(&job->env);
	_array_free(&job->argv);

	if (job->flags & LAUNCH_MULTI_PROG)
		multi_prog = 1;
	for (i = 0; i < job->node_tasks; i++)
		_task_info_destroy(job->task[i], multi_prog);
	xfree(job->task);
	eio_handle_destroy(job->eio);
	FREE_NULL_LIST(job->sruns);
	FREE_NULL_LIST(job->clients);
	FREE_NULL_LIST(job->stdout_eio_objs);
	FREE_NULL_LIST(job->stderr_eio_objs);
	FREE_NULL_LIST(job->free_incoming);
	FREE_NULL_LIST(job->free_outgoing);
	FREE_NULL_LIST(job->outgoing_cache);
	FREE_NULL_LIST(job->job_gres_list);
	FREE_NULL_LIST(job->step_gres_list);
	xfree(job->cpu_bind);
	xfree(job->cwd);
	xfree(job->envtp);
	xfree(job->pw_gecos);
	xfree(job->pw_dir);
	xfree(job->pw_shell);
	xfree(job->gids);
	xfree(job->mem_bind);
	eio_handle_destroy(job->msg_handle);
	xfree(job->node_name);
	mpmd_free(job);
	xfree(job->het_job_task_cnts);
	if ((job->het_job_nnodes != NO_VAL) && job->het_job_tids) {
		/* het_job_tids == NULL if request from pre-v19.05 srun */
		for (i = 0; i < job->het_job_nnodes; i++)
			xfree(job->het_job_tids[i]);
		xfree(job->het_job_tids);
	}
	xfree(job->het_job_tid_offsets);
	xfree(job->task_prolog);
	xfree(job->task_epilog);
	xfree(job->job_alloc_cores);
	xfree(job->step_alloc_cores);
	xfree(job->task_cnts);
	xfree(job->tres_bind);
	xfree(job->tres_freq);
	xfree(job->user_name);
	xfree(job->x11_xauthority);
	xfree(job);
}

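/*
 * Create an srun_info_t.  If a credential is supplied, its signature is
 * copied into the I/O key; with no credential an otherwise empty record
 * is returned (e.g. for batch jobs).
 */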
extern srun_info_t *
srun_info_create(slurm_cred_t *cred, slurm_addr_t *resp_addr,
		 slurm_addr_t *ioaddr, uint16_t protocol_version)
{
	char             *data = NULL;
	uint32_t          len  = 0;
	srun_info_t *srun = xmalloc(sizeof(srun_info_t));
	srun_key_t       *key  = xmalloc(sizeof(srun_key_t));

	srun->key    = key;
	if (!protocol_version || (protocol_version == NO_VAL16))
		protocol_version = SLURM_PROTOCOL_VERSION;
	srun->protocol_version = protocol_version;
	/*
	 * If no credential was provided, return the empty
	 * srun info object. (This is used, for example, when
	 * creating a batch job structure)
	 */
	if (!cred) return srun;

	slurm_cred_get_signature(cred, &data, &len);

	len = len > SLURM_IO_KEY_SIZE ? SLURM_IO_KEY_SIZE : len;

	if (data != NULL) {
		memcpy((void *) key->data, data, len);

		if (len < SLURM_IO_KEY_SIZE)
			memset( (void *) (key->data + len), 0,
				SLURM_IO_KEY_SIZE - len);
	}

	if (ioaddr != NULL)
		srun->ioaddr    = *ioaddr;
	if (resp_addr != NULL)
		srun->resp_addr = *resp_addr;
	return srun;
}

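/* destructor for an srun_info_t created by srun_info_create() */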
extern void
srun_info_destroy(srun_info_t *srun)
{
	xfree(srun->key);
	xfree(srun);
}

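/*
 * Create a task info structure for local task "taskid" (global task ID
 * "gtaskid").  The stdio file name strings are stored directly, not
 * copied.
 */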
static stepd_step_task_info_t *_task_info_create(int taskid, int gtaskid,
						 char *ifname, char *ofname,
						 char *efname)
{
	stepd_step_task_info_t *t = xmalloc(sizeof(stepd_step_task_info_t));

	xassert(taskid >= 0);
	xassert(gtaskid >= 0);

	slurm_mutex_init(&t->mutex);
	slurm_mutex_lock(&t->mutex);
	t->state       = STEPD_STEP_TASK_INIT;
	t->id          = taskid;
	t->gtid	       = gtaskid;
	t->pid         = (pid_t) -1;
	t->ifname      = ifname;
	t->ofname      = ofname;
	t->efname      = efname;
	t->stdin_fd    = -1;
	t->to_stdin    = -1;
	t->stdout_fd   = -1;
	t->from_stdout = -1;
	t->stderr_fd   = -1;
	t->from_stderr = -1;
	t->in          = NULL;
	t->out         = NULL;
	t->err         = NULL;
	t->killed_by_cmd = false;
	t->aborted     = false;
	t->esent       = false;
	t->exited      = false;
	t->estatus     = -1;
	t->argc	       = 0;
	t->argv	       = NULL;
	slurm_mutex_unlock(&t->mutex);
	return t;
}