/*****************************************************************************\
 *  srun.c - user interface to allocate resources, submit jobs, and execute
 *	parallel jobs.
 *****************************************************************************
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Mark Grondona <grondona@llnl.gov>, et. al.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#include <ctype.h>
#include <fcntl.h>
#include <grp.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <termios.h>
#include <unistd.h>

#include "src/common/fd.h"

#include "src/common/hostlist.h"
#include "src/common/log.h"
#include "src/common/net.h"
#include "src/common/plugstack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_auth.h"
#include "src/common/slurm_jobacct_gather.h"
#include "src/common/slurm_opt.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_rlimits_info.h"
#include "src/common/switch.h"
#include "src/common/uid.h"
#include "src/common/xmalloc.h"
#include "src/common/xsignal.h"
#include "src/common/xstring.h"

#include "src/bcast/file_bcast.h"

#include "launch.h"
#include "allocate.h"
#include "srun_job.h"
#include "opt.h"
#include "debugger.h"
#include "src/srun/srun_pty.h"
#include "multi_prog.h"
#include "src/api/pmi_server.h"
#include "src/api/step_ctx.h"
#include "src/api/step_launch.h"

#ifndef OPEN_MPI_PORT_ERROR
/* This exit code indicates the launched Open MPI tasks could
 *	not open the reserved port. It was already open by some
 *	other process. */
#define OPEN_MPI_PORT_ERROR 108
#endif

static struct termios termdefaults;
static uint32_t global_rc = 0;
static uint32_t mpi_plugin_rc = 0;
static srun_job_t *job = NULL;

extern char **environ;	/* job environment */
bool srun_max_timer = false;
bool srun_shutdown  = false;
int sig_array[] = {
	SIGINT,  SIGQUIT, SIGCONT, SIGTERM, SIGHUP,
	SIGALRM, SIGUSR1, SIGUSR2, SIGPIPE, 0 };
bitstr_t *g_het_grp_bits = NULL;

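/*
 * Arguments handed to each _launch_one_app() thread.  One structure is
 * allocated per job step; the shared counter, mutex and condition allow
 * _launch_app() to wait until every step of a heterogeneous job completes.
 */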
typedef struct _launch_app_data
{
	bool		got_alloc;
	srun_job_t *	job;
	slurm_opt_t	*opt_local;
	int *		step_cnt;
	pthread_cond_t *step_cond;
	pthread_mutex_t *step_mutex;
} _launch_app_data_t;

/*
 * forward declaration of static funcs
 */
static int   _file_bcast(slurm_opt_t *opt_local, srun_job_t *job);
static void  _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc);
static void *_launch_one_app(void *data);
static void  _pty_restore(void);
static void  _set_exit_code(void);
static void  _set_node_alias(void);
static void  _setup_env_working_cluster(void);
static void  _setup_job_env(srun_job_t *job, List srun_job_list,
			    bool got_alloc);
static void  _setup_one_job_env(slurm_opt_t *opt_local, srun_job_t *job,
				bool got_alloc);
static int   _slurm_debug_env_val (void);
static char *_uint16_array_to_str(int count, const uint16_t *array);

/*
 * from libvirt-0.6.2 GPL2
 *
 * console.c: A dumb serial console client
 *
 * Copyright (C) 2007, 2008 Red Hat, Inc.
 *
 */
#ifndef HAVE_CFMAKERAW
void cfmakeraw(struct termios *attr)
{
	attr->c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP
				| INLCR | IGNCR | ICRNL | IXON);
	attr->c_oflag &= ~OPOST;
	attr->c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
	attr->c_cflag &= ~(CSIZE | PARENB);
	attr->c_cflag |= CS8;
}
#endif

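/*
 * Return true unless SchedulerParameters explicitly disables job steps
 * that span the components of a heterogeneous job.
 */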
static bool _enable_het_job_steps(void)
{
	bool enabled = true;
	char *sched_params = slurm_get_sched_params();

	/* Continue supporting old terminology */
	if (xstrcasestr(sched_params, "disable_hetero_steps") ||
	    xstrcasestr(sched_params, "disable_hetjob_steps"))
		enabled = false;
	else if (xstrcasestr(sched_params, "enable_hetero_steps") ||
		 xstrcasestr(sched_params, "enable_hetjob_steps"))
		enabled = true;

	xfree(sched_params);
	return enabled;
}

int srun(int ac, char **av)
{
	int debug_level;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	bool got_alloc = false;
	List srun_job_list = NULL;

	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(av[0]), logopt, 0, NULL);
	_set_exit_code();

	if (slurm_select_init(0) != SLURM_SUCCESS)
		fatal( "failed to initialize node selection plugin" );

	if (switch_init(0) != SLURM_SUCCESS )
		fatal("failed to initialize switch plugins");

	_setup_env_working_cluster();

	init_srun(ac, av, &logopt, debug_level, 1);
	if (opt_list) {
		if (!_enable_het_job_steps())
			fatal("Job steps that span multiple components of a heterogeneous job are not currently supported");
		create_srun_job((void **) &srun_job_list, &got_alloc, 0, 1);
	} else
		create_srun_job((void **) &job, &got_alloc, 0, 1);

	_setup_job_env(job, srun_job_list, got_alloc);
	_set_node_alias();
	_launch_app(job, srun_job_list, got_alloc);

	if ((global_rc & 0xff) == SIG_OOM)
		global_rc = 1;	/* Exit code 1 */
	else if (mpi_plugin_rc) {
		/*
		 * MPI plugin might have more precise information in some cases.
		 * For example, if PMI[?] abort was by task X with return code
		 * RC, the expectation is that srun will return RC as srun's
		 * return code. However, to ensure proper cleanup, the plugin
		 * kills the job with SIGKILL which obscures the original reason
		 * for job exit.
		 */
		global_rc = mpi_plugin_rc;
	}

#ifdef MEMORY_LEAK_DEBUG
	slurm_select_fini();
	switch_fini();
	slurm_reset_all_options(&opt, false);
	slurm_auth_fini();
	slurm_conf_destroy();
	log_fini();
#endif /* MEMORY_LEAK_DEBUG */

	return (int)global_rc;
}

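/*
 * Launch one job step (one hetjob component) in its own thread.  The first
 * thread to arrive runs pre_launch_srun_job() for the entire job; the other
 * threads wait for that to finish before launching their step.
 */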
static void *_launch_one_app(void *data)
{
	static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  launch_cond  = PTHREAD_COND_INITIALIZER;
	static bool            launch_begin = false;
	static bool            launch_fini  = false;
	_launch_app_data_t *opts = (_launch_app_data_t *) data;
	slurm_opt_t *opt_local = opts->opt_local;
	srun_job_t *job  = opts->job;
	bool got_alloc   = opts->got_alloc;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal = launch_g_fwd_signal;

	/*
	 * Run pre-launch once for entire hetjob
	 */
	slurm_mutex_lock(&launch_mutex);
	if (!launch_begin) {
		launch_begin = true;
		slurm_mutex_unlock(&launch_mutex);

		pre_launch_srun_job(job, 0, 1, opt_local);

		slurm_mutex_lock(&launch_mutex);
		launch_fini = true;
		slurm_cond_broadcast(&launch_cond);
	} else {
		while (!launch_fini)
			slurm_cond_wait(&launch_cond, &launch_mutex);
	}
	slurm_mutex_unlock(&launch_mutex);

relaunch:
	launch_common_set_stdio_fds(job, &cio_fds, opt_local);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks,
				  opt_local)) {
		if (launch_g_step_wait(job, got_alloc, opt_local) == -1)
			goto relaunch;
		if (job->step_ctx->launch_state->mpi_rc > mpi_plugin_rc)
			mpi_plugin_rc = job->step_ctx->launch_state->mpi_rc;
	}

	if (opts->step_mutex) {
		slurm_mutex_lock(opts->step_mutex);
		(*opts->step_cnt)--;
		slurm_cond_broadcast(opts->step_cond);
		slurm_mutex_unlock(opts->step_mutex);
	}
	xfree(data);
	return NULL;
}

/*
 * The het_job_node_list may not be ordered across multiple components, which
 * can cause problems for some MPI implementations. Put the het_job_node_list
 * records in alphabetic order and reorder het_job_task_cnts and het_job_tids
 * to match.
 */
static void _reorder_het_job_recs(char **in_node_list, uint16_t **in_task_cnts,
			       uint32_t ***in_tids, int total_nnodes)
{
	hostlist_t in_hl, out_hl;
	uint16_t *out_task_cnts = NULL;
	uint32_t **out_tids = NULL;
	char *hostname;
	int i, j;

	in_hl = hostlist_create(*in_node_list);
	if (!in_hl) {
		error("%s: Invalid hostlist(%s)", __func__, *in_node_list);
		return;
	}
	out_hl = hostlist_copy(in_hl);
	hostlist_sort(out_hl);
	hostlist_uniq(out_hl);
	i = hostlist_count(out_hl);
	if (i != total_nnodes) {
		error("%s: Invalid hostlist(%s) count(%d)", __func__,
		      *in_node_list, total_nnodes);
		goto fini;
	}

	out_task_cnts = xmalloc(sizeof(uint16_t) * total_nnodes);
	out_tids = xmalloc(sizeof(uint32_t *) * total_nnodes);
	for (i = 0; i < total_nnodes; i++) {
		hostname = hostlist_nth(out_hl, i);
		if (!hostname) {
			error("%s: Invalid hostlist(%s) count(%d)", __func__,
			      *in_node_list, total_nnodes);
			break;
		}
		j = hostlist_find(in_hl, hostname);
		if (j == -1) {
			error("%s: Invalid hostlist(%s) parsing", __func__,
			      *in_node_list);
			free(hostname);
			break;
		}
		out_task_cnts[i] = in_task_cnts[0][j];
		out_tids[i] = in_tids[0][j];
		free(hostname);
	}

	if (i >= total_nnodes) {	/* Success */
		xfree(*in_node_list);
		*in_node_list = hostlist_ranged_string_xmalloc(out_hl);
		xfree(*in_task_cnts);
		*in_task_cnts = out_task_cnts;
		out_task_cnts = NULL;
		xfree(*in_tids);
		*in_tids = out_tids;
		out_tids = NULL;
	}

#if 0
	info("NODE_LIST[%d]:%s", total_nnodes, *in_node_list);
	for (i = 0; i < total_nnodes; i++) {
		info("TASK_CNT[%d]:%u", i, in_task_cnts[0][i]);
		for (j = 0; j < in_task_cnts[0][i]; j++) {
			info("TIDS[%d][%d]: %u", i, j, in_tids[0][i][j]);
		}
	}
#endif

fini:	hostlist_destroy(in_hl);
	hostlist_destroy(out_hl);
	xfree(out_task_cnts);
	xfree(out_tids);
}

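/*
 * Launch the application.  For a heterogeneous job, aggregate the node,
 * task count and task ID information from every component, copy it back
 * into each component's job record, and start one _launch_one_app() thread
 * per component.  Otherwise launch the single step directly.
 */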
static void _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	slurm_opt_t *opt_local = NULL;
	_launch_app_data_t *opts;
	int total_ntasks = 0, total_nnodes = 0, step_cnt = 0, node_offset = 0;
	pthread_mutex_t step_mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t step_cond   = PTHREAD_COND_INITIALIZER;
	srun_job_t *first_job = NULL;
	char *launch_type, *het_job_node_list = NULL;
	bool need_mpir = false;
	uint16_t *tmp_task_cnt = NULL, *het_job_task_cnts = NULL;
	uint32_t **tmp_tids = NULL, **het_job_tids = NULL;
	uint32_t *het_job_tid_offsets = NULL;

	launch_type = slurm_get_launch_type();
	if (launch_type && strstr(launch_type, "slurm"))
		need_mpir = true;
	xfree(launch_type);

	if (srun_job_list) {
		int het_job_step_cnt = list_count(srun_job_list);
		first_job = (srun_job_t *) list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}

		job_iter = list_iterator_create(srun_job_list);
		while ((job = list_next(job_iter))) {
			char *node_list = NULL;
			int i, node_inx;
			total_ntasks += job->ntasks;
			total_nnodes += job->nhosts;

			xrealloc(het_job_task_cnts,
				 sizeof(uint16_t)*total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			xrealloc(het_job_tid_offsets,
				 sizeof(uint32_t) * total_ntasks);

			for (i = total_ntasks - job->ntasks;
			     i < total_ntasks;
			     i++)
				het_job_tid_offsets[i] = job->het_job_offset;

			if (!tmp_task_cnt) {
				fatal("%s: job %u has NULL task array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			memcpy(het_job_task_cnts + node_offset, tmp_task_cnt,
			       sizeof(uint16_t) * job->nhosts);

			xrealloc(het_job_tids,
				 sizeof(uint32_t *) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			if (!tmp_tids) {
				fatal("%s: job %u has NULL task ID array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			for (node_inx = 0; node_inx < job->nhosts; node_inx++) {
				uint32_t *node_tids;
				node_tids = xmalloc(sizeof(uint32_t) *
						    tmp_task_cnt[node_inx]);
				for (i = 0; i < tmp_task_cnt[node_inx]; i++) {
					node_tids[i] = tmp_tids[node_inx][i] +
						       job->het_job_task_offset;
				}
				het_job_tids[node_offset + node_inx] =
					node_tids;
			}

			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &node_list);
			if (!node_list) {
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);
			}
			if (het_job_node_list)
				xstrfmtcat(het_job_node_list, ",%s", node_list);
			else
				het_job_node_list = xstrdup(node_list);
			xfree(node_list);
			node_offset += job->nhosts;
		}
		list_iterator_reset(job_iter);
		_reorder_het_job_recs(&het_job_node_list, &het_job_task_cnts,
				   &het_job_tids, total_nnodes);

		if (need_mpir)
			mpir_init(total_ntasks);

		opt_iter = list_iterator_create(opt_list);

		/* copy aggregated hetjob data back into each sub-job */
		while ((opt_local = list_next(opt_iter))) {
			srun_opt_t *srun_opt = opt_local->srun_opt;
			xassert(srun_opt);
			job = list_next(job_iter);
			if (!job) {
				slurm_mutex_lock(&step_mutex);
				while (step_cnt > 0)
					slurm_cond_wait(&step_cond,&step_mutex);
				slurm_mutex_unlock(&step_mutex);
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
				break;	/* To eliminate CLANG error */
			}

			slurm_mutex_lock(&step_mutex);
			step_cnt++;
			slurm_mutex_unlock(&step_mutex);
			job->het_job_node_list = xstrdup(het_job_node_list);
			if ((het_job_step_cnt > 1) && het_job_task_cnts &&
			    het_job_tid_offsets) {
				xassert(node_offset == job->het_job_nnodes);
				job->het_job_task_cnts =
					xcalloc(job->het_job_nnodes,
						sizeof(uint16_t));
				memcpy(job->het_job_task_cnts,
				       het_job_task_cnts,
				       sizeof(uint16_t) * job->het_job_nnodes);
				job->het_job_tids = xcalloc(job->het_job_nnodes,
							    sizeof(uint32_t *));
				memcpy(job->het_job_tids, het_job_tids,
				       sizeof(uint32_t *) *
				       job->het_job_nnodes);

				job->het_job_tid_offsets = xcalloc(
					total_ntasks, sizeof(uint32_t));
				memcpy(job->het_job_tid_offsets,
				       het_job_tid_offsets,
				       sizeof(uint32_t) * total_ntasks);
			}

			opts = xmalloc(sizeof(_launch_app_data_t));
			opts->got_alloc   = got_alloc;
			opts->job         = job;
			opts->opt_local   = opt_local;
			opts->step_cond   = &step_cond;
			opts->step_cnt    = &step_cnt;
			opts->step_mutex  = &step_mutex;
			srun_opt->het_step_cnt = het_job_step_cnt;

			slurm_thread_create_detached(NULL, _launch_one_app,
						     opts);
		}
		xfree(het_job_node_list);
		xfree(het_job_task_cnts);
		xfree(het_job_tid_offsets);
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
		slurm_mutex_lock(&step_mutex);
		while (step_cnt > 0)
			slurm_cond_wait(&step_cond, &step_mutex);
		slurm_mutex_unlock(&step_mutex);

		if (first_job)
			fini_srun(first_job, got_alloc, &global_rc, 0);
	} else {
		int i;
		if (need_mpir)
			mpir_init(job->ntasks);
		if (job->het_job_id && (job->het_job_id != NO_VAL)) {
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			job->het_job_task_cnts = xcalloc(job->het_job_nnodes,
							 sizeof(uint16_t));
			memcpy(job->het_job_task_cnts, tmp_task_cnt,
			       sizeof(uint16_t) * job->het_job_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			job->het_job_tids = xcalloc(job->het_job_nnodes,
						    sizeof(uint32_t *));
			memcpy(job->het_job_tids, tmp_tids,
			       sizeof(uint32_t *) * job->het_job_nnodes);

			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &job->het_job_node_list);
			if (!job->het_job_node_list)
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);

			job->het_job_tid_offsets = xcalloc(job->ntasks,
							   sizeof(uint32_t));
			if (job->het_job_offset) {
				/*
				 * Only starting one hetjob component,
				 * het_job_offset should be zero
				 */
				for (i = 0; i < job->ntasks; i++) {
					job->het_job_tid_offsets[i] =
						job->het_job_offset;
				}
			}
		}
		opts = xmalloc(sizeof(_launch_app_data_t));
		opts->got_alloc   = got_alloc;
		opts->job         = job;
		opts->opt_local   = &opt;
		sropt.het_step_cnt = 1;
		_launch_one_app(opts);
		fini_srun(job, got_alloc, &global_rc, 0);
	}
}

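/*
 * Build the SLURM_* environment for a single job (or hetjob component)
 * from its options, broadcast the executable if --bcast was requested,
 * and place the local terminal in raw mode if --pty was requested.
 */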
static void _setup_one_job_env(slurm_opt_t *opt_local, srun_job_t *job,
			       bool got_alloc)
{
	env_t *env = xmalloc(sizeof(env_t));
	uint16_t *tasks = NULL;
	srun_opt_t *srun_opt = opt_local->srun_opt;
	xassert(srun_opt);

	xassert(job);

	env->localid = -1;
	env->nodeid  = -1;
	env->procid  = -1;
	env->stepid  = -1;

	if (srun_opt->bcast_flag)
		_file_bcast(opt_local, job);
	if (opt_local->cpus_set)
		env->cpus_per_task = opt_local->cpus_per_task;
	if (opt_local->ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt_local->ntasks_per_node;
	if (opt_local->ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt_local->ntasks_per_socket;
	if (opt_local->ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt_local->ntasks_per_core;
	env->distribution = opt_local->distribution;
	if (opt_local->plane_size != NO_VAL)
		env->plane_size = opt_local->plane_size;
	env->cpu_bind_type = srun_opt->cpu_bind_type;
	env->cpu_bind = srun_opt->cpu_bind;

	env->cpu_freq_min = opt_local->cpu_freq_min;
	env->cpu_freq_max = opt_local->cpu_freq_max;
	env->cpu_freq_gov = opt_local->cpu_freq_gov;
	env->mem_bind_type = opt_local->mem_bind_type;
	env->mem_bind = opt_local->mem_bind;
	env->overcommit = opt_local->overcommit;
	env->slurmd_debug = srun_opt->slurmd_debug;
	env->labelio = srun_opt->labelio;
	env->comm_port = slurmctld_comm_port;
	if (opt_local->job_name)
		env->job_name = opt_local->job_name;

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, &tasks);

	env->select_jobinfo = job->select_jobinfo;
	if (job->het_job_node_list)
		env->nodelist = job->het_job_node_list;
	else
		env->nodelist = job->nodelist;
	env->partition = job->partition;
	if (job->het_job_nnodes != NO_VAL)
		env->nhosts = job->het_job_nnodes;
	else if (got_alloc)	/* Don't overwrite unless we got allocation */
		env->nhosts = job->nhosts;
	if (job->het_job_ntasks != NO_VAL)
		env->ntasks = job->het_job_ntasks;
	else
		env->ntasks = job->ntasks;
	env->task_count = _uint16_array_to_str(job->nhosts, tasks);
	if (job->het_job_id != NO_VAL)
		env->jobid = job->het_job_id;
	else
		env->jobid = job->jobid;
	env->stepid = job->stepid;
	env->account = job->account;
	env->qos = job->qos;
	env->resv_name = job->resv_name;
	env->uid = getuid();
	env->user_name = uid_to_string(env->uid);

	if (srun_opt->pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		srun_opt->pty = false;
	}
	if (srun_opt->pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing such that debug() and
		 * error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}

	setup_env(env, srun_opt->preserve_env);
	env_array_merge(&job->env, (const char **)environ);
	xfree(env->task_count);
	xfree(env->user_name);
	xfree(env);
}

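/*
 * Set up the environment for every job record, pairing each entry of
 * srun_job_list with its corresponding entry in opt_list for
 * heterogeneous jobs.
 */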
static void _setup_job_env(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	slurm_opt_t *opt_local;

	if (srun_job_list) {
		srun_job_t *first_job = list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}
		job_iter  = list_iterator_create(srun_job_list);
		opt_iter  = list_iterator_create(opt_list);
		while ((opt_local = list_next(opt_iter))) {
			job = list_next(job_iter);
			if (!job) {
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
			}
			_setup_one_job_env(opt_local, job, got_alloc);
		}
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
	} else if (job) {
		_setup_one_job_env(&opt, job, got_alloc);
	} else {
		fatal("%s: No job information", __func__);
	}
}

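/*
 * Broadcast the command to be executed to local storage on the allocated
 * nodes (--bcast option) and point argv[0] at the broadcast copy.
 */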
static int _file_bcast(slurm_opt_t *opt_local, srun_job_t *job)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	struct bcast_parameters *params;
	int rc;
	xassert(srun_opt);

	if ((srun_opt->argc == 0) || (srun_opt->argv[0] == NULL)) {
		error("No command name to broadcast");
		return SLURM_ERROR;
	}
	params = xmalloc(sizeof(struct bcast_parameters));
	params->block_size = 8 * 1024 * 1024;
	params->compress = srun_opt->compress;
	if (srun_opt->bcast_file) {
		params->dst_fname = xstrdup(srun_opt->bcast_file);
	} else {
		xstrfmtcat(params->dst_fname, "%s/slurm_bcast_%u.%u",
			   opt_local->chdir, job->jobid, job->stepid);
	}
	params->fanout = 0;
	params->job_id = job->jobid;
	params->force = true;
	if (srun_opt->het_grp_bits)
		params->het_job_offset = bit_ffs(srun_opt->het_grp_bits);
	else
		params->het_job_offset = NO_VAL;
	params->preserve = true;
	params->src_fname = srun_opt->argv[0];
	params->step_id = job->stepid;
	params->timeout = 0;
	params->verbose = 0;

	rc = bcast_file(params);
	if (rc == SLURM_SUCCESS) {
		xfree(srun_opt->argv[0]);
		srun_opt->argv[0] = params->dst_fname;
	} else {
		xfree(params->dst_fname);
	}
	xfree(params);

	return rc;
}

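/* Return the logging level adjustment requested via the SLURM_DEBUG
 * environment variable, or 0 if it is unset or malformed. */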
static int _slurm_debug_env_val (void)
{
	long int level = 0;
	const char *val;

	if ((val = getenv ("SLURM_DEBUG"))) {
		char *p;
		if ((level = strtol (val, &p, 10)) < -LOG_LEVEL_INFO)
			level = -LOG_LEVEL_INFO;
		if (p && *p != '\0')
			level = 0;
	}
	return ((int) level);
}

/*
 * Return a string representation of an array of uint16_t elements.
 * Each value in the array is printed in decimal notation and elements
 * are separated by a comma.  If sequential elements in the array
 * contain the same value, the value is written out just once followed
 * by "(xN)", where "N" is the number of times the value is repeated.
 *
 * Example:
 *   The array "1, 2, 1, 1, 1, 3, 2" becomes the string "1,2,1(x3),3,2"
 *
 * Returns an xmalloc'ed string.  Free with xfree().
 */
static char *_uint16_array_to_str(int array_len, const uint16_t *array)
{
	int i;
	int previous = 0;
	char *sep = ",";  /* separator */
	char *str = xstrdup("");

	if (array == NULL)
		return str;

	for (i = 0; i < array_len; i++) {
		if ((i+1 < array_len)
		    && (array[i] == array[i+1])) {
				previous++;
				continue;
		}

		if (i == array_len-1) /* last time through loop */
			sep = "";
		if (previous > 0) {
			xstrfmtcat(str, "%u(x%u)%s",
				   array[i], previous+1, sep);
		} else {
			xstrfmtcat(str, "%u%s", array[i], sep);
		}
		previous = 0;
	}

	return str;
}

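/* Override srun's default exit codes with the values of the
 * SLURM_EXIT_ERROR and SLURM_EXIT_IMMEDIATE environment variables. */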
static void _set_exit_code(void)
{
	int i;
	char *val;

	if ((val = getenv("SLURM_EXIT_ERROR"))) {
		i = atoi(val);
		if (i == 0)
			error("SLURM_EXIT_ERROR has zero value");
		else
			error_exit = i;
	}

	if ((val = getenv("SLURM_EXIT_IMMEDIATE"))) {
		i = atoi(val);
		if (i == 0)
			error("SLURM_EXIT_IMMEDIATE has zero value");
		else
			immediate_exit = i;
	}
}

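/* Register the node name/address aliases listed in the SLURM_NODE_ALIASES
 * environment variable via slurm_reset_alias(). */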
static void _set_node_alias(void)
{
	char *aliases, *save_ptr = NULL, *tmp;
	char *addr, *hostname, *slurm_name;

	tmp = getenv("SLURM_NODE_ALIASES");
	if (!tmp)
		return;
	aliases = xstrdup(tmp);
	slurm_name = strtok_r(aliases, ":", &save_ptr);
	while (slurm_name) {
		addr = strtok_r(NULL, ":", &save_ptr);
		if (!addr)
			break;
		slurm_reset_alias(slurm_name, addr, addr);
		hostname = strtok_r(NULL, ",", &save_ptr);
		if (!hostname)
			break;
		slurm_name = strtok_r(NULL, ":", &save_ptr);
	}
	xfree(aliases);
}

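/* atexit() handler: restore the terminal settings saved before --pty
 * placed the local tty in raw mode. */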
static void _pty_restore(void)
{
	/* STDIN is probably closed by now */
	if (tcsetattr(STDOUT_FILENO, TCSANOW, &termdefaults) < 0)
		fprintf(stderr, "tcsetattr: %s\n", strerror(errno));
}

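/*
 * If SLURM_WORKING_CLUSTER is set and names a cluster other than the one
 * in the local configuration, build a working_cluster_rec so subsequent
 * RPCs are directed to that cluster's controller.
 */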
static void _setup_env_working_cluster(void)
{
	char *working_env, *addr_ptr, *port_ptr, *rpc_ptr, *select_ptr;

	if ((working_env = xstrdup(getenv("SLURM_WORKING_CLUSTER"))) == NULL)
		return;

	/* Format is cluster_name:address:port:rpc[:plugin_id_select] */
	if (!(addr_ptr = strchr(working_env,  ':')) ||
	    !(port_ptr = strchr(addr_ptr + 1, ':')) ||
	    !(rpc_ptr  = strchr(port_ptr + 1, ':'))) {
		error("malformed cluster addr and port in SLURM_WORKING_CLUSTER env var: '%s'",
		      working_env);
		exit(1);
	}

	*addr_ptr++ = '\0';
	*port_ptr++ = '\0';
	*rpc_ptr++  = '\0';

	if ((select_ptr = strchr(rpc_ptr, ':')))
		*select_ptr++ = '\0';

	if (xstrcmp(slurmctld_conf.cluster_name, working_env)) {
		working_cluster_rec = xmalloc(sizeof(slurmdb_cluster_rec_t));
		slurmdb_init_cluster_rec(working_cluster_rec, false);

		working_cluster_rec->name = xstrdup(working_env);
		working_cluster_rec->control_host = xstrdup(addr_ptr);
		working_cluster_rec->control_port = strtol(port_ptr, NULL, 10);
		working_cluster_rec->rpc_version  = strtol(rpc_ptr, NULL, 10);
		slurm_set_addr(&working_cluster_rec->control_addr,
			       working_cluster_rec->control_port,
			       working_cluster_rec->control_host);

		if (select_ptr)
			working_cluster_rec->plugin_id_select =
				select_get_plugin_id_pos(strtol(select_ptr,
								NULL, 10));
	}
	xfree(working_env);
}