1 /*****************************************************************************\
2  *  multi_prog.c - executing program according to task rank
3  *                 set MPIR_PROCDESC accordingly
4  *
5  *  NOTE: The logic could be eliminated if slurmstepd kept track of the
 *  executable name for each task and returned that information in a new
7  *  launch response message (with multiple executable names).
8  *****************************************************************************
9  *  Produced at National University of Defense Technology (China)
10  *  Written by Hongjia Cao <hjcao@nudt.edu.cn>
11  *  and
12  *  Lawrence Livermore National Laboratory (cf, DISCLAIMER).
13  *  Written by Morris Jette <jette1@llnl.gov>.
14  *  CODE-OCEC-09-009. All rights reserved.
15  *
16  *  This file is part of Slurm, a resource management program.
17  *  For details, see <https://slurm.schedmd.com/>.
18  *  Please also read the included file: DISCLAIMER.
19  *
20  *  Slurm is free software; you can redistribute it and/or modify it under
21  *  the terms of the GNU General Public License as published by the Free
22  *  Software Foundation; either version 2 of the License, or (at your option)
23  *  any later version.
24  *
25  *  In addition, as a special exception, the copyright holders give permission
26  *  to link the code of portions of this program with the OpenSSL library under
27  *  certain conditions as described in each individual source file, and
28  *  distribute linked combinations including the two. You must obey the GNU
29  *  General Public License in all respects for all of the code used other than
30  *  OpenSSL. If you modify file(s) with this exception, you may extend this
31  *  exception to your version of the file(s), but you are not obligated to do
32  *  so. If you do not wish to do so, delete this exception statement from your
33  *  version.  If you delete this exception statement from all source files in
34  *  the program, then also delete it here.
35  *
36  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
37  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
38  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
39  *  details.
40  *
41  *  You should have received a copy of the GNU General Public License along
42  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
43  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
44 \*****************************************************************************/
45 
46 #include "config.h"
47 
48 #include <ctype.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <sys/stat.h>
53 #include <sys/types.h>
54 #include <unistd.h>
55 
56 #include "src/common/bitstring.h"
57 #include "src/common/log.h"
58 #include "src/common/xassert.h"
59 #include "src/common/xmalloc.h"
60 #include "src/common/xstring.h"
61 #include "src/common/proc_args.h"
62 
63 #include "debugger.h"
64 #include "multi_prog.h"
65 #include "opt.h"
66 
67 static void
_set_range(int low_num,int high_num,char * exec_name,bool ignore_duplicates)68 _set_range(int low_num, int high_num, char *exec_name, bool ignore_duplicates)
69 {
70 	int i;
71 
72 	for (i = low_num; i <= high_num; i++) {
73 		MPIR_PROCDESC *tv;
74 		tv = &MPIR_proctable[i];
75 		if (tv->executable_name == NULL) {
76 			tv->executable_name = xstrdup(exec_name);
77 		} else if (!ignore_duplicates) {
78 			error("duplicate configuration for task %d ignored",
79 			      i);
80 		}
81 	}
82 }
83 
/*
 * Parse a task rank specification and record exec_name for each task named.
 * IN ranks - either "*" (every task) or a comma separated list of task ids
 *	and/or "low-high" ranges, e.g. "0,2-4,7"
 * IN exec_name - executable name to record for the selected tasks
 * IN ntasks - task count; parsed values are clamped to [0, ntasks - 1]
 *
 * A malformed specification is reported with error() and parsing stops;
 * tokens handled before the malformed one remain in effect.
 */
static void _set_exec_names(char *ranks, char *exec_name, int ntasks)
{
	char *ptrptr = NULL;
	int low_num, high_num, num, i;

	if ((ranks[0] == '*') && (ranks[1] == '\0')) {
		/* Wildcard only fills in tasks lacking an explicit entry */
		low_num = 0;
		high_num = ntasks - 1;
		_set_range(low_num, high_num, exec_name, true);
		return;
	}

	ptrptr = ranks;
	/* At most ntasks comma separated tokens are examined */
	for (i = 0; i < ntasks; i++) {
		/*
		 * Cast to unsigned char: passing a negative plain char to a
		 * <ctype.h> function is undefined behavior.
		 */
		if (!isdigit((unsigned char) ptrptr[0]))
			goto invalid;

		num = strtol(ptrptr, &ptrptr, 10);

		if ((ptrptr[0] == ',') || (ptrptr[0] == '\0')) {
			/* single task id */
			low_num = MAX(0, num);
			high_num = MIN((ntasks-1), num);
			_set_range(low_num, high_num, exec_name, false);
		} else if (ptrptr[0] == '-') {
			/* low-high range */
			low_num = MAX(0, num);
			/*
			 * Require an explicit digit after the dash so that
			 * "3-", "3--5" or "3-+5" are reported as invalid
			 * instead of being silently accepted by strtol().
			 */
			if (!isdigit((unsigned char) ptrptr[1]))
				goto invalid;
			num = strtol(ptrptr+1, &ptrptr, 10);
			if ((ptrptr[0] != ',') && (ptrptr[0] != '\0'))
				goto invalid;
			high_num = MIN((ntasks-1), num);
			_set_range(low_num, high_num, exec_name, false);
		} else
			goto invalid;
		if (ptrptr[0] == '\0')
			break;
		ptrptr++;	/* step over the comma */
	}
	return;

  invalid:
	error ("Invalid task range specification (%s) ignored.", ranks);
	return;
}
126 
mpir_set_multi_name(int ntasks,const char * config_fname)127 extern int mpir_set_multi_name(int ntasks, const char *config_fname)
128 {
129 	FILE *config_fd;
130 	char line[BUF_SIZE];
131 	char *ranks, *exec_name, *p, *ptrptr;
132 	int line_num = 0;
133 	bool last_line_break = false, line_break = false;
134 	int line_len;
135 	int i;
136 
137 	for (i = 0; i < ntasks; i++) {
138 		MPIR_PROCDESC *tv;
139 		tv = &MPIR_proctable[i];
140 		tv->executable_name = NULL;
141 	}
142 
143 	config_fd = fopen(config_fname, "r");
144 	if (config_fd == NULL) {
145 		error("Unable to open configuration file %s", config_fname);
146 		return -1;
147 	}
148 	while (fgets(line, sizeof(line), config_fd)) {
149 		line_num ++;
150 		line_len = strlen(line);
151 		if (line_len >= (sizeof(line) - 1)) {
152 			error ("Line %d of configuration file %s too long",
153 				line_num, config_fname);
154 			fclose(config_fd);
155 			return -1;
156 		}
157 		if ((line_len > 0 && line[line_len - 1] == '\\') ||  /* EOF */
158 		    (line_len > 1 && line[line_len - 2] == '\\' &&
159 				     line[line_len - 1] == '\n'))
160 			line_break = true;
161 		else
162 			line_break = false;
163 
164 		if (last_line_break) {
165 			last_line_break = line_break;
166 			continue;
167 		}
168 		last_line_break = line_break;
169 		p = line;
170 		while (*p != '\0' && isspace (*p)) /* remove leading spaces */
171 			p ++;
172 
173 		if (*p == '#') /* only whole-line comments handled */
174 			continue;
175 
176 		if (*p == '\0') /* blank line ignored */
177 			continue;
178 
179 		ranks = strtok_r(p, " \t\n", &ptrptr);
180 		exec_name = strtok_r(NULL, " \t\n", &ptrptr);
181 		if (!ranks || !exec_name) {
182 			error("Line %d of configuration file %s is invalid",
183 				line_num, config_fname);
184 			fclose(config_fd);
185 			return -1;
186 		}
187 		_set_exec_names(ranks, exec_name, ntasks);
188 	}
189 	fclose(config_fd);
190 	return 0;
191 }
192 
193 extern void
mpir_init(int num_tasks)194 mpir_init(int num_tasks)
195 {
196 	MPIR_proctable_size = num_tasks;
197 	MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) * num_tasks);
198 	if (MPIR_proctable == NULL) {
199 		error("Unable to initialize MPIR_proctable: %m");
200 		exit(error_exit);
201 	}
202 }
203 
204 extern void
mpir_cleanup(void)205 mpir_cleanup(void)
206 {
207 	int i;
208 
209 	for (i = 0; i < MPIR_proctable_size; i++) {
210 		xfree(MPIR_proctable[i].host_name);
211 		xfree(MPIR_proctable[i].executable_name);
212 	}
213 	xfree(MPIR_proctable);
214 }
215 
mpir_set_executable_names(const char * executable_name,uint32_t task_offset,uint32_t task_count)216 extern void mpir_set_executable_names(const char *executable_name,
217 				      uint32_t task_offset,
218 				      uint32_t task_count)
219 {
220 	int i;
221 
222 	if (task_offset == NO_VAL)
223 		task_offset = 0;
224 	xassert((task_offset + task_count) <= MPIR_proctable_size);
225 	for (i = task_offset; i < (task_offset + task_count); i++) {
226 		MPIR_proctable[i].executable_name = xstrdup(executable_name);
227 		// info("NAME[%d]:%s", i, executable_name);
228 	}
229 }
230 
231 extern void
mpir_dump_proctable(void)232 mpir_dump_proctable(void)
233 {
234 	MPIR_PROCDESC *tv;
235 	int i;
236 
237 	for (i = 0; i < MPIR_proctable_size; i++) {
238 		tv = &MPIR_proctable[i];
239 		info("task:%d, host:%s, pid:%d, executable:%s",
240 		     i, tv->host_name, tv->pid, tv->executable_name);
241 	}
242 }
243 
244 static int
_update_task_mask(int low_num,int high_num,slurm_opt_t * opt_local,bitstr_t ** task_mask,bool ignore_duplicates)245 _update_task_mask(int low_num, int high_num, slurm_opt_t *opt_local,
246 		  bitstr_t **task_mask, bool ignore_duplicates)
247 {
248 	int i;
249 
250 	if (low_num > high_num) {
251 		error("Invalid task range, %d-%d", low_num, high_num);
252 		return -1;
253 	}
254 	if (low_num < 0) {
255 		error("Invalid task id, %d < 0", low_num);
256 		return -1;
257 	}
258 	if (high_num >= opt_local->ntasks) {
259 		static bool i_set_ntasks = false;
260 		if (opt_local->ntasks_set && !i_set_ntasks) {
261 			error("Invalid task id, %d >= ntasks", high_num);
262 			return -1;
263 		} else {
264 			opt_local->ntasks = high_num + 1;
265 			opt_local->ntasks_set = true;
266 			i_set_ntasks = true;
267 			(*task_mask) = bit_realloc((*task_mask),
268 						   opt_local->ntasks);
269 		}
270 	}
271 	for (i=low_num; i<=high_num; i++) {
272 		if (bit_test((*task_mask), i)) {
273 			if (ignore_duplicates)
274 				continue;
275 			error("Duplicate record for task %d", i);
276 			return -1;
277 		}
278 		bit_set((*task_mask), i);
279 	}
280 	return 0;
281 }
282 
283 static int
_validate_ranks(char * ranks,slurm_opt_t * opt_local,bitstr_t ** task_mask)284 _validate_ranks(char *ranks, slurm_opt_t *opt_local, bitstr_t **task_mask)
285 {
286 	static bool has_asterisk = false;
287 	char *range = NULL, *p = NULL;
288 	char *ptrptr = NULL, *upper = NULL;
289 	int low_num, high_num;
290 
291 	if (ranks[0] == '*' && ranks[1] == '\0') {
292 		low_num = 0;
293 		high_num = opt_local->ntasks - 1;
294 		opt_local->ntasks_set = true; /* do not allow to change later */
295 		has_asterisk = true;	/* must be last MPMD spec line */
296 		opt_local->srun_opt->multi_prog_cmds++;
297 		return _update_task_mask(low_num, high_num, opt_local,
298 					 task_mask, true);
299 	}
300 
301 	for (range = strtok_r(ranks, ",", &ptrptr); range != NULL;
302 			range = strtok_r(NULL, ",", &ptrptr)) {
303 		/*
304 		 * Non-contiguous tasks are split into multiple commands
305 		 * in the mpmd_set so count each token separately
306 		 */
307 		opt_local->srun_opt->multi_prog_cmds++;
308 		p = range;
309 		while (*p != '\0' && isdigit (*p))
310 			p ++;
311 
312 		if (has_asterisk) {
313 			error("Task range specification with asterisk must "
314 			      "be last");
315 			return -1;
316 		} else if (*p == '\0') { /* single rank */
317 			low_num  = atoi(range);
318 			high_num = low_num;
319 		} else if (*p == '-') { /* lower-upper */
320 			upper = ++ p;
321 			while (isdigit (*p))
322 				p ++;
323 			if (*p != '\0') {
324 				error ("Invalid task range specification");
325 				return -1;
326 			}
327 			low_num  = atoi(range);
328 			high_num = atoi(upper);
329 		} else {
330 			error ("Invalid task range specification (%s)",
331 				range);
332 			return -1;
333 		}
334 
335 		if (_update_task_mask(low_num, high_num, opt_local,
336 				      task_mask, false))
337 			return -1;
338 	}
339 	return 0;
340 }
341 
342 /*
343  * Verify that we have a valid executable program specified for each task
344  *	when the --multi-prog option is used.
345  * IN config_name - MPMD configuration file name
346  * IN/OUT opt_local - slurm options
347  * RET 0 on success, -1 otherwise
348  */
349 extern int
verify_multi_name(char * config_fname,slurm_opt_t * opt_local)350 verify_multi_name(char *config_fname, slurm_opt_t *opt_local)
351 {
352 	FILE *config_fd;
353 	char line[BUF_SIZE];
354 	char *ranks, *exec_name, *p, *ptrptr, *fullpath = NULL;
355 	int line_num = 0, i, rc = 0;
356 	bool last_line_break = false, line_break = false;
357 	int line_len;
358 	bitstr_t *task_mask;
359 
360 	if (opt_local->ntasks <= 0) {
361 		error("Invalid task count %d", opt_local->ntasks);
362 		return -1;
363 	}
364 
365 	opt_local->srun_opt->multi_prog_cmds = 0;
366 
367 	config_fd = fopen(config_fname, "r");
368 	if (config_fd == NULL) {
369 		error("Unable to open configuration file %s", config_fname);
370 		return -1;
371 	}
372 
373 	task_mask = bit_alloc(opt_local->ntasks);
374 	while (fgets(line, sizeof(line), config_fd)) {
375 		line_num++;
376 		line_len = strlen(line);
377 		if (line_len >= (sizeof(line) - 1)) {
378 			error ("Line %d of configuration file %s too long",
379 				line_num, config_fname);
380 			rc = -1;
381 			goto fini;
382 		}
383 		if ((line_len > 0 && line[line_len - 1] == '\\') ||  /* EOF */
384 		    (line_len > 1 && line[line_len - 2] == '\\' &&
385 				     line[line_len - 1] == '\n'))
386 			line_break = true;
387 		else
388 			line_break = false;
389 		if (last_line_break) {
390 			last_line_break = line_break;
391 			continue;
392 		}
393 		last_line_break = line_break;
394 		p = line;
395 		while (*p != '\0' && isspace (*p)) /* remove leading spaces */
396 			p ++;
397 
398 		if (*p == '#') /* only whole-line comments handled */
399 			continue;
400 
401 		if (*p == '\0') /* blank line ignored */
402 			continue;
403 
404 		ranks = strtok_r(p, " \t\n", &ptrptr);
405 		exec_name = strtok_r(NULL, " \t\n", &ptrptr);
406 		if (!ranks || !exec_name) {
407 			error("Line %d of configuration file %s invalid",
408 				line_num, config_fname);
409 			rc = -1;
410 			goto fini;
411 		}
412 		if (_validate_ranks(ranks, opt_local, &task_mask)) {
413 			error("Line %d of configuration file %s invalid",
414 				line_num, config_fname);
415 			rc = -1;
416 			goto fini;
417 		}
418 		if (opt_local->srun_opt->test_exec &&
419 		    !(fullpath = search_path(
420 			      opt_local->chdir, exec_name, true, X_OK, true))) {
421 			error("Line %d of configuration file %s, program %s not executable",
422 			      line_num, config_fname, exec_name);
423 			rc = -1;
424 			goto fini;
425 		}
426 		xfree(fullpath);
427 	}
428 
429 	for (i = 0; i < opt_local->ntasks; i++) {
430 		if (!bit_test(task_mask, i)) {
431 			error("Configuration file %s invalid, "
432 				"no record for task id %d",
433 				config_fname, i);
434 			rc = -1;
435 			goto fini;
436 		}
437 	}
438 
439 fini:	fclose(config_fd);
440 	FREE_NULL_BITMAP(task_mask);
441 	return rc;
442 }
443