1 /*****************************************************************************\
2 * multi_prog.c - executing program according to task rank
3 * set MPIR_PROCDESC accordingly
4 *
5 * NOTE: The logic could be eliminated if slurmstepd kept track of the
6 * executable name for each task and returned that information in a new
7 * launch response message (with multiple executable names).
8 *****************************************************************************
9 * Produced at National University of Defense Technology (China)
10 * Written by Hongjia Cao <hjcao@nudt.edu.cn>
11 * and
12 * Lawrence Livermore National Laboratory (cf, DISCLAIMER).
13 * Written by Morris Jette <jette1@llnl.gov>.
14 * CODE-OCEC-09-009. All rights reserved.
15 *
16 * This file is part of Slurm, a resource management program.
17 * For details, see <https://slurm.schedmd.com/>.
18 * Please also read the included file: DISCLAIMER.
19 *
20 * Slurm is free software; you can redistribute it and/or modify it under
21 * the terms of the GNU General Public License as published by the Free
22 * Software Foundation; either version 2 of the License, or (at your option)
23 * any later version.
24 *
25 * In addition, as a special exception, the copyright holders give permission
26 * to link the code of portions of this program with the OpenSSL library under
27 * certain conditions as described in each individual source file, and
28 * distribute linked combinations including the two. You must obey the GNU
29 * General Public License in all respects for all of the code used other than
30 * OpenSSL. If you modify file(s) with this exception, you may extend this
31 * exception to your version of the file(s), but you are not obligated to do
32 * so. If you do not wish to do so, delete this exception statement from your
33 * version. If you delete this exception statement from all source files in
34 * the program, then also delete it here.
35 *
36 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
37 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
38 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
39 * details.
40 *
41 * You should have received a copy of the GNU General Public License along
42 * with Slurm; if not, write to the Free Software Foundation, Inc.,
43 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
44 \*****************************************************************************/
45
46 #include "config.h"
47
48 #include <ctype.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <sys/stat.h>
53 #include <sys/types.h>
54 #include <unistd.h>
55
56 #include "src/common/bitstring.h"
57 #include "src/common/log.h"
58 #include "src/common/xassert.h"
59 #include "src/common/xmalloc.h"
60 #include "src/common/xstring.h"
61 #include "src/common/proc_args.h"
62
63 #include "debugger.h"
64 #include "multi_prog.h"
65 #include "opt.h"
66
67 static void
_set_range(int low_num,int high_num,char * exec_name,bool ignore_duplicates)68 _set_range(int low_num, int high_num, char *exec_name, bool ignore_duplicates)
69 {
70 int i;
71
72 for (i = low_num; i <= high_num; i++) {
73 MPIR_PROCDESC *tv;
74 tv = &MPIR_proctable[i];
75 if (tv->executable_name == NULL) {
76 tv->executable_name = xstrdup(exec_name);
77 } else if (!ignore_duplicates) {
78 error("duplicate configuration for task %d ignored",
79 i);
80 }
81 }
82 }
83
/*
 * Record exec_name as the executable of every task rank named by ranks.
 *
 * IN ranks     - rank field of one configuration line: either "*" (all
 *                tasks that have no name yet) or a comma separated list of
 *                "N" and "N-M" entries
 * IN exec_name - executable name to record
 * IN ntasks    - task count; the portion of a range that falls outside
 *                0..ntasks-1 is skipped
 *
 * Parsing stops at the first malformed entry and an error is logged; entries
 * in front of it have already been applied and stay applied.
 */
static void _set_exec_names(char *ranks, char *exec_name, int ntasks)
{
	char *cursor = ranks;
	int low_num, high_num, num;

	if ((ranks[0] == '*') && (ranks[1] == '\0')) {
		/* wildcard only fills entries no earlier line configured */
		_set_range(0, ntasks - 1, exec_name, true);
		return;
	}

	/*
	 * Walk the whole list. Each pass consumes at least one digit, so the
	 * loop always reaches the terminating NUL; no cap on the number of
	 * entries is needed (a cap would silently drop trailing entries).
	 */
	for (;;) {
		/* cast: a negative char passed to isdigit() is undefined */
		if (!isdigit((unsigned char) cursor[0]))
			goto invalid;

		num = strtol(cursor, &cursor, 10);

		if ((cursor[0] == ',') || (cursor[0] == '\0')) {
			/* single rank */
			low_num = MAX(0, num);
			high_num = MIN((ntasks - 1), num);
		} else if (cursor[0] == '-') {
			/* lower-upper */
			low_num = MAX(0, num);
			num = strtol(cursor + 1, &cursor, 10);
			if ((cursor[0] != ',') && (cursor[0] != '\0'))
				goto invalid;
			high_num = MIN((ntasks - 1), num);
		} else {
			goto invalid;
		}
		_set_range(low_num, high_num, exec_name, false);

		if (cursor[0] == '\0')
			break;
		cursor++;	/* step over the ',' separator */
	}
	return;

invalid:
	error ("Invalid task range specification (%s) ignored.", ranks);
	return;
}
126
mpir_set_multi_name(int ntasks,const char * config_fname)127 extern int mpir_set_multi_name(int ntasks, const char *config_fname)
128 {
129 FILE *config_fd;
130 char line[BUF_SIZE];
131 char *ranks, *exec_name, *p, *ptrptr;
132 int line_num = 0;
133 bool last_line_break = false, line_break = false;
134 int line_len;
135 int i;
136
137 for (i = 0; i < ntasks; i++) {
138 MPIR_PROCDESC *tv;
139 tv = &MPIR_proctable[i];
140 tv->executable_name = NULL;
141 }
142
143 config_fd = fopen(config_fname, "r");
144 if (config_fd == NULL) {
145 error("Unable to open configuration file %s", config_fname);
146 return -1;
147 }
148 while (fgets(line, sizeof(line), config_fd)) {
149 line_num ++;
150 line_len = strlen(line);
151 if (line_len >= (sizeof(line) - 1)) {
152 error ("Line %d of configuration file %s too long",
153 line_num, config_fname);
154 fclose(config_fd);
155 return -1;
156 }
157 if ((line_len > 0 && line[line_len - 1] == '\\') || /* EOF */
158 (line_len > 1 && line[line_len - 2] == '\\' &&
159 line[line_len - 1] == '\n'))
160 line_break = true;
161 else
162 line_break = false;
163
164 if (last_line_break) {
165 last_line_break = line_break;
166 continue;
167 }
168 last_line_break = line_break;
169 p = line;
170 while (*p != '\0' && isspace (*p)) /* remove leading spaces */
171 p ++;
172
173 if (*p == '#') /* only whole-line comments handled */
174 continue;
175
176 if (*p == '\0') /* blank line ignored */
177 continue;
178
179 ranks = strtok_r(p, " \t\n", &ptrptr);
180 exec_name = strtok_r(NULL, " \t\n", &ptrptr);
181 if (!ranks || !exec_name) {
182 error("Line %d of configuration file %s is invalid",
183 line_num, config_fname);
184 fclose(config_fd);
185 return -1;
186 }
187 _set_exec_names(ranks, exec_name, ntasks);
188 }
189 fclose(config_fd);
190 return 0;
191 }
192
193 extern void
mpir_init(int num_tasks)194 mpir_init(int num_tasks)
195 {
196 MPIR_proctable_size = num_tasks;
197 MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) * num_tasks);
198 if (MPIR_proctable == NULL) {
199 error("Unable to initialize MPIR_proctable: %m");
200 exit(error_exit);
201 }
202 }
203
204 extern void
mpir_cleanup(void)205 mpir_cleanup(void)
206 {
207 int i;
208
209 for (i = 0; i < MPIR_proctable_size; i++) {
210 xfree(MPIR_proctable[i].host_name);
211 xfree(MPIR_proctable[i].executable_name);
212 }
213 xfree(MPIR_proctable);
214 }
215
mpir_set_executable_names(const char * executable_name,uint32_t task_offset,uint32_t task_count)216 extern void mpir_set_executable_names(const char *executable_name,
217 uint32_t task_offset,
218 uint32_t task_count)
219 {
220 int i;
221
222 if (task_offset == NO_VAL)
223 task_offset = 0;
224 xassert((task_offset + task_count) <= MPIR_proctable_size);
225 for (i = task_offset; i < (task_offset + task_count); i++) {
226 MPIR_proctable[i].executable_name = xstrdup(executable_name);
227 // info("NAME[%d]:%s", i, executable_name);
228 }
229 }
230
231 extern void
mpir_dump_proctable(void)232 mpir_dump_proctable(void)
233 {
234 MPIR_PROCDESC *tv;
235 int i;
236
237 for (i = 0; i < MPIR_proctable_size; i++) {
238 tv = &MPIR_proctable[i];
239 info("task:%d, host:%s, pid:%d, executable:%s",
240 i, tv->host_name, tv->pid, tv->executable_name);
241 }
242 }
243
244 static int
_update_task_mask(int low_num,int high_num,slurm_opt_t * opt_local,bitstr_t ** task_mask,bool ignore_duplicates)245 _update_task_mask(int low_num, int high_num, slurm_opt_t *opt_local,
246 bitstr_t **task_mask, bool ignore_duplicates)
247 {
248 int i;
249
250 if (low_num > high_num) {
251 error("Invalid task range, %d-%d", low_num, high_num);
252 return -1;
253 }
254 if (low_num < 0) {
255 error("Invalid task id, %d < 0", low_num);
256 return -1;
257 }
258 if (high_num >= opt_local->ntasks) {
259 static bool i_set_ntasks = false;
260 if (opt_local->ntasks_set && !i_set_ntasks) {
261 error("Invalid task id, %d >= ntasks", high_num);
262 return -1;
263 } else {
264 opt_local->ntasks = high_num + 1;
265 opt_local->ntasks_set = true;
266 i_set_ntasks = true;
267 (*task_mask) = bit_realloc((*task_mask),
268 opt_local->ntasks);
269 }
270 }
271 for (i=low_num; i<=high_num; i++) {
272 if (bit_test((*task_mask), i)) {
273 if (ignore_duplicates)
274 continue;
275 error("Duplicate record for task %d", i);
276 return -1;
277 }
278 bit_set((*task_mask), i);
279 }
280 return 0;
281 }
282
283 static int
_validate_ranks(char * ranks,slurm_opt_t * opt_local,bitstr_t ** task_mask)284 _validate_ranks(char *ranks, slurm_opt_t *opt_local, bitstr_t **task_mask)
285 {
286 static bool has_asterisk = false;
287 char *range = NULL, *p = NULL;
288 char *ptrptr = NULL, *upper = NULL;
289 int low_num, high_num;
290
291 if (ranks[0] == '*' && ranks[1] == '\0') {
292 low_num = 0;
293 high_num = opt_local->ntasks - 1;
294 opt_local->ntasks_set = true; /* do not allow to change later */
295 has_asterisk = true; /* must be last MPMD spec line */
296 opt_local->srun_opt->multi_prog_cmds++;
297 return _update_task_mask(low_num, high_num, opt_local,
298 task_mask, true);
299 }
300
301 for (range = strtok_r(ranks, ",", &ptrptr); range != NULL;
302 range = strtok_r(NULL, ",", &ptrptr)) {
303 /*
304 * Non-contiguous tasks are split into multiple commands
305 * in the mpmd_set so count each token separately
306 */
307 opt_local->srun_opt->multi_prog_cmds++;
308 p = range;
309 while (*p != '\0' && isdigit (*p))
310 p ++;
311
312 if (has_asterisk) {
313 error("Task range specification with asterisk must "
314 "be last");
315 return -1;
316 } else if (*p == '\0') { /* single rank */
317 low_num = atoi(range);
318 high_num = low_num;
319 } else if (*p == '-') { /* lower-upper */
320 upper = ++ p;
321 while (isdigit (*p))
322 p ++;
323 if (*p != '\0') {
324 error ("Invalid task range specification");
325 return -1;
326 }
327 low_num = atoi(range);
328 high_num = atoi(upper);
329 } else {
330 error ("Invalid task range specification (%s)",
331 range);
332 return -1;
333 }
334
335 if (_update_task_mask(low_num, high_num, opt_local,
336 task_mask, false))
337 return -1;
338 }
339 return 0;
340 }
341
342 /*
343 * Verify that we have a valid executable program specified for each task
344 * when the --multi-prog option is used.
345 * IN config_name - MPMD configuration file name
346 * IN/OUT opt_local - slurm options
347 * RET 0 on success, -1 otherwise
348 */
349 extern int
verify_multi_name(char * config_fname,slurm_opt_t * opt_local)350 verify_multi_name(char *config_fname, slurm_opt_t *opt_local)
351 {
352 FILE *config_fd;
353 char line[BUF_SIZE];
354 char *ranks, *exec_name, *p, *ptrptr, *fullpath = NULL;
355 int line_num = 0, i, rc = 0;
356 bool last_line_break = false, line_break = false;
357 int line_len;
358 bitstr_t *task_mask;
359
360 if (opt_local->ntasks <= 0) {
361 error("Invalid task count %d", opt_local->ntasks);
362 return -1;
363 }
364
365 opt_local->srun_opt->multi_prog_cmds = 0;
366
367 config_fd = fopen(config_fname, "r");
368 if (config_fd == NULL) {
369 error("Unable to open configuration file %s", config_fname);
370 return -1;
371 }
372
373 task_mask = bit_alloc(opt_local->ntasks);
374 while (fgets(line, sizeof(line), config_fd)) {
375 line_num++;
376 line_len = strlen(line);
377 if (line_len >= (sizeof(line) - 1)) {
378 error ("Line %d of configuration file %s too long",
379 line_num, config_fname);
380 rc = -1;
381 goto fini;
382 }
383 if ((line_len > 0 && line[line_len - 1] == '\\') || /* EOF */
384 (line_len > 1 && line[line_len - 2] == '\\' &&
385 line[line_len - 1] == '\n'))
386 line_break = true;
387 else
388 line_break = false;
389 if (last_line_break) {
390 last_line_break = line_break;
391 continue;
392 }
393 last_line_break = line_break;
394 p = line;
395 while (*p != '\0' && isspace (*p)) /* remove leading spaces */
396 p ++;
397
398 if (*p == '#') /* only whole-line comments handled */
399 continue;
400
401 if (*p == '\0') /* blank line ignored */
402 continue;
403
404 ranks = strtok_r(p, " \t\n", &ptrptr);
405 exec_name = strtok_r(NULL, " \t\n", &ptrptr);
406 if (!ranks || !exec_name) {
407 error("Line %d of configuration file %s invalid",
408 line_num, config_fname);
409 rc = -1;
410 goto fini;
411 }
412 if (_validate_ranks(ranks, opt_local, &task_mask)) {
413 error("Line %d of configuration file %s invalid",
414 line_num, config_fname);
415 rc = -1;
416 goto fini;
417 }
418 if (opt_local->srun_opt->test_exec &&
419 !(fullpath = search_path(
420 opt_local->chdir, exec_name, true, X_OK, true))) {
421 error("Line %d of configuration file %s, program %s not executable",
422 line_num, config_fname, exec_name);
423 rc = -1;
424 goto fini;
425 }
426 xfree(fullpath);
427 }
428
429 for (i = 0; i < opt_local->ntasks; i++) {
430 if (!bit_test(task_mask, i)) {
431 error("Configuration file %s invalid, "
432 "no record for task id %d",
433 config_fname, i);
434 rc = -1;
435 goto fini;
436 }
437 }
438
439 fini: fclose(config_fd);
440 FREE_NULL_BITMAP(task_mask);
441 return rc;
442 }
443