1 /*****************************************************************************\
2 * info_job.c - job information functions for scontrol.
3 *****************************************************************************
4 * Copyright (C) 2002-2007 The Regents of the University of California.
5 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
6 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7 * Written by Morris Jette <jette1@llnl.gov>
8 * CODE-OCEC-09-009. All rights reserved.
9 *
10 * This file is part of Slurm, a resource management program.
11 * For details, see <https://slurm.schedmd.com/>.
12 * Please also read the included file: DISCLAIMER.
13 *
14 * Slurm is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 * In addition, as a special exception, the copyright holders give permission
20 * to link the code of portions of this program with the OpenSSL library under
21 * certain conditions as described in each individual source file, and
22 * distribute linked combinations including the two. You must obey the GNU
23 * General Public License in all respects for all of the code used other than
24 * OpenSSL. If you modify file(s) with this exception, you may extend this
25 * exception to your version of the file(s), but you are not obligated to do
26 * so. If you do not wish to do so, delete this exception statement from your
27 * version. If you delete this exception statement from all source files in
28 * the program, then also delete it here.
29 *
30 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
33 * details.
34 *
35 * You should have received a copy of the GNU General Public License along
36 * with Slurm; if not, write to the Free Software Foundation, Inc.,
37 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
38 \*****************************************************************************/
39 #include <arpa/inet.h>
40 #include <grp.h>
41 #include <fcntl.h>
42 #include <sys/stat.h>
43 #include <sys/types.h>
44
45 #include "scontrol.h"
46 #include "src/common/bitstring.h"
47 #include "src/common/slurm_time.h"
48 #include "src/common/stepd_api.h"
49
50 #define POLL_SLEEP 3 /* retry interval in seconds */
51
52 /* Load current job table information into *job_buffer_pptr */
53 extern int
scontrol_load_job(job_info_msg_t ** job_buffer_pptr,uint32_t job_id)54 scontrol_load_job(job_info_msg_t ** job_buffer_pptr, uint32_t job_id)
55 {
56 int error_code;
57 static uint16_t last_show_flags = 0xffff;
58 uint16_t show_flags = 0;
59 job_info_msg_t * job_info_ptr = NULL;
60
61 if (all_flag)
62 show_flags |= SHOW_ALL;
63
64 if (detail_flag)
65 show_flags |= SHOW_DETAIL;
66 if (federation_flag)
67 show_flags |= SHOW_FEDERATION;
68 if (local_flag)
69 show_flags |= SHOW_LOCAL;
70 if (sibling_flag)
71 show_flags |= SHOW_FEDERATION | SHOW_SIBLING;
72
73 if (old_job_info_ptr) {
74 if (last_show_flags != show_flags)
75 old_job_info_ptr->last_update = (time_t) 0;
76 if (job_id) {
77 error_code = slurm_load_job(&job_info_ptr, job_id,
78 show_flags);
79 } else {
80 error_code = slurm_load_jobs(
81 old_job_info_ptr->last_update,
82 &job_info_ptr, show_flags);
83 }
84 if (error_code == SLURM_SUCCESS)
85 slurm_free_job_info_msg (old_job_info_ptr);
86 else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
87 job_info_ptr = old_job_info_ptr;
88 error_code = SLURM_SUCCESS;
89 if (quiet_flag == -1)
90 printf ("slurm_load_jobs no change in data\n");
91 }
92 } else if (job_id) {
93 error_code = slurm_load_job(&job_info_ptr, job_id, show_flags);
94 } else {
95 error_code = slurm_load_jobs((time_t) NULL, &job_info_ptr,
96 show_flags);
97 }
98
99 if (error_code == SLURM_SUCCESS) {
100 old_job_info_ptr = job_info_ptr;
101 if (job_id)
102 old_job_info_ptr->last_update = (time_t) 0;
103 last_show_flags = show_flags;
104 *job_buffer_pptr = job_info_ptr;
105 }
106
107 return error_code;
108 }
109
110 /*
111 * scontrol_pid_info - given a local process id, print the corresponding
112 * slurm job id and its expected end time
113 * IN job_pid - the local process id of interest
114 */
115 extern void
scontrol_pid_info(pid_t job_pid)116 scontrol_pid_info(pid_t job_pid)
117 {
118 int error_code;
119 uint32_t job_id = 0;
120 time_t end_time;
121 long rem_time;
122
123 error_code = slurm_pid2jobid(job_pid, &job_id);
124 if (error_code) {
125 exit_code = 1;
126 if (quiet_flag != 1)
127 slurm_perror ("slurm_pid2jobid error");
128 return;
129 }
130
131 error_code = slurm_get_end_time(job_id, &end_time);
132 if (error_code) {
133 exit_code = 1;
134 if (quiet_flag != 1)
135 slurm_perror ("slurm_get_end_time error");
136 return;
137 }
138 printf("Slurm job id %u ends at %s\n", job_id, slurm_ctime2(&end_time));
139
140 rem_time = slurm_get_rem_time(job_id);
141 printf("slurm_get_rem_time is %ld\n", rem_time);
142 return;
143 }
144
145 /*
146 * scontrol_print_completing - print jobs in completing state and
147 * associated nodes in COMPLETING or DOWN state
148 */
149 extern void
scontrol_print_completing(void)150 scontrol_print_completing (void)
151 {
152 int error_code, i;
153 job_info_msg_t *job_info_msg;
154 job_info_t *job_info;
155 node_info_msg_t *node_info_msg;
156 uint16_t show_flags = 0;
157
158 error_code = scontrol_load_job (&job_info_msg, 0);
159 if (error_code) {
160 exit_code = 1;
161 if (quiet_flag != 1)
162 slurm_perror ("slurm_load_jobs error");
163 return;
164 }
165 /* Must load all nodes including hidden for cross-index
166 * from job's node_inx to node table to work */
167 /*if (all_flag) Always set this flag */
168 show_flags |= SHOW_ALL;
169 if (federation_flag)
170 show_flags |= SHOW_FEDERATION;
171 if (local_flag)
172 show_flags |= SHOW_LOCAL;
173 error_code = scontrol_load_nodes(&node_info_msg, show_flags);
174 if (error_code) {
175 exit_code = 1;
176 if (quiet_flag != 1)
177 slurm_perror ("slurm_load_nodes error");
178 return;
179 }
180
181 /* Scan the jobs for completing state */
182 job_info = job_info_msg->job_array;
183 for (i = 0; i < job_info_msg->record_count; i++) {
184 if (job_info[i].job_state & JOB_COMPLETING)
185 scontrol_print_completing_job(&job_info[i],
186 node_info_msg);
187 }
188 slurm_free_node_info_msg(node_info_msg);
189 }
190
191 extern void
scontrol_print_completing_job(job_info_t * job_ptr,node_info_msg_t * node_info_msg)192 scontrol_print_completing_job(job_info_t *job_ptr,
193 node_info_msg_t *node_info_msg)
194 {
195 int i, c_offset = 0;
196 node_info_t *node_info;
197 hostlist_t comp_nodes, down_nodes;
198 char *node_buf;
199 char time_str[32];
200 time_t completing_time = 0;
201
202 comp_nodes = hostlist_create(NULL);
203 down_nodes = hostlist_create(NULL);
204
205 if (job_ptr->cluster && federation_flag && !local_flag)
206 c_offset = get_cluster_node_offset(job_ptr->cluster,
207 node_info_msg);
208
209 for (i = 0; job_ptr->node_inx[i] != -1; i+=2) {
210 int j = job_ptr->node_inx[i];
211 for (; j <= job_ptr->node_inx[i+1]; j++) {
212 int node_inx = j + c_offset;
213 if (node_inx >= node_info_msg->record_count)
214 break;
215 node_info = &(node_info_msg->node_array[node_inx]);
216 if (IS_NODE_COMPLETING(node_info))
217 hostlist_push_host(comp_nodes, node_info->name);
218 else if (IS_NODE_DOWN(node_info))
219 hostlist_push_host(down_nodes, node_info->name);
220 }
221 }
222
223 fprintf(stdout, "JobId=%u ", job_ptr->job_id);
224
225 slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str));
226 fprintf(stdout, "EndTime=%s ", time_str);
227
228 completing_time = time(NULL) - job_ptr->end_time;
229 secs2time_str(completing_time, time_str, sizeof(time_str));
230 fprintf(stdout, "CompletingTime=%s ", time_str);
231
232 node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
233 if (node_buf && node_buf[0])
234 fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
235 xfree(node_buf);
236
237 node_buf = hostlist_ranged_string_xmalloc(down_nodes);
238 if (node_buf && node_buf[0])
239 fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
240 xfree(node_buf);
241 fprintf(stdout, "\n");
242
243 hostlist_destroy(comp_nodes);
244 hostlist_destroy(down_nodes);
245 }
246
247 extern uint16_t
scontrol_get_job_state(uint32_t job_id)248 scontrol_get_job_state(uint32_t job_id)
249 {
250 job_info_msg_t * job_buffer_ptr = NULL;
251 int error_code = SLURM_SUCCESS, i;
252 job_info_t *job_ptr = NULL;
253
254 error_code = scontrol_load_job(&job_buffer_ptr, job_id);
255 if (error_code) {
256 exit_code = 1;
257 if (quiet_flag == -1)
258 slurm_perror ("slurm_load_job error");
259 return NO_VAL16;
260 }
261 if (quiet_flag == -1) {
262 char time_str[32];
263 slurm_make_time_str((time_t *)&job_buffer_ptr->last_update,
264 time_str, sizeof(time_str));
265 printf("last_update_time=%s, records=%d\n",
266 time_str, job_buffer_ptr->record_count);
267 }
268
269 job_ptr = job_buffer_ptr->job_array ;
270 for (i = 0; i < job_buffer_ptr->record_count; i++) {
271 if (job_ptr->job_id == job_id)
272 return job_ptr->job_state;
273 }
274 if (quiet_flag == -1)
275 printf("Could not find job %u", job_id);
276 return NO_VAL16;
277 }
278
/*
 * Return true if het_job_offset is NO_VAL (no filtering requested) or
 * equals the het_job_offset stored in the job record.
 */
static bool _het_job_offset_match(job_info_t *job_ptr, uint32_t het_job_offset)
{
	return (het_job_offset == NO_VAL) ||
	       (job_ptr->het_job_offset == het_job_offset);
}
286
/*
 * Return true if array_id is NO_VAL (no filtering requested), equals the
 * record's array_task_id, or is a bit set in the record's array_bitmap.
 */
static bool _task_id_in_job(job_info_t *job_ptr, uint32_t array_id)
{
	bitstr_t *task_map;
	uint32_t task_map_len;

	if ((array_id == NO_VAL) || (job_ptr->array_task_id == array_id))
		return true;

	task_map = (bitstr_t *) job_ptr->array_bitmap;
	if (!task_map)
		return false;

	/* Only test bits that exist in the bitmap */
	task_map_len = bit_size(task_map);
	if (array_id >= task_map_len)
		return false;

	return bit_test(task_map, array_id) ? true : false;
}
306
307 /*
308 * scontrol_print_job - print the specified job's information
309 * IN job_id - job's id or NULL to print information about all jobs
310 */
scontrol_print_job(char * job_id_str)311 extern void scontrol_print_job(char * job_id_str)
312 {
313 int error_code = SLURM_SUCCESS, i, print_cnt = 0;
314 uint32_t job_id = 0;
315 uint32_t array_id = NO_VAL, het_job_offset = NO_VAL;
316 job_info_msg_t * job_buffer_ptr = NULL;
317 job_info_t *job_ptr = NULL;
318 char *end_ptr = NULL;
319
320 if (job_id_str) {
321 char *tmp_job_ptr = job_id_str;
322 /*
323 * Check that the input is a valid job id (i.e. 123 or 123_456).
324 */
325 while (*tmp_job_ptr) {
326 if (!isdigit(*tmp_job_ptr) &&
327 (*tmp_job_ptr != '_') && (*tmp_job_ptr != '+')) {
328 exit_code = 1;
329 slurm_seterrno(ESLURM_INVALID_JOB_ID);
330 if (quiet_flag != 1)
331 slurm_perror("scontrol_print_job error");
332 return;
333 }
334 ++tmp_job_ptr;
335 }
336 job_id = (uint32_t) strtol (job_id_str, &end_ptr, 10);
337 if (end_ptr[0] == '_')
338 array_id = strtol(end_ptr + 1, &end_ptr, 10);
339 if (end_ptr[0] == '+')
340 het_job_offset = strtol(end_ptr + 1, &end_ptr, 10);
341 }
342
343 error_code = scontrol_load_job(&job_buffer_ptr, job_id);
344 if (error_code) {
345 exit_code = 1;
346 if (quiet_flag != 1)
347 slurm_perror ("slurm_load_jobs error");
348 return;
349 }
350 if (quiet_flag == -1) {
351 char time_str[32];
352 slurm_make_time_str ((time_t *)&job_buffer_ptr->last_update,
353 time_str, sizeof(time_str));
354 printf ("last_update_time=%s, records=%d\n",
355 time_str, job_buffer_ptr->record_count);
356 }
357
358 for (i = 0, job_ptr = job_buffer_ptr->job_array;
359 i < job_buffer_ptr->record_count; i++, job_ptr++) {
360 char *save_array_str = NULL;
361 uint32_t save_task_id = 0;
362 if (!_het_job_offset_match(job_ptr, het_job_offset))
363 continue;
364 if (!_task_id_in_job(job_ptr, array_id))
365 continue;
366 if ((array_id != NO_VAL) && job_ptr->array_task_str) {
367 save_array_str = job_ptr->array_task_str;
368 job_ptr->array_task_str = NULL;
369 save_task_id = job_ptr->array_task_id;
370 job_ptr->array_task_id = array_id;
371 }
372 slurm_print_job_info(stdout, job_ptr, one_liner);
373 if (save_array_str) {
374 job_ptr->array_task_str = save_array_str;
375 job_ptr->array_task_id = save_task_id;
376 }
377 print_cnt++;
378 }
379
380 if (print_cnt == 0) {
381 if (job_id_str) {
382 exit_code = 1;
383 if (quiet_flag != 1) {
384 if (array_id != NO_VAL) {
385 printf("Job %u_%u not found\n",
386 job_id, array_id);
387 } else if (het_job_offset != NO_VAL) {
388 printf("Job %u+%u not found\n",
389 job_id, het_job_offset);
390 } else {
391 printf("Job %u not found\n", job_id);
392 }
393 }
394 } else if (quiet_flag != 1)
395 printf ("No jobs in the system\n");
396 }
397 }
398
399 /*
400 * scontrol_print_step - print the specified job step's information
401 * IN job_step_id_str - job step's id or NULL to print information
402 * about all job steps
403 */
404 extern void
scontrol_print_step(char * job_step_id_str)405 scontrol_print_step (char *job_step_id_str)
406 {
407 int error_code, i, print_cnt = 0;
408 uint32_t job_id = NO_VAL, step_id = NO_VAL;
409 uint32_t array_id = NO_VAL;
410 char *next_str;
411 job_step_info_response_msg_t *job_step_info_ptr;
412 job_step_info_t * job_step_ptr;
413 static uint32_t last_job_id = 0, last_array_id, last_step_id = 0;
414 static job_step_info_response_msg_t *old_job_step_info_ptr = NULL;
415 static uint16_t last_show_flags = 0xffff;
416 uint16_t show_flags = 0;
417
418 if (job_step_id_str) {
419 job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10);
420 if (next_str[0] == '_')
421 array_id = (uint32_t) strtol(next_str+1, &next_str, 10);
422 else if (next_str[0] == '.')
423 step_id = (uint32_t) strtol (next_str+1, NULL, 10);
424 }
425
426 if (all_flag)
427 show_flags |= SHOW_ALL;
428 if (local_flag)
429 show_flags |= SHOW_LOCAL;
430
431 if ((old_job_step_info_ptr) && (last_job_id == job_id) &&
432 (last_array_id == array_id) && (last_step_id == step_id)) {
433 if (last_show_flags != show_flags)
434 old_job_step_info_ptr->last_update = (time_t) 0;
435 error_code = slurm_get_job_steps(
436 old_job_step_info_ptr->last_update,
437 job_id, step_id, &job_step_info_ptr,
438 show_flags);
439 if (error_code == SLURM_SUCCESS)
440 slurm_free_job_step_info_response_msg (
441 old_job_step_info_ptr);
442 else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
443 job_step_info_ptr = old_job_step_info_ptr;
444 error_code = SLURM_SUCCESS;
445 if (quiet_flag == -1)
446 printf("slurm_get_job_steps no change in data\n");
447 }
448 } else {
449 if (old_job_step_info_ptr) {
450 slurm_free_job_step_info_response_msg (
451 old_job_step_info_ptr);
452 old_job_step_info_ptr = NULL;
453 }
454 error_code = slurm_get_job_steps ( (time_t) 0, job_id, step_id,
455 &job_step_info_ptr,
456 show_flags);
457 }
458
459 if (error_code) {
460 exit_code = 1;
461 if (quiet_flag != 1)
462 slurm_perror ("slurm_get_job_steps error");
463 return;
464 }
465
466 old_job_step_info_ptr = job_step_info_ptr;
467 last_show_flags = show_flags;
468 last_job_id = job_id;
469 last_step_id = step_id;
470
471 if (quiet_flag == -1) {
472 char time_str[32];
473 slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update,
474 time_str, sizeof(time_str));
475 printf ("last_update_time=%s, records=%d\n",
476 time_str, job_step_info_ptr->job_step_count);
477 }
478
479 for (i = 0, job_step_ptr = job_step_info_ptr->job_steps;
480 i < job_step_info_ptr->job_step_count; i++, job_step_ptr++) {
481 if ((array_id != NO_VAL) &&
482 (array_id != job_step_ptr->array_task_id))
483 continue;
484 slurm_print_job_step_info(stdout, job_step_ptr, one_liner);
485 print_cnt++;
486 }
487
488 if (print_cnt == 0) {
489 if (job_step_id_str) {
490 exit_code = 1;
491 if (quiet_flag != 1) {
492 if (array_id == NO_VAL) {
493 printf ("Job step %u.%u not found\n",
494 job_id, step_id);
495 } else {
496 printf ("Job step %u_%u.%u not found\n",
497 job_id, array_id, step_id);
498 }
499 }
500 } else if (quiet_flag != 1)
501 printf ("No job steps in the system\n");
502 }
503 }
504
/*
 * Extract the job id from a "jobid[.stepid]" string.
 * IN jobid_str - string to parse
 * OUT out_jobid - parsed job id, only written on success
 * RET 1 on success, 0 on failure to find a jobid in the string
 */
static int _parse_jobid(const char *jobid_str, uint32_t *out_jobid)
{
	char *copy = xstrdup(jobid_str);
	char *end = xstrchr(copy, '.');
	long value;
	int found = 0;

	/* Only the text in front of the first '.' is the job id */
	if (end)
		*end = '\0';

	value = strtol(copy, &end, 10);
	if (xstring_is_whitespace(end)) {
		*out_jobid = (uint32_t) value;
		found = 1;
	} else {
		fprintf(stderr, "\"%s\" does not look like a jobid\n", copy);
	}

	xfree(copy);
	return found;
}
528
/*
 * Extract the step id from a "jobid[.stepid]" string.
 * IN jobid_str - string to parse
 * OUT out_stepid - parsed step id, only written on success
 * RET 1 on success, 0 on failure to find a stepid in the string
 */
static int _parse_stepid(const char *jobid_str, uint32_t *out_stepid)
{
	char *copy = xstrdup(jobid_str);
	char *dot = xstrchr(copy, '.');
	int found = 0;

	/* Without a period there is no step ID in this string */
	if (dot) {
		char *step = dot + 1;
		char *end = NULL;
		long value = strtol(step, &end, 10);

		if (xstring_is_whitespace(end)) {
			*out_stepid = (uint32_t) value;
			found = 1;
		} else {
			fprintf(stderr,
				"\"%s\" does not look like a stepid\n", step);
		}
	}

	xfree(copy);
	return found;
}
556
557
558 static bool
_in_task_array(pid_t pid,slurmstepd_task_info_t * task_array,uint32_t task_array_count)559 _in_task_array(pid_t pid, slurmstepd_task_info_t *task_array,
560 uint32_t task_array_count)
561 {
562 int i;
563
564 for (i = 0; i < task_array_count; i++) {
565 if (pid == task_array[i].pid)
566 return true;
567 }
568
569 return false;
570 }
571
572
573 static void
_list_pids_one_step(const char * node_name,uint32_t jobid,uint32_t stepid)574 _list_pids_one_step(const char *node_name, uint32_t jobid, uint32_t stepid)
575 {
576 int fd;
577 slurmstepd_task_info_t *task_info = NULL;
578 uint32_t *pids = NULL;
579 uint32_t count = 0;
580 uint32_t tcount = 0;
581 int i;
582 uint16_t protocol_version;
583
584 fd = stepd_connect(NULL, node_name, jobid, stepid, &protocol_version);
585 if (fd == -1) {
586 exit_code = 1;
587 if (errno == ENOENT) {
588 fprintf(stderr,
589 "Job step %u.%u does not exist on this node.\n",
590 jobid, stepid);
591 exit_code = 1;
592 } else {
593 perror("Unable to connect to slurmstepd");
594 }
595 return;
596 }
597
598 stepd_task_info(fd, protocol_version, &task_info, &tcount);
599 for (i = 0; i < (int)tcount; i++) {
600 if (!task_info[i].exited) {
601 if (stepid == NO_VAL)
602 printf("%-8d %-8u %-6s %-7d %-8d\n",
603 task_info[i].pid,
604 jobid,
605 "batch",
606 task_info[i].id,
607 task_info[i].gtid);
608 else
609 printf("%-8d %-8u %-6u %-7d %-8d\n",
610 task_info[i].pid,
611 jobid,
612 stepid,
613 task_info[i].id,
614 task_info[i].gtid);
615 }
616 }
617
618 stepd_list_pids(fd, protocol_version, &pids, &count);
619 for (i = 0; i < count; i++) {
620 if (!_in_task_array((pid_t)pids[i], task_info, tcount)) {
621 if (stepid == NO_VAL)
622 printf("%-8d %-8u %-6s %-7s %-8s\n",
623 pids[i], jobid, "batch", "-", "-");
624 else
625 printf("%-8d %-8u %-6u %-7s %-8s\n",
626 pids[i], jobid, stepid, "-", "-");
627 }
628 }
629
630 xfree(pids);
631 xfree(task_info);
632 close(fd);
633 }
634
635 static void
_list_pids_all_steps(const char * node_name,uint32_t jobid)636 _list_pids_all_steps(const char *node_name, uint32_t jobid)
637 {
638 List steps;
639 ListIterator itr;
640 step_loc_t *stepd;
641 int count = 0;
642
643 steps = stepd_available(NULL, node_name);
644 if (!steps || list_count(steps) == 0) {
645 fprintf(stderr, "Job %u does not exist on this node.\n", jobid);
646 FREE_NULL_LIST(steps);
647 exit_code = 1;
648 return;
649 }
650
651 itr = list_iterator_create(steps);
652 while ((stepd = list_next(itr))) {
653 if (jobid == stepd->jobid) {
654 _list_pids_one_step(stepd->nodename, stepd->jobid,
655 stepd->stepid);
656 count++;
657 }
658 }
659 list_iterator_destroy(itr);
660 FREE_NULL_LIST(steps);
661
662 if (count == 0) {
663 fprintf(stderr, "Job %u does not exist on this node.\n",
664 jobid);
665 exit_code = 1;
666 }
667 }
668
669 static void
_list_pids_all_jobs(const char * node_name)670 _list_pids_all_jobs(const char *node_name)
671 {
672 List steps;
673 ListIterator itr;
674 step_loc_t *stepd;
675
676 steps = stepd_available(NULL, node_name);
677 if (!steps || list_count(steps) == 0) {
678 fprintf(stderr, "No job steps exist on this node.\n");
679 FREE_NULL_LIST(steps);
680 exit_code = 1;
681 return;
682 }
683
684 itr = list_iterator_create(steps);
685 while((stepd = list_next(itr))) {
686 _list_pids_one_step(stepd->nodename, stepd->jobid,
687 stepd->stepid);
688 }
689 list_iterator_destroy(itr);
690 FREE_NULL_LIST(steps);
691 }
692
693 /*
694 * scontrol_list_pids - given a slurmd job ID or job ID + step ID,
695 * print the process IDs of the processes each job step (or
696 * just the specified step ID).
697 * IN jobid_str - string representing a jobid: jobid[.stepid]
698 * IN node_name - May be NULL, in which case it will attempt to
699 * determine the NodeName of the local host on its own.
700 * This is mostly of use when multiple-slurmd support is in use,
701 * because if NULL is used when there are multiple slurmd on the
702 * node, one of them will be selected more-or-less at random.
703 */
704 extern void
scontrol_list_pids(const char * jobid_str,const char * node_name)705 scontrol_list_pids(const char *jobid_str, const char *node_name)
706 {
707 uint32_t jobid = 0, stepid = 0;
708
709 /* Job ID is optional */
710 if (jobid_str != NULL
711 && jobid_str[0] != '*'
712 && !_parse_jobid(jobid_str, &jobid)) {
713 exit_code = 1;
714 return;
715 }
716
717 /* Step ID is optional */
718 printf("%-8s %-8s %-6s %-7s %-8s\n",
719 "PID", "JOBID", "STEPID", "LOCALID", "GLOBALID");
720 if (jobid_str == NULL || jobid_str[0] == '*') {
721 _list_pids_all_jobs(node_name);
722 } else if (_parse_stepid(jobid_str, &stepid)) {
723 _list_pids_one_step(node_name, jobid, stepid);
724 } else {
725 _list_pids_all_steps(node_name, jobid);
726 }
727 }
728
scontrol_getent(const char * node_name)729 extern void scontrol_getent(const char *node_name)
730 {
731 List steps = NULL;
732 ListIterator itr = NULL;
733 step_loc_t *stepd;
734 int fd;
735 struct passwd *pwd = NULL;
736 struct group **grps = NULL;
737
738 if (!(steps = stepd_available(NULL, node_name))) {
739 fprintf(stderr, "No steps found on this node\n");
740 return;
741 }
742
743 itr = list_iterator_create(steps);
744 while ((stepd = list_next(itr))) {
745 fd = stepd_connect(NULL, node_name, stepd->jobid,
746 stepd->stepid,
747 &stepd->protocol_version);
748
749 if (fd < 0)
750 continue;
751 pwd = stepd_getpw(fd, stepd->protocol_version,
752 GETPW_MATCH_ALWAYS, 0, NULL);
753
754 if (!pwd) {
755 close(fd);
756 continue;
757 }
758
759 if (stepd->stepid == SLURM_EXTERN_CONT)
760 printf("JobId=%u.Extern:\nUser:\n", stepd->jobid);
761 else if (stepd->stepid == SLURM_BATCH_SCRIPT)
762 printf("JobId=%u.Batch:\nUser:\n", stepd->jobid);
763 else
764 printf("JobId=%u.%u:\nUser:\n",
765 stepd->jobid, stepd->stepid);
766
767 printf("%s:%s:%u:%u:%s:%s:%s\nGroups:\n",
768 pwd->pw_name, pwd->pw_passwd, pwd->pw_uid, pwd->pw_gid,
769 pwd->pw_gecos, pwd->pw_dir, pwd->pw_shell);
770
771 xfree_struct_passwd(pwd);
772
773 grps = stepd_getgr(fd, stepd->protocol_version,
774 GETGR_MATCH_ALWAYS, 0, NULL);
775 if (!grps) {
776 close(fd);
777 printf("\n");
778 continue;
779 }
780
781 for (int i = 0; grps[i]; i++) {
782 printf("%s:%s:%u:%s\n",
783 grps[i]->gr_name, grps[i]->gr_passwd,
784 grps[i]->gr_gid,
785 (grps[i]->gr_mem) ? grps[i]->gr_mem[0] : "");
786 }
787 close(fd);
788 xfree_struct_group_array(grps);
789 printf("\n");
790 }
791 list_iterator_destroy(itr);
792 FREE_NULL_LIST(steps);
793 }
794
795 /*
796 * scontrol_print_hosts - given a node list expression, return
797 * a list of nodes, one per line
798 */
799 extern void
scontrol_print_hosts(char * node_list)800 scontrol_print_hosts (char * node_list)
801 {
802 hostlist_t hl;
803 char *host;
804
805 if (!node_list) {
806 error("host list is empty");
807 return;
808 }
809 hl = hostlist_create_dims(node_list, 0);
810 if (!hl) {
811 fprintf(stderr, "Invalid hostlist: %s\n", node_list);
812 return;
813 }
814 while ((host = hostlist_shift_dims(hl, 0))) {
815 printf("%s\n", host);
816 free(host);
817 }
818 hostlist_destroy(hl);
819 }
820
/*
 * Rewrite hostlist in place: every '\n' becomes ',' and each run of
 * consecutive commas (including converted newlines) collapses to one.
 */
static void
_reformat_hostlist(char *hostlist)
{
	const char *src;
	char *dst = hostlist;

	/* dst never overtakes src, so src[1] is always still unmodified */
	for (src = hostlist; *src != '\0'; src++) {
		char cur = (*src == '\n') ? ',' : *src;
		char next = (src[1] == '\n') ? ',' : src[1];

		/* Keep only the last separator of a run */
		if ((cur == ',') && (next == ','))
			continue;
		*dst++ = cur;
	}
	*dst = '\0';
}
839
840 /*
841 * scontrol_encode_hostlist - given a list of hostnames or the pathname
842 * of a file containing hostnames, translate them into a hostlist
843 * expression
844 */
845 extern int
scontrol_encode_hostlist(char * hostlist,bool sorted)846 scontrol_encode_hostlist(char *hostlist, bool sorted)
847 {
848 char *io_buf = NULL, *tmp_list, *ranged_string;
849 int buf_size = 1024 * 1024;
850 hostlist_t hl;
851
852 if (!hostlist) {
853 fprintf(stderr, "Hostlist is NULL\n");
854 return SLURM_ERROR;
855 }
856
857 if (hostlist[0] == '/') {
858 ssize_t buf_read;
859 int fd = open(hostlist, O_RDONLY);
860 if (fd < 0) {
861 fprintf(stderr, "Can not open %s\n", hostlist);
862 return SLURM_ERROR;
863 }
864 io_buf = xmalloc(buf_size);
865 buf_read = read(fd, io_buf, buf_size);
866 close(fd);
867 if (buf_read >= buf_size) {
868 /* If over 1MB, the file is almost certainly invalid */
869 fprintf(stderr, "File %s is too large\n", hostlist);
870 xfree(io_buf);
871 return SLURM_ERROR;
872 }
873 io_buf[buf_read] = '\0';
874 _reformat_hostlist(io_buf);
875 tmp_list = io_buf;
876 } else
877 tmp_list = hostlist;
878
879 hl = hostlist_create(tmp_list);
880 if (hl == NULL) {
881 fprintf(stderr, "Invalid hostlist: %s\n", tmp_list);
882 xfree(io_buf);
883 return SLURM_ERROR;
884 }
885 if (sorted)
886 hostlist_sort(hl);
887 ranged_string = hostlist_ranged_string_xmalloc(hl);
888 printf("%s\n", ranged_string);
889 hostlist_destroy(hl);
890 xfree(ranged_string);
891 xfree(io_buf);
892 return SLURM_SUCCESS;
893 }
894
_wait_nodes_ready(uint32_t job_id)895 static int _wait_nodes_ready(uint32_t job_id)
896 {
897 int is_ready = SLURM_ERROR, i, rc = 0;
898 int cur_delay = 0;
899 int suspend_time, resume_time, max_delay;
900
901 suspend_time = slurm_get_suspend_timeout();
902 resume_time = slurm_get_resume_timeout();
903 if ((suspend_time == 0) || (resume_time == 0))
904 return SLURM_SUCCESS; /* Power save mode disabled */
905 max_delay = suspend_time + resume_time;
906 max_delay *= 5; /* Allow for ResumeRate support */
907
908 for (i=0; (cur_delay < max_delay); i++) {
909 if (i) {
910 if (i == 1)
911 info("Waiting for nodes to boot");
912 sleep(POLL_SLEEP);
913 cur_delay += POLL_SLEEP;
914 }
915
916 rc = slurm_job_node_ready(job_id);
917
918 if (rc == READY_JOB_FATAL)
919 break; /* fatal error */
920 if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
921 continue; /* retry */
922 if ((rc & READY_JOB_STATE) == 0) /* job killed */
923 break;
924 if (rc & READY_NODE_STATE) { /* job and node ready */
925 is_ready = SLURM_SUCCESS;
926 break;
927 }
928 }
929 if (is_ready == SLURM_SUCCESS)
930 info("Nodes are ready for job %u", job_id);
931 else if ((rc & READY_JOB_STATE) == 0)
932 info("Job %u no longer running", job_id);
933 else
934 info("Problem running job %u", job_id);
935
936 return is_ready;
937 }
938
939 /*
940 * Wait until a job is ready to execute or enters some failed state
941 * RET 1: job ready to run
942 * 0: job can't run (cancelled, failure state, timeout, etc.)
943 */
scontrol_job_ready(char * job_id_str)944 extern int scontrol_job_ready(char *job_id_str)
945 {
946 uint32_t job_id;
947
948 job_id = atoi(job_id_str);
949 if (job_id <= 0) {
950 fprintf(stderr, "Invalid job_id %s", job_id_str);
951 return SLURM_ERROR;
952 }
953
954 return _wait_nodes_ready(job_id);
955 }
956
scontrol_callerid(int argc,char ** argv)957 extern int scontrol_callerid(int argc, char **argv)
958 {
959 int af, ver = 4;
960 unsigned char ip_src[sizeof(struct in6_addr)],
961 ip_dst[sizeof(struct in6_addr)];
962 uint32_t port_src, port_dst, job_id;
963 network_callerid_msg_t req;
964 char node_name[MAXHOSTNAMELEN], *ptr;
965
966 if (argc == 5) {
967 ver = strtoul(argv[4], &ptr, 0);
968 if (ptr && ptr[0]) {
969 error("Address family not an integer");
970 return SLURM_ERROR;
971 }
972 }
973
974 if (ver != 4 && ver != 6) {
975 error("Invalid address family: %d", ver);
976 return SLURM_ERROR;
977 }
978
979 af = ver == 4 ? AF_INET : AF_INET6;
980 if (!inet_pton(af, argv[0], ip_src)) {
981 error("inet_pton failed for '%s'", argv[0]);
982 return SLURM_ERROR;
983 }
984
985 port_src = strtoul(argv[1], &ptr, 0);
986 if (ptr && ptr[0]) {
987 error("Source port not an integer");
988 return SLURM_ERROR;
989 }
990
991 if (!inet_pton(af, argv[2], ip_dst)) {
992 error("scontrol_callerid: inet_pton failed for '%s'", argv[2]);
993 return SLURM_ERROR;
994 }
995
996 port_dst = strtoul(argv[3], &ptr, 0);
997 if (ptr && ptr[0]) {
998 error("Destination port not an integer");
999 return SLURM_ERROR;
1000 }
1001
1002 memcpy(req.ip_src, ip_src, 16);
1003 memcpy(req.ip_dst, ip_dst, 16);
1004 req.port_src = port_src;
1005 req.port_dst = port_dst;
1006 req.af = af;
1007
1008 if (slurm_network_callerid(req, &job_id, node_name, MAXHOSTNAMELEN)
1009 != SLURM_SUCCESS) {
1010 fprintf(stderr,
1011 "slurm_network_callerid: unable to retrieve callerid data from remote slurmd\n");
1012 return SLURM_ERROR;
1013 } else if (job_id == NO_VAL) {
1014 fprintf(stderr,
1015 "slurm_network_callerid: remote job id indeterminate\n");
1016 return SLURM_ERROR;
1017 } else {
1018 printf("%u %s\n", job_id, node_name);
1019 return SLURM_SUCCESS;
1020 }
1021 }
1022
scontrol_batch_script(int argc,char ** argv)1023 extern int scontrol_batch_script(int argc, char **argv)
1024 {
1025 char *filename;
1026 FILE *out;
1027 int exit_code;
1028 uint32_t jobid;
1029
1030 if (argc < 1)
1031 return SLURM_ERROR;
1032
1033 jobid = atoll(argv[0]);
1034
1035 if (argc > 1)
1036 filename = xstrdup(argv[1]);
1037 else
1038 filename = xstrdup_printf("slurm-%u.sh", jobid);
1039
1040 if (!xstrcmp(filename, "-")) {
1041 out = stdout;
1042 } else {
1043 if (!(out = fopen(filename, "w"))) {
1044 fprintf(stderr, "failed to open file `%s`: %m\n",
1045 filename);
1046 xfree(filename);
1047 return errno;
1048 }
1049 }
1050
1051 exit_code = slurm_job_batch_script(out, jobid);
1052
1053 if (out != stdout)
1054 fclose(out);
1055
1056 if (exit_code != SLURM_SUCCESS) {
1057 if (out != stdout)
1058 unlink(filename);
1059 slurm_perror("job script retrieval failed");
1060 } else if ((out != stdout) && (quiet_flag != 1)) {
1061 printf("batch script for job %u written to %s\n",
1062 jobid, filename);
1063 }
1064
1065 xfree(filename);
1066 return exit_code;
1067 }
1068