1 /*****************************************************************************\
2  *  info_job.c - job information functions for scontrol.
3  *****************************************************************************
4  *  Copyright (C) 2002-2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7  *  Written by Morris Jette <jette1@llnl.gov>
8  *  CODE-OCEC-09-009. All rights reserved.
9  *
10  *  This file is part of Slurm, a resource management program.
11  *  For details, see <https://slurm.schedmd.com/>.
12  *  Please also read the included file: DISCLAIMER.
13  *
14  *  Slurm is free software; you can redistribute it and/or modify it under
15  *  the terms of the GNU General Public License as published by the Free
16  *  Software Foundation; either version 2 of the License, or (at your option)
17  *  any later version.
18  *
19  *  In addition, as a special exception, the copyright holders give permission
20  *  to link the code of portions of this program with the OpenSSL library under
21  *  certain conditions as described in each individual source file, and
22  *  distribute linked combinations including the two. You must obey the GNU
23  *  General Public License in all respects for all of the code used other than
24  *  OpenSSL. If you modify file(s) with this exception, you may extend this
25  *  exception to your version of the file(s), but you are not obligated to do
26  *  so. If you do not wish to do so, delete this exception statement from your
27  *  version.  If you delete this exception statement from all source files in
28  *  the program, then also delete it here.
29  *
30  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
31  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
33  *  details.
34  *
35  *  You should have received a copy of the GNU General Public License along
36  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
37  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
38 \*****************************************************************************/
39 #include <arpa/inet.h>
40 #include <grp.h>
41 #include <fcntl.h>
42 #include <sys/stat.h>
43 #include <sys/types.h>
44 
45 #include "scontrol.h"
46 #include "src/common/bitstring.h"
47 #include "src/common/slurm_time.h"
48 #include "src/common/stepd_api.h"
49 
50 #define POLL_SLEEP	3	/* retry interval in seconds  */
51 
52 /* Load current job table information into *job_buffer_pptr */
53 extern int
scontrol_load_job(job_info_msg_t ** job_buffer_pptr,uint32_t job_id)54 scontrol_load_job(job_info_msg_t ** job_buffer_pptr, uint32_t job_id)
55 {
56 	int error_code;
57 	static uint16_t last_show_flags = 0xffff;
58 	uint16_t show_flags = 0;
59 	job_info_msg_t * job_info_ptr = NULL;
60 
61 	if (all_flag)
62 		show_flags |= SHOW_ALL;
63 
64 	if (detail_flag)
65 		show_flags |= SHOW_DETAIL;
66 	if (federation_flag)
67 		show_flags |= SHOW_FEDERATION;
68 	if (local_flag)
69 		show_flags |= SHOW_LOCAL;
70 	if (sibling_flag)
71 		show_flags |= SHOW_FEDERATION | SHOW_SIBLING;
72 
73 	if (old_job_info_ptr) {
74 		if (last_show_flags != show_flags)
75 			old_job_info_ptr->last_update = (time_t) 0;
76 		if (job_id) {
77 			error_code = slurm_load_job(&job_info_ptr, job_id,
78 						    show_flags);
79 		} else {
80 			error_code = slurm_load_jobs(
81 				old_job_info_ptr->last_update,
82 				&job_info_ptr, show_flags);
83 		}
84 		if (error_code == SLURM_SUCCESS)
85 			slurm_free_job_info_msg (old_job_info_ptr);
86 		else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
87 			job_info_ptr = old_job_info_ptr;
88 			error_code = SLURM_SUCCESS;
89 			if (quiet_flag == -1)
90  				printf ("slurm_load_jobs no change in data\n");
91 		}
92 	} else if (job_id) {
93 		error_code = slurm_load_job(&job_info_ptr, job_id, show_flags);
94 	} else {
95 		error_code = slurm_load_jobs((time_t) NULL, &job_info_ptr,
96 					     show_flags);
97 	}
98 
99 	if (error_code == SLURM_SUCCESS) {
100 		old_job_info_ptr = job_info_ptr;
101 		if (job_id)
102 			old_job_info_ptr->last_update = (time_t) 0;
103 		last_show_flags  = show_flags;
104 		*job_buffer_pptr = job_info_ptr;
105 	}
106 
107 	return error_code;
108 }
109 
110 /*
111  * scontrol_pid_info - given a local process id, print the corresponding
112  *	slurm job id and its expected end time
113  * IN job_pid - the local process id of interest
114  */
115 extern void
scontrol_pid_info(pid_t job_pid)116 scontrol_pid_info(pid_t job_pid)
117 {
118 	int error_code;
119 	uint32_t job_id = 0;
120 	time_t end_time;
121 	long rem_time;
122 
123 	error_code = slurm_pid2jobid(job_pid, &job_id);
124 	if (error_code) {
125 		exit_code = 1;
126 		if (quiet_flag != 1)
127 			slurm_perror ("slurm_pid2jobid error");
128 		return;
129 	}
130 
131 	error_code = slurm_get_end_time(job_id, &end_time);
132 	if (error_code) {
133 		exit_code = 1;
134 		if (quiet_flag != 1)
135 			slurm_perror ("slurm_get_end_time error");
136 		return;
137 	}
138 	printf("Slurm job id %u ends at %s\n", job_id, slurm_ctime2(&end_time));
139 
140 	rem_time = slurm_get_rem_time(job_id);
141 	printf("slurm_get_rem_time is %ld\n", rem_time);
142 	return;
143 }
144 
145 /*
146  * scontrol_print_completing - print jobs in completing state and
147  *	associated nodes in COMPLETING or DOWN state
148  */
149 extern void
scontrol_print_completing(void)150 scontrol_print_completing (void)
151 {
152 	int error_code, i;
153 	job_info_msg_t  *job_info_msg;
154 	job_info_t      *job_info;
155 	node_info_msg_t *node_info_msg;
156 	uint16_t         show_flags = 0;
157 
158 	error_code = scontrol_load_job (&job_info_msg, 0);
159 	if (error_code) {
160 		exit_code = 1;
161 		if (quiet_flag != 1)
162 			slurm_perror ("slurm_load_jobs error");
163 		return;
164 	}
165 	/* Must load all nodes including hidden for cross-index
166 	 * from job's node_inx to node table to work */
167 	/*if (all_flag)		Always set this flag */
168 	show_flags |= SHOW_ALL;
169 	if (federation_flag)
170 		show_flags |= SHOW_FEDERATION;
171 	if (local_flag)
172 		show_flags |= SHOW_LOCAL;
173 	error_code = scontrol_load_nodes(&node_info_msg, show_flags);
174 	if (error_code) {
175 		exit_code = 1;
176 		if (quiet_flag != 1)
177 			slurm_perror ("slurm_load_nodes error");
178 		return;
179 	}
180 
181 	/* Scan the jobs for completing state */
182 	job_info = job_info_msg->job_array;
183 	for (i = 0; i < job_info_msg->record_count; i++) {
184 		if (job_info[i].job_state & JOB_COMPLETING)
185 			scontrol_print_completing_job(&job_info[i],
186 						      node_info_msg);
187 	}
188 	slurm_free_node_info_msg(node_info_msg);
189 }
190 
191 extern void
scontrol_print_completing_job(job_info_t * job_ptr,node_info_msg_t * node_info_msg)192 scontrol_print_completing_job(job_info_t *job_ptr,
193 			      node_info_msg_t *node_info_msg)
194 {
195 	int i, c_offset = 0;
196 	node_info_t *node_info;
197 	hostlist_t comp_nodes, down_nodes;
198 	char *node_buf;
199 	char time_str[32];
200 	time_t completing_time = 0;
201 
202 	comp_nodes = hostlist_create(NULL);
203 	down_nodes = hostlist_create(NULL);
204 
205 	if (job_ptr->cluster && federation_flag && !local_flag)
206 		c_offset = get_cluster_node_offset(job_ptr->cluster,
207 						   node_info_msg);
208 
209 	for (i = 0; job_ptr->node_inx[i] != -1; i+=2) {
210 		int j = job_ptr->node_inx[i];
211 		for (; j <= job_ptr->node_inx[i+1]; j++) {
212 			int node_inx = j + c_offset;
213 			if (node_inx >= node_info_msg->record_count)
214 				break;
215 			node_info = &(node_info_msg->node_array[node_inx]);
216 			if (IS_NODE_COMPLETING(node_info))
217 				hostlist_push_host(comp_nodes, node_info->name);
218 			else if (IS_NODE_DOWN(node_info))
219 				hostlist_push_host(down_nodes, node_info->name);
220 		}
221 	}
222 
223 	fprintf(stdout, "JobId=%u ", job_ptr->job_id);
224 
225 	slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str));
226 	fprintf(stdout, "EndTime=%s ", time_str);
227 
228 	completing_time = time(NULL) - job_ptr->end_time;
229 	secs2time_str(completing_time, time_str, sizeof(time_str));
230 	fprintf(stdout, "CompletingTime=%s ", time_str);
231 
232 	node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
233 	if (node_buf && node_buf[0])
234 		fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
235 	xfree(node_buf);
236 
237 	node_buf = hostlist_ranged_string_xmalloc(down_nodes);
238 	if (node_buf && node_buf[0])
239 		fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
240 	xfree(node_buf);
241 	fprintf(stdout, "\n");
242 
243 	hostlist_destroy(comp_nodes);
244 	hostlist_destroy(down_nodes);
245 }
246 
247 extern uint16_t
scontrol_get_job_state(uint32_t job_id)248 scontrol_get_job_state(uint32_t job_id)
249 {
250 	job_info_msg_t * job_buffer_ptr = NULL;
251 	int error_code = SLURM_SUCCESS, i;
252 	job_info_t *job_ptr = NULL;
253 
254 	error_code = scontrol_load_job(&job_buffer_ptr, job_id);
255 	if (error_code) {
256 		exit_code = 1;
257 		if (quiet_flag == -1)
258 			slurm_perror ("slurm_load_job error");
259 		return NO_VAL16;
260 	}
261 	if (quiet_flag == -1) {
262 		char time_str[32];
263 		slurm_make_time_str((time_t *)&job_buffer_ptr->last_update,
264 				    time_str, sizeof(time_str));
265 		printf("last_update_time=%s, records=%d\n",
266 		       time_str, job_buffer_ptr->record_count);
267 	}
268 
269 	job_ptr = job_buffer_ptr->job_array ;
270 	for (i = 0; i < job_buffer_ptr->record_count; i++) {
271 		if (job_ptr->job_id == job_id)
272 			return job_ptr->job_state;
273 	}
274 	if (quiet_flag == -1)
275 		printf("Could not find job %u", job_id);
276 	return NO_VAL16;
277 }
278 
/*
 * Test a job record against a requested heterogeneous job offset.
 * IN job_ptr - job record to test
 * IN het_job_offset - requested offset, or NO_VAL to accept any record
 * RET true if the record passes the filter
 */
static bool _het_job_offset_match(job_info_t *job_ptr, uint32_t het_job_offset)
{
	if (het_job_offset == NO_VAL)
		return true;

	return (job_ptr->het_job_offset == het_job_offset);
}
286 
/*
 * Test whether a job record covers a requested array task id.
 * IN job_ptr - job record to test
 * IN array_id - requested task id, or NO_VAL to accept any record
 * RET true if array_id is NO_VAL, equals the record's array_task_id, or is
 *	set in the record's array_bitmap
 */
static bool _task_id_in_job(job_info_t *job_ptr, uint32_t array_id)
{
	bitstr_t *task_bitmap;
	uint32_t bitmap_len;

	if (array_id == NO_VAL)
		return true;
	if (job_ptr->array_task_id == array_id)
		return true;

	/* Not a direct match: consult the bitmap of tasks in this record */
	task_bitmap = (bitstr_t *) job_ptr->array_bitmap;
	if (!task_bitmap)
		return false;

	bitmap_len = bit_size(task_bitmap);
	if (array_id >= bitmap_len)
		return false;

	return bit_test(task_bitmap, array_id) ? true : false;
}
306 
307 /*
308  * scontrol_print_job - print the specified job's information
309  * IN job_id - job's id or NULL to print information about all jobs
310  */
scontrol_print_job(char * job_id_str)311 extern void scontrol_print_job(char * job_id_str)
312 {
313 	int error_code = SLURM_SUCCESS, i, print_cnt = 0;
314 	uint32_t job_id = 0;
315 	uint32_t array_id = NO_VAL, het_job_offset = NO_VAL;
316 	job_info_msg_t * job_buffer_ptr = NULL;
317 	job_info_t *job_ptr = NULL;
318 	char *end_ptr = NULL;
319 
320 	if (job_id_str) {
321 		char *tmp_job_ptr = job_id_str;
322 		/*
323 		 * Check that the input is a valid job id (i.e. 123 or 123_456).
324 		 */
325 		while (*tmp_job_ptr) {
326 			if (!isdigit(*tmp_job_ptr) &&
327 			    (*tmp_job_ptr != '_') && (*tmp_job_ptr != '+')) {
328 				exit_code = 1;
329 				slurm_seterrno(ESLURM_INVALID_JOB_ID);
330 				if (quiet_flag != 1)
331 					slurm_perror("scontrol_print_job error");
332 				return;
333 			}
334 			++tmp_job_ptr;
335 		}
336 		job_id = (uint32_t) strtol (job_id_str, &end_ptr, 10);
337 		if (end_ptr[0] == '_')
338 			array_id = strtol(end_ptr + 1, &end_ptr, 10);
339 		if (end_ptr[0] == '+')
340 			het_job_offset = strtol(end_ptr + 1, &end_ptr, 10);
341 	}
342 
343 	error_code = scontrol_load_job(&job_buffer_ptr, job_id);
344 	if (error_code) {
345 		exit_code = 1;
346 		if (quiet_flag != 1)
347 			slurm_perror ("slurm_load_jobs error");
348 		return;
349 	}
350 	if (quiet_flag == -1) {
351 		char time_str[32];
352 		slurm_make_time_str ((time_t *)&job_buffer_ptr->last_update,
353 				     time_str, sizeof(time_str));
354 		printf ("last_update_time=%s, records=%d\n",
355 			time_str, job_buffer_ptr->record_count);
356 	}
357 
358 	for (i = 0, job_ptr = job_buffer_ptr->job_array;
359 	     i < job_buffer_ptr->record_count; i++, job_ptr++) {
360 		char *save_array_str = NULL;
361 		uint32_t save_task_id = 0;
362 		if (!_het_job_offset_match(job_ptr, het_job_offset))
363 			continue;
364 		if (!_task_id_in_job(job_ptr, array_id))
365 			continue;
366 		if ((array_id != NO_VAL) && job_ptr->array_task_str) {
367 			save_array_str = job_ptr->array_task_str;
368 			job_ptr->array_task_str = NULL;
369 			save_task_id = job_ptr->array_task_id;
370 			job_ptr->array_task_id = array_id;
371 		}
372 		slurm_print_job_info(stdout, job_ptr, one_liner);
373 		if (save_array_str) {
374 			job_ptr->array_task_str = save_array_str;
375 			job_ptr->array_task_id = save_task_id;
376 		}
377 		print_cnt++;
378 	}
379 
380 	if (print_cnt == 0) {
381 		if (job_id_str) {
382 			exit_code = 1;
383 			if (quiet_flag != 1) {
384 				if (array_id != NO_VAL) {
385 					printf("Job %u_%u not found\n",
386 					       job_id, array_id);
387 				} else if (het_job_offset != NO_VAL) {
388 					printf("Job %u+%u not found\n",
389 					       job_id, het_job_offset);
390 				} else {
391 					printf("Job %u not found\n", job_id);
392 				}
393 			}
394 		} else if (quiet_flag != 1)
395 			printf ("No jobs in the system\n");
396 	}
397 }
398 
399 /*
400  * scontrol_print_step - print the specified job step's information
401  * IN job_step_id_str - job step's id or NULL to print information
402  *	about all job steps
403  */
404 extern void
scontrol_print_step(char * job_step_id_str)405 scontrol_print_step (char *job_step_id_str)
406 {
407 	int error_code, i, print_cnt = 0;
408 	uint32_t job_id = NO_VAL, step_id = NO_VAL;
409 	uint32_t array_id = NO_VAL;
410 	char *next_str;
411 	job_step_info_response_msg_t *job_step_info_ptr;
412 	job_step_info_t * job_step_ptr;
413 	static uint32_t last_job_id = 0, last_array_id, last_step_id = 0;
414 	static job_step_info_response_msg_t *old_job_step_info_ptr = NULL;
415 	static uint16_t last_show_flags = 0xffff;
416 	uint16_t show_flags = 0;
417 
418 	if (job_step_id_str) {
419 		job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10);
420 		if (next_str[0] == '_')
421 			array_id = (uint32_t) strtol(next_str+1, &next_str, 10);
422 		else if (next_str[0] == '.')
423 			step_id = (uint32_t) strtol (next_str+1, NULL, 10);
424 	}
425 
426 	if (all_flag)
427 		show_flags |= SHOW_ALL;
428 	if (local_flag)
429 		show_flags |= SHOW_LOCAL;
430 
431 	if ((old_job_step_info_ptr) && (last_job_id == job_id) &&
432 	    (last_array_id == array_id) && (last_step_id == step_id)) {
433 		if (last_show_flags != show_flags)
434 			old_job_step_info_ptr->last_update = (time_t) 0;
435 		error_code = slurm_get_job_steps(
436 			old_job_step_info_ptr->last_update,
437 			job_id, step_id, &job_step_info_ptr,
438 			show_flags);
439 		if (error_code == SLURM_SUCCESS)
440 			slurm_free_job_step_info_response_msg (
441 				old_job_step_info_ptr);
442 		else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) {
443 			job_step_info_ptr = old_job_step_info_ptr;
444 			error_code = SLURM_SUCCESS;
445 			if (quiet_flag == -1)
446 				printf("slurm_get_job_steps no change in data\n");
447 		}
448 	} else {
449 		if (old_job_step_info_ptr) {
450 			slurm_free_job_step_info_response_msg (
451 				old_job_step_info_ptr);
452 			old_job_step_info_ptr = NULL;
453 		}
454 		error_code = slurm_get_job_steps ( (time_t) 0, job_id, step_id,
455 						   &job_step_info_ptr,
456 						   show_flags);
457 	}
458 
459 	if (error_code) {
460 		exit_code = 1;
461 		if (quiet_flag != 1)
462 			slurm_perror ("slurm_get_job_steps error");
463 		return;
464 	}
465 
466 	old_job_step_info_ptr = job_step_info_ptr;
467 	last_show_flags = show_flags;
468 	last_job_id = job_id;
469 	last_step_id = step_id;
470 
471 	if (quiet_flag == -1) {
472 		char time_str[32];
473 		slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update,
474 			             time_str, sizeof(time_str));
475 		printf ("last_update_time=%s, records=%d\n",
476 			time_str, job_step_info_ptr->job_step_count);
477 	}
478 
479 	for (i = 0, job_step_ptr = job_step_info_ptr->job_steps;
480 	     i < job_step_info_ptr->job_step_count; i++, job_step_ptr++) {
481 		if ((array_id != NO_VAL) &&
482 		    (array_id != job_step_ptr->array_task_id))
483 			continue;
484 		slurm_print_job_step_info(stdout, job_step_ptr, one_liner);
485 		print_cnt++;
486 	}
487 
488 	if (print_cnt == 0) {
489 		if (job_step_id_str) {
490 			exit_code = 1;
491 			if (quiet_flag != 1) {
492 				if (array_id == NO_VAL) {
493 					printf ("Job step %u.%u not found\n",
494 						job_id, step_id);
495 				} else {
496 					printf ("Job step %u_%u.%u not found\n",
497 						job_id, array_id, step_id);
498 				}
499 			}
500 		} else if (quiet_flag != 1)
501 			printf ("No job steps in the system\n");
502 	}
503 }
504 
/*
 * Extract the job id from a "jobid[.stepid]" string.
 * IN jobid_str - string to parse
 * OUT out_jobid - parsed job id, only written on success
 * RET 1 on success, 0 on failure to find a jobid in the string
 */
static int _parse_jobid(const char *jobid_str, uint32_t *out_jobid)
{
	char *copy = xstrdup(jobid_str);
	char *dot = xstrchr(copy, '.');
	char *end = NULL;
	long value;
	int found = 0;

	/* Cut the copy at the period so only the job part is parsed */
	if (dot)
		*dot = '\0';

	value = strtol(copy, &end, 10);
	if (xstring_is_whitespace(end)) {
		*out_jobid = (uint32_t) value;
		found = 1;
	} else {
		fprintf(stderr, "\"%s\" does not look like a jobid\n", copy);
	}

	xfree(copy);
	return found;
}
528 
/*
 * Extract the step id from a "jobid[.stepid]" string.
 * IN jobid_str - string to parse
 * OUT out_stepid - parsed step id, only written on success
 * RET 1 on success, 0 on failure to find a stepid in the string
 */
static int _parse_stepid(const char *jobid_str, uint32_t *out_stepid)
{
	char *copy = xstrdup(jobid_str);
	char *dot = xstrchr(copy, '.');
	char *end = NULL;
	char *step_str;
	long value;
	int found = 0;

	if (!dot) {
		/* did not find a period, so no step ID in this string */
		xfree(copy);
		return 0;
	}

	step_str = dot + 1;
	value = strtol(step_str, &end, 10);
	if (xstring_is_whitespace(end)) {
		*out_stepid = (uint32_t) value;
		found = 1;
	} else {
		fprintf(stderr, "\"%s\" does not look like a stepid\n",
			step_str);
	}

	xfree(copy);
	return found;
}
556 
557 
558 static bool
_in_task_array(pid_t pid,slurmstepd_task_info_t * task_array,uint32_t task_array_count)559 _in_task_array(pid_t pid, slurmstepd_task_info_t *task_array,
560 	       uint32_t task_array_count)
561 {
562 	int i;
563 
564 	for (i = 0; i < task_array_count; i++) {
565 		if (pid == task_array[i].pid)
566 			return true;
567 	}
568 
569 	return false;
570 }
571 
572 
573 static void
_list_pids_one_step(const char * node_name,uint32_t jobid,uint32_t stepid)574 _list_pids_one_step(const char *node_name, uint32_t jobid, uint32_t stepid)
575 {
576 	int fd;
577 	slurmstepd_task_info_t *task_info = NULL;
578 	uint32_t *pids = NULL;
579 	uint32_t count = 0;
580 	uint32_t tcount = 0;
581 	int i;
582 	uint16_t protocol_version;
583 
584 	fd = stepd_connect(NULL, node_name, jobid, stepid, &protocol_version);
585 	if (fd == -1) {
586 		exit_code = 1;
587 		if (errno == ENOENT) {
588 			fprintf(stderr,
589 				"Job step %u.%u does not exist on this node.\n",
590 				jobid, stepid);
591 			exit_code = 1;
592 		} else {
593 			perror("Unable to connect to slurmstepd");
594 		}
595 		return;
596 	}
597 
598 	stepd_task_info(fd, protocol_version, &task_info, &tcount);
599 	for (i = 0; i < (int)tcount; i++) {
600 		if (!task_info[i].exited) {
601 			if (stepid == NO_VAL)
602 				printf("%-8d %-8u %-6s %-7d %-8d\n",
603 				       task_info[i].pid,
604 				       jobid,
605 				       "batch",
606 				       task_info[i].id,
607 				       task_info[i].gtid);
608 			else
609 				printf("%-8d %-8u %-6u %-7d %-8d\n",
610 				       task_info[i].pid,
611 				       jobid,
612 				       stepid,
613 				       task_info[i].id,
614 				       task_info[i].gtid);
615 		}
616 	}
617 
618 	stepd_list_pids(fd, protocol_version, &pids, &count);
619 	for (i = 0; i < count; i++) {
620 		if (!_in_task_array((pid_t)pids[i], task_info, tcount)) {
621 			if (stepid == NO_VAL)
622 				printf("%-8d %-8u %-6s %-7s %-8s\n",
623 				       pids[i], jobid, "batch", "-", "-");
624 			else
625 				printf("%-8d %-8u %-6u %-7s %-8s\n",
626 				       pids[i], jobid, stepid, "-", "-");
627 		}
628 	}
629 
630 	xfree(pids);
631 	xfree(task_info);
632 	close(fd);
633 }
634 
635 static void
_list_pids_all_steps(const char * node_name,uint32_t jobid)636 _list_pids_all_steps(const char *node_name, uint32_t jobid)
637 {
638 	List steps;
639 	ListIterator itr;
640 	step_loc_t *stepd;
641 	int count = 0;
642 
643 	steps = stepd_available(NULL, node_name);
644 	if (!steps || list_count(steps) == 0) {
645 		fprintf(stderr, "Job %u does not exist on this node.\n", jobid);
646 		FREE_NULL_LIST(steps);
647 		exit_code = 1;
648 		return;
649 	}
650 
651 	itr = list_iterator_create(steps);
652 	while ((stepd = list_next(itr))) {
653 		if (jobid == stepd->jobid) {
654 			_list_pids_one_step(stepd->nodename, stepd->jobid,
655 					    stepd->stepid);
656 			count++;
657 		}
658 	}
659 	list_iterator_destroy(itr);
660 	FREE_NULL_LIST(steps);
661 
662 	if (count == 0) {
663 		fprintf(stderr, "Job %u does not exist on this node.\n",
664 			jobid);
665 		exit_code = 1;
666 	}
667 }
668 
669 static void
_list_pids_all_jobs(const char * node_name)670 _list_pids_all_jobs(const char *node_name)
671 {
672 	List steps;
673 	ListIterator itr;
674 	step_loc_t *stepd;
675 
676 	steps = stepd_available(NULL, node_name);
677 	if (!steps || list_count(steps) == 0) {
678 		fprintf(stderr, "No job steps exist on this node.\n");
679 		FREE_NULL_LIST(steps);
680 		exit_code = 1;
681 		return;
682 	}
683 
684 	itr = list_iterator_create(steps);
685 	while((stepd = list_next(itr))) {
686 		_list_pids_one_step(stepd->nodename, stepd->jobid,
687 				    stepd->stepid);
688 	}
689 	list_iterator_destroy(itr);
690 	FREE_NULL_LIST(steps);
691 }
692 
693 /*
694  * scontrol_list_pids - given a slurmd job ID or job ID + step ID,
695  *	print the process IDs of the processes each job step (or
696  *	just the specified step ID).
697  * IN jobid_str - string representing a jobid: jobid[.stepid]
698  * IN node_name - May be NULL, in which case it will attempt to
699  *	determine the NodeName of the local host on its own.
700  *	This is mostly of use when multiple-slurmd support is in use,
701  *	because if NULL is used when there are multiple slurmd on the
702  *	node, one of them will be selected more-or-less at random.
703  */
704 extern void
scontrol_list_pids(const char * jobid_str,const char * node_name)705 scontrol_list_pids(const char *jobid_str, const char *node_name)
706 {
707 	uint32_t jobid = 0, stepid = 0;
708 
709 	/* Job ID is optional */
710 	if (jobid_str != NULL
711 	    && jobid_str[0] != '*'
712 	    && !_parse_jobid(jobid_str, &jobid)) {
713 		exit_code = 1;
714 		return;
715 	}
716 
717 	/* Step ID is optional */
718 	printf("%-8s %-8s %-6s %-7s %-8s\n",
719 	       "PID", "JOBID", "STEPID", "LOCALID", "GLOBALID");
720 	if (jobid_str == NULL || jobid_str[0] == '*') {
721 		_list_pids_all_jobs(node_name);
722 	} else if (_parse_stepid(jobid_str, &stepid)) {
723 		_list_pids_one_step(node_name, jobid, stepid);
724 	} else {
725 		_list_pids_all_steps(node_name, jobid);
726 	}
727 }
728 
scontrol_getent(const char * node_name)729 extern void scontrol_getent(const char *node_name)
730 {
731 	List steps = NULL;
732 	ListIterator itr = NULL;
733 	step_loc_t *stepd;
734 	int fd;
735 	struct passwd *pwd = NULL;
736 	struct group **grps = NULL;
737 
738 	if (!(steps = stepd_available(NULL, node_name))) {
739 		fprintf(stderr, "No steps found on this node\n");
740 		return;
741 	}
742 
743 	itr = list_iterator_create(steps);
744 	while ((stepd = list_next(itr))) {
745 		fd = stepd_connect(NULL, node_name, stepd->jobid,
746 				   stepd->stepid,
747 				   &stepd->protocol_version);
748 
749 		if (fd < 0)
750 			continue;
751 		pwd = stepd_getpw(fd, stepd->protocol_version,
752 				  GETPW_MATCH_ALWAYS, 0, NULL);
753 
754 		if (!pwd) {
755 			close(fd);
756 			continue;
757 		}
758 
759 		if (stepd->stepid == SLURM_EXTERN_CONT)
760 			printf("JobId=%u.Extern:\nUser:\n", stepd->jobid);
761 		else if (stepd->stepid == SLURM_BATCH_SCRIPT)
762 			printf("JobId=%u.Batch:\nUser:\n", stepd->jobid);
763 		else
764 			printf("JobId=%u.%u:\nUser:\n",
765 			       stepd->jobid, stepd->stepid);
766 
767 		printf("%s:%s:%u:%u:%s:%s:%s\nGroups:\n",
768 		       pwd->pw_name, pwd->pw_passwd, pwd->pw_uid, pwd->pw_gid,
769 		       pwd->pw_gecos, pwd->pw_dir, pwd->pw_shell);
770 
771 		xfree_struct_passwd(pwd);
772 
773 		grps = stepd_getgr(fd, stepd->protocol_version,
774 				   GETGR_MATCH_ALWAYS, 0, NULL);
775 		if (!grps) {
776 			close(fd);
777 			printf("\n");
778 			continue;
779 		}
780 
781 		for (int i = 0; grps[i]; i++) {
782 			printf("%s:%s:%u:%s\n",
783 			       grps[i]->gr_name, grps[i]->gr_passwd,
784 			       grps[i]->gr_gid,
785 			       (grps[i]->gr_mem) ? grps[i]->gr_mem[0] : "");
786 		}
787 		close(fd);
788 		xfree_struct_group_array(grps);
789 		printf("\n");
790 	}
791 	list_iterator_destroy(itr);
792 	FREE_NULL_LIST(steps);
793 }
794 
795 /*
796  * scontrol_print_hosts - given a node list expression, return
797  *	a list of nodes, one per line
798  */
799 extern void
scontrol_print_hosts(char * node_list)800 scontrol_print_hosts (char * node_list)
801 {
802 	hostlist_t hl;
803 	char *host;
804 
805 	if (!node_list) {
806 		error("host list is empty");
807 		return;
808 	}
809 	hl = hostlist_create_dims(node_list, 0);
810 	if (!hl) {
811 		fprintf(stderr, "Invalid hostlist: %s\n", node_list);
812 		return;
813 	}
814 	while ((host = hostlist_shift_dims(hl, 0))) {
815 		printf("%s\n", host);
816 		free(host);
817 	}
818 	hostlist_destroy(hl);
819 }
820 
/*
 * Rewrite a newline separated host list in place as a comma separated one.
 * Every '\n' becomes ',' and each run of consecutive commas is reduced to a
 * single comma. Leading and trailing commas are kept.
 * IN/OUT hostlist - NUL terminated string to rewrite
 */
static void
_reformat_hostlist(char *hostlist)
{
	char *src, *dst;

	/* Pass 1: turn line breaks into separators */
	for (src = hostlist; *src; src++) {
		if (*src == '\n')
			*src = ',';
	}

	/* Pass 2: compact, dropping a comma directly followed by another */
	dst = hostlist;
	for (src = hostlist; *src; src++) {
		if ((src[0] == ',') && (src[1] == ','))
			continue;
		*dst++ = *src;
	}
	*dst = '\0';
}
839 
840 /*
841  * scontrol_encode_hostlist - given a list of hostnames or the pathname
842  *	of a file containing hostnames, translate them into a hostlist
843  *	expression
844  */
845 extern int
scontrol_encode_hostlist(char * hostlist,bool sorted)846 scontrol_encode_hostlist(char *hostlist, bool sorted)
847 {
848 	char *io_buf = NULL, *tmp_list, *ranged_string;
849 	int buf_size = 1024 * 1024;
850 	hostlist_t hl;
851 
852 	if (!hostlist) {
853 		fprintf(stderr, "Hostlist is NULL\n");
854 		return SLURM_ERROR;
855 	}
856 
857 	if (hostlist[0] == '/') {
858 		ssize_t buf_read;
859 		int fd = open(hostlist, O_RDONLY);
860 		if (fd < 0) {
861 			fprintf(stderr, "Can not open %s\n", hostlist);
862 			return SLURM_ERROR;
863 		}
864 		io_buf = xmalloc(buf_size);
865 		buf_read = read(fd, io_buf, buf_size);
866 		close(fd);
867 		if (buf_read >= buf_size) {
868 			/* If over 1MB, the file is almost certainly invalid */
869 			fprintf(stderr, "File %s is too large\n", hostlist);
870 			xfree(io_buf);
871 			return SLURM_ERROR;
872 		}
873 		io_buf[buf_read] = '\0';
874 		_reformat_hostlist(io_buf);
875 		tmp_list = io_buf;
876 	} else
877 		tmp_list = hostlist;
878 
879 	hl = hostlist_create(tmp_list);
880 	if (hl == NULL) {
881 		fprintf(stderr, "Invalid hostlist: %s\n", tmp_list);
882 		xfree(io_buf);
883 		return SLURM_ERROR;
884 	}
885 	if (sorted)
886 		hostlist_sort(hl);
887 	ranged_string = hostlist_ranged_string_xmalloc(hl);
888 	printf("%s\n", ranged_string);
889 	hostlist_destroy(hl);
890 	xfree(ranged_string);
891 	xfree(io_buf);
892 	return SLURM_SUCCESS;
893 }
894 
_wait_nodes_ready(uint32_t job_id)895 static int _wait_nodes_ready(uint32_t job_id)
896 {
897 	int is_ready = SLURM_ERROR, i, rc = 0;
898 	int cur_delay = 0;
899 	int suspend_time, resume_time, max_delay;
900 
901 	suspend_time = slurm_get_suspend_timeout();
902 	resume_time  = slurm_get_resume_timeout();
903 	if ((suspend_time == 0) || (resume_time == 0))
904 		return SLURM_SUCCESS;	/* Power save mode disabled */
905 	max_delay = suspend_time + resume_time;
906 	max_delay *= 5;		/* Allow for ResumeRate support */
907 
908 	for (i=0; (cur_delay < max_delay); i++) {
909 		if (i) {
910 			if (i == 1)
911 				info("Waiting for nodes to boot");
912 			sleep(POLL_SLEEP);
913 			cur_delay += POLL_SLEEP;
914 		}
915 
916 		rc = slurm_job_node_ready(job_id);
917 
918 		if (rc == READY_JOB_FATAL)
919 			break;				/* fatal error */
920 		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
921 			continue;			/* retry */
922 		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
923 			break;
924 		if (rc & READY_NODE_STATE) {		/* job and node ready */
925 			is_ready = SLURM_SUCCESS;
926 			break;
927 		}
928 	}
929 	if (is_ready == SLURM_SUCCESS)
930      		info("Nodes are ready for job %u", job_id);
931 	else if ((rc & READY_JOB_STATE) == 0)
932 		info("Job %u no longer running", job_id);
933 	else
934 		info("Problem running job %u", job_id);
935 
936 	return is_ready;
937 }
938 
939 /*
940  * Wait until a job is ready to execute or enters some failed state
941  * RET 1: job ready to run
942  *     0: job can't run (cancelled, failure state, timeout, etc.)
943  */
scontrol_job_ready(char * job_id_str)944 extern int scontrol_job_ready(char *job_id_str)
945 {
946 	uint32_t job_id;
947 
948 	job_id = atoi(job_id_str);
949 	if (job_id <= 0) {
950 		fprintf(stderr, "Invalid job_id %s", job_id_str);
951 		return SLURM_ERROR;
952 	}
953 
954 	return _wait_nodes_ready(job_id);
955 }
956 
scontrol_callerid(int argc,char ** argv)957 extern int scontrol_callerid(int argc, char **argv)
958 {
959 	int af, ver = 4;
960 	unsigned char ip_src[sizeof(struct in6_addr)],
961 		      ip_dst[sizeof(struct in6_addr)];
962 	uint32_t port_src, port_dst, job_id;
963 	network_callerid_msg_t req;
964 	char node_name[MAXHOSTNAMELEN], *ptr;
965 
966 	if (argc == 5) {
967 		ver = strtoul(argv[4], &ptr, 0);
968 		if (ptr && ptr[0]) {
969 			error("Address family not an integer");
970 			return SLURM_ERROR;
971 		}
972 	}
973 
974 	if (ver != 4 && ver != 6) {
975 		error("Invalid address family: %d", ver);
976 		return SLURM_ERROR;
977 	}
978 
979 	af = ver == 4 ? AF_INET : AF_INET6;
980 	if (!inet_pton(af, argv[0], ip_src)) {
981 		error("inet_pton failed for '%s'", argv[0]);
982 		return SLURM_ERROR;
983 	}
984 
985 	port_src = strtoul(argv[1], &ptr, 0);
986 	if (ptr && ptr[0]) {
987 		error("Source port not an integer");
988 		return SLURM_ERROR;
989 	}
990 
991 	if (!inet_pton(af, argv[2], ip_dst)) {
992 		error("scontrol_callerid: inet_pton failed for '%s'", argv[2]);
993 		return SLURM_ERROR;
994 	}
995 
996 	port_dst = strtoul(argv[3], &ptr, 0);
997 	if (ptr && ptr[0]) {
998 		error("Destination port not an integer");
999 		return SLURM_ERROR;
1000 	}
1001 
1002 	memcpy(req.ip_src, ip_src, 16);
1003 	memcpy(req.ip_dst, ip_dst, 16);
1004 	req.port_src = port_src;
1005 	req.port_dst = port_dst;
1006 	req.af = af;
1007 
1008 	if (slurm_network_callerid(req, &job_id, node_name, MAXHOSTNAMELEN)
1009 			!= SLURM_SUCCESS) {
1010 		fprintf(stderr,
1011 			"slurm_network_callerid: unable to retrieve callerid data from remote slurmd\n");
1012 		return SLURM_ERROR;
1013 	} else if (job_id == NO_VAL) {
1014 		fprintf(stderr,
1015 			"slurm_network_callerid: remote job id indeterminate\n");
1016 		return SLURM_ERROR;
1017 	} else {
1018 		printf("%u %s\n", job_id, node_name);
1019 		return SLURM_SUCCESS;
1020 	}
1021 }
1022 
scontrol_batch_script(int argc,char ** argv)1023 extern int scontrol_batch_script(int argc, char **argv)
1024 {
1025 	char *filename;
1026 	FILE *out;
1027 	int exit_code;
1028 	uint32_t jobid;
1029 
1030 	if (argc < 1)
1031 		return SLURM_ERROR;
1032 
1033 	jobid = atoll(argv[0]);
1034 
1035 	if (argc > 1)
1036 		filename = xstrdup(argv[1]);
1037 	else
1038 		filename = xstrdup_printf("slurm-%u.sh", jobid);
1039 
1040 	if (!xstrcmp(filename, "-")) {
1041 		out = stdout;
1042 	} else {
1043 		if (!(out = fopen(filename, "w"))) {
1044 			fprintf(stderr, "failed to open file `%s`: %m\n",
1045 				filename);
1046 			xfree(filename);
1047 			return errno;
1048 		}
1049 	}
1050 
1051 	exit_code = slurm_job_batch_script(out, jobid);
1052 
1053 	if (out != stdout)
1054 		fclose(out);
1055 
1056 	if (exit_code != SLURM_SUCCESS) {
1057 		if (out != stdout)
1058 			unlink(filename);
1059 		slurm_perror("job script retrieval failed");
1060 	} else if ((out != stdout) && (quiet_flag != 1)) {
1061 		printf("batch script for job %u written to %s\n",
1062 		       jobid, filename);
1063 	}
1064 
1065 	xfree(filename);
1066 	return exit_code;
1067 }
1068