1 /*****************************************************************************\
2  *  signal.c - Send a signal to a slurm job or job step
3  *****************************************************************************
4  *  Copyright (C) 2005 The Regents of the University of California.
5  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6  *  Written by Christopher J. Morrone <morrone2@llnl.gov>.
7  *  CODE-OCEC-09-009. All rights reserved.
8  *
9  *  This file is part of Slurm, a resource management program.
10  *  For details, see <https://slurm.schedmd.com/>.
11  *  Please also read the included file: DISCLAIMER.
12  *
13  *  Slurm is free software; you can redistribute it and/or modify it under
14  *  the terms of the GNU General Public License as published by the Free
15  *  Software Foundation; either version 2 of the License, or (at your option)
16  *  any later version.
17  *
18  *  In addition, as a special exception, the copyright holders give permission
19  *  to link the code of portions of this program with the OpenSSL library under
20  *  certain conditions as described in each individual source file, and
21  *  distribute linked combinations including the two. You must obey the GNU
22  *  General Public License in all respects for all of the code used other than
23  *  OpenSSL. If you modify file(s) with this exception, you may extend this
24  *  exception to your version of the file(s), but you are not obligated to do
25  *  so. If you do not wish to do so, delete this exception statement from your
26  *  version.  If you delete this exception statement from all source files in
27  *  the program, then also delete it here.
28  *
29  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
32  *  details.
33  *
34  *  You should have received a copy of the GNU General Public License along
35  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
36  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
37 \*****************************************************************************/
38 
39 #include <errno.h>
40 #include <pthread.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 
44 #include "slurm/slurm.h"
45 #include "slurm/slurm_errno.h"
46 
47 #include "src/common/xmalloc.h"
48 #include "src/common/hostlist.h"
49 #include "src/common/read_config.h"
50 #include "src/common/macros.h"
51 #include "src/common/slurm_protocol_api.h"
52 
/*
 * Send one RPC of the given type, carrying "data" as its payload, to the
 * nodes named in nodelist and collapse the replies into one return code.
 *
 * IN nodelist - node list expression handed to slurm_send_recv_msgs()
 * IN type     - message type stored in the outgoing request
 * IN data     - request payload; it stays owned by the caller and is
 *               detached from the message before the message is freed
 * RET 0 if every reply yielded a zero return code, otherwise the non-zero
 *     code of the last failing reply taken from the list, or SLURM_ERROR
 *     if no reply list was returned at all
 *
 * NOTE(review): the records popped from the reply list, and the list
 * itself, are not released anywhere in this function - confirm whether
 * the caller of slurm_send_recv_msgs() is expected to free them.
 */
static int _local_send_recv_rc_msgs(const char *nodelist,
				    slurm_msg_type_t type, void *data)
{
	int result = 0;
	List replies;
	slurm_msg_t *request = xmalloc(sizeof(*request));

	slurm_msg_t_init(request);
	request->msg_type = type;
	request->data = data;

	replies = slurm_send_recv_msgs(nodelist, request, 0);
	if (!replies) {
		error("slurm_signal_job: no list was returned");
		result = SLURM_ERROR;
	} else {
		ret_data_info_t *reply;

		/* a later failure overwrites an earlier one; success never
		 * clears a failure that was already recorded */
		for (reply = list_pop(replies); reply;
		     reply = list_pop(replies)) {
			int node_rc = slurm_get_return_code(reply->type,
							    reply->data);
			if (node_rc != 0)
				result = node_rc;
		}
	}

	/* the payload belongs to the caller: unhook it so that freeing the
	 * message does not try to free it as well */
	request->data = NULL;
	slurm_free_msg(request);

	return result;
}
83 
_signal_batch_script_step(const resource_allocation_response_msg_t * allocation,uint32_t signal)84 static int _signal_batch_script_step(const resource_allocation_response_msg_t
85 				     *allocation, uint32_t signal)
86 {
87 	slurm_msg_t msg;
88 	signal_tasks_msg_t rpc;
89 	int rc = SLURM_SUCCESS;
90 	char *name = nodelist_nth_host(allocation->node_list, 0);
91 	if (!name) {
92 		error("_signal_batch_script_step: "
93 		      "can't get the first name out of %s",
94 		      allocation->node_list);
95 		return -1;
96 	}
97 	memset(&rpc, 0, sizeof(rpc));
98 	rpc.job_id = allocation->job_id;
99 	rpc.job_step_id = SLURM_BATCH_SCRIPT;
100 	rpc.signal = signal;
101 	rpc.flags = KILL_JOB_BATCH;
102 
103 	slurm_msg_t_init(&msg);
104 	msg.msg_type = REQUEST_SIGNAL_TASKS;
105 	msg.data = &rpc;
106 	if (slurm_conf_get_addr(name, &msg.address, msg.flags)
107 	    == SLURM_ERROR) {
108 		error("_signal_batch_script_step: "
109 		      "can't find address for host %s, check slurm.conf",
110 		      name);
111 		free(name);
112 		return -1;
113 	}
114 	free(name);
115 	if (slurm_send_recv_rc_msg_only_one(&msg, &rc, 0) < 0) {
116 		error("_signal_batch_script_step: %m");
117 		rc = -1;
118 	}
119 	return rc;
120 }
121 
_signal_job_step(const job_step_info_t * step,const resource_allocation_response_msg_t * allocation,uint16_t signal)122 static int _signal_job_step(const job_step_info_t *step,
123 			    const resource_allocation_response_msg_t *
124 			    allocation, uint16_t signal)
125 {
126 	signal_tasks_msg_t rpc;
127 	int rc = SLURM_SUCCESS;
128 
129 	/* same remote procedure call for each node */
130 	memset(&rpc, 0, sizeof(rpc));
131 	rpc.job_id = step->job_id;
132 	rpc.job_step_id = step->step_id;
133 	rpc.signal = signal;
134 
135 	rc = _local_send_recv_rc_msgs(allocation->node_list,
136 				      REQUEST_SIGNAL_TASKS, &rpc);
137 	return rc;
138 }
139 
_terminate_batch_script_step(const resource_allocation_response_msg_t * allocation)140 static int _terminate_batch_script_step(const resource_allocation_response_msg_t
141 					* allocation)
142 {
143 	slurm_msg_t msg;
144 	signal_tasks_msg_t rpc;
145 	int rc = SLURM_SUCCESS;
146 	int i;
147 	char *name = nodelist_nth_host(allocation->node_list, 0);
148 	if (!name) {
149 		error("_terminate_batch_script_step: "
150 		      "can't get the first name out of %s",
151 		      allocation->node_list);
152 		return -1;
153 	}
154 
155 	memset(&rpc, 0, sizeof(rpc));
156 	rpc.job_id = allocation->job_id;
157 	rpc.job_step_id = SLURM_BATCH_SCRIPT;
158 	rpc.signal = (uint16_t)-1; /* not used by slurmd */
159 
160 	slurm_msg_t_init(&msg);
161 	msg.msg_type = REQUEST_TERMINATE_TASKS;
162 	msg.data = &rpc;
163 
164 	if (slurm_conf_get_addr(name, &msg.address, msg.flags)
165 	    == SLURM_ERROR) {
166 		error("_terminate_batch_script_step: "
167 		      "can't find address for host %s, check slurm.conf",
168 		      name);
169 		free(name);
170 		return -1;
171 	}
172 	free(name);
173 	i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 0);
174 	if (i != 0)
175 		rc = i;
176 
177 	return rc;
178 }
179 
180 /*
181  * Send a REQUEST_TERMINATE_TASKS rpc to all nodes in a job step.
182  *
183  * RET Upon successful termination of the job step, 0 shall be returned.
184  * Otherwise, -1 shall be returned and errno set to indicate the error.
185  */
_terminate_job_step(const job_step_info_t * step,const resource_allocation_response_msg_t * allocation)186 static int _terminate_job_step(const job_step_info_t *step,
187 			       const resource_allocation_response_msg_t *
188 			       allocation)
189 {
190 	signal_tasks_msg_t rpc;
191 	int rc = SLURM_SUCCESS;
192 
193 	/*
194 	 *  Send REQUEST_TERMINATE_TASKS to all nodes of the step
195 	 */
196 	memset(&rpc, 0, sizeof(rpc));
197 	rpc.job_id = step->job_id;
198 	rpc.job_step_id = step->step_id;
199 	rpc.signal = (uint16_t)-1; /* not used by slurmd */
200 	rc = _local_send_recv_rc_msgs(allocation->node_list,
201 				      REQUEST_TERMINATE_TASKS, &rpc);
202 	if ((rc == -1) && (errno == ESLURM_ALREADY_DONE)) {
203 		rc = 0;
204 		errno = 0;
205 	}
206 
207 	return rc;
208 }
209 
210 /*
211  * slurm_signal_job - send the specified signal to all steps of an existing job
212  * IN job_id     - the job's id
213  * IN signal     - signal number
214  * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
215  */
216 extern int
slurm_signal_job(uint32_t job_id,uint16_t signal)217 slurm_signal_job (uint32_t job_id, uint16_t signal)
218 {
219 	int rc = SLURM_SUCCESS;
220 	resource_allocation_response_msg_t *alloc_info = NULL;
221 	signal_tasks_msg_t rpc;
222 
223 	if (slurm_allocation_lookup(job_id, &alloc_info)) {
224 		rc = slurm_get_errno();
225 		goto fail1;
226 	}
227 
228 	/* same remote procedure call for each node */
229 	memset(&rpc, 0, sizeof(rpc));
230 	rpc.job_id = job_id;
231 	rpc.signal = signal;
232 	rpc.flags = KILL_STEPS_ONLY;
233 
234 	rc = _local_send_recv_rc_msgs(alloc_info->node_list,
235 				      REQUEST_SIGNAL_TASKS, &rpc);
236 	slurm_free_resource_allocation_response_msg(alloc_info);
237 fail1:
238 	if (rc) {
239 		slurm_seterrno_ret(rc);
240 	} else {
241 		return SLURM_SUCCESS;
242 	}
243 }
244 
245 /*
246  * slurm_signal_job_step - send the specified signal to an existing job step
247  * IN job_id  - the job's id
248  * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
249  *              to send a signal to a job's batch script
250  * IN signal  - signal number
251  * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
252  */
253 extern int
slurm_signal_job_step(uint32_t job_id,uint32_t step_id,uint32_t signal)254 slurm_signal_job_step (uint32_t job_id, uint32_t step_id, uint32_t signal)
255 {
256 	resource_allocation_response_msg_t *alloc_info = NULL;
257 	job_step_info_response_msg_t *step_info = NULL;
258 	int rc;
259 	int i;
260 	int save_errno = 0;
261 
262 	if (slurm_allocation_lookup(job_id, &alloc_info)) {
263 		return -1;
264 	}
265 
266 	/*
267 	 * The controller won't give us info about the batch script job step,
268 	 * so we need to handle that seperately.
269 	 */
270 	if (step_id == SLURM_BATCH_SCRIPT) {
271 		rc = _signal_batch_script_step(alloc_info, signal);
272 		slurm_free_resource_allocation_response_msg(alloc_info);
273 		errno = rc;
274 		return rc ? -1 : 0;
275 	}
276 
277 	/*
278 	 * Otherwise, look through the list of job step info and find
279 	 * the one matching step_id.  Signal that step.
280 	 */
281 	rc = slurm_get_job_steps((time_t)0, job_id, step_id,
282 				 &step_info, SHOW_ALL);
283  	if (rc != 0) {
284  		save_errno = errno;
285  		goto fail;
286  	}
287 	for (i = 0; i < step_info->job_step_count; i++) {
288 		if ((step_info->job_steps[i].job_id == job_id) &&
289 		    (step_info->job_steps[i].step_id == step_id)) {
290  			rc = _signal_job_step(&step_info->job_steps[i],
291  					      alloc_info, signal);
292  			save_errno = rc;
293 			break;
294 		}
295 	}
296 	slurm_free_job_step_info_response_msg(step_info);
297 fail:
298 	slurm_free_resource_allocation_response_msg(alloc_info);
299  	errno = save_errno;
300  	return rc ? -1 : 0;
301 }
302 
303 /*
304  * slurm_terminate_job_step - terminates a job step by sending a
305  * 	REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step.
306  * IN job_id  - the job's id
307  * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
308  *              to terminate a job's batch script
309  * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
310  */
311 extern int
slurm_terminate_job_step(uint32_t job_id,uint32_t step_id)312 slurm_terminate_job_step (uint32_t job_id, uint32_t step_id)
313 {
314 	resource_allocation_response_msg_t *alloc_info = NULL;
315 	job_step_info_response_msg_t *step_info = NULL;
316 	int rc = 0;
317 	int i;
318 	int save_errno = 0;
319 
320 	if (slurm_allocation_lookup(job_id, &alloc_info)) {
321 		return -1;
322 	}
323 
324 	/*
325 	 * The controller won't give us info about the batch script job step,
326 	 * so we need to handle that seperately.
327 	 */
328 	if (step_id == SLURM_BATCH_SCRIPT) {
329 		rc = _terminate_batch_script_step(alloc_info);
330 		slurm_free_resource_allocation_response_msg(alloc_info);
331 		errno = rc;
332 		return rc ? -1 : 0;
333 	}
334 
335 	/*
336 	 * Otherwise, look through the list of job step info and find
337 	 * the one matching step_id.  Terminate that step.
338 	 */
339 	rc = slurm_get_job_steps((time_t)0, job_id, step_id,
340 				 &step_info, SHOW_ALL);
341 	if (rc != 0) {
342 		save_errno = errno;
343 		goto fail;
344 	}
345 	for (i = 0; i < step_info->job_step_count; i++) {
346 		if ((step_info->job_steps[i].job_id == job_id) &&
347 		    (step_info->job_steps[i].step_id == step_id)) {
348 			rc = _terminate_job_step(&step_info->job_steps[i],
349 						 alloc_info);
350 			save_errno = errno;
351 			break;
352 		}
353 	}
354 	slurm_free_job_step_info_response_msg(step_info);
355 fail:
356 	slurm_free_resource_allocation_response_msg(alloc_info);
357 	errno = save_errno;
358 	return rc ? -1 : 0;
359 }
360 
361 /*
362  * slurm_notify_job - send message to the job's stdout,
363  *	usable only by user root
364  * IN job_id - slurm job_id or 0 for all jobs
365  * IN message - arbitrary message
366  * RET 0 or -1 on error
367  */
slurm_notify_job(uint32_t job_id,char * message)368 extern int slurm_notify_job (uint32_t job_id, char *message)
369 {
370 	int rc;
371 	slurm_msg_t msg;
372 	job_notify_msg_t req;
373 
374 	slurm_msg_t_init(&msg);
375 	/*
376 	 * Request message:
377 	 */
378 	memset(&req, 0, sizeof(req));
379 	req.job_id      = job_id;
380 	req.job_step_id = NO_VAL;	/* currently not used */
381 	req.message     = message;
382 	msg.msg_type    = REQUEST_JOB_NOTIFY;
383 	msg.data        = &req;
384 
385 	if (slurm_send_recv_controller_rc_msg(&msg, &rc,
386 					      working_cluster_rec) < 0)
387 		return SLURM_ERROR;
388 
389 	if (rc) {
390 		slurm_seterrno_ret(rc);
391 		return SLURM_ERROR;
392 	}
393 
394 	return SLURM_SUCCESS;
395 }
396