1 /*****************************************************************************\
2 * signal.c - Send a signal to a slurm job or job step
3 *****************************************************************************
4 * Copyright (C) 2005 The Regents of the University of California.
5 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6 * Written by Christopher J. Morrone <morrone2@llnl.gov>.
7 * CODE-OCEC-09-009. All rights reserved.
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 \*****************************************************************************/
38
39 #include <errno.h>
40 #include <pthread.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43
44 #include "slurm/slurm.h"
45 #include "slurm/slurm_errno.h"
46
47 #include "src/common/xmalloc.h"
48 #include "src/common/hostlist.h"
49 #include "src/common/read_config.h"
50 #include "src/common/macros.h"
51 #include "src/common/slurm_protocol_api.h"
52
/*
 * Send one RPC of the given type to the nodes named in nodelist and fold the
 * replies into a single return code.
 *
 * IN nodelist - node list expression handed to slurm_send_recv_msgs()
 * IN type - message type to place in the request
 * IN data - request payload; it stays owned by the caller and is never
 *	freed here
 * RET 0 when every reply carried a zero return code, otherwise the last
 *	non-zero code encountered, or SLURM_ERROR when no reply list came
 *	back at all
 */
static int _local_send_recv_rc_msgs(const char *nodelist,
				    slurm_msg_type_t type, void *data)
{
	List ret_list = NULL;
	ListIterator itr = NULL;
	int temp_rc = 0, rc = 0;
	ret_data_info_t *ret_data_info = NULL;
	slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t));

	slurm_msg_t_init(msg);
	msg->msg_type = type;
	msg->data = data;

	if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0))) {
		/*
		 * Walk the replies in place instead of popping them: a popped
		 * entry is detached from the list and was never released,
		 * and neither was the list itself, so every call leaked both.
		 * The iterator visits entries head first, the same order
		 * list_pop() produced, so the resulting code is unchanged.
		 */
		itr = list_iterator_create(ret_list);
		while ((ret_data_info = list_next(itr))) {
			temp_rc = slurm_get_return_code(ret_data_info->type,
							ret_data_info->data);
			if (temp_rc)
				rc = temp_rc;
		}
		list_iterator_destroy(itr);
		/*
		 * Releases the list and, through the list's own item
		 * destructor, the replies still linked in it.
		 * NOTE(review): assumes slurm_send_recv_msgs() creates the
		 * list with an item destructor - confirm.
		 */
		list_destroy(ret_list);
	} else {
		error("slurm_signal_job: no list was returned");
		rc = SLURM_ERROR;
	}

	/* don't attempt to free a local variable */
	msg->data = NULL;

	slurm_free_msg(msg);
	return rc;
}
83
_signal_batch_script_step(const resource_allocation_response_msg_t * allocation,uint32_t signal)84 static int _signal_batch_script_step(const resource_allocation_response_msg_t
85 *allocation, uint32_t signal)
86 {
87 slurm_msg_t msg;
88 signal_tasks_msg_t rpc;
89 int rc = SLURM_SUCCESS;
90 char *name = nodelist_nth_host(allocation->node_list, 0);
91 if (!name) {
92 error("_signal_batch_script_step: "
93 "can't get the first name out of %s",
94 allocation->node_list);
95 return -1;
96 }
97 memset(&rpc, 0, sizeof(rpc));
98 rpc.job_id = allocation->job_id;
99 rpc.job_step_id = SLURM_BATCH_SCRIPT;
100 rpc.signal = signal;
101 rpc.flags = KILL_JOB_BATCH;
102
103 slurm_msg_t_init(&msg);
104 msg.msg_type = REQUEST_SIGNAL_TASKS;
105 msg.data = &rpc;
106 if (slurm_conf_get_addr(name, &msg.address, msg.flags)
107 == SLURM_ERROR) {
108 error("_signal_batch_script_step: "
109 "can't find address for host %s, check slurm.conf",
110 name);
111 free(name);
112 return -1;
113 }
114 free(name);
115 if (slurm_send_recv_rc_msg_only_one(&msg, &rc, 0) < 0) {
116 error("_signal_batch_script_step: %m");
117 rc = -1;
118 }
119 return rc;
120 }
121
_signal_job_step(const job_step_info_t * step,const resource_allocation_response_msg_t * allocation,uint16_t signal)122 static int _signal_job_step(const job_step_info_t *step,
123 const resource_allocation_response_msg_t *
124 allocation, uint16_t signal)
125 {
126 signal_tasks_msg_t rpc;
127 int rc = SLURM_SUCCESS;
128
129 /* same remote procedure call for each node */
130 memset(&rpc, 0, sizeof(rpc));
131 rpc.job_id = step->job_id;
132 rpc.job_step_id = step->step_id;
133 rpc.signal = signal;
134
135 rc = _local_send_recv_rc_msgs(allocation->node_list,
136 REQUEST_SIGNAL_TASKS, &rpc);
137 return rc;
138 }
139
_terminate_batch_script_step(const resource_allocation_response_msg_t * allocation)140 static int _terminate_batch_script_step(const resource_allocation_response_msg_t
141 * allocation)
142 {
143 slurm_msg_t msg;
144 signal_tasks_msg_t rpc;
145 int rc = SLURM_SUCCESS;
146 int i;
147 char *name = nodelist_nth_host(allocation->node_list, 0);
148 if (!name) {
149 error("_terminate_batch_script_step: "
150 "can't get the first name out of %s",
151 allocation->node_list);
152 return -1;
153 }
154
155 memset(&rpc, 0, sizeof(rpc));
156 rpc.job_id = allocation->job_id;
157 rpc.job_step_id = SLURM_BATCH_SCRIPT;
158 rpc.signal = (uint16_t)-1; /* not used by slurmd */
159
160 slurm_msg_t_init(&msg);
161 msg.msg_type = REQUEST_TERMINATE_TASKS;
162 msg.data = &rpc;
163
164 if (slurm_conf_get_addr(name, &msg.address, msg.flags)
165 == SLURM_ERROR) {
166 error("_terminate_batch_script_step: "
167 "can't find address for host %s, check slurm.conf",
168 name);
169 free(name);
170 return -1;
171 }
172 free(name);
173 i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 0);
174 if (i != 0)
175 rc = i;
176
177 return rc;
178 }
179
180 /*
181 * Send a REQUEST_TERMINATE_TASKS rpc to all nodes in a job step.
182 *
183 * RET Upon successful termination of the job step, 0 shall be returned.
184 * Otherwise, -1 shall be returned and errno set to indicate the error.
185 */
_terminate_job_step(const job_step_info_t * step,const resource_allocation_response_msg_t * allocation)186 static int _terminate_job_step(const job_step_info_t *step,
187 const resource_allocation_response_msg_t *
188 allocation)
189 {
190 signal_tasks_msg_t rpc;
191 int rc = SLURM_SUCCESS;
192
193 /*
194 * Send REQUEST_TERMINATE_TASKS to all nodes of the step
195 */
196 memset(&rpc, 0, sizeof(rpc));
197 rpc.job_id = step->job_id;
198 rpc.job_step_id = step->step_id;
199 rpc.signal = (uint16_t)-1; /* not used by slurmd */
200 rc = _local_send_recv_rc_msgs(allocation->node_list,
201 REQUEST_TERMINATE_TASKS, &rpc);
202 if ((rc == -1) && (errno == ESLURM_ALREADY_DONE)) {
203 rc = 0;
204 errno = 0;
205 }
206
207 return rc;
208 }
209
210 /*
211 * slurm_signal_job - send the specified signal to all steps of an existing job
212 * IN job_id - the job's id
213 * IN signal - signal number
214 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
215 */
216 extern int
slurm_signal_job(uint32_t job_id,uint16_t signal)217 slurm_signal_job (uint32_t job_id, uint16_t signal)
218 {
219 int rc = SLURM_SUCCESS;
220 resource_allocation_response_msg_t *alloc_info = NULL;
221 signal_tasks_msg_t rpc;
222
223 if (slurm_allocation_lookup(job_id, &alloc_info)) {
224 rc = slurm_get_errno();
225 goto fail1;
226 }
227
228 /* same remote procedure call for each node */
229 memset(&rpc, 0, sizeof(rpc));
230 rpc.job_id = job_id;
231 rpc.signal = signal;
232 rpc.flags = KILL_STEPS_ONLY;
233
234 rc = _local_send_recv_rc_msgs(alloc_info->node_list,
235 REQUEST_SIGNAL_TASKS, &rpc);
236 slurm_free_resource_allocation_response_msg(alloc_info);
237 fail1:
238 if (rc) {
239 slurm_seterrno_ret(rc);
240 } else {
241 return SLURM_SUCCESS;
242 }
243 }
244
245 /*
246 * slurm_signal_job_step - send the specified signal to an existing job step
247 * IN job_id - the job's id
248 * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
249 * to send a signal to a job's batch script
250 * IN signal - signal number
251 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
252 */
253 extern int
slurm_signal_job_step(uint32_t job_id,uint32_t step_id,uint32_t signal)254 slurm_signal_job_step (uint32_t job_id, uint32_t step_id, uint32_t signal)
255 {
256 resource_allocation_response_msg_t *alloc_info = NULL;
257 job_step_info_response_msg_t *step_info = NULL;
258 int rc;
259 int i;
260 int save_errno = 0;
261
262 if (slurm_allocation_lookup(job_id, &alloc_info)) {
263 return -1;
264 }
265
266 /*
267 * The controller won't give us info about the batch script job step,
268 * so we need to handle that seperately.
269 */
270 if (step_id == SLURM_BATCH_SCRIPT) {
271 rc = _signal_batch_script_step(alloc_info, signal);
272 slurm_free_resource_allocation_response_msg(alloc_info);
273 errno = rc;
274 return rc ? -1 : 0;
275 }
276
277 /*
278 * Otherwise, look through the list of job step info and find
279 * the one matching step_id. Signal that step.
280 */
281 rc = slurm_get_job_steps((time_t)0, job_id, step_id,
282 &step_info, SHOW_ALL);
283 if (rc != 0) {
284 save_errno = errno;
285 goto fail;
286 }
287 for (i = 0; i < step_info->job_step_count; i++) {
288 if ((step_info->job_steps[i].job_id == job_id) &&
289 (step_info->job_steps[i].step_id == step_id)) {
290 rc = _signal_job_step(&step_info->job_steps[i],
291 alloc_info, signal);
292 save_errno = rc;
293 break;
294 }
295 }
296 slurm_free_job_step_info_response_msg(step_info);
297 fail:
298 slurm_free_resource_allocation_response_msg(alloc_info);
299 errno = save_errno;
300 return rc ? -1 : 0;
301 }
302
303 /*
304 * slurm_terminate_job_step - terminates a job step by sending a
305 * REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step.
306 * IN job_id - the job's id
307 * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id
308 * to terminate a job's batch script
309 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
310 */
311 extern int
slurm_terminate_job_step(uint32_t job_id,uint32_t step_id)312 slurm_terminate_job_step (uint32_t job_id, uint32_t step_id)
313 {
314 resource_allocation_response_msg_t *alloc_info = NULL;
315 job_step_info_response_msg_t *step_info = NULL;
316 int rc = 0;
317 int i;
318 int save_errno = 0;
319
320 if (slurm_allocation_lookup(job_id, &alloc_info)) {
321 return -1;
322 }
323
324 /*
325 * The controller won't give us info about the batch script job step,
326 * so we need to handle that seperately.
327 */
328 if (step_id == SLURM_BATCH_SCRIPT) {
329 rc = _terminate_batch_script_step(alloc_info);
330 slurm_free_resource_allocation_response_msg(alloc_info);
331 errno = rc;
332 return rc ? -1 : 0;
333 }
334
335 /*
336 * Otherwise, look through the list of job step info and find
337 * the one matching step_id. Terminate that step.
338 */
339 rc = slurm_get_job_steps((time_t)0, job_id, step_id,
340 &step_info, SHOW_ALL);
341 if (rc != 0) {
342 save_errno = errno;
343 goto fail;
344 }
345 for (i = 0; i < step_info->job_step_count; i++) {
346 if ((step_info->job_steps[i].job_id == job_id) &&
347 (step_info->job_steps[i].step_id == step_id)) {
348 rc = _terminate_job_step(&step_info->job_steps[i],
349 alloc_info);
350 save_errno = errno;
351 break;
352 }
353 }
354 slurm_free_job_step_info_response_msg(step_info);
355 fail:
356 slurm_free_resource_allocation_response_msg(alloc_info);
357 errno = save_errno;
358 return rc ? -1 : 0;
359 }
360
361 /*
362 * slurm_notify_job - send message to the job's stdout,
363 * usable only by user root
364 * IN job_id - slurm job_id or 0 for all jobs
365 * IN message - arbitrary message
366 * RET 0 or -1 on error
367 */
slurm_notify_job(uint32_t job_id,char * message)368 extern int slurm_notify_job (uint32_t job_id, char *message)
369 {
370 int rc;
371 slurm_msg_t msg;
372 job_notify_msg_t req;
373
374 slurm_msg_t_init(&msg);
375 /*
376 * Request message:
377 */
378 memset(&req, 0, sizeof(req));
379 req.job_id = job_id;
380 req.job_step_id = NO_VAL; /* currently not used */
381 req.message = message;
382 msg.msg_type = REQUEST_JOB_NOTIFY;
383 msg.data = &req;
384
385 if (slurm_send_recv_controller_rc_msg(&msg, &rc,
386 working_cluster_rec) < 0)
387 return SLURM_ERROR;
388
389 if (rc) {
390 slurm_seterrno_ret(rc);
391 return SLURM_ERROR;
392 }
393
394 return SLURM_SUCCESS;
395 }
396