1 /*****************************************************************************\
2 * trigger_mgr.c - Event trigger management
3 *****************************************************************************
4 * Copyright (C) 2007 The Regents of the University of California.
5 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
6 * Portions Copyright (C) 2010-2016 SchedMD <https://www.schedmd.com>.
7 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8 * Written by Morris Jette <jette1@llnl.gov> et. al.
9 * CODE-OCEC-09-009. All rights reserved.
10 *
11 * This file is part of Slurm, a resource management program.
12 * For details, see <https://slurm.schedmd.com/>.
13 * Please also read the included file: DISCLAIMER.
14 *
15 * Slurm is free software; you can redistribute it and/or modify it under
16 * the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * In addition, as a special exception, the copyright holders give permission
21 * to link the code of portions of this program with the OpenSSL library under
22 * certain conditions as described in each individual source file, and
23 * distribute linked combinations including the two. You must obey the GNU
24 * General Public License in all respects for all of the code used other than
25 * OpenSSL. If you modify file(s) with this exception, you may extend this
26 * exception to your version of the file(s), but you are not obligated to do
27 * so. If you do not wish to do so, delete this exception statement from your
28 * version. If you delete this exception statement from all source files in
29 * the program, then also delete it here.
30 *
31 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
32 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
33 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
34 * details.
35 *
36 * You should have received a copy of the GNU General Public License along
37 * with Slurm; if not, write to the Free Software Foundation, Inc.,
38 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
39 \*****************************************************************************/
40
41 #include "config.h"
42
43 #include <ctype.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <grp.h>
47 #include <pthread.h>
48 #include <signal.h>
49 #include <stdlib.h>
50 #include <sys/stat.h>
51 #include <sys/types.h>
52
53 #include "src/common/bitstring.h"
54 #include "src/common/fd.h"
55 #include "src/common/list.h"
56 #include "src/common/slurmdbd_defs.h"
57 #include "src/common/slurm_protocol_defs.h"
58 #include "src/common/uid.h"
59 #include "src/common/xmalloc.h"
60 #include "src/common/xstring.h"
61
62 #include "src/slurmctld/locks.h"
63 #include "src/slurmctld/slurmctld.h"
64 #include "src/slurmctld/state_save.h"
65 #include "src/slurmctld/trigger_mgr.h"
66
67 #define MAX_PROG_TIME 300 /* maximum run time for program */
68
69 /* Change TRIGGER_STATE_VERSION value when changing the state save format */
70 #define TRIGGER_STATE_VERSION "PROTOCOL_VERSION"
71
72 List trigger_list;
73 uint32_t next_trigger_id = 1;
74 static pthread_mutex_t trigger_mutex = PTHREAD_MUTEX_INITIALIZER;
75 bitstr_t *trigger_down_front_end_bitmap = NULL;
76 bitstr_t *trigger_up_front_end_bitmap = NULL;
77 bitstr_t *trigger_down_nodes_bitmap = NULL;
78 bitstr_t *trigger_drained_nodes_bitmap = NULL;
79 bitstr_t *trigger_fail_nodes_bitmap = NULL;
80 bitstr_t *trigger_up_nodes_bitmap = NULL;
81 static bool trigger_bb_error = false;
82 static bool trigger_node_reconfig = false;
83 static bool trigger_pri_ctld_fail = false;
84 static bool trigger_pri_ctld_res_op = false;
85 static bool trigger_pri_ctld_res_ctrl = false;
86 static bool trigger_pri_ctld_acct_buffer_full = false;
87 static bool trigger_bu_ctld_fail = false;
88 static bool trigger_bu_ctld_res_op = false;
89 static bool trigger_bu_ctld_as_ctrl = false;
90 static bool trigger_pri_dbd_fail = false;
91 static bool trigger_pri_dbd_res_op = false;
92 static bool trigger_pri_db_fail = false;
93 static bool trigger_pri_db_res_op = false;
94
95 /* Current trigger pull states (saved and restored) */
96 uint8_t ctld_failure = 0;
97 uint8_t bu_ctld_failure = 0;
98 uint8_t db_failure = 0;
99 uint8_t dbd_failure = 0;
100
101 typedef struct trig_mgr_info {
102 uint32_t child_pid; /* pid of child process */
103 uint16_t flags; /* TRIGGER_FLAG_* */
104 uint32_t trig_id; /* trigger ID */
105 uint16_t res_type; /* TRIGGER_RES_TYPE_* */
106 char * res_id; /* node name or job_id (string) */
107 bitstr_t *nodes_bitmap; /* bitmap of requested nodes (if applicable) */
108 uint32_t job_id; /* job ID (if applicable) */
109 job_record_t *job_ptr; /* pointer to job record (if applicable) */
110 uint32_t trig_type; /* TRIGGER_TYPE_* */
111 time_t trig_time; /* offset (pending) or time stamp (complete) */
112 uint32_t user_id; /* user requesting trigger */
113 uint32_t group_id; /* user's group id */
114 char * program; /* program to execute */
115 uint8_t state; /* 0=pending, 1=pulled, 2=completed */
116
117 /* The orig_ fields are used to save and clone the orignal values */
118 bitstr_t *orig_bitmap; /* bitmap of requested nodes (if applicable) */
119 char * orig_res_id; /* original node name or job_id (string) */
120 time_t orig_time; /* offset (pending) or time stamp (complete) */
121 } trig_mgr_info_t;
122
123 /* Prototype for ListDelF */
_trig_del(void * x)124 void _trig_del(void *x) {
125 trig_mgr_info_t * tmp = (trig_mgr_info_t *) x;
126 xfree(tmp->res_id);
127 xfree(tmp->orig_res_id);
128 xfree(tmp->program);
129 FREE_NULL_BITMAP(tmp->nodes_bitmap);
130 FREE_NULL_BITMAP(tmp->orig_bitmap);
131 xfree(tmp);
132 }
133
/*
 * Remove the 0x8000 bias from an unsigned 16-bit trigger offset.
 * offset IN - biased offset value
 * RET signed offset in the range [-32768, 32767]
 */
static int _trig_offset(uint16_t offset)
{
	/* Plain expression, no static storage: reentrant and thread-safe */
	return (int) offset - 0x8000;
}
141
/*
 * Log the contents of a trigger message when DEBUG_FLAG_TRIGGERS is set.
 * header IN - text logged ahead of the records
 * msg IN - message to dump; NULL or an empty message is reported as having
 *	no entries
 */
static void _dump_trigger_msg(char *header, trigger_info_msg_t *msg)
{
	uint32_t i;	/* unsigned: printed with %u and compared to the count */
	trigger_info_t *trig;

	if ((slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) == 0)
		return;

	info("%s", header);
	if ((msg == NULL) || (msg->record_count == 0)) {
		info("Trigger has no entries");
		return;
	}

	info("INDEX TRIG_ID RES_TYPE RES_ID TRIG_TYPE OFFSET UID PROGRAM");
	for (i = 0; i < msg->record_count; i++) {
		trig = &msg->trigger_array[i];
		info("trigger[%u] %u %s %s %s %d %u %s", i,
		     trig->trig_id,
		     trigger_res_type(trig->res_type),
		     trig->res_id,
		     trigger_type(trig->trig_type),
		     _trig_offset(trig->offset),
		     trig->user_id,
		     trig->program);
	}
}
167
168 /* Validate trigger program */
_validate_trigger(trig_mgr_info_t * trig_in)169 static bool _validate_trigger(trig_mgr_info_t *trig_in)
170 {
171 struct stat buf;
172 int i, modes;
173 char *program = xstrdup(trig_in->program);
174
175 for (i = 0; program[i]; i++) {
176 if (isspace(program[i])) {
177 program[i] = '\0';
178 break;
179 }
180 }
181
182 if (stat(program, &buf) != 0) {
183 info("trigger program %s not found", trig_in->program);
184 xfree(program);
185 return false;
186 }
187 xfree(program);
188
189 if (!S_ISREG(buf.st_mode)) {
190 info("trigger program %s not a regular file", trig_in->program);
191 return false;
192 }
193 if (buf.st_uid == trig_in->user_id)
194 modes = (buf.st_mode >> 6) & 07;
195 else if (buf.st_gid == trig_in->group_id)
196 modes = (buf.st_mode >> 3) & 07;
197 else
198 modes = buf.st_mode & 07;
199 if (modes & 01)
200 return true;
201
202 info("trigger program %s not executable", trig_in->program);
203 return false;
204 }
205
206
/*
 * Raise the event named by a pull request for every registered trigger with
 * the same resource type and trigger type.
 * msg IN - request holding exactly one trigger description
 * RET SLURM_SUCCESS, ESRCH if the record count is not one, or EINVAL if the
 *	resource type is not slurmctld/slurmdbd/database or a matching trigger
 *	has a type that cannot be pulled
 */
extern int trigger_pull(trigger_info_msg_t *msg)
{
	int rc = SLURM_SUCCESS;
	ListIterator trig_iter;
	trigger_info_t *trig_in;
	trig_mgr_info_t *trig_test;

	/*
	 * Create the list under trigger_mutex like the other entry points.
	 * The mutex is released before the loop because each trigger_*()
	 * call below takes it again.
	 */
	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);
	slurm_mutex_unlock(&trigger_mutex);

	/* validate the request, designated trigger must be set */
	_dump_trigger_msg("trigger_pull", msg);
	if (msg->record_count != 1)
		return ESRCH;
	trig_in = msg->trigger_array;

	if ((trig_in->res_type != TRIGGER_RES_TYPE_SLURMCTLD) &&
	    (trig_in->res_type != TRIGGER_RES_TYPE_SLURMDBD) &&
	    (trig_in->res_type != TRIGGER_RES_TYPE_DATABASE)) {
		return EINVAL;
	}

	/* now look for a valid request */
	trig_iter = list_iterator_create(trigger_list);
	while ((trig_test = list_next(trig_iter))) {
		if ((trig_test->res_type != trig_in->res_type) ||
		    (trig_test->trig_type != trig_in->trig_type))
			continue;

		switch (trig_test->trig_type) {
		case TRIGGER_TYPE_PRI_CTLD_ACCT_FULL:
			trigger_primary_ctld_acct_full();
			break;
		case TRIGGER_TYPE_BU_CTLD_FAIL:
			trigger_backup_ctld_fail(trig_in->control_inx);
			break;
		case TRIGGER_TYPE_BU_CTLD_RES_OP:
			trigger_backup_ctld_res_op(trig_in->control_inx);
			break;
		case TRIGGER_TYPE_BU_CTLD_AS_CTRL:
			trigger_backup_ctld_as_ctrl();
			break;
		case TRIGGER_TYPE_PRI_DBD_FAIL:
			trigger_primary_dbd_fail();
			break;
		case TRIGGER_TYPE_PRI_DBD_RES_OP:
			trigger_primary_dbd_res_op();
			break;
		case TRIGGER_TYPE_PRI_DB_FAIL:
			trigger_primary_db_fail();
			break;
		case TRIGGER_TYPE_PRI_DB_RES_OP:
			trigger_primary_db_res_op();
			break;
		default:
			error("trigger_pull call has invalid type: %u",
			      trig_test->trig_type);
			rc = EINVAL;
			break;
		}
	}
	list_iterator_destroy(trig_iter);

	return rc;
}
272
/*
 * Delete the registered triggers selected by a request.
 * uid IN - requesting user; only uid 0 may delete another user's triggers
 * msg IN - request holding exactly one trigger description. Selection is by
 *	job ID (for job triggers), trigger ID and/or user ID
 * RET SLURM_SUCCESS if at least one trigger was deleted, ESRCH if none
 *	matched or the record count is not one, ESLURM_INVALID_JOB_ID,
 *	EINVAL, or ESLURM_ACCESS_DENIED if the only matches belong to
 *	another user
 */
extern int trigger_clear(uid_t uid, trigger_info_msg_t *msg)
{
	int rc = ESRCH;
	ListIterator trig_iter;
	trigger_info_t *trig_in;
	trig_mgr_info_t *trig_test;
	uint32_t job_id = 0;

	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);

	/* validate the request, need a job_id and/or trigger_id */
	_dump_trigger_msg("trigger_clear", msg);
	if (msg->record_count != 1)
		goto fini;
	trig_in = msg->trigger_array;
	if (trig_in->res_type == TRIGGER_RES_TYPE_JOB) {
		/* A missing res_id is an invalid job ID, not a crash */
		if (trig_in->res_id)
			job_id = (uint32_t) atol(trig_in->res_id);
		if (job_id == 0) {
			rc = ESLURM_INVALID_JOB_ID;
			goto fini;
		}
	} else if ((trig_in->trig_id == 0) && (trig_in->user_id == NO_VAL)) {
		rc = EINVAL;
		goto fini;
	}

	/* now look for a valid request, matching uid */
	trig_iter = list_iterator_create(trigger_list);
	while ((trig_test = list_next(trig_iter))) {
		if (trig_in->trig_id &&
		    (trig_in->trig_id != trig_test->trig_id))
			continue;
		if (job_id && (job_id != trig_test->job_id))
			continue;
		if ((trig_in->user_id != NO_VAL) &&
		    (trig_in->user_id != trig_test->user_id))
			continue;
		if (trig_test->state == 2)	/* wait for proc termination */
			continue;
		if ((trig_test->user_id != (uint32_t) uid) && (uid != 0)) {
			rc = ESLURM_ACCESS_DENIED;
			continue;
		}
		list_delete_item(trig_iter);
		rc = SLURM_SUCCESS;
	}
	list_iterator_destroy(trig_iter);
	schedule_trigger_save();

fini:	slurm_mutex_unlock(&trigger_mutex);
	return rc;
}
327
/*
 * Build a message describing the registered triggers.
 * uid IN - not used by this function
 * msg IN - not used by this function
 * RET newly allocated message. Triggers that are no longer pending
 *	(state >= 1) are left out unless they carry TRIGGER_FLAG_PERM
 */
extern trigger_info_msg_t *trigger_get(uid_t uid, trigger_info_msg_t *msg)
{
	trigger_info_msg_t *resp_data;
	ListIterator trig_iter;
	trigger_info_t *trig_out;
	trig_mgr_info_t *trig_in;
	int recs_written = 0;

	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);

	_dump_trigger_msg("trigger_get", NULL);
	resp_data = xmalloc(sizeof(trigger_info_msg_t));
	/* Sized for every record; the count is corrected after filtering */
	resp_data->record_count = list_count(trigger_list);
	resp_data->trigger_array = xcalloc(resp_data->record_count,
					   sizeof(trigger_info_t));
	trig_iter = list_iterator_create(trigger_list);
	trig_out = resp_data->trigger_array;
	while ((trig_in = list_next(trig_iter))) {
		/* Note: Filtering currently done by strigger */
		/*
		 * Test the stored record's flags. The output slot is still
		 * zero-filled here, so testing it hid every permanent
		 * trigger that had been pulled.
		 */
		if ((trig_in->state >= 1) &&
		    ((trig_in->flags & TRIGGER_FLAG_PERM) == 0))
			continue;	/* no longer pending */
		trig_out->flags = trig_in->flags;
		trig_out->trig_id = trig_in->trig_id;
		trig_out->res_type = trig_in->res_type;
		trig_out->res_id = xstrdup(trig_in->res_id);
		trig_out->trig_type = trig_in->trig_type;
		trig_out->offset = trig_in->trig_time;
		trig_out->user_id = trig_in->user_id;
		trig_out->program = xstrdup(trig_in->program);
		trig_out++;
		recs_written++;
	}
	list_iterator_destroy(trig_iter);
	slurm_mutex_unlock(&trigger_mutex);
	resp_data->record_count = recs_written;

	_dump_trigger_msg("trigger_got", resp_data);
	return resp_data;
}
370
/*
 * Test whether trigger_list already holds a trigger equal to the request.
 * trig_desc IN - requested trigger
 * RET true if some record has the same flags, resource type, trigger type,
 *	offset, user ID, program and resource ID
 */
static bool _duplicate_trigger(trigger_info_t *trig_desc)
{
	ListIterator iter = list_iterator_create(trigger_list);
	trig_mgr_info_t *existing;

	while ((existing = list_next(iter))) {
		if (trig_desc->flags != existing->flags)
			continue;
		if (trig_desc->res_type != existing->res_type)
			continue;
		if (trig_desc->trig_type != existing->trig_type)
			continue;
		if (trig_desc->offset != existing->trig_time)
			continue;
		if (trig_desc->user_id != existing->user_id)
			continue;
		if (xstrcmp(trig_desc->program, existing->program))
			continue;
		if (xstrcmp(trig_desc->res_id, existing->res_id))
			continue;
		break;	/* every field matched */
	}
	list_iterator_destroy(iter);

	/* The pointer is NULL only when the list ran out without a match */
	return (existing != NULL);
}
393
/*
 * Register the triggers described by a request.
 * uid IN - requesting user, recorded as the owner of each trigger
 * gid IN - requesting user's group, recorded with each trigger
 * msg IN/OUT - requested triggers. For each accepted record trig_id and
 *	user_id are filled in and the res_id and program strings are moved
 *	into the stored record (the message pointers are set to NULL)
 * RET SLURM_SUCCESS, or the error from the last record that was rejected
 *	(records after a rejected one are still processed)
 */
extern int trigger_set(uid_t uid, gid_t gid, trigger_info_msg_t *msg)
{
	uint32_t i;
	int rc = SLURM_SUCCESS;
	uint32_t job_id;
	bitstr_t *bitmap = NULL;
	trigger_info_t *trig_in;
	trig_mgr_info_t *trig_add;
	job_record_t *job_ptr;
	/* Read config and job info */
	slurmctld_lock_t job_read_lock =
		{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };

	lock_slurmctld(job_read_lock);
	slurm_mutex_lock(&trigger_mutex);

	if ((slurmctld_conf.slurm_user_id != 0) &&
	    (slurmctld_conf.slurm_user_id != uid)) {
		/* If SlurmUser is not root, then it is unable to set the
		 * appropriate user id and group id for the program to be
		 * launched. To prevent the launched program for an arbitrary
		 * user being executed as user SlurmUser, disable all other
		 * users from setting triggers. */
		info("Attempt to set trigger by uid %u != SlurmUser", uid);
		rc = ESLURM_ACCESS_DENIED;
		goto fini;
	}

	if (trigger_list == NULL) {
		trigger_list = list_create(_trig_del);
	} else if ((uid != 0) &&
		   (list_count(trigger_list) >= slurmctld_conf.max_job_cnt)) {
		rc = EAGAIN;
		goto fini;
	}

	_dump_trigger_msg("trigger_set", msg);
	for (i = 0; i < msg->record_count; i++) {
		trig_in = &msg->trigger_array[i];

		if (trig_in->res_type == TRIGGER_RES_TYPE_JOB) {
			/* A job trigger without a job ID string is invalid */
			if (trig_in->res_id == NULL) {
				rc = ESLURM_INVALID_JOB_ID;
				continue;
			}
			job_id = (uint32_t) atol(trig_in->res_id);
			job_ptr = find_job_record(job_id);
			if (job_ptr == NULL) {
				rc = ESLURM_INVALID_JOB_ID;
				continue;
			}
			if (IS_JOB_FINISHED(job_ptr)) {
				rc = ESLURM_ALREADY_DONE;
				continue;
			}
		} else {
			job_id = 0;
			job_ptr = NULL;
			if ((trig_in->res_id != NULL) &&
			    (trig_in->res_id[0] != '*') &&
			    (node_name2bitmap(trig_in->res_id, false,
					      &bitmap) != 0)) {
				FREE_NULL_BITMAP(bitmap);
				rc = ESLURM_INVALID_NODE_NAME;
				continue;
			}
		}

		trig_in->user_id = (uint32_t) uid;
		if (_duplicate_trigger(trig_in)) {
			FREE_NULL_BITMAP(bitmap);
			rc = ESLURM_TRIGGER_DUP;
			continue;
		}

		trig_add = xmalloc(sizeof(trig_mgr_info_t));
		trig_in->trig_id = next_trigger_id;
		trig_add->trig_id = next_trigger_id;
		next_trigger_id++;
		trig_add->flags = trig_in->flags;
		trig_add->res_type = trig_in->res_type;
		if (bitmap) {
			trig_add->nodes_bitmap = bitmap;
			trig_add->orig_bitmap = bit_copy(bitmap);
			bitmap = NULL;	/* moved */
		}
		trig_add->job_id = job_id;
		trig_add->job_ptr = job_ptr;
		if (trig_in->res_id) {
			trig_add->res_id = trig_in->res_id;
			trig_add->orig_res_id = xstrdup(trig_add->res_id);
			trig_in->res_id = NULL;	/* moved */
		}
		trig_add->trig_type = trig_in->trig_type;
		trig_add->trig_time = trig_in->offset;
		trig_add->orig_time = trig_in->offset;
		trig_add->user_id = trig_in->user_id;
		trig_add->group_id = (uint32_t) gid;
		/* move don't copy "program" */
		trig_add->program = trig_in->program;
		trig_in->program = NULL;

		if (!_validate_trigger(trig_add)) {
			rc = ESLURM_ACCESS_DENIED;
			/*
			 * Release the record through the list destructor so
			 * that orig_res_id is freed together with the other
			 * members instead of being leaked.
			 */
			_trig_del(trig_add);
			continue;
		}
		list_append(trigger_list, trig_add);
		schedule_trigger_save();
	}

fini:	slurm_mutex_unlock(&trigger_mutex);
	unlock_slurmctld(job_read_lock);
	return rc;
}
505
/*
 * Set the bit for a front-end record in trigger_down_front_end_bitmap,
 * allocating the bitmap on first use.
 * front_end_ptr IN - record within the front_end_nodes table
 */
extern void trigger_front_end_down(front_end_record_t *front_end_ptr)
{
	int fe_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the table */
	fe_inx = front_end_ptr - front_end_nodes;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_down_front_end_bitmap)
		trigger_down_front_end_bitmap = bit_alloc(front_end_node_cnt);
	bit_set(trigger_down_front_end_bitmap, fe_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
518
/*
 * Set the bit for a front-end record in trigger_up_front_end_bitmap,
 * allocating the bitmap on first use.
 * front_end_ptr IN - record within the front_end_nodes table
 */
extern void trigger_front_end_up(front_end_record_t *front_end_ptr)
{
	int fe_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the table */
	fe_inx = front_end_ptr - front_end_nodes;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_up_front_end_bitmap)
		trigger_up_front_end_bitmap = bit_alloc(front_end_node_cnt);
	bit_set(trigger_up_front_end_bitmap, fe_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
531
/*
 * Set the bit for a node record in trigger_down_nodes_bitmap, allocating
 * the bitmap on first use.
 * node_ptr IN - record within node_record_table_ptr
 */
extern void trigger_node_down(node_record_t *node_ptr)
{
	int node_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the node table */
	node_inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_down_nodes_bitmap)
		trigger_down_nodes_bitmap = bit_alloc(node_record_count);
	bit_set(trigger_down_nodes_bitmap, node_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
544
/*
 * Set the bit for a node record in trigger_drained_nodes_bitmap, allocating
 * the bitmap on first use.
 * node_ptr IN - record within node_record_table_ptr
 */
extern void trigger_node_drained(node_record_t *node_ptr)
{
	int node_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the node table */
	node_inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_drained_nodes_bitmap)
		trigger_drained_nodes_bitmap = bit_alloc(node_record_count);
	bit_set(trigger_drained_nodes_bitmap, node_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
557
/*
 * Set the bit for a node record in trigger_fail_nodes_bitmap, allocating
 * the bitmap on first use.
 * node_ptr IN - record within node_record_table_ptr
 */
extern void trigger_node_failing(node_record_t *node_ptr)
{
	int node_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the node table */
	node_inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_fail_nodes_bitmap)
		trigger_fail_nodes_bitmap = bit_alloc(node_record_count);
	bit_set(trigger_fail_nodes_bitmap, node_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
570
/*
 * Set the bit for a node record in trigger_up_nodes_bitmap, allocating the
 * bitmap on first use.
 * node_ptr IN - record within node_record_table_ptr
 */
extern void trigger_node_up(node_record_t *node_ptr)
{
	int node_inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* The bit position is the record's index in the node table */
	node_inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_up_nodes_bitmap)
		trigger_up_nodes_bitmap = bit_alloc(node_record_count);
	bit_set(trigger_up_nodes_bitmap, node_inx);
	slurm_mutex_unlock(&trigger_mutex);
}
583
trigger_reconfig(void)584 extern void trigger_reconfig(void)
585 {
586 slurmctld_lock_t node_read_lock = { .node = READ_LOCK };
587
588 lock_slurmctld(node_read_lock);
589 slurm_mutex_lock(&trigger_mutex);
590 trigger_node_reconfig = true;
591 if (trigger_down_front_end_bitmap)
592 trigger_down_front_end_bitmap = bit_realloc(
593 trigger_down_front_end_bitmap, node_record_count);
594 if (trigger_up_front_end_bitmap)
595 trigger_up_front_end_bitmap = bit_realloc(
596 trigger_up_front_end_bitmap, node_record_count);
597 if (trigger_down_nodes_bitmap)
598 trigger_down_nodes_bitmap = bit_realloc(
599 trigger_down_nodes_bitmap, node_record_count);
600 if (trigger_drained_nodes_bitmap)
601 trigger_drained_nodes_bitmap = bit_realloc(
602 trigger_drained_nodes_bitmap, node_record_count);
603 if (trigger_fail_nodes_bitmap)
604 trigger_fail_nodes_bitmap = bit_realloc(
605 trigger_fail_nodes_bitmap, node_record_count);
606 if (trigger_up_nodes_bitmap)
607 trigger_up_nodes_bitmap = bit_realloc(
608 trigger_up_nodes_bitmap, node_record_count);
609 slurm_mutex_unlock(&trigger_mutex);
610 unlock_slurmctld(node_read_lock);
611 }
612
trigger_primary_ctld_fail(void)613 extern void trigger_primary_ctld_fail(void)
614 {
615 slurm_mutex_lock(&trigger_mutex);
616 if (ctld_failure != 1) {
617 trigger_pri_ctld_fail = true;
618 ctld_failure = 1;
619 }
620 slurm_mutex_unlock(&trigger_mutex);
621 }
622
trigger_primary_ctld_res_op(void)623 extern void trigger_primary_ctld_res_op(void)
624 {
625 slurm_mutex_lock(&trigger_mutex);
626 trigger_pri_ctld_res_op = true;
627 ctld_failure = 0;
628 slurm_mutex_unlock(&trigger_mutex);
629 }
630
trigger_primary_ctld_res_ctrl(void)631 extern void trigger_primary_ctld_res_ctrl(void)
632 {
633 slurm_mutex_lock(&trigger_mutex);
634 trigger_pri_ctld_res_ctrl = true;
635 slurm_mutex_unlock(&trigger_mutex);
636 }
637
trigger_primary_ctld_acct_full(void)638 extern void trigger_primary_ctld_acct_full(void)
639 {
640 slurm_mutex_lock(&trigger_mutex);
641 trigger_pri_ctld_acct_buffer_full = true;
642 slurm_mutex_unlock(&trigger_mutex);
643 }
644
trigger_backup_ctld_fail(int index)645 extern void trigger_backup_ctld_fail(int index)
646 {
647 slurm_mutex_lock(&trigger_mutex);
648 if (bu_ctld_failure != 1) {
649 trigger_bu_ctld_fail = true;
650 bu_ctld_failure = 1;
651 }
652 slurm_mutex_unlock(&trigger_mutex);
653 }
654
trigger_backup_ctld_res_op(int index)655 extern void trigger_backup_ctld_res_op(int index)
656 {
657 slurm_mutex_lock(&trigger_mutex);
658 trigger_bu_ctld_res_op = true;
659 bu_ctld_failure = 0;
660 slurm_mutex_unlock(&trigger_mutex);
661 }
662
trigger_backup_ctld_as_ctrl(void)663 extern void trigger_backup_ctld_as_ctrl(void)
664 {
665 slurm_mutex_lock(&trigger_mutex);
666 trigger_bu_ctld_as_ctrl = true;
667 slurm_mutex_unlock(&trigger_mutex);
668 }
669
trigger_primary_dbd_fail(void)670 extern void trigger_primary_dbd_fail(void)
671 {
672 slurm_mutex_lock(&trigger_mutex);
673 if (dbd_failure != 1) {
674 trigger_pri_dbd_fail = true;
675 dbd_failure = 1;
676 }
677 slurm_mutex_unlock(&trigger_mutex);
678 }
679
trigger_primary_dbd_res_op(void)680 extern void trigger_primary_dbd_res_op(void)
681 {
682 slurm_mutex_lock(&trigger_mutex);
683 trigger_pri_dbd_res_op = true;
684 dbd_failure = 0;
685 slurm_mutex_unlock(&trigger_mutex);
686 }
687
trigger_primary_db_fail(void)688 extern void trigger_primary_db_fail(void)
689 {
690 slurm_mutex_lock(&trigger_mutex);
691 if (db_failure != 1) {
692 trigger_pri_db_fail = true;
693 db_failure = 1;
694 }
695 slurm_mutex_unlock(&trigger_mutex);
696 }
697
trigger_primary_db_res_op(void)698 extern void trigger_primary_db_res_op(void)
699 {
700 slurm_mutex_lock(&trigger_mutex);
701 trigger_pri_db_res_op = true;
702 db_failure = 0;
703 slurm_mutex_unlock(&trigger_mutex);
704 }
705
trigger_burst_buffer(void)706 extern void trigger_burst_buffer(void)
707 {
708 slurm_mutex_lock(&trigger_mutex);
709 trigger_bb_error = true;
710 slurm_mutex_unlock(&trigger_mutex);
711 }
712
/*
 * Pack one trigger record into the state save buffer.
 * trig_ptr IN - record to save
 * buffer IN/OUT - buffer being built
 * NOTE: The field order must stay in step with _load_trigger_state().
 */
static void _dump_trigger_state(trig_mgr_info_t *trig_ptr, Buf buffer)
{
	/* The four pull-state flags are written ahead of every record */
	pack8(ctld_failure, buffer);
	pack8(bu_ctld_failure, buffer);
	pack8(dbd_failure, buffer);
	pack8(db_failure, buffer);

	pack16(trig_ptr->flags, buffer);
	pack32(trig_ptr->trig_id, buffer);
	pack16(trig_ptr->res_type, buffer);
	/*
	 * The original resource ID is saved; it restores res_id too.
	 * nodes_bitmap, job_id and job_ptr are not saved and get rebuilt
	 * from res_id as needed.
	 */
	packstr(trig_ptr->orig_res_id, buffer);
	pack32(trig_ptr->trig_type, buffer);
	/* The original time is saved; it restores trig_time too */
	pack_time(trig_ptr->orig_time, buffer);
	pack32(trig_ptr->user_id, buffer);
	pack32(trig_ptr->group_id, buffer);
	packstr(trig_ptr->program, buffer);
	pack8(trig_ptr->state, buffer);
}
735
/*
 * Unpack one trigger record from a state save buffer and append it to
 * trigger_list.
 * buffer IN/OUT - buffer positioned at the start of a record
 * protocol_version IN - version the buffer was written with
 * RET SLURM_SUCCESS, or SLURM_ERROR if the record is truncated, has an
 *	unsupported version, or fails validation (the record is discarded)
 */
static int _load_trigger_state(Buf buffer, uint16_t protocol_version)
{
	trig_mgr_info_t *trig_ptr;
	uint32_t str_len;

	xassert(verify_lock(JOB_LOCK, READ_LOCK));

	trig_ptr = xmalloc(sizeof(trig_mgr_info_t));

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* restore trigger pull state flags */
		safe_unpack8(&ctld_failure, buffer);
		safe_unpack8(&bu_ctld_failure, buffer);
		safe_unpack8(&dbd_failure, buffer);
		safe_unpack8(&db_failure, buffer);

		safe_unpack16(&trig_ptr->flags, buffer);
		safe_unpack32(&trig_ptr->trig_id, buffer);
		safe_unpack16(&trig_ptr->res_type, buffer);
		safe_unpackstr_xmalloc(&trig_ptr->res_id, &str_len, buffer);
		/* rebuild nodes_bitmap as needed from res_id */
		/* rebuild job_id as needed from res_id */
		/* rebuild job_ptr as needed from res_id */
		safe_unpack32(&trig_ptr->trig_type, buffer);
		safe_unpack_time(&trig_ptr->trig_time, buffer);
		safe_unpack32(&trig_ptr->user_id, buffer);
		safe_unpack32(&trig_ptr->group_id, buffer);
		safe_unpackstr_xmalloc(&trig_ptr->program, &str_len, buffer);
		safe_unpack8(&trig_ptr->state, buffer);
	} else {
		error("_load_trigger_state: protocol_version "
		      "%hu not supported", protocol_version);
		goto unpack_error;
	}

	if ((trig_ptr->res_type < TRIGGER_RES_TYPE_JOB) ||
	    (trig_ptr->res_type > TRIGGER_RES_TYPE_OTHER) ||
	    (trig_ptr->state > 2))
		goto unpack_error;
	if (trig_ptr->res_type == TRIGGER_RES_TYPE_JOB) {
		/* A job record without a job ID string cannot be rebuilt */
		if (trig_ptr->res_id == NULL)
			goto unpack_error;
		trig_ptr->job_id = (uint32_t) atol(trig_ptr->res_id);
		trig_ptr->job_ptr = find_job_record(trig_ptr->job_id);
		if ((trig_ptr->job_id == 0) ||
		    (trig_ptr->job_ptr == NULL) ||
		    (IS_JOB_COMPLETED(trig_ptr->job_ptr) &&
		     trig_ptr->state != 2))
			goto unpack_error;
	} else if (trig_ptr->res_type == TRIGGER_RES_TYPE_NODE) {
		trig_ptr->job_id = 0;
		trig_ptr->job_ptr = NULL;
		if ((trig_ptr->res_id != NULL) &&
		    (trig_ptr->res_id[0] != '*') &&
		    (node_name2bitmap(trig_ptr->res_id, false,
				      &trig_ptr->nodes_bitmap) != 0))
			goto unpack_error;
	}

	/* The restored values double as the originals */
	if (trig_ptr->nodes_bitmap)
		trig_ptr->orig_bitmap = bit_copy(trig_ptr->nodes_bitmap);
	if (trig_ptr->res_id)
		trig_ptr->orig_res_id = xstrdup(trig_ptr->res_id);
	trig_ptr->orig_time = trig_ptr->trig_time;

	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);
	list_append(trigger_list, trig_ptr);
	/* Keep newly assigned IDs above every restored ID */
	next_trigger_id = MAX(next_trigger_id, trig_ptr->trig_id + 1);
	slurm_mutex_unlock(&trigger_mutex);

	return SLURM_SUCCESS;

unpack_error:
	error("Incomplete trigger record");
	xfree(trig_ptr->res_id);
	xfree(trig_ptr->program);
	FREE_NULL_BITMAP(trig_ptr->nodes_bitmap);
	xfree(trig_ptr);
	return SLURM_ERROR;
}
815
trigger_state_save(void)816 extern int trigger_state_save(void)
817 {
818 /* Save high-water mark to avoid buffer growth with copies */
819 static int high_buffer_size = (1024 * 1024);
820 int error_code = 0, log_fd;
821 char *old_file, *new_file, *reg_file;
822 Buf buffer = init_buf(high_buffer_size);
823 ListIterator trig_iter;
824 trig_mgr_info_t *trig_in;
825 /* Locks: Read config */
826 slurmctld_lock_t config_read_lock =
827 { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
828
829 /* write header: version, time */
830 packstr(TRIGGER_STATE_VERSION, buffer);
831 pack16(SLURM_PROTOCOL_VERSION, buffer);
832 pack_time(time(NULL), buffer);
833
834 /* write individual trigger records */
835 slurm_mutex_lock(&trigger_mutex);
836 if (trigger_list == NULL)
837 trigger_list = list_create(_trig_del);
838
839 trig_iter = list_iterator_create(trigger_list);
840 while ((trig_in = list_next(trig_iter)))
841 _dump_trigger_state(trig_in, buffer);
842 list_iterator_destroy(trig_iter);
843 slurm_mutex_unlock(&trigger_mutex);
844
845 /* write the buffer to file */
846 lock_slurmctld(config_read_lock);
847 old_file = xstrdup(slurmctld_conf.state_save_location);
848 xstrcat(old_file, "/trigger_state.old");
849 reg_file = xstrdup(slurmctld_conf.state_save_location);
850 xstrcat(reg_file, "/trigger_state");
851 new_file = xstrdup(slurmctld_conf.state_save_location);
852 xstrcat(new_file, "/trigger_state.new");
853 unlock_slurmctld(config_read_lock);
854
855 lock_state_files();
856 log_fd = creat(new_file, 0600);
857 if (log_fd < 0) {
858 error("Can't save state, create file %s error %m",
859 new_file);
860 error_code = errno;
861 } else {
862 int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
863 char *data = (char *)get_buf_data(buffer);
864 high_buffer_size = MAX(nwrite, high_buffer_size);
865 while (nwrite > 0) {
866 amount = write(log_fd, &data[pos], nwrite);
867 if ((amount < 0) && (errno != EINTR)) {
868 error("Error writing file %s, %m", new_file);
869 error_code = errno;
870 break;
871 }
872 nwrite -= amount;
873 pos += amount;
874 }
875
876 rc = fsync_and_close(log_fd, "trigger");
877 if (rc && !error_code)
878 error_code = rc;
879 }
880 if (error_code) {
881 (void) unlink(new_file);
882 } else { /* file shuffle */
883 (void) unlink(old_file);
884 if (link(reg_file, old_file)) {
885 debug4("unable to create link for %s -> %s: %m",
886 reg_file, old_file);
887 }
888 (void) unlink(reg_file);
889 if (link(new_file, reg_file)) {
890 debug4("unable to create link for %s -> %s: %m",
891 new_file, reg_file);
892 }
893 (void) unlink(new_file);
894 }
895 xfree(old_file);
896 xfree(reg_file);
897 xfree(new_file);
898 unlock_state_files();
899 free_buf(buffer);
900 return error_code;
901 }
902
903 /* Open the trigger state save file, or backup if necessary.
904 * state_file IN - the name of the state save file used
905 * RET the file description to read from or error code
906 */
_open_trigger_state_file(char ** state_file)907 static Buf _open_trigger_state_file(char **state_file)
908 {
909 Buf buf;
910
911 *state_file = xstrdup(slurmctld_conf.state_save_location);
912 xstrcat(*state_file, "/trigger_state");
913 if (!(buf = create_mmap_buf(*state_file)))
914 error("Could not open trigger state file %s: %m",
915 *state_file);
916 else
917 return buf;
918
919 error("NOTE: Trying backup state save file. Triggers may be lost!");
920 xstrcat(*state_file, ".old");
921 return create_mmap_buf(*state_file);;
922 }
923
trigger_state_restore(void)924 extern void trigger_state_restore(void)
925 {
926 uint16_t protocol_version = NO_VAL16;
927 int trigger_cnt = 0;
928 char *state_file;
929 Buf buffer;
930 time_t buf_time;
931 char *ver_str = NULL;
932 uint32_t ver_str_len;
933
934 /* read the file */
935 xassert(verify_lock(CONF_LOCK, READ_LOCK));
936
937 lock_state_files();
938 if (!(buffer = _open_trigger_state_file(&state_file))) {
939 info("No trigger state file (%s) to recover", state_file);
940 xfree(state_file);
941 unlock_state_files();
942 return;
943 }
944 xfree(state_file);
945 unlock_state_files();
946
947 safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
948 if (ver_str && !xstrcmp(ver_str, TRIGGER_STATE_VERSION))
949 safe_unpack16(&protocol_version, buffer);
950
951 if (protocol_version == NO_VAL16) {
952 if (!ignore_state_errors)
953 fatal("Can't recover trigger state, data version incompatible, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
954 error("Can't recover trigger state, data version "
955 "incompatible");
956 xfree(ver_str);
957 free_buf(buffer);
958 return;
959 }
960 xfree(ver_str);
961
962 safe_unpack_time(&buf_time, buffer);
963 if (trigger_list)
964 list_flush(trigger_list);
965 while (remaining_buf(buffer) > 0) {
966 if (_load_trigger_state(buffer, protocol_version) !=
967 SLURM_SUCCESS)
968 goto unpack_error;
969 trigger_cnt++;
970 }
971 goto fini;
972
973 unpack_error:
974 if (!ignore_state_errors)
975 fatal("Incomplete trigger data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
976 error("Incomplete trigger data checkpoint file");
977 fini: verbose("State of %d triggers recovered", trigger_cnt);
978 free_buf(buffer);
979 }
980
/*
 * Test whether a job's batch host is one of the front end nodes flagged in
 * front_end_bitmap (bit i corresponds to front_end_nodes[i]).
 * front_end_bitmap IN - front end nodes of interest, may be NULL
 * job_ptr IN - job whose batch_host is compared by name
 * RET true on a match; always false when built without HAVE_FRONT_END
 */
static bool _front_end_job_test(bitstr_t *front_end_bitmap,
				job_record_t *job_ptr)
{
#ifdef HAVE_FRONT_END
	int inx;

	/* Need node read lock for reading front_end_node_cnt. */
	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	if (!front_end_bitmap || !job_ptr->batch_host)
		return false;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		if (!bit_test(front_end_bitmap, inx))
			continue;
		if (xstrcmp(front_end_nodes[inx].name,
			    job_ptr->batch_host) == 0)
			return true;
	}
#endif
	return false;
}
1002
1003 /* Test if the event has been triggered, change trigger state as needed */
_trigger_job_event(trig_mgr_info_t * trig_in,time_t now)1004 static void _trigger_job_event(trig_mgr_info_t *trig_in, time_t now)
1005 {
1006 xassert(verify_lock(JOB_LOCK, READ_LOCK));
1007
1008 trig_in->job_ptr = find_job_record(trig_in->job_id);
1009
1010 if ((trig_in->trig_type & TRIGGER_TYPE_FINI) &&
1011 ((trig_in->job_ptr == NULL) ||
1012 (IS_JOB_COMPLETED(trig_in->job_ptr)))) {
1013 trig_in->state = 1;
1014 trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1015 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1016 info("trigger[%u] event for job %u fini",
1017 trig_in->trig_id, trig_in->job_id);
1018 }
1019 return;
1020 }
1021
1022 if (trig_in->job_ptr == NULL) {
1023 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1024 info("trigger[%u] for defunct job %u",
1025 trig_in->trig_id, trig_in->job_id);
1026 }
1027 trig_in->state = 2;
1028 trig_in->trig_time = now;
1029 return;
1030 }
1031
1032 if (!IS_JOB_PENDING(trig_in->job_ptr) &&
1033 (trig_in->trig_type & TRIGGER_TYPE_TIME)) {
1034 long rem_time = (trig_in->job_ptr->end_time - now);
1035 if (rem_time <= (0x8000 - trig_in->trig_time)) {
1036 trig_in->state = 1;
1037 trig_in->trig_time = now;
1038 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1039 info("trigger[%u] for job %u time",
1040 trig_in->trig_id, trig_in->job_id);
1041 }
1042 return;
1043 }
1044 }
1045
1046 if (trig_in->trig_type & TRIGGER_TYPE_DOWN) {
1047 if (_front_end_job_test(trigger_down_front_end_bitmap,
1048 trig_in->job_ptr)) {
1049 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1050 info("trigger[%u] for job %u down",
1051 trig_in->trig_id, trig_in->job_id);
1052 }
1053 trig_in->state = 1;
1054 trig_in->trig_time = now +
1055 (trig_in->trig_time - 0x8000);
1056 return;
1057 }
1058 }
1059
1060 if (trig_in->trig_type & TRIGGER_TYPE_DOWN) {
1061 if (trigger_down_nodes_bitmap &&
1062 bit_overlap_any(trig_in->job_ptr->node_bitmap,
1063 trigger_down_nodes_bitmap)) {
1064 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1065 info("trigger[%u] for job %u down",
1066 trig_in->trig_id, trig_in->job_id);
1067 }
1068 trig_in->state = 1;
1069 trig_in->trig_time = now +
1070 (trig_in->trig_time - 0x8000);
1071 return;
1072 }
1073 }
1074
1075 if (trig_in->trig_type & TRIGGER_TYPE_FAIL) {
1076 if (trigger_fail_nodes_bitmap &&
1077 bit_overlap_any(trig_in->job_ptr->node_bitmap,
1078 trigger_fail_nodes_bitmap)) {
1079 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1080 info("trigger[%u] for job %u node fail",
1081 trig_in->trig_id, trig_in->job_id);
1082 }
1083 trig_in->state = 1;
1084 trig_in->trig_time = now +
1085 (trig_in->trig_time - 0x8000);
1086 return;
1087 }
1088 }
1089
1090 if (trig_in->trig_type & TRIGGER_TYPE_UP) {
1091 if (trigger_up_nodes_bitmap &&
1092 bit_overlap_any(trig_in->job_ptr->node_bitmap,
1093 trigger_up_nodes_bitmap)) {
1094 trig_in->state = 1;
1095 trig_in->trig_time = now +
1096 (0x8000 - trig_in->trig_time);
1097 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1098 info("trigger[%u] for job %u up",
1099 trig_in->trig_id, trig_in->job_id);
1100 }
1101 return;
1102 }
1103 }
1104 }
1105
1106
_trigger_front_end_event(trig_mgr_info_t * trig_in,time_t now)1107 static void _trigger_front_end_event(trig_mgr_info_t *trig_in, time_t now)
1108 {
1109 int i;
1110
1111 xassert(verify_lock(NODE_LOCK, READ_LOCK));
1112
1113 if ((trig_in->trig_type & TRIGGER_TYPE_DOWN) &&
1114 (trigger_down_front_end_bitmap != NULL) &&
1115 ((i = bit_ffs(trigger_down_front_end_bitmap)) != -1)) {
1116 xfree(trig_in->res_id);
1117 for (i = 0; i < front_end_node_cnt; i++) {
1118 if (!bit_test(trigger_down_front_end_bitmap, i))
1119 continue;
1120 if (trig_in->res_id != NULL)
1121 xstrcat(trig_in->res_id, ",");
1122 xstrcat(trig_in->res_id, front_end_nodes[i].name);
1123 }
1124 trig_in->state = 1;
1125 trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1126 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1127 info("trigger[%u] for node %s down",
1128 trig_in->trig_id, trig_in->res_id);
1129 }
1130 return;
1131 }
1132
1133 if ((trig_in->trig_type & TRIGGER_TYPE_UP) &&
1134 (trigger_up_front_end_bitmap != NULL) &&
1135 ((i = bit_ffs(trigger_up_front_end_bitmap)) != -1)) {
1136 xfree(trig_in->res_id);
1137 for (i = 0; i < front_end_node_cnt; i++) {
1138 if (!bit_test(trigger_up_front_end_bitmap, i))
1139 continue;
1140 if (trig_in->res_id != NULL)
1141 xstrcat(trig_in->res_id, ",");
1142 xstrcat(trig_in->res_id, front_end_nodes[i].name);
1143 }
1144 trig_in->state = 1;
1145 trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1146 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1147 info("trigger[%u] for node %s up",
1148 trig_in->trig_id, trig_in->res_id);
1149 }
1150 return;
1151 }
1152 }
1153
/*
 * Test if a burst buffer error has been flagged for a trigger that asked
 * for it. On a match the trigger is marked ready (state 1) with a launch
 * time of now; the requested offset is not applied here.
 */
static void _trigger_other_event(trig_mgr_info_t *trig_in, time_t now)
{
	if (!(trig_in->trig_type & TRIGGER_TYPE_BURST_BUFFER))
		return;
	if (!trigger_bb_error)
		return;

	trig_in->trig_time = now;
	trig_in->state = 1;
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS)
		info("trigger[%u] for burst buffer", trig_in->trig_id);
}
1165
_trigger_node_event(trig_mgr_info_t * trig_in,time_t now)1166 static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now)
1167 {
1168 xassert(verify_lock(NODE_LOCK, READ_LOCK));
1169
1170 if ((trig_in->trig_type & TRIGGER_TYPE_DOWN) &&
1171 trigger_down_nodes_bitmap &&
1172 (bit_ffs(trigger_down_nodes_bitmap) != -1)) {
1173 if (trig_in->nodes_bitmap == NULL) { /* all nodes */
1174 xfree(trig_in->res_id);
1175 trig_in->res_id = bitmap2node_name(
1176 trigger_down_nodes_bitmap);
1177 trig_in->state = 1;
1178 } else if (bit_overlap_any(trig_in->nodes_bitmap,
1179 trigger_down_nodes_bitmap)) {
1180 bit_and(trig_in->nodes_bitmap,
1181 trigger_down_nodes_bitmap);
1182 xfree(trig_in->res_id);
1183 trig_in->res_id = bitmap2node_name(
1184 trig_in->nodes_bitmap);
1185 trig_in->state = 1;
1186 }
1187 if (trig_in->state == 1) {
1188 trig_in->trig_time = now +
1189 (trig_in->trig_time - 0x8000);
1190 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1191 info("trigger[%u] for node %s down",
1192 trig_in->trig_id, trig_in->res_id);
1193 }
1194 return;
1195 }
1196 }
1197
1198 if ((trig_in->trig_type & TRIGGER_TYPE_DRAINED) &&
1199 trigger_drained_nodes_bitmap &&
1200 (bit_ffs(trigger_drained_nodes_bitmap) != -1)) {
1201 if (trig_in->nodes_bitmap == NULL) { /* all nodes */
1202 xfree(trig_in->res_id);
1203 trig_in->res_id = bitmap2node_name(
1204 trigger_drained_nodes_bitmap);
1205 trig_in->state = 1;
1206 } else if (bit_overlap_any(trig_in->nodes_bitmap,
1207 trigger_drained_nodes_bitmap)) {
1208 bit_and(trig_in->nodes_bitmap,
1209 trigger_drained_nodes_bitmap);
1210 xfree(trig_in->res_id);
1211 trig_in->res_id = bitmap2node_name(
1212 trig_in->nodes_bitmap);
1213 trig_in->state = 1;
1214 }
1215 if (trig_in->state == 1) {
1216 trig_in->trig_time = now +
1217 (trig_in->trig_time - 0x8000);
1218 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1219 info("trigger[%u] for node %s drained",
1220 trig_in->trig_id, trig_in->res_id);
1221 }
1222 return;
1223 }
1224 }
1225
1226 if ((trig_in->trig_type & TRIGGER_TYPE_FAIL) &&
1227 trigger_fail_nodes_bitmap &&
1228 (bit_ffs(trigger_fail_nodes_bitmap) != -1)) {
1229 if (trig_in->nodes_bitmap == NULL) { /* all nodes */
1230 xfree(trig_in->res_id);
1231 trig_in->res_id = bitmap2node_name(
1232 trigger_fail_nodes_bitmap);
1233 trig_in->state = 1;
1234 } else if (bit_overlap_any(trig_in->nodes_bitmap,
1235 trigger_fail_nodes_bitmap)) {
1236 bit_and(trig_in->nodes_bitmap,
1237 trigger_fail_nodes_bitmap);
1238 xfree(trig_in->res_id);
1239 trig_in->res_id = bitmap2node_name(
1240 trig_in->nodes_bitmap);
1241 trig_in->state = 1;
1242 }
1243 if (trig_in->state == 1) {
1244 trig_in->trig_time = now +
1245 (trig_in->trig_time - 0x8000);
1246 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1247 info("trigger[%u] for node %s fail",
1248 trig_in->trig_id, trig_in->res_id);
1249 }
1250 return;
1251 }
1252 }
1253
1254 if (trig_in->trig_type & TRIGGER_TYPE_IDLE) {
1255 /* We need to determine which (if any) of these
1256 * nodes have been idle for at least the offset time */
1257 time_t min_idle = now - (trig_in->trig_time - 0x8000);
1258 int i;
1259 node_record_t *node_ptr = node_record_table_ptr;
1260 bitstr_t *trigger_idle_node_bitmap;
1261
1262 trigger_idle_node_bitmap = bit_alloc(node_record_count);
1263 for (i = 0; i < node_record_count; i++, node_ptr++) {
1264 if (!IS_NODE_IDLE(node_ptr) ||
1265 (node_ptr->last_idle > min_idle))
1266 continue;
1267 bit_set(trigger_idle_node_bitmap, i);
1268 }
1269 if (trig_in->nodes_bitmap == NULL) { /* all nodes */
1270 xfree(trig_in->res_id);
1271 trig_in->res_id = bitmap2node_name(
1272 trigger_idle_node_bitmap);
1273 trig_in->state = 1;
1274 } else if (bit_overlap_any(trig_in->nodes_bitmap,
1275 trigger_idle_node_bitmap)) {
1276 bit_and(trig_in->nodes_bitmap,
1277 trigger_idle_node_bitmap);
1278 xfree(trig_in->res_id);
1279 trig_in->res_id = bitmap2node_name(
1280 trig_in->nodes_bitmap);
1281 trig_in->state = 1;
1282 }
1283 FREE_NULL_BITMAP(trigger_idle_node_bitmap);
1284 if (trig_in->state == 1) {
1285 trig_in->trig_time = now;
1286 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1287 info("trigger[%u] for node %s idle",
1288 trig_in->trig_id, trig_in->res_id);
1289 }
1290 return;
1291 }
1292 }
1293
1294 if ((trig_in->trig_type & TRIGGER_TYPE_UP) &&
1295 trigger_up_nodes_bitmap &&
1296 (bit_ffs(trigger_up_nodes_bitmap) != -1)) {
1297 if (trig_in->nodes_bitmap == NULL) { /* all nodes */
1298 xfree(trig_in->res_id);
1299 trig_in->res_id = bitmap2node_name(
1300 trigger_up_nodes_bitmap);
1301 trig_in->state = 1;
1302 } else if (bit_overlap_any(trig_in->nodes_bitmap,
1303 trigger_up_nodes_bitmap)) {
1304 bit_and(trig_in->nodes_bitmap,
1305 trigger_up_nodes_bitmap);
1306 xfree(trig_in->res_id);
1307 trig_in->res_id = bitmap2node_name(
1308 trig_in->nodes_bitmap);
1309 trig_in->state = 1;
1310 }
1311 if (trig_in->state == 1) {
1312 trig_in->trig_time = now +
1313 (trig_in->trig_time - 0x8000);
1314 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1315 info("trigger[%u] for node %s up",
1316 trig_in->trig_id, trig_in->res_id);
1317 }
1318 return;
1319 }
1320 }
1321
1322 if ((trig_in->trig_type & TRIGGER_TYPE_RECONFIG) &&
1323 trigger_node_reconfig) {
1324 trig_in->state = 1;
1325 trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1326 xfree(trig_in->res_id);
1327 trig_in->res_id = xstrdup("reconfig");
1328 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS)
1329 info("trigger[%u] for reconfig", trig_in->trig_id);
1330 return;
1331 }
1332 }
1333
/*
 * Mark a slurmctld trigger as ready: state 1, launch time of now plus the
 * requested offset (trig_time - 0x8000), and res_id replaced by a copy of
 * event_name.
 */
static void _set_ctld_event(trig_mgr_info_t *trig_in, time_t now,
			    const char *event_name)
{
	trig_in->state = 1;
	trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
	xfree(trig_in->res_id);
	trig_in->res_id = xstrdup(event_name);
}

/*
 * Test if a slurmctld event has occurred for this trigger. The primary
 * controller events are tested first, then the backup controller events;
 * only the first matching event is acted upon.
 */
static void _trigger_slurmctld_event(trig_mgr_info_t *trig_in, time_t now)
{
	bool log_it = (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) != 0;

	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_FAIL) &&
	    trigger_pri_ctld_fail) {
		_set_ctld_event(trig_in, now, "primary_slurmctld_failure");
		if (log_it)
			info("trigger[%u] for primary_slurmctld_failure",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_RES_OP) &&
		   trigger_pri_ctld_res_op) {
		_set_ctld_event(trig_in, now,
				"primary_slurmctld_resumed_operation");
		if (log_it)
			info("trigger[%u] for primary_slurmctld_resumed_operation",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_RES_CTRL) &&
		   trigger_pri_ctld_res_ctrl) {
		_set_ctld_event(trig_in, now,
				"primary_slurmctld_resumed_control");
		if (log_it)
			info("trigger[%u] for primary_slurmctld_resumed_control",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_ACCT_FULL) &&
		   trigger_pri_ctld_acct_buffer_full) {
		_set_ctld_event(trig_in, now,
				"primary_slurmctld_acct_buffer_full");
		if (log_it)
			info("trigger[%u] for primary_slurmctld_acct_buffer_full",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_FAIL) &&
		   trigger_bu_ctld_fail) {
		_set_ctld_event(trig_in, now, "backup_slurmctld_failure");
		if (log_it)
			info("trigger[%u] for backup_slurmctld_failure",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_RES_OP) &&
		   trigger_bu_ctld_res_op) {
		_set_ctld_event(trig_in, now,
				"backup_slurmctld_resumed_operation");
		if (log_it)
			info("trigger[%u] for backup_slurmctld_resumed_operation",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_AS_CTRL) &&
		   trigger_bu_ctld_as_ctrl) {
		_set_ctld_event(trig_in, now,
				"backup_slurmctld_assumed_control");
		if (log_it)
			info("trigger[%u] for bu_slurmctld_assumed_control",
			     trig_in->trig_id);
	}
}
1422
/*
 * Test if a slurmdbd event (primary failure, then primary resumed
 * operation) has occurred for this trigger. On a match the trigger is
 * marked ready (state 1), trig_time becomes now plus the requested offset
 * (trig_time - 0x8000) and res_id is replaced by the event name.
 */
static void _trigger_slurmdbd_event(trig_mgr_info_t *trig_in, time_t now)
{
	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DBD_FAIL) &&
	    trigger_pri_dbd_fail) {
		trig_in->state = 1;
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_slurmdbd_failure");
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			/* Log text now matches res_id (was "slurmcdbd") */
			info("trigger[%u] for primary_slurmdbd_failure",
			     trig_in->trig_id);
		}
		return;
	}
	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DBD_RES_OP) &&
	    trigger_pri_dbd_res_op) {
		trig_in->state = 1;
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_slurmdbd_resumed_operation");
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			info("trigger[%u] for primary_slurmdbd_resumed_"
			     "operation", trig_in->trig_id);
		}
		return;
	}
}
1449
/*
 * Test if a database event has occurred for this trigger: primary database
 * failure first, then primary database resumed operation. The first match
 * marks the trigger ready (state 1), sets trig_time to now plus the
 * requested offset (trig_time - 0x8000) and stores the event name in res_id.
 */
static void _trigger_database_event(trig_mgr_info_t *trig_in, time_t now)
{
	bool log_it = (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) != 0;

	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DB_FAIL) &&
	    trigger_pri_db_fail) {
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_database_failure");
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		trig_in->state = 1;
		if (log_it)
			info("trigger[%u] for primary_database_failure",
			     trig_in->trig_id);
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DB_RES_OP) &&
		   trigger_pri_db_res_op) {
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_database_resumed_operation");
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		trig_in->state = 1;
		if (log_it)
			info("trigger[%u] for primary_database_resumed_operation",
			     trig_in->trig_id);
	}
}
1477
1478 /* Ideally we would use the existing proctrack plugin to prevent any
1479 * processes from escaping our control, but that plugin is tied
1480 * to various slurmd data structures. We just the process group ID
1481 * to kill the spawned program after MAX_PROG_TIME. Since triggers are
1482 * meant primarily for system administrators rather than users, this
1483 * may be sufficient. */
_trigger_run_program(trig_mgr_info_t * trig_in)1484 static void _trigger_run_program(trig_mgr_info_t *trig_in)
1485 {
1486 char *tmp, *save_ptr = NULL, *tok;
1487 char *program, *args[64], user_name[1024];
1488 char *pname, *uname;
1489 uid_t uid;
1490 gid_t gid;
1491 pid_t child_pid;
1492 int i;
1493
1494 if (!_validate_trigger(trig_in))
1495 return;
1496
1497 tmp = xstrdup(trig_in->program);
1498 tok = strtok_r(trig_in->program, " ", &save_ptr);
1499 program = xstrdup(tok);
1500 pname = strrchr(program, '/');
1501 if (pname == NULL)
1502 pname = program;
1503 else
1504 pname++;
1505 args[0] = xstrdup(pname);
1506 for (i = 1; i < 63; i++) {
1507 tok = strtok_r(NULL, " ", &save_ptr);
1508 if (!tok) {
1509 args[i] = xstrdup(trig_in->res_id);
1510 break;
1511 }
1512 args[i] = xstrdup(tok);
1513 }
1514 for (i++; i < 64; i++)
1515 args[i] = NULL;
1516 xfree(tmp);
1517
1518 uid = trig_in->user_id;
1519 gid = trig_in->group_id;
1520 uname = uid_to_string(uid);
1521 snprintf(user_name, sizeof(user_name), "%s", uname);
1522 xfree(uname);
1523
1524 child_pid = fork();
1525 if (child_pid > 0) {
1526 trig_in->child_pid = child_pid;
1527 } else if (child_pid == 0) {
1528 int i;
1529 bool run_as_self = (uid == slurmctld_conf.slurm_user_id);
1530
1531 for (i = 0; i < 1024; i++)
1532 (void) close(i);
1533 setpgid(0, 0);
1534 setsid();
1535 if ((initgroups(user_name, gid) == -1) && !run_as_self) {
1536 error("trigger: initgroups: %m");
1537 exit(1);
1538 }
1539 if ((setgid(gid) == -1) && !run_as_self){
1540 error("trigger: setgid: %m");
1541 exit(1);
1542 }
1543 if ((setuid(uid) == -1) && !run_as_self) {
1544 error("trigger: setuid: %m");
1545 exit(1);
1546 }
1547 execv(program, args);
1548 exit(1);
1549 } else {
1550 error("fork: %m");
1551 }
1552 xfree(program);
1553 for (i = 0; i < 64; i++)
1554 xfree(args[i]);
1555 }
1556
_clear_event_triggers(void)1557 static void _clear_event_triggers(void)
1558 {
1559 if (trigger_down_front_end_bitmap) {
1560 bit_nclear(trigger_down_front_end_bitmap,
1561 0, (bit_size(trigger_down_front_end_bitmap) - 1));
1562 }
1563 if (trigger_up_front_end_bitmap) {
1564 bit_nclear(trigger_up_front_end_bitmap,
1565 0, (bit_size(trigger_up_front_end_bitmap) - 1));
1566 }
1567 if (trigger_down_nodes_bitmap) {
1568 bit_nclear(trigger_down_nodes_bitmap,
1569 0, (bit_size(trigger_down_nodes_bitmap) - 1));
1570 }
1571 if (trigger_drained_nodes_bitmap) {
1572 bit_nclear(trigger_drained_nodes_bitmap,
1573 0, (bit_size(trigger_drained_nodes_bitmap) - 1));
1574 }
1575 if (trigger_up_nodes_bitmap) {
1576 bit_nclear(trigger_up_nodes_bitmap,
1577 0, (bit_size(trigger_up_nodes_bitmap) - 1));
1578 }
1579 trigger_node_reconfig = false;
1580 trigger_bb_error = false;
1581 trigger_pri_ctld_fail = false;
1582 trigger_pri_ctld_res_op = false;
1583 trigger_pri_ctld_res_ctrl = false;
1584 trigger_pri_ctld_acct_buffer_full = false;
1585 trigger_bu_ctld_fail = false;
1586 trigger_bu_ctld_res_op = false;
1587 trigger_bu_ctld_as_ctrl = false;
1588 trigger_pri_dbd_fail = false;
1589 trigger_pri_dbd_res_op = false;
1590 trigger_pri_db_fail = false;
1591 trigger_pri_db_res_op = false;
1592 }
1593
1594 /* Make a copy of a trigger and pre-pend it on our list */
_trigger_clone(trig_mgr_info_t * trig_in)1595 static void _trigger_clone(trig_mgr_info_t *trig_in)
1596 {
1597 trig_mgr_info_t *trig_add;
1598
1599 trig_add = xmalloc(sizeof(trig_mgr_info_t));
1600 trig_add->flags = trig_in->flags;
1601 trig_add->trig_id = trig_in->trig_id;
1602 trig_add->res_type = trig_in->res_type;
1603 if (trig_in->orig_res_id) {
1604 trig_add->res_id = xstrdup(trig_in->orig_res_id);
1605 trig_add->orig_res_id = xstrdup(trig_in->orig_res_id);
1606 }
1607 if (trig_in->orig_bitmap) {
1608 trig_add->nodes_bitmap = bit_copy(trig_in->orig_bitmap);
1609 trig_add->orig_bitmap = bit_copy(trig_in->orig_bitmap);
1610 }
1611 trig_add->job_id = trig_in->job_id;
1612 trig_add->job_ptr = trig_in->job_ptr;
1613 trig_add->trig_type = trig_in->trig_type;
1614 trig_add->trig_time = trig_in->orig_time;
1615 trig_add->orig_time = trig_in->orig_time;
1616 trig_add->user_id = trig_in->user_id;
1617 trig_add->group_id = trig_in->group_id;
1618 trig_add->program = xstrdup(trig_in->program);;
1619 list_prepend(trigger_list, trig_add);
1620 }
1621
trigger_process(void)1622 extern void trigger_process(void)
1623 {
1624 ListIterator trig_iter;
1625 trig_mgr_info_t *trig_in;
1626 time_t now = time(NULL);
1627 bool state_change = false;
1628 pid_t rc;
1629 int prog_stat;
1630
1631 slurm_mutex_lock(&trigger_mutex);
1632 if (trigger_list == NULL)
1633 trigger_list = list_create(_trig_del);
1634
1635 trig_iter = list_iterator_create(trigger_list);
1636 while ((trig_in = list_next(trig_iter))) {
1637 if (trig_in->state == 0) {
1638 if (trig_in->res_type == TRIGGER_RES_TYPE_OTHER)
1639 _trigger_other_event(trig_in, now);
1640 else if (trig_in->res_type == TRIGGER_RES_TYPE_JOB)
1641 _trigger_job_event(trig_in, now);
1642 else if (trig_in->res_type == TRIGGER_RES_TYPE_NODE)
1643 _trigger_node_event(trig_in, now);
1644 else if (trig_in->res_type ==
1645 TRIGGER_RES_TYPE_SLURMCTLD)
1646 _trigger_slurmctld_event(trig_in, now);
1647 else if (trig_in->res_type ==
1648 TRIGGER_RES_TYPE_SLURMDBD)
1649 _trigger_slurmdbd_event(trig_in, now);
1650 else if (trig_in->res_type ==
1651 TRIGGER_RES_TYPE_DATABASE)
1652 _trigger_database_event(trig_in, now);
1653 else if (trig_in->res_type ==
1654 TRIGGER_RES_TYPE_FRONT_END)
1655 _trigger_front_end_event(trig_in, now);
1656 }
1657 if ((trig_in->state == 1) &&
1658 (trig_in->trig_time <= now)) {
1659 if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1660 info("launching program for trigger[%u]",
1661 trig_in->trig_id);
1662 info(" uid=%u gid=%u program=%s arg=%s",
1663 trig_in->user_id, trig_in->group_id,
1664 trig_in->program, trig_in->res_id);
1665 }
1666 if (trig_in->flags & TRIGGER_FLAG_PERM) {
1667 _trigger_clone(trig_in);
1668 }
1669 trig_in->state = 2;
1670 trig_in->trig_time = now;
1671 state_change = true;
1672 _trigger_run_program(trig_in);
1673 } else if ((trig_in->state == 2) &&
1674 (difftime(now, trig_in->trig_time) >
1675 MAX_PROG_TIME)) {
1676 if (trig_in->child_pid != 0) {
1677 killpg(trig_in->child_pid, SIGKILL);
1678 rc = waitpid(trig_in->child_pid, &prog_stat,
1679 WNOHANG);
1680 if ((rc > 0) && prog_stat) {
1681 info("trigger uid=%u type=%s:%s "
1682 "exit=%u:%u",
1683 trig_in->user_id,
1684 trigger_res_type(trig_in->res_type),
1685 trigger_type(trig_in->trig_type),
1686 WIFEXITED(prog_stat),
1687 WTERMSIG(prog_stat));
1688 }
1689 if ((rc == trig_in->child_pid) ||
1690 ((rc == -1) && (errno == ECHILD)))
1691 trig_in->child_pid = 0;
1692 }
1693
1694 if (trig_in->child_pid == 0) {
1695 if (slurmctld_conf.debug_flags &
1696 DEBUG_FLAG_TRIGGERS) {
1697 info("purging trigger[%u]",
1698 trig_in->trig_id);
1699 }
1700 list_delete_item(trig_iter);
1701 state_change = true;
1702 }
1703 } else if (trig_in->state == 2) {
1704 /* Elimiate zombie processes right away.
1705 * Purge trigger entry above MAX_PROG_TIME later */
1706 rc = waitpid(trig_in->child_pid, &prog_stat, WNOHANG);
1707 if ((rc > 0) && prog_stat) {
1708 info("trigger uid=%u type=%s:%s exit=%u:%u",
1709 trig_in->user_id,
1710 trigger_res_type(trig_in->res_type),
1711 trigger_type(trig_in->trig_type),
1712 WIFEXITED(prog_stat),
1713 WTERMSIG(prog_stat));
1714 }
1715 if ((rc == trig_in->child_pid) ||
1716 ((rc == -1) && (errno == ECHILD)))
1717 trig_in->child_pid = 0;
1718 }
1719 }
1720 list_iterator_destroy(trig_iter);
1721 _clear_event_triggers();
1722 slurm_mutex_unlock(&trigger_mutex);
1723 if (state_change)
1724 schedule_trigger_save();
1725 }
1726
1727 /* Free all allocated memory */
trigger_fini(void)1728 extern void trigger_fini(void)
1729 {
1730 FREE_NULL_LIST(trigger_list);
1731 FREE_NULL_BITMAP(trigger_down_front_end_bitmap);
1732 FREE_NULL_BITMAP(trigger_up_front_end_bitmap);
1733 FREE_NULL_BITMAP(trigger_down_nodes_bitmap);
1734 FREE_NULL_BITMAP(trigger_drained_nodes_bitmap);
1735 FREE_NULL_BITMAP(trigger_fail_nodes_bitmap);
1736 FREE_NULL_BITMAP(trigger_up_nodes_bitmap);
1737 }
1738