1 /*****************************************************************************\
2  *  trigger_mgr.c - Event trigger management
3  *****************************************************************************
4  *  Copyright (C) 2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Portions Copyright (C) 2010-2016 SchedMD <https://www.schedmd.com>.
7  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8  *  Written by Morris Jette <jette1@llnl.gov> et. al.
9  *  CODE-OCEC-09-009. All rights reserved.
10  *
11  *  This file is part of Slurm, a resource management program.
12  *  For details, see <https://slurm.schedmd.com/>.
13  *  Please also read the included file: DISCLAIMER.
14  *
15  *  Slurm is free software; you can redistribute it and/or modify it under
16  *  the terms of the GNU General Public License as published by the Free
17  *  Software Foundation; either version 2 of the License, or (at your option)
18  *  any later version.
19  *
20  *  In addition, as a special exception, the copyright holders give permission
21  *  to link the code of portions of this program with the OpenSSL library under
22  *  certain conditions as described in each individual source file, and
23  *  distribute linked combinations including the two. You must obey the GNU
24  *  General Public License in all respects for all of the code used other than
25  *  OpenSSL. If you modify file(s) with this exception, you may extend this
26  *  exception to your version of the file(s), but you are not obligated to do
27  *  so. If you do not wish to do so, delete this exception statement from your
28  *  version.  If you delete this exception statement from all source files in
29  *  the program, then also delete it here.
30  *
31  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
32  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
33  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
34  *  details.
35  *
36  *  You should have received a copy of the GNU General Public License along
37  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
38  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
39 \*****************************************************************************/
40 
41 #include "config.h"
42 
43 #include <ctype.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <grp.h>
47 #include <pthread.h>
48 #include <signal.h>
49 #include <stdlib.h>
50 #include <sys/stat.h>
51 #include <sys/types.h>
52 
53 #include "src/common/bitstring.h"
54 #include "src/common/fd.h"
55 #include "src/common/list.h"
56 #include "src/common/slurmdbd_defs.h"
57 #include "src/common/slurm_protocol_defs.h"
58 #include "src/common/uid.h"
59 #include "src/common/xmalloc.h"
60 #include "src/common/xstring.h"
61 
62 #include "src/slurmctld/locks.h"
63 #include "src/slurmctld/slurmctld.h"
64 #include "src/slurmctld/state_save.h"
65 #include "src/slurmctld/trigger_mgr.h"
66 
#define MAX_PROG_TIME 300	/* maximum run time for program */
				/* NOTE(review): presumably seconds - confirm */

/* Change TRIGGER_STATE_VERSION value when changing the state save format */
#define TRIGGER_STATE_VERSION        "PROTOCOL_VERSION"

/* List of trig_mgr_info_t records; created on first use with _trig_del as
 * its element destructor */
List trigger_list;
/* ID assigned to the next trigger added by trigger_set() */
uint32_t next_trigger_id = 1;
/* Held by the functions in this file while they touch trigger_list, the
 * event bitmaps and the event flags below */
static pthread_mutex_t trigger_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Event bitmaps: allocated on first use by the matching trigger_*()
 * function, which sets the bit for the affected front end/node index */
bitstr_t *trigger_down_front_end_bitmap = NULL;
bitstr_t *trigger_up_front_end_bitmap = NULL;
bitstr_t *trigger_down_nodes_bitmap = NULL;
bitstr_t *trigger_drained_nodes_bitmap = NULL;
bitstr_t *trigger_fail_nodes_bitmap = NULL;
bitstr_t *trigger_up_nodes_bitmap   = NULL;
/* Event flags: each is set to true by the matching trigger_*() function */
static bool trigger_bb_error = false;
static bool trigger_node_reconfig = false;
static bool trigger_pri_ctld_fail = false;
static bool trigger_pri_ctld_res_op = false;
static bool trigger_pri_ctld_res_ctrl = false;
static bool trigger_pri_ctld_acct_buffer_full = false;
static bool trigger_bu_ctld_fail = false;
static bool trigger_bu_ctld_res_op = false;
static bool trigger_bu_ctld_as_ctrl = false;
static bool trigger_pri_dbd_fail = false;
static bool trigger_pri_dbd_res_op = false;
static bool trigger_pri_db_fail = false;
static bool trigger_pri_db_res_op = false;

/* Current trigger pull states (saved and restored) */
/* Set to 1 by the *_fail() functions and back to 0 by the *_res_op()
 * functions; written with every record by _dump_trigger_state() */
uint8_t ctld_failure = 0;
uint8_t bu_ctld_failure = 0;
uint8_t db_failure = 0;
uint8_t dbd_failure = 0;
100 
/* One record per registered trigger; the elements of trigger_list */
typedef struct trig_mgr_info {
	uint32_t child_pid;	/* pid of child process */
	uint16_t flags;		/* TRIGGER_FLAG_* */
	uint32_t trig_id;	/* trigger ID */
	uint16_t res_type;	/* TRIGGER_RES_TYPE_* */
	char *   res_id;	/* node name or job_id (string) */
	bitstr_t *nodes_bitmap;	/* bitmap of requested nodes (if applicable) */
	uint32_t job_id;	/* job ID (if applicable) */
	job_record_t *job_ptr;	/* pointer to job record (if applicable) */
	uint32_t trig_type;	/* TRIGGER_TYPE_* */
	time_t   trig_time;	/* offset (pending) or time stamp (complete) */
	uint32_t user_id;	/* user requesting trigger */
	uint32_t group_id;	/* user's group id */
	char *   program;	/* program to execute */
	uint8_t  state;		/* 0=pending, 1=pulled, 2=completed */

	/* The orig_ fields are used to save and clone the original values */
	bitstr_t *orig_bitmap;	/* bitmap of requested nodes (if applicable) */
	char *   orig_res_id;	/* original node name or job_id (string) */
	time_t   orig_time;	/* offset (pending) or time stamp (complete) */
} trig_mgr_info_t;
122 
123 /* Prototype for ListDelF */
_trig_del(void * x)124 void _trig_del(void *x) {
125 	trig_mgr_info_t * tmp = (trig_mgr_info_t *) x;
126 	xfree(tmp->res_id);
127 	xfree(tmp->orig_res_id);
128 	xfree(tmp->program);
129 	FREE_NULL_BITMAP(tmp->nodes_bitmap);
130 	FREE_NULL_BITMAP(tmp->orig_bitmap);
131 	xfree(tmp);
132 }
133 
/*
 * _trig_offset - remove the 0x8000 bias from a trigger offset.
 * offset IN - biased offset value
 * RET offset minus 0x8000, i.e. a value in the range -32768 .. 32767
 */
static int _trig_offset(uint16_t offset)
{
	/* Computed as a plain expression: keeping the result in a "static"
	 * local made concurrent callers share (and race on) one variable */
	return (int) offset - 0x8000;
}
141 
/*
 * _dump_trigger_msg - log the contents of a trigger message.
 * header IN - text logged ahead of the records
 * msg IN - message to log, may be NULL
 * Does nothing unless DEBUG_FLAG_TRIGGERS is set in
 * slurmctld_conf.debug_flags.
 */
static void _dump_trigger_msg(char *header, trigger_info_msg_t *msg)
{
	uint32_t i;	/* unsigned to match the "%u" conversion below */

	if ((slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) == 0)
		return;

	info("%s", header);
	if ((msg == NULL) || (msg->record_count == 0)) {
		info("Trigger has no entries");
		return;
	}

	info("INDEX TRIG_ID RES_TYPE RES_ID TRIG_TYPE OFFSET UID PROGRAM");
	for (i = 0; i < msg->record_count; i++) {
		trigger_info_t *trig = &msg->trigger_array[i];

		info("trigger[%u] %u %s %s %s %d %u %s", i,
		     trig->trig_id,
		     trigger_res_type(trig->res_type),
		     trig->res_id,
		     trigger_type(trig->trig_type),
		     _trig_offset(trig->offset),
		     trig->user_id,
		     trig->program);
	}
}
167 
168 /* Validate trigger program */
_validate_trigger(trig_mgr_info_t * trig_in)169 static bool _validate_trigger(trig_mgr_info_t *trig_in)
170 {
171 	struct stat buf;
172 	int i, modes;
173 	char *program = xstrdup(trig_in->program);
174 
175 	for (i = 0; program[i]; i++) {
176 		if (isspace(program[i])) {
177 			program[i] = '\0';
178 			break;
179 		}
180 	}
181 
182 	if (stat(program, &buf) != 0) {
183 		info("trigger program %s not found", trig_in->program);
184 		xfree(program);
185 		return false;
186 	}
187 	xfree(program);
188 
189 	if (!S_ISREG(buf.st_mode)) {
190 		info("trigger program %s not a regular file", trig_in->program);
191 		return false;
192 	}
193 	if (buf.st_uid == trig_in->user_id)
194 		modes =  (buf.st_mode >> 6) & 07;
195 	else if (buf.st_gid == trig_in->group_id)
196 		modes =  (buf.st_mode >> 3) & 07;
197 	else
198 		modes = buf.st_mode  & 07;
199 	if (modes & 01)
200 		return true;
201 
202 	info("trigger program %s not executable", trig_in->program);
203 	return false;
204 }
205 
206 
/*
 * trigger_pull - pull the event named by a request message.
 * msg IN - request holding exactly one record whose res_type is
 *	TRIGGER_RES_TYPE_SLURMCTLD, _SLURMDBD or _DATABASE
 * RET SLURM_SUCCESS, ESRCH if the record count is not one, or EINVAL for an
 *	unsupported resource or trigger type
 */
extern int trigger_pull(trigger_info_msg_t *msg)
{
	trig_mgr_info_t *trig;
	trigger_info_t *req;
	ListIterator iter;
	int rc = SLURM_SUCCESS;

	if (!trigger_list)
		trigger_list = list_create(_trig_del);

	/* validate the request, designated trigger must be set */
	_dump_trigger_msg("trigger_pull", msg);
	if (msg->record_count != 1)
		return ESRCH;
	req = msg->trigger_array;

	if (!((req->res_type == TRIGGER_RES_TYPE_SLURMCTLD) ||
	      (req->res_type == TRIGGER_RES_TYPE_SLURMDBD)  ||
	      (req->res_type == TRIGGER_RES_TYPE_DATABASE)))
		return EINVAL;

	/* raise the event once for every registered trigger it matches */
	iter = list_iterator_create(trigger_list);
	while ((trig = list_next(iter))) {
		if (trig->res_type != req->res_type)
			continue;
		if (trig->trig_type != req->trig_type)
			continue;

		switch (trig->trig_type) {
		case TRIGGER_TYPE_PRI_CTLD_ACCT_FULL:
			trigger_primary_ctld_acct_full();
			break;
		case TRIGGER_TYPE_BU_CTLD_FAIL:
			trigger_backup_ctld_fail(req->control_inx);
			break;
		case TRIGGER_TYPE_BU_CTLD_RES_OP:
			trigger_backup_ctld_res_op(req->control_inx);
			break;
		case TRIGGER_TYPE_BU_CTLD_AS_CTRL:
			trigger_backup_ctld_as_ctrl();
			break;
		case TRIGGER_TYPE_PRI_DBD_FAIL:
			trigger_primary_dbd_fail();
			break;
		case TRIGGER_TYPE_PRI_DBD_RES_OP:
			trigger_primary_dbd_res_op();
			break;
		case TRIGGER_TYPE_PRI_DB_FAIL:
			trigger_primary_db_fail();
			break;
		case TRIGGER_TYPE_PRI_DB_RES_OP:
			trigger_primary_db_res_op();
			break;
		default:
			error("trigger_pull call has invalid type: %u",
			      trig->trig_type);
			rc = EINVAL;
			break;
		}
	}
	list_iterator_destroy(iter);

	return rc;
}
272 
/*
 * trigger_clear - delete the triggers selected by a request message.
 * uid IN - requesting user; only uid 0 may delete other users' triggers
 * msg IN - request holding exactly one record that selects by job ID
 *	(res_type TRIGGER_RES_TYPE_JOB), by trig_id and/or by user_id
 * RET SLURM_SUCCESS if a trigger was deleted last, ESRCH if nothing matched,
 *	ESLURM_INVALID_JOB_ID, EINVAL or ESLURM_ACCESS_DENIED otherwise
 */
extern int trigger_clear(uid_t uid, trigger_info_msg_t *msg)
{
	trig_mgr_info_t *trig;
	trigger_info_t *req;
	ListIterator iter;
	uint32_t job_id = 0;
	int rc = ESRCH;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_list)
		trigger_list = list_create(_trig_del);

	/* validate the request, need a job_id and/or trigger_id */
	_dump_trigger_msg("trigger_clear", msg);
	if (msg->record_count != 1)
		goto done;
	req = msg->trigger_array;
	if (req->res_type == TRIGGER_RES_TYPE_JOB) {
		job_id = (uint32_t) atol(req->res_id);
		if (!job_id) {
			rc = ESLURM_INVALID_JOB_ID;
			goto done;
		}
	} else if (!req->trig_id && (req->user_id == NO_VAL)) {
		rc = EINVAL;
		goto done;
	}

	/* now look for a valid request, matching uid */
	iter = list_iterator_create(trigger_list);
	while ((trig = list_next(iter))) {
		/* skip records not selected by the request, and records in
		 * state 2 (wait for proc termination) */
		if ((req->trig_id && (req->trig_id != trig->trig_id)) ||
		    (job_id && (job_id != trig->job_id)) ||
		    ((req->user_id != NO_VAL) &&
		     (req->user_id != trig->user_id)) ||
		    (trig->state == 2))
			continue;

		if ((uid == 0) || (trig->user_id == (uint32_t) uid)) {
			list_delete_item(iter);
			rc = SLURM_SUCCESS;
		} else {
			rc = ESLURM_ACCESS_DENIED;
		}
	}
	list_iterator_destroy(iter);
	schedule_trigger_save();

done:
	slurm_mutex_unlock(&trigger_mutex);
	return rc;
}
327 
/*
 * trigger_get - build a message describing the triggers in trigger_list.
 * uid IN - requesting user (not used by this function)
 * msg IN - request message (not used by this function)
 * RET newly allocated message; record_count is the number of entries
 *	filled in trigger_array. Records whose state is >= 1 are left out
 *	unless they carry TRIGGER_FLAG_PERM.
 */
extern trigger_info_msg_t * trigger_get(uid_t uid, trigger_info_msg_t *msg)
{
	trigger_info_msg_t *resp_data;
	ListIterator trig_iter;
	trigger_info_t *trig_out;
	trig_mgr_info_t *trig_in;
	int recs_written = 0;

	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);

	_dump_trigger_msg("trigger_get", NULL);
	resp_data = xmalloc(sizeof(trigger_info_msg_t));
	/* size the array for the worst case; the count is corrected below */
	resp_data->record_count = list_count(trigger_list);
	resp_data->trigger_array = xcalloc(resp_data->record_count,
					   sizeof(trigger_info_t));
	trig_iter = list_iterator_create(trigger_list);
	trig_out = resp_data->trigger_array;
	while ((trig_in = list_next(trig_iter))) {
		/* Note: Filtering currently done by strigger */
		/* The flags of the stored record decide; the output slot
		 * has not been filled in yet at this point */
		if ((trig_in->state >= 1) &&
		    ((trig_in->flags & TRIGGER_FLAG_PERM) == 0))
			continue;	/* no longer pending */
		trig_out->flags     = trig_in->flags;
		trig_out->trig_id   = trig_in->trig_id;
		trig_out->res_type  = trig_in->res_type;
		trig_out->res_id    = xstrdup(trig_in->res_id);
		trig_out->trig_type = trig_in->trig_type;
		trig_out->offset    = trig_in->trig_time;
		trig_out->user_id   = trig_in->user_id;
		trig_out->program   = xstrdup(trig_in->program);
		trig_out++;
		recs_written++;
	}
	list_iterator_destroy(trig_iter);
	slurm_mutex_unlock(&trigger_mutex);
	resp_data->record_count = recs_written;

	_dump_trigger_msg("trigger_got", resp_data);
	return resp_data;
}
370 
/*
 * _duplicate_trigger - test whether an equivalent trigger already exists.
 * trig_desc IN - trigger description to look for
 * RET true if a record in trigger_list has the same flags, res_type,
 *	trig_type, offset/trig_time, user_id, program and res_id
 */
static bool _duplicate_trigger(trigger_info_t *trig_desc)
{
	ListIterator iter = list_iterator_create(trigger_list);
	trig_mgr_info_t *cur;
	bool dup = false;

	while (!dup && (cur = list_next(iter))) {
		if (trig_desc->flags != cur->flags)
			continue;
		if (trig_desc->res_type != cur->res_type)
			continue;
		if (trig_desc->trig_type != cur->trig_type)
			continue;
		if (trig_desc->offset != cur->trig_time)
			continue;
		if (trig_desc->user_id != cur->user_id)
			continue;
		if (xstrcmp(trig_desc->program, cur->program))
			continue;
		if (xstrcmp(trig_desc->res_id, cur->res_id))
			continue;
		dup = true;
	}
	list_iterator_destroy(iter);

	return dup;
}
393 
/*
 * trigger_set - register the triggers described by a request message.
 * uid IN - requesting user
 * gid IN - requesting user's group, stored as the trigger's group_id
 * msg IN/OUT - trigger descriptions. For each accepted record user_id and
 *	trig_id are filled in and the res_id and program strings are moved
 *	into the stored record (the message fields are set to NULL).
 * RET SLURM_SUCCESS, or the error of the last rejected record
 *	(ESLURM_INVALID_JOB_ID, ESLURM_ALREADY_DONE, ESLURM_INVALID_NODE_NAME,
 *	ESLURM_TRIGGER_DUP, ESLURM_ACCESS_DENIED), or EAGAIN when the list is
 *	full for a non-root user
 */
extern int trigger_set(uid_t uid, gid_t gid, trigger_info_msg_t *msg)
{
	int i;
	int rc = SLURM_SUCCESS;
	uint32_t job_id;
	bitstr_t *bitmap = NULL;
	trig_mgr_info_t * trig_add;
	trigger_info_t *trig_in;
	job_record_t *job_ptr;
	/* Read config and job info */
	slurmctld_lock_t job_read_lock =
		{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };

	lock_slurmctld(job_read_lock);
	slurm_mutex_lock(&trigger_mutex);

	if ((slurmctld_conf.slurm_user_id != 0) &&
	    (slurmctld_conf.slurm_user_id != uid)) {
		/* If SlurmUser is not root, then it is unable to set the
		 * appropriate user id and group id for the program to be
		 * launched. To prevent the launched program for an arbitrary
		 * user being executed as user SlurmUser, disable all other
		 * users from setting triggers. */
		info("Attempt to set trigger by uid %u != SlurmUser", uid);
		rc = ESLURM_ACCESS_DENIED;
		goto fini;
	}

	if (trigger_list == NULL) {
		trigger_list = list_create(_trig_del);
	} else if ((uid != 0) &&
		   (list_count(trigger_list) >= slurmctld_conf.max_job_cnt)) {
		rc = EAGAIN;
		goto fini;
	}

	_dump_trigger_msg("trigger_set", msg);
	for (i = 0; i < msg->record_count; i++) {
		trig_in = &msg->trigger_array[i];
		if (trig_in->res_type == TRIGGER_RES_TYPE_JOB) {
			/* atol(NULL) is undefined; a missing id is handled
			 * like an unparsable one (job id 0) */
			job_id = trig_in->res_id ?
				 (uint32_t) atol(trig_in->res_id) : 0;
			job_ptr = find_job_record(job_id);
			if (job_ptr == NULL) {
				rc = ESLURM_INVALID_JOB_ID;
				continue;
			}
			if (IS_JOB_FINISHED(job_ptr)) {
				rc = ESLURM_ALREADY_DONE;
				continue;
			}
		} else {
			job_id = 0;
			job_ptr = NULL;
			if ((trig_in->res_id != NULL)   &&
			    (trig_in->res_id[0] != '*') &&
			    (node_name2bitmap(trig_in->res_id,
					      false, &bitmap) != 0)) {
				FREE_NULL_BITMAP(bitmap);
				rc = ESLURM_INVALID_NODE_NAME;
				continue;
			}
		}
		trig_in->user_id = (uint32_t) uid;
		if (_duplicate_trigger(trig_in)) {
			FREE_NULL_BITMAP(bitmap);
			rc = ESLURM_TRIGGER_DUP;
			continue;
		}
		trig_add = xmalloc(sizeof(trig_mgr_info_t));
		trig_in->trig_id = next_trigger_id;
		trig_add->trig_id = next_trigger_id;
		next_trigger_id++;
		trig_add->flags = trig_in->flags;
		trig_add->res_type = trig_in->res_type;
		if (bitmap) {
			trig_add->nodes_bitmap = bitmap;
			trig_add->orig_bitmap  = bit_copy(bitmap);
			bitmap = NULL;
		}
		trig_add->job_id = job_id;
		trig_add->job_ptr = job_ptr;
		if (trig_in->res_id) {
			trig_add->res_id = trig_in->res_id;
			trig_add->orig_res_id = xstrdup(trig_add->res_id);
			trig_in->res_id = NULL; /* moved */
		}
		trig_add->trig_type = trig_in->trig_type;
		trig_add->trig_time = trig_in->offset;
		trig_add->orig_time = trig_in->offset;
		trig_add->user_id   = trig_in->user_id;
		trig_add->group_id  = (uint32_t) gid;
		/* move don't copy "program" */
		trig_add->program = trig_in->program;
		trig_in->program = NULL;
		if (!_validate_trigger(trig_add)) {
			rc = ESLURM_ACCESS_DENIED;
			/* Release the whole record through the list
			 * destructor so that orig_res_id is freed too */
			_trig_del(trig_add);
			continue;
		}
		list_append(trigger_list, trig_add);
		schedule_trigger_save();
	}

fini:	slurm_mutex_unlock(&trigger_mutex);
	unlock_slurmctld(job_read_lock);
	return rc;
}
505 
/*
 * trigger_front_end_down - flag a front end record in
 *	trigger_down_front_end_bitmap.
 * front_end_ptr IN - record within the front_end_nodes table
 * The bitmap is allocated with front_end_node_cnt bits on first use.
 */
extern void trigger_front_end_down(front_end_record_t *front_end_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = front_end_ptr - front_end_nodes;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_down_front_end_bitmap) {
		trigger_down_front_end_bitmap =
			bit_alloc(front_end_node_cnt);
	}
	bit_set(trigger_down_front_end_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
518 
/*
 * trigger_front_end_up - flag a front end record in
 *	trigger_up_front_end_bitmap.
 * front_end_ptr IN - record within the front_end_nodes table
 * The bitmap is allocated with front_end_node_cnt bits on first use.
 */
extern void trigger_front_end_up(front_end_record_t *front_end_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = front_end_ptr - front_end_nodes;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_up_front_end_bitmap) {
		trigger_up_front_end_bitmap =
			bit_alloc(front_end_node_cnt);
	}
	bit_set(trigger_up_front_end_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
531 
/*
 * trigger_node_down - flag a node record in trigger_down_nodes_bitmap.
 * node_ptr IN - record within node_record_table_ptr
 * The bitmap is allocated with node_record_count bits on first use.
 */
extern void trigger_node_down(node_record_t *node_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_down_nodes_bitmap) {
		trigger_down_nodes_bitmap = bit_alloc(node_record_count);
	}
	bit_set(trigger_down_nodes_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
544 
/*
 * trigger_node_drained - flag a node record in trigger_drained_nodes_bitmap.
 * node_ptr IN - record within node_record_table_ptr
 * The bitmap is allocated with node_record_count bits on first use.
 */
extern void trigger_node_drained(node_record_t *node_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_drained_nodes_bitmap) {
		trigger_drained_nodes_bitmap = bit_alloc(node_record_count);
	}
	bit_set(trigger_drained_nodes_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
557 
/*
 * trigger_node_failing - flag a node record in trigger_fail_nodes_bitmap.
 * node_ptr IN - record within node_record_table_ptr
 * The bitmap is allocated with node_record_count bits on first use.
 */
extern void trigger_node_failing(node_record_t *node_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_fail_nodes_bitmap) {
		trigger_fail_nodes_bitmap = bit_alloc(node_record_count);
	}
	bit_set(trigger_fail_nodes_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
570 
/*
 * trigger_node_up - flag a node record in trigger_up_nodes_bitmap.
 * node_ptr IN - record within node_record_table_ptr
 * The bitmap is allocated with node_record_count bits on first use.
 */
extern void trigger_node_up(node_record_t *node_ptr)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	/* the record's position in the table is its bit number */
	inx = node_ptr - node_record_table_ptr;

	slurm_mutex_lock(&trigger_mutex);
	if (!trigger_up_nodes_bitmap) {
		trigger_up_nodes_bitmap = bit_alloc(node_record_count);
	}
	bit_set(trigger_up_nodes_bitmap, inx);
	slurm_mutex_unlock(&trigger_mutex);
}
583 
trigger_reconfig(void)584 extern void trigger_reconfig(void)
585 {
586 	slurmctld_lock_t node_read_lock = { .node = READ_LOCK };
587 
588 	lock_slurmctld(node_read_lock);
589 	slurm_mutex_lock(&trigger_mutex);
590 	trigger_node_reconfig = true;
591 	if (trigger_down_front_end_bitmap)
592 		trigger_down_front_end_bitmap = bit_realloc(
593 			trigger_down_front_end_bitmap, node_record_count);
594 	if (trigger_up_front_end_bitmap)
595 		trigger_up_front_end_bitmap = bit_realloc(
596 			trigger_up_front_end_bitmap, node_record_count);
597 	if (trigger_down_nodes_bitmap)
598 		trigger_down_nodes_bitmap = bit_realloc(
599 			trigger_down_nodes_bitmap, node_record_count);
600 	if (trigger_drained_nodes_bitmap)
601 		trigger_drained_nodes_bitmap = bit_realloc(
602 			trigger_drained_nodes_bitmap, node_record_count);
603 	if (trigger_fail_nodes_bitmap)
604 		trigger_fail_nodes_bitmap = bit_realloc(
605 			trigger_fail_nodes_bitmap, node_record_count);
606 	if (trigger_up_nodes_bitmap)
607 		trigger_up_nodes_bitmap = bit_realloc(
608 			trigger_up_nodes_bitmap, node_record_count);
609 	slurm_mutex_unlock(&trigger_mutex);
610 	unlock_slurmctld(node_read_lock);
611 }
612 
trigger_primary_ctld_fail(void)613 extern void trigger_primary_ctld_fail(void)
614 {
615 	slurm_mutex_lock(&trigger_mutex);
616 	if (ctld_failure != 1) {
617 		trigger_pri_ctld_fail = true;
618 		ctld_failure = 1;
619 	}
620 	slurm_mutex_unlock(&trigger_mutex);
621 }
622 
trigger_primary_ctld_res_op(void)623 extern void trigger_primary_ctld_res_op(void)
624 {
625 	slurm_mutex_lock(&trigger_mutex);
626 	trigger_pri_ctld_res_op = true;
627 	ctld_failure = 0;
628 	slurm_mutex_unlock(&trigger_mutex);
629 }
630 
trigger_primary_ctld_res_ctrl(void)631 extern void trigger_primary_ctld_res_ctrl(void)
632 {
633 	slurm_mutex_lock(&trigger_mutex);
634 	trigger_pri_ctld_res_ctrl = true;
635 	slurm_mutex_unlock(&trigger_mutex);
636 }
637 
trigger_primary_ctld_acct_full(void)638 extern void trigger_primary_ctld_acct_full(void)
639 {
640 	slurm_mutex_lock(&trigger_mutex);
641 	trigger_pri_ctld_acct_buffer_full = true;
642 	slurm_mutex_unlock(&trigger_mutex);
643 }
644 
trigger_backup_ctld_fail(int index)645 extern void trigger_backup_ctld_fail(int index)
646 {
647 	slurm_mutex_lock(&trigger_mutex);
648 	if (bu_ctld_failure != 1) {
649 		trigger_bu_ctld_fail = true;
650 		bu_ctld_failure = 1;
651 	}
652 	slurm_mutex_unlock(&trigger_mutex);
653 }
654 
trigger_backup_ctld_res_op(int index)655 extern void trigger_backup_ctld_res_op(int index)
656 {
657 	slurm_mutex_lock(&trigger_mutex);
658 	trigger_bu_ctld_res_op = true;
659 	bu_ctld_failure = 0;
660 	slurm_mutex_unlock(&trigger_mutex);
661 }
662 
trigger_backup_ctld_as_ctrl(void)663 extern void trigger_backup_ctld_as_ctrl(void)
664 {
665 	slurm_mutex_lock(&trigger_mutex);
666 	trigger_bu_ctld_as_ctrl = true;
667 	slurm_mutex_unlock(&trigger_mutex);
668 }
669 
trigger_primary_dbd_fail(void)670 extern void trigger_primary_dbd_fail(void)
671 {
672 	slurm_mutex_lock(&trigger_mutex);
673 	if (dbd_failure != 1) {
674 		trigger_pri_dbd_fail = true;
675 		dbd_failure = 1;
676 	}
677 	slurm_mutex_unlock(&trigger_mutex);
678 }
679 
trigger_primary_dbd_res_op(void)680 extern void trigger_primary_dbd_res_op(void)
681 {
682 	slurm_mutex_lock(&trigger_mutex);
683 	trigger_pri_dbd_res_op = true;
684 	dbd_failure = 0;
685 	slurm_mutex_unlock(&trigger_mutex);
686 }
687 
trigger_primary_db_fail(void)688 extern void trigger_primary_db_fail(void)
689 {
690 	slurm_mutex_lock(&trigger_mutex);
691 	if (db_failure != 1) {
692 		trigger_pri_db_fail = true;
693 		db_failure = 1;
694 	}
695 	slurm_mutex_unlock(&trigger_mutex);
696 }
697 
trigger_primary_db_res_op(void)698 extern void trigger_primary_db_res_op(void)
699 {
700 	slurm_mutex_lock(&trigger_mutex);
701 		trigger_pri_db_res_op = true;
702 		db_failure = 0;
703 	slurm_mutex_unlock(&trigger_mutex);
704 }
705 
trigger_burst_buffer(void)706 extern void trigger_burst_buffer(void)
707 {
708 	slurm_mutex_lock(&trigger_mutex);
709 	trigger_bb_error = true;
710 	slurm_mutex_unlock(&trigger_mutex);
711 }
712 
/*
 * _dump_trigger_state - pack one trigger record into a buffer.
 * trig_ptr IN - record to pack
 * buffer IN/OUT - buffer the record is appended to
 * The four pull state values are written ahead of every record. The orig_
 * values of res_id and trig_time are the ones written; nodes_bitmap, job_id
 * and job_ptr are not written.
 */
static void _dump_trigger_state(trig_mgr_info_t *trig_ptr, Buf buffer)
{
	/* trigger pull state flags */
	pack8(ctld_failure, buffer);
	pack8(bu_ctld_failure, buffer);
	pack8(dbd_failure, buffer);
	pack8(db_failure, buffer);

	/* the record itself */
	pack16(trig_ptr->flags, buffer);
	pack32(trig_ptr->trig_id, buffer);
	pack16(trig_ptr->res_type, buffer);
	/* restores res_id too; nodes_bitmap, job_id and job_ptr are rebuilt
	 * as needed from res_id */
	packstr(trig_ptr->orig_res_id, buffer);
	pack32(trig_ptr->trig_type, buffer);
	/* restores trig_time too */
	pack_time(trig_ptr->orig_time, buffer);
	pack32(trig_ptr->user_id, buffer);
	pack32(trig_ptr->group_id, buffer);
	packstr(trig_ptr->program, buffer);
	pack8(trig_ptr->state, buffer);
}
735 
/*
 * _load_trigger_state - unpack one trigger record and add it to
 *	trigger_list.
 * buffer IN/OUT - buffer positioned at the start of a record
 * protocol_version IN - version the record was written with
 * RET SLURM_SUCCESS, or SLURM_ERROR if the record could not be unpacked or
 *	failed validation (the record is then discarded)
 * The pull state values stored ahead of the record overwrite ctld_failure,
 * bu_ctld_failure, dbd_failure and db_failure.
 */
static int _load_trigger_state(Buf buffer, uint16_t protocol_version)
{
	trig_mgr_info_t *trig_ptr;
	uint32_t str_len;

	xassert(verify_lock(JOB_LOCK, READ_LOCK));

	trig_ptr = xmalloc(sizeof(trig_mgr_info_t));

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* restore trigger pull state flags */
		safe_unpack8(&ctld_failure, buffer);
		safe_unpack8(&bu_ctld_failure, buffer);
		safe_unpack8(&dbd_failure, buffer);
		safe_unpack8(&db_failure, buffer);

		safe_unpack16   (&trig_ptr->flags,     buffer);
		safe_unpack32   (&trig_ptr->trig_id,   buffer);
		safe_unpack16   (&trig_ptr->res_type,  buffer);
		safe_unpackstr_xmalloc(&trig_ptr->res_id, &str_len, buffer);
		/* rebuild nodes_bitmap as needed from res_id */
		/* rebuild job_id as needed from res_id */
		/* rebuild job_ptr as needed from res_id */
		safe_unpack32   (&trig_ptr->trig_type, buffer);
		safe_unpack_time(&trig_ptr->trig_time, buffer);
		safe_unpack32   (&trig_ptr->user_id,   buffer);
		safe_unpack32   (&trig_ptr->group_id,  buffer);
		safe_unpackstr_xmalloc(&trig_ptr->program, &str_len, buffer);
		safe_unpack8    (&trig_ptr->state,     buffer);
	} else {
		error("_load_trigger_state: protocol_version "
		      "%hu not supported", protocol_version);
		goto unpack_error;
	}

	if ((trig_ptr->res_type < TRIGGER_RES_TYPE_JOB)  ||
	    (trig_ptr->res_type > TRIGGER_RES_TYPE_OTHER) ||
	    (trig_ptr->state > 2))
		goto unpack_error;
	if (trig_ptr->res_type == TRIGGER_RES_TYPE_JOB) {
		/* A job trigger saved without a resource id cannot be
		 * resolved, and atol(NULL) is undefined */
		if (trig_ptr->res_id == NULL)
			goto unpack_error;
		trig_ptr->job_id = (uint32_t) atol(trig_ptr->res_id);
		trig_ptr->job_ptr = find_job_record(trig_ptr->job_id);
		if ((trig_ptr->job_id == 0)     ||
		    (trig_ptr->job_ptr == NULL) ||
		    (IS_JOB_COMPLETED(trig_ptr->job_ptr) &&
		     trig_ptr->state != 2))
			goto unpack_error;
	} else if (trig_ptr->res_type == TRIGGER_RES_TYPE_NODE) {
		trig_ptr->job_id = 0;
		trig_ptr->job_ptr = NULL;
		if ((trig_ptr->res_id != NULL)   &&
		    (trig_ptr->res_id[0] != '*') &&
		    (node_name2bitmap(trig_ptr->res_id, false,
				      &trig_ptr->nodes_bitmap) != 0))
			goto unpack_error;
	}

	/* The saved values are the originals; seed the orig_ copies */
	if (trig_ptr->nodes_bitmap)
		trig_ptr->orig_bitmap = bit_copy(trig_ptr->nodes_bitmap);
	if (trig_ptr->res_id)
		trig_ptr->orig_res_id = xstrdup(trig_ptr->res_id);
	trig_ptr->orig_time = trig_ptr->trig_time;

	slurm_mutex_lock(&trigger_mutex);
	if (trigger_list == NULL)
		trigger_list = list_create(_trig_del);
	list_append(trigger_list, trig_ptr);
	/* keep IDs handed out later above every restored ID */
	next_trigger_id = MAX(next_trigger_id, trig_ptr->trig_id + 1);
	slurm_mutex_unlock(&trigger_mutex);

	return SLURM_SUCCESS;

unpack_error:
	error("Incomplete trigger record");
	xfree(trig_ptr->res_id);
	xfree(trig_ptr->program);
	FREE_NULL_BITMAP(trig_ptr->nodes_bitmap);
	xfree(trig_ptr);
	return SLURM_ERROR;
}
815 
trigger_state_save(void)816 extern int trigger_state_save(void)
817 {
818 	/* Save high-water mark to avoid buffer growth with copies */
819 	static int high_buffer_size = (1024 * 1024);
820 	int error_code = 0, log_fd;
821 	char *old_file, *new_file, *reg_file;
822 	Buf buffer = init_buf(high_buffer_size);
823 	ListIterator trig_iter;
824 	trig_mgr_info_t *trig_in;
825 	/* Locks: Read config */
826 	slurmctld_lock_t config_read_lock =
827 		{ READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
828 
829 	/* write header: version, time */
830 	packstr(TRIGGER_STATE_VERSION, buffer);
831 	pack16(SLURM_PROTOCOL_VERSION, buffer);
832 	pack_time(time(NULL), buffer);
833 
834 	/* write individual trigger records */
835 	slurm_mutex_lock(&trigger_mutex);
836 	if (trigger_list == NULL)
837 		trigger_list = list_create(_trig_del);
838 
839 	trig_iter = list_iterator_create(trigger_list);
840 	while ((trig_in = list_next(trig_iter)))
841 		_dump_trigger_state(trig_in, buffer);
842 	list_iterator_destroy(trig_iter);
843 	slurm_mutex_unlock(&trigger_mutex);
844 
845 	/* write the buffer to file */
846 	lock_slurmctld(config_read_lock);
847 	old_file = xstrdup(slurmctld_conf.state_save_location);
848 	xstrcat(old_file, "/trigger_state.old");
849 	reg_file = xstrdup(slurmctld_conf.state_save_location);
850 	xstrcat(reg_file, "/trigger_state");
851 	new_file = xstrdup(slurmctld_conf.state_save_location);
852 	xstrcat(new_file, "/trigger_state.new");
853 	unlock_slurmctld(config_read_lock);
854 
855 	lock_state_files();
856 	log_fd = creat(new_file, 0600);
857 	if (log_fd < 0) {
858 		error("Can't save state, create file %s error %m",
859 		      new_file);
860 		error_code = errno;
861 	} else {
862 		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
863 		char *data = (char *)get_buf_data(buffer);
864 		high_buffer_size = MAX(nwrite, high_buffer_size);
865 		while (nwrite > 0) {
866 			amount = write(log_fd, &data[pos], nwrite);
867 			if ((amount < 0) && (errno != EINTR)) {
868 				error("Error writing file %s, %m", new_file);
869 				error_code = errno;
870 				break;
871 			}
872 			nwrite -= amount;
873 			pos    += amount;
874 		}
875 
876 		rc = fsync_and_close(log_fd, "trigger");
877 		if (rc && !error_code)
878 			error_code = rc;
879 	}
880 	if (error_code) {
881 		(void) unlink(new_file);
882 	} else {			/* file shuffle */
883 		(void) unlink(old_file);
884 		if (link(reg_file, old_file)) {
885 			debug4("unable to create link for %s -> %s: %m",
886 			       reg_file, old_file);
887 		}
888 		(void) unlink(reg_file);
889 		if (link(new_file, reg_file)) {
890 			debug4("unable to create link for %s -> %s: %m",
891 			       new_file, reg_file);
892 		}
893 		(void) unlink(new_file);
894 	}
895 	xfree(old_file);
896 	xfree(reg_file);
897 	xfree(new_file);
898 	unlock_state_files();
899 	free_buf(buffer);
900 	return error_code;
901 }
902 
903 /* Open the trigger state save file, or backup if necessary.
904  * state_file IN - the name of the state save file used
905  * RET the file description to read from or error code
906  */
_open_trigger_state_file(char ** state_file)907 static Buf _open_trigger_state_file(char **state_file)
908 {
909 	Buf buf;
910 
911 	*state_file = xstrdup(slurmctld_conf.state_save_location);
912 	xstrcat(*state_file, "/trigger_state");
913 	if (!(buf = create_mmap_buf(*state_file)))
914 		error("Could not open trigger state file %s: %m",
915 		      *state_file);
916 	else
917 		return buf;
918 
919 	error("NOTE: Trying backup state save file. Triggers may be lost!");
920 	xstrcat(*state_file, ".old");
921 	return create_mmap_buf(*state_file);;
922 }
923 
trigger_state_restore(void)924 extern void trigger_state_restore(void)
925 {
926 	uint16_t protocol_version = NO_VAL16;
927 	int trigger_cnt = 0;
928 	char *state_file;
929 	Buf buffer;
930 	time_t buf_time;
931 	char *ver_str = NULL;
932 	uint32_t ver_str_len;
933 
934 	/* read the file */
935 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
936 
937 	lock_state_files();
938 	if (!(buffer = _open_trigger_state_file(&state_file))) {
939 		info("No trigger state file (%s) to recover", state_file);
940 		xfree(state_file);
941 		unlock_state_files();
942 		return;
943 	}
944 	xfree(state_file);
945 	unlock_state_files();
946 
947 	safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
948 	if (ver_str && !xstrcmp(ver_str, TRIGGER_STATE_VERSION))
949 		safe_unpack16(&protocol_version, buffer);
950 
951 	if (protocol_version == NO_VAL16) {
952 		if (!ignore_state_errors)
953 			fatal("Can't recover trigger state, data version incompatible, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
954 		error("Can't recover trigger state, data version "
955 		      "incompatible");
956 		xfree(ver_str);
957 		free_buf(buffer);
958 		return;
959 	}
960 	xfree(ver_str);
961 
962 	safe_unpack_time(&buf_time, buffer);
963 	if (trigger_list)
964 		list_flush(trigger_list);
965 	while (remaining_buf(buffer) > 0) {
966 		if (_load_trigger_state(buffer, protocol_version) !=
967 		    SLURM_SUCCESS)
968 			goto unpack_error;
969 		trigger_cnt++;
970 	}
971 	goto fini;
972 
973 unpack_error:
974 	if (!ignore_state_errors)
975 		fatal("Incomplete trigger data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
976 	error("Incomplete trigger data checkpoint file");
977 fini:	verbose("State of %d triggers recovered", trigger_cnt);
978 	free_buf(buffer);
979 }
980 
/*
 * Test if a job's batch host is one of the front end nodes flagged in a
 * bitmap.
 * front_end_bitmap IN - bitmap indexed like front_end_nodes[], may be NULL
 * job_ptr IN - job to test
 * RET true if job_ptr->batch_host names a front end node whose bit is set.
 *	Always false when built without HAVE_FRONT_END.
 */
static bool _front_end_job_test(bitstr_t *front_end_bitmap,
				job_record_t *job_ptr)
{
#ifdef HAVE_FRONT_END
	int inx;

	/* Need node read lock for reading front_end_node_cnt. */
	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	if (!front_end_bitmap || !job_ptr->batch_host)
		return false;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		if (!bit_test(front_end_bitmap, inx))
			continue;
		if (xstrcmp(front_end_nodes[inx].name,
			    job_ptr->batch_host) == 0)
			return true;
	}
#endif
	return false;
}
1002 
1003 /* Test if the event has been triggered, change trigger state as needed */
_trigger_job_event(trig_mgr_info_t * trig_in,time_t now)1004 static void _trigger_job_event(trig_mgr_info_t *trig_in, time_t now)
1005 {
1006 	xassert(verify_lock(JOB_LOCK, READ_LOCK));
1007 
1008 	trig_in->job_ptr = find_job_record(trig_in->job_id);
1009 
1010 	if ((trig_in->trig_type & TRIGGER_TYPE_FINI) &&
1011 	    ((trig_in->job_ptr == NULL) ||
1012 	     (IS_JOB_COMPLETED(trig_in->job_ptr)))) {
1013 		trig_in->state = 1;
1014 		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1015 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1016 			info("trigger[%u] event for job %u fini",
1017 			     trig_in->trig_id, trig_in->job_id);
1018 		}
1019 		return;
1020 	}
1021 
1022 	if (trig_in->job_ptr == NULL) {
1023 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1024 			info("trigger[%u] for defunct job %u",
1025 			     trig_in->trig_id, trig_in->job_id);
1026 		}
1027 		trig_in->state = 2;
1028 		trig_in->trig_time = now;
1029 		return;
1030 	}
1031 
1032 	if (!IS_JOB_PENDING(trig_in->job_ptr) &&
1033 	    (trig_in->trig_type & TRIGGER_TYPE_TIME)) {
1034 		long rem_time = (trig_in->job_ptr->end_time - now);
1035 		if (rem_time <= (0x8000 - trig_in->trig_time)) {
1036 			trig_in->state = 1;
1037 			trig_in->trig_time = now;
1038 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1039 				info("trigger[%u] for job %u time",
1040 				     trig_in->trig_id, trig_in->job_id);
1041 			}
1042 			return;
1043 		}
1044 	}
1045 
1046 	if (trig_in->trig_type & TRIGGER_TYPE_DOWN) {
1047 		if (_front_end_job_test(trigger_down_front_end_bitmap,
1048 					trig_in->job_ptr)) {
1049 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1050 				info("trigger[%u] for job %u down",
1051 				     trig_in->trig_id, trig_in->job_id);
1052 			}
1053 			trig_in->state = 1;
1054 			trig_in->trig_time = now +
1055 					    (trig_in->trig_time - 0x8000);
1056 			return;
1057 		}
1058 	}
1059 
1060 	if (trig_in->trig_type & TRIGGER_TYPE_DOWN) {
1061 		if (trigger_down_nodes_bitmap &&
1062 		    bit_overlap_any(trig_in->job_ptr->node_bitmap,
1063 				    trigger_down_nodes_bitmap)) {
1064 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1065 				info("trigger[%u] for job %u down",
1066 				     trig_in->trig_id, trig_in->job_id);
1067 			}
1068 			trig_in->state = 1;
1069 			trig_in->trig_time = now +
1070 					    (trig_in->trig_time - 0x8000);
1071 			return;
1072 		}
1073 	}
1074 
1075 	if (trig_in->trig_type & TRIGGER_TYPE_FAIL) {
1076 		if (trigger_fail_nodes_bitmap &&
1077 		    bit_overlap_any(trig_in->job_ptr->node_bitmap,
1078 				    trigger_fail_nodes_bitmap)) {
1079 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1080 				info("trigger[%u] for job %u node fail",
1081 				     trig_in->trig_id, trig_in->job_id);
1082 			}
1083 			trig_in->state = 1;
1084 			trig_in->trig_time = now +
1085 					     (trig_in->trig_time - 0x8000);
1086 			return;
1087 		}
1088 	}
1089 
1090 	if (trig_in->trig_type & TRIGGER_TYPE_UP) {
1091 		if (trigger_up_nodes_bitmap &&
1092 		    bit_overlap_any(trig_in->job_ptr->node_bitmap,
1093 				    trigger_up_nodes_bitmap)) {
1094 			trig_in->state = 1;
1095 			trig_in->trig_time = now +
1096 					    (0x8000 - trig_in->trig_time);
1097 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1098 				info("trigger[%u] for job %u up",
1099 				     trig_in->trig_id, trig_in->job_id);
1100 			}
1101 			return;
1102 		}
1103 	}
1104 }
1105 
1106 
/*
 * Test a front end node trigger against the recorded front end DOWN and UP
 * events. On a match res_id is rebuilt as a comma separated list of the
 * affected front end node names, state is set to 1 and trig_time becomes
 * the event time plus the trigger's offset (stored biased by 0x8000).
 * DOWN is tested before UP; only the first match is applied.
 *
 * Caller must hold a node read lock.
 */
static void _trigger_front_end_event(trig_mgr_info_t *trig_in, time_t now)
{
	int inx;

	xassert(verify_lock(NODE_LOCK, READ_LOCK));

	if ((trig_in->trig_type & TRIGGER_TYPE_DOWN) &&
	    trigger_down_front_end_bitmap &&
	    (bit_ffs(trigger_down_front_end_bitmap) != -1)) {
		/* xfree() leaves res_id NULL, so the first name gets no
		 * leading comma */
		xfree(trig_in->res_id);
		for (inx = 0; inx < front_end_node_cnt; inx++) {
			if (bit_test(trigger_down_front_end_bitmap, inx)) {
				if (trig_in->res_id)
					xstrcat(trig_in->res_id, ",");
				xstrcat(trig_in->res_id,
					front_end_nodes[inx].name);
			}
		}
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		trig_in->state = 1;
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			info("trigger[%u] for node %s down",
			     trig_in->trig_id, trig_in->res_id);
		}
		return;
	}

	if ((trig_in->trig_type & TRIGGER_TYPE_UP) &&
	    trigger_up_front_end_bitmap &&
	    (bit_ffs(trigger_up_front_end_bitmap) != -1)) {
		xfree(trig_in->res_id);
		for (inx = 0; inx < front_end_node_cnt; inx++) {
			if (bit_test(trigger_up_front_end_bitmap, inx)) {
				if (trig_in->res_id)
					xstrcat(trig_in->res_id, ",");
				xstrcat(trig_in->res_id,
					front_end_nodes[inx].name);
			}
		}
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		trig_in->state = 1;
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			info("trigger[%u] for node %s up",
			     trig_in->trig_id, trig_in->res_id);
		}
		return;
	}
}
1153 
/*
 * Test a trigger of resource type "other". The only event handled is a
 * burst buffer trigger while the trigger_bb_error flag is set: state is set
 * to 1 and the program is due immediately (no offset is applied).
 */
static void _trigger_other_event(trig_mgr_info_t *trig_in, time_t now)
{
	if (!trigger_bb_error)
		return;
	if (!(trig_in->trig_type & TRIGGER_TYPE_BURST_BUFFER))
		return;

	trig_in->trig_time = now;
	trig_in->state = 1;
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS)
		info("trigger[%u] for burst buffer", trig_in->trig_id);
}
1165 
_trigger_node_event(trig_mgr_info_t * trig_in,time_t now)1166 static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now)
1167 {
1168 	xassert(verify_lock(NODE_LOCK, READ_LOCK));
1169 
1170 	if ((trig_in->trig_type & TRIGGER_TYPE_DOWN) &&
1171 	    trigger_down_nodes_bitmap                &&
1172 	    (bit_ffs(trigger_down_nodes_bitmap) != -1)) {
1173 		if (trig_in->nodes_bitmap == NULL) {	/* all nodes */
1174 			xfree(trig_in->res_id);
1175 			trig_in->res_id = bitmap2node_name(
1176 					  trigger_down_nodes_bitmap);
1177 			trig_in->state = 1;
1178 		} else if (bit_overlap_any(trig_in->nodes_bitmap,
1179 				           trigger_down_nodes_bitmap)) {
1180 			bit_and(trig_in->nodes_bitmap,
1181 				trigger_down_nodes_bitmap);
1182 			xfree(trig_in->res_id);
1183 			trig_in->res_id = bitmap2node_name(
1184 					 trig_in->nodes_bitmap);
1185 			trig_in->state = 1;
1186 		}
1187 		if (trig_in->state == 1) {
1188 			trig_in->trig_time = now +
1189 					     (trig_in->trig_time - 0x8000);
1190 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1191 				info("trigger[%u] for node %s down",
1192 				     trig_in->trig_id, trig_in->res_id);
1193 			}
1194 			return;
1195 		}
1196 	}
1197 
1198 	if ((trig_in->trig_type & TRIGGER_TYPE_DRAINED) &&
1199 	    trigger_drained_nodes_bitmap                &&
1200 	    (bit_ffs(trigger_drained_nodes_bitmap) != -1)) {
1201 		if (trig_in->nodes_bitmap == NULL) {	/* all nodes */
1202 			xfree(trig_in->res_id);
1203 			trig_in->res_id = bitmap2node_name(
1204 					  trigger_drained_nodes_bitmap);
1205 			trig_in->state = 1;
1206 		} else if (bit_overlap_any(trig_in->nodes_bitmap,
1207 				           trigger_drained_nodes_bitmap)) {
1208 			bit_and(trig_in->nodes_bitmap,
1209 				trigger_drained_nodes_bitmap);
1210 			xfree(trig_in->res_id);
1211 			trig_in->res_id = bitmap2node_name(
1212 					  trig_in->nodes_bitmap);
1213 			trig_in->state = 1;
1214 		}
1215 		if (trig_in->state == 1) {
1216 			trig_in->trig_time = now +
1217 					     (trig_in->trig_time - 0x8000);
1218 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1219 				info("trigger[%u] for node %s drained",
1220 				     trig_in->trig_id, trig_in->res_id);
1221 			}
1222 			return;
1223 		}
1224 	}
1225 
1226 	if ((trig_in->trig_type & TRIGGER_TYPE_FAIL) &&
1227 	    trigger_fail_nodes_bitmap                &&
1228 	    (bit_ffs(trigger_fail_nodes_bitmap) != -1)) {
1229 		if (trig_in->nodes_bitmap == NULL) {	/* all nodes */
1230 			xfree(trig_in->res_id);
1231 			trig_in->res_id = bitmap2node_name(
1232 					  trigger_fail_nodes_bitmap);
1233 			trig_in->state = 1;
1234 		} else if (bit_overlap_any(trig_in->nodes_bitmap,
1235 					   trigger_fail_nodes_bitmap)) {
1236 			bit_and(trig_in->nodes_bitmap,
1237 				trigger_fail_nodes_bitmap);
1238 			xfree(trig_in->res_id);
1239 			trig_in->res_id = bitmap2node_name(
1240 					  trig_in->nodes_bitmap);
1241 			trig_in->state = 1;
1242 		}
1243 		if (trig_in->state == 1) {
1244 			trig_in->trig_time = now +
1245 					     (trig_in->trig_time - 0x8000);
1246 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1247 				info("trigger[%u] for node %s fail",
1248 				     trig_in->trig_id, trig_in->res_id);
1249 			}
1250 			return;
1251 		}
1252 	}
1253 
1254 	if (trig_in->trig_type & TRIGGER_TYPE_IDLE) {
1255 		/* We need to determine which (if any) of these
1256 		 * nodes have been idle for at least the offset time */
1257 		time_t min_idle = now - (trig_in->trig_time - 0x8000);
1258 		int i;
1259 		node_record_t *node_ptr = node_record_table_ptr;
1260 		bitstr_t *trigger_idle_node_bitmap;
1261 
1262 		trigger_idle_node_bitmap = bit_alloc(node_record_count);
1263 		for (i = 0; i < node_record_count; i++, node_ptr++) {
1264 			if (!IS_NODE_IDLE(node_ptr) ||
1265 			    (node_ptr->last_idle > min_idle))
1266 				continue;
1267 			bit_set(trigger_idle_node_bitmap, i);
1268 		}
1269 		if (trig_in->nodes_bitmap == NULL) {    /* all nodes */
1270 			xfree(trig_in->res_id);
1271 			trig_in->res_id = bitmap2node_name(
1272 					  trigger_idle_node_bitmap);
1273 			trig_in->state = 1;
1274 		} else if (bit_overlap_any(trig_in->nodes_bitmap,
1275 					   trigger_idle_node_bitmap)) {
1276 			bit_and(trig_in->nodes_bitmap,
1277 				trigger_idle_node_bitmap);
1278 			xfree(trig_in->res_id);
1279 			trig_in->res_id = bitmap2node_name(
1280 					  trig_in->nodes_bitmap);
1281 			trig_in->state = 1;
1282 		}
1283 		FREE_NULL_BITMAP(trigger_idle_node_bitmap);
1284 		if (trig_in->state == 1) {
1285 			trig_in->trig_time = now;
1286 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1287 				info("trigger[%u] for node %s idle",
1288 				     trig_in->trig_id, trig_in->res_id);
1289 			}
1290 			return;
1291 		}
1292 	}
1293 
1294 	if ((trig_in->trig_type & TRIGGER_TYPE_UP) &&
1295 	    trigger_up_nodes_bitmap                &&
1296 	    (bit_ffs(trigger_up_nodes_bitmap) != -1)) {
1297 		if (trig_in->nodes_bitmap == NULL) {	/* all nodes */
1298 			xfree(trig_in->res_id);
1299 			trig_in->res_id = bitmap2node_name(
1300 					  trigger_up_nodes_bitmap);
1301 			trig_in->state = 1;
1302 		} else if (bit_overlap_any(trig_in->nodes_bitmap,
1303 					   trigger_up_nodes_bitmap)) {
1304 			bit_and(trig_in->nodes_bitmap,
1305 				trigger_up_nodes_bitmap);
1306 			xfree(trig_in->res_id);
1307 			trig_in->res_id = bitmap2node_name(
1308 					  trig_in->nodes_bitmap);
1309 			trig_in->state = 1;
1310 		}
1311 		if (trig_in->state == 1) {
1312 			trig_in->trig_time = now +
1313 					     (trig_in->trig_time - 0x8000);
1314 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1315 				info("trigger[%u] for node %s up",
1316 				     trig_in->trig_id, trig_in->res_id);
1317 			}
1318 			return;
1319 		}
1320 	}
1321 
1322 	if ((trig_in->trig_type & TRIGGER_TYPE_RECONFIG) &&
1323 	    trigger_node_reconfig) {
1324 		trig_in->state = 1;
1325 		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
1326 		xfree(trig_in->res_id);
1327 		trig_in->res_id = xstrdup("reconfig");
1328 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS)
1329 			info("trigger[%u] for reconfig", trig_in->trig_id);
1330 		return;
1331 	}
1332 }
1333 
/*
 * Test a slurmctld trigger against the recorded controller events. The
 * events are tested in a fixed order and only the first match is applied:
 * state is set to 1, trig_time becomes the event time plus the trigger's
 * offset (stored biased by 0x8000) and res_id is replaced by the event name.
 */
static void _trigger_slurmctld_event(trig_mgr_info_t *trig_in, time_t now)
{
	const int log_it =
		(slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) != 0;
	const char *event = NULL;

	/* Pick the first pending event this trigger subscribes to */
	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_FAIL) &&
	    trigger_pri_ctld_fail) {
		event = "primary_slurmctld_failure";
		if (log_it) {
			info("trigger[%u] for primary_slurmctld_failure",
			     trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_RES_OP) &&
		   trigger_pri_ctld_res_op) {
		event = "primary_slurmctld_resumed_operation";
		if (log_it) {
			info("trigger[%u] for primary_slurmctld_resumed_"
			     "operation", trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_RES_CTRL) &&
		   trigger_pri_ctld_res_ctrl) {
		event = "primary_slurmctld_resumed_control";
		if (log_it) {
			info("trigger[%u] for primary_slurmctld_resumed_"
			     "control", trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_CTLD_ACCT_FULL) &&
		   trigger_pri_ctld_acct_buffer_full) {
		event = "primary_slurmctld_acct_buffer_full";
		if (log_it) {
			info("trigger[%u] for primary_slurmctld_acct_"
			     "buffer_full", trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_FAIL) &&
		   trigger_bu_ctld_fail) {
		event = "backup_slurmctld_failure";
		if (log_it) {
			info("trigger[%u] for backup_slurmctld_failure",
			     trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_RES_OP) &&
		   trigger_bu_ctld_res_op) {
		event = "backup_slurmctld_resumed_operation";
		if (log_it) {
			info("trigger[%u] for backup_slurmctld_resumed_"
			     "operation", trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_BU_CTLD_AS_CTRL) &&
		   trigger_bu_ctld_as_ctrl) {
		event = "backup_slurmctld_assumed_control";
		if (log_it) {
			info("trigger[%u] for bu_slurmctld_assumed_control",
			     trig_in->trig_id);
		}
	}

	if (!event)
		return;

	/* Common bookkeeping for whichever event matched */
	trig_in->state = 1;
	trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
	xfree(trig_in->res_id);
	trig_in->res_id = xstrdup(event);
}
1422 
/*
 * Test a slurmdbd trigger against the recorded slurmdbd events (primary
 * slurmdbd failure, then primary slurmdbd resumed operation). On the first
 * match state is set to 1, trig_time becomes the event time plus the
 * trigger's offset (stored biased by 0x8000) and res_id is replaced by the
 * event name.
 */
static void _trigger_slurmdbd_event(trig_mgr_info_t *trig_in, time_t now)
{
	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DBD_FAIL) &&
	    trigger_pri_dbd_fail) {
		trig_in->state = 1;
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_slurmdbd_failure");
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			/* Logged name now matches res_id
			 * (it was misspelled "slurmcdbd") */
			info("trigger[%u] for primary_slurmdbd_failure",
			     trig_in->trig_id);
		}
		return;
	}
	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DBD_RES_OP) &&
	    trigger_pri_dbd_res_op) {
		trig_in->state = 1;
		trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
		xfree(trig_in->res_id);
		trig_in->res_id = xstrdup("primary_slurmdbd_resumed_operation");
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
			info("trigger[%u] for primary_slurmdbd_resumed_"
			     "operation", trig_in->trig_id);
		}
		return;
	}
}
1449 
/*
 * Test a database trigger against the recorded database events (primary
 * database failure, then primary database resumed operation). On the first
 * match state is set to 1, trig_time becomes the event time plus the
 * trigger's offset (stored biased by 0x8000) and res_id is replaced by the
 * event name.
 */
static void _trigger_database_event(trig_mgr_info_t *trig_in, time_t now)
{
	const int log_it =
		(slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) != 0;
	const char *event = NULL;

	if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DB_FAIL) &&
	    trigger_pri_db_fail) {
		event = "primary_database_failure";
		if (log_it) {
			info("trigger[%u] for primary_database_failure",
			     trig_in->trig_id);
		}
	} else if ((trig_in->trig_type & TRIGGER_TYPE_PRI_DB_RES_OP) &&
		   trigger_pri_db_res_op) {
		event = "primary_database_resumed_operation";
		if (log_it) {
			info("trigger[%u] for primary_database_resumed_"
			     "operation", trig_in->trig_id);
		}
	}

	if (!event)
		return;

	/* Common bookkeeping for whichever event matched */
	trig_in->state = 1;
	trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
	xfree(trig_in->res_id);
	trig_in->res_id = xstrdup(event);
}
1477 
1478 /* Ideally we would use the existing proctrack plugin to prevent any
1479  * processes from escaping our control, but that plugin is tied
1480  * to various slurmd data structures. We just the process group ID
1481  * to kill the spawned program after MAX_PROG_TIME. Since triggers are
1482  * meant primarily for system administrators rather than users, this
1483  * may be sufficient. */
_trigger_run_program(trig_mgr_info_t * trig_in)1484 static void _trigger_run_program(trig_mgr_info_t *trig_in)
1485 {
1486 	char *tmp, *save_ptr = NULL, *tok;
1487 	char *program, *args[64], user_name[1024];
1488 	char *pname, *uname;
1489 	uid_t uid;
1490 	gid_t gid;
1491 	pid_t child_pid;
1492 	int i;
1493 
1494 	if (!_validate_trigger(trig_in))
1495 		return;
1496 
1497 	tmp = xstrdup(trig_in->program);
1498 	tok = strtok_r(trig_in->program, " ", &save_ptr);
1499 	program = xstrdup(tok);
1500 	pname = strrchr(program, '/');
1501 	if (pname == NULL)
1502 		pname = program;
1503 	else
1504 		pname++;
1505 	args[0] = xstrdup(pname);
1506 	for (i = 1; i < 63; i++) {
1507 		tok = strtok_r(NULL, " ", &save_ptr);
1508 		if (!tok) {
1509 			args[i] = xstrdup(trig_in->res_id);
1510 			break;
1511 		}
1512 		args[i] = xstrdup(tok);
1513 	}
1514 	for (i++; i < 64; i++)
1515 		args[i] = NULL;
1516 	xfree(tmp);
1517 
1518 	uid = trig_in->user_id;
1519 	gid = trig_in->group_id;
1520 	uname = uid_to_string(uid);
1521 	snprintf(user_name, sizeof(user_name), "%s", uname);
1522 	xfree(uname);
1523 
1524 	child_pid = fork();
1525 	if (child_pid > 0) {
1526 		trig_in->child_pid = child_pid;
1527 	} else if (child_pid == 0) {
1528 		int i;
1529 		bool run_as_self = (uid == slurmctld_conf.slurm_user_id);
1530 
1531 		for (i = 0; i < 1024; i++)
1532 			(void) close(i);
1533 		setpgid(0, 0);
1534 		setsid();
1535 		if ((initgroups(user_name, gid) == -1) && !run_as_self) {
1536 			error("trigger: initgroups: %m");
1537 			exit(1);
1538 		}
1539 		if ((setgid(gid) == -1) && !run_as_self){
1540 			error("trigger: setgid: %m");
1541 			exit(1);
1542 		}
1543 		if ((setuid(uid) == -1) && !run_as_self) {
1544 			error("trigger: setuid: %m");
1545 			exit(1);
1546 		}
1547 		execv(program, args);
1548 		exit(1);
1549 	} else {
1550 		error("fork: %m");
1551 	}
1552 	xfree(program);
1553 	for (i = 0; i < 64; i++)
1554 		xfree(args[i]);
1555 }
1556 
_clear_event_triggers(void)1557 static void _clear_event_triggers(void)
1558 {
1559 	if (trigger_down_front_end_bitmap) {
1560 		bit_nclear(trigger_down_front_end_bitmap,
1561 			   0, (bit_size(trigger_down_front_end_bitmap) - 1));
1562 	}
1563 	if (trigger_up_front_end_bitmap) {
1564 		bit_nclear(trigger_up_front_end_bitmap,
1565 			   0, (bit_size(trigger_up_front_end_bitmap) - 1));
1566 	}
1567 	if (trigger_down_nodes_bitmap) {
1568 		bit_nclear(trigger_down_nodes_bitmap,
1569 			   0, (bit_size(trigger_down_nodes_bitmap) - 1));
1570 	}
1571 	if (trigger_drained_nodes_bitmap) {
1572 		bit_nclear(trigger_drained_nodes_bitmap,
1573 			   0, (bit_size(trigger_drained_nodes_bitmap) - 1));
1574 	}
1575 	if (trigger_up_nodes_bitmap) {
1576 		bit_nclear(trigger_up_nodes_bitmap,
1577 			   0, (bit_size(trigger_up_nodes_bitmap) - 1));
1578 	}
1579 	trigger_node_reconfig = false;
1580 	trigger_bb_error = false;
1581 	trigger_pri_ctld_fail = false;
1582 	trigger_pri_ctld_res_op = false;
1583 	trigger_pri_ctld_res_ctrl = false;
1584 	trigger_pri_ctld_acct_buffer_full = false;
1585 	trigger_bu_ctld_fail = false;
1586 	trigger_bu_ctld_res_op = false;
1587 	trigger_bu_ctld_as_ctrl = false;
1588 	trigger_pri_dbd_fail = false;
1589 	trigger_pri_dbd_res_op = false;
1590 	trigger_pri_db_fail = false;
1591 	trigger_pri_db_res_op = false;
1592 }
1593 
1594 /* Make a copy of a trigger and pre-pend it on our list */
_trigger_clone(trig_mgr_info_t * trig_in)1595 static void _trigger_clone(trig_mgr_info_t *trig_in)
1596 {
1597 	trig_mgr_info_t *trig_add;
1598 
1599 	trig_add = xmalloc(sizeof(trig_mgr_info_t));
1600 	trig_add->flags     = trig_in->flags;
1601 	trig_add->trig_id   = trig_in->trig_id;
1602 	trig_add->res_type  = trig_in->res_type;
1603 	if (trig_in->orig_res_id) {
1604 		trig_add->res_id = xstrdup(trig_in->orig_res_id);
1605 		trig_add->orig_res_id = xstrdup(trig_in->orig_res_id);
1606 	}
1607 	if (trig_in->orig_bitmap) {
1608 		trig_add->nodes_bitmap = bit_copy(trig_in->orig_bitmap);
1609 		trig_add->orig_bitmap  = bit_copy(trig_in->orig_bitmap);
1610 	}
1611 	trig_add->job_id    = trig_in->job_id;
1612 	trig_add->job_ptr   = trig_in->job_ptr;
1613 	trig_add->trig_type = trig_in->trig_type;
1614 	trig_add->trig_time = trig_in->orig_time;
1615 	trig_add->orig_time = trig_in->orig_time;
1616 	trig_add->user_id   = trig_in->user_id;
1617 	trig_add->group_id  = trig_in->group_id;
1618 	trig_add->program   = xstrdup(trig_in->program);;
1619 	list_prepend(trigger_list, trig_add);
1620 }
1621 
trigger_process(void)1622 extern void trigger_process(void)
1623 {
1624 	ListIterator trig_iter;
1625 	trig_mgr_info_t *trig_in;
1626 	time_t now = time(NULL);
1627 	bool state_change = false;
1628 	pid_t rc;
1629 	int prog_stat;
1630 
1631 	slurm_mutex_lock(&trigger_mutex);
1632 	if (trigger_list == NULL)
1633 		trigger_list = list_create(_trig_del);
1634 
1635 	trig_iter = list_iterator_create(trigger_list);
1636 	while ((trig_in = list_next(trig_iter))) {
1637 		if (trig_in->state == 0) {
1638 			if (trig_in->res_type == TRIGGER_RES_TYPE_OTHER)
1639 				_trigger_other_event(trig_in, now);
1640 			else if (trig_in->res_type == TRIGGER_RES_TYPE_JOB)
1641 				_trigger_job_event(trig_in, now);
1642 			else if (trig_in->res_type == TRIGGER_RES_TYPE_NODE)
1643 				_trigger_node_event(trig_in, now);
1644 			else if (trig_in->res_type ==
1645 				 TRIGGER_RES_TYPE_SLURMCTLD)
1646 				_trigger_slurmctld_event(trig_in, now);
1647 			else if (trig_in->res_type ==
1648 				 TRIGGER_RES_TYPE_SLURMDBD)
1649 				_trigger_slurmdbd_event(trig_in, now);
1650 			else if (trig_in->res_type ==
1651 				 TRIGGER_RES_TYPE_DATABASE)
1652 			 	_trigger_database_event(trig_in, now);
1653 			else if (trig_in->res_type ==
1654 				 TRIGGER_RES_TYPE_FRONT_END)
1655 			 	_trigger_front_end_event(trig_in, now);
1656 		}
1657 		if ((trig_in->state == 1) &&
1658 		    (trig_in->trig_time <= now)) {
1659 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRIGGERS) {
1660 				info("launching program for trigger[%u]",
1661 				     trig_in->trig_id);
1662 				info("  uid=%u gid=%u program=%s arg=%s",
1663 				     trig_in->user_id, trig_in->group_id,
1664 				     trig_in->program, trig_in->res_id);
1665 			}
1666 			if (trig_in->flags & TRIGGER_FLAG_PERM) {
1667 				_trigger_clone(trig_in);
1668 			}
1669 			trig_in->state = 2;
1670 			trig_in->trig_time = now;
1671 			state_change = true;
1672 			_trigger_run_program(trig_in);
1673 		} else if ((trig_in->state == 2) &&
1674 			   (difftime(now, trig_in->trig_time) >
1675 			    MAX_PROG_TIME)) {
1676 			if (trig_in->child_pid != 0) {
1677 				killpg(trig_in->child_pid, SIGKILL);
1678 				rc = waitpid(trig_in->child_pid, &prog_stat,
1679 					     WNOHANG);
1680 				if ((rc > 0) && prog_stat) {
1681 					info("trigger uid=%u type=%s:%s "
1682 					     "exit=%u:%u",
1683 					     trig_in->user_id,
1684 					     trigger_res_type(trig_in->res_type),
1685 					     trigger_type(trig_in->trig_type),
1686 					     WIFEXITED(prog_stat),
1687 					     WTERMSIG(prog_stat));
1688 				}
1689 				if ((rc == trig_in->child_pid) ||
1690 				    ((rc == -1) && (errno == ECHILD)))
1691 					trig_in->child_pid = 0;
1692 			}
1693 
1694 			if (trig_in->child_pid == 0) {
1695 				if (slurmctld_conf.debug_flags &
1696 				    DEBUG_FLAG_TRIGGERS) {
1697 					info("purging trigger[%u]",
1698 					     trig_in->trig_id);
1699 				}
1700 				list_delete_item(trig_iter);
1701 				state_change = true;
1702 			}
1703 		} else if (trig_in->state == 2) {
1704 			/* Elimiate zombie processes right away.
1705 			 * Purge trigger entry above MAX_PROG_TIME later */
1706 			rc = waitpid(trig_in->child_pid, &prog_stat, WNOHANG);
1707 			if ((rc > 0) && prog_stat) {
1708 				info("trigger uid=%u type=%s:%s exit=%u:%u",
1709 				     trig_in->user_id,
1710 				     trigger_res_type(trig_in->res_type),
1711 				     trigger_type(trig_in->trig_type),
1712 				     WIFEXITED(prog_stat),
1713 				     WTERMSIG(prog_stat));
1714 			}
1715 			if ((rc == trig_in->child_pid) ||
1716 			    ((rc == -1) && (errno == ECHILD)))
1717 				trig_in->child_pid = 0;
1718 		}
1719 	}
1720 	list_iterator_destroy(trig_iter);
1721 	_clear_event_triggers();
1722 	slurm_mutex_unlock(&trigger_mutex);
1723 	if (state_change)
1724 		schedule_trigger_save();
1725 }
1726 
1727 /* Free all allocated memory */
trigger_fini(void)1728 extern void trigger_fini(void)
1729 {
1730 	FREE_NULL_LIST(trigger_list);
1731 	FREE_NULL_BITMAP(trigger_down_front_end_bitmap);
1732 	FREE_NULL_BITMAP(trigger_up_front_end_bitmap);
1733 	FREE_NULL_BITMAP(trigger_down_nodes_bitmap);
1734 	FREE_NULL_BITMAP(trigger_drained_nodes_bitmap);
1735 	FREE_NULL_BITMAP(trigger_fail_nodes_bitmap);
1736 	FREE_NULL_BITMAP(trigger_up_nodes_bitmap);
1737 }
1738