1 /*
2 * Copyright 2012-2020 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 */
21
22 #include <crm_internal.h>
23
24 #include <glib.h>
25 #include <unistd.h>
26
27 #include <crm/crm.h>
28 #include <crm/services.h>
29 #include <crm/common/mainloop.h>
30 #include <crm/common/ipc.h>
31 #include <crm/common/ipcs.h>
32 #include <crm/msg_xml.h>
33
34 #include <lrmd_private.h>
35
36 #ifdef HAVE_SYS_TIMEB_H
37 # include <sys/timeb.h>
38 #endif
39
/* Maximum number of characters (not counting the terminating NUL) kept when
 * an exit reason is extracted from an action's output
 */
#define EXIT_REASON_MAX_LEN 128

/* Table of known resources: looked up by resource ID string, and the values
 * are lrmd_rsc_t objects
 */
GHashTable *rsc_list = NULL;
43
/* State of a single resource operation request, from the time it is created
 * from a client's XML message until its result has been reported.
 */
typedef struct lrmd_cmd_s {
    int timeout;        /* Timeout currently in effect; may be lowered to the
                         * remaining time when an operation is re-scheduled,
                         * and is restored from timeout_orig on finalization */
    int interval;       /* Nonzero for recurring operations; also used as the
                         * g_timeout_add() period for stonith recurrences */
    int start_delay;    /* If nonzero, passed to g_timeout_add() when the
                         * command is scheduled */
    int timeout_orig;   /* Timeout as originally requested */

    int call_id;        /* Call ID taken from the request */
    int exec_rc;        /* Most recent result, mapped to an OCF-style code */
    int lrmd_op_status; /* Most recent operation status (PCMK_LRM_OP_*) */

    int call_opts;      /* Call options taken from the request */
    /* Timer ids, must be removed on cmd destruction. */
    int delay_id;
    int stonith_recurring_id;

    int rsc_deleted;    /* Set to 1 at finalization if the resource is gone */

    int service_flags;  /* Flags such as SVC_ACTION_LEAVE_GROUP */

    char *client_id;    /* ID of the client that requested the command */
    char *origin;
    char *rsc_id;
    char *action;       /* Name of the action to execute */
    char *real_action;  /* Originally requested action, when "action" has been
                         * replaced by a follow-up "monitor" */
    char *exit_reason;  /* Exit reason parsed from the action's stderr */
    char *output;       /* Copy of the action's stderr (or else stdout) */
    char *userdata_str;

    /* when set, this cmd should go through a container wrapper */
    /* (points at a value stored in params, so it is not freed separately) */
    const char *isolation_wrapper;

#ifdef HAVE_SYS_TIMEB_H
    /* recurring and systemd operations may involve more than one lrmd command
     * per operation, so they need info about original and most recent
     */
    struct timeb t_first_run;   /* Timestamp of when op first ran */
    struct timeb t_run;         /* Timestamp of when op most recently ran */
    struct timeb t_first_queue; /* Timestamp of when op first was queued */
    struct timeb t_queue;       /* Timestamp of when op most recently was queued */
    struct timeb t_rcchange;    /* Timestamp of last rc change */
#endif

    int first_notify_sent;      /* Nonzero once a result has been sent */
    int last_notify_rc;         /* exec_rc reported in the last notification */
    int last_notify_op_status;  /* lrmd_op_status reported in the last notification */
    int last_pid;               /* PID of the most recent execution, if any */

    GHashTable *params;         /* Parameters built from the request XML */
} lrmd_cmd_t;
93
/* Forward declarations: lrmd_rsc_dispatch() and cancel_all_recurring() are
 * referenced by code that appears before any definition of them
 */
static void cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc);
static gboolean lrmd_rsc_dispatch(gpointer user_data);
static void cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id);
97
98 static void
log_finished(lrmd_cmd_t * cmd,int exec_time,int queue_time)99 log_finished(lrmd_cmd_t * cmd, int exec_time, int queue_time)
100 {
101 char pid_str[32] = { 0, };
102 int log_level = LOG_INFO;
103
104 if (cmd->last_pid) {
105 snprintf(pid_str, 32, "%d", cmd->last_pid);
106 }
107
108 if (safe_str_eq(cmd->action, "monitor")) {
109 log_level = LOG_DEBUG;
110 }
111 #ifdef HAVE_SYS_TIMEB_H
112 do_crm_log(log_level,
113 "finished - rsc:%s action:%s call_id:%d %s%s exit-code:%d exec-time:%dms queue-time:%dms",
114 cmd->rsc_id, cmd->action, cmd->call_id, cmd->last_pid ? "pid:" : "", pid_str,
115 cmd->exec_rc, exec_time, queue_time);
116 #else
117 do_crm_log(log_level, "finished - rsc:%s action:%s call_id:%d %s%s exit-code:%d",
118 cmd->rsc_id,
119 cmd->action, cmd->call_id, cmd->last_pid ? "pid:" : "", pid_str, cmd->exec_rc);
120 #endif
121 }
122
123 static void
log_execute(lrmd_cmd_t * cmd)124 log_execute(lrmd_cmd_t * cmd)
125 {
126 int log_level = LOG_INFO;
127
128 if (safe_str_eq(cmd->action, "monitor")) {
129 log_level = LOG_DEBUG;
130 }
131
132 do_crm_log(log_level, "executing - rsc:%s action:%s call_id:%d",
133 cmd->rsc_id, cmd->action, cmd->call_id);
134 }
135
136 static const char *
normalize_action_name(lrmd_rsc_t * rsc,const char * action)137 normalize_action_name(lrmd_rsc_t * rsc, const char *action)
138 {
139 if (safe_str_eq(action, "monitor") &&
140 is_set(pcmk_get_ra_caps(rsc->class), pcmk_ra_cap_status)) {
141 return "status";
142 }
143 return action;
144 }
145
146 static lrmd_rsc_t *
build_rsc_from_xml(xmlNode * msg)147 build_rsc_from_xml(xmlNode * msg)
148 {
149 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
150 lrmd_rsc_t *rsc = NULL;
151
152 rsc = calloc(1, sizeof(lrmd_rsc_t));
153
154 crm_element_value_int(msg, F_LRMD_CALLOPTS, &rsc->call_opts);
155
156 rsc->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
157 rsc->class = crm_element_value_copy(rsc_xml, F_LRMD_CLASS);
158 rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER);
159 rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE);
160 rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc);
161 rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running"
162 return rsc;
163 }
164
165 static lrmd_cmd_t *
create_lrmd_cmd(xmlNode * msg,crm_client_t * client,lrmd_rsc_t * rsc)166 create_lrmd_cmd(xmlNode * msg, crm_client_t * client, lrmd_rsc_t *rsc)
167 {
168 int call_options = 0;
169 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
170 lrmd_cmd_t *cmd = NULL;
171
172 cmd = calloc(1, sizeof(lrmd_cmd_t));
173
174 crm_element_value_int(msg, F_LRMD_CALLOPTS, &call_options);
175 cmd->call_opts = call_options;
176 cmd->client_id = strdup(client->id);
177
178 crm_element_value_int(msg, F_LRMD_CALLID, &cmd->call_id);
179 crm_element_value_int(rsc_xml, F_LRMD_RSC_INTERVAL, &cmd->interval);
180 crm_element_value_int(rsc_xml, F_LRMD_TIMEOUT, &cmd->timeout);
181 crm_element_value_int(rsc_xml, F_LRMD_RSC_START_DELAY, &cmd->start_delay);
182 cmd->timeout_orig = cmd->timeout;
183
184 cmd->origin = crm_element_value_copy(rsc_xml, F_LRMD_ORIGIN);
185 cmd->action = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ACTION);
186 cmd->userdata_str = crm_element_value_copy(rsc_xml, F_LRMD_RSC_USERDATA_STR);
187 cmd->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
188
189 cmd->params = xml2list(rsc_xml);
190 cmd->isolation_wrapper = g_hash_table_lookup(cmd->params, "CRM_meta_isolation_wrapper");
191
192 if (cmd->isolation_wrapper) {
193 if (g_hash_table_lookup(cmd->params, "CRM_meta_isolation_instance") == NULL) {
194 g_hash_table_insert(cmd->params, strdup("CRM_meta_isolation_instance"), strdup(rsc->rsc_id));
195 }
196 if (rsc->provider) {
197 g_hash_table_insert(cmd->params, strdup("CRM_meta_provider"), strdup(rsc->provider));
198 }
199 g_hash_table_insert(cmd->params, strdup("CRM_meta_class"), strdup(rsc->class));
200 g_hash_table_insert(cmd->params, strdup("CRM_meta_type"), strdup(rsc->type));
201 }
202
203 if (safe_str_eq(g_hash_table_lookup(cmd->params, "CRM_meta_on_fail"), "block")) {
204 crm_debug("Setting flag to leave pid group on timeout and only kill action pid for %s_%s_%d", cmd->rsc_id, cmd->action, cmd->interval);
205 cmd->service_flags |= SVC_ACTION_LEAVE_GROUP;
206 }
207 return cmd;
208 }
209
210 static void
free_lrmd_cmd(lrmd_cmd_t * cmd)211 free_lrmd_cmd(lrmd_cmd_t * cmd)
212 {
213 if (cmd->stonith_recurring_id) {
214 g_source_remove(cmd->stonith_recurring_id);
215 }
216 if (cmd->delay_id) {
217 g_source_remove(cmd->delay_id);
218 }
219 if (cmd->params) {
220 g_hash_table_destroy(cmd->params);
221 }
222 free(cmd->origin);
223 free(cmd->action);
224 free(cmd->real_action);
225 free(cmd->userdata_str);
226 free(cmd->rsc_id);
227 free(cmd->output);
228 free(cmd->exit_reason);
229 free(cmd->client_id);
230 free(cmd);
231 }
232
233 static gboolean
stonith_recurring_op_helper(gpointer data)234 stonith_recurring_op_helper(gpointer data)
235 {
236 lrmd_cmd_t *cmd = data;
237 lrmd_rsc_t *rsc;
238
239 cmd->stonith_recurring_id = 0;
240
241 if (!cmd->rsc_id) {
242 return FALSE;
243 }
244
245 rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
246
247 CRM_ASSERT(rsc != NULL);
248 /* take it out of recurring_ops list, and put it in the pending ops
249 * to be executed */
250 rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
251 rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
252 #ifdef HAVE_SYS_TIMEB_H
253 ftime(&cmd->t_queue);
254 if (cmd->t_first_queue.time == 0) {
255 cmd->t_first_queue = cmd->t_queue;
256 }
257 #endif
258 mainloop_set_trigger(rsc->work);
259
260 return FALSE;
261 }
262
263 static gboolean
start_delay_helper(gpointer data)264 start_delay_helper(gpointer data)
265 {
266 lrmd_cmd_t *cmd = data;
267 lrmd_rsc_t *rsc = NULL;
268
269 cmd->delay_id = 0;
270 rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
271
272 if (rsc) {
273 mainloop_set_trigger(rsc->work);
274 }
275
276 return FALSE;
277 }
278
279 static gboolean
merge_recurring_duplicate(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)280 merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
281 {
282 GListPtr gIter = NULL;
283 lrmd_cmd_t * dup = NULL;
284 gboolean dup_pending = FALSE;
285
286 if (cmd->interval == 0) {
287 return 0;
288 }
289
290 for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) {
291 dup = gIter->data;
292 if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) {
293 dup_pending = TRUE;
294 goto merge_dup;
295 }
296 }
297
298 /* if dup is in recurring_ops list, that means it has already executed
299 * and is in the interval loop. we can't just remove it in this case. */
300 for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) {
301 dup = gIter->data;
302 if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) {
303 goto merge_dup;
304 }
305 }
306
307 return FALSE;
308 merge_dup:
309
310
311 /* This should not occur, if it does we need to investigate in the crmd
312 * how something like this is possible */
313 crm_warn("Duplicate recurring op entry detected (%s_%s_%d), merging with previous op entry",
314 rsc->rsc_id,
315 normalize_action_name(rsc, dup->action),
316 dup->interval);
317
318 /* merge */
319 dup->first_notify_sent = 0;
320 free(dup->userdata_str);
321 dup->userdata_str = cmd->userdata_str;
322 cmd->userdata_str = NULL;
323 dup->call_id = cmd->call_id;
324
325 if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
326 /* if we are waiting for the next interval, kick it off now */
327 if (dup_pending == TRUE) {
328 g_source_remove(cmd->stonith_recurring_id);
329 cmd->stonith_recurring_id = 0;
330 stonith_recurring_op_helper(cmd);
331 }
332
333 } else if (dup_pending == FALSE) {
334 /* if we've already handed this to the service lib, kick off an early execution */
335 services_action_kick(rsc->rsc_id, normalize_action_name(rsc, dup->action), dup->interval);
336 }
337 free_lrmd_cmd(cmd);
338
339 return TRUE;
340 }
341
342 static void
schedule_lrmd_cmd(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)343 schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
344 {
345 gboolean dup_processed = FALSE;
346 CRM_CHECK(cmd != NULL, return);
347 CRM_CHECK(rsc != NULL, return);
348
349 crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id);
350
351 dup_processed = merge_recurring_duplicate(rsc, cmd);
352 if (dup_processed) {
353 /* duplicate recurring cmd found, cmds merged */
354 return;
355 }
356
357 /* crmd expects lrmd to automatically cancel recurring ops before rsc stops. */
358 if (safe_str_eq(cmd->action, "stop")) {
359 cancel_all_recurring(rsc, NULL);
360 }
361
362 rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
363 #ifdef HAVE_SYS_TIMEB_H
364 ftime(&cmd->t_queue);
365 if (cmd->t_first_queue.time == 0) {
366 cmd->t_first_queue = cmd->t_queue;
367 }
368 #endif
369 mainloop_set_trigger(rsc->work);
370
371 if (cmd->start_delay) {
372 cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
373 }
374 }
375
376 static xmlNode *
create_lrmd_reply(const char * origin,int rc,int call_id)377 create_lrmd_reply(const char *origin, int rc, int call_id)
378 {
379 xmlNode *reply = create_xml_node(NULL, T_LRMD_REPLY);
380
381 crm_xml_add(reply, F_LRMD_ORIGIN, origin);
382 crm_xml_add_int(reply, F_LRMD_RC, rc);
383 crm_xml_add_int(reply, F_LRMD_CALLID, call_id);
384 return reply;
385 }
386
387 static void
send_client_notify(gpointer key,gpointer value,gpointer user_data)388 send_client_notify(gpointer key, gpointer value, gpointer user_data)
389 {
390 xmlNode *update_msg = user_data;
391 crm_client_t *client = value;
392 int rc;
393
394 if (client == NULL) {
395 crm_err("Asked to send event to NULL client");
396 return;
397 } else if (client->name == NULL) {
398 crm_trace("Asked to send event to client with no name");
399 return;
400 }
401
402 rc = lrmd_server_send_notify(client, update_msg);
403 if ((rc <= 0) && (rc != -ENOTCONN)) {
404 crm_warn("Could not notify client %s/%s: %s " CRM_XS " rc=%d",
405 client->name, client->id,
406 (rc? pcmk_strerror(rc) : "no data sent"), rc);
407 }
408 }
409
410 #ifdef HAVE_SYS_TIMEB_H
/*!
 * \internal
 * \brief Compute how many milliseconds lie between two timestamps
 *
 * \param[in] now  Later timestamp (NULL means "the current time")
 * \param[in] old  Earlier timestamp
 *
 * \return Difference in milliseconds, or 0 if \p old is NULL or its time
 *         field is zero
 */
static int
time_diff_ms(struct timeb *now, struct timeb *old)
{
    struct timeb current = { 0, };
    const struct timeb *newer = now;

    if (newer == NULL) {
        ftime(&current);
        newer = &current;
    }

    /* An unset earlier timestamp means there is nothing to measure */
    if (old == NULL) {
        return 0;
    }
    if (old->time == 0) {
        return 0;
    }

    return difftime(newer->time, old->time) * 1000 + newer->millitm - old->millitm;
}
434
435 /*!
436 * \internal
437 * \brief Reset a command's operation times to their original values.
438 *
439 * Reset a command's run and queued timestamps to the timestamps of the original
440 * command, so we report the entire time since then and not just the time since
441 * the most recent command (for recurring and systemd operations).
442 *
443 * /param[in] cmd LRMD command object to reset
444 *
445 * /note It's not obvious what the queued time should be for a systemd
446 * start/stop operation, which might go like this:
447 * initial command queued 5ms, runs 3s
448 * monitor command queued 10ms, runs 10s
449 * monitor command queued 10ms, runs 10s
450 * Is the queued time for that operation 5ms, 10ms or 25ms? The current
451 * implementation will report 5ms. If it's 25ms, then we need to
452 * subtract 20ms from the total exec time so as not to count it twice.
453 * We can implement that later if it matters to anyone ...
454 */
455 static void
cmd_original_times(lrmd_cmd_t * cmd)456 cmd_original_times(lrmd_cmd_t * cmd)
457 {
458 cmd->t_run = cmd->t_first_run;
459 cmd->t_queue = cmd->t_first_queue;
460 }
461 #endif
462
463 static void
send_cmd_complete_notify(lrmd_cmd_t * cmd)464 send_cmd_complete_notify(lrmd_cmd_t * cmd)
465 {
466 int exec_time = 0;
467 int queue_time = 0;
468 xmlNode *notify = NULL;
469
470 #ifdef HAVE_SYS_TIMEB_H
471 exec_time = time_diff_ms(NULL, &cmd->t_run);
472 queue_time = time_diff_ms(&cmd->t_run, &cmd->t_queue);
473 #endif
474
475 log_finished(cmd, exec_time, queue_time);
476
477 /* if the first notify result for a cmd has already been sent earlier, and the
478 * the option to only send notifies on result changes is set. Check to see
479 * if the last result is the same as the new one. If so, suppress this update */
480 if (cmd->first_notify_sent && (cmd->call_opts & lrmd_opt_notify_changes_only)) {
481 if (cmd->last_notify_rc == cmd->exec_rc &&
482 cmd->last_notify_op_status == cmd->lrmd_op_status) {
483
484 /* only send changes */
485 return;
486 }
487
488 }
489
490 cmd->first_notify_sent = 1;
491 cmd->last_notify_rc = cmd->exec_rc;
492 cmd->last_notify_op_status = cmd->lrmd_op_status;
493
494 notify = create_xml_node(NULL, T_LRMD_NOTIFY);
495
496 crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
497 crm_xml_add_int(notify, F_LRMD_TIMEOUT, cmd->timeout);
498 crm_xml_add_int(notify, F_LRMD_RSC_INTERVAL, cmd->interval);
499 crm_xml_add_int(notify, F_LRMD_RSC_START_DELAY, cmd->start_delay);
500 crm_xml_add_int(notify, F_LRMD_EXEC_RC, cmd->exec_rc);
501 crm_xml_add_int(notify, F_LRMD_OP_STATUS, cmd->lrmd_op_status);
502 crm_xml_add_int(notify, F_LRMD_CALLID, cmd->call_id);
503 crm_xml_add_int(notify, F_LRMD_RSC_DELETED, cmd->rsc_deleted);
504
505 #ifdef HAVE_SYS_TIMEB_H
506 crm_xml_add_int(notify, F_LRMD_RSC_RUN_TIME, cmd->t_run.time);
507 crm_xml_add_int(notify, F_LRMD_RSC_RCCHANGE_TIME, cmd->t_rcchange.time);
508 crm_xml_add_int(notify, F_LRMD_RSC_EXEC_TIME, exec_time);
509 crm_xml_add_int(notify, F_LRMD_RSC_QUEUE_TIME, queue_time);
510 #endif
511
512 crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC);
513 crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id);
514 if(cmd->real_action) {
515 crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action);
516 } else {
517 crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action);
518 }
519 crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str);
520 crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->output);
521 crm_xml_add(notify, F_LRMD_RSC_EXIT_REASON, cmd->exit_reason);
522
523 if (cmd->params) {
524 char *key = NULL;
525 char *value = NULL;
526 GHashTableIter iter;
527
528 xmlNode *args = create_xml_node(notify, XML_TAG_ATTRS);
529
530 g_hash_table_iter_init(&iter, cmd->params);
531 while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
532 hash2smartfield((gpointer) key, (gpointer) value, args);
533 }
534 }
535 if (cmd->client_id && (cmd->call_opts & lrmd_opt_notify_orig_only)) {
536 crm_client_t *client = crm_client_get_by_id(cmd->client_id);
537
538 if (client) {
539 send_client_notify(client->id, client, notify);
540 }
541 } else if (client_connections != NULL) {
542 g_hash_table_foreach(client_connections, send_client_notify, notify);
543 }
544
545 free_xml(notify);
546 }
547
548 static void
send_generic_notify(int rc,xmlNode * request)549 send_generic_notify(int rc, xmlNode * request)
550 {
551 if (client_connections != NULL) {
552 int call_id = 0;
553 xmlNode *notify = NULL;
554 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
555 const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
556 const char *op = crm_element_value(request, F_LRMD_OPERATION);
557
558 crm_element_value_int(request, F_LRMD_CALLID, &call_id);
559
560 notify = create_xml_node(NULL, T_LRMD_NOTIFY);
561 crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
562 crm_xml_add_int(notify, F_LRMD_RC, rc);
563 crm_xml_add_int(notify, F_LRMD_CALLID, call_id);
564 crm_xml_add(notify, F_LRMD_OPERATION, op);
565 crm_xml_add(notify, F_LRMD_RSC_ID, rsc_id);
566
567 g_hash_table_foreach(client_connections, send_client_notify, notify);
568
569 free_xml(notify);
570 }
571 }
572
573 static void
cmd_reset(lrmd_cmd_t * cmd)574 cmd_reset(lrmd_cmd_t * cmd)
575 {
576 cmd->lrmd_op_status = 0;
577 cmd->last_pid = 0;
578 #ifdef HAVE_SYS_TIMEB_H
579 memset(&cmd->t_run, 0, sizeof(cmd->t_run));
580 memset(&cmd->t_queue, 0, sizeof(cmd->t_queue));
581 #endif
582 free(cmd->exit_reason);
583 cmd->exit_reason = NULL;
584 free(cmd->output);
585 cmd->output = NULL;
586
587 }
588
589 static void
cmd_finalize(lrmd_cmd_t * cmd,lrmd_rsc_t * rsc)590 cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc)
591 {
592 crm_trace("Resource operation rsc:%s action:%s completed (%p %p)", cmd->rsc_id, cmd->action,
593 rsc ? rsc->active : NULL, cmd);
594
595 if (rsc && (rsc->active == cmd)) {
596 rsc->active = NULL;
597 mainloop_set_trigger(rsc->work);
598 }
599
600 if (!rsc) {
601 cmd->rsc_deleted = 1;
602 }
603
604 /* reset original timeout so client notification has correct information */
605 cmd->timeout = cmd->timeout_orig;
606
607 send_cmd_complete_notify(cmd);
608
609 if (cmd->interval && (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED)) {
610 if (rsc) {
611 rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
612 rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
613 }
614 free_lrmd_cmd(cmd);
615 } else if (cmd->interval == 0) {
616 if (rsc) {
617 rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
618 }
619 free_lrmd_cmd(cmd);
620 } else {
621 /* Clear all the values pertaining just to the last iteration of a recurring op. */
622 cmd_reset(cmd);
623 }
624 }
625
626 #if SUPPORT_HEARTBEAT
pattern_matched(const char * pat,const char * str)627 static int pattern_matched(const char *pat, const char *str)
628 {
629 if (g_pattern_match_simple(pat, str)) {
630 crm_debug("RA output matched stopped pattern [%s]", pat);
631 return TRUE;
632 }
633 return FALSE;
634 }
635
636 static int
hb2uniform_rc(const char * action,int rc,const char * stdout_data)637 hb2uniform_rc(const char *action, int rc, const char *stdout_data)
638 {
639 const char *stop_pattern[] = { "*stopped*", "*not*running*" };
640 const char *running_pattern[] = { "*running*", "*OK*" };
641 char *lower_std_output = NULL;
642 int result;
643
644
645 if (rc < 0) {
646 return PCMK_OCF_UNKNOWN_ERROR;
647 }
648
649 /* Treat class heartbeat the same as class lsb. */
650 if (!safe_str_eq(action, "status") && !safe_str_eq(action, "monitor")) {
651 return services_get_ocf_exitcode(action, rc);
652 }
653
654 /* for status though, exit code is ignored,
655 * and the stdout is scanned for specific strings */
656 if (stdout_data == NULL) {
657 crm_warn("No status output from the (hb) resource agent, assuming stopped");
658 return PCMK_OCF_NOT_RUNNING;
659 }
660
661 lower_std_output = g_ascii_strdown(stdout_data, -1);
662
663 if (pattern_matched(stop_pattern[0], lower_std_output) ||
664 pattern_matched(stop_pattern[1], lower_std_output)) {
665 result = PCMK_OCF_NOT_RUNNING;
666 } else if (pattern_matched(running_pattern[0], lower_std_output) ||
667 pattern_matched(running_pattern[1], stdout_data)) {
668 /* "OK" is matched case sensitive */
669 result = PCMK_OCF_OK;
670 } else {
671 /* It didn't say it was running - must be stopped */
672 crm_debug("RA output did not match any pattern, assuming stopped");
673 result = PCMK_OCF_NOT_RUNNING;
674 }
675 free(lower_std_output);
676 return result;
677 }
678 #endif
679
680 static int
ocf2uniform_rc(int rc)681 ocf2uniform_rc(int rc)
682 {
683 if (rc < 0 || rc > PCMK_OCF_FAILED_MASTER) {
684 return PCMK_OCF_UNKNOWN_ERROR;
685 }
686
687 return rc;
688 }
689
690 static int
stonith2uniform_rc(const char * action,int rc)691 stonith2uniform_rc(const char *action, int rc)
692 {
693 if (rc == -ENODEV) {
694 if (safe_str_eq(action, "stop")) {
695 rc = PCMK_OCF_OK;
696 } else if (safe_str_eq(action, "start")) {
697 rc = PCMK_OCF_NOT_INSTALLED;
698 } else {
699 rc = PCMK_OCF_NOT_RUNNING;
700 }
701 } else if (rc != 0) {
702 rc = PCMK_OCF_UNKNOWN_ERROR;
703 }
704 return rc;
705 }
706
707 #if SUPPORT_NAGIOS
708 static int
nagios2uniform_rc(const char * action,int rc)709 nagios2uniform_rc(const char *action, int rc)
710 {
711 if (rc < 0) {
712 return PCMK_OCF_UNKNOWN_ERROR;
713 }
714
715 switch (rc) {
716 case NAGIOS_STATE_OK:
717 return PCMK_OCF_OK;
718 case NAGIOS_INSUFFICIENT_PRIV:
719 return PCMK_OCF_INSUFFICIENT_PRIV;
720 case NAGIOS_NOT_INSTALLED:
721 return PCMK_OCF_NOT_INSTALLED;
722 case NAGIOS_STATE_WARNING:
723 case NAGIOS_STATE_CRITICAL:
724 case NAGIOS_STATE_UNKNOWN:
725 case NAGIOS_STATE_DEPENDENT:
726 default:
727 return PCMK_OCF_UNKNOWN_ERROR;
728 }
729
730 return PCMK_OCF_UNKNOWN_ERROR;
731 }
732 #endif
733
734 static int
get_uniform_rc(const char * standard,const char * action,int rc)735 get_uniform_rc(const char *standard, const char *action, int rc)
736 {
737 if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_OCF)) {
738 return ocf2uniform_rc(rc);
739 } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_STONITH)) {
740 return stonith2uniform_rc(action, rc);
741 } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_SYSTEMD)) {
742 return rc;
743 } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_UPSTART)) {
744 return rc;
745 #if SUPPORT_NAGIOS
746 } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_NAGIOS)) {
747 return nagios2uniform_rc(action, rc);
748 #endif
749 } else {
750 return services_get_ocf_exitcode(action, rc);
751 }
752 }
753
754 static int
action_get_uniform_rc(svc_action_t * action)755 action_get_uniform_rc(svc_action_t * action)
756 {
757 lrmd_cmd_t *cmd = action->cb_data;
758 #if SUPPORT_HEARTBEAT
759 if (safe_str_eq(action->standard, PCMK_RESOURCE_CLASS_HB)) {
760 return hb2uniform_rc(cmd->action, action->rc, action->stdout_data);
761 }
762 #endif
763 return get_uniform_rc(action->standard, cmd->action, action->rc);
764 }
765
766 void
notify_of_new_client(crm_client_t * new_client)767 notify_of_new_client(crm_client_t *new_client)
768 {
769 crm_client_t *client = NULL;
770 GHashTableIter iter;
771 xmlNode *notify = NULL;
772 char *key = NULL;
773
774 notify = create_xml_node(NULL, T_LRMD_NOTIFY);
775 crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
776 crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_NEW_CLIENT);
777
778 g_hash_table_iter_init(&iter, client_connections);
779 while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & client)) {
780
781 if (safe_str_eq(client->id, new_client->id)) {
782 continue;
783 }
784
785 send_client_notify((gpointer) key, (gpointer) client, (gpointer) notify);
786 }
787 free_xml(notify);
788 }
789
790 static char *
parse_exit_reason(const char * output)791 parse_exit_reason(const char *output)
792 {
793 const char *cur = NULL;
794 const char *last = NULL;
795 char *reason = NULL;
796 static int cookie_len = 0;
797 char *eol = NULL;
798
799 if (output == NULL) {
800 return NULL;
801 }
802
803 if (!cookie_len) {
804 cookie_len = strlen(PCMK_OCF_REASON_PREFIX);
805 }
806
807 cur = strstr(output, PCMK_OCF_REASON_PREFIX);
808 for (; cur != NULL; cur = strstr(cur, PCMK_OCF_REASON_PREFIX)) {
809 /* skip over the cookie delimiter string */
810 cur += cookie_len;
811 last = cur;
812 }
813 if (last == NULL) {
814 return NULL;
815 }
816
817 /* make our own copy */
818 reason = calloc(1, (EXIT_REASON_MAX_LEN+1));
819 CRM_ASSERT(reason);
820
821 /* limit reason string size */
822 strncpy(reason, last, EXIT_REASON_MAX_LEN);
823
824 /* truncate everything after a new line */
825 eol = strchr(reason, '\n');
826 if (eol != NULL) {
827 *eol = '\0';
828 }
829
830 return reason;
831 }
832
833 void
client_disconnect_cleanup(const char * client_id)834 client_disconnect_cleanup(const char *client_id)
835 {
836 GHashTableIter iter;
837 lrmd_rsc_t *rsc = NULL;
838 char *key = NULL;
839
840 g_hash_table_iter_init(&iter, rsc_list);
841 while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
842 if (rsc->call_opts & lrmd_opt_drop_recurring) {
843 /* This client is disconnecting, drop any recurring operations
844 * it may have initiated on the resource */
845 cancel_all_recurring(rsc, client_id);
846 }
847 }
848 }
849
/*!
 * \internal
 * \brief Callback invoked when a services-library action has completed
 *
 * Records the action's result in the command it was executed for, then
 * decides whether the operation is really finished. systemd start/stop
 * actions are followed up with monitor actions, and failed nagios starts are
 * retried, for as long as the original timeout allows. In those cases the
 * command is re-scheduled rather than finalized.
 *
 * \param[in] action  Completed action; its callback data is expected to be
 *                    the lrmd_cmd_t it was executed for
 */
static void
action_complete(svc_action_t * action)
{
    lrmd_rsc_t *rsc;
    lrmd_cmd_t *cmd = action->cb_data;
    const char *rclass = NULL;

    /* Set when the command must be executed again before being reported */
    bool goagain = false;

    if (!cmd) {
        crm_err("LRMD action (%s) completed does not match any known operations.", action->id);
        return;
    }
#ifdef HAVE_SYS_TIMEB_H
    /* Timestamp the result whenever it differs from the previous one */
    if (cmd->exec_rc != action->rc) {
        ftime(&cmd->t_rcchange);
    }
#endif

    cmd->last_pid = action->pid;
    cmd->exec_rc = action_get_uniform_rc(action);
    cmd->lrmd_op_status = action->status;
    /* The resource may be gone by now, in which case rsc stays NULL */
    rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;

    /* For "service" resources, use the class that resources_find_service_class()
     * reports for the resource's type */
    if (rsc && safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_SERVICE)) {
        rclass = resources_find_service_class(rsc->type);
    } else if(rsc) {
        rclass = rsc->class;
    }

    if (safe_str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD)) {
        if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->action, "start")) {
            /* systemd I curse thee!
             *
             * systemd returns from start actions after the start _begins_
             * not after it completes.
             *
             * So we have to jump through a few hoops so that we don't
             * report 'complete' to the rest of pacemaker until, you know,
             * it's actually done.
             */
            goagain = true;
            /* Keep the requested action for reporting, and monitor instead */
            cmd->real_action = cmd->action;
            cmd->action = strdup("monitor");

        } else if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->action, "stop")) {
            /* Same treatment as for start: follow up with a monitor */
            goagain = true;
            cmd->real_action = cmd->action;
            cmd->action = strdup("monitor");

        } else if(cmd->real_action) {
            /* Ok, so this is the follow up monitor action to check if start actually completed */
            if(cmd->lrmd_op_status == PCMK_LRM_OP_DONE && cmd->exec_rc == PCMK_OCF_PENDING) {
                goagain = true;
            } else if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->real_action, "stop")) {
                /* Monitor still succeeds after a stop, so check again */
                goagain = true;

            } else {
#ifdef HAVE_SYS_TIMEB_H
                int time_sum = time_diff_ms(NULL, &cmd->t_first_run);
                int timeout_left = cmd->timeout_orig - time_sum;

                crm_debug("%s %s is now complete (elapsed=%dms, remaining=%dms): %s (%d)",
                          cmd->rsc_id, cmd->real_action, time_sum, timeout_left, services_ocf_exitcode_str(cmd->exec_rc), cmd->exec_rc);
                /* Report times measured from the first command of the operation */
                cmd_original_times(cmd);
#endif

                // Monitors may return "not running", but start/stop shouldn't
                if ((cmd->lrmd_op_status == PCMK_LRM_OP_DONE)
                    && (cmd->exec_rc == PCMK_OCF_NOT_RUNNING)) {

                    if (safe_str_eq(cmd->real_action, "start")) {
                        cmd->exec_rc = PCMK_OCF_UNKNOWN_ERROR;
                    } else if (safe_str_eq(cmd->real_action, "stop")) {
                        cmd->exec_rc = PCMK_OCF_OK;
                    }
                }
            }
        }
    }

#if SUPPORT_NAGIOS
    if (rsc && safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS)) {
        if (safe_str_eq(cmd->action, "monitor") &&
            cmd->interval == 0 && cmd->exec_rc == PCMK_OCF_OK) {
            /* Successfully executed --version for the nagios plugin */
            cmd->exec_rc = PCMK_OCF_NOT_RUNNING;

        } else if (safe_str_eq(cmd->action, "start") && cmd->exec_rc != PCMK_OCF_OK) {
            /* A failed start is retried for as long as the timeout allows */
            goagain = true;
        }
    }
#endif

    /* Wrapping this section in ifdef implies that systemd resources are not
     * fully supported on platforms without sys/timeb.h. Since timeb is
     * obsolete, we should eventually prefer a clock_gettime() implementation
     * (wrapped in its own ifdef) with timeb as a fallback.
     */
    if(goagain) {
#ifdef HAVE_SYS_TIMEB_H
        int time_sum = time_diff_ms(NULL, &cmd->t_first_run);
        int timeout_left = cmd->timeout_orig - time_sum;
        /* Default to waiting a tenth of the original timeout before retrying */
        int delay = cmd->timeout_orig / 10;

        /* If that would use up the remaining time, wait half of what is left */
        if(delay >= timeout_left && timeout_left > 20) {
            delay = timeout_left/2;
        }

        /* Never wait more than 2000 */
        delay = QB_MIN(2000, delay);
        if (delay < timeout_left) {
            /* There is time left for another attempt */
            cmd->start_delay = delay;
            cmd->timeout = timeout_left;

            if(cmd->exec_rc == PCMK_OCF_OK) {
                crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                          cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay);

            } else if(cmd->exec_rc == PCMK_OCF_PENDING) {
                crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                         cmd->rsc_id, cmd->action, time_sum, timeout_left, delay);

            } else {
                crm_notice("%s %s failed '%s' (%d): re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                           cmd->rsc_id, cmd->action, services_ocf_exitcode_str(cmd->exec_rc), cmd->exec_rc, time_sum, timeout_left, delay);
            }

            /* Discard this iteration's results and queue the command again */
            cmd_reset(cmd);
            if(rsc) {
                rsc->active = NULL;
            }
            schedule_lrmd_cmd(rsc, cmd);

            /* Don't finalize cmd, we're not done with it yet */
            return;

        } else {
            crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)",
                       cmd->rsc_id, cmd->real_action?cmd->real_action:cmd->action, cmd->exec_rc, time_sum, timeout_left);
            cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT;
            cmd->exec_rc = PCMK_OCF_TIMEOUT;
            cmd_original_times(cmd);
        }
#endif
    }

    /* Keep stderr (which may also carry an exit reason) in preference to stdout */
    if (action->stderr_data) {
        cmd->output = strdup(action->stderr_data);
        cmd->exit_reason = parse_exit_reason(action->stderr_data);

    } else if (action->stdout_data) {
        cmd->output = strdup(action->stdout_data);
    }

    cmd_finalize(cmd, rsc);
}
1006
1007 static void
stonith_action_complete(lrmd_cmd_t * cmd,int rc)1008 stonith_action_complete(lrmd_cmd_t * cmd, int rc)
1009 {
1010 int recurring = cmd->interval;
1011 lrmd_rsc_t *rsc = NULL;
1012
1013 cmd->exec_rc = get_uniform_rc(PCMK_RESOURCE_CLASS_STONITH, cmd->action, rc);
1014
1015 rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
1016
1017 if (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED) {
1018 recurring = 0;
1019 /* do nothing */
1020
1021 } else if (rc == -ENODEV && safe_str_eq(cmd->action, "monitor")) {
1022 // The device is not registered with the fencer
1023
1024 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1025 cmd->exec_rc = PCMK_OCF_NOT_RUNNING;
1026
1027 } else if (rc) {
1028 /* Attempt to map return codes to op status if possible */
1029 switch (rc) {
1030 case -EPROTONOSUPPORT:
1031 cmd->lrmd_op_status = PCMK_LRM_OP_NOTSUPPORTED;
1032 break;
1033 case -ETIME:
1034 cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT;
1035 break;
1036 default:
1037 /* TODO: This looks wrong. Status should be _DONE and exec_rc set to an error */
1038 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1039 }
1040 } else {
1041 /* command successful */
1042 cmd->lrmd_op_status = PCMK_LRM_OP_DONE;
1043 if (rsc) {
1044 if (safe_str_eq(cmd->action, "start")) {
1045 rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
1046 } else if (safe_str_eq(cmd->action, "stop")) {
1047 rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
1048 }
1049 }
1050 }
1051
1052 if (recurring && rsc) {
1053 if (cmd->stonith_recurring_id) {
1054 g_source_remove(cmd->stonith_recurring_id);
1055 }
1056 cmd->stonith_recurring_id = g_timeout_add(cmd->interval, stonith_recurring_op_helper, cmd);
1057 }
1058
1059 cmd_finalize(cmd, rsc);
1060 }
1061
1062 static void
lrmd_stonith_callback(stonith_t * stonith,stonith_callback_data_t * data)1063 lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
1064 {
1065 stonith_action_complete(data->userdata, data->rc);
1066 }
1067
1068 void
stonith_connection_failed(void)1069 stonith_connection_failed(void)
1070 {
1071 GHashTableIter iter;
1072 GList *cmd_list = NULL;
1073 GList *cmd_iter = NULL;
1074 lrmd_rsc_t *rsc = NULL;
1075 char *key = NULL;
1076
1077 g_hash_table_iter_init(&iter, rsc_list);
1078 while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
1079 if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1080 /* If we registered this fence device, we don't know whether the
1081 * fencer still has the registration or not. Cause future probes to
1082 * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or
1083 * started successfully. This is especially important if the
1084 * controller also went away (possibly due to a cluster layer
1085 * restart) and won't receive our client notification of any
1086 * monitors finalized below.
1087 */
1088 if (rsc->st_probe_rc == pcmk_ok) {
1089 rsc->st_probe_rc = pcmk_err_generic;
1090 }
1091
1092 if (rsc->active) {
1093 cmd_list = g_list_append(cmd_list, rsc->active);
1094 }
1095 if (rsc->recurring_ops) {
1096 cmd_list = g_list_concat(cmd_list, rsc->recurring_ops);
1097 }
1098 if (rsc->pending_ops) {
1099 cmd_list = g_list_concat(cmd_list, rsc->pending_ops);
1100 }
1101 rsc->pending_ops = rsc->recurring_ops = NULL;
1102 }
1103 }
1104
1105 if (!cmd_list) {
1106 return;
1107 }
1108
1109 crm_err("STONITH connection failed, finalizing %d pending operations.",
1110 g_list_length(cmd_list));
1111 for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
1112 stonith_action_complete(cmd_iter->data, -ENOTCONN);
1113 }
1114 g_list_free(cmd_list);
1115 }
1116
1117 static int
lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)1118 lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
1119 {
1120 int rc = 0;
1121 int do_monitor = 0;
1122
1123 stonith_t *stonith_api = get_stonith_connection();
1124
1125 if (!stonith_api) {
1126 cmd->exec_rc = get_uniform_rc(PCMK_RESOURCE_CLASS_STONITH, cmd->action,
1127 -ENOTCONN);
1128 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1129 cmd_finalize(cmd, rsc);
1130 return -EUNATCH;
1131 }
1132
1133 if (safe_str_eq(cmd->action, "start")) {
1134 char *key = NULL;
1135 char *value = NULL;
1136 stonith_key_value_t *device_params = NULL;
1137
1138 if (cmd->params) {
1139 GHashTableIter iter;
1140
1141 g_hash_table_iter_init(&iter, cmd->params);
1142 while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
1143 device_params = stonith_key_value_add(device_params, key, value);
1144 }
1145 }
1146
1147 /* Stonith automatically registers devices from the IPC when changes occur,
1148 * but to avoid a possible race condition between stonith receiving the IPC update
1149 * and the lrmd requesting that resource, the lrmd still registers the device as well.
1150 * Stonith knows how to handle duplicate device registrations correctly. */
1151 rc = stonith_api->cmds->register_device(stonith_api,
1152 st_opt_sync_call,
1153 cmd->rsc_id,
1154 rsc->provider, rsc->type, device_params);
1155
1156 stonith_key_value_freeall(device_params, 1, 1);
1157 if (rc == 0) {
1158 do_monitor = 1;
1159 }
1160 } else if (safe_str_eq(cmd->action, "stop")) {
1161 rc = stonith_api->cmds->remove_device(stonith_api, st_opt_sync_call, cmd->rsc_id);
1162 } else if (safe_str_eq(cmd->action, "monitor")) {
1163 if (cmd->interval) {
1164 do_monitor = 1;
1165 } else {
1166 rc = rsc->st_probe_rc;
1167 }
1168 }
1169
1170 if (!do_monitor) {
1171 goto cleanup_stonith_exec;
1172 }
1173
1174 rc = stonith_api->cmds->monitor(stonith_api, 0, cmd->rsc_id, cmd->timeout / 1000);
1175
1176 rc = stonith_api->cmds->register_callback(stonith_api,
1177 rc,
1178 0,
1179 0,
1180 cmd, "lrmd_stonith_callback", lrmd_stonith_callback);
1181
1182 /* don't cleanup yet, we will find out the result of the monitor later */
1183 if (rc > 0) {
1184 rsc->active = cmd;
1185 return rc;
1186 } else if (rc == 0) {
1187 rc = -1;
1188 }
1189
1190 cleanup_stonith_exec:
1191 stonith_action_complete(cmd, rc);
1192 return rc;
1193 }
1194
1195 static int
lrmd_rsc_execute_service_lib(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)1196 lrmd_rsc_execute_service_lib(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
1197 {
1198 svc_action_t *action = NULL;
1199 GHashTable *params_copy = NULL;
1200
1201 CRM_ASSERT(rsc);
1202 CRM_ASSERT(cmd);
1203
1204 crm_trace("Creating action, resource:%s action:%s class:%s provider:%s agent:%s",
1205 rsc->rsc_id, cmd->action, rsc->class, rsc->provider, rsc->type);
1206
1207 #if SUPPORT_NAGIOS
1208 /* Recurring operations are cancelled anyway for a stop operation */
1209 if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS)
1210 && safe_str_eq(cmd->action, "stop")) {
1211
1212 cmd->exec_rc = PCMK_OCF_OK;
1213 goto exec_done;
1214 }
1215 #endif
1216
1217 params_copy = crm_str_table_dup(cmd->params);
1218
1219 if (cmd->isolation_wrapper) {
1220 g_hash_table_remove(params_copy, "CRM_meta_isolation_wrapper");
1221 action = resources_action_create(rsc->rsc_id,
1222 PCMK_RESOURCE_CLASS_OCF,
1223 LRMD_ISOLATION_PROVIDER,
1224 cmd->isolation_wrapper,
1225 cmd->action, /*action will be normalized in wrapper*/
1226 cmd->interval,
1227 cmd->timeout,
1228 params_copy,
1229 cmd->service_flags);
1230 } else {
1231 action = resources_action_create(rsc->rsc_id,
1232 rsc->class,
1233 rsc->provider,
1234 rsc->type,
1235 normalize_action_name(rsc, cmd->action),
1236 cmd->interval,
1237 cmd->timeout,
1238 params_copy,
1239 cmd->service_flags);
1240 }
1241
1242 if (!action) {
1243 crm_err("Failed to create action, action:%s on resource %s", cmd->action, rsc->rsc_id);
1244 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1245 goto exec_done;
1246 }
1247
1248 action->cb_data = cmd;
1249
1250 /* 'cmd' may not be valid after this point if
1251 * services_action_async() returned TRUE
1252 *
1253 * Upstart and systemd both synchronously determine monitor/status
1254 * results and call action_complete (which may free 'cmd') if necessary.
1255 */
1256 if (services_action_async(action, action_complete)) {
1257 return TRUE;
1258 }
1259
1260 cmd->exec_rc = action->rc;
1261 if(action->status != PCMK_LRM_OP_DONE) {
1262 cmd->lrmd_op_status = action->status;
1263 } else {
1264 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1265 }
1266 services_action_free(action);
1267 action = NULL;
1268
1269 exec_done:
1270 cmd_finalize(cmd, rsc);
1271 return TRUE;
1272 }
1273
1274 static gboolean
lrmd_rsc_execute(lrmd_rsc_t * rsc)1275 lrmd_rsc_execute(lrmd_rsc_t * rsc)
1276 {
1277 lrmd_cmd_t *cmd = NULL;
1278
1279 CRM_CHECK(rsc != NULL, return FALSE);
1280
1281 if (rsc->active) {
1282 crm_trace("%s is still active", rsc->rsc_id);
1283 return TRUE;
1284 }
1285
1286 if (rsc->pending_ops) {
1287 GList *first = rsc->pending_ops;
1288
1289 cmd = first->data;
1290 if (cmd->delay_id) {
1291 crm_trace
1292 ("Command %s %s was asked to run too early, waiting for start_delay timeout of %dms",
1293 cmd->rsc_id, cmd->action, cmd->start_delay);
1294 return TRUE;
1295 }
1296 rsc->pending_ops = g_list_remove_link(rsc->pending_ops, first);
1297 g_list_free_1(first);
1298
1299 #ifdef HAVE_SYS_TIMEB_H
1300 if (cmd->t_first_run.time == 0) {
1301 ftime(&cmd->t_first_run);
1302 }
1303 ftime(&cmd->t_run);
1304 #endif
1305 }
1306
1307 if (!cmd) {
1308 crm_trace("Nothing further to do for %s", rsc->rsc_id);
1309 return TRUE;
1310 }
1311
1312 rsc->active = cmd; /* only one op at a time for a rsc */
1313 if (cmd->interval) {
1314 rsc->recurring_ops = g_list_append(rsc->recurring_ops, cmd);
1315 }
1316
1317 log_execute(cmd);
1318
1319 if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1320 lrmd_rsc_execute_stonith(rsc, cmd);
1321 } else {
1322 lrmd_rsc_execute_service_lib(rsc, cmd);
1323 }
1324
1325 return TRUE;
1326 }
1327
1328 static gboolean
lrmd_rsc_dispatch(gpointer user_data)1329 lrmd_rsc_dispatch(gpointer user_data)
1330 {
1331 return lrmd_rsc_execute(user_data);
1332 }
1333
1334 void
free_rsc(gpointer data)1335 free_rsc(gpointer data)
1336 {
1337 GListPtr gIter = NULL;
1338 lrmd_rsc_t *rsc = data;
1339 int is_stonith = safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH);
1340
1341 gIter = rsc->pending_ops;
1342 while (gIter != NULL) {
1343 GListPtr next = gIter->next;
1344 lrmd_cmd_t *cmd = gIter->data;
1345
1346 /* command was never executed */
1347 cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1348 cmd_finalize(cmd, NULL);
1349
1350 gIter = next;
1351 }
1352 /* frees list, but not list elements. */
1353 g_list_free(rsc->pending_ops);
1354
1355 gIter = rsc->recurring_ops;
1356 while (gIter != NULL) {
1357 GListPtr next = gIter->next;
1358 lrmd_cmd_t *cmd = gIter->data;
1359
1360 if (is_stonith) {
1361 cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1362 /* If a stonith command is in-flight, just mark it as cancelled;
1363 * it is not safe to finalize/free the cmd until the stonith api
1364 * says it has either completed or timed out.
1365 */
1366 if (rsc->active != cmd) {
1367 cmd_finalize(cmd, NULL);
1368 }
1369 } else {
1370 /* This command is already handed off to service library,
1371 * let service library cancel it and tell us via the callback
1372 * when it is cancelled. The rsc can be safely destroyed
1373 * even if we are waiting for the cancel result */
1374 services_action_cancel(rsc->rsc_id, normalize_action_name(rsc, cmd->action), cmd->interval);
1375 }
1376
1377 gIter = next;
1378 }
1379 /* frees list, but not list elements. */
1380 g_list_free(rsc->recurring_ops);
1381
1382 free(rsc->rsc_id);
1383 free(rsc->class);
1384 free(rsc->provider);
1385 free(rsc->type);
1386 mainloop_destroy_trigger(rsc->work);
1387
1388 free(rsc);
1389 }
1390
1391 static int
process_lrmd_signon(crm_client_t * client,xmlNode * request,int call_id,xmlNode ** reply)1392 process_lrmd_signon(crm_client_t *client, xmlNode *request, int call_id,
1393 xmlNode **reply)
1394 {
1395 int rc = pcmk_ok;
1396 const char *is_ipc_provider = crm_element_value(request, F_LRMD_IS_IPC_PROVIDER);
1397 const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION);
1398
1399 if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) {
1400 crm_err("Cluster API version must be greater than or equal to %s, not %s",
1401 LRMD_MIN_PROTOCOL_VERSION, protocol_version);
1402 rc = -EPROTO;
1403 }
1404
1405 if (crm_is_true(is_ipc_provider)) {
1406 #ifdef SUPPORT_REMOTE
1407 if ((client->remote != NULL) && client->remote->tls_handshake_complete) {
1408 // This is a remote connection from a cluster node's controller
1409 ipc_proxy_add_provider(client);
1410 } else {
1411 rc = -EACCES;
1412 }
1413 #else
1414 rc = -EPROTONOSUPPORT;
1415 #endif
1416 }
1417
1418 *reply = create_lrmd_reply(__func__, rc, call_id);
1419 crm_xml_add(*reply, F_LRMD_OPERATION, CRM_OP_REGISTER);
1420 crm_xml_add(*reply, F_LRMD_CLIENTID, client->id);
1421 crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
1422
1423 return rc;
1424 }
1425
1426 static int
process_lrmd_rsc_register(crm_client_t * client,uint32_t id,xmlNode * request)1427 process_lrmd_rsc_register(crm_client_t * client, uint32_t id, xmlNode * request)
1428 {
1429 int rc = pcmk_ok;
1430 lrmd_rsc_t *rsc = build_rsc_from_xml(request);
1431 lrmd_rsc_t *dup = g_hash_table_lookup(rsc_list, rsc->rsc_id);
1432
1433 if (dup &&
1434 safe_str_eq(rsc->class, dup->class) &&
1435 safe_str_eq(rsc->provider, dup->provider) && safe_str_eq(rsc->type, dup->type)) {
1436
1437 crm_warn("Can't add, RSC '%s' already present in the rsc list (%d active resources)",
1438 rsc->rsc_id, g_hash_table_size(rsc_list));
1439
1440 free_rsc(rsc);
1441 return rc;
1442 }
1443
1444 g_hash_table_replace(rsc_list, rsc->rsc_id, rsc);
1445 crm_info("Added '%s' to the rsc list (%d active resources)",
1446 rsc->rsc_id, g_hash_table_size(rsc_list));
1447
1448 return rc;
1449 }
1450
1451 static xmlNode *
process_lrmd_get_rsc_info(xmlNode * request,int call_id)1452 process_lrmd_get_rsc_info(xmlNode *request, int call_id)
1453 {
1454 int rc = pcmk_ok;
1455 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1456 const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1457 xmlNode *reply = NULL;
1458 lrmd_rsc_t *rsc = NULL;
1459
1460 if (rsc_id == NULL) {
1461 rc = -ENODEV;
1462 } else {
1463 rsc = g_hash_table_lookup(rsc_list, rsc_id);
1464 if (rsc == NULL) {
1465 crm_info("Resource '%s' not found (%d active resources)",
1466 rsc_id, g_hash_table_size(rsc_list));
1467 rc = -ENODEV;
1468 }
1469 }
1470
1471 reply = create_lrmd_reply(__FUNCTION__, rc, call_id);
1472 if (rsc) {
1473 crm_xml_add(reply, F_LRMD_RSC_ID, rsc->rsc_id);
1474 crm_xml_add(reply, F_LRMD_CLASS, rsc->class);
1475 crm_xml_add(reply, F_LRMD_PROVIDER, rsc->provider);
1476 crm_xml_add(reply, F_LRMD_TYPE, rsc->type);
1477 }
1478 return reply;
1479 }
1480
1481 static int
process_lrmd_rsc_unregister(crm_client_t * client,uint32_t id,xmlNode * request)1482 process_lrmd_rsc_unregister(crm_client_t * client, uint32_t id, xmlNode * request)
1483 {
1484 int rc = pcmk_ok;
1485 lrmd_rsc_t *rsc = NULL;
1486 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1487 const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1488
1489 if (!rsc_id) {
1490 return -ENODEV;
1491 }
1492
1493 if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) {
1494 crm_info("Resource '%s' not found (%d active resources)",
1495 rsc_id, g_hash_table_size(rsc_list));
1496 return pcmk_ok;
1497 }
1498
1499 if (rsc->active) {
1500 /* let the caller know there are still active ops on this rsc to watch for */
1501 crm_trace("Operation still in progress: %p", rsc->active);
1502 rc = -EINPROGRESS;
1503 }
1504
1505 g_hash_table_remove(rsc_list, rsc_id);
1506
1507 return rc;
1508 }
1509
1510 static int
process_lrmd_rsc_exec(crm_client_t * client,uint32_t id,xmlNode * request)1511 process_lrmd_rsc_exec(crm_client_t * client, uint32_t id, xmlNode * request)
1512 {
1513 lrmd_rsc_t *rsc = NULL;
1514 lrmd_cmd_t *cmd = NULL;
1515 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1516 const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1517 int call_id;
1518
1519 if (!rsc_id) {
1520 return -EINVAL;
1521 }
1522 if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) {
1523 crm_info("Resource '%s' not found (%d active resources)",
1524 rsc_id, g_hash_table_size(rsc_list));
1525 return -ENODEV;
1526 }
1527
1528 cmd = create_lrmd_cmd(request, client, rsc);
1529 call_id = cmd->call_id;
1530
1531 /* Don't reference cmd after handing it off to be scheduled.
1532 * The cmd could get merged and freed. */
1533 schedule_lrmd_cmd(rsc, cmd);
1534
1535 return call_id;
1536 }
1537
1538 static int
cancel_op(const char * rsc_id,const char * action,int interval)1539 cancel_op(const char *rsc_id, const char *action, int interval)
1540 {
1541 GListPtr gIter = NULL;
1542 lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, rsc_id);
1543
1544 /* How to cancel an action.
1545 * 1. Check pending ops list, if it hasn't been handed off
1546 * to the service library or stonith recurring list remove
1547 * it there and that will stop it.
1548 * 2. If it isn't in the pending ops list, then it's either a
1549 * recurring op in the stonith recurring list, or the service
1550 * library's recurring list. Stop it there
1551 * 3. If not found in any lists, then this operation has either
1552 * been executed already and is not a recurring operation, or
1553 * never existed.
1554 */
1555 if (!rsc) {
1556 return -ENODEV;
1557 }
1558
1559 for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) {
1560 lrmd_cmd_t *cmd = gIter->data;
1561
1562 if (safe_str_eq(cmd->action, action) && cmd->interval == interval) {
1563 cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1564 cmd_finalize(cmd, rsc);
1565 return pcmk_ok;
1566 }
1567 }
1568
1569 if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1570 /* The service library does not handle stonith operations.
1571 * We have to handle recurring stonith operations ourselves. */
1572 for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) {
1573 lrmd_cmd_t *cmd = gIter->data;
1574
1575 if (safe_str_eq(cmd->action, action) && cmd->interval == interval) {
1576 cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1577 if (rsc->active != cmd) {
1578 cmd_finalize(cmd, rsc);
1579 }
1580 return pcmk_ok;
1581 }
1582 }
1583 } else if (services_action_cancel(rsc_id, normalize_action_name(rsc, action), interval) == TRUE) {
1584 /* The service library will tell the action_complete callback function
1585 * this action was cancelled, which will destroy the cmd and remove
1586 * it from the recurring_op list. Do not do that in this function
1587 * if the service library says it cancelled it. */
1588 return pcmk_ok;
1589 }
1590
1591 return -EOPNOTSUPP;
1592 }
1593
1594 static void
cancel_all_recurring(lrmd_rsc_t * rsc,const char * client_id)1595 cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id)
1596 {
1597 GList *cmd_list = NULL;
1598 GList *cmd_iter = NULL;
1599
1600 /* Notice a copy of each list is created when concat is called.
1601 * This prevents odd behavior from occurring when the cmd_list
1602 * is iterated through later on. It is possible the cancel_op
1603 * function may end up modifying the recurring_ops and pending_ops
1604 * lists. If we did not copy those lists, our cmd_list iteration
1605 * could get messed up.*/
1606 if (rsc->recurring_ops) {
1607 cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->recurring_ops));
1608 }
1609 if (rsc->pending_ops) {
1610 cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->pending_ops));
1611 }
1612 if (!cmd_list) {
1613 return;
1614 }
1615
1616 for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
1617 lrmd_cmd_t *cmd = cmd_iter->data;
1618
1619 if (cmd->interval == 0) {
1620 continue;
1621 }
1622
1623 if (client_id && safe_str_neq(cmd->client_id, client_id)) {
1624 continue;
1625 }
1626
1627 cancel_op(rsc->rsc_id, cmd->action, cmd->interval);
1628 }
1629 /* frees only the copied list data, not the cmds */
1630 g_list_free(cmd_list);
1631 }
1632
1633 static int
process_lrmd_rsc_cancel(crm_client_t * client,uint32_t id,xmlNode * request)1634 process_lrmd_rsc_cancel(crm_client_t * client, uint32_t id, xmlNode * request)
1635 {
1636 xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1637 const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1638 const char *action = crm_element_value(rsc_xml, F_LRMD_RSC_ACTION);
1639 int interval = 0;
1640
1641 crm_element_value_int(rsc_xml, F_LRMD_RSC_INTERVAL, &interval);
1642
1643 if (!rsc_id || !action) {
1644 return -EINVAL;
1645 }
1646
1647 return cancel_op(rsc_id, action, interval);
1648 }
1649
1650 void
process_lrmd_message(crm_client_t * client,uint32_t id,xmlNode * request)1651 process_lrmd_message(crm_client_t * client, uint32_t id, xmlNode * request)
1652 {
1653 int rc = pcmk_ok;
1654 int call_id = 0;
1655 const char *op = crm_element_value(request, F_LRMD_OPERATION);
1656 int do_reply = 0;
1657 int do_notify = 0;
1658 xmlNode *reply = NULL;
1659
1660 #if ENABLE_ACL
1661 /* Certain IPC commands may be done only by privileged users (i.e. root or
1662 * hacluster) when ACLs are enabled, because they would otherwise provide a
1663 * means of bypassing ACLs.
1664 */
1665 bool allowed = is_set(client->flags, crm_client_flag_ipc_privileged);
1666 #else
1667 bool allowed = true;
1668 #endif
1669
1670 crm_trace("Processing %s operation from %s", op, client->id);
1671 crm_element_value_int(request, F_LRMD_CALLID, &call_id);
1672
1673 if (crm_str_eq(op, CRM_OP_IPC_FWD, TRUE)) {
1674 #ifdef SUPPORT_REMOTE
1675 if (allowed) {
1676 ipc_proxy_forward_client(client, request);
1677 } else {
1678 rc = -EACCES;
1679 }
1680 #else
1681 rc = -EPROTONOSUPPORT;
1682 #endif
1683 do_reply = 1;
1684 } else if (crm_str_eq(op, CRM_OP_REGISTER, TRUE)) {
1685 rc = process_lrmd_signon(client, request, call_id, &reply);
1686 do_reply = 1;
1687 } else if (crm_str_eq(op, LRMD_OP_RSC_REG, TRUE)) {
1688 if (allowed) {
1689 rc = process_lrmd_rsc_register(client, id, request);
1690 do_notify = 1;
1691 } else {
1692 rc = -EACCES;
1693 }
1694 do_reply = 1;
1695 } else if (crm_str_eq(op, LRMD_OP_RSC_INFO, TRUE)) {
1696 if (allowed) {
1697 reply = process_lrmd_get_rsc_info(request, call_id);
1698 } else {
1699 rc = -EACCES;
1700 }
1701 do_reply = 1;
1702 } else if (crm_str_eq(op, LRMD_OP_RSC_UNREG, TRUE)) {
1703 if (allowed) {
1704 rc = process_lrmd_rsc_unregister(client, id, request);
1705 /* don't notify anyone about failed un-registers */
1706 if (rc == pcmk_ok || rc == -EINPROGRESS) {
1707 do_notify = 1;
1708 }
1709 } else {
1710 rc = -EACCES;
1711 }
1712 do_reply = 1;
1713 } else if (crm_str_eq(op, LRMD_OP_RSC_EXEC, TRUE)) {
1714 if (allowed) {
1715 rc = process_lrmd_rsc_exec(client, id, request);
1716 } else {
1717 rc = -EACCES;
1718 }
1719 do_reply = 1;
1720 } else if (crm_str_eq(op, LRMD_OP_RSC_CANCEL, TRUE)) {
1721 if (allowed) {
1722 rc = process_lrmd_rsc_cancel(client, id, request);
1723 } else {
1724 rc = -EACCES;
1725 }
1726 do_reply = 1;
1727 } else if (crm_str_eq(op, LRMD_OP_POKE, TRUE)) {
1728 do_notify = 1;
1729 do_reply = 1;
1730 } else if (crm_str_eq(op, LRMD_OP_CHECK, TRUE)) {
1731 if (allowed) {
1732 xmlNode *data = get_message_xml(request, F_LRMD_CALLDATA);
1733
1734 CRM_LOG_ASSERT(data != NULL);
1735 check_sbd_timeout(crm_element_value(data, F_LRMD_WATCHDOG));
1736 } else {
1737 rc = -EACCES;
1738 }
1739 } else if (crm_str_eq(op, LRMD_OP_ALERT_EXEC, TRUE)) {
1740 if (allowed) {
1741 rc = process_lrmd_alert_exec(client, id, request);
1742 } else {
1743 rc = -EACCES;
1744 }
1745 do_reply = 1;
1746 } else {
1747 rc = -EOPNOTSUPP;
1748 do_reply = 1;
1749 crm_err("Unknown %s from %s", op, client->name);
1750 crm_log_xml_warn(request, "UnknownOp");
1751 }
1752
1753 if (rc == -EACCES) {
1754 crm_warn("Rejecting IPC request '%s' from unprivileged client %s",
1755 op, crm_client_name(client));
1756 }
1757
1758 crm_debug("Processed %s operation from %s: rc=%d, reply=%d, notify=%d",
1759 op, client->id, rc, do_reply, do_notify);
1760
1761 if (do_reply) {
1762 int send_rc = pcmk_ok;
1763
1764 if (reply == NULL) {
1765 reply = create_lrmd_reply(__FUNCTION__, rc, call_id);
1766 }
1767 send_rc = lrmd_server_send_reply(client, id, reply);
1768 free_xml(reply);
1769 if (send_rc < 0) {
1770 crm_warn("Reply to client %s failed: %s " CRM_XS " %d",
1771 client->name, pcmk_strerror(send_rc), send_rc);
1772 }
1773 }
1774
1775 if (do_notify) {
1776 send_generic_notify(rc, request);
1777 }
1778 }
1779