1 /*
2  * Copyright 2012-2020 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19  *
20  */
21 
22 #include <crm_internal.h>
23 
24 #include <glib.h>
25 #include <unistd.h>
26 
27 #include <crm/crm.h>
28 #include <crm/services.h>
29 #include <crm/common/mainloop.h>
30 #include <crm/common/ipc.h>
31 #include <crm/common/ipcs.h>
32 #include <crm/msg_xml.h>
33 
34 #include <lrmd_private.h>
35 
36 #ifdef HAVE_SYS_TIMEB_H
37 #  include <sys/timeb.h>
38 #endif
39 
/* Maximum number of characters of an agent's exit reason that
 * parse_exit_reason() keeps (its buffer has one extra byte for the
 * terminating NUL).
 */
#define EXIT_REASON_MAX_LEN 128

/* Table of known resources; this file looks entries up by resource ID string
 * and treats the values as lrmd_rsc_t pointers.
 */
GHashTable *rsc_list = NULL;
43 
/* State for one resource operation request handled by this file. A command is
 * built by create_lrmd_cmd(), queued by schedule_lrmd_cmd(), reported to
 * clients by send_cmd_complete_notify() and released by free_lrmd_cmd().
 */
typedef struct lrmd_cmd_s {
    int timeout;                /* from F_LRMD_TIMEOUT; cmd_finalize() restores it from timeout_orig */
    int interval;               /* from F_LRMD_RSC_INTERVAL; nonzero marks a recurring operation */
    int start_delay;            /* from F_LRMD_RSC_START_DELAY; used as the g_timeout_add() delay */
    int timeout_orig;           /* copy of timeout taken when the command was created */

    int call_id;                /* from F_LRMD_CALLID */
    int exec_rc;                /* result code of the latest run, reported as F_LRMD_EXEC_RC */
    int lrmd_op_status;         /* status of the latest run, reported as F_LRMD_OP_STATUS */

    int call_opts;              /* from F_LRMD_CALLOPTS; tested against lrmd_opt_* flags */
    /* Timer ids, must be removed on cmd destruction. */
    int delay_id;
    int stonith_recurring_id;

    int rsc_deleted;            /* set to 1 by cmd_finalize() when it is given no resource */

    int service_flags;          /* SVC_ACTION_* flags (SVC_ACTION_LEAVE_GROUP when on-fail is "block") */

    char *client_id;            /* copy of the requesting client's id */
    char *origin;               /* from F_LRMD_ORIGIN */
    char *rsc_id;               /* from F_LRMD_RSC_ID; key used for rsc_list lookups */
    char *action;               /* from F_LRMD_RSC_ACTION */
    char *real_action;          /* when set, reported to clients instead of action */
    char *exit_reason;          /* reported as F_LRMD_RSC_EXIT_REASON; cleared by cmd_reset() */
    char *output;               /* reported as F_LRMD_RSC_OUTPUT; cleared by cmd_reset() */
    char *userdata_str;         /* from F_LRMD_RSC_USERDATA_STR; echoed back in notifications */

    /* when set, this cmd should go through a container wrapper
     * (points into params, so it is not freed separately) */
    const char *isolation_wrapper;

#ifdef HAVE_SYS_TIMEB_H
    /* recurring and systemd operations may involve more than one lrmd command
     * per operation, so they need info about original and most recent
     */
    struct timeb t_first_run;   /* Timestamp of when op first ran */
    struct timeb t_run;         /* Timestamp of when op most recently ran */
    struct timeb t_first_queue; /* Timestamp of when op first was queued */
    struct timeb t_queue;       /* Timestamp of when op most recently was queued */
    struct timeb t_rcchange;    /* Timestamp of last rc change */
#endif

    int first_notify_sent;      /* nonzero once a result has been sent to clients */
    int last_notify_rc;         /* exec_rc carried by the last notification sent */
    int last_notify_op_status;  /* lrmd_op_status carried by the last notification sent */
    int last_pid;               /* pid of the latest run (0 if none); only used for logging here */

    GHashTable *params;         /* built by xml2list(); destroyed in free_lrmd_cmd() */
} lrmd_cmd_t;
93 
94 static void cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc);
95 static gboolean lrmd_rsc_dispatch(gpointer user_data);
96 static void cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id);
97 
98 static void
log_finished(lrmd_cmd_t * cmd,int exec_time,int queue_time)99 log_finished(lrmd_cmd_t * cmd, int exec_time, int queue_time)
100 {
101     char pid_str[32] = { 0, };
102     int log_level = LOG_INFO;
103 
104     if (cmd->last_pid) {
105         snprintf(pid_str, 32, "%d", cmd->last_pid);
106     }
107 
108     if (safe_str_eq(cmd->action, "monitor")) {
109         log_level = LOG_DEBUG;
110     }
111 #ifdef HAVE_SYS_TIMEB_H
112     do_crm_log(log_level,
113                "finished - rsc:%s action:%s call_id:%d %s%s exit-code:%d exec-time:%dms queue-time:%dms",
114                cmd->rsc_id, cmd->action, cmd->call_id, cmd->last_pid ? "pid:" : "", pid_str,
115                cmd->exec_rc, exec_time, queue_time);
116 #else
117     do_crm_log(log_level, "finished - rsc:%s action:%s call_id:%d %s%s exit-code:%d",
118                cmd->rsc_id,
119                cmd->action, cmd->call_id, cmd->last_pid ? "pid:" : "", pid_str, cmd->exec_rc);
120 #endif
121 }
122 
123 static void
log_execute(lrmd_cmd_t * cmd)124 log_execute(lrmd_cmd_t * cmd)
125 {
126     int log_level = LOG_INFO;
127 
128     if (safe_str_eq(cmd->action, "monitor")) {
129         log_level = LOG_DEBUG;
130     }
131 
132     do_crm_log(log_level, "executing - rsc:%s action:%s call_id:%d",
133                cmd->rsc_id, cmd->action, cmd->call_id);
134 }
135 
136 static const char *
normalize_action_name(lrmd_rsc_t * rsc,const char * action)137 normalize_action_name(lrmd_rsc_t * rsc, const char *action)
138 {
139     if (safe_str_eq(action, "monitor") &&
140         is_set(pcmk_get_ra_caps(rsc->class), pcmk_ra_cap_status)) {
141         return "status";
142     }
143     return action;
144 }
145 
146 static lrmd_rsc_t *
build_rsc_from_xml(xmlNode * msg)147 build_rsc_from_xml(xmlNode * msg)
148 {
149     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
150     lrmd_rsc_t *rsc = NULL;
151 
152     rsc = calloc(1, sizeof(lrmd_rsc_t));
153 
154     crm_element_value_int(msg, F_LRMD_CALLOPTS, &rsc->call_opts);
155 
156     rsc->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
157     rsc->class = crm_element_value_copy(rsc_xml, F_LRMD_CLASS);
158     rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER);
159     rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE);
160     rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc);
161     rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running"
162     return rsc;
163 }
164 
165 static lrmd_cmd_t *
create_lrmd_cmd(xmlNode * msg,crm_client_t * client,lrmd_rsc_t * rsc)166 create_lrmd_cmd(xmlNode * msg, crm_client_t * client, lrmd_rsc_t *rsc)
167 {
168     int call_options = 0;
169     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, msg, LOG_ERR);
170     lrmd_cmd_t *cmd = NULL;
171 
172     cmd = calloc(1, sizeof(lrmd_cmd_t));
173 
174     crm_element_value_int(msg, F_LRMD_CALLOPTS, &call_options);
175     cmd->call_opts = call_options;
176     cmd->client_id = strdup(client->id);
177 
178     crm_element_value_int(msg, F_LRMD_CALLID, &cmd->call_id);
179     crm_element_value_int(rsc_xml, F_LRMD_RSC_INTERVAL, &cmd->interval);
180     crm_element_value_int(rsc_xml, F_LRMD_TIMEOUT, &cmd->timeout);
181     crm_element_value_int(rsc_xml, F_LRMD_RSC_START_DELAY, &cmd->start_delay);
182     cmd->timeout_orig = cmd->timeout;
183 
184     cmd->origin = crm_element_value_copy(rsc_xml, F_LRMD_ORIGIN);
185     cmd->action = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ACTION);
186     cmd->userdata_str = crm_element_value_copy(rsc_xml, F_LRMD_RSC_USERDATA_STR);
187     cmd->rsc_id = crm_element_value_copy(rsc_xml, F_LRMD_RSC_ID);
188 
189     cmd->params = xml2list(rsc_xml);
190     cmd->isolation_wrapper = g_hash_table_lookup(cmd->params, "CRM_meta_isolation_wrapper");
191 
192     if (cmd->isolation_wrapper) {
193         if (g_hash_table_lookup(cmd->params, "CRM_meta_isolation_instance") == NULL) {
194             g_hash_table_insert(cmd->params, strdup("CRM_meta_isolation_instance"), strdup(rsc->rsc_id));
195         }
196         if (rsc->provider) {
197             g_hash_table_insert(cmd->params, strdup("CRM_meta_provider"), strdup(rsc->provider));
198         }
199         g_hash_table_insert(cmd->params, strdup("CRM_meta_class"), strdup(rsc->class));
200         g_hash_table_insert(cmd->params, strdup("CRM_meta_type"), strdup(rsc->type));
201     }
202 
203     if (safe_str_eq(g_hash_table_lookup(cmd->params, "CRM_meta_on_fail"), "block")) {
204         crm_debug("Setting flag to leave pid group on timeout and only kill action pid for %s_%s_%d", cmd->rsc_id, cmd->action, cmd->interval);
205         cmd->service_flags |= SVC_ACTION_LEAVE_GROUP;
206     }
207     return cmd;
208 }
209 
210 static void
free_lrmd_cmd(lrmd_cmd_t * cmd)211 free_lrmd_cmd(lrmd_cmd_t * cmd)
212 {
213     if (cmd->stonith_recurring_id) {
214         g_source_remove(cmd->stonith_recurring_id);
215     }
216     if (cmd->delay_id) {
217         g_source_remove(cmd->delay_id);
218     }
219     if (cmd->params) {
220         g_hash_table_destroy(cmd->params);
221     }
222     free(cmd->origin);
223     free(cmd->action);
224     free(cmd->real_action);
225     free(cmd->userdata_str);
226     free(cmd->rsc_id);
227     free(cmd->output);
228     free(cmd->exit_reason);
229     free(cmd->client_id);
230     free(cmd);
231 }
232 
233 static gboolean
stonith_recurring_op_helper(gpointer data)234 stonith_recurring_op_helper(gpointer data)
235 {
236     lrmd_cmd_t *cmd = data;
237     lrmd_rsc_t *rsc;
238 
239     cmd->stonith_recurring_id = 0;
240 
241     if (!cmd->rsc_id) {
242         return FALSE;
243     }
244 
245     rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
246 
247     CRM_ASSERT(rsc != NULL);
248     /* take it out of recurring_ops list, and put it in the pending ops
249      * to be executed */
250     rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
251     rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
252 #ifdef HAVE_SYS_TIMEB_H
253     ftime(&cmd->t_queue);
254     if (cmd->t_first_queue.time == 0) {
255         cmd->t_first_queue = cmd->t_queue;
256     }
257 #endif
258     mainloop_set_trigger(rsc->work);
259 
260     return FALSE;
261 }
262 
263 static gboolean
start_delay_helper(gpointer data)264 start_delay_helper(gpointer data)
265 {
266     lrmd_cmd_t *cmd = data;
267     lrmd_rsc_t *rsc = NULL;
268 
269     cmd->delay_id = 0;
270     rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;
271 
272     if (rsc) {
273         mainloop_set_trigger(rsc->work);
274     }
275 
276     return FALSE;
277 }
278 
279 static gboolean
merge_recurring_duplicate(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)280 merge_recurring_duplicate(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
281 {
282     GListPtr gIter = NULL;
283     lrmd_cmd_t * dup = NULL;
284     gboolean dup_pending = FALSE;
285 
286     if (cmd->interval == 0) {
287         return 0;
288     }
289 
290     for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) {
291         dup = gIter->data;
292         if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) {
293             dup_pending = TRUE;
294             goto merge_dup;
295         }
296     }
297 
298     /* if dup is in recurring_ops list, that means it has already executed
299      * and is in the interval loop. we can't just remove it in this case. */
300     for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) {
301         dup = gIter->data;
302         if (safe_str_eq(cmd->action, dup->action) && cmd->interval == dup->interval) {
303             goto merge_dup;
304         }
305     }
306 
307     return FALSE;
308 merge_dup:
309 
310 
311     /* This should not occur, if it does we need to investigate in the crmd
312      * how something like this is possible */
313     crm_warn("Duplicate recurring op entry detected (%s_%s_%d), merging with previous op entry",
314             rsc->rsc_id,
315             normalize_action_name(rsc, dup->action),
316             dup->interval);
317 
318     /* merge */
319     dup->first_notify_sent = 0;
320     free(dup->userdata_str);
321     dup->userdata_str = cmd->userdata_str;
322     cmd->userdata_str = NULL;
323     dup->call_id = cmd->call_id;
324 
325     if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
326         /* if we are waiting for the next interval, kick it off now */
327         if (dup_pending == TRUE) {
328             g_source_remove(cmd->stonith_recurring_id);
329             cmd->stonith_recurring_id = 0;
330             stonith_recurring_op_helper(cmd);
331         }
332 
333     } else if (dup_pending == FALSE) {
334         /* if we've already handed this to the service lib, kick off an early execution */
335         services_action_kick(rsc->rsc_id, normalize_action_name(rsc, dup->action), dup->interval);
336     }
337     free_lrmd_cmd(cmd);
338 
339     return TRUE;
340 }
341 
342 static void
schedule_lrmd_cmd(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)343 schedule_lrmd_cmd(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
344 {
345     gboolean dup_processed = FALSE;
346     CRM_CHECK(cmd != NULL, return);
347     CRM_CHECK(rsc != NULL, return);
348 
349     crm_trace("Scheduling %s on %s", cmd->action, rsc->rsc_id);
350 
351     dup_processed = merge_recurring_duplicate(rsc, cmd);
352     if (dup_processed) {
353         /* duplicate recurring cmd found, cmds merged */
354         return;
355     }
356 
357     /* crmd expects lrmd to automatically cancel recurring ops before rsc stops. */
358     if (safe_str_eq(cmd->action, "stop")) {
359         cancel_all_recurring(rsc, NULL);
360     }
361 
362     rsc->pending_ops = g_list_append(rsc->pending_ops, cmd);
363 #ifdef HAVE_SYS_TIMEB_H
364     ftime(&cmd->t_queue);
365     if (cmd->t_first_queue.time == 0) {
366         cmd->t_first_queue = cmd->t_queue;
367     }
368 #endif
369     mainloop_set_trigger(rsc->work);
370 
371     if (cmd->start_delay) {
372         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
373     }
374 }
375 
376 static xmlNode *
create_lrmd_reply(const char * origin,int rc,int call_id)377 create_lrmd_reply(const char *origin, int rc, int call_id)
378 {
379     xmlNode *reply = create_xml_node(NULL, T_LRMD_REPLY);
380 
381     crm_xml_add(reply, F_LRMD_ORIGIN, origin);
382     crm_xml_add_int(reply, F_LRMD_RC, rc);
383     crm_xml_add_int(reply, F_LRMD_CALLID, call_id);
384     return reply;
385 }
386 
387 static void
send_client_notify(gpointer key,gpointer value,gpointer user_data)388 send_client_notify(gpointer key, gpointer value, gpointer user_data)
389 {
390     xmlNode *update_msg = user_data;
391     crm_client_t *client = value;
392     int rc;
393 
394     if (client == NULL) {
395         crm_err("Asked to send event to  NULL client");
396         return;
397     } else if (client->name == NULL) {
398         crm_trace("Asked to send event to client with no name");
399         return;
400     }
401 
402     rc = lrmd_server_send_notify(client, update_msg);
403     if ((rc <= 0) && (rc != -ENOTCONN)) {
404         crm_warn("Could not notify client %s/%s: %s " CRM_XS " rc=%d",
405                  client->name, client->id,
406                  (rc? pcmk_strerror(rc) : "no data sent"), rc);
407     }
408 }
409 
410 #ifdef HAVE_SYS_TIMEB_H
/*!
 * \internal
 * \brief Compute how many milliseconds separate two timestamps
 *
 * \param[in] now  Later timestamp (NULL means "use the current time")
 * \param[in] old  Earlier timestamp
 *
 * \return Milliseconds from \p old to \p now, or 0 if \p old is NULL or its
 *         time field is zero (i.e. it was never set)
 */
static int
time_diff_ms(struct timeb *now, struct timeb *old)
{
    struct timeb current = { 0, };
    double elapsed_sec = 0;

    /* An unset start time means there is nothing to measure */
    if ((old == NULL) || (old->time == 0)) {
        return 0;
    }

    if (now == NULL) {
        ftime(&current);
        now = &current;
    }

    elapsed_sec = difftime(now->time, old->time);
    return elapsed_sec * 1000 + now->millitm - old->millitm;
}
434 
435 /*!
436  * \internal
437  * \brief Reset a command's operation times to their original values.
438  *
439  * Reset a command's run and queued timestamps to the timestamps of the original
440  * command, so we report the entire time since then and not just the time since
441  * the most recent command (for recurring and systemd operations).
442  *
443  * /param[in] cmd  LRMD command object to reset
444  *
445  * /note It's not obvious what the queued time should be for a systemd
446  * start/stop operation, which might go like this:
447  *   initial command queued 5ms, runs 3s
448  *   monitor command queued 10ms, runs 10s
449  *   monitor command queued 10ms, runs 10s
450  * Is the queued time for that operation 5ms, 10ms or 25ms? The current
451  * implementation will report 5ms. If it's 25ms, then we need to
452  * subtract 20ms from the total exec time so as not to count it twice.
453  * We can implement that later if it matters to anyone ...
454  */
455 static void
cmd_original_times(lrmd_cmd_t * cmd)456 cmd_original_times(lrmd_cmd_t * cmd)
457 {
458     cmd->t_run = cmd->t_first_run;
459     cmd->t_queue = cmd->t_first_queue;
460 }
461 #endif
462 
463 static void
send_cmd_complete_notify(lrmd_cmd_t * cmd)464 send_cmd_complete_notify(lrmd_cmd_t * cmd)
465 {
466     int exec_time = 0;
467     int queue_time = 0;
468     xmlNode *notify = NULL;
469 
470 #ifdef HAVE_SYS_TIMEB_H
471     exec_time = time_diff_ms(NULL, &cmd->t_run);
472     queue_time = time_diff_ms(&cmd->t_run, &cmd->t_queue);
473 #endif
474 
475     log_finished(cmd, exec_time, queue_time);
476 
477     /* if the first notify result for a cmd has already been sent earlier, and the
478      * the option to only send notifies on result changes is set. Check to see
479      * if the last result is the same as the new one. If so, suppress this update */
480     if (cmd->first_notify_sent && (cmd->call_opts & lrmd_opt_notify_changes_only)) {
481         if (cmd->last_notify_rc == cmd->exec_rc &&
482             cmd->last_notify_op_status == cmd->lrmd_op_status) {
483 
484             /* only send changes */
485             return;
486         }
487 
488     }
489 
490     cmd->first_notify_sent = 1;
491     cmd->last_notify_rc = cmd->exec_rc;
492     cmd->last_notify_op_status = cmd->lrmd_op_status;
493 
494     notify = create_xml_node(NULL, T_LRMD_NOTIFY);
495 
496     crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
497     crm_xml_add_int(notify, F_LRMD_TIMEOUT, cmd->timeout);
498     crm_xml_add_int(notify, F_LRMD_RSC_INTERVAL, cmd->interval);
499     crm_xml_add_int(notify, F_LRMD_RSC_START_DELAY, cmd->start_delay);
500     crm_xml_add_int(notify, F_LRMD_EXEC_RC, cmd->exec_rc);
501     crm_xml_add_int(notify, F_LRMD_OP_STATUS, cmd->lrmd_op_status);
502     crm_xml_add_int(notify, F_LRMD_CALLID, cmd->call_id);
503     crm_xml_add_int(notify, F_LRMD_RSC_DELETED, cmd->rsc_deleted);
504 
505 #ifdef HAVE_SYS_TIMEB_H
506     crm_xml_add_int(notify, F_LRMD_RSC_RUN_TIME, cmd->t_run.time);
507     crm_xml_add_int(notify, F_LRMD_RSC_RCCHANGE_TIME, cmd->t_rcchange.time);
508     crm_xml_add_int(notify, F_LRMD_RSC_EXEC_TIME, exec_time);
509     crm_xml_add_int(notify, F_LRMD_RSC_QUEUE_TIME, queue_time);
510 #endif
511 
512     crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_RSC_EXEC);
513     crm_xml_add(notify, F_LRMD_RSC_ID, cmd->rsc_id);
514     if(cmd->real_action) {
515         crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->real_action);
516     } else {
517         crm_xml_add(notify, F_LRMD_RSC_ACTION, cmd->action);
518     }
519     crm_xml_add(notify, F_LRMD_RSC_USERDATA_STR, cmd->userdata_str);
520     crm_xml_add(notify, F_LRMD_RSC_OUTPUT, cmd->output);
521     crm_xml_add(notify, F_LRMD_RSC_EXIT_REASON, cmd->exit_reason);
522 
523     if (cmd->params) {
524         char *key = NULL;
525         char *value = NULL;
526         GHashTableIter iter;
527 
528         xmlNode *args = create_xml_node(notify, XML_TAG_ATTRS);
529 
530         g_hash_table_iter_init(&iter, cmd->params);
531         while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
532             hash2smartfield((gpointer) key, (gpointer) value, args);
533         }
534     }
535     if (cmd->client_id && (cmd->call_opts & lrmd_opt_notify_orig_only)) {
536         crm_client_t *client = crm_client_get_by_id(cmd->client_id);
537 
538         if (client) {
539             send_client_notify(client->id, client, notify);
540         }
541     } else if (client_connections != NULL) {
542         g_hash_table_foreach(client_connections, send_client_notify, notify);
543     }
544 
545     free_xml(notify);
546 }
547 
548 static void
send_generic_notify(int rc,xmlNode * request)549 send_generic_notify(int rc, xmlNode * request)
550 {
551     if (client_connections != NULL) {
552         int call_id = 0;
553         xmlNode *notify = NULL;
554         xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
555         const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
556         const char *op = crm_element_value(request, F_LRMD_OPERATION);
557 
558         crm_element_value_int(request, F_LRMD_CALLID, &call_id);
559 
560         notify = create_xml_node(NULL, T_LRMD_NOTIFY);
561         crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
562         crm_xml_add_int(notify, F_LRMD_RC, rc);
563         crm_xml_add_int(notify, F_LRMD_CALLID, call_id);
564         crm_xml_add(notify, F_LRMD_OPERATION, op);
565         crm_xml_add(notify, F_LRMD_RSC_ID, rsc_id);
566 
567         g_hash_table_foreach(client_connections, send_client_notify, notify);
568 
569         free_xml(notify);
570     }
571 }
572 
573 static void
cmd_reset(lrmd_cmd_t * cmd)574 cmd_reset(lrmd_cmd_t * cmd)
575 {
576     cmd->lrmd_op_status = 0;
577     cmd->last_pid = 0;
578 #ifdef HAVE_SYS_TIMEB_H
579     memset(&cmd->t_run, 0, sizeof(cmd->t_run));
580     memset(&cmd->t_queue, 0, sizeof(cmd->t_queue));
581 #endif
582     free(cmd->exit_reason);
583     cmd->exit_reason = NULL;
584     free(cmd->output);
585     cmd->output = NULL;
586 
587 }
588 
589 static void
cmd_finalize(lrmd_cmd_t * cmd,lrmd_rsc_t * rsc)590 cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc)
591 {
592     crm_trace("Resource operation rsc:%s action:%s completed (%p %p)", cmd->rsc_id, cmd->action,
593               rsc ? rsc->active : NULL, cmd);
594 
595     if (rsc && (rsc->active == cmd)) {
596         rsc->active = NULL;
597         mainloop_set_trigger(rsc->work);
598     }
599 
600     if (!rsc) {
601         cmd->rsc_deleted = 1;
602     }
603 
604     /* reset original timeout so client notification has correct information */
605     cmd->timeout = cmd->timeout_orig;
606 
607     send_cmd_complete_notify(cmd);
608 
609     if (cmd->interval && (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED)) {
610         if (rsc) {
611             rsc->recurring_ops = g_list_remove(rsc->recurring_ops, cmd);
612             rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
613         }
614         free_lrmd_cmd(cmd);
615     } else if (cmd->interval == 0) {
616         if (rsc) {
617             rsc->pending_ops = g_list_remove(rsc->pending_ops, cmd);
618         }
619         free_lrmd_cmd(cmd);
620     } else {
621         /* Clear all the values pertaining just to the last iteration of a recurring op. */
622         cmd_reset(cmd);
623     }
624 }
625 
626 #if SUPPORT_HEARTBEAT
pattern_matched(const char * pat,const char * str)627 static int pattern_matched(const char *pat, const char *str)
628 {
629     if (g_pattern_match_simple(pat, str)) {
630         crm_debug("RA output matched stopped pattern [%s]", pat);
631         return TRUE;
632     }
633     return FALSE;
634 }
635 
636 static int
hb2uniform_rc(const char * action,int rc,const char * stdout_data)637 hb2uniform_rc(const char *action, int rc, const char *stdout_data)
638 {
639     const char *stop_pattern[] = { "*stopped*", "*not*running*" };
640     const char *running_pattern[] = { "*running*", "*OK*" };
641     char *lower_std_output = NULL;
642     int result;
643 
644 
645     if (rc < 0) {
646         return PCMK_OCF_UNKNOWN_ERROR;
647     }
648 
649     /* Treat class heartbeat the same as class lsb. */
650     if (!safe_str_eq(action, "status") && !safe_str_eq(action, "monitor")) {
651         return services_get_ocf_exitcode(action, rc);
652     }
653 
654     /* for status though, exit code is ignored,
655      * and the stdout is scanned for specific strings */
656     if (stdout_data == NULL) {
657         crm_warn("No status output from the (hb) resource agent, assuming stopped");
658         return PCMK_OCF_NOT_RUNNING;
659     }
660 
661     lower_std_output = g_ascii_strdown(stdout_data, -1);
662 
663     if (pattern_matched(stop_pattern[0], lower_std_output) ||
664         pattern_matched(stop_pattern[1], lower_std_output)) {
665         result = PCMK_OCF_NOT_RUNNING;
666     } else if (pattern_matched(running_pattern[0], lower_std_output) ||
667         pattern_matched(running_pattern[1], stdout_data)) {
668             /* "OK" is matched case sensitive */
669         result = PCMK_OCF_OK;
670     } else {
671         /* It didn't say it was running - must be stopped */
672         crm_debug("RA output did not match any pattern, assuming stopped");
673         result = PCMK_OCF_NOT_RUNNING;
674     }
675     free(lower_std_output);
676     return result;
677 }
678 #endif
679 
680 static int
ocf2uniform_rc(int rc)681 ocf2uniform_rc(int rc)
682 {
683     if (rc < 0 || rc > PCMK_OCF_FAILED_MASTER) {
684         return PCMK_OCF_UNKNOWN_ERROR;
685     }
686 
687     return rc;
688 }
689 
690 static int
stonith2uniform_rc(const char * action,int rc)691 stonith2uniform_rc(const char *action, int rc)
692 {
693     if (rc == -ENODEV) {
694         if (safe_str_eq(action, "stop")) {
695             rc = PCMK_OCF_OK;
696         } else if (safe_str_eq(action, "start")) {
697             rc = PCMK_OCF_NOT_INSTALLED;
698         } else {
699             rc = PCMK_OCF_NOT_RUNNING;
700         }
701     } else if (rc != 0) {
702         rc = PCMK_OCF_UNKNOWN_ERROR;
703     }
704     return rc;
705 }
706 
707 #if SUPPORT_NAGIOS
708 static int
nagios2uniform_rc(const char * action,int rc)709 nagios2uniform_rc(const char *action, int rc)
710 {
711     if (rc < 0) {
712         return PCMK_OCF_UNKNOWN_ERROR;
713     }
714 
715     switch (rc) {
716         case NAGIOS_STATE_OK:
717             return PCMK_OCF_OK;
718         case NAGIOS_INSUFFICIENT_PRIV:
719             return PCMK_OCF_INSUFFICIENT_PRIV;
720         case NAGIOS_NOT_INSTALLED:
721             return PCMK_OCF_NOT_INSTALLED;
722         case NAGIOS_STATE_WARNING:
723         case NAGIOS_STATE_CRITICAL:
724         case NAGIOS_STATE_UNKNOWN:
725         case NAGIOS_STATE_DEPENDENT:
726         default:
727             return PCMK_OCF_UNKNOWN_ERROR;
728     }
729 
730     return PCMK_OCF_UNKNOWN_ERROR;
731 }
732 #endif
733 
734 static int
get_uniform_rc(const char * standard,const char * action,int rc)735 get_uniform_rc(const char *standard, const char *action, int rc)
736 {
737     if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_OCF)) {
738         return ocf2uniform_rc(rc);
739     } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_STONITH)) {
740         return stonith2uniform_rc(action, rc);
741     } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_SYSTEMD)) {
742         return rc;
743     } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_UPSTART)) {
744         return rc;
745 #if SUPPORT_NAGIOS
746     } else if (safe_str_eq(standard, PCMK_RESOURCE_CLASS_NAGIOS)) {
747         return nagios2uniform_rc(action, rc);
748 #endif
749     } else {
750         return services_get_ocf_exitcode(action, rc);
751     }
752 }
753 
754 static int
action_get_uniform_rc(svc_action_t * action)755 action_get_uniform_rc(svc_action_t * action)
756 {
757     lrmd_cmd_t *cmd = action->cb_data;
758 #if SUPPORT_HEARTBEAT
759     if (safe_str_eq(action->standard, PCMK_RESOURCE_CLASS_HB)) {
760         return hb2uniform_rc(cmd->action, action->rc, action->stdout_data);
761     }
762 #endif
763     return get_uniform_rc(action->standard, cmd->action, action->rc);
764 }
765 
766 void
notify_of_new_client(crm_client_t * new_client)767 notify_of_new_client(crm_client_t *new_client)
768 {
769     crm_client_t *client = NULL;
770     GHashTableIter iter;
771     xmlNode *notify = NULL;
772     char *key = NULL;
773 
774     notify = create_xml_node(NULL, T_LRMD_NOTIFY);
775     crm_xml_add(notify, F_LRMD_ORIGIN, __FUNCTION__);
776     crm_xml_add(notify, F_LRMD_OPERATION, LRMD_OP_NEW_CLIENT);
777 
778     g_hash_table_iter_init(&iter, client_connections);
779     while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & client)) {
780 
781         if (safe_str_eq(client->id, new_client->id)) {
782             continue;
783         }
784 
785         send_client_notify((gpointer) key, (gpointer) client, (gpointer) notify);
786     }
787     free_xml(notify);
788 }
789 
790 static char *
parse_exit_reason(const char * output)791 parse_exit_reason(const char *output)
792 {
793     const char *cur = NULL;
794     const char *last = NULL;
795     char *reason = NULL;
796     static int cookie_len = 0;
797     char *eol = NULL;
798 
799     if (output == NULL) {
800         return NULL;
801     }
802 
803     if (!cookie_len) {
804         cookie_len = strlen(PCMK_OCF_REASON_PREFIX);
805     }
806 
807     cur = strstr(output, PCMK_OCF_REASON_PREFIX);
808     for (; cur != NULL; cur = strstr(cur, PCMK_OCF_REASON_PREFIX)) {
809         /* skip over the cookie delimiter string */
810         cur += cookie_len;
811         last = cur;
812     }
813     if (last == NULL) {
814         return NULL;
815     }
816 
817     /* make our own copy */
818     reason = calloc(1, (EXIT_REASON_MAX_LEN+1));
819     CRM_ASSERT(reason);
820 
821     /* limit reason string size */
822     strncpy(reason, last, EXIT_REASON_MAX_LEN);
823 
824     /* truncate everything after a new line */
825     eol = strchr(reason, '\n');
826     if (eol != NULL) {
827         *eol = '\0';
828     }
829 
830     return reason;
831 }
832 
833 void
client_disconnect_cleanup(const char * client_id)834 client_disconnect_cleanup(const char *client_id)
835 {
836     GHashTableIter iter;
837     lrmd_rsc_t *rsc = NULL;
838     char *key = NULL;
839 
840     g_hash_table_iter_init(&iter, rsc_list);
841     while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
842         if (rsc->call_opts & lrmd_opt_drop_recurring) {
843             /* This client is disconnecting, drop any recurring operations
844              * it may have initiated on the resource */
845             cancel_all_recurring(rsc, client_id);
846         }
847     }
848 }
849 
850 static void
action_complete(svc_action_t * action)
{
    /* Completion callback for actions started via services_action_async()
     * (see lrmd_rsc_execute_service_lib(), which stores the originating
     * lrmd_cmd_t in action->cb_data).
     *
     * Records the action's result in the command, decides whether the
     * operation has to be run again ("goagain"), and otherwise copies the
     * agent output into the command and finalizes it.
     */
    lrmd_rsc_t *rsc;
    lrmd_cmd_t *cmd = action->cb_data;
    const char *rclass = NULL;

    bool goagain = false;

    if (!cmd) {
        crm_err("LRMD action (%s) completed does not match any known operations.", action->id);
        return;
    }
#ifdef HAVE_SYS_TIMEB_H
    /* Timestamp the moment the raw action rc differs from the rc recorded
     * for this command so far
     */
    if (cmd->exec_rc != action->rc) {
        ftime(&cmd->t_rcchange);
    }
#endif

    cmd->last_pid = action->pid;
    cmd->exec_rc = action_get_uniform_rc(action);
    cmd->lrmd_op_status = action->status;
    /* The resource may be absent from rsc_list; rsc is NULL in that case and
     * every use below is guarded accordingly
     */
    rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;

    /* For the generic "service" class, ask the services library which class
     * applies to this agent; otherwise use the resource's own class
     */
    if (rsc && safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_SERVICE)) {
        rclass = resources_find_service_class(rsc->type);
    } else if(rsc) {
        rclass = rsc->class;
    }

    if (safe_str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD)) {
        if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->action, "start")) {
            /* systemd I curse thee!
             *
             * systemd returns from start actions after the start _begins_
             * not after it completes.
             *
             * So we have to jump through a few hoops so that we don't
             * report 'complete' to the rest of pacemaker until, you know,
             * it's actually done.
             */
            goagain = true;
            /* Remember the requested action and run a monitor in its place */
            cmd->real_action = cmd->action;
            cmd->action = strdup("monitor");

        } else if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->action, "stop")) {
            /* A successful stop is followed up with a monitor in the same way */
            goagain = true;
            cmd->real_action = cmd->action;
            cmd->action = strdup("monitor");

        } else if(cmd->real_action) {
            /* Ok, so this is the follow up monitor action to check if start actually completed */
            if(cmd->lrmd_op_status == PCMK_LRM_OP_DONE && cmd->exec_rc == PCMK_OCF_PENDING) {
                /* The monitor reports "pending": check again */
                goagain = true;
            } else if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->real_action, "stop")) {
                /* The monitor still succeeds after a stop: check again */
                goagain = true;

            } else {
#ifdef HAVE_SYS_TIMEB_H
                int time_sum = time_diff_ms(NULL, &cmd->t_first_run);
                int timeout_left = cmd->timeout_orig - time_sum;

                crm_debug("%s %s is now complete (elapsed=%dms, remaining=%dms): %s (%d)",
                          cmd->rsc_id, cmd->real_action, time_sum, timeout_left, services_ocf_exitcode_str(cmd->exec_rc), cmd->exec_rc);
                cmd_original_times(cmd);
#endif

                // Monitors may return "not running", but start/stop shouldn't
                if ((cmd->lrmd_op_status == PCMK_LRM_OP_DONE)
                    && (cmd->exec_rc == PCMK_OCF_NOT_RUNNING)) {

                    /* Translate the monitor's "not running" into the result
                     * of the originally requested action: an error for a
                     * start, success for a stop
                     */
                    if (safe_str_eq(cmd->real_action, "start")) {
                        cmd->exec_rc = PCMK_OCF_UNKNOWN_ERROR;
                    } else if (safe_str_eq(cmd->real_action, "stop")) {
                        cmd->exec_rc = PCMK_OCF_OK;
                    }
                }
            }
        }
    }

#if SUPPORT_NAGIOS
    if (rsc && safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS)) {
        if (safe_str_eq(cmd->action, "monitor") &&
            cmd->interval == 0 && cmd->exec_rc == PCMK_OCF_OK) {
            /* Successfully executed --version for the nagios plugin */
            cmd->exec_rc = PCMK_OCF_NOT_RUNNING;

        } else if (safe_str_eq(cmd->action, "start") && cmd->exec_rc != PCMK_OCF_OK) {
            /* A failed nagios start is retried (subject to the time budget
             * handled below)
             */
            goagain = true;
        }
    }
#endif

    /* Wrapping this section in ifdef implies that systemd resources are not
     * fully supported on platforms without sys/timeb.h. Since timeb is
     * obsolete, we should eventually prefer a clock_gettime() implementation
     * (wrapped in its own ifdef) with timeb as a fallback.
     */
    if(goagain) {
#ifdef HAVE_SYS_TIMEB_H
        int time_sum = time_diff_ms(NULL, &cmd->t_first_run);
        int timeout_left = cmd->timeout_orig - time_sum;
        /* Start with a delay of one tenth of the original timeout */
        int delay = cmd->timeout_orig / 10;

        /* If that does not fit into the remaining time (and more than 20ms
         * remain), wait half of the remaining time instead
         */
        if(delay >= timeout_left && timeout_left > 20) {
            delay = timeout_left/2;
        }

        /* Never wait longer than 2000ms between attempts */
        delay = QB_MIN(2000, delay);
        if (delay < timeout_left) {
            /* There is still time: run again after the delay, with the
             * remaining time as the new timeout
             */
            cmd->start_delay = delay;
            cmd->timeout = timeout_left;

            if(cmd->exec_rc == PCMK_OCF_OK) {
                crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                          cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay);

            } else if(cmd->exec_rc == PCMK_OCF_PENDING) {
                crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                         cmd->rsc_id, cmd->action, time_sum, timeout_left, delay);

            } else {
                crm_notice("%s %s failed '%s' (%d): re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
                           cmd->rsc_id, cmd->action, services_ocf_exitcode_str(cmd->exec_rc), cmd->exec_rc, time_sum, timeout_left, delay);
            }

            /* Reset the command, clear the resource's active slot, and hand
             * the command back to schedule_lrmd_cmd()
             */
            cmd_reset(cmd);
            if(rsc) {
                rsc->active = NULL;
            }
            schedule_lrmd_cmd(rsc, cmd);

            /* Don't finalize cmd, we're not done with it yet */
            return;

        } else {
            /* Out of time: report the operation as timed out */
            crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)",
                       cmd->rsc_id, cmd->real_action?cmd->real_action:cmd->action, cmd->exec_rc, time_sum, timeout_left);
            cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT;
            cmd->exec_rc = PCMK_OCF_TIMEOUT;
            cmd_original_times(cmd);
        }
#endif
    }

    /* stderr takes precedence over stdout as the command's output; an exit
     * reason is only parsed from stderr
     */
    if (action->stderr_data) {
        cmd->output = strdup(action->stderr_data);
        cmd->exit_reason = parse_exit_reason(action->stderr_data);

    } else if (action->stdout_data) {
        cmd->output = strdup(action->stdout_data);
    }

    cmd_finalize(cmd, rsc);
}
1006 
1007 static void
stonith_action_complete(lrmd_cmd_t * cmd,int rc)1008 stonith_action_complete(lrmd_cmd_t * cmd, int rc)
1009 {
1010     int recurring = cmd->interval;
1011     lrmd_rsc_t *rsc = NULL;
1012 
1013     cmd->exec_rc = get_uniform_rc(PCMK_RESOURCE_CLASS_STONITH, cmd->action, rc);
1014 
1015     rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id);
1016 
1017     if (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED) {
1018         recurring = 0;
1019         /* do nothing */
1020 
1021     } else if (rc == -ENODEV && safe_str_eq(cmd->action, "monitor")) {
1022         // The device is not registered with the fencer
1023 
1024         cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1025         cmd->exec_rc = PCMK_OCF_NOT_RUNNING;
1026 
1027     } else if (rc) {
1028         /* Attempt to map return codes to op status if possible */
1029         switch (rc) {
1030             case -EPROTONOSUPPORT:
1031                 cmd->lrmd_op_status = PCMK_LRM_OP_NOTSUPPORTED;
1032                 break;
1033             case -ETIME:
1034                 cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT;
1035                 break;
1036             default:
1037                 /* TODO: This looks wrong.  Status should be _DONE and exec_rc set to an error */
1038                 cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1039         }
1040     } else {
1041         /* command successful */
1042         cmd->lrmd_op_status = PCMK_LRM_OP_DONE;
1043         if (rsc) {
1044             if (safe_str_eq(cmd->action, "start")) {
1045                 rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK
1046             } else if (safe_str_eq(cmd->action, "stop")) {
1047                 rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING
1048             }
1049         }
1050     }
1051 
1052     if (recurring && rsc) {
1053         if (cmd->stonith_recurring_id) {
1054             g_source_remove(cmd->stonith_recurring_id);
1055         }
1056         cmd->stonith_recurring_id = g_timeout_add(cmd->interval, stonith_recurring_op_helper, cmd);
1057     }
1058 
1059     cmd_finalize(cmd, rsc);
1060 }
1061 
1062 static void
lrmd_stonith_callback(stonith_t * stonith,stonith_callback_data_t * data)1063 lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
1064 {
1065     stonith_action_complete(data->userdata, data->rc);
1066 }
1067 
1068 void
stonith_connection_failed(void)1069 stonith_connection_failed(void)
1070 {
1071     GHashTableIter iter;
1072     GList *cmd_list = NULL;
1073     GList *cmd_iter = NULL;
1074     lrmd_rsc_t *rsc = NULL;
1075     char *key = NULL;
1076 
1077     g_hash_table_iter_init(&iter, rsc_list);
1078     while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & rsc)) {
1079         if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1080             /* If we registered this fence device, we don't know whether the
1081              * fencer still has the registration or not. Cause future probes to
1082              * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or
1083              * started successfully. This is especially important if the
1084              * controller also went away (possibly due to a cluster layer
1085              * restart) and won't receive our client notification of any
1086              * monitors finalized below.
1087              */
1088             if (rsc->st_probe_rc == pcmk_ok) {
1089                 rsc->st_probe_rc = pcmk_err_generic;
1090             }
1091 
1092             if (rsc->active) {
1093                 cmd_list = g_list_append(cmd_list, rsc->active);
1094             }
1095             if (rsc->recurring_ops) {
1096                 cmd_list = g_list_concat(cmd_list, rsc->recurring_ops);
1097             }
1098             if (rsc->pending_ops) {
1099                 cmd_list = g_list_concat(cmd_list, rsc->pending_ops);
1100             }
1101             rsc->pending_ops = rsc->recurring_ops = NULL;
1102         }
1103     }
1104 
1105     if (!cmd_list) {
1106         return;
1107     }
1108 
1109     crm_err("STONITH connection failed, finalizing %d pending operations.",
1110             g_list_length(cmd_list));
1111     for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
1112         stonith_action_complete(cmd_iter->data, -ENOTCONN);
1113     }
1114     g_list_free(cmd_list);
1115 }
1116 
1117 static int
lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)1118 lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
1119 {
1120     int rc = 0;
1121     int do_monitor = 0;
1122 
1123     stonith_t *stonith_api = get_stonith_connection();
1124 
1125     if (!stonith_api) {
1126         cmd->exec_rc = get_uniform_rc(PCMK_RESOURCE_CLASS_STONITH, cmd->action,
1127                                       -ENOTCONN);
1128         cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1129         cmd_finalize(cmd, rsc);
1130         return -EUNATCH;
1131     }
1132 
1133     if (safe_str_eq(cmd->action, "start")) {
1134         char *key = NULL;
1135         char *value = NULL;
1136         stonith_key_value_t *device_params = NULL;
1137 
1138         if (cmd->params) {
1139             GHashTableIter iter;
1140 
1141             g_hash_table_iter_init(&iter, cmd->params);
1142             while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
1143                 device_params = stonith_key_value_add(device_params, key, value);
1144             }
1145         }
1146 
1147         /* Stonith automatically registers devices from the IPC when changes occur,
1148          * but to avoid a possible race condition between stonith receiving the IPC update
1149          * and the lrmd requesting that resource, the lrmd still registers the device as well.
1150          * Stonith knows how to handle duplicate device registrations correctly. */
1151         rc = stonith_api->cmds->register_device(stonith_api,
1152                                                 st_opt_sync_call,
1153                                                 cmd->rsc_id,
1154                                                 rsc->provider, rsc->type, device_params);
1155 
1156         stonith_key_value_freeall(device_params, 1, 1);
1157         if (rc == 0) {
1158             do_monitor = 1;
1159         }
1160     } else if (safe_str_eq(cmd->action, "stop")) {
1161         rc = stonith_api->cmds->remove_device(stonith_api, st_opt_sync_call, cmd->rsc_id);
1162     } else if (safe_str_eq(cmd->action, "monitor")) {
1163         if (cmd->interval) {
1164             do_monitor = 1;
1165         } else {
1166             rc = rsc->st_probe_rc;
1167         }
1168     }
1169 
1170     if (!do_monitor) {
1171         goto cleanup_stonith_exec;
1172     }
1173 
1174     rc = stonith_api->cmds->monitor(stonith_api, 0, cmd->rsc_id, cmd->timeout / 1000);
1175 
1176     rc = stonith_api->cmds->register_callback(stonith_api,
1177                                               rc,
1178                                               0,
1179                                               0,
1180                                               cmd, "lrmd_stonith_callback", lrmd_stonith_callback);
1181 
1182     /* don't cleanup yet, we will find out the result of the monitor later */
1183     if (rc > 0) {
1184         rsc->active = cmd;
1185         return rc;
1186     } else if (rc == 0) {
1187         rc = -1;
1188     }
1189 
1190   cleanup_stonith_exec:
1191     stonith_action_complete(cmd, rc);
1192     return rc;
1193 }
1194 
1195 static int
lrmd_rsc_execute_service_lib(lrmd_rsc_t * rsc,lrmd_cmd_t * cmd)1196 lrmd_rsc_execute_service_lib(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd)
1197 {
1198     svc_action_t *action = NULL;
1199     GHashTable *params_copy = NULL;
1200 
1201     CRM_ASSERT(rsc);
1202     CRM_ASSERT(cmd);
1203 
1204     crm_trace("Creating action, resource:%s action:%s class:%s provider:%s agent:%s",
1205               rsc->rsc_id, cmd->action, rsc->class, rsc->provider, rsc->type);
1206 
1207 #if SUPPORT_NAGIOS
1208     /* Recurring operations are cancelled anyway for a stop operation */
1209     if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS)
1210         && safe_str_eq(cmd->action, "stop")) {
1211 
1212         cmd->exec_rc = PCMK_OCF_OK;
1213         goto exec_done;
1214     }
1215 #endif
1216 
1217     params_copy = crm_str_table_dup(cmd->params);
1218 
1219     if (cmd->isolation_wrapper) {
1220         g_hash_table_remove(params_copy, "CRM_meta_isolation_wrapper");
1221         action = resources_action_create(rsc->rsc_id,
1222                                          PCMK_RESOURCE_CLASS_OCF,
1223                                          LRMD_ISOLATION_PROVIDER,
1224                                          cmd->isolation_wrapper,
1225                                          cmd->action, /*action will be normalized in wrapper*/
1226                                          cmd->interval,
1227                                          cmd->timeout,
1228                                          params_copy,
1229                                          cmd->service_flags);
1230     } else {
1231         action = resources_action_create(rsc->rsc_id,
1232                                          rsc->class,
1233                                          rsc->provider,
1234                                          rsc->type,
1235                                          normalize_action_name(rsc, cmd->action),
1236                                          cmd->interval,
1237                                          cmd->timeout,
1238                                          params_copy,
1239                                          cmd->service_flags);
1240     }
1241 
1242     if (!action) {
1243         crm_err("Failed to create action, action:%s on resource %s", cmd->action, rsc->rsc_id);
1244         cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1245         goto exec_done;
1246     }
1247 
1248     action->cb_data = cmd;
1249 
1250     /* 'cmd' may not be valid after this point if
1251      * services_action_async() returned TRUE
1252      *
1253      * Upstart and systemd both synchronously determine monitor/status
1254      * results and call action_complete (which may free 'cmd') if necessary.
1255      */
1256     if (services_action_async(action, action_complete)) {
1257         return TRUE;
1258     }
1259 
1260     cmd->exec_rc = action->rc;
1261     if(action->status != PCMK_LRM_OP_DONE) {
1262         cmd->lrmd_op_status = action->status;
1263     } else {
1264         cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
1265     }
1266     services_action_free(action);
1267     action = NULL;
1268 
1269   exec_done:
1270     cmd_finalize(cmd, rsc);
1271     return TRUE;
1272 }
1273 
1274 static gboolean
lrmd_rsc_execute(lrmd_rsc_t * rsc)1275 lrmd_rsc_execute(lrmd_rsc_t * rsc)
1276 {
1277     lrmd_cmd_t *cmd = NULL;
1278 
1279     CRM_CHECK(rsc != NULL, return FALSE);
1280 
1281     if (rsc->active) {
1282         crm_trace("%s is still active", rsc->rsc_id);
1283         return TRUE;
1284     }
1285 
1286     if (rsc->pending_ops) {
1287         GList *first = rsc->pending_ops;
1288 
1289         cmd = first->data;
1290         if (cmd->delay_id) {
1291             crm_trace
1292                 ("Command %s %s was asked to run too early, waiting for start_delay timeout of %dms",
1293                  cmd->rsc_id, cmd->action, cmd->start_delay);
1294             return TRUE;
1295         }
1296         rsc->pending_ops = g_list_remove_link(rsc->pending_ops, first);
1297         g_list_free_1(first);
1298 
1299 #ifdef HAVE_SYS_TIMEB_H
1300         if (cmd->t_first_run.time == 0) {
1301             ftime(&cmd->t_first_run);
1302         }
1303         ftime(&cmd->t_run);
1304 #endif
1305     }
1306 
1307     if (!cmd) {
1308         crm_trace("Nothing further to do for %s", rsc->rsc_id);
1309         return TRUE;
1310     }
1311 
1312     rsc->active = cmd;          /* only one op at a time for a rsc */
1313     if (cmd->interval) {
1314         rsc->recurring_ops = g_list_append(rsc->recurring_ops, cmd);
1315     }
1316 
1317     log_execute(cmd);
1318 
1319     if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1320         lrmd_rsc_execute_stonith(rsc, cmd);
1321     } else {
1322         lrmd_rsc_execute_service_lib(rsc, cmd);
1323     }
1324 
1325     return TRUE;
1326 }
1327 
1328 static gboolean
lrmd_rsc_dispatch(gpointer user_data)1329 lrmd_rsc_dispatch(gpointer user_data)
1330 {
1331     return lrmd_rsc_execute(user_data);
1332 }
1333 
1334 void
free_rsc(gpointer data)1335 free_rsc(gpointer data)
1336 {
1337     GListPtr gIter = NULL;
1338     lrmd_rsc_t *rsc = data;
1339     int is_stonith = safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH);
1340 
1341     gIter = rsc->pending_ops;
1342     while (gIter != NULL) {
1343         GListPtr next = gIter->next;
1344         lrmd_cmd_t *cmd = gIter->data;
1345 
1346         /* command was never executed */
1347         cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1348         cmd_finalize(cmd, NULL);
1349 
1350         gIter = next;
1351     }
1352     /* frees list, but not list elements. */
1353     g_list_free(rsc->pending_ops);
1354 
1355     gIter = rsc->recurring_ops;
1356     while (gIter != NULL) {
1357         GListPtr next = gIter->next;
1358         lrmd_cmd_t *cmd = gIter->data;
1359 
1360         if (is_stonith) {
1361             cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1362             /* If a stonith command is in-flight, just mark it as cancelled;
1363              * it is not safe to finalize/free the cmd until the stonith api
1364              * says it has either completed or timed out.
1365              */
1366             if (rsc->active != cmd) {
1367                 cmd_finalize(cmd, NULL);
1368             }
1369         } else {
1370             /* This command is already handed off to service library,
1371              * let service library cancel it and tell us via the callback
1372              * when it is cancelled. The rsc can be safely destroyed
1373              * even if we are waiting for the cancel result */
1374             services_action_cancel(rsc->rsc_id, normalize_action_name(rsc, cmd->action), cmd->interval);
1375         }
1376 
1377         gIter = next;
1378     }
1379     /* frees list, but not list elements. */
1380     g_list_free(rsc->recurring_ops);
1381 
1382     free(rsc->rsc_id);
1383     free(rsc->class);
1384     free(rsc->provider);
1385     free(rsc->type);
1386     mainloop_destroy_trigger(rsc->work);
1387 
1388     free(rsc);
1389 }
1390 
1391 static int
process_lrmd_signon(crm_client_t * client,xmlNode * request,int call_id,xmlNode ** reply)1392 process_lrmd_signon(crm_client_t *client, xmlNode *request, int call_id,
1393                     xmlNode **reply)
1394 {
1395     int rc = pcmk_ok;
1396     const char *is_ipc_provider = crm_element_value(request, F_LRMD_IS_IPC_PROVIDER);
1397     const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION);
1398 
1399     if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) {
1400         crm_err("Cluster API version must be greater than or equal to %s, not %s",
1401                 LRMD_MIN_PROTOCOL_VERSION, protocol_version);
1402         rc = -EPROTO;
1403     }
1404 
1405     if (crm_is_true(is_ipc_provider)) {
1406 #ifdef SUPPORT_REMOTE
1407         if ((client->remote != NULL) && client->remote->tls_handshake_complete) {
1408             // This is a remote connection from a cluster node's controller
1409             ipc_proxy_add_provider(client);
1410         } else {
1411             rc = -EACCES;
1412         }
1413 #else
1414         rc = -EPROTONOSUPPORT;
1415 #endif
1416     }
1417 
1418     *reply = create_lrmd_reply(__func__, rc, call_id);
1419     crm_xml_add(*reply, F_LRMD_OPERATION, CRM_OP_REGISTER);
1420     crm_xml_add(*reply, F_LRMD_CLIENTID, client->id);
1421     crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
1422 
1423     return rc;
1424 }
1425 
1426 static int
process_lrmd_rsc_register(crm_client_t * client,uint32_t id,xmlNode * request)1427 process_lrmd_rsc_register(crm_client_t * client, uint32_t id, xmlNode * request)
1428 {
1429     int rc = pcmk_ok;
1430     lrmd_rsc_t *rsc = build_rsc_from_xml(request);
1431     lrmd_rsc_t *dup = g_hash_table_lookup(rsc_list, rsc->rsc_id);
1432 
1433     if (dup &&
1434         safe_str_eq(rsc->class, dup->class) &&
1435         safe_str_eq(rsc->provider, dup->provider) && safe_str_eq(rsc->type, dup->type)) {
1436 
1437         crm_warn("Can't add, RSC '%s' already present in the rsc list (%d active resources)",
1438                  rsc->rsc_id, g_hash_table_size(rsc_list));
1439 
1440         free_rsc(rsc);
1441         return rc;
1442     }
1443 
1444     g_hash_table_replace(rsc_list, rsc->rsc_id, rsc);
1445     crm_info("Added '%s' to the rsc list (%d active resources)",
1446              rsc->rsc_id, g_hash_table_size(rsc_list));
1447 
1448     return rc;
1449 }
1450 
1451 static xmlNode *
process_lrmd_get_rsc_info(xmlNode * request,int call_id)1452 process_lrmd_get_rsc_info(xmlNode *request, int call_id)
1453 {
1454     int rc = pcmk_ok;
1455     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1456     const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1457     xmlNode *reply = NULL;
1458     lrmd_rsc_t *rsc = NULL;
1459 
1460     if (rsc_id == NULL) {
1461         rc = -ENODEV;
1462     } else {
1463         rsc = g_hash_table_lookup(rsc_list, rsc_id);
1464         if (rsc == NULL) {
1465             crm_info("Resource '%s' not found (%d active resources)",
1466                      rsc_id, g_hash_table_size(rsc_list));
1467             rc = -ENODEV;
1468         }
1469     }
1470 
1471     reply = create_lrmd_reply(__FUNCTION__, rc, call_id);
1472     if (rsc) {
1473         crm_xml_add(reply, F_LRMD_RSC_ID, rsc->rsc_id);
1474         crm_xml_add(reply, F_LRMD_CLASS, rsc->class);
1475         crm_xml_add(reply, F_LRMD_PROVIDER, rsc->provider);
1476         crm_xml_add(reply, F_LRMD_TYPE, rsc->type);
1477     }
1478     return reply;
1479 }
1480 
1481 static int
process_lrmd_rsc_unregister(crm_client_t * client,uint32_t id,xmlNode * request)1482 process_lrmd_rsc_unregister(crm_client_t * client, uint32_t id, xmlNode * request)
1483 {
1484     int rc = pcmk_ok;
1485     lrmd_rsc_t *rsc = NULL;
1486     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1487     const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1488 
1489     if (!rsc_id) {
1490         return -ENODEV;
1491     }
1492 
1493     if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) {
1494         crm_info("Resource '%s' not found (%d active resources)",
1495                  rsc_id, g_hash_table_size(rsc_list));
1496         return pcmk_ok;
1497     }
1498 
1499     if (rsc->active) {
1500         /* let the caller know there are still active ops on this rsc to watch for */
1501         crm_trace("Operation still in progress: %p", rsc->active);
1502         rc = -EINPROGRESS;
1503     }
1504 
1505     g_hash_table_remove(rsc_list, rsc_id);
1506 
1507     return rc;
1508 }
1509 
1510 static int
process_lrmd_rsc_exec(crm_client_t * client,uint32_t id,xmlNode * request)1511 process_lrmd_rsc_exec(crm_client_t * client, uint32_t id, xmlNode * request)
1512 {
1513     lrmd_rsc_t *rsc = NULL;
1514     lrmd_cmd_t *cmd = NULL;
1515     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1516     const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1517     int call_id;
1518 
1519     if (!rsc_id) {
1520         return -EINVAL;
1521     }
1522     if (!(rsc = g_hash_table_lookup(rsc_list, rsc_id))) {
1523         crm_info("Resource '%s' not found (%d active resources)",
1524                  rsc_id, g_hash_table_size(rsc_list));
1525         return -ENODEV;
1526     }
1527 
1528     cmd = create_lrmd_cmd(request, client, rsc);
1529     call_id = cmd->call_id;
1530 
1531     /* Don't reference cmd after handing it off to be scheduled.
1532      * The cmd could get merged and freed. */
1533     schedule_lrmd_cmd(rsc, cmd);
1534 
1535     return call_id;
1536 }
1537 
1538 static int
cancel_op(const char * rsc_id,const char * action,int interval)1539 cancel_op(const char *rsc_id, const char *action, int interval)
1540 {
1541     GListPtr gIter = NULL;
1542     lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, rsc_id);
1543 
1544     /* How to cancel an action.
1545      * 1. Check pending ops list, if it hasn't been handed off
1546      *    to the service library or stonith recurring list remove
1547      *    it there and that will stop it.
1548      * 2. If it isn't in the pending ops list, then it's either a
1549      *    recurring op in the stonith recurring list, or the service
1550      *    library's recurring list.  Stop it there
1551      * 3. If not found in any lists, then this operation has either
1552      *    been executed already and is not a recurring operation, or
1553      *    never existed.
1554      */
1555     if (!rsc) {
1556         return -ENODEV;
1557     }
1558 
1559     for (gIter = rsc->pending_ops; gIter != NULL; gIter = gIter->next) {
1560         lrmd_cmd_t *cmd = gIter->data;
1561 
1562         if (safe_str_eq(cmd->action, action) && cmd->interval == interval) {
1563             cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1564             cmd_finalize(cmd, rsc);
1565             return pcmk_ok;
1566         }
1567     }
1568 
1569     if (safe_str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH)) {
1570         /* The service library does not handle stonith operations.
1571          * We have to handle recurring stonith operations ourselves. */
1572         for (gIter = rsc->recurring_ops; gIter != NULL; gIter = gIter->next) {
1573             lrmd_cmd_t *cmd = gIter->data;
1574 
1575             if (safe_str_eq(cmd->action, action) && cmd->interval == interval) {
1576                 cmd->lrmd_op_status = PCMK_LRM_OP_CANCELLED;
1577                 if (rsc->active != cmd) {
1578                     cmd_finalize(cmd, rsc);
1579                 }
1580                 return pcmk_ok;
1581             }
1582         }
1583     } else if (services_action_cancel(rsc_id, normalize_action_name(rsc, action), interval) == TRUE) {
1584         /* The service library will tell the action_complete callback function
1585          * this action was cancelled, which will destroy the cmd and remove
1586          * it from the recurring_op list. Do not do that in this function
1587          * if the service library says it cancelled it. */
1588         return pcmk_ok;
1589     }
1590 
1591     return -EOPNOTSUPP;
1592 }
1593 
1594 static void
cancel_all_recurring(lrmd_rsc_t * rsc,const char * client_id)1595 cancel_all_recurring(lrmd_rsc_t * rsc, const char *client_id)
1596 {
1597     GList *cmd_list = NULL;
1598     GList *cmd_iter = NULL;
1599 
1600     /* Notice a copy of each list is created when concat is called.
1601      * This prevents odd behavior from occurring when the cmd_list
1602      * is iterated through later on.  It is possible the cancel_op
1603      * function may end up modifying the recurring_ops and pending_ops
1604      * lists.  If we did not copy those lists, our cmd_list iteration
1605      * could get messed up.*/
1606     if (rsc->recurring_ops) {
1607         cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->recurring_ops));
1608     }
1609     if (rsc->pending_ops) {
1610         cmd_list = g_list_concat(cmd_list, g_list_copy(rsc->pending_ops));
1611     }
1612     if (!cmd_list) {
1613         return;
1614     }
1615 
1616     for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) {
1617         lrmd_cmd_t *cmd = cmd_iter->data;
1618 
1619         if (cmd->interval == 0) {
1620             continue;
1621         }
1622 
1623         if (client_id && safe_str_neq(cmd->client_id, client_id)) {
1624             continue;
1625         }
1626 
1627         cancel_op(rsc->rsc_id, cmd->action, cmd->interval);
1628     }
1629     /* frees only the copied list data, not the cmds */
1630     g_list_free(cmd_list);
1631 }
1632 
1633 static int
process_lrmd_rsc_cancel(crm_client_t * client,uint32_t id,xmlNode * request)1634 process_lrmd_rsc_cancel(crm_client_t * client, uint32_t id, xmlNode * request)
1635 {
1636     xmlNode *rsc_xml = get_xpath_object("//" F_LRMD_RSC, request, LOG_ERR);
1637     const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
1638     const char *action = crm_element_value(rsc_xml, F_LRMD_RSC_ACTION);
1639     int interval = 0;
1640 
1641     crm_element_value_int(rsc_xml, F_LRMD_RSC_INTERVAL, &interval);
1642 
1643     if (!rsc_id || !action) {
1644         return -EINVAL;
1645     }
1646 
1647     return cancel_op(rsc_id, action, interval);
1648 }
1649 
1650 void
process_lrmd_message(crm_client_t * client,uint32_t id,xmlNode * request)1651 process_lrmd_message(crm_client_t * client, uint32_t id, xmlNode * request)
1652 {
1653     int rc = pcmk_ok;
1654     int call_id = 0;
1655     const char *op = crm_element_value(request, F_LRMD_OPERATION);
1656     int do_reply = 0;
1657     int do_notify = 0;
1658     xmlNode *reply = NULL;
1659 
1660 #if ENABLE_ACL
1661     /* Certain IPC commands may be done only by privileged users (i.e. root or
1662      * hacluster) when ACLs are enabled, because they would otherwise provide a
1663      * means of bypassing ACLs.
1664      */
1665     bool allowed = is_set(client->flags, crm_client_flag_ipc_privileged);
1666 #else
1667     bool allowed = true;
1668 #endif
1669 
1670     crm_trace("Processing %s operation from %s", op, client->id);
1671     crm_element_value_int(request, F_LRMD_CALLID, &call_id);
1672 
1673     if (crm_str_eq(op, CRM_OP_IPC_FWD, TRUE)) {
1674 #ifdef SUPPORT_REMOTE
1675         if (allowed) {
1676             ipc_proxy_forward_client(client, request);
1677         } else {
1678             rc = -EACCES;
1679         }
1680 #else
1681         rc = -EPROTONOSUPPORT;
1682 #endif
1683         do_reply = 1;
1684     } else if (crm_str_eq(op, CRM_OP_REGISTER, TRUE)) {
1685         rc = process_lrmd_signon(client, request, call_id, &reply);
1686         do_reply = 1;
1687     } else if (crm_str_eq(op, LRMD_OP_RSC_REG, TRUE)) {
1688         if (allowed) {
1689             rc = process_lrmd_rsc_register(client, id, request);
1690             do_notify = 1;
1691         } else {
1692             rc = -EACCES;
1693         }
1694         do_reply = 1;
1695     } else if (crm_str_eq(op, LRMD_OP_RSC_INFO, TRUE)) {
1696         if (allowed) {
1697             reply = process_lrmd_get_rsc_info(request, call_id);
1698         } else {
1699             rc = -EACCES;
1700         }
1701         do_reply = 1;
1702     } else if (crm_str_eq(op, LRMD_OP_RSC_UNREG, TRUE)) {
1703         if (allowed) {
1704             rc = process_lrmd_rsc_unregister(client, id, request);
1705             /* don't notify anyone about failed un-registers */
1706             if (rc == pcmk_ok || rc == -EINPROGRESS) {
1707                 do_notify = 1;
1708             }
1709         } else {
1710             rc = -EACCES;
1711         }
1712         do_reply = 1;
1713     } else if (crm_str_eq(op, LRMD_OP_RSC_EXEC, TRUE)) {
1714         if (allowed) {
1715             rc = process_lrmd_rsc_exec(client, id, request);
1716         } else {
1717             rc = -EACCES;
1718         }
1719         do_reply = 1;
1720     } else if (crm_str_eq(op, LRMD_OP_RSC_CANCEL, TRUE)) {
1721         if (allowed) {
1722             rc = process_lrmd_rsc_cancel(client, id, request);
1723         } else {
1724             rc = -EACCES;
1725         }
1726         do_reply = 1;
1727     } else if (crm_str_eq(op, LRMD_OP_POKE, TRUE)) {
1728         do_notify = 1;
1729         do_reply = 1;
1730     } else if (crm_str_eq(op, LRMD_OP_CHECK, TRUE)) {
1731         if (allowed) {
1732             xmlNode *data = get_message_xml(request, F_LRMD_CALLDATA);
1733 
1734             CRM_LOG_ASSERT(data != NULL);
1735             check_sbd_timeout(crm_element_value(data, F_LRMD_WATCHDOG));
1736         } else {
1737             rc = -EACCES;
1738         }
1739     } else if (crm_str_eq(op, LRMD_OP_ALERT_EXEC, TRUE)) {
1740         if (allowed) {
1741             rc = process_lrmd_alert_exec(client, id, request);
1742         } else {
1743             rc = -EACCES;
1744         }
1745         do_reply = 1;
1746     } else {
1747         rc = -EOPNOTSUPP;
1748         do_reply = 1;
1749         crm_err("Unknown %s from %s", op, client->name);
1750         crm_log_xml_warn(request, "UnknownOp");
1751     }
1752 
1753     if (rc == -EACCES) {
1754         crm_warn("Rejecting IPC request '%s' from unprivileged client %s",
1755                  op, crm_client_name(client));
1756     }
1757 
1758     crm_debug("Processed %s operation from %s: rc=%d, reply=%d, notify=%d",
1759               op, client->id, rc, do_reply, do_notify);
1760 
1761     if (do_reply) {
1762         int send_rc = pcmk_ok;
1763 
1764         if (reply == NULL) {
1765             reply = create_lrmd_reply(__FUNCTION__, rc, call_id);
1766         }
1767         send_rc = lrmd_server_send_reply(client, id, reply);
1768         free_xml(reply);
1769         if (send_rc < 0) {
1770             crm_warn("Reply to client %s failed: %s " CRM_XS " %d",
1771                      client->name, pcmk_strerror(send_rc), send_rc);
1772         }
1773     }
1774 
1775     if (do_notify) {
1776         send_generic_notify(rc, request);
1777     }
1778 }
1779