1 /*
2  * Copyright 2004-2020 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU General Public License version 2
7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <unistd.h>  /* pid_t, sleep, ssize_t */
13 
14 #include <crm/cib.h>
15 #include <crm/cluster.h>
16 #include <crm/common/xml.h>
17 #include <crm/crm.h>
18 #include <crm/msg_xml.h>
19 #include <crm/common/xml_internal.h>
20 
21 #include <pacemaker-controld.h>
22 
23 static mainloop_io_t *pe_subsystem = NULL;
24 
25 /*!
26  * \internal
27  * \brief Close any scheduler connection and free associated memory
28  */
29 void
pe_subsystem_free(void)30 pe_subsystem_free(void)
31 {
32     controld_clear_fsa_input_flags(R_PE_REQUIRED);
33     if (pe_subsystem) {
34         controld_expect_sched_reply(NULL);
35         mainloop_del_ipc_client(pe_subsystem);
36         pe_subsystem = NULL;
37         controld_clear_fsa_input_flags(R_PE_CONNECTED);
38     }
39 }
40 
41 /*!
42  * \internal
43  * \brief Save CIB query result to file, raising FSA error
44  *
45  * \param[in] msg        Ignored
46  * \param[in] call_id    Call ID of CIB query
47  * \param[in] rc         Return code of CIB query
48  * \param[in] output     Result of CIB query
49  * \param[in] user_data  Unique identifier for filename (will be freed)
50  *
51  * \note This is intended to be called after a scheduler connection fails.
52  */
53 static void
save_cib_contents(xmlNode * msg,int call_id,int rc,xmlNode * output,void * user_data)54 save_cib_contents(xmlNode *msg, int call_id, int rc, xmlNode *output,
55                   void *user_data)
56 {
57     char *id = user_data;
58 
59     register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
60     CRM_CHECK(id != NULL, return);
61 
62     if (rc == pcmk_ok) {
63         char *filename = crm_strdup_printf(PE_STATE_DIR "/pe-core-%s.bz2", id);
64 
65         if (write_xml_file(output, filename, TRUE) < 0) {
66             crm_err("Could not save Cluster Information Base to %s after scheduler crash",
67                     filename);
68         } else {
69             crm_notice("Saved Cluster Information Base to %s after scheduler crash",
70                        filename);
71         }
72         free(filename);
73     }
74 }
75 
76 /*!
77  * \internal
78  * \brief Respond to scheduler connection failure
79  *
80  * \param[in] user_data  Ignored
81  */
82 static void
pe_ipc_destroy(gpointer user_data)83 pe_ipc_destroy(gpointer user_data)
84 {
85     // If we aren't connected to the scheduler, we can't expect a reply
86     controld_expect_sched_reply(NULL);
87 
88     if (pcmk_is_set(fsa_input_register, R_PE_REQUIRED)) {
89         int rc = pcmk_ok;
90         char *uuid_str = crm_generate_uuid();
91 
92         crm_crit("Connection to the scheduler failed "
93                  CRM_XS " uuid=%s", uuid_str);
94 
95         /*
96          * The scheduler died...
97          *
98          * Save the current CIB so that we have a chance of
99          * figuring out what killed it.
100          *
101          * Delay raising the I_ERROR until the query below completes or
102          * 5s is up, whichever comes first.
103          *
104          */
105         rc = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local);
106         fsa_register_cib_callback(rc, FALSE, uuid_str, save_cib_contents);
107 
108     } else {
109         crm_info("Connection to the scheduler released");
110     }
111 
112     controld_clear_fsa_input_flags(R_PE_CONNECTED);
113     pe_subsystem = NULL;
114     mainloop_set_trigger(fsa_source);
115     return;
116 }
117 
118 /*!
119  * \internal
120  * \brief Handle message from scheduler connection
121  *
122  * \param[in] buffer    XML message (will be freed)
123  * \param[in] length    Ignored
124  * \param[in] userdata  Ignored
125  *
126  * \return 0
127  */
128 static int
pe_ipc_dispatch(const char * buffer,ssize_t length,gpointer userdata)129 pe_ipc_dispatch(const char *buffer, ssize_t length, gpointer userdata)
130 {
131     xmlNode *msg = string2xml(buffer);
132 
133     if (msg) {
134         route_message(C_IPC_MESSAGE, msg);
135     }
136     free_xml(msg);
137     return 0;
138 }
139 
140 /*!
141  * \internal
142  * \brief Make new connection to scheduler
143  *
144  * \return TRUE on success, FALSE otherwise
145  */
146 static bool
pe_subsystem_new(void)147 pe_subsystem_new(void)
148 {
149     struct ipc_client_callbacks pe_callbacks = {
150         .dispatch = pe_ipc_dispatch,
151         .destroy = pe_ipc_destroy
152     };
153     static bool retry_one = TRUE;
154 
155     controld_set_fsa_input_flags(R_PE_REQUIRED);
156 retry:
157     pe_subsystem = mainloop_add_ipc_client(CRM_SYSTEM_PENGINE,
158                                            G_PRIORITY_DEFAULT,
159                                            5 * 1024 * 1024 /* 5MB */,
160                                            NULL, &pe_callbacks);
161     if (pe_subsystem == NULL) {
162         crm_debug("Could not connect to scheduler : %s(%d)", pcmk_rc_str(errno), errno);
163         if (errno == EAGAIN && retry_one) {
164             /* In rare cases, a SIGTERM may be received and the connection may fail when the cluster shuts down. */
165             /* At this time, the connection will be retried only once. */
166             crm_debug("Scheduler connection attempt.");
167             retry_one = FALSE;
168             goto retry;
169         }
170         return FALSE;
171     }
172     controld_set_fsa_input_flags(R_PE_CONNECTED);
173     return TRUE;
174 }
175 
176 /*!
177  * \internal
178  * \brief Send an XML message to the scheduler
179  *
180  * \param[in] cmd  XML message to send
181  *
182  * \return pcmk_ok on success, -errno otherwise
183  */
184 static int
pe_subsystem_send(xmlNode * cmd)185 pe_subsystem_send(xmlNode *cmd)
186 {
187     if (pe_subsystem) {
188         int sent = crm_ipc_send(mainloop_get_ipc_client(pe_subsystem), cmd,
189                                 0, 0, NULL);
190 
191         if (sent == 0) {
192             sent = -ENODATA;
193         } else if (sent > 0) {
194             sent = pcmk_ok;
195         }
196         return sent;
197     }
198     return -ENOTCONN;
199 }
200 
201 static void do_pe_invoke_callback(xmlNode *msg, int call_id, int rc,
202                                   xmlNode *output, void *user_data);
203 
204 /*	 A_PE_START, A_PE_STOP, O_PE_RESTART	*/
205 void
do_pe_control(long long action,enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,enum crmd_fsa_input current_input,fsa_data_t * msg_data)206 do_pe_control(long long action,
207               enum crmd_fsa_cause cause,
208               enum crmd_fsa_state cur_state,
209               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
210 {
211     if (action & A_PE_STOP) {
212         pe_subsystem_free();
213     }
214     if ((action & A_PE_START)
215         && !pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) {
216 
217         if (cur_state == S_STOPPING) {
218             crm_info("Ignoring request to connect to scheduler while shutting down");
219 
220         } else if (!pe_subsystem_new()) {
221             crm_warn("Could not connect to scheduler");
222             register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
223         }
224     }
225 }
226 
227 int fsa_pe_query = 0;
228 char *fsa_pe_ref = NULL;
229 static mainloop_timer_t *controld_sched_timer = NULL;
230 
231 // @TODO Make this a configurable cluster option if there's demand for it
232 #define SCHED_TIMEOUT_MS (120000)
233 
234 /*!
235  * \internal
236  * \brief Handle a timeout waiting for scheduler reply
237  *
238  * \param[in] user_data  Ignored
239  *
240  * \return FALSE (indicating that timer should not be restarted)
241  */
242 static gboolean
controld_sched_timeout(gpointer user_data)243 controld_sched_timeout(gpointer user_data)
244 {
245     if (AM_I_DC) {
246         /* If this node is the DC but can't communicate with the scheduler, just
247          * exit (and likely get fenced) so this node doesn't interfere with any
248          * further DC elections.
249          *
250          * @TODO We could try something less drastic first, like disconnecting
251          * and reconnecting to the scheduler, but something is likely going
252          * seriously wrong, so perhaps it's better to just fail as quickly as
253          * possible.
254          */
255         crmd_exit(CRM_EX_FATAL);
256     }
257     return FALSE;
258 }
259 
260 void
controld_stop_sched_timer(void)261 controld_stop_sched_timer(void)
262 {
263     if (controld_sched_timer && fsa_pe_ref) {
264         crm_trace("Stopping timer for scheduler reply %s", fsa_pe_ref);
265     }
266     mainloop_timer_stop(controld_sched_timer);
267 }
268 
269 /*!
270  * \internal
271  * \brief Set the scheduler request currently being waited on
272  *
273  * \param[in] msg  Request to expect reply to (or NULL for none)
274  */
275 void
controld_expect_sched_reply(xmlNode * msg)276 controld_expect_sched_reply(xmlNode *msg)
277 {
278     char *ref = NULL;
279 
280     if (msg) {
281         ref = crm_element_value_copy(msg, XML_ATTR_REFERENCE);
282         CRM_ASSERT(ref != NULL);
283 
284         if (controld_sched_timer == NULL) {
285             controld_sched_timer = mainloop_timer_add("scheduler_reply_timer",
286                                                       SCHED_TIMEOUT_MS, FALSE,
287                                                       controld_sched_timeout,
288                                                       NULL);
289         }
290         mainloop_timer_start(controld_sched_timer);
291     } else {
292         controld_stop_sched_timer();
293     }
294     free(fsa_pe_ref);
295     fsa_pe_ref = ref;
296 }
297 
298 /*!
299  * \internal
300  * \brief Free the scheduler reply timer
301  */
302 void
controld_free_sched_timer(void)303 controld_free_sched_timer(void)
304 {
305     if (controld_sched_timer != NULL) {
306         mainloop_timer_del(controld_sched_timer);
307         controld_sched_timer = NULL;
308     }
309 }
310 
311 /*	 A_PE_INVOKE	*/
312 void
do_pe_invoke(long long action,enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,enum crmd_fsa_input current_input,fsa_data_t * msg_data)313 do_pe_invoke(long long action,
314              enum crmd_fsa_cause cause,
315              enum crmd_fsa_state cur_state,
316              enum crmd_fsa_input current_input, fsa_data_t * msg_data)
317 {
318     if (AM_I_DC == FALSE) {
319         crm_err("Not invoking scheduler because not DC: %s",
320                 fsa_action2string(action));
321         return;
322     }
323 
324     if (!pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) {
325         if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) {
326             crm_err("Cannot shut down gracefully without the scheduler");
327             register_fsa_input_before(C_FSA_INTERNAL, I_TERMINATE, NULL);
328 
329         } else {
330             crm_info("Waiting for the scheduler to connect");
331             crmd_fsa_stall(FALSE);
332             controld_set_fsa_action_flags(A_PE_START);
333             trigger_fsa();
334         }
335         return;
336     }
337 
338     if (cur_state != S_POLICY_ENGINE) {
339         crm_notice("Not invoking scheduler because in state %s",
340                    fsa_state2string(cur_state));
341         return;
342     }
343     if (!pcmk_is_set(fsa_input_register, R_HAVE_CIB)) {
344         crm_err("Attempted to invoke scheduler without consistent Cluster Information Base!");
345 
346         /* start the join from scratch */
347         register_fsa_input_before(C_FSA_INTERNAL, I_ELECTION, NULL);
348         return;
349     }
350 
351     fsa_pe_query = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local);
352 
353     crm_debug("Query %d: Requesting the current CIB: %s", fsa_pe_query,
354               fsa_state2string(fsa_state));
355 
356     controld_expect_sched_reply(NULL);
357     fsa_register_cib_callback(fsa_pe_query, FALSE, NULL, do_pe_invoke_callback);
358 }
359 
360 static void
force_local_option(xmlNode * xml,const char * attr_name,const char * attr_value)361 force_local_option(xmlNode *xml, const char *attr_name, const char *attr_value)
362 {
363     int max = 0;
364     int lpc = 0;
365     char *xpath_string = NULL;
366     xmlXPathObjectPtr xpathObj = NULL;
367 
368     xpath_string = crm_strdup_printf("%.128s//%s//nvpair[@name='%.128s']",
369                                      get_object_path(XML_CIB_TAG_CRMCONFIG),
370                                      XML_CIB_TAG_PROPSET, attr_name);
371     xpathObj = xpath_search(xml, xpath_string);
372     max = numXpathResults(xpathObj);
373     free(xpath_string);
374 
375     for (lpc = 0; lpc < max; lpc++) {
376         xmlNode *match = getXpathResult(xpathObj, lpc);
377         crm_trace("Forcing %s/%s = %s", ID(match), attr_name, attr_value);
378         crm_xml_add(match, XML_NVPAIR_ATTR_VALUE, attr_value);
379     }
380 
381     if(max == 0) {
382         xmlNode *configuration = NULL;
383         xmlNode *crm_config = NULL;
384         xmlNode *cluster_property_set = NULL;
385 
386         crm_trace("Creating %s-%s for %s=%s",
387                   CIB_OPTIONS_FIRST, attr_name, attr_name, attr_value);
388 
389         configuration = pcmk__xe_match(xml, XML_CIB_TAG_CONFIGURATION, NULL,
390                                        NULL);
391         if (configuration == NULL) {
392             configuration = create_xml_node(xml, XML_CIB_TAG_CONFIGURATION);
393         }
394 
395         crm_config = pcmk__xe_match(configuration, XML_CIB_TAG_CRMCONFIG, NULL,
396                                     NULL);
397         if (crm_config == NULL) {
398             crm_config = create_xml_node(configuration, XML_CIB_TAG_CRMCONFIG);
399         }
400 
401         cluster_property_set = pcmk__xe_match(crm_config, XML_CIB_TAG_PROPSET,
402                                               NULL, NULL);
403         if (cluster_property_set == NULL) {
404             cluster_property_set = create_xml_node(crm_config, XML_CIB_TAG_PROPSET);
405             crm_xml_add(cluster_property_set, XML_ATTR_ID, CIB_OPTIONS_FIRST);
406         }
407 
408         xml = create_xml_node(cluster_property_set, XML_CIB_TAG_NVPAIR);
409 
410         crm_xml_set_id(xml, "%s-%s", CIB_OPTIONS_FIRST, attr_name);
411         crm_xml_add(xml, XML_NVPAIR_ATTR_NAME, attr_name);
412         crm_xml_add(xml, XML_NVPAIR_ATTR_VALUE, attr_value);
413     }
414     freeXpathObject(xpathObj);
415 }
416 
417 static void
do_pe_invoke_callback(xmlNode * msg,int call_id,int rc,xmlNode * output,void * user_data)418 do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
419 {
420     xmlNode *cmd = NULL;
421     pid_t watchdog = pcmk__locate_sbd();
422 
423     if (rc != pcmk_ok) {
424         crm_err("Could not retrieve the Cluster Information Base: %s "
425                 CRM_XS " rc=%d call=%d", pcmk_strerror(rc), rc, call_id);
426         register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
427         return;
428 
429     } else if (call_id != fsa_pe_query) {
430         crm_trace("Skipping superseded CIB query: %d (current=%d)", call_id, fsa_pe_query);
431         return;
432 
433     } else if (!AM_I_DC || !pcmk_is_set(fsa_input_register, R_PE_CONNECTED)) {
434         crm_debug("No need to invoke the scheduler anymore");
435         return;
436 
437     } else if (fsa_state != S_POLICY_ENGINE) {
438         crm_debug("Discarding scheduler request in state: %s",
439                   fsa_state2string(fsa_state));
440         return;
441 
442     /* this callback counts as 1 */
443     } else if (num_cib_op_callbacks() > 1) {
444         crm_debug("Re-asking for the CIB: %d other peer updates still pending",
445                   (num_cib_op_callbacks() - 1));
446         sleep(1);
447         controld_set_fsa_action_flags(A_PE_INVOKE);
448         trigger_fsa();
449         return;
450     }
451 
452     CRM_LOG_ASSERT(output != NULL);
453 
454     /* Refresh the remote node cache and the known node cache when the
455      * scheduler is invoked */
456     pcmk__refresh_node_caches_from_cib(output);
457 
458     crm_xml_add(output, XML_ATTR_DC_UUID, fsa_our_uuid);
459     crm_xml_add_int(output, XML_ATTR_HAVE_QUORUM, fsa_has_quorum);
460 
461     force_local_option(output, XML_ATTR_HAVE_WATCHDOG, pcmk__btoa(watchdog));
462 
463     if (ever_had_quorum && crm_have_quorum == FALSE) {
464         crm_xml_add_int(output, XML_ATTR_QUORUM_PANIC, 1);
465     }
466 
467     cmd = create_request(CRM_OP_PECALC, output, NULL, CRM_SYSTEM_PENGINE, CRM_SYSTEM_DC, NULL);
468 
469     rc = pe_subsystem_send(cmd);
470     if (rc < 0) {
471         crm_err("Could not contact the scheduler: %s " CRM_XS " rc=%d",
472                 pcmk_strerror(rc), rc);
473         register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
474     } else {
475         controld_expect_sched_reply(cmd);
476         crm_debug("Invoking the scheduler: query=%d, ref=%s, seq=%llu, quorate=%d",
477                   fsa_pe_query, fsa_pe_ref, crm_peer_seq, fsa_has_quorum);
478     }
479     free_xml(cmd);
480 }
481