1 /*
2 * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This software is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /* put these first so that uuid_t is defined without conflicts */
20 #include <crm_internal.h>
21
22 #include <string.h>
23
24 #include <crm/crm.h>
25 #include <crm/cib.h>
26 #include <crm/msg_xml.h>
27 #include <crm/common/xml.h>
28 #include <crm/cluster.h>
29 #include <crmd_messages.h>
30 #include <crmd_fsa.h>
31 #include <fsa_proto.h>
32 #include <crmd_callbacks.h>
33 #include <tengine.h>
34 #include <membership.h>
35
36 #include <ocf/oc_event.h>
37 #include <ocf/oc_membership.h>
38
39 void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int);
40 void ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event);
41 gboolean crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data);
42 void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data);
43 int ccm_dispatch(gpointer user_data);
44
45 #define CCM_EVENT_DETAIL 0
46 #define CCM_EVENT_DETAIL_PARTIAL 0
47
48 int (*ccm_api_callback_done) (void *cookie) = NULL;
49 int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL;
50
51 static oc_ev_t *fsa_ev_token;
52 static void *ccm_library = NULL;
53 static int num_ccm_register_fails = 0;
54 static int max_ccm_register_fails = 30;
55
/*
 * "destroy" hook installed in the mainloop callbacks used for the CCM
 * file descriptor (see do_ccm_control()).
 *
 * Deliberately does nothing; the argument is ignored.
 */
static void
ccm_connection_destroy(void *userdata)
{
    (void) userdata;
}
60
61 /* A_CCM_CONNECT */
62 void
do_ccm_control(long long action,enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,enum crmd_fsa_input current_input,fsa_data_t * msg_data)63 do_ccm_control(long long action,
64 enum crmd_fsa_cause cause,
65 enum crmd_fsa_state cur_state,
66 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
67 {
68 static struct mainloop_fd_callbacks ccm_fd_callbacks = {
69 .dispatch = ccm_dispatch,
70 .destroy = ccm_connection_destroy,
71 };
72
73 if (is_heartbeat_cluster()) {
74 int (*ccm_api_register) (oc_ev_t ** token) =
75 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1);
76
77 int (*ccm_api_set_callback) (const oc_ev_t * token,
78 oc_ev_class_t class,
79 oc_ev_callback_t * fn,
80 oc_ev_callback_t ** prev_fn) =
81 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1);
82
83 void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) =
84 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1);
85 int (*ccm_api_activate) (const oc_ev_t * token, int *fd) =
86 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1);
87 int (*ccm_api_unregister) (oc_ev_t * token) =
88 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1);
89
90 if (action & A_CCM_DISCONNECT) {
91 set_bit(fsa_input_register, R_CCM_DISCONNECTED);
92 (*ccm_api_unregister) (fsa_ev_token);
93 }
94
95 if (action & A_CCM_CONNECT) {
96 int ret;
97 int fsa_ev_fd;
98 gboolean did_fail = FALSE;
99
100 crm_trace("Registering with CCM");
101 clear_bit(fsa_input_register, R_CCM_DISCONNECTED);
102 ret = (*ccm_api_register) (&fsa_ev_token);
103 if (ret != 0) {
104 crm_warn("CCM registration failed");
105 did_fail = TRUE;
106 }
107
108 if (did_fail == FALSE) {
109 crm_trace("Setting up CCM callbacks");
110 ret = (*ccm_api_set_callback) (fsa_ev_token, OC_EV_MEMB_CLASS,
111 crmd_ccm_msg_callback, NULL);
112 if (ret != 0) {
113 crm_warn("CCM callback not set");
114 did_fail = TRUE;
115 }
116 }
117 if (did_fail == FALSE) {
118 (*ccm_api_special) (fsa_ev_token, OC_EV_MEMB_CLASS, 0 /*don't care */ );
119
120 crm_trace("Activating CCM token");
121 ret = (*ccm_api_activate) (fsa_ev_token, &fsa_ev_fd);
122 if (ret != 0) {
123 crm_warn("CCM Activation failed");
124 did_fail = TRUE;
125 }
126 }
127
128 if (did_fail) {
129 num_ccm_register_fails++;
130 (*ccm_api_unregister) (fsa_ev_token);
131
132 if (num_ccm_register_fails < max_ccm_register_fails) {
133 crm_warn("CCM Connection failed"
134 " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails);
135
136 crm_timer_start(wait_timer);
137 crmd_fsa_stall(FALSE);
138 return;
139
140 } else {
141 crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails);
142 register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
143 return;
144 }
145 }
146
147 crm_info("CCM connection established... waiting for first callback");
148 mainloop_add_fd("heartbeat-ccm", G_PRIORITY_HIGH, fsa_ev_fd, fsa_ev_token,
149 &ccm_fd_callbacks);
150
151 }
152 }
153
154 if (action & ~(A_CCM_CONNECT | A_CCM_DISCONNECT)) {
155 crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__);
156 }
157 }
158
159 void
ccm_event_detail(const oc_ev_membership_t * oc,oc_ed_t event)160 ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event)
161 {
162 int lpc;
163 gboolean member = FALSE;
164
165 member = FALSE;
166
167 crm_trace("-----------------------");
168 crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, "
169 "new_idx=%d, old_idx=%d",
170 ccm_event_name(event),
171 oc->m_instance,
172 oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx);
173
174 #if !CCM_EVENT_DETAIL_PARTIAL
175 for (lpc = 0; lpc < oc->m_n_member; lpc++) {
176 crm_info("\tCURRENT: %s [nodeid=%d, born=%d]",
177 oc->m_array[oc->m_memb_idx + lpc].node_uname,
178 oc->m_array[oc->m_memb_idx + lpc].node_id,
179 oc->m_array[oc->m_memb_idx + lpc].node_born_on);
180
181 if (safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx + lpc].node_uname)) {
182 member = TRUE;
183 }
184 }
185 if (member == FALSE) {
186 crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST");
187 }
188 #endif
189 for (lpc = 0; lpc < (int)oc->m_n_in; lpc++) {
190 crm_info("\tNEW: %s [nodeid=%d, born=%d]",
191 oc->m_array[oc->m_in_idx + lpc].node_uname,
192 oc->m_array[oc->m_in_idx + lpc].node_id,
193 oc->m_array[oc->m_in_idx + lpc].node_born_on);
194 }
195
196 for (lpc = 0; lpc < (int)oc->m_n_out; lpc++) {
197 crm_info("\tLOST: %s [nodeid=%d, born=%d]",
198 oc->m_array[oc->m_out_idx + lpc].node_uname,
199 oc->m_array[oc->m_out_idx + lpc].node_id,
200 oc->m_array[oc->m_out_idx + lpc].node_born_on);
201 }
202
203 crm_trace("-----------------------");
204
205 }
206
207 /* A_CCM_UPDATE_CACHE */
208 /*
209 * Take the opportunity to update the node status in the CIB as well
210 */
211 void
do_ccm_update_cache(enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,oc_ed_t event,const oc_ev_membership_t * oc,xmlNode * xml)212 do_ccm_update_cache(enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state,
213 oc_ed_t event, const oc_ev_membership_t * oc, xmlNode * xml)
214 {
215 unsigned long long instance = 0;
216 unsigned int lpc = 0;
217
218 if (is_heartbeat_cluster()) {
219 CRM_ASSERT(oc != NULL);
220 instance = oc->m_instance;
221 }
222
223 CRM_ASSERT(crm_peer_seq <= instance);
224
225 switch (cur_state) {
226 case S_STOPPING:
227 case S_TERMINATE:
228 case S_HALT:
229 crm_debug("Ignoring %s CCM event %llu, we're in state %s",
230 ccm_event_name(event), instance, fsa_state2string(cur_state));
231 return;
232 case S_ELECTION:
233 register_fsa_action(A_ELECTION_CHECK);
234 break;
235 default:
236 break;
237 }
238
239 if (is_heartbeat_cluster()) {
240 ccm_event_detail(oc, event);
241
242 /*--*-- Recently Dead Member Nodes --*--*/
243 for (lpc = 0; lpc < oc->m_n_out; lpc++) {
244 crm_update_ccm_node(oc, lpc + oc->m_out_idx, CRM_NODE_LOST, instance);
245 }
246
247 /*--*-- All Member Nodes --*--*/
248 for (lpc = 0; lpc < oc->m_n_member; lpc++) {
249 crm_update_ccm_node(oc, lpc + oc->m_memb_idx, CRM_NODE_MEMBER, instance);
250 }
251 heartbeat_cluster->llc_ops->client_status(heartbeat_cluster, NULL, crm_system_name, 0);
252 }
253
254 if (event == OC_EV_MS_EVICTED) {
255 crm_node_t *peer = crm_get_peer(0, fsa_our_uname);
256
257 crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_EVICTED, 0);
258
259 /* todo: drop back to S_PENDING instead */
260 /* get out... NOW!
261 *
262 * go via the error recovery process so that HA will
263 * restart us if required
264 */
265 register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__);
266 }
267
268 post_cache_update(instance);
269 return;
270 }
271
272 int
ccm_dispatch(gpointer user_data)273 ccm_dispatch(gpointer user_data)
274 {
275 int rc = 0;
276 oc_ev_t *ccm_token = (oc_ev_t *) user_data;
277 gboolean was_error = FALSE;
278
279 crm_trace("Invoked");
280 if (ccm_api_handle_event == NULL) {
281 ccm_api_handle_event =
282 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event", 1);
283 }
284 rc = (*ccm_api_handle_event) (ccm_token);
285
286 if (rc != 0) {
287 if (is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) {
288 /* we signed out, so this is expected */
289 register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL);
290 crm_err("CCM connection appears to have failed: rc=%d.", rc);
291 }
292 was_error = TRUE;
293 }
294
295 trigger_fsa(fsa_source);
296 if (was_error) {
297 return -1;
298 }
299
300 return 0;
301 }
302
303 void
crmd_ccm_msg_callback(oc_ed_t event,void * cookie,size_t size,const void * data)304 crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data)
305 {
306 gboolean update_cache = FALSE;
307 const oc_ev_membership_t *membership = data;
308
309 gboolean update_quorum = FALSE;
310
311 crm_trace("Invoked");
312 CRM_ASSERT(data != NULL);
313
314 crm_info("Quorum %s after event=%s (id=%d)",
315 ccm_have_quorum(event) ? "(re)attained" : "lost",
316 ccm_event_name(event), membership->m_instance);
317
318 if (crm_peer_seq > membership->m_instance) {
319 crm_err("Membership instance ID went backwards! %llu->%d",
320 crm_peer_seq, membership->m_instance);
321 CRM_ASSERT(crm_peer_seq <= membership->m_instance);
322 return;
323 }
324
325 /*
326 * OC_EV_MS_NEW_MEMBERSHIP: membership with quorum
327 * OC_EV_MS_MS_INVALID: membership without quorum
328 * OC_EV_MS_NOT_PRIMARY: previous membership no longer valid
329 * OC_EV_MS_PRIMARY_RESTORED: previous membership restored
330 * OC_EV_MS_EVICTED: the client is evicted from ccm.
331 */
332
333 switch (event) {
334 case OC_EV_MS_NEW_MEMBERSHIP:
335 case OC_EV_MS_INVALID:
336 update_cache = TRUE;
337 update_quorum = TRUE;
338 break;
339 case OC_EV_MS_NOT_PRIMARY:
340 break;
341 case OC_EV_MS_PRIMARY_RESTORED:
342 update_cache = TRUE;
343 crm_peer_seq = membership->m_instance;
344 break;
345 case OC_EV_MS_EVICTED:
346 update_quorum = TRUE;
347 register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL);
348 crm_err("Shutting down after CCM event: %s", ccm_event_name(event));
349 break;
350 default:
351 crm_err("Unknown CCM event: %d", event);
352 }
353
354 if (update_quorum) {
355 crm_have_quorum = ccm_have_quorum(event);
356 if (crm_have_quorum == FALSE) {
357 /* did we just lose quorum? */
358 if (fsa_has_quorum) {
359 crm_info("Quorum lost: %s", ccm_event_name(event));
360 }
361 }
362 crm_update_quorum(crm_have_quorum, FALSE);
363 }
364
365 if (update_cache) {
366 crm_trace("Updating cache after event %s", ccm_event_name(event));
367 do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL);
368
369 } else if (event != OC_EV_MS_NOT_PRIMARY) {
370 crm_peer_seq = membership->m_instance;
371 register_fsa_action(A_TE_CANCEL);
372 }
373
374 if (ccm_api_callback_done == NULL) {
375 ccm_api_callback_done =
376 find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done", 1);
377 }
378 (*ccm_api_callback_done) (cookie);
379 return;
380 }
381
382 void
crmd_ha_status_callback(const char * node,const char * status,void * private)383 crmd_ha_status_callback(const char *node, const char *status, void *private)
384 {
385 xmlNode *update = NULL;
386 crm_node_t *peer = NULL;
387
388 crm_notice("Status update: Node %s now has status [%s]", node, status);
389
390 peer = crm_get_peer(0, node);
391 if (safe_str_eq(status, PINGSTATUS)) {
392 return;
393 }
394
395 if (safe_str_eq(status, DEADSTATUS)) {
396 /* this node is toast */
397 crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd|crm_proc_heartbeat, OFFLINESTATUS);
398
399 } else {
400 crm_update_peer_proc(__FUNCTION__, peer, crm_proc_heartbeat, ONLINESTATUS);
401 }
402
403 trigger_fsa(fsa_source);
404
405 if (AM_I_DC) {
406 update = create_node_state_update(peer, node_update_cluster, NULL,
407 __FUNCTION__);
408 fsa_cib_anon_update(XML_CIB_TAG_STATUS, update);
409 free_xml(update);
410 }
411 }
412
413 void
crmd_client_status_callback(const char * node,const char * client,const char * status,void * private)414 crmd_client_status_callback(const char *node, const char *client, const char *status, void *private)
415 {
416 crm_node_t *peer = NULL;
417
418 crm_trace("Invoked");
419 if (safe_str_neq(client, CRM_SYSTEM_CRMD)) {
420 return;
421 }
422
423 peer = crm_get_peer(0, node);
424
425 if (safe_str_neq(peer->state, CRM_NODE_MEMBER)) {
426 crm_warn("This peer is not a ccm member (yet). "
427 "Status ignored: Client %s/%s announced status [%s] (DC=%s)",
428 node, client, status, AM_I_DC ? "true" : "false");
429 return;
430 }
431
432 set_bit(fsa_input_register, R_PEER_DATA);
433
434 crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)",
435 node, client, status, AM_I_DC ? "true" : "false");
436
437 /* rest of the code, especially crm_update_peer_proc,
438 * does not know about JOINSTATUS, but expects ONLINESTATUS.
439 * See also cib/callbacks.c */
440 if (safe_str_eq(status, JOINSTATUS)) {
441 status = ONLINESTATUS;
442 } else if (safe_str_eq(status, LEAVESTATUS)) {
443 status = OFFLINESTATUS;
444 }
445
446 if (safe_str_eq(status, ONLINESTATUS)) {
447 /* remove the cached value in case it changed */
448 crm_trace("Uncaching UUID for %s", node);
449 free(peer->uuid);
450 peer->uuid = NULL;
451 }
452
453 crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd, status);
454
455 if (AM_I_DC) {
456 xmlNode *update = NULL;
457
458 crm_trace("Got client status callback");
459 update = create_node_state_update(peer, node_update_peer, NULL,
460 __FUNCTION__);
461 fsa_cib_anon_update(XML_CIB_TAG_STATUS, update);
462 free_xml(update);
463 }
464 }
465
466 void
crmd_ha_msg_callback(HA_Message * hamsg,void * private_data)467 crmd_ha_msg_callback(HA_Message * hamsg, void *private_data)
468 {
469 int level = LOG_DEBUG;
470 crm_node_t *from_node = NULL;
471
472 xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__);
473 const char *from = crm_element_value(msg, F_ORIG);
474 const char *op = crm_element_value(msg, F_CRM_TASK);
475 const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
476
477 CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail);
478
479 crm_trace("HA[inbound]: %s from %s", op, from);
480
481 if (crm_peer_cache == NULL || crm_active_peers() == 0) {
482 crm_debug("Ignoring HA messages until we are"
483 " connected to the CCM (%s op from %s)", op, from);
484 crm_log_xml_trace(msg, "HA[inbound]: Ignore (No CCM)");
485 goto bail;
486 }
487
488 from_node = crm_get_peer(0, from);
489 if (crm_is_peer_active(from_node) == FALSE) {
490 if (safe_str_eq(op, CRM_OP_VOTE)) {
491 level = LOG_WARNING;
492
493 } else if (AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) {
494 level = LOG_WARNING;
495
496 } else if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) {
497 level = LOG_WARNING;
498 }
499 do_crm_log(level,
500 "Ignoring HA message (op=%s) from %s: not in our"
501 " membership list (size=%d)", op, from, crm_active_peers());
502
503 crm_log_xml_trace(msg, "HA[inbound]: CCM Discard");
504
505 } else {
506 crmd_ha_msg_filter(msg);
507 }
508
509 bail:
510 free_xml(msg);
511 return;
512 }
513
514 gboolean
crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn,gpointer user_data)515 crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data)
516 {
517 IPC_Channel *channel = NULL;
518 gboolean stay_connected = TRUE;
519
520 crm_trace("Invoked");
521
522 if (cluster_conn != NULL) {
523 channel = cluster_conn->llc_ops->ipcchan(cluster_conn);
524 }
525
526 CRM_CHECK(cluster_conn != NULL,;);
527 CRM_CHECK(channel != NULL,;);
528
529 if (channel != NULL && IPC_ISRCONN(channel)) {
530 if (cluster_conn->llc_ops->msgready(cluster_conn) == 0) {
531 crm_trace("no message ready yet");
532 }
533 /* invoke the callbacks but don't block */
534 cluster_conn->llc_ops->rcvmsg(cluster_conn, 0);
535 }
536
537 if (channel == NULL || channel->ch_status != IPC_CONNECT) {
538 if (is_set(fsa_input_register, R_HA_DISCONNECTED) == FALSE) {
539 crm_crit("Lost connection to heartbeat service.");
540 } else {
541 crm_info("Lost connection to heartbeat service.");
542 }
543 trigger_fsa(fsa_source);
544 stay_connected = FALSE;
545 }
546
547 return stay_connected;
548 }
549