1 /*
2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License as published by the Free Software Foundation; either
7  * version 2 of the License, or (at your option) any later version.
8  *
9  * This software is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this library; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 
19 /* put these first so that uuid_t is defined without conflicts */
20 #include <crm_internal.h>
21 
22 #include <string.h>
23 
24 #include <crm/crm.h>
25 #include <crm/cib.h>
26 #include <crm/msg_xml.h>
27 #include <crm/common/xml.h>
28 #include <crm/cluster.h>
29 #include <crmd_messages.h>
30 #include <crmd_fsa.h>
31 #include <fsa_proto.h>
32 #include <crmd_callbacks.h>
33 #include <tengine.h>
34 #include <membership.h>
35 
36 #include <ocf/oc_event.h>
37 #include <ocf/oc_membership.h>
38 
39 void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int);
40 void ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event);
41 gboolean crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data);
42 void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data);
43 int ccm_dispatch(gpointer user_data);
44 
45 #define CCM_EVENT_DETAIL 0
46 #define CCM_EVENT_DETAIL_PARTIAL 0
47 
48 int (*ccm_api_callback_done) (void *cookie) = NULL;
49 int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL;
50 
51 static oc_ev_t *fsa_ev_token;
52 static void *ccm_library = NULL;
53 static int num_ccm_register_fails = 0;
54 static int max_ccm_register_fails = 30;
55 
/*
 * Destroy hook for the CCM file descriptor callbacks registered by
 * do_ccm_control().  There is nothing to release, so it does nothing.
 */
static void
ccm_connection_destroy(void *userdata)
{
    /* intentionally empty */
}
60 
61 /*	 A_CCM_CONNECT	*/
62 void
do_ccm_control(long long action,enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,enum crmd_fsa_input current_input,fsa_data_t * msg_data)63 do_ccm_control(long long action,
64                enum crmd_fsa_cause cause,
65                enum crmd_fsa_state cur_state,
66                enum crmd_fsa_input current_input, fsa_data_t * msg_data)
67 {
68     static struct mainloop_fd_callbacks ccm_fd_callbacks = {
69         .dispatch = ccm_dispatch,
70         .destroy = ccm_connection_destroy,
71     };
72 
73     if (is_heartbeat_cluster()) {
74         int (*ccm_api_register) (oc_ev_t ** token) =
75             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1);
76 
77         int (*ccm_api_set_callback) (const oc_ev_t * token,
78                                      oc_ev_class_t class,
79                                      oc_ev_callback_t * fn,
80                                      oc_ev_callback_t ** prev_fn) =
81             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1);
82 
83         void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) =
84             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1);
85         int (*ccm_api_activate) (const oc_ev_t * token, int *fd) =
86             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1);
87         int (*ccm_api_unregister) (oc_ev_t * token) =
88             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1);
89 
90         if (action & A_CCM_DISCONNECT) {
91             set_bit(fsa_input_register, R_CCM_DISCONNECTED);
92             (*ccm_api_unregister) (fsa_ev_token);
93         }
94 
95         if (action & A_CCM_CONNECT) {
96             int ret;
97             int fsa_ev_fd;
98             gboolean did_fail = FALSE;
99 
100             crm_trace("Registering with CCM");
101             clear_bit(fsa_input_register, R_CCM_DISCONNECTED);
102             ret = (*ccm_api_register) (&fsa_ev_token);
103             if (ret != 0) {
104                 crm_warn("CCM registration failed");
105                 did_fail = TRUE;
106             }
107 
108             if (did_fail == FALSE) {
109                 crm_trace("Setting up CCM callbacks");
110                 ret = (*ccm_api_set_callback) (fsa_ev_token, OC_EV_MEMB_CLASS,
111                                                crmd_ccm_msg_callback, NULL);
112                 if (ret != 0) {
113                     crm_warn("CCM callback not set");
114                     did_fail = TRUE;
115                 }
116             }
117             if (did_fail == FALSE) {
118                 (*ccm_api_special) (fsa_ev_token, OC_EV_MEMB_CLASS, 0 /*don't care */ );
119 
120                 crm_trace("Activating CCM token");
121                 ret = (*ccm_api_activate) (fsa_ev_token, &fsa_ev_fd);
122                 if (ret != 0) {
123                     crm_warn("CCM Activation failed");
124                     did_fail = TRUE;
125                 }
126             }
127 
128             if (did_fail) {
129                 num_ccm_register_fails++;
130                 (*ccm_api_unregister) (fsa_ev_token);
131 
132                 if (num_ccm_register_fails < max_ccm_register_fails) {
133                     crm_warn("CCM Connection failed"
134                              " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails);
135 
136                     crm_timer_start(wait_timer);
137                     crmd_fsa_stall(FALSE);
138                     return;
139 
140                 } else {
141                     crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails);
142                     register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
143                     return;
144                 }
145             }
146 
147             crm_info("CCM connection established... waiting for first callback");
148             mainloop_add_fd("heartbeat-ccm", G_PRIORITY_HIGH, fsa_ev_fd, fsa_ev_token,
149                             &ccm_fd_callbacks);
150 
151         }
152     }
153 
154     if (action & ~(A_CCM_CONNECT | A_CCM_DISCONNECT)) {
155         crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__);
156     }
157 }
158 
159 void
ccm_event_detail(const oc_ev_membership_t * oc,oc_ed_t event)160 ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event)
161 {
162     int lpc;
163     gboolean member = FALSE;
164 
165     member = FALSE;
166 
167     crm_trace("-----------------------");
168     crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, "
169              "new_idx=%d, old_idx=%d",
170              ccm_event_name(event),
171              oc->m_instance,
172              oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx);
173 
174 #if !CCM_EVENT_DETAIL_PARTIAL
175     for (lpc = 0; lpc < oc->m_n_member; lpc++) {
176         crm_info("\tCURRENT: %s [nodeid=%d, born=%d]",
177                  oc->m_array[oc->m_memb_idx + lpc].node_uname,
178                  oc->m_array[oc->m_memb_idx + lpc].node_id,
179                  oc->m_array[oc->m_memb_idx + lpc].node_born_on);
180 
181         if (safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx + lpc].node_uname)) {
182             member = TRUE;
183         }
184     }
185     if (member == FALSE) {
186         crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST");
187     }
188 #endif
189     for (lpc = 0; lpc < (int)oc->m_n_in; lpc++) {
190         crm_info("\tNEW:     %s [nodeid=%d, born=%d]",
191                  oc->m_array[oc->m_in_idx + lpc].node_uname,
192                  oc->m_array[oc->m_in_idx + lpc].node_id,
193                  oc->m_array[oc->m_in_idx + lpc].node_born_on);
194     }
195 
196     for (lpc = 0; lpc < (int)oc->m_n_out; lpc++) {
197         crm_info("\tLOST:    %s [nodeid=%d, born=%d]",
198                  oc->m_array[oc->m_out_idx + lpc].node_uname,
199                  oc->m_array[oc->m_out_idx + lpc].node_id,
200                  oc->m_array[oc->m_out_idx + lpc].node_born_on);
201     }
202 
203     crm_trace("-----------------------");
204 
205 }
206 
207 /*	 A_CCM_UPDATE_CACHE	*/
208 /*
209  * Take the opportunity to update the node status in the CIB as well
210  */
211 void
do_ccm_update_cache(enum crmd_fsa_cause cause,enum crmd_fsa_state cur_state,oc_ed_t event,const oc_ev_membership_t * oc,xmlNode * xml)212 do_ccm_update_cache(enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state,
213                     oc_ed_t event, const oc_ev_membership_t * oc, xmlNode * xml)
214 {
215     unsigned long long instance = 0;
216     unsigned int lpc = 0;
217 
218     if (is_heartbeat_cluster()) {
219         CRM_ASSERT(oc != NULL);
220         instance = oc->m_instance;
221     }
222 
223     CRM_ASSERT(crm_peer_seq <= instance);
224 
225     switch (cur_state) {
226         case S_STOPPING:
227         case S_TERMINATE:
228         case S_HALT:
229             crm_debug("Ignoring %s CCM event %llu, we're in state %s",
230                       ccm_event_name(event), instance, fsa_state2string(cur_state));
231             return;
232         case S_ELECTION:
233             register_fsa_action(A_ELECTION_CHECK);
234             break;
235         default:
236             break;
237     }
238 
239     if (is_heartbeat_cluster()) {
240         ccm_event_detail(oc, event);
241 
242         /*--*-- Recently Dead Member Nodes --*--*/
243         for (lpc = 0; lpc < oc->m_n_out; lpc++) {
244             crm_update_ccm_node(oc, lpc + oc->m_out_idx, CRM_NODE_LOST, instance);
245         }
246 
247             /*--*-- All Member Nodes --*--*/
248         for (lpc = 0; lpc < oc->m_n_member; lpc++) {
249             crm_update_ccm_node(oc, lpc + oc->m_memb_idx, CRM_NODE_MEMBER, instance);
250         }
251         heartbeat_cluster->llc_ops->client_status(heartbeat_cluster, NULL, crm_system_name, 0);
252     }
253 
254     if (event == OC_EV_MS_EVICTED) {
255         crm_node_t *peer = crm_get_peer(0, fsa_our_uname);
256 
257         crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_EVICTED, 0);
258 
259         /* todo: drop back to S_PENDING instead */
260         /* get out... NOW!
261          *
262          * go via the error recovery process so that HA will
263          *    restart us if required
264          */
265         register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__);
266     }
267 
268     post_cache_update(instance);
269     return;
270 }
271 
272 int
ccm_dispatch(gpointer user_data)273 ccm_dispatch(gpointer user_data)
274 {
275     int rc = 0;
276     oc_ev_t *ccm_token = (oc_ev_t *) user_data;
277     gboolean was_error = FALSE;
278 
279     crm_trace("Invoked");
280     if (ccm_api_handle_event == NULL) {
281         ccm_api_handle_event =
282             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event", 1);
283     }
284     rc = (*ccm_api_handle_event) (ccm_token);
285 
286     if (rc != 0) {
287         if (is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) {
288             /* we signed out, so this is expected */
289             register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL);
290             crm_err("CCM connection appears to have failed: rc=%d.", rc);
291         }
292         was_error = TRUE;
293     }
294 
295     trigger_fsa(fsa_source);
296     if (was_error) {
297         return -1;
298     }
299 
300     return 0;
301 }
302 
303 void
crmd_ccm_msg_callback(oc_ed_t event,void * cookie,size_t size,const void * data)304 crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data)
305 {
306     gboolean update_cache = FALSE;
307     const oc_ev_membership_t *membership = data;
308 
309     gboolean update_quorum = FALSE;
310 
311     crm_trace("Invoked");
312     CRM_ASSERT(data != NULL);
313 
314     crm_info("Quorum %s after event=%s (id=%d)",
315              ccm_have_quorum(event) ? "(re)attained" : "lost",
316              ccm_event_name(event), membership->m_instance);
317 
318     if (crm_peer_seq > membership->m_instance) {
319         crm_err("Membership instance ID went backwards! %llu->%d",
320                 crm_peer_seq, membership->m_instance);
321         CRM_ASSERT(crm_peer_seq <= membership->m_instance);
322         return;
323     }
324 
325     /*
326      * OC_EV_MS_NEW_MEMBERSHIP:   membership with quorum
327      * OC_EV_MS_MS_INVALID:       membership without quorum
328      * OC_EV_MS_NOT_PRIMARY:      previous membership no longer valid
329      * OC_EV_MS_PRIMARY_RESTORED: previous membership restored
330      * OC_EV_MS_EVICTED:          the client is evicted from ccm.
331      */
332 
333     switch (event) {
334         case OC_EV_MS_NEW_MEMBERSHIP:
335         case OC_EV_MS_INVALID:
336             update_cache = TRUE;
337             update_quorum = TRUE;
338             break;
339         case OC_EV_MS_NOT_PRIMARY:
340             break;
341         case OC_EV_MS_PRIMARY_RESTORED:
342             update_cache = TRUE;
343             crm_peer_seq = membership->m_instance;
344             break;
345         case OC_EV_MS_EVICTED:
346             update_quorum = TRUE;
347             register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL);
348             crm_err("Shutting down after CCM event: %s", ccm_event_name(event));
349             break;
350         default:
351             crm_err("Unknown CCM event: %d", event);
352     }
353 
354     if (update_quorum) {
355         crm_have_quorum = ccm_have_quorum(event);
356         if (crm_have_quorum == FALSE) {
357             /* did we just lose quorum? */
358             if (fsa_has_quorum) {
359                 crm_info("Quorum lost: %s", ccm_event_name(event));
360             }
361         }
362         crm_update_quorum(crm_have_quorum, FALSE);
363     }
364 
365     if (update_cache) {
366         crm_trace("Updating cache after event %s", ccm_event_name(event));
367         do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL);
368 
369     } else if (event != OC_EV_MS_NOT_PRIMARY) {
370         crm_peer_seq = membership->m_instance;
371         register_fsa_action(A_TE_CANCEL);
372     }
373 
374     if (ccm_api_callback_done == NULL) {
375         ccm_api_callback_done =
376             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done", 1);
377     }
378     (*ccm_api_callback_done) (cookie);
379     return;
380 }
381 
382 void
crmd_ha_status_callback(const char * node,const char * status,void * private)383 crmd_ha_status_callback(const char *node, const char *status, void *private)
384 {
385     xmlNode *update = NULL;
386     crm_node_t *peer = NULL;
387 
388     crm_notice("Status update: Node %s now has status [%s]", node, status);
389 
390     peer = crm_get_peer(0, node);
391     if (safe_str_eq(status, PINGSTATUS)) {
392         return;
393     }
394 
395     if (safe_str_eq(status, DEADSTATUS)) {
396         /* this node is toast */
397         crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd|crm_proc_heartbeat, OFFLINESTATUS);
398 
399     } else {
400         crm_update_peer_proc(__FUNCTION__, peer, crm_proc_heartbeat, ONLINESTATUS);
401     }
402 
403     trigger_fsa(fsa_source);
404 
405     if (AM_I_DC) {
406         update = create_node_state_update(peer, node_update_cluster, NULL,
407                                           __FUNCTION__);
408         fsa_cib_anon_update(XML_CIB_TAG_STATUS, update);
409         free_xml(update);
410     }
411 }
412 
413 void
crmd_client_status_callback(const char * node,const char * client,const char * status,void * private)414 crmd_client_status_callback(const char *node, const char *client, const char *status, void *private)
415 {
416     crm_node_t *peer = NULL;
417 
418     crm_trace("Invoked");
419     if (safe_str_neq(client, CRM_SYSTEM_CRMD)) {
420         return;
421     }
422 
423     peer = crm_get_peer(0, node);
424 
425     if (safe_str_neq(peer->state, CRM_NODE_MEMBER)) {
426         crm_warn("This peer is not a ccm member (yet). "
427             "Status ignored: Client %s/%s announced status [%s] (DC=%s)",
428             node, client, status, AM_I_DC ? "true" : "false");
429         return;
430     }
431 
432     set_bit(fsa_input_register, R_PEER_DATA);
433 
434     crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)",
435                node, client, status, AM_I_DC ? "true" : "false");
436 
437     /* rest of the code, especially crm_update_peer_proc,
438      * does not know about JOINSTATUS, but expects ONLINESTATUS.
439      * See also cib/callbacks.c */
440     if (safe_str_eq(status, JOINSTATUS)) {
441         status = ONLINESTATUS;
442     }  else if (safe_str_eq(status, LEAVESTATUS)) {
443         status = OFFLINESTATUS;
444     }
445 
446     if (safe_str_eq(status, ONLINESTATUS)) {
447         /* remove the cached value in case it changed */
448         crm_trace("Uncaching UUID for %s", node);
449         free(peer->uuid);
450         peer->uuid = NULL;
451     }
452 
453     crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd, status);
454 
455     if (AM_I_DC) {
456         xmlNode *update = NULL;
457 
458         crm_trace("Got client status callback");
459         update = create_node_state_update(peer, node_update_peer, NULL,
460                                           __FUNCTION__);
461         fsa_cib_anon_update(XML_CIB_TAG_STATUS, update);
462         free_xml(update);
463     }
464 }
465 
466 void
crmd_ha_msg_callback(HA_Message * hamsg,void * private_data)467 crmd_ha_msg_callback(HA_Message * hamsg, void *private_data)
468 {
469     int level = LOG_DEBUG;
470     crm_node_t *from_node = NULL;
471 
472     xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__);
473     const char *from = crm_element_value(msg, F_ORIG);
474     const char *op = crm_element_value(msg, F_CRM_TASK);
475     const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
476 
477     CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail);
478 
479     crm_trace("HA[inbound]: %s from %s", op, from);
480 
481     if (crm_peer_cache == NULL || crm_active_peers() == 0) {
482         crm_debug("Ignoring HA messages until we are"
483                   " connected to the CCM (%s op from %s)", op, from);
484         crm_log_xml_trace(msg, "HA[inbound]: Ignore (No CCM)");
485         goto bail;
486     }
487 
488     from_node = crm_get_peer(0, from);
489     if (crm_is_peer_active(from_node) == FALSE) {
490         if (safe_str_eq(op, CRM_OP_VOTE)) {
491             level = LOG_WARNING;
492 
493         } else if (AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) {
494             level = LOG_WARNING;
495 
496         } else if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) {
497             level = LOG_WARNING;
498         }
499         do_crm_log(level,
500                    "Ignoring HA message (op=%s) from %s: not in our"
501                    " membership list (size=%d)", op, from, crm_active_peers());
502 
503         crm_log_xml_trace(msg, "HA[inbound]: CCM Discard");
504 
505     } else {
506         crmd_ha_msg_filter(msg);
507     }
508 
509   bail:
510     free_xml(msg);
511     return;
512 }
513 
514 gboolean
crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn,gpointer user_data)515 crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data)
516 {
517     IPC_Channel *channel = NULL;
518     gboolean stay_connected = TRUE;
519 
520     crm_trace("Invoked");
521 
522     if (cluster_conn != NULL) {
523         channel = cluster_conn->llc_ops->ipcchan(cluster_conn);
524     }
525 
526     CRM_CHECK(cluster_conn != NULL,;);
527     CRM_CHECK(channel != NULL,;);
528 
529     if (channel != NULL && IPC_ISRCONN(channel)) {
530         if (cluster_conn->llc_ops->msgready(cluster_conn) == 0) {
531             crm_trace("no message ready yet");
532         }
533         /* invoke the callbacks but don't block */
534         cluster_conn->llc_ops->rcvmsg(cluster_conn, 0);
535     }
536 
537     if (channel == NULL || channel->ch_status != IPC_CONNECT) {
538         if (is_set(fsa_input_register, R_HA_DISCONNECTED) == FALSE) {
539             crm_crit("Lost connection to heartbeat service.");
540         } else {
541             crm_info("Lost connection to heartbeat service.");
542         }
543         trigger_fsa(fsa_source);
544         stay_connected = FALSE;
545     }
546 
547     return stay_connected;
548 }
549