1 /*
2 * Copyright 2004-2021 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU Lesser General Public License
7 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 */
9
10 #include <crm_internal.h>
11
12 #ifndef _GNU_SOURCE
13 # define _GNU_SOURCE
14 #endif
15
16 #include <sys/param.h>
17 #include <sys/types.h>
18 #include <stdio.h>
19 #include <unistd.h>
20 #include <string.h>
21 #include <glib.h>
22 #include <crm/common/ipc.h>
23 #include <crm/common/xml_internal.h>
24 #include <crm/cluster/internal.h>
25 #include <crm/msg_xml.h>
26 #include <crm/stonith-ng.h>
27 #include "crmcluster_private.h"
28
/* The peer cache remembers cluster nodes that have been seen.
 * This is managed mostly automatically by libcluster, based on
 * cluster membership events.
 *
 * Because cluster nodes can have conflicting names or UUIDs,
 * the hash table key is a uniquely generated ID (created in crm_get_peer()
 * when a new entry is added); each value is a crm_node_t.
 */
GHashTable *crm_peer_cache = NULL;
37
/*
 * The remote peer cache tracks pacemaker_remote nodes. While the
 * value has the same type as the peer cache's, it is tracked separately for
 * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs,
 * so the name (which is also the UUID) is used as the hash table key; there
 * is no equivalent of membership events, so management is not automatic; and
 * most users of the peer cache need to exclude pacemaker_remote nodes.
 *
 * That said, using a single cache would be more logical and less error-prone,
 * so it would be a good idea to merge them one day.
 *
 * libcluster provides two avenues for populating the cache:
 * crm_remote_peer_get() and crm_remote_peer_cache_remove() directly manage it,
 * while crm_remote_peer_cache_refresh() populates it via the CIB.
 *
 * The key of each entry is the entry's own uuid string, so the table is
 * created without a key destructor (see crm_peer_init()).
 */
GHashTable *crm_remote_peer_cache = NULL;
54
/*
 * The known node cache tracks cluster and remote nodes that have been seen in
 * the CIB. It is useful mainly when a caller needs to know about a node that
 * may no longer be in the membership, but doesn't want to add the node to the
 * main peer cache tables.
 *
 * It is created by crm_peer_init() and freed by crm_peer_destroy().
 */
static GHashTable *known_node_cache = NULL;
62
// NOTE(review): presumably the latest membership sequence number; it is not
// updated in this part of the file -- confirm against the cluster layer
unsigned long long crm_peer_seq = 0;

// NOTE(review): presumably whether the local partition has quorum; it is not
// updated in this part of the file -- confirm against the cluster layer
gboolean crm_have_quorum = FALSE;

// Whether lost cluster peers are reaped automatically from the peer cache
// (changed via crm_set_autoreap())
static gboolean crm_autoreap = TRUE;
66
// Flag setting and clearing for crm_node_t:flags

/* Set flag bits in a peer's flags member.
 *
 * The new value comes from pcmk__set_flags_as(), which is also given the
 * calling function and line, LOG_TRACE, the peer's uname, and the stringified
 * flag expression (presumably for trace logging of the change).
 *
 * Note that (peer) is evaluated more than once, so it must not have side
 * effects.
 */
#define set_peer_flags(peer, flags_to_set) do {                               \
        (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE,     \
                                           "Peer", (peer)->uname,             \
                                           (peer)->flags, (flags_to_set),     \
                                           #flags_to_set);                    \
    } while (0)
75
/* Clear flag bits in a peer's flags member.
 *
 * The new value comes from pcmk__clear_flags_as(), which is also given the
 * calling function and line, LOG_TRACE, the peer's uname, and the stringified
 * flag expression (presumably for trace logging of the change).
 *
 * Note that (peer) is evaluated more than once, so it must not have side
 * effects.
 */
#define clear_peer_flags(peer, flags_to_clear) do {                           \
        (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__,              \
                                             LOG_TRACE,                       \
                                             "Peer", (peer)->uname,           \
                                             (peer)->flags, (flags_to_clear), \
                                             #flags_to_clear);                \
    } while (0)
83
84 static void update_peer_uname(crm_node_t *node, const char *uname);
85
86 int
crm_remote_peer_cache_size(void)87 crm_remote_peer_cache_size(void)
88 {
89 if (crm_remote_peer_cache == NULL) {
90 return 0;
91 }
92 return g_hash_table_size(crm_remote_peer_cache);
93 }
94
95 /*!
96 * \brief Get a remote node peer cache entry, creating it if necessary
97 *
98 * \param[in] node_name Name of remote node
99 *
100 * \return Cache entry for node on success, NULL (and set errno) otherwise
101 *
102 * \note When creating a new entry, this will leave the node state undetermined,
103 * so the caller should also call pcmk__update_peer_state() if the state
104 * is known.
105 */
106 crm_node_t *
crm_remote_peer_get(const char * node_name)107 crm_remote_peer_get(const char *node_name)
108 {
109 crm_node_t *node;
110
111 if (node_name == NULL) {
112 errno = -EINVAL;
113 return NULL;
114 }
115
116 /* Return existing cache entry if one exists */
117 node = g_hash_table_lookup(crm_remote_peer_cache, node_name);
118 if (node) {
119 return node;
120 }
121
122 /* Allocate a new entry */
123 node = calloc(1, sizeof(crm_node_t));
124 if (node == NULL) {
125 return NULL;
126 }
127
128 /* Populate the essential information */
129 set_peer_flags(node, crm_remote_node);
130 node->uuid = strdup(node_name);
131 if (node->uuid == NULL) {
132 free(node);
133 errno = -ENOMEM;
134 return NULL;
135 }
136
137 /* Add the new entry to the cache */
138 g_hash_table_replace(crm_remote_peer_cache, node->uuid, node);
139 crm_trace("added %s to remote cache", node_name);
140
141 /* Update the entry's uname, ensuring peer status callbacks are called */
142 update_peer_uname(node, node_name);
143 return node;
144 }
145
146 void
crm_remote_peer_cache_remove(const char * node_name)147 crm_remote_peer_cache_remove(const char *node_name)
148 {
149 if (g_hash_table_remove(crm_remote_peer_cache, node_name)) {
150 crm_trace("removed %s from remote peer cache", node_name);
151 }
152 }
153
154 /*!
155 * \internal
156 * \brief Return node status based on a CIB status entry
157 *
158 * \param[in] node_state XML of node state
159 *
160 * \return CRM_NODE_LOST if XML_NODE_IN_CLUSTER is false in node_state,
161 * CRM_NODE_MEMBER otherwise
162 * \note Unlike most boolean XML attributes, this one defaults to true, for
163 * backward compatibility with older controllers that don't set it.
164 */
165 static const char *
remote_state_from_cib(xmlNode * node_state)166 remote_state_from_cib(xmlNode *node_state)
167 {
168 const char *status;
169
170 status = crm_element_value(node_state, XML_NODE_IN_CLUSTER);
171 if (status && !crm_is_true(status)) {
172 status = CRM_NODE_LOST;
173 } else {
174 status = CRM_NODE_MEMBER;
175 }
176 return status;
177 }
178
/* User data for looping through remote node xpath searches
 * (passed to remote_cache_refresh_helper() for each search result)
 */
struct refresh_data {
    const char *field;  /* XML attribute to check for node name */
    gboolean has_state; /* whether to update node state based on XML */
};
184
185 /*!
186 * \internal
187 * \brief Process one pacemaker_remote node xpath search result
188 *
189 * \param[in] result XML search result
190 * \param[in] user_data what to look for in the XML
191 */
192 static void
remote_cache_refresh_helper(xmlNode * result,void * user_data)193 remote_cache_refresh_helper(xmlNode *result, void *user_data)
194 {
195 struct refresh_data *data = user_data;
196 const char *remote = crm_element_value(result, data->field);
197 const char *state = NULL;
198 crm_node_t *node;
199
200 CRM_CHECK(remote != NULL, return);
201
202 /* Determine node's state, if the result has it */
203 if (data->has_state) {
204 state = remote_state_from_cib(result);
205 }
206
207 /* Check whether cache already has entry for node */
208 node = g_hash_table_lookup(crm_remote_peer_cache, remote);
209
210 if (node == NULL) {
211 /* Node is not in cache, so add a new entry for it */
212 node = crm_remote_peer_get(remote);
213 CRM_ASSERT(node);
214 if (state) {
215 pcmk__update_peer_state(__func__, node, state, 0);
216 }
217
218 } else if (pcmk_is_set(node->flags, crm_node_dirty)) {
219 /* Node is in cache and hasn't been updated already, so mark it clean */
220 clear_peer_flags(node, crm_node_dirty);
221 if (state) {
222 pcmk__update_peer_state(__func__, node, state, 0);
223 }
224 }
225 }
226
227 static void
mark_dirty(gpointer key,gpointer value,gpointer user_data)228 mark_dirty(gpointer key, gpointer value, gpointer user_data)
229 {
230 set_peer_flags((crm_node_t *) value, crm_node_dirty);
231 }
232
233 static gboolean
is_dirty(gpointer key,gpointer value,gpointer user_data)234 is_dirty(gpointer key, gpointer value, gpointer user_data)
235 {
236 return pcmk_is_set(((crm_node_t*)value)->flags, crm_node_dirty);
237 }
238
239 /*!
240 * \brief Repopulate the remote peer cache based on CIB XML
241 *
242 * \param[in] xmlNode CIB XML to parse
243 */
244 void
crm_remote_peer_cache_refresh(xmlNode * cib)245 crm_remote_peer_cache_refresh(xmlNode *cib)
246 {
247 struct refresh_data data;
248
249 crm_peer_init();
250
251 /* First, we mark all existing cache entries as dirty,
252 * so that later we can remove any that weren't in the CIB.
253 * We don't empty the cache, because we need to detect changes in state.
254 */
255 g_hash_table_foreach(crm_remote_peer_cache, mark_dirty, NULL);
256
257 /* Look for guest nodes and remote nodes in the status section */
258 data.field = "id";
259 data.has_state = TRUE;
260 crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_STATUS,
261 remote_cache_refresh_helper, &data);
262
263 /* Look for guest nodes and remote nodes in the configuration section,
264 * because they may have just been added and not have a status entry yet.
265 * In that case, the cached node state will be left NULL, so that the
266 * peer status callback isn't called until we're sure the node started
267 * successfully.
268 */
269 data.field = "value";
270 data.has_state = FALSE;
271 crm_foreach_xpath_result(cib, PCMK__XP_GUEST_NODE_CONFIG,
272 remote_cache_refresh_helper, &data);
273 data.field = "id";
274 data.has_state = FALSE;
275 crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_CONFIG,
276 remote_cache_refresh_helper, &data);
277
278 /* Remove all old cache entries that weren't seen in the CIB */
279 g_hash_table_foreach_remove(crm_remote_peer_cache, is_dirty, NULL);
280 }
281
282 gboolean
crm_is_peer_active(const crm_node_t * node)283 crm_is_peer_active(const crm_node_t * node)
284 {
285 if(node == NULL) {
286 return FALSE;
287 }
288
289 if (pcmk_is_set(node->flags, crm_remote_node)) {
290 /* remote nodes are never considered active members. This
291 * guarantees they will never be considered for DC membership.*/
292 return FALSE;
293 }
294 #if SUPPORT_COROSYNC
295 if (is_corosync_cluster()) {
296 return crm_is_corosync_peer_active(node);
297 }
298 #endif
299 crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type()));
300 return FALSE;
301 }
302
303 static gboolean
crm_reap_dead_member(gpointer key,gpointer value,gpointer user_data)304 crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data)
305 {
306 crm_node_t *node = value;
307 crm_node_t *search = user_data;
308
309 if (search == NULL) {
310 return FALSE;
311
312 } else if (search->id && node->id != search->id) {
313 return FALSE;
314
315 } else if (search->id == 0 && !pcmk__str_eq(node->uname, search->uname, pcmk__str_casei)) {
316 return FALSE;
317
318 } else if (crm_is_peer_active(value) == FALSE) {
319 crm_info("Removing node with name %s and id %u from membership cache",
320 (node->uname? node->uname : "unknown"), node->id);
321 return TRUE;
322 }
323 return FALSE;
324 }
325
326 /*!
327 * \brief Remove all peer cache entries matching a node ID and/or uname
328 *
329 * \param[in] id ID of node to remove (or 0 to ignore)
330 * \param[in] name Uname of node to remove (or NULL to ignore)
331 *
332 * \return Number of cache entries removed
333 */
334 guint
reap_crm_member(uint32_t id,const char * name)335 reap_crm_member(uint32_t id, const char *name)
336 {
337 int matches = 0;
338 crm_node_t search;
339
340 if (crm_peer_cache == NULL) {
341 crm_trace("Membership cache not initialized, ignoring purge request");
342 return 0;
343 }
344
345 search.id = id;
346 search.uname = name ? strdup(name) : NULL;
347 matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, &search);
348 if(matches) {
349 crm_notice("Purged %d peer%s with id=%u%s%s from the membership cache",
350 matches, pcmk__plural_s(matches), search.id,
351 (search.uname? " and/or uname=" : ""),
352 (search.uname? search.uname : ""));
353
354 } else {
355 crm_info("No peers with id=%u%s%s to purge from the membership cache",
356 search.id, (search.uname? " and/or uname=" : ""),
357 (search.uname? search.uname : ""));
358 }
359
360 free(search.uname);
361 return matches;
362 }
363
364 static void
count_peer(gpointer key,gpointer value,gpointer user_data)365 count_peer(gpointer key, gpointer value, gpointer user_data)
366 {
367 guint *count = user_data;
368 crm_node_t *node = value;
369
370 if (crm_is_peer_active(node)) {
371 *count = *count + 1;
372 }
373 }
374
375 guint
crm_active_peers(void)376 crm_active_peers(void)
377 {
378 guint count = 0;
379
380 if (crm_peer_cache) {
381 g_hash_table_foreach(crm_peer_cache, count_peer, &count);
382 }
383 return count;
384 }
385
386 static void
destroy_crm_node(gpointer data)387 destroy_crm_node(gpointer data)
388 {
389 crm_node_t *node = data;
390
391 crm_trace("Destroying entry for node %u: %s", node->id, node->uname);
392
393 free(node->uname);
394 free(node->state);
395 free(node->uuid);
396 free(node->expected);
397 free(node);
398 }
399
400 void
crm_peer_init(void)401 crm_peer_init(void)
402 {
403 if (crm_peer_cache == NULL) {
404 crm_peer_cache = pcmk__strikey_table(free, destroy_crm_node);
405 }
406
407 if (crm_remote_peer_cache == NULL) {
408 crm_remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node);
409 }
410
411 if (known_node_cache == NULL) {
412 known_node_cache = pcmk__strikey_table(free, destroy_crm_node);
413 }
414 }
415
416 void
crm_peer_destroy(void)417 crm_peer_destroy(void)
418 {
419 if (crm_peer_cache != NULL) {
420 crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache));
421 g_hash_table_destroy(crm_peer_cache);
422 crm_peer_cache = NULL;
423 }
424
425 if (crm_remote_peer_cache != NULL) {
426 crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache));
427 g_hash_table_destroy(crm_remote_peer_cache);
428 crm_remote_peer_cache = NULL;
429 }
430
431 if (known_node_cache != NULL) {
432 crm_trace("Destroying known node cache with %d members",
433 g_hash_table_size(known_node_cache));
434 g_hash_table_destroy(known_node_cache);
435 known_node_cache = NULL;
436 }
437
438 }
439
/* Client function called after a peer's uname, processes, or state changes
 * (NULL if none has been registered via crm_set_status_callback()). The last
 * argument depends on the change type: NULL for uname changes, a pointer to
 * the previous process bitmask for process changes, and the previous state
 * string for state changes.
 */
static void (*peer_status_callback)(enum crm_status_type, crm_node_t *,
                                    const void *) = NULL;
442
443 /*!
444 * \brief Set a client function that will be called after peer status changes
445 *
446 * \param[in] dispatch Pointer to function to use as callback
447 *
448 * \note Previously, client callbacks were responsible for peer cache
449 * management. This is no longer the case, and client callbacks should do
450 * only client-specific handling. Callbacks MUST NOT add or remove entries
451 * in the peer caches.
452 */
453 void
crm_set_status_callback(void (* dispatch)(enum crm_status_type,crm_node_t *,const void *))454 crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *))
455 {
456 peer_status_callback = dispatch;
457 }
458
459 /*!
460 * \brief Tell the library whether to automatically reap lost nodes
461 *
462 * If TRUE (the default), calling crm_update_peer_proc() will also update the
463 * peer state to CRM_NODE_MEMBER or CRM_NODE_LOST, and pcmk__update_peer_state()
464 * will reap peers whose state changes to anything other than CRM_NODE_MEMBER.
465 * Callers should leave this enabled unless they plan to manage the cache
466 * separately on their own.
467 *
468 * \param[in] autoreap TRUE to enable automatic reaping, FALSE to disable
469 */
470 void
crm_set_autoreap(gboolean autoreap)471 crm_set_autoreap(gboolean autoreap)
472 {
473 crm_autoreap = autoreap;
474 }
475
476 static void
dump_peer_hash(int level,const char * caller)477 dump_peer_hash(int level, const char *caller)
478 {
479 GHashTableIter iter;
480 const char *id = NULL;
481 crm_node_t *node = NULL;
482
483 g_hash_table_iter_init(&iter, crm_peer_cache);
484 while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) {
485 do_crm_log(level, "%s: Node %u/%s = %p - %s", caller, node->id, node->uname, node, id);
486 }
487 }
488
489 static gboolean
hash_find_by_data(gpointer key,gpointer value,gpointer user_data)490 hash_find_by_data(gpointer key, gpointer value, gpointer user_data)
491 {
492 return value == user_data;
493 }
494
495 /*!
496 * \internal
497 * \brief Search caches for a node (cluster or Pacemaker Remote)
498 *
499 * \param[in] id If not 0, cluster node ID to search for
500 * \param[in] uname If not NULL, node name to search for
501 * \param[in] flags Bitmask of enum crm_get_peer_flags
502 *
503 * \return Node cache entry if found, otherwise NULL
504 */
505 crm_node_t *
pcmk__search_node_caches(unsigned int id,const char * uname,uint32_t flags)506 pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags)
507 {
508 crm_node_t *node = NULL;
509
510 CRM_ASSERT(id > 0 || uname != NULL);
511
512 crm_peer_init();
513
514 if ((uname != NULL) && pcmk_is_set(flags, CRM_GET_PEER_REMOTE)) {
515 node = g_hash_table_lookup(crm_remote_peer_cache, uname);
516 }
517
518 if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) {
519 node = pcmk__search_cluster_node_cache(id, uname);
520 }
521 return node;
522 }
523
524 /*!
525 * \brief Get a node cache entry (cluster or Pacemaker Remote)
526 *
527 * \param[in] id If not 0, cluster node ID to search for
528 * \param[in] uname If not NULL, node name to search for
529 * \param[in] flags Bitmask of enum crm_get_peer_flags
530 *
531 * \return (Possibly newly created) node cache entry
532 */
533 crm_node_t *
crm_get_peer_full(unsigned int id,const char * uname,int flags)534 crm_get_peer_full(unsigned int id, const char *uname, int flags)
535 {
536 crm_node_t *node = NULL;
537
538 CRM_ASSERT(id > 0 || uname != NULL);
539
540 crm_peer_init();
541
542 if (pcmk_is_set(flags, CRM_GET_PEER_REMOTE)) {
543 node = g_hash_table_lookup(crm_remote_peer_cache, uname);
544 }
545
546 if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) {
547 node = crm_get_peer(id, uname);
548 }
549 return node;
550 }
551
552 /*!
553 * \internal
554 * \brief Search cluster node cache
555 *
556 * \param[in] id If not 0, cluster node ID to search for
557 * \param[in] uname If not NULL, node name to search for
558 *
559 * \return Cluster node cache entry if found, otherwise NULL
560 */
561 crm_node_t *
pcmk__search_cluster_node_cache(unsigned int id,const char * uname)562 pcmk__search_cluster_node_cache(unsigned int id, const char *uname)
563 {
564 GHashTableIter iter;
565 crm_node_t *node = NULL;
566 crm_node_t *by_id = NULL;
567 crm_node_t *by_name = NULL;
568
569 CRM_ASSERT(id > 0 || uname != NULL);
570
571 crm_peer_init();
572
573 if (uname != NULL) {
574 g_hash_table_iter_init(&iter, crm_peer_cache);
575 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
576 if(node->uname && strcasecmp(node->uname, uname) == 0) {
577 crm_trace("Name match: %s = %p", node->uname, node);
578 by_name = node;
579 break;
580 }
581 }
582 }
583
584 if (id > 0) {
585 g_hash_table_iter_init(&iter, crm_peer_cache);
586 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
587 if(node->id == id) {
588 crm_trace("ID match: %u = %p", node->id, node);
589 by_id = node;
590 break;
591 }
592 }
593 }
594
595 node = by_id; /* Good default */
596 if(by_id == by_name) {
597 /* Nothing to do if they match (both NULL counts) */
598 crm_trace("Consistent: %p for %u/%s", by_id, id, uname);
599
600 } else if(by_id == NULL && by_name) {
601 crm_trace("Only one: %p for %u/%s", by_name, id, uname);
602
603 if(id && by_name->id) {
604 dump_peer_hash(LOG_WARNING, __func__);
605 crm_crit("Node %u and %u share the same name '%s'",
606 id, by_name->id, uname);
607 node = NULL; /* Create a new one */
608
609 } else {
610 node = by_name;
611 }
612
613 } else if(by_name == NULL && by_id) {
614 crm_trace("Only one: %p for %u/%s", by_id, id, uname);
615
616 if(uname && by_id->uname) {
617 dump_peer_hash(LOG_WARNING, __func__);
618 crm_crit("Node '%s' and '%s' share the same cluster nodeid %u: assuming '%s' is correct",
619 uname, by_id->uname, id, uname);
620 }
621
622 } else if(uname && by_id->uname) {
623 if(pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
624 crm_notice("Node '%s' has changed its ID from %u to %u", by_id->uname, by_name->id, by_id->id);
625 g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
626
627 } else {
628 crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
629 dump_peer_hash(LOG_INFO, __func__);
630 crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE,
631 TRUE);
632 }
633
634 } else if(id && by_name->id) {
635 crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
636
637 } else {
638 /* Simple merge */
639
640 /* Only corosync-based clusters use node IDs. The functions that call
641 * pcmk__update_peer_state() and crm_update_peer_proc() only know
642 * nodeid, so 'by_id' is authoritative when merging.
643 */
644 dump_peer_hash(LOG_DEBUG, __func__);
645
646 crm_info("Merging %p into %p", by_name, by_id);
647 g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
648 }
649
650 return node;
651 }
652
#if SUPPORT_COROSYNC
/*!
 * \internal
 * \brief Remove inactive cluster peer cache entries whose name conflicts
 *
 * An entry conflicts with \p node if it has a different nonzero ID but the
 * same (case-insensitive) uname. Conflicting entries that are still active
 * are kept. Nothing is done unless corosync has a node list and \p node has
 * both an ID and a uname.
 *
 * \param[in] node  Node whose ID and uname should be checked for conflicts
 *
 * \return Number of cache entries removed
 */
static guint
remove_conflicting_peer(crm_node_t *node)
{
    int removed = 0;
    GHashTableIter iter;
    gpointer value = NULL;

    if ((node->id == 0) || (node->uname == NULL)
        || !pcmk__corosync_has_nodelist()) {
        return 0;
    }

    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, &value)) {
        crm_node_t *other = value;

        if ((other->id == 0)
            || (other->id == node->id)
            || (other->uname == NULL)
            || (strcasecmp(other->uname, node->uname) != 0)) {
            continue; // Not a conflicting entry
        }

        if (crm_is_peer_active(other)) {
            continue; // Never remove a peer that is still active
        }

        crm_warn("Removing cached offline node %u/%s which has conflicting uname with %u",
                 other->id, other->uname, node->id);

        g_hash_table_iter_remove(&iter);
        removed++;
    }

    return removed;
}
#endif
691
692 /*!
693 * \brief Get a cluster node cache entry
694 *
695 * \param[in] id If not 0, cluster node ID to search for
696 * \param[in] uname If not NULL, node name to search for
697 *
698 * \return (Possibly newly created) cluster node cache entry
699 */
700 /* coverity[-alloc] Memory is referenced in one or both hashtables */
701 crm_node_t *
crm_get_peer(unsigned int id,const char * uname)702 crm_get_peer(unsigned int id, const char *uname)
703 {
704 crm_node_t *node = NULL;
705 char *uname_lookup = NULL;
706
707 CRM_ASSERT(id > 0 || uname != NULL);
708
709 crm_peer_init();
710
711 node = pcmk__search_cluster_node_cache(id, uname);
712
713 /* if uname wasn't provided, and find_peer did not turn up a uname based on id.
714 * we need to do a lookup of the node name using the id in the cluster membership. */
715 if ((node == NULL || node->uname == NULL) && (uname == NULL)) {
716 uname_lookup = get_node_name(id);
717 }
718
719 if (uname_lookup) {
720 uname = uname_lookup;
721 crm_trace("Inferred a name of '%s' for node %u", uname, id);
722
723 /* try to turn up the node one more time now that we know the uname. */
724 if (node == NULL) {
725 node = pcmk__search_cluster_node_cache(id, uname);
726 }
727 }
728
729
730 if (node == NULL) {
731 char *uniqueid = crm_generate_uuid();
732
733 node = calloc(1, sizeof(crm_node_t));
734 CRM_ASSERT(node);
735
736 crm_info("Created entry %s/%p for node %s/%u (%d total)",
737 uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache));
738 g_hash_table_replace(crm_peer_cache, uniqueid, node);
739 }
740
741 if(id > 0 && uname && (node->id == 0 || node->uname == NULL)) {
742 crm_info("Node %u is now known as %s", id, uname);
743 }
744
745 if(id > 0 && node->id == 0) {
746 node->id = id;
747 }
748
749 if (uname && (node->uname == NULL)) {
750 update_peer_uname(node, uname);
751 }
752
753 if(node->uuid == NULL) {
754 const char *uuid = crm_peer_uuid(node);
755
756 if (uuid) {
757 crm_info("Node %u has uuid %s", id, uuid);
758
759 } else {
760 crm_info("Cannot obtain a UUID for node %u/%s", id, node->uname);
761 }
762 }
763
764 free(uname_lookup);
765
766 return node;
767 }
768
769 /*!
770 * \internal
771 * \brief Update a node's uname
772 *
773 * \param[in] node Node object to update
774 * \param[in] uname New name to set
775 *
776 * \note This function should not be called within a peer cache iteration,
777 * because in some cases it can remove conflicting cache entries,
778 * which would invalidate the iterator.
779 */
780 static void
update_peer_uname(crm_node_t * node,const char * uname)781 update_peer_uname(crm_node_t *node, const char *uname)
782 {
783 CRM_CHECK(uname != NULL,
784 crm_err("Bug: can't update node name without name"); return);
785 CRM_CHECK(node != NULL,
786 crm_err("Bug: can't update node name to %s without node", uname);
787 return);
788
789 if (pcmk__str_eq(uname, node->uname, pcmk__str_casei)) {
790 crm_debug("Node uname '%s' did not change", uname);
791 return;
792 }
793
794 for (const char *c = uname; *c; ++c) {
795 if ((*c >= 'A') && (*c <= 'Z')) {
796 crm_warn("Node names with capitals are discouraged, consider changing '%s'",
797 uname);
798 break;
799 }
800 }
801
802 free(node->uname);
803 node->uname = strdup(uname);
804 CRM_ASSERT(node->uname != NULL);
805
806 if (peer_status_callback != NULL) {
807 peer_status_callback(crm_status_uname, node, NULL);
808 }
809
810 #if SUPPORT_COROSYNC
811 if (is_corosync_cluster() && !pcmk_is_set(node->flags, crm_remote_node)) {
812 remove_conflicting_peer(node);
813 }
814 #endif
815 }
816
817 /*!
818 * \internal
819 * \brief Get log-friendly string equivalent of a process flag
820 *
821 * \param[in] proc Process flag
822 *
823 * \return Log-friendly string equivalent of \p proc
824 */
825 static inline const char *
proc2text(enum crm_proc_flag proc)826 proc2text(enum crm_proc_flag proc)
827 {
828 const char *text = "unknown";
829
830 switch (proc) {
831 case crm_proc_none:
832 text = "none";
833 break;
834 case crm_proc_based:
835 text = "pacemaker-based";
836 break;
837 case crm_proc_controld:
838 text = "pacemaker-controld";
839 break;
840 case crm_proc_schedulerd:
841 text = "pacemaker-schedulerd";
842 break;
843 case crm_proc_execd:
844 text = "pacemaker-execd";
845 break;
846 case crm_proc_attrd:
847 text = "pacemaker-attrd";
848 break;
849 case crm_proc_fenced:
850 text = "pacemaker-fenced";
851 break;
852 case crm_proc_cpg:
853 text = "corosync-cpg";
854 break;
855 }
856 return text;
857 }
858
859 /*!
860 * \internal
861 * \brief Update a node's process information (and potentially state)
862 *
863 * \param[in] source Caller's function name (for log messages)
864 * \param[in] node Node object to update
865 * \param[in] flag Bitmask of new process information
866 * \param[in] status node status (online, offline, etc.)
867 *
868 * \return NULL if any node was reaped from peer caches, value of node otherwise
869 *
870 * \note If this function returns NULL, the supplied node object was likely
871 * freed and should not be used again. This function should not be
872 * called within a cache iteration if reaping is possible, otherwise
873 * reaping could invalidate the iterator.
874 */
875 crm_node_t *
crm_update_peer_proc(const char * source,crm_node_t * node,uint32_t flag,const char * status)876 crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const char *status)
877 {
878 uint32_t last = 0;
879 gboolean changed = FALSE;
880
881 CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL",
882 source, proc2text(flag), status);
883 return NULL);
884
885 /* Pacemaker doesn't spawn processes on remote nodes */
886 if (pcmk_is_set(node->flags, crm_remote_node)) {
887 return node;
888 }
889
890 last = node->processes;
891 if (status == NULL) {
892 node->processes = flag;
893 if (node->processes != last) {
894 changed = TRUE;
895 }
896
897 } else if (pcmk__str_eq(status, ONLINESTATUS, pcmk__str_casei)) {
898 if ((node->processes & flag) != flag) {
899 node->processes = pcmk__set_flags_as(__func__, __LINE__,
900 LOG_TRACE, "Peer process",
901 node->uname, node->processes,
902 flag, "processes");
903 changed = TRUE;
904 }
905
906 } else if (node->processes & flag) {
907 node->processes = pcmk__clear_flags_as(__func__, __LINE__,
908 LOG_TRACE, "Peer process",
909 node->uname, node->processes,
910 flag, "processes");
911 changed = TRUE;
912 }
913
914 if (changed) {
915 if (status == NULL && flag <= crm_proc_none) {
916 crm_info("%s: Node %s[%u] - all processes are now offline", source, node->uname,
917 node->id);
918 } else {
919 crm_info("%s: Node %s[%u] - %s is now %s", source, node->uname, node->id,
920 proc2text(flag), status);
921 }
922
923 /* Call the client callback first, then update the peer state,
924 * in case the node will be reaped
925 */
926 if (peer_status_callback != NULL) {
927 peer_status_callback(crm_status_processes, node, &last);
928 }
929
930 /* The client callback shouldn't touch the peer caches,
931 * but as a safety net, bail if the peer cache was destroyed.
932 */
933 if (crm_peer_cache == NULL) {
934 return NULL;
935 }
936
937 if (crm_autoreap) {
938 const char *peer_state = NULL;
939
940 if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
941 peer_state = CRM_NODE_MEMBER;
942 } else {
943 peer_state = CRM_NODE_LOST;
944 }
945 node = pcmk__update_peer_state(__func__, node, peer_state, 0);
946 }
947 } else {
948 crm_trace("%s: Node %s[%u] - %s is unchanged (%s)", source, node->uname, node->id,
949 proc2text(flag), status);
950 }
951 return node;
952 }
953
954 /*!
955 * \internal
956 * \brief Update a cluster node cache entry's expected join state
957 *
958 * \param[in] source Caller's function name (for logging)
959 * \param[in,out] node Node to update
960 * \param[in] expected Node's new join state
961 */
962 void
pcmk__update_peer_expected(const char * source,crm_node_t * node,const char * expected)963 pcmk__update_peer_expected(const char *source, crm_node_t *node,
964 const char *expected)
965 {
966 char *last = NULL;
967 gboolean changed = FALSE;
968
969 CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected);
970 return);
971
972 /* Remote nodes don't participate in joins */
973 if (pcmk_is_set(node->flags, crm_remote_node)) {
974 return;
975 }
976
977 last = node->expected;
978 if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) {
979 node->expected = strdup(expected);
980 changed = TRUE;
981 }
982
983 if (changed) {
984 crm_info("%s: Node %s[%u] - expected state is now %s (was %s)", source, node->uname, node->id,
985 expected, last);
986 free(last);
987 } else {
988 crm_trace("%s: Node %s[%u] - expected state is unchanged (%s)", source, node->uname,
989 node->id, expected);
990 }
991 }
992
993 /*!
994 * \internal
995 * \brief Update a node's state and membership information
996 *
997 * \param[in] source Caller's function name (for log messages)
998 * \param[in] node Node object to update
999 * \param[in] state Node's new state
1000 * \param[in] membership Node's new membership ID
1001 * \param[in] iter If not NULL, pointer to node's peer cache iterator
1002 *
1003 * \return NULL if any node was reaped, value of node otherwise
1004 *
1005 * \note If this function returns NULL, the supplied node object was likely
1006 * freed and should not be used again. This function may be called from
1007 * within a peer cache iteration if the iterator is supplied.
1008 */
1009 static crm_node_t *
update_peer_state_iter(const char * source,crm_node_t * node,const char * state,uint64_t membership,GHashTableIter * iter)1010 update_peer_state_iter(const char *source, crm_node_t *node, const char *state,
1011 uint64_t membership, GHashTableIter *iter)
1012 {
1013 gboolean is_member;
1014
1015 CRM_CHECK(node != NULL,
1016 crm_err("Could not set state for unknown host to %s"
1017 CRM_XS " source=%s", state, source);
1018 return NULL);
1019
1020 is_member = pcmk__str_eq(state, CRM_NODE_MEMBER, pcmk__str_casei);
1021 if (is_member) {
1022 node->when_lost = 0;
1023 if (membership) {
1024 node->last_seen = membership;
1025 }
1026 }
1027
1028 if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) {
1029 char *last = node->state;
1030
1031 node->state = strdup(state);
1032 crm_notice("Node %s state is now %s " CRM_XS
1033 " nodeid=%u previous=%s source=%s", node->uname, state,
1034 node->id, (last? last : "unknown"), source);
1035 if (peer_status_callback != NULL) {
1036 peer_status_callback(crm_status_nstate, node, last);
1037 }
1038 free(last);
1039
1040 if (crm_autoreap && !is_member
1041 && !pcmk_is_set(node->flags, crm_remote_node)) {
1042 /* We only autoreap from the peer cache, not the remote peer cache,
1043 * because the latter should be managed only by
1044 * crm_remote_peer_cache_refresh().
1045 */
1046 if(iter) {
1047 crm_notice("Purged 1 peer with id=%u and/or uname=%s from the membership cache", node->id, node->uname);
1048 g_hash_table_iter_remove(iter);
1049
1050 } else {
1051 reap_crm_member(node->id, node->uname);
1052 }
1053 node = NULL;
1054 }
1055
1056 } else {
1057 crm_trace("Node %s state is unchanged (%s) " CRM_XS
1058 " nodeid=%u source=%s", node->uname, state, node->id, source);
1059 }
1060 return node;
1061 }
1062
1063 /*!
1064 * \brief Update a node's state and membership information
1065 *
1066 * \param[in] source Caller's function name (for log messages)
1067 * \param[in] node Node object to update
1068 * \param[in] state Node's new state
1069 * \param[in] membership Node's new membership ID
1070 *
1071 * \return NULL if any node was reaped, value of node otherwise
1072 *
1073 * \note If this function returns NULL, the supplied node object was likely
1074 * freed and should not be used again. This function should not be
1075 * called within a cache iteration if reaping is possible,
1076 * otherwise reaping could invalidate the iterator.
1077 */
1078 crm_node_t *
pcmk__update_peer_state(const char * source,crm_node_t * node,const char * state,uint64_t membership)1079 pcmk__update_peer_state(const char *source, crm_node_t *node,
1080 const char *state, uint64_t membership)
1081 {
1082 return update_peer_state_iter(source, node, state, membership, NULL);
1083 }
1084
1085 /*!
1086 * \internal
1087 * \brief Reap all nodes from cache whose membership information does not match
1088 *
1089 * \param[in] membership Membership ID of nodes to keep
1090 */
1091 void
pcmk__reap_unseen_nodes(uint64_t membership)1092 pcmk__reap_unseen_nodes(uint64_t membership)
1093 {
1094 GHashTableIter iter;
1095 crm_node_t *node = NULL;
1096
1097 crm_trace("Reaping unseen nodes...");
1098 g_hash_table_iter_init(&iter, crm_peer_cache);
1099 while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) {
1100 if (node->last_seen != membership) {
1101 if (node->state) {
1102 /*
1103 * Calling update_peer_state_iter() allows us to
1104 * remove the node from crm_peer_cache without
1105 * invalidating our iterator
1106 */
1107 update_peer_state_iter(__func__, node, CRM_NODE_LOST,
1108 membership, &iter);
1109
1110 } else {
1111 crm_info("State of node %s[%u] is still unknown",
1112 node->uname, node->id);
1113 }
1114 }
1115 }
1116 }
1117
1118 static crm_node_t *
find_known_node(const char * id,const char * uname)1119 find_known_node(const char *id, const char *uname)
1120 {
1121 GHashTableIter iter;
1122 crm_node_t *node = NULL;
1123 crm_node_t *by_id = NULL;
1124 crm_node_t *by_name = NULL;
1125
1126 if (uname) {
1127 g_hash_table_iter_init(&iter, known_node_cache);
1128 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
1129 if (node->uname && strcasecmp(node->uname, uname) == 0) {
1130 crm_trace("Name match: %s = %p", node->uname, node);
1131 by_name = node;
1132 break;
1133 }
1134 }
1135 }
1136
1137 if (id) {
1138 g_hash_table_iter_init(&iter, known_node_cache);
1139 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
1140 if(strcasecmp(node->uuid, id) == 0) {
1141 crm_trace("ID match: %s= %p", id, node);
1142 by_id = node;
1143 break;
1144 }
1145 }
1146 }
1147
1148 node = by_id; /* Good default */
1149 if (by_id == by_name) {
1150 /* Nothing to do if they match (both NULL counts) */
1151 crm_trace("Consistent: %p for %s/%s", by_id, id, uname);
1152
1153 } else if (by_id == NULL && by_name) {
1154 crm_trace("Only one: %p for %s/%s", by_name, id, uname);
1155
1156 if (id) {
1157 node = NULL;
1158
1159 } else {
1160 node = by_name;
1161 }
1162
1163 } else if (by_name == NULL && by_id) {
1164 crm_trace("Only one: %p for %s/%s", by_id, id, uname);
1165
1166 if (uname) {
1167 node = NULL;
1168 }
1169
1170 } else if (uname && by_id->uname
1171 && pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
1172 /* Multiple nodes have the same uname in the CIB.
1173 * Return by_id. */
1174
1175 } else if (id && by_name->uuid
1176 && pcmk__str_eq(id, by_name->uuid, pcmk__str_casei)) {
1177 /* Multiple nodes have the same id in the CIB.
1178 * Return by_name. */
1179 node = by_name;
1180
1181 } else {
1182 node = NULL;
1183 }
1184
1185 if (node == NULL) {
1186 crm_debug("Couldn't find node%s%s%s%s",
1187 id? " " : "",
1188 id? id : "",
1189 uname? " with name " : "",
1190 uname? uname : "");
1191 }
1192
1193 return node;
1194 }
1195
1196 static void
known_node_cache_refresh_helper(xmlNode * xml_node,void * user_data)1197 known_node_cache_refresh_helper(xmlNode *xml_node, void *user_data)
1198 {
1199 const char *id = crm_element_value(xml_node, XML_ATTR_ID);
1200 const char *uname = crm_element_value(xml_node, XML_ATTR_UNAME);
1201 crm_node_t * node = NULL;
1202
1203 CRM_CHECK(id != NULL && uname !=NULL, return);
1204 node = find_known_node(id, uname);
1205
1206 if (node == NULL) {
1207 char *uniqueid = crm_generate_uuid();
1208
1209 node = calloc(1, sizeof(crm_node_t));
1210 CRM_ASSERT(node != NULL);
1211
1212 node->uname = strdup(uname);
1213 CRM_ASSERT(node->uname != NULL);
1214
1215 node->uuid = strdup(id);
1216 CRM_ASSERT(node->uuid != NULL);
1217
1218 g_hash_table_replace(known_node_cache, uniqueid, node);
1219
1220 } else if (pcmk_is_set(node->flags, crm_node_dirty)) {
1221 if (!pcmk__str_eq(uname, node->uname, pcmk__str_casei)) {
1222 free(node->uname);
1223 node->uname = strdup(uname);
1224 CRM_ASSERT(node->uname != NULL);
1225 }
1226
1227 /* Node is in cache and hasn't been updated already, so mark it clean */
1228 clear_peer_flags(node, crm_node_dirty);
1229 }
1230
1231 }
1232
1233 static void
refresh_known_node_cache(xmlNode * cib)1234 refresh_known_node_cache(xmlNode *cib)
1235 {
1236 crm_peer_init();
1237
1238 g_hash_table_foreach(known_node_cache, mark_dirty, NULL);
1239
1240 crm_foreach_xpath_result(cib, PCMK__XP_MEMBER_NODE_CONFIG,
1241 known_node_cache_refresh_helper, NULL);
1242
1243 /* Remove all old cache entries that weren't seen in the CIB */
1244 g_hash_table_foreach_remove(known_node_cache, is_dirty, NULL);
1245 }
1246
1247 void
pcmk__refresh_node_caches_from_cib(xmlNode * cib)1248 pcmk__refresh_node_caches_from_cib(xmlNode *cib)
1249 {
1250 crm_remote_peer_cache_refresh(cib);
1251 refresh_known_node_cache(cib);
1252 }
1253
1254 /*!
1255 * \internal
1256 * \brief Search known node cache
1257 *
1258 * \param[in] id If not 0, cluster node ID to search for
1259 * \param[in] uname If not NULL, node name to search for
1260 * \param[in] flags Bitmask of enum crm_get_peer_flags
1261 *
1262 * \return Known node cache entry if found, otherwise NULL
1263 */
1264 crm_node_t *
pcmk__search_known_node_cache(unsigned int id,const char * uname,uint32_t flags)1265 pcmk__search_known_node_cache(unsigned int id, const char *uname,
1266 uint32_t flags)
1267 {
1268 crm_node_t *node = NULL;
1269 char *id_str = NULL;
1270
1271 CRM_ASSERT(id > 0 || uname != NULL);
1272
1273 node = pcmk__search_node_caches(id, uname, flags);
1274
1275 if (node || !(flags & CRM_GET_PEER_CLUSTER)) {
1276 return node;
1277 }
1278
1279 if (id > 0) {
1280 id_str = crm_strdup_printf("%u", id);
1281 }
1282
1283 node = find_known_node(id_str, uname);
1284
1285 free(id_str);
1286 return node;
1287 }
1288
1289
1290 // Deprecated functions kept only for backward API compatibility
1291
1292 #include <crm/cluster/compat.h>
1293
1294 int
crm_terminate_member(int nodeid,const char * uname,void * unused)1295 crm_terminate_member(int nodeid, const char *uname, void *unused)
1296 {
1297 return stonith_api_kick(nodeid, uname, 120, TRUE);
1298 }
1299
1300 int
crm_terminate_member_no_mainloop(int nodeid,const char * uname,int * connection)1301 crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
1302 {
1303 return stonith_api_kick(nodeid, uname, 120, TRUE);
1304 }
1305
1306 // End deprecated API
1307