1 /* -*-pgsql-c-*- */
2 /*
3 * $Header$
4 *
5 * pgpool: a language independent connection pool server for PostgreSQL
6 * written by Tatsuo Ishii
7 *
8 * Copyright (c) 2003-2021 PgPool Global Development Group
9 *
10 * Permission to use, copy, modify, and distribute this software and
11 * its documentation for any purpose and without fee is hereby
12 * granted, provided that the above copyright notice appear in all
13 * copies and that both that copyright notice and this permission
14 * notice appear in supporting documentation, and that the name of the
15 * author not be used in advertising or publicity pertaining to
16 * distribution of the software without specific, written prior
17 * permission. The author makes no representations about the
18 * suitability of this software for any purpose. It is provided "as
19 * is" without express or implied warranty.
20 *
21 * watchdog.c: child process main
22 *
23 */
24
25
26
27 #include <stdio.h>
28 #include <errno.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <sys/time.h>
33 #include <sys/utsname.h>
34 #include <sys/un.h>
35 #include <sys/types.h>
36 #include <sys/socket.h>
37 #include <sys/wait.h>
38 #include <netinet/in.h>
39 #include <netinet/tcp.h>
40 #include <net/if.h>
41 #include <arpa/inet.h>
42 #include <netdb.h>
43 #include <fcntl.h>
44 #include <ctype.h>
45
46 #include "pool.h"
47 #include "pool_config.h"
48 #include "auth/md5.h"
49 #include "utils/palloc.h"
50 #include "utils/memutils.h"
51 #include "utils/elog.h"
52 #include "utils/json_writer.h"
53 #include "utils/json.h"
54 #include "utils/socket_stream.h"
55 #include "utils/pool_signal.h"
56 #include "utils/ps_status.h"
57 #include "main/pool_internal_comms.h"
58 #include "pcp/recovery.h"
59
60 #include "watchdog/wd_utils.h"
61 #include "watchdog/watchdog.h"
62 #include "watchdog/wd_json_data.h"
63 #include "watchdog/wd_ipc_defines.h"
64 #include "watchdog/wd_internal_commands.h"
65 #include "parser/stringinfo.h"
66
/*
 * These defines enable the consensus building feature in the watchdog
 * for node failover operations.
 * They could also be moved to the configure script.
 */
#define NODE_UP_REQUIRE_CONSENSUS
#define NODE_DOWN_REQUIRE_CONSENSUS
#define NODE_PROMOTE_REQUIRE_CONSENSUS
74
/*
 * Result codes returned by the IPC command processing functions in this
 * file. The numeric values are spelled out; they are the same values the
 * compiler assigns implicitly.
 */
typedef enum IPC_CMD_PROCESS_RES
{
	IPC_CMD_COMPLETE = 0,
	IPC_CMD_PROCESSING = 1,
	IPC_CMD_ERROR = 2,
	IPC_CMD_OK = 3,
	IPC_CMD_TRY_AGAIN = 4
} IPC_CMD_PROCESS_RES;
83
84
/* Time in seconds to wait before retrying the connection with a node once it has failed */
#define MIN_SECS_CONNECTION_RETRY			10

/*
 * Maximum number of seconds to wait for the escalation/de-escalation
 * process to exit normally before moving on
 */
#define MAX_SECS_ESC_PROC_EXIT_WAIT			5

/* Interval in seconds between beacon messages */
#define BEACON_MESSAGE_INTERVAL_SECONDS		10

/* Time in seconds to wait for the reply from a remote watchdog node */
#define MAX_SECS_WAIT_FOR_REPLY_FROM_NODE	5

/* Number of message send failures that can be tolerated */
#define MAX_ALLOWED_SEND_FAILURES			3

/* Number of missing beacon message replies that can be tolerated */
#define MAX_ALLOWED_BEACON_REPLY_MISS		3


/* Timeout in seconds to wait for Pgpool-II to build consensus for failover */
#define FAILOVER_COMMAND_FINISH_TIMEOUT		15

/*
 * Minimum number of seconds to wait before broadcasting the same cluster
 * service message again
 */
#define MIN_SECS_BETWEEN_BROADCAST_SRV_MSG	5
114
/*
 * Packet types. Used in WDPacketData->type.
 * Each type is a single character code; WD_NO_MESSAGE (0) means "no packet".
 */
#define WD_NO_MESSAGE						0
#define WD_ADD_NODE_MESSAGE					'A'
#define WD_REQ_INFO_MESSAGE					'B'
#define WD_DECLARE_COORDINATOR_MESSAGE		'C'
#define WD_DATA_MESSAGE						'D'
#define WD_ERROR_MESSAGE					'E'
#define WD_ACCEPT_MESSAGE					'G'
#define WD_INFO_MESSAGE						'I'
#define WD_JOIN_COORDINATOR_MESSAGE			'J'
#define WD_IAM_COORDINATOR_MESSAGE			'M'
#define WD_IAM_IN_NW_TROUBLE_MESSAGE		'N'
#define WD_QUORUM_IS_LOST					'Q'
#define WD_REJECT_MESSAGE					'R'
#define WD_STAND_FOR_COORDINATOR_MESSAGE	'S'
#define WD_REMOTE_FAILOVER_REQUEST			'V'
#define WD_INFORM_I_AM_GOING_DOWN			'X'
#define WD_ASK_FOR_POOL_CONFIG				'Y'
#define WD_POOL_CONFIG_DATA					'Z'
#define WD_CMD_REPLY_IN_DATA				'-'
#define WD_CLUSTER_SERVICE_MESSAGE			'#'

#define WD_EXECUTE_COMMAND_REQUEST			'!'

#define WD_FAILOVER_START					'F'
#define WD_FAILOVER_END						'H'
#define WD_FAILOVER_WAITING_FOR_CONSENSUS	'K'
144
/*
 * Cluster service message types.
 *
 * NOTE(review): several of these letters are also used as packet type codes,
 * so the two sets of codes must be told apart by context -- presumably these
 * travel as the payload of a WD_CLUSTER_SERVICE_MESSAGE packet; confirm.
 */
#define CLUSTER_QUORUM_LOST					'L'
#define CLUSTER_QUORUM_FOUND				'F'
#define CLUSTER_IN_SPLIT_BRAIN				'B'
#define CLUSTER_NEEDS_ELECTION				'E'
#define CLUSTER_IAM_TRUE_LEADER				'M'
#define CLUSTER_IAM_NOT_TRUE_LEADER			'X'
#define CLUSTER_IAM_RESIGNING_FROM_LEADER	'R'
#define CLUSTER_NODE_INVALID_VERSION		'V'
#define CLUSTER_NODE_REQUIRE_TO_RELOAD		'I'
#define CLUSTER_NODE_APPEARING_LOST			'Y'
#define CLUSTER_NODE_APPEARING_FOUND		'Z'
157
158
/* Shorthand for the node returned by getLeaderWatchdogNode() */
#define WD_LEADER_NODE (getLeaderWatchdogNode())
160
/*
 * Pairs a packet type code with a printable name for it.
 */
typedef struct packet_types
{
	char		type;			/* packet type code */
	char		name[100];		/* printable name of the packet type */
} packet_types;
166
167 packet_types all_packet_types[] = {
168 {WD_ADD_NODE_MESSAGE, "ADD NODE"},
169 {WD_REQ_INFO_MESSAGE, "REQUEST INFO"},
170 {WD_DECLARE_COORDINATOR_MESSAGE, "DECLARE COORDINATOR"},
171 {WD_DATA_MESSAGE, "DATA"},
172 {WD_ERROR_MESSAGE, "ERROR"},
173 {WD_ACCEPT_MESSAGE, "ACCEPT"},
174 {WD_INFO_MESSAGE, "NODE INFO"},
175 {WD_JOIN_COORDINATOR_MESSAGE, "JOIN COORDINATOR"},
176 {WD_IAM_COORDINATOR_MESSAGE, "IAM COORDINATOR"},
177 {WD_IAM_IN_NW_TROUBLE_MESSAGE, "I AM IN NETWORK TROUBLE"},
178 {WD_QUORUM_IS_LOST, "QUORUM IS LOST"},
179 {WD_REJECT_MESSAGE, "REJECT"},
180 {WD_STAND_FOR_COORDINATOR_MESSAGE, "STAND FOR COORDINATOR"},
181 {WD_REMOTE_FAILOVER_REQUEST, "REPLICATE FAILOVER REQUEST"},
182 {WD_IPC_ONLINE_RECOVERY_COMMAND, "ONLINE RECOVERY REQUEST"},
183 {WD_EXECUTE_CLUSTER_COMMAND, "EXECUTE CLUSTER COMMAND"},
184 {WD_IPC_FAILOVER_COMMAND, "FAILOVER FUNCTION COMMAND"},
185 {WD_INFORM_I_AM_GOING_DOWN, "INFORM I AM GOING DOWN"},
186 {WD_ASK_FOR_POOL_CONFIG, "ASK FOR POOL CONFIG"},
187 {WD_POOL_CONFIG_DATA, "CONFIG DATA"},
188 {WD_GET_LEADER_DATA_REQUEST, "DATA REQUEST FOR LEADER"},
189 {WD_GET_RUNTIME_VARIABLE_VALUE, "GET WD RUNTIME VARIABLE VALUE"},
190 {WD_CMD_REPLY_IN_DATA, "COMMAND REPLY IN DATA"},
191 {WD_FAILOVER_LOCKING_REQUEST, "FAILOVER LOCKING REQUEST"},
192 {WD_FAILOVER_INDICATION, "FAILOVER INDICATION"},
193 {WD_CLUSTER_SERVICE_MESSAGE, "CLUSTER SERVICE MESSAGE"},
194 {WD_REGISTER_FOR_NOTIFICATION, "REGISTER FOR NOTIFICATION"},
195 {WD_NODE_STATUS_CHANGE_COMMAND, "NODE STATUS CHANGE"},
196 {WD_GET_NODES_LIST_COMMAND, "GET NODES LIST"},
197 {WD_IPC_CMD_CLUSTER_IN_TRAN, "CLUSTER STATE NOT STABLE"},
198 {WD_IPC_CMD_RESULT_BAD, "IPC RESPONSE BAD"},
199 {WD_IPC_CMD_RESULT_OK, "IPC RESPONSE GOOD"},
200 {WD_IPC_CMD_TIMEOUT, "IPC TIMEOUT"},
201 {WD_EXECUTE_COMMAND_REQUEST, "WD EXECUTE COMMAND"},
202 {WD_NO_MESSAGE, ""}
203 };
204
205
/*
 * Printable names of the watchdog events, by array position.
 * NOTE(review): presumably indexed by WD_EVENTS values -- the enum is not
 * defined in this file, so keep the positions in sync with it.
 */
char	   *wd_event_name[] = {
	[0] = "STATE CHANGED",
	[1] = "TIMEOUT",
	[2] = "PACKET RECEIVED",
	[3] = "COMMAND FINISHED",
	[4] = "NEW OUTBOUND_CONNECTION",
	[5] = "NETWORK IP IS REMOVED",
	[6] = "NETWORK IP IS ASSIGNED",
	[7] = "NETWORK LINK IS INACTIVE",
	[8] = "NETWORK LINK IS ACTIVE",
	[9] = "THIS NODE LOST",
	[10] = "REMOTE NODE LOST",
	[11] = "REMOTE NODE FOUND",
	[12] = "THIS NODE FOUND",
	[13] = "NODE CONNECTION LOST",
	[14] = "NODE CONNECTION FOUND",
	[15] = "CLUSTER QUORUM STATUS CHANGED",
	[16] = "NODE REQUIRE TO RELOAD STATE",
	[17] = "I AM APPEARING LOST"
};
226
/*
 * Printable names of the watchdog node states, by array position.
 * NOTE(review): presumably indexed by WD_STATES values -- the enum is not
 * defined in this file, so keep the positions in sync with it.
 */
char	   *wd_state_names[] = {
	[0] = "DEAD",
	[1] = "LOADING",
	[2] = "JOINING",
	[3] = "INITIALIZING",
	[4] = "LEADER",
	[5] = "PARTICIPATING IN ELECTION",
	[6] = "STANDING FOR LEADER",
	[7] = "STANDBY",
	[8] = "LOST",
	[9] = "IN NETWORK TROUBLE",
	[10] = "SHUTDOWN",
	[11] = "ADD MESSAGE SENT",
	[12] = "NETWORK ISOLATION"
};
242
/*
 * Printable descriptions of why a node was marked lost, by array position.
 * NOTE(review): the enum these positions correspond to is not defined in
 * this file -- keep them in sync with it.
 */
char	   *wd_node_lost_reasons[] = {
	[0] = "UNKNOWN REASON",
	[1] = "REPORTED BY LIFECHECK",
	[2] = "SEND MESSAGE FAILURES",
	[3] = "MISSING BEACON REPLIES",
	[4] = "RECEIVE TIMEOUT",
	[5] = "NOT REACHABLE",
	[6] = "SHUTDOWN"
};
252
/*
 * Printable names of the cluster membership states, by array position.
 * NOTE(review): presumably indexed by WD_NODE_MEMBERSHIP_STATUS values --
 * the enum is not defined in this file, so keep the positions in sync.
 */
char	   *wd_cluster_membership_status[] = {
	[0] = "MEMBER",
	[1] = "REVOKED-SHUTDOWN",
	[2] = "REVOKED-NO-SHOW",
	[3] = "REVOKED-LOST"
};
/*
 * Command packet definition.
 */
typedef struct WDPacketData
{
	/* packet type, e.g. WD_ADD_NODE_MESSAGE; see the WD_* type #defines */
	char		type;
	/* command sequence number starting from 1 */
	int			command_id;
	/* NOTE(review): presumably the number of bytes in data -- confirm */
	int			len;
	char	   *data;
} WDPacketData;
269
270
/*
 * Per-node state of a command, stored in WDCommandNodeResult.cmdState.
 * The numeric values are spelled out; they are the same values the
 * compiler assigns implicitly.
 */
typedef enum WDNodeCommandState
{
	COMMAND_STATE_INIT = 0,
	COMMAND_STATE_SENT = 1,
	COMMAND_STATE_REPLIED = 2,
	COMMAND_STATE_SEND_ERROR = 3,
	COMMAND_STATE_DO_NOT_SEND = 4
} WDNodeCommandState;
279
280 typedef struct WDCommandNodeResult
281 {
282 WatchdogNode *wdNode;
283 WDNodeCommandState cmdState;
284 char result_type;
285 int result_data_len;
286 char *result_data;
287 } WDCommandNodeResult;
288
/*
 * Origin of a command, stored in WDCommandData.commandSource.
 * The numeric values are spelled out; they are the same values the
 * compiler assigns implicitly.
 */
typedef enum WDCommandSource
{
	COMMAND_SOURCE_IPC = 0,
	COMMAND_SOURCE_LOCAL = 1,
	COMMAND_SOURCE_REMOTE = 2,
	COMMAND_SOURCE_INTERNAL = 3
} WDCommandSource;
296
297 /*
298 * Watchdog "function" descriptor. "function" is not a C-function, it's one
299 * of: START_RECOVERY, END_RECOVERY, FAILBACK_REQUEST, DEGENERATE_REQUEST and
300 * PROMOTE_REQUEST. See #define function names (they are prefixed by
301 * "WD_FUNCTION" in src/include/watchdog/wd_ipc_defines.h for more details.
302 */
303 typedef struct WDFunctionCommandData
304 {
305 char commandType;
306 unsigned int commandID;
307 char *funcName; /* function name */
308 WatchdogNode *wdNode;
309 } WDFunctionCommandData;
310
311 typedef struct WDCommandTimerData
312 {
313 struct timeval startTime;
314 unsigned int expire_sec;
315 bool need_tics;
316 WDFunctionCommandData *wd_func_command;
317 } WDCommandTimerData;
318
319
/*
 * Overall status of a command, stored in WDCommandData.commandStatus.
 * The numeric values are spelled out; they are the same values the
 * compiler assigns implicitly.
 */
typedef enum WDCommandStatus
{
	COMMAND_EMPTY = 0,
	COMMAND_IN_PROGRESS = 1,
	COMMAND_FINISHED_TIMEOUT = 2,
	COMMAND_FINISHED_ALL_REPLIED = 3,
	COMMAND_FINISHED_NODE_REJECTED = 4,
	COMMAND_FINISHED_SEND_FAILED = 5
} WDCommandStatus;
329
330 typedef struct WDCommandData
331 {
332 WDPacketData sourcePacket;
333 WDPacketData commandPacket;
334 WDCommandNodeResult *nodeResults;
335 WatchdogNode *sendToNode; /* NULL means send to all */
336 WDCommandStatus commandStatus;
337 unsigned int commandTimeoutSecs;
338 struct timeval commandTime;
339 unsigned int commandSendToCount;
340 unsigned int commandSendToErrorCount;
341 unsigned int commandReplyFromCount;
342 WDCommandSource commandSource;
343 int sourceIPCSocket; /* Only valid for COMMAND_SOURCE_IPC */
344 WatchdogNode *sourceWdNode; /* Only valid for COMMAND_SOURCE_REMOTE */
345 char *errorMessage;
346 MemoryContext memoryContext;
347 void (*commandCompleteFunc) (struct WDCommandData *command);
348 } WDCommandData;
349
/*
 * One network interface tracked in g_cluster.wdInterfaceToMonitor.
 */
typedef struct WDInterfaceStatus
{
	char	   *if_name;		/* interface name */
	unsigned int if_index;		/* interface index */
	bool		if_up;			/* set to true when the entry is created */
} WDInterfaceStatus;
356
357 typedef struct WDClusterLeader
358 {
359 WatchdogNode *leaderNode;
360 WatchdogNode **standbyNodes;
361 int standby_nodes_count;
362 bool holding_vip;
363 } WDClusterLeaderInfo;
364
365 typedef struct wd_cluster
366 {
367 WatchdogNode *localNode;
368 WatchdogNode *remoteNodes;
369 WDClusterLeaderInfo clusterLeaderInfo;
370 int remoteNodeCount;
371 int memberRemoteNodeCount; /* no of nodes that count towards quorum and consensus */
372 int quorum_status;
373 unsigned int nextCommandID;
374 pid_t escalation_pid;
375 pid_t de_escalation_pid;
376 int command_server_sock;
377 int network_monitor_sock;
378 bool clusterInitialized;
379 bool ipc_auth_needed;
380 int current_failover_id;
381 struct timeval last_bcast_srv_msg_time; /* timestamp when last packet was
382 * broadcasted by the local node */
383 char last_bcast_srv_msg;
384
385 List *unidentified_socks;
386 List *notify_clients;
387 List *ipc_command_socks;
388 List *ipc_commands;
389 List *clusterCommands;
390 List *wd_timer_commands;
391 List *wdInterfaceToMonitor;
392 List *wdCurrentFailovers;
393 } wd_cluster;
394
395 typedef struct WDFailoverObject
396 {
397 int id;
398 POOL_REQUEST_KIND reqKind;
399 unsigned char reqFlags;
400 int nodesCount;
401 unsigned int failoverID;
402 int *nodeList;
403 List *requestingNodes;
404 int request_count;
405 struct timeval startTime;
406 int state;
407 } WDFailoverObject;
408
/* WATCHDOG_DEBUG is enabled only when WATCHDOG_DEBUG_OPTS is defined and positive */
#if defined(WATCHDOG_DEBUG_OPTS) && WATCHDOG_DEBUG_OPTS > 0
#define WATCHDOG_DEBUG
#endif
414
/*
 * Forward declarations: debug request predicates. These are declared in
 * every build; the loader of the debug test option is only declared when
 * WATCHDOG_DEBUG is defined.
 */
static bool check_debug_request_do_not_send_beacon(void);
static bool check_debug_request_do_not_reply_beacon(void);
static bool check_debug_request_kill_all_communication(void);
static bool check_debug_request_kill_all_receivers(void);
static bool check_debug_request_kill_all_senders(void);


#ifdef WATCHDOG_DEBUG
static void load_watchdog_debug_test_option(void);
#endif
425
426 static void process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt);
427 static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList);
428 static bool does_int_array_contains_value(int *intArray, int count, int value);
429 static void clear_all_failovers(void);
430 static void remove_failover_object(WDFailoverObject * failoverObj);
431 static void service_expired_failovers(void);
432 static WDFailoverObject * add_failover(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, WatchdogNode * wdNode,
433 unsigned char flags, bool *duplicate);
434 static WDFailoverCMDResults compute_failover_consensus(POOL_REQUEST_KIND reqKind, int *node_id_list,
435 int node_count, unsigned char *flags, WatchdogNode * wdNode);
436
437 static int send_command_packet_to_remote_nodes(WDCommandData * ipcCommand, bool source_included);
438 static void wd_command_is_complete(WDCommandData * ipcCommand);
439 static IPC_CMD_PROCESS_RES wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand, WatchdogNode * wdLostNode);
440
441 volatile sig_atomic_t reload_config_signal = 0;
442 volatile sig_atomic_t sigchld_request = 0;
443
444 static void check_signals(void);
445 static void wd_child_signal_handler(void);
446 static RETSIGTYPE watchdog_signal_handler(int sig);
447 static void FileUnlink(int code, Datum path);
448 static void wd_child_exit(int exit_signo);
449
450 static void wd_cluster_initialize(void);
451 static void wd_initialize_monitoring_interfaces(void);
452 static int wd_create_client_socket(char *hostname, int port, bool *connected);
453 static int connect_with_all_configured_nodes(void);
454 static void try_connecting_with_all_unreachable_nodes(void);
455 static bool connect_to_node(WatchdogNode * wdNode);
456 static bool is_socket_connection_connected(SocketConnection * conn);
457
458 static void service_unreachable_nodes(void);
459
460 static void allocate_resultNodes_in_command(WDCommandData * ipcCommand);
461 static bool is_node_active_and_reachable(WatchdogNode * wdNode);
462 static bool is_node_active(WatchdogNode * wdNode);
463 static bool is_node_reachable(WatchdogNode * wdNode);
464
465 static int update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count);
466 static int prepare_fds(fd_set *rmask, fd_set *wmask, fd_set *emask);
467
468 static void set_next_commandID_in_message(WDPacketData * pkt);
469 static void set_message_commandID(WDPacketData * pkt, unsigned int commandID);
470 static void set_message_data(WDPacketData * pkt, const char *data, int len);
471 static void set_message_type(WDPacketData * pkt, char type);
472 static void free_packet(WDPacketData * pkt);
473
474 static WDPacketData * get_empty_packet(void);
475 static WDPacketData * read_packet_of_type(SocketConnection * conn, char ensure_type);
476 static WDPacketData * read_packet(SocketConnection * conn);
477 static WDPacketData * get_message_of_type(char type, WDPacketData * replyFor);
478 static WDPacketData * get_addnode_message(void);
479 static WDPacketData * get_beacon_message(char type, WDPacketData * replyFor);
480 static WDPacketData * get_mynode_info_message(WDPacketData * replyFor);
481 static WDPacketData * get_minimum_message(char type, WDPacketData * replyFor);
482
483
484 static int issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int timeout_sec);
485 static void check_for_current_command_timeout(void);
486 static bool watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt);
487 static bool service_lost_connections(void);
488 static void service_ipc_commands(void);
489 static void service_internal_command(void);
490
491 static unsigned int get_next_commandID(void);
492 static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey);
493 static void update_quorum_status(void);
494 static int get_minimum_remote_nodes_required_for_quorum(void);
495 static int get_minimum_votes_to_resolve_consensus(void);
496
497 static bool write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket);
498 static int read_sockets(fd_set *rmask, int pending_fds_count);
499 static void set_timeout(unsigned int sec);
500 static int wd_create_command_server_socket(void);
501 static void close_socket_connection(SocketConnection * conn);
502 static bool send_message_to_connection(SocketConnection * conn, WDPacketData * pkt);
503
504 static int send_message(WatchdogNode * wdNode, WDPacketData * pkt);
505 static bool send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt);
506 static bool reply_with_minimal_message(WatchdogNode * wdNode, char type, WDPacketData * replyFor);
507 static bool reply_with_message(WatchdogNode * wdNode, char type, char *data, int data_len, WDPacketData * replyFor);
508 static int send_cluster_command(WatchdogNode * wdNode, char type, int timeout_sec);
509 static int send_message_of_type(WatchdogNode * wdNode, char type, WDPacketData * replyFor);
510
511 static bool send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message);
512
513
514 static int accept_incoming_connections(fd_set *rmask, int pending_fds_count);
515
516 static int standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt);
517 static void cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt);
518 static int get_cluster_node_count(void);
519 static void clear_command_node_result(WDCommandNodeResult * nodeResult);
520
521 static inline bool is_local_node_true_leader(void);
522 static inline WD_STATES get_local_node_state(void);
523 static int set_state(WD_STATES newState);
524
525 static int watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
526 static int watchdog_state_machine_voting(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
527 static int watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
528 static int watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
529 static int watchdog_state_machine_initializing(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
530 static int watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
531 static int watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
532 static int watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
533 static int watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
534 static int watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
535
536 static int I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode);
537 static void handle_split_brain(WatchdogNode * otherLeaderNode, WDPacketData * pkt);
538 static bool beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt);
539
540 static void cleanUpIPCCommand(WDCommandData * ipcCommand);
541 static bool read_ipc_socket_and_process(int socket, bool *remove_socket);
542
543 static JsonNode * get_node_list_json(int id);
544 static bool add_nodeinfo_to_json(JsonNode * jNode, WatchdogNode * node);
545 static bool fire_node_status_event(int nodeID, int nodeStatus);
546 static void resign_from_escalated_node(void);
547 static void start_escalated_node(void);
548 static void init_wd_packet(WDPacketData * pkt);
549 static void wd_packet_shallow_copy(WDPacketData * srcPkt, WDPacketData * dstPkt);
550 static bool wd_commands_packet_processor(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt);
551
552 static WDCommandData * get_wd_command_from_reply(List *commands, WDPacketData * pkt);
553 static WDCommandData * get_wd_cluster_command_from_reply(WDPacketData * pkt);
554 static WDCommandData * get_wd_IPC_command_from_reply(WDPacketData * pkt);
555 static WDCommandData * get_wd_IPC_command_from_socket(int sock);
556
557 static IPC_CMD_PROCESS_RES process_IPC_command(WDCommandData * ipcCommand);
558 static IPC_CMD_PROCESS_RES process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand);
559 static IPC_CMD_PROCESS_RES process_IPC_nodeList_command(WDCommandData * ipcCommand);
560 static IPC_CMD_PROCESS_RES process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand);
561 static IPC_CMD_PROCESS_RES process_IPC_online_recovery(WDCommandData * ipcCommand);
562 static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCommand);
563 static IPC_CMD_PROCESS_RES process_IPC_data_request_from_leader(WDCommandData * ipcCommand);
564 static IPC_CMD_PROCESS_RES process_IPC_failover_command(WDCommandData * ipcCommand);
565 static IPC_CMD_PROCESS_RES process_failover_command_on_coordinator(WDCommandData * ipcCommand);
566 static IPC_CMD_PROCESS_RES process_IPC_execute_cluster_command(WDCommandData * ipcCommand);
567
568 static bool write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char *data, int len);
569
570 static void process_wd_func_commands_for_timer_events(void);
571 static void add_wd_command_for_timer_events(unsigned int expire_secs, bool need_tics, WDFunctionCommandData * wd_func_command);
572 static bool reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * ipcCommand);
573
574 static void process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt);
575
576 static WDFailoverCMDResults failover_end_indication(WDCommandData * ipcCommand);
577 static WDFailoverCMDResults failover_start_indication(WDCommandData * ipcCommand);
578
579 static void wd_system_will_go_down(int code, Datum arg);
580 static void verify_pool_configurations(WatchdogNode * wdNode, POOL_CONFIG * config);
581
582 static bool get_authhash_for_node(WatchdogNode * wdNode, char *authhash);
583 static bool verify_authhash_for_node(WatchdogNode * wdNode, char *authhash);
584
585 static void print_watchdog_node_info(WatchdogNode * wdNode);
586 static int wd_create_recv_socket(int port);
587 static void wd_check_config(void);
588 static pid_t watchdog_main(void);
589 static pid_t fork_watchdog_child(void);
590 static bool check_IPC_client_authentication(json_value * rootObj, bool internal_client_only);
591 static bool check_and_report_IPC_authentication(WDCommandData * ipcCommand);
592
593 static void print_packet_node_info(WDPacketData * pkt, WatchdogNode * wdNode, bool sending);
594 static void print_packet_info(WDPacketData * pkt, bool sending);
595 static void update_interface_status(void);
596 static bool any_interface_available(void);
597 static WDPacketData * process_data_request(WatchdogNode * wdNode, WDPacketData * pkt);
598
599 static WatchdogNode * getLeaderWatchdogNode(void);
600 static void set_cluster_leader_node(WatchdogNode * wdNode);
601 static void clear_standby_nodes_list(void);
602 static int standby_node_left_cluster(WatchdogNode * wdNode);
603 static int standby_node_join_cluster(WatchdogNode * wdNode);
604 static void reset_lost_timers(void);
605 static int update_cluster_memberships(void);
606 static int revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status);
607 static int restore_cluster_membership_of_node(WatchdogNode* wdNode);
608 static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
609 static void wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt);
610
611 /* global variables */
612 wd_cluster g_cluster;
613 struct timeval g_tm_set_time;
614 int g_timeout_sec = 0;
615
616 static unsigned int
get_next_commandID(void)617 get_next_commandID(void)
618 {
619 return ++g_cluster.nextCommandID;
620 }
621
622 static void
set_timeout(unsigned int sec)623 set_timeout(unsigned int sec)
624 {
625 g_timeout_sec = sec;
626 gettimeofday(&g_tm_set_time, NULL);
627 }
628
629 pid_t
initialize_watchdog(void)630 initialize_watchdog(void)
631 {
632 if (!pool_config->use_watchdog)
633 return -1;
634 /* check pool_config data related to watchdog */
635 wd_check_config();
636 return fork_watchdog_child();
637 }
638
639 static void
wd_check_config(void)640 wd_check_config(void)
641 {
642 if (pool_config->wd_nodes.num_wd == 0)
643 ereport(ERROR,
644 (errmsg("invalid watchdog configuration. no watchdog nodes configured")));
645
646 if (strlen(pool_config->wd_authkey) > MAX_PASSWORD_SIZE)
647 ereport(ERROR,
648 (errmsg("invalid watchdog configuration. wd_authkey length can't be larger than %d",
649 MAX_PASSWORD_SIZE)));
650 if (pool_config->wd_lifecheck_method == LIFECHECK_BY_HB)
651 {
652 if (pool_config->num_hb_dest_if <= 0)
653 ereport(ERROR,
654 (errmsg("invalid lifecheck configuration. no heartbeat interfaces defined")));
655 }
656 }
657
658 static void
wd_initialize_monitoring_interfaces(void)659 wd_initialize_monitoring_interfaces(void)
660 {
661 g_cluster.wdInterfaceToMonitor = NULL;
662
663 if (pool_config->num_wd_monitoring_interfaces_list <= 0)
664 {
665 ereport(LOG,
666 (errmsg("interface monitoring is disabled in watchdog")));
667 return;
668 }
669
670 if (strcasecmp("any", pool_config->wd_monitoring_interfaces_list[0]) == 0)
671 {
672 struct if_nameindex *if_ni,
673 *idx;
674
675 ereport(LOG,
676 (errmsg("ensure availability on any interface")));
677
678 if_ni = if_nameindex();
679 if (if_ni == NULL)
680 {
681 ereport(ERROR,
682 (errmsg("initializing watchdog failed. unable to get network interface information")));
683 }
684
685 for (idx = if_ni; !(idx->if_index == 0 && idx->if_name == NULL); idx++)
686 {
687 WDInterfaceStatus *if_status;
688
689 ereport(DEBUG1,
690 (errmsg("interface name %s at index %d", idx->if_name, idx->if_index)));
691 if (strncasecmp("lo", idx->if_name, 2) == 0)
692 {
693 /* ignoring local interface */
694 continue;
695 }
696 if_status = palloc(sizeof(WDInterfaceStatus));
697 if_status->if_name = pstrdup(idx->if_name);
698 if_status->if_index = idx->if_index;
699 if_status->if_up = true; /* start with optimism */
700 g_cluster.wdInterfaceToMonitor = lappend(g_cluster.wdInterfaceToMonitor, if_status);
701 }
702 if_freenameindex(if_ni);
703 }
704 else
705 {
706 WDInterfaceStatus *if_status;
707 char *if_name;
708 int i;
709 unsigned int if_idx;
710
711 for (i = 0; i < pool_config->num_wd_monitoring_interfaces_list; i++)
712 {
713 if_name = pool_config->wd_monitoring_interfaces_list[i];
714 /* ignore leading spaces */
715 while (*if_name && isspace(*if_name))
716 if_name++;
717
718 if_idx = if_nametoindex(if_name);
719 if (if_idx == 0)
720 ereport(ERROR,
721 (errmsg("initializing watchdog failed. invalid interface name \"%s\"", pool_config->wd_monitoring_interfaces_list[0])));
722
723 ereport(DEBUG1,
724 (errmsg("adding monitoring interface [%d] name %s index %d", i, if_name, if_idx)));
725
726 if_status = palloc(sizeof(WDInterfaceStatus));
727 if_status->if_name = pstrdup(if_name);
728 if_status->if_index = if_idx;
729 if_status->if_up = true; /* start with optimism */
730 g_cluster.wdInterfaceToMonitor = lappend(g_cluster.wdInterfaceToMonitor, if_status);
731 }
732 }
733 }
734
735 static void
wd_cluster_initialize(void)736 wd_cluster_initialize(void)
737 {
738 int i = 0;
739 int pgpool_node_id = pool_config->pgpool_node_id;
740
741 if (pool_config->wd_nodes.num_wd <= 0)
742 {
743 /* should also have upper limit??? */
744 ereport(ERROR,
745 (errmsg("initializing watchdog failed. no watchdog nodes configured")));
746 }
747
748 /* initialize local node settings */
749 g_cluster.localNode = palloc0(sizeof(WatchdogNode));
750 g_cluster.localNode->wd_port = pool_config->wd_nodes.wd_node_info[pgpool_node_id].wd_port;
751 g_cluster.localNode->pgpool_port = pool_config->wd_nodes.wd_node_info[pgpool_node_id].pgpool_port;
752 g_cluster.localNode->wd_priority = pool_config->wd_priority;
753 g_cluster.localNode->pgpool_node_id = pool_config->pgpool_node_id;
754 gettimeofday(&g_cluster.localNode->startup_time, NULL);
755
756 strncpy(g_cluster.localNode->hostname, pool_config->wd_nodes.wd_node_info[pgpool_node_id].hostname, sizeof(g_cluster.localNode->hostname) - 1);
757 strncpy(g_cluster.localNode->delegate_ip, pool_config->delegate_IP, sizeof(g_cluster.localNode->delegate_ip) - 1);
758 /* Assign the node name */
759 {
760 struct utsname unameData;
761
762 uname(&unameData);
763 snprintf(g_cluster.localNode->nodeName, sizeof(g_cluster.localNode->nodeName), "%s:%d %s %s",
764 pool_config->wd_nodes.wd_node_info[pgpool_node_id].hostname,
765 pool_config->wd_nodes.wd_node_info[pgpool_node_id].pgpool_port,
766 unameData.sysname,
767 unameData.nodename);
768 /* should also have upper limit??? */
769 ereport(LOG,
770 (errmsg("setting the local watchdog node name to \"%s\"", g_cluster.localNode->nodeName)));
771 }
772
773 /* initialize remote nodes */
774 g_cluster.remoteNodeCount = pool_config->wd_nodes.num_wd - 1;
775 g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
776 if (g_cluster.remoteNodeCount == 0)
777 ereport(ERROR,
778 (errmsg("invalid watchdog configuration. other pgpools setting is not defined")));
779 ereport(LOG,
780 (errmsg("watchdog cluster is configured with %d remote nodes", g_cluster.remoteNodeCount)));
781 g_cluster.remoteNodes = palloc0((sizeof(WatchdogNode) * g_cluster.remoteNodeCount));
782 int idx = 0;
783 for (i = 0; i < pool_config->wd_nodes.num_wd; i++)
784 {
785 if (i == pool_config->pgpool_node_id)
786 continue;
787
788 g_cluster.remoteNodes[idx].wd_port = pool_config->wd_nodes.wd_node_info[i].wd_port;
789 g_cluster.remoteNodes[idx].pgpool_node_id = i;
790 g_cluster.remoteNodes[idx].pgpool_port = pool_config->wd_nodes.wd_node_info[i].pgpool_port;
791 strcpy(g_cluster.remoteNodes[idx].hostname, pool_config->wd_nodes.wd_node_info[i].hostname);
792 g_cluster.remoteNodes[idx].delegate_ip[0] = '\0'; /* this will be
793 * populated by remote
794 * node */
795
796 ereport(LOG,
797 (errmsg("watchdog remote node:%d on %s:%d", idx, g_cluster.remoteNodes[idx].hostname, g_cluster.remoteNodes[idx].wd_port)));
798
799 idx++;
800 }
801
802 g_cluster.clusterLeaderInfo.leaderNode = NULL;
803 g_cluster.clusterLeaderInfo.standbyNodes = palloc0(sizeof(WatchdogNode *) * g_cluster.remoteNodeCount);
804 g_cluster.clusterLeaderInfo.standby_nodes_count = 0;
805 g_cluster.clusterLeaderInfo.holding_vip = false;
806 g_cluster.quorum_status = -1;
807 g_cluster.nextCommandID = 1;
808 g_cluster.clusterInitialized = false;
809 g_cluster.escalation_pid = 0;
810 g_cluster.de_escalation_pid = 0;
811 g_cluster.unidentified_socks = NULL;
812 g_cluster.command_server_sock = 0;
813 g_cluster.notify_clients = NULL;
814 g_cluster.ipc_command_socks = NULL;
815 g_cluster.wd_timer_commands = NULL;
816 g_cluster.wdCurrentFailovers = NULL;
817 g_cluster.ipc_commands = NULL;
818 g_cluster.localNode->state = WD_DEAD;
819 g_cluster.clusterCommands = NULL;
820 g_cluster.ipc_auth_needed = strlen(pool_config->wd_authkey) ? true : false;
821
822 g_cluster.localNode->escalated = get_watchdog_node_escalation_state();
823
824 wd_initialize_monitoring_interfaces();
825 if (g_cluster.ipc_auth_needed)
826 {
827 #ifndef USE_SSL
828 ereport(LOG,
829 (errmsg("watchdog is configured to use authentication, but pgpool-II is built without SSL support"),
830 errdetail("The authentication method used by pgpool-II without the SSL support is known to be weak")));
831 #endif
832 }
833 if (get_watchdog_process_needs_cleanup())
834 {
835 ereport(LOG,
836 (errmsg("watchdog is recovering from the crash of watchdog process")));
837
838 /*
839 * If we are recovering from crash or abnormal termination de-escalate
840 * the node if it was coordinator when it crashed
841 */
842 resign_from_escalated_node();
843 }
844 }
845
846 static void
clear_command_node_result(WDCommandNodeResult * nodeResult)847 clear_command_node_result(WDCommandNodeResult * nodeResult)
848 {
849 nodeResult->result_type = WD_NO_MESSAGE;
850 nodeResult->result_data = NULL;
851 nodeResult->result_data_len = 0;
852 nodeResult->cmdState = COMMAND_STATE_INIT;
853 }
854
855 static int
wd_create_recv_socket(int port)856 wd_create_recv_socket(int port)
857 {
858 size_t len = 0;
859 struct sockaddr_in addr;
860 int one = 1;
861 int sock = -1;
862 int saved_errno;
863
864 if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
865 {
866 /* socket create failed */
867 ereport(ERROR,
868 (errmsg("failed to create watchdog receive socket"),
869 errdetail("create socket failed with reason: \"%m\"")));
870 }
871
872 socket_set_nonblock(sock);
873
874 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one)) == -1)
875 {
876 /* setsockopt(SO_REUSEADDR) failed */
877 saved_errno = errno;
878 close(sock);
879 ereport(ERROR,
880 (errmsg("failed to create watchdog receive socket"),
881 errdetail("setsockopt(SO_REUSEADDR) failed with reason: \"%s\"", strerror(saved_errno))));
882 }
883 if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)
884 {
885 /* setsockopt(TCP_NODELAY) failed */
886 saved_errno = errno;
887 close(sock);
888 ereport(ERROR,
889 (errmsg("failed to create watchdog receive socket"),
890 errdetail("setsockopt(TCP_NODELAY) failed with reason: \"%s\"", strerror(saved_errno))));
891 }
892 if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) == -1)
893 {
894 /* setsockopt(SO_KEEPALIVE) failed */
895 saved_errno = errno;
896 close(sock);
897 ereport(ERROR,
898 (errmsg("failed to create watchdog receive socket"),
899 errdetail("setsockopt(SO_KEEPALIVE) failed with reason: \"%s\"", strerror(saved_errno))));
900 }
901
902 addr.sin_family = AF_INET;
903 addr.sin_addr.s_addr = htonl(INADDR_ANY);
904 addr.sin_port = htons(port);
905 len = sizeof(struct sockaddr_in);
906
907 if (bind(sock, (struct sockaddr *) &addr, len) < 0)
908 {
909 /* bind failed */
910 saved_errno = errno;
911 close(sock);
912 ereport(ERROR,
913 (errmsg("failed to create watchdog receive socket"),
914 errdetail("bind on \"TCP:%d\" failed with reason: \"%s\"", port, strerror(saved_errno))));
915 }
916
917 if (listen(sock, MAX_WATCHDOG_NUM * 2) < 0)
918 {
919 /* listen failed */
920 saved_errno = errno;
921 close(sock);
922 ereport(ERROR,
923 (errmsg("failed to create watchdog receive socket"),
924 errdetail("listen failed with reason: \"%s\"", strerror(saved_errno))));
925 }
926
927 return sock;
928 }
929
930
931
932 /*
933 * creates a socket in non blocking mode and connects it to the hostname and port
934 * the out parameter connected is set to true if the connection is successful
935 */
936 static int
wd_create_client_socket(char * hostname,int port,bool * connected)937 wd_create_client_socket(char *hostname, int port, bool *connected)
938 {
939 int sock;
940 int one = 1;
941 size_t len = 0;
942 struct sockaddr_in addr;
943 struct hostent *hp;
944
945 *connected = false;
946 /* create socket */
947 if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
948 {
949 /* socket create failed */
950 ereport(LOG,
951 (errmsg("create socket failed with reason: \"%m\"")));
952 return -1;
953 }
954
955 /* set socket option */
956 if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)
957 {
958 close(sock);
959 ereport(LOG,
960 (errmsg("failed to set socket options"),
961 errdetail("setsockopt(TCP_NODELAY) failed with error: \"%m\"")));
962 return -1;
963 }
964 if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) == -1)
965 {
966 ereport(LOG,
967 (errmsg("failed to set socket options"),
968 errdetail("setsockopt(SO_KEEPALIVE) failed with error: \"%m\"")));
969 close(sock);
970 return -1;
971 }
972 /* set sockaddr_in */
973 memset(&addr, 0, sizeof(addr));
974 addr.sin_family = AF_INET;
975 hp = gethostbyname(hostname);
976 if ((hp == NULL) || (hp->h_addrtype != AF_INET))
977 {
978 hp = gethostbyaddr(hostname, strlen(hostname), AF_INET);
979 if ((hp == NULL) || (hp->h_addrtype != AF_INET))
980 {
981 ereport(LOG,
982 (errmsg("failed to get host address for \"%s\"", hostname),
983 errdetail("gethostbyaddr failed with error: \"%s\"", hstrerror(h_errno))));
984 close(sock);
985 return -1;
986 }
987 }
988 memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length);
989 addr.sin_port = htons(port);
990 len = sizeof(struct sockaddr_in);
991
992 /* set socket to non blocking */
993 socket_set_nonblock(sock);
994
995 if (connect(sock, (struct sockaddr *) &addr, len) < 0)
996 {
997 if (errno == EINPROGRESS)
998 {
999 return sock;
1000 }
1001 if (errno == EISCONN)
1002 {
1003 socket_unset_nonblock(sock);
1004 *connected = true;
1005 return sock;
1006 }
1007 ereport(LOG,
1008 (errmsg("connect on socket failed"),
1009 errdetail("connect failed with error: \"%m\"")));
1010 close(sock);
1011 return -1;
1012 }
1013 /* set socket to blocking again */
1014 socket_unset_nonblock(sock);
1015 *connected = true;
1016 return sock;
1017 }
1018
1019 /* returns the number of successful connections */
1020 static int
connect_with_all_configured_nodes(void)1021 connect_with_all_configured_nodes(void)
1022 {
1023 int connect_count = 0;
1024 int i;
1025
1026 for (i = 0; i < g_cluster.remoteNodeCount; i++)
1027 {
1028 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1029
1030 if (connect_to_node(wdNode))
1031 connect_count++;
1032 }
1033 return connect_count;
1034 }
1035
1036 /*
1037 * Function tries to connect with nodes which have both sockets
1038 * disconnected
1039 */
1040 static void
try_connecting_with_all_unreachable_nodes(void)1041 try_connecting_with_all_unreachable_nodes(void)
1042 {
1043 int i;
1044
1045 for (i = 0; i < g_cluster.remoteNodeCount; i++)
1046 {
1047 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1048
1049 if (wdNode->client_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT && wdNode->client_socket.sock_state != WD_SOCK_CONNECTED &&
1050 wdNode->server_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT && wdNode->server_socket.sock_state != WD_SOCK_CONNECTED)
1051 {
1052 if (wdNode->state == WD_SHUTDOWN)
1053 continue;
1054 connect_to_node(wdNode);
1055 if (wdNode->client_socket.sock_state == WD_SOCK_CONNECTED)
1056 {
1057 ereport(LOG,
1058 (errmsg("connection to the remote node \"%s\" is restored", wdNode->nodeName)));
1059 watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
1060 }
1061 }
1062 }
1063 }
1064
1065 /*
1066 * returns true if the connection is in progress or connected successfully
1067 * false is returned in case of failure
1068 */
1069 static bool
connect_to_node(WatchdogNode * wdNode)1070 connect_to_node(WatchdogNode * wdNode)
1071 {
1072 bool connected = false;
1073
1074 wdNode->client_socket.sock = wd_create_client_socket(wdNode->hostname, wdNode->wd_port, &connected);
1075 gettimeofday(&wdNode->client_socket.tv, NULL);
1076 if (wdNode->client_socket.sock <= 0)
1077 {
1078 wdNode->client_socket.sock_state = WD_SOCK_ERROR;
1079 ereport(DEBUG1,
1080 (errmsg("outbound connection to \"%s:%d\" failed", wdNode->hostname, wdNode->wd_port)));
1081 }
1082 else
1083 {
1084 if (connected)
1085 wdNode->client_socket.sock_state = WD_SOCK_CONNECTED;
1086 else
1087 wdNode->client_socket.sock_state = WD_SOCK_WAITING_FOR_CONNECT;
1088 }
1089 return (wdNode->client_socket.sock_state != WD_SOCK_ERROR);
1090 }
1091
1092 /* signal handler for SIGHUP and SIGCHLD handler */
watchdog_signal_handler(int sig)1093 static RETSIGTYPE watchdog_signal_handler(int sig)
1094 {
1095 if (sig == SIGHUP)
1096 reload_config_signal = 1;
1097 else if (sig == SIGCHLD)
1098 sigchld_request = 1;
1099 }
1100
1101 static void
check_signals(void)1102 check_signals(void)
1103 {
1104 /* reload config file signal? */
1105 if (reload_config_signal)
1106 {
1107 MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
1108
1109 pool_get_config(get_config_file_name(), CFGCXT_RELOAD);
1110 MemoryContextSwitchTo(oldContext);
1111 reload_config_signal = 0;
1112 }
1113 else if (sigchld_request)
1114 {
1115 wd_child_signal_handler();
1116 }
1117 }
1118
1119
1120 /*
1121 * fork a child for watchdog
1122 */
1123 static pid_t
fork_watchdog_child(void)1124 fork_watchdog_child(void)
1125 {
1126 pid_t pid;
1127
1128 pid = fork();
1129
1130 if (pid == 0)
1131 {
1132 on_exit_reset();
1133
1134 SetProcessGlobalVariables(PT_WATCHDOG);
1135
1136 /* call watchdog child main */
1137 POOL_SETMASK(&UnBlockSig);
1138 watchdog_main();
1139 }
1140 else if (pid == -1)
1141 {
1142 ereport(FATAL,
1143 (return_code(POOL_EXIT_FATAL),
1144 errmsg("fork() failed"),
1145 errdetail("%m")));
1146 }
1147
1148 return pid;
1149 }
1150
1151 /* Never returns */
1152 static int
watchdog_main(void)1153 watchdog_main(void)
1154 {
1155 fd_set rmask;
1156 fd_set wmask;
1157 fd_set emask;
1158 const int select_timeout = 1;
1159 struct timeval tv,
1160 ref_time;
1161
1162 volatile int fd;
1163 sigjmp_buf local_sigjmp_buf;
1164
1165 pool_signal(SIGTERM, wd_child_exit);
1166 pool_signal(SIGINT, wd_child_exit);
1167 pool_signal(SIGQUIT, wd_child_exit);
1168 pool_signal(SIGHUP, watchdog_signal_handler);
1169 pool_signal(SIGCHLD, watchdog_signal_handler);
1170 pool_signal(SIGUSR1, SIG_IGN);
1171 pool_signal(SIGUSR2, SIG_IGN);
1172 pool_signal(SIGPIPE, SIG_IGN);
1173 pool_signal(SIGALRM, SIG_IGN);
1174
1175 init_ps_display("", "", "", "");
1176
1177 /* Create per loop iteration memory context */
1178 ProcessLoopContext = AllocSetContextCreate(TopMemoryContext,
1179 "wd_child_main_loop",
1180 ALLOCSET_DEFAULT_MINSIZE,
1181 ALLOCSET_DEFAULT_INITSIZE,
1182 ALLOCSET_DEFAULT_MAXSIZE);
1183
1184 MemoryContextSwitchTo(TopMemoryContext);
1185
1186 set_ps_display("watchdog", false);
1187
1188 /* initialize all the local structures for watchdog */
1189 wd_cluster_initialize();
1190 /* create a server socket for incoming watchdog connections */
1191 g_cluster.localNode->server_socket.sock = wd_create_recv_socket(g_cluster.localNode->wd_port);
1192 g_cluster.localNode->server_socket.sock_state = WD_SOCK_CONNECTED;
1193 /* open the command server */
1194 g_cluster.command_server_sock = wd_create_command_server_socket();
1195
1196 /* try connecting to all watchdog nodes */
1197 g_cluster.network_monitor_sock = create_monitoring_socket();
1198
1199 if (any_interface_available() == false)
1200 {
1201 ereport(FATAL,
1202 (return_code(POOL_EXIT_FATAL),
1203 errmsg("no valid network interface is active"),
1204 errdetail("watchdog requires at least one valid network interface to continue"),
1205 errhint("you can disable interface checking by setting wd_monitoring_interfaces_list = '' in pgpool config")));
1206 }
1207
1208 connect_with_all_configured_nodes();
1209
1210 /* set the initial state of local node */
1211 set_state(WD_LOADING);
1212
1213 /*
1214 * install the callback for the preparation of system exit
1215 */
1216 on_system_exit(wd_system_will_go_down, (Datum) NULL);
1217
1218 if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1219 {
1220 /* Since not using PG_TRY, must reset error stack by hand */
1221 if (fd > 0)
1222 close(fd);
1223
1224 error_context_stack = NULL;
1225
1226 EmitErrorReport();
1227 MemoryContextSwitchTo(TopMemoryContext);
1228 FlushErrorState();
1229 }
1230
1231 /* We can now handle ereport(ERROR) */
1232 PG_exception_stack = &local_sigjmp_buf;
1233 reset_watchdog_process_needs_cleanup();
1234 /* watchdog child loop */
1235 for (;;)
1236 {
1237 int fd_max,
1238 select_ret;
1239 bool timeout_event = false;
1240
1241 MemoryContextSwitchTo(ProcessLoopContext);
1242 MemoryContextResetAndDeleteChildren(ProcessLoopContext);
1243
1244 /* take care config reload request and SIGCHLD */
1245 check_signals();
1246
1247 /*
1248 * Establish all accepting socket descriptors and wait for
1249 * incoming/outcoming events for up to 1 second.
1250 */
1251 fd_max = prepare_fds(&rmask, &wmask, &emask);
1252 tv.tv_sec = select_timeout;
1253 tv.tv_usec = 0;
1254 select_ret = select(fd_max + 1, &rmask, &wmask, &emask, &tv);
1255
1256 gettimeofday(&ref_time, NULL);
1257
1258 if (g_timeout_sec > 0)
1259 {
1260 if (WD_TIME_DIFF_SEC(ref_time, g_tm_set_time) >= g_timeout_sec)
1261 {
1262 timeout_event = true;
1263 g_timeout_sec = 0;
1264 }
1265 }
1266 #ifdef WATCHDOG_DEBUG
1267 load_watchdog_debug_test_option();
1268 #endif
1269 /* process events */
1270 if (select_ret > 0)
1271 {
1272 int processed_fds = 0;
1273
1274 processed_fds += accept_incoming_connections(&rmask, (select_ret - processed_fds));
1275 processed_fds += update_successful_outgoing_cons(&wmask, (select_ret - processed_fds));
1276 processed_fds += read_sockets(&rmask, (select_ret - processed_fds));
1277 }
1278
1279 /*
1280 * Take care online recovery
1281 */
1282 if (WD_TIME_DIFF_SEC(ref_time, g_tm_set_time) >= 1)
1283 {
1284 process_wd_func_commands_for_timer_events();
1285 }
1286
1287 if (timeout_event)
1288 {
1289 g_timeout_sec = 0;
1290 watchdog_state_machine(WD_EVENT_TIMEOUT, NULL, NULL, NULL);
1291 }
1292
1293 check_for_current_command_timeout();
1294
1295 /*
1296 * If any of connections to remote nodes are established, send
1297 * commands to the remote nodes.
1298 */
1299 if (service_lost_connections() == true)
1300 {
1301 service_internal_command();
1302 service_ipc_commands();
1303 }
1304
1305 /*
1306 * Remove the unreachable nodes from cluster
1307 */
1308 service_unreachable_nodes();
1309
1310 /*
1311 * If I am the leader, update the quorum status.
1312 */
1313 if (get_local_node_state() == WD_COORDINATOR)
1314 {
1315 update_quorum_status();
1316 }
1317
1318 /*
1319 * Remove any expired failover command (had spent over 15 seconds
1320 * (FAILOVER_COMMAND_FINISH_TIMEOUT)
1321 */
1322 service_expired_failovers();
1323 }
1324 return 0;
1325 }
1326
1327 static int
wd_create_command_server_socket(void)1328 wd_create_command_server_socket(void)
1329 {
1330 size_t len = 0;
1331 struct sockaddr_un addr;
1332 int sock = -1;
1333
1334 /* We use unix domain stream sockets for the purpose */
1335 if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
1336 {
1337 /* socket create failed */
1338 ereport(FATAL,
1339 (return_code(POOL_EXIT_FATAL),
1340 errmsg("failed to create watchdog command server socket"),
1341 errdetail("create socket failed with reason: \"%m\"")));
1342 }
1343 memset((char *) &addr, 0, sizeof(addr));
1344 addr.sun_family = AF_UNIX;
1345 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", get_watchdog_ipc_address());
1346 len = sizeof(struct sockaddr_un);
1347
1348 ereport(INFO,
1349 (errmsg("IPC socket path: \"%s\"", get_watchdog_ipc_address())));
1350
1351 /* Delete any pre-existing socket file to avoid failure at bind() time */
1352 unlink(addr.sun_path);
1353
1354 if (bind(sock, (struct sockaddr *) &addr, len) == -1)
1355 {
1356 int saved_errno = errno;
1357
1358 close(sock);
1359 unlink(addr.sun_path);
1360 ereport(FATAL,
1361 (return_code(POOL_EXIT_FATAL),
1362 errmsg("failed to create watchdog command server socket"),
1363 errdetail("bind on \"%s\" failed with reason: \"%s\"", addr.sun_path, strerror(saved_errno))));
1364 }
1365
1366 if (listen(sock, 5) < 0)
1367 {
1368 /* listen failed */
1369 int saved_errno = errno;
1370
1371 close(sock);
1372 unlink(addr.sun_path);
1373 ereport(FATAL,
1374 (return_code(POOL_EXIT_FATAL),
1375 errmsg("failed to create watchdog command server socket"),
1376 errdetail("listen failed with reason: \"%s\"", strerror(saved_errno))));
1377 }
1378 on_proc_exit(FileUnlink, (Datum) pstrdup(addr.sun_path));
1379 return sock;
1380 }
1381
1382 static void
FileUnlink(int code,Datum path)1383 FileUnlink(int code, Datum path)
1384 {
1385 char *filePath = (char *) path;
1386
1387 unlink(filePath);
1388 }
1389
1390
1391 /*
1392 * sets all the valid watchdog cluster descriptors to the fd_set.
1393 returns the fd_max */
1394 static int
prepare_fds(fd_set * rmask,fd_set * wmask,fd_set * emask)1395 prepare_fds(fd_set *rmask, fd_set *wmask, fd_set *emask)
1396 {
1397 int i;
1398 ListCell *lc;
1399 int fd_max = g_cluster.localNode->server_socket.sock;
1400
1401 FD_ZERO(rmask);
1402 FD_ZERO(wmask);
1403 FD_ZERO(emask);
1404
1405 /* local node server socket will set the read and exception fds */
1406 FD_SET(g_cluster.localNode->server_socket.sock, rmask);
1407 FD_SET(g_cluster.localNode->server_socket.sock, emask);
1408
1409 /* command server socket will set the read and exception fds */
1410 FD_SET(g_cluster.command_server_sock, rmask);
1411 FD_SET(g_cluster.command_server_sock, emask);
1412 if (fd_max < g_cluster.command_server_sock)
1413 fd_max = g_cluster.command_server_sock;
1414
1415 FD_SET(g_cluster.network_monitor_sock, rmask);
1416 if (fd_max < g_cluster.network_monitor_sock)
1417 fd_max = g_cluster.network_monitor_sock;
1418
1419 /*
1420 * set write fdset for all waiting for connection sockets, while already
1421 * connected will be only be waiting for read
1422 */
1423 for (i = 0; i < g_cluster.remoteNodeCount; i++)
1424 {
1425 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1426
1427 if (wdNode->client_socket.sock > 0)
1428 {
1429 if (fd_max < wdNode->client_socket.sock)
1430 fd_max = wdNode->client_socket.sock;
1431
1432 FD_SET(wdNode->client_socket.sock, emask);
1433
1434 if (wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
1435 FD_SET(wdNode->client_socket.sock, wmask);
1436 else
1437 FD_SET(wdNode->client_socket.sock, rmask);
1438 }
1439 if (wdNode->server_socket.sock > 0)
1440 {
1441 if (fd_max < wdNode->server_socket.sock)
1442 fd_max = wdNode->server_socket.sock;
1443
1444 FD_SET(wdNode->server_socket.sock, emask);
1445 FD_SET(wdNode->server_socket.sock, rmask);
1446 }
1447 }
1448
1449 /*
1450 * I know this is getting complex but we need to add all incoming
1451 * unassigned connection sockets these one will go for reading
1452 */
1453 foreach(lc, g_cluster.unidentified_socks)
1454 {
1455 SocketConnection *conn = lfirst(lc);
1456 int ui_sock = conn->sock;
1457
1458 if (ui_sock > 0)
1459 {
1460 FD_SET(ui_sock, rmask);
1461 FD_SET(ui_sock, emask);
1462 if (fd_max < ui_sock)
1463 fd_max = ui_sock;
1464 }
1465 }
1466
1467 /* Add the notification connected clients */
1468 foreach(lc, g_cluster.notify_clients)
1469 {
1470 int ui_sock = lfirst_int(lc);
1471
1472 if (ui_sock > 0)
1473 {
1474 FD_SET(ui_sock, rmask);
1475 FD_SET(ui_sock, emask);
1476 if (fd_max < ui_sock)
1477 fd_max = ui_sock;
1478 }
1479 }
1480
1481 /* Finally Add the command IPC sockets */
1482 foreach(lc, g_cluster.ipc_command_socks)
1483 {
1484 int ui_sock = lfirst_int(lc);
1485
1486 if (ui_sock > 0)
1487 {
1488 FD_SET(ui_sock, rmask);
1489 FD_SET(ui_sock, emask);
1490 if (fd_max < ui_sock)
1491 fd_max = ui_sock;
1492 }
1493 }
1494
1495 return fd_max;
1496 }
1497
1498 static int
read_sockets(fd_set * rmask,int pending_fds_count)1499 read_sockets(fd_set *rmask, int pending_fds_count)
1500 {
1501 int i,
1502 count = 0;
1503 List *socks_to_del = NIL;
1504 ListCell *lc;
1505
1506 for (i = 0; i < g_cluster.remoteNodeCount; i++)
1507 {
1508 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1509
1510 if (is_socket_connection_connected(&wdNode->client_socket))
1511 {
1512 if (FD_ISSET(wdNode->client_socket.sock, rmask))
1513 {
1514 ereport(DEBUG2,
1515 (errmsg("client socket of %s is ready for reading", wdNode->nodeName)));
1516
1517 WDPacketData *pkt = read_packet(&wdNode->client_socket);
1518
1519 if (pkt)
1520 {
1521 if (check_debug_request_kill_all_communication() == false &&
1522 check_debug_request_kill_all_receivers() == false)
1523 {
1524 watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1525 /* since a packet is received reset last sent time */
1526 wdNode->last_sent_time.tv_sec = 0;
1527 wdNode->last_sent_time.tv_usec = 0;
1528 }
1529 free_packet(pkt);
1530 }
1531 else
1532 {
1533 ereport(LOG,
1534 (errmsg("client socket of %s is closed", wdNode->nodeName)));
1535 }
1536
1537 count++;
1538 if (count >= pending_fds_count)
1539 return count;
1540 }
1541 }
1542 if (is_socket_connection_connected(&wdNode->server_socket))
1543 {
1544 if (FD_ISSET(wdNode->server_socket.sock, rmask))
1545 {
1546 ereport(DEBUG2,
1547 (errmsg("server socket of %s is ready for reading", wdNode->nodeName)));
1548 WDPacketData *pkt = read_packet(&wdNode->server_socket);
1549
1550 if (pkt)
1551 {
1552 if (check_debug_request_kill_all_communication() == false &&
1553 check_debug_request_kill_all_receivers() == false)
1554 {
1555 watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1556 /* since a packet is received reset last sent time */
1557 wdNode->last_sent_time.tv_sec = 0;
1558 wdNode->last_sent_time.tv_usec = 0;
1559 }
1560 free_packet(pkt);
1561 }
1562 else
1563 {
1564 ereport(LOG,
1565 (errmsg("outbound socket of %s is closed", wdNode->nodeName)));
1566 }
1567
1568 count++;
1569 if (count >= pending_fds_count)
1570 return count;
1571 }
1572 }
1573 }
1574
1575 foreach(lc, g_cluster.unidentified_socks)
1576 {
1577 SocketConnection *conn = lfirst(lc);
1578
1579 if (conn->sock > 0 && FD_ISSET(conn->sock, rmask))
1580 {
1581 WDPacketData *pkt;
1582
1583 ereport(DEBUG2,
1584 (errmsg("un-identified socket %d is ready for reading", conn->sock)));
1585 /* we only entertain ADD NODE messages from unidentified sockets */
1586 pkt = read_packet_of_type(conn, WD_ADD_NODE_MESSAGE);
1587 if (pkt)
1588 {
1589 struct timeval previous_startup_time;
1590 char *authkey = NULL;
1591 WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);
1592
1593 if (tempNode)
1594 {
1595 WatchdogNode *wdNode;
1596 bool found = false;
1597 bool authenticated = false;
1598
1599 if (tempNode->pgpool_node_id == pool_config->pgpool_node_id)
1600 {
1601 ereport(ERROR,
1602 (errmsg("the pgpool node id configured on node \"%s\" cannot be same as local node", tempNode->nodeName),
1603 errdetail("this node id is \"%d\" while local node is \"%d\"",
1604 tempNode->pgpool_node_id,
1605 pool_config->pgpool_node_id)));
1606 }
1607
1608 print_watchdog_node_info(tempNode);
1609 authenticated = verify_authhash_for_node(tempNode, authkey);
1610 ereport(DEBUG1,
1611 (errmsg("ADD NODE MESSAGE from hostname:\"%s\" port:%d pgpool_port:%d", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port)));
1612 /* verify this node */
1613 if (authenticated)
1614 {
1615 WD_STATES oldNodeState = WD_DEAD;
1616 for (i = 0; i < g_cluster.remoteNodeCount; i++)
1617 {
1618 wdNode = &(g_cluster.remoteNodes[i]);
1619
1620 if ((wdNode->wd_port == tempNode->wd_port && wdNode->pgpool_port == tempNode->pgpool_port &&
1621 wdNode->pgpool_node_id == tempNode->pgpool_node_id) &&
1622 ((strcmp(wdNode->hostname, conn->addr) == 0) || (strcmp(wdNode->hostname, tempNode->hostname) == 0)))
1623 {
1624 /* We have found the match */
1625 found = true;
1626 previous_startup_time.tv_sec = wdNode->startup_time.tv_sec;
1627 oldNodeState = wdNode->state;
1628
1629 close_socket_connection(&wdNode->server_socket);
1630 strlcpy(wdNode->delegate_ip, tempNode->delegate_ip, WD_MAX_HOST_NAMELEN);
1631 strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);
1632 strlcpy(wdNode->pgp_version, tempNode->pgp_version, MAX_VERSION_STR_LEN);
1633 wdNode->state = tempNode->state;
1634 wdNode->wd_data_major_version = tempNode->wd_data_major_version;
1635 wdNode->wd_data_minor_version = tempNode->wd_data_minor_version;
1636 wdNode->startup_time.tv_sec = tempNode->startup_time.tv_sec;
1637 wdNode->wd_priority = tempNode->wd_priority;
1638 wdNode->server_socket = *conn;
1639 wdNode->server_socket.sock_state = WD_SOCK_CONNECTED;
1640 if (tempNode->current_state_time.tv_sec)
1641 {
1642 wdNode->current_state_time.tv_sec = tempNode->current_state_time.tv_sec;
1643 wdNode->escalated = tempNode->escalated;
1644 wdNode->standby_nodes_count = tempNode->standby_nodes_count;
1645 wdNode->quorum_status = tempNode->quorum_status;
1646 }
1647 break;
1648 }
1649 }
1650 if (found)
1651 {
1652 restore_cluster_membership_of_node(wdNode);
1653 /* reply with node info message */
1654 ereport(LOG,
1655 (errmsg("new node joined the cluster hostname:\"%s\" port:%d pgpool_port:%d", wdNode->hostname,
1656 wdNode->wd_port,
1657 wdNode->pgpool_port),
1658 errdetail("Pgpool-II version:\"%s\" watchdog messaging version: %d.%d",
1659 wdNode->pgp_version,
1660 wdNode->wd_data_major_version,
1661 wdNode->wd_data_minor_version)));
1662
1663 if (oldNodeState == WD_SHUTDOWN)
1664 {
1665 ereport(LOG,
1666 (errmsg("The newly joined node:\"%s\" had left the cluster because it was shutdown",wdNode->nodeName)));
1667 watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1668
1669 }
1670 else if (oldNodeState == WD_LOST)
1671 {
1672 ereport(LOG,
1673 (errmsg("The newly joined node:\"%s\" had left the cluster because it was lost",wdNode->nodeName),
1674 errdetail("lost reason was \"%s\" and startup time diff = %d",
1675 wd_node_lost_reasons[wdNode->node_lost_reason],
1676 abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)))));
1677
1678 if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 &&
1679 wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
1680 {
1681 ereport(LOG,
1682 (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
1683 errdetail("only lifecheck process can mark this node alive again")));
1684 /* restore the node's lost state */
1685 wdNode->state = oldNodeState;
1686 }
1687 else
1688 watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1689
1690 }
1691
1692 }
1693 else
1694 ereport(NOTICE,
1695 (errmsg("add node from hostname:\"%s\" port:%d pgpool_port:%d rejected.", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port),
1696 errdetail("verify the other watchdog node configurations")));
1697 }
1698 else
1699 {
1700 ereport(NOTICE,
1701 (errmsg("authentication failed for add node from hostname:\"%s\" port:%d pgpool_port:%d", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port),
1702 errdetail("make sure wd_authkey configuration is same on all nodes")));
1703 }
1704
1705 if (found == false || authenticated == false)
1706 {
1707 /*
1708 * reply with reject message, We do not need to go to
1709 * state processor
1710 */
1711 /* For now, create a empty temp node. */
1712 WatchdogNode tmpNode;
1713
1714 tmpNode.client_socket = *conn;
1715 tmpNode.client_socket.sock_state = WD_SOCK_CONNECTED;
1716 tmpNode.server_socket.sock = -1;
1717 tmpNode.server_socket.sock_state = WD_SOCK_UNINITIALIZED;
1718 reply_with_minimal_message(&tmpNode, WD_REJECT_MESSAGE, pkt);
1719 close_socket_connection(conn);
1720 }
1721 pfree(tempNode);
1722 }
1723 else
1724 {
1725 /*
1726 * Probably some invalid data in the add message
1727 */
1728 WatchdogNode tmpNode;
1729
1730 ereport(LOG,
1731 (errmsg("unable to parse the add node message")));
1732 tmpNode.client_socket = *conn;
1733 tmpNode.client_socket.sock_state = WD_SOCK_CONNECTED;
1734 tmpNode.server_socket.sock = -1;
1735 tmpNode.server_socket.sock_state = WD_SOCK_UNINITIALIZED;
1736 reply_with_minimal_message(&tmpNode, WD_REJECT_MESSAGE, pkt);
1737 close_socket_connection(conn);
1738 }
1739 if (authkey)
1740 pfree(authkey);
1741 free_packet(pkt);
1742 count++;
1743 }
1744 socks_to_del = lappend(socks_to_del, conn);
1745 count++;
1746 if (count >= pending_fds_count)
1747 break;
1748 }
1749 }
1750
1751 /* delete all the sockets from unidentified list which are now identified */
1752 foreach(lc, socks_to_del)
1753 {
1754 g_cluster.unidentified_socks = list_delete_ptr(g_cluster.unidentified_socks, lfirst(lc));
1755 }
1756
1757 list_free_deep(socks_to_del);
1758 socks_to_del = NULL;
1759
1760 if (count >= pending_fds_count)
1761 return count;
1762
1763 foreach(lc, g_cluster.ipc_command_socks)
1764 {
1765 int command_sock = lfirst_int(lc);
1766
1767 if (command_sock > 0 && FD_ISSET(command_sock, rmask))
1768 {
1769 bool remove_sock = false;
1770
1771 read_ipc_socket_and_process(command_sock, &remove_sock);
1772 if (remove_sock)
1773 {
1774 /* Also locate the command if it has this socket */
1775 WDCommandData *ipcCommand = get_wd_IPC_command_from_socket(command_sock);
1776
1777 if (ipcCommand)
1778 {
1779 /*
1780 * special case we want to remove the socket from
1781 * ipc_command_sock list manually, so mark the issuing
1782 * socket of ipcCommand to invalid value
1783 */
1784 ipcCommand->sourceIPCSocket = -1;
1785 }
1786 close(command_sock);
1787 socks_to_del = lappend_int(socks_to_del, command_sock);
1788 }
1789 count++;
1790 if (count >= pending_fds_count)
1791 break;
1792 }
1793 }
1794 /* delete all the sockets from unidentified list which are now identified */
1795 foreach(lc, socks_to_del)
1796 {
1797 g_cluster.ipc_command_socks = list_delete_int(g_cluster.ipc_command_socks, lfirst_int(lc));
1798 }
1799
1800 list_free(socks_to_del);
1801 socks_to_del = NULL;
1802
1803 if (count >= pending_fds_count)
1804 return count;
1805
1806 foreach(lc, g_cluster.notify_clients)
1807 {
1808 int notify_sock = lfirst_int(lc);
1809
1810 if (notify_sock > 0 && FD_ISSET(notify_sock, rmask))
1811 {
1812 bool remove_sock = false;
1813
1814 read_ipc_socket_and_process(notify_sock, &remove_sock);
1815 if (remove_sock)
1816 {
1817 close(notify_sock);
1818 socks_to_del = lappend_int(socks_to_del, notify_sock);
1819 }
1820 count++;
1821 if (count >= pending_fds_count)
1822 break;
1823 }
1824 }
1825 /* delete all the sockets from unidentified list which are now identified */
1826 foreach(lc, socks_to_del)
1827 {
1828 g_cluster.notify_clients = list_delete_int(g_cluster.notify_clients, lfirst_int(lc));
1829 }
1830
1831 list_free(socks_to_del);
1832 socks_to_del = NULL;
1833
1834
1835 /* Finally check if something waits us on interface monitoring socket */
1836 if (g_cluster.network_monitor_sock > 0 && FD_ISSET(g_cluster.network_monitor_sock, rmask))
1837 {
1838 bool deleted;
1839 bool link_event;
1840
1841 if (read_interface_change_event(g_cluster.network_monitor_sock, &link_event, &deleted))
1842 {
1843 ereport(DEBUG1,
1844 (errmsg("network event received"),
1845 errdetail("deleted = %s Link change event = %s",
1846 deleted ? "YES" : "NO",
1847 link_event ? "YES" : "NO")));
1848 if (link_event)
1849 {
1850 if (deleted)
1851 watchdog_state_machine(WD_EVENT_NW_LINK_IS_INACTIVE, NULL, NULL, NULL);
1852 else
1853 watchdog_state_machine(WD_EVENT_NW_LINK_IS_ACTIVE, NULL, NULL, NULL);
1854 }
1855 else
1856 {
1857 if (deleted)
1858 watchdog_state_machine(WD_EVENT_NW_IP_IS_REMOVED, NULL, NULL, NULL);
1859 else
1860 watchdog_state_machine(WD_EVENT_NW_IP_IS_ASSIGNED, NULL, NULL, NULL);
1861 }
1862 }
1863 count++;
1864 }
1865 return count;
1866 }
1867
1868 static bool
write_ipc_command_with_result_data(WDCommandData * ipcCommand,char type,char * data,int len)1869 write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char *data, int len)
1870 {
1871 WDPacketData pkt;
1872
1873 pkt.data = data;
1874 pkt.len = len;
1875 pkt.type = type;
1876 pkt.command_id = 0; /* command Id is not used in IPC packets */
1877
1878 if (ipcCommand == NULL || ipcCommand->commandSource != COMMAND_SOURCE_IPC || ipcCommand->sourceIPCSocket <= 0)
1879 {
1880 ereport(DEBUG1,
1881 (errmsg("not replying to IPC, Invalid IPC command.")));
1882 return false;
1883 }
1884 /* DEBUG AID */
1885 if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE &&
1886 (check_debug_request_kill_all_senders() ||
1887 check_debug_request_kill_all_communication()))
1888 return false;
1889
1890 return write_packet_to_socket(ipcCommand->sourceIPCSocket, &pkt, true);
1891 }
1892
create_command_object(int packet_data_length)1893 static WDCommandData * create_command_object(int packet_data_length)
1894 {
1895 MemoryContext mCxt,
1896 oldCxt;
1897 WDCommandData *wdCommand;
1898
1899 /* wd command lives in its own memory context */
1900 mCxt = AllocSetContextCreate(TopMemoryContext,
1901 "WDCommand",
1902 ALLOCSET_SMALL_MINSIZE,
1903 ALLOCSET_SMALL_INITSIZE,
1904 ALLOCSET_SMALL_MAXSIZE);
1905 oldCxt = MemoryContextSwitchTo(mCxt);
1906
1907 wdCommand = palloc0(sizeof(WDCommandData));
1908 wdCommand->memoryContext = mCxt;
1909 if (packet_data_length > 0)
1910 wdCommand->sourcePacket.data = palloc(packet_data_length);
1911 wdCommand->commandPacket.type = WD_NO_MESSAGE;
1912 wdCommand->sourcePacket.type = WD_NO_MESSAGE;
1913 MemoryContextSwitchTo(oldCxt);
1914 return wdCommand;
1915 }
1916
1917 static bool
read_ipc_socket_and_process(int sock,bool * remove_socket)1918 read_ipc_socket_and_process(int sock, bool *remove_socket)
1919 {
1920 char type;
1921 int data_len,
1922 ret;
1923 WDCommandData *ipcCommand;
1924 IPC_CMD_PROCESS_RES res;
1925
1926 *remove_socket = true;
1927
1928 /* 1st byte is command type */
1929 ret = socket_read(sock, &type, sizeof(char), 0);
1930 if (ret == 0) /* remote end has closed the connection */
1931 return false;
1932
1933 if (ret != sizeof(char))
1934 {
1935 ereport(WARNING,
1936 (errmsg("error reading from IPC socket"),
1937 errdetail("read from socket failed with error \"%m\"")));
1938 return false;
1939 }
1940
1941 /* We should have data length */
1942 ret = socket_read(sock, &data_len, sizeof(int), 0);
1943 if (ret != sizeof(int))
1944 {
1945 ereport(WARNING,
1946 (errmsg("error reading from IPC socket"),
1947 errdetail("read from socket failed with error \"%m\"")));
1948 return false;
1949 }
1950
1951 data_len = ntohl(data_len);
1952 /* see if we have enough information to process this command */
1953 ipcCommand = create_command_object(data_len);
1954 ipcCommand->sourceIPCSocket = sock;
1955 ipcCommand->commandSource = COMMAND_SOURCE_IPC;
1956 ipcCommand->sourceWdNode = g_cluster.localNode;
1957 ipcCommand->sourcePacket.type = type;
1958 ipcCommand->sourcePacket.len = data_len;
1959 gettimeofday(&ipcCommand->commandTime, NULL);
1960
1961 if (data_len > 0)
1962 {
1963 if (socket_read(sock, ipcCommand->sourcePacket.data, data_len, 0) <= 0)
1964 {
1965 ereport(LOG,
1966 (errmsg("error reading IPC from socket"),
1967 errdetail("read from socket failed with error \"%m\"")));
1968 return false;
1969 }
1970 }
1971
1972 res = process_IPC_command(ipcCommand);
1973 if (res == IPC_CMD_PROCESSING)
1974 {
1975 /*
1976 * The command still needs further processing store it in the list
1977 */
1978 MemoryContext oldCxt;
1979
1980 *remove_socket = false;
1981 oldCxt = MemoryContextSwitchTo(TopMemoryContext);
1982 g_cluster.ipc_commands = lappend(g_cluster.ipc_commands, ipcCommand);
1983 MemoryContextSwitchTo(oldCxt);
1984 return true;
1985 }
1986 else if (res != IPC_CMD_COMPLETE)
1987 {
1988 char res_type;
1989 char *data = NULL;
1990 int data_len = 0;
1991
1992 switch (res)
1993 {
1994 case IPC_CMD_TRY_AGAIN:
1995 res_type = WD_IPC_CMD_CLUSTER_IN_TRAN;
1996 break;
1997 case IPC_CMD_ERROR:
1998 ereport(NOTICE,
1999 (errmsg("IPC command returned error")));
2000 res_type = WD_IPC_CMD_RESULT_BAD;
2001 break;
2002 case IPC_CMD_OK:
2003 res_type = WD_IPC_CMD_RESULT_OK;
2004 break;
2005 default:
2006 res_type = WD_IPC_CMD_RESULT_BAD;
2007 ereport(NOTICE,
2008 (errmsg("unexpected IPC processing result")));
2009 break;
2010 }
2011 if (ipcCommand->errorMessage)
2012 {
2013 data = get_wd_simple_message_json(ipcCommand->errorMessage);
2014 data_len = strlen(data) + 1;
2015 }
2016
2017 if (write_ipc_command_with_result_data(ipcCommand, res_type, data, data_len))
2018 {
2019 ereport(NOTICE,
2020 (errmsg("error writing to IPC socket")));
2021 }
2022 if (data)
2023 pfree(data);
2024 }
2025
2026 /*
2027 * Delete the Command structure, it is as simple as to delete the memory
2028 * context
2029 */
2030 MemoryContextDelete(ipcCommand->memoryContext);
2031 return (res != IPC_CMD_ERROR);
2032 }
2033
process_IPC_command(WDCommandData * ipcCommand)2034 static IPC_CMD_PROCESS_RES process_IPC_command(WDCommandData * ipcCommand)
2035 {
2036 /* authenticate the client first */
2037 if (check_and_report_IPC_authentication(ipcCommand) == false)
2038 {
2039 /* authentication error is already reported to the caller */
2040 return IPC_CMD_ERROR;
2041 }
2042
2043 switch (ipcCommand->sourcePacket.type)
2044 {
2045
2046 case WD_NODE_STATUS_CHANGE_COMMAND:
2047 return process_IPC_nodeStatusChange_command(ipcCommand);
2048 break;
2049
2050 case WD_REGISTER_FOR_NOTIFICATION:
2051 /* Add this socket to the notify socket list */
2052 g_cluster.notify_clients = lappend_int(g_cluster.notify_clients, ipcCommand->sourceIPCSocket);
2053 /* The command is completed successfully */
2054 return IPC_CMD_COMPLETE;
2055 break;
2056
2057 case WD_GET_NODES_LIST_COMMAND:
2058 return process_IPC_nodeList_command(ipcCommand);
2059 break;
2060
2061 case WD_IPC_FAILOVER_COMMAND:
2062 return process_IPC_failover_command(ipcCommand);
2063
2064 case WD_IPC_ONLINE_RECOVERY_COMMAND:
2065 return process_IPC_online_recovery(ipcCommand);
2066 break;
2067
2068 case WD_FAILOVER_INDICATION:
2069 return process_IPC_failover_indication(ipcCommand);
2070 break;
2071
2072 case WD_GET_LEADER_DATA_REQUEST:
2073 return process_IPC_data_request_from_leader(ipcCommand);
2074 break;
2075
2076 case WD_GET_RUNTIME_VARIABLE_VALUE:
2077 return process_IPC_get_runtime_variable_value_request(ipcCommand);
2078 break;
2079
2080 case WD_EXECUTE_CLUSTER_COMMAND:
2081 return process_IPC_execute_cluster_command(ipcCommand);
2082 break;
2083
2084 default:
2085 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext, "unknown IPC command type");
2086 break;
2087 }
2088 return IPC_CMD_ERROR;
2089 }
2090
2091 static IPC_CMD_PROCESS_RES
process_IPC_execute_cluster_command(WDCommandData * ipcCommand)2092 process_IPC_execute_cluster_command(WDCommandData * ipcCommand)
2093 {
2094 /* get the json for node list */
2095 char *clusterCommand = NULL;
2096 List *args_list = NULL;
2097
2098 if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2099 return IPC_CMD_ERROR;
2100
2101 if (!parse_wd_exec_cluster_command_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2102 &clusterCommand, &args_list))
2103 {
2104 goto ERROR_EXIT;
2105 }
2106 if (strcasecmp(WD_COMMAND_SHUTDOWN_CLUSTER, clusterCommand) == 0)
2107 {
2108 ereport(LOG,
2109 (errmsg("Watchdog has received shutdown cluster command from IPC channel")));
2110 }
2111 else if (strcasecmp(WD_COMMAND_RELOAD_CONFIG_CLUSTER, clusterCommand) == 0)
2112 {
2113 ereport(LOG,
2114 (errmsg("Watchdog has received reload config cluster command from IPC channel")));
2115 }
2116 else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
2117 {
2118 ereport(LOG,
2119 (errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel")));
2120 if (get_local_node_state() != WD_COORDINATOR)
2121 {
2122 ereport(LOG,
2123 (errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node")));
2124 goto ERROR_EXIT;
2125 }
2126 }
2127 else
2128 {
2129 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2130 "unknown cluster command requested");
2131 goto ERROR_EXIT;
2132 }
2133
2134 /*
2135 * Just broadcast the execute command request to destination node
2136 * Processing the command on the local node is the responsibility of caller
2137 * process
2138 */
2139 reply_with_message(NULL, WD_EXECUTE_COMMAND_REQUEST,
2140 ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2141 NULL);
2142
2143 if (args_list)
2144 list_free_deep(args_list);
2145
2146 pfree(clusterCommand);
2147 return IPC_CMD_OK;
2148
2149 ERROR_EXIT:
2150 if (args_list)
2151 list_free_deep(args_list);
2152 if (clusterCommand)
2153 pfree(clusterCommand);
2154 return IPC_CMD_ERROR;
2155 }
2156
process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand)2157 static IPC_CMD_PROCESS_RES process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand)
2158 {
2159 /* get the json for node list */
2160 JsonNode *jNode = NULL;
2161 char *requestVarName = NULL;
2162
2163 if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2164 return IPC_CMD_ERROR;
2165
2166 json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
2167
2168 /* The root node must be object */
2169 if (root == NULL || root->type != json_object)
2170 {
2171 json_value_free(root);
2172 ereport(NOTICE,
2173 (errmsg("failed to process get local variable IPC command"),
2174 errdetail("unable to parse JSON data")));
2175 return IPC_CMD_ERROR;
2176 }
2177
2178 requestVarName = json_get_string_value_for_key(root, WD_JSON_KEY_VARIABLE_NAME);
2179
2180 if (requestVarName == NULL)
2181 {
2182 json_value_free(root);
2183 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2184 "requested variable name is null");
2185 return IPC_CMD_ERROR;
2186 }
2187
2188 jNode = jw_create_with_object(true);
2189
2190 if (strcasecmp(WD_RUNTIME_VAR_WD_STATE, requestVarName) == 0)
2191 {
2192 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_INT);
2193 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, g_cluster.localNode->state);
2194 }
2195 else if (strcasecmp(WD_RUNTIME_VAR_QUORUM_STATE, requestVarName) == 0)
2196 {
2197 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_INT);
2198 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
2199 }
2200 else if (strcasecmp(WD_RUNTIME_VAR_ESCALATION_STATE, requestVarName) == 0)
2201 {
2202 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_BOOL);
2203 jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, g_cluster.localNode->escalated);
2204 }
2205 else
2206 {
2207 json_value_free(root);
2208 jw_destroy(jNode);
2209 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2210 "unknown variable requested");
2211 return IPC_CMD_ERROR;
2212 }
2213
2214 jw_finish_document(jNode);
2215 json_value_free(root);
2216 write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2217 jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2218 jw_destroy(jNode);
2219 return IPC_CMD_COMPLETE;
2220 }
2221
process_IPC_nodeList_command(WDCommandData * ipcCommand)2222 static IPC_CMD_PROCESS_RES process_IPC_nodeList_command(WDCommandData * ipcCommand)
2223 {
2224 /* get the json for node list */
2225 JsonNode *jNode = NULL;
2226 int NodeID = -1;
2227
2228 if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2229 return IPC_CMD_ERROR;
2230
2231 json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
2232
2233 /* The root node must be object */
2234 if (root == NULL || root->type != json_object)
2235 {
2236 json_value_free(root);
2237 ereport(NOTICE,
2238 (errmsg("failed to process GET NODE LIST IPC command"),
2239 errdetail("unable to parse json data")));
2240 return IPC_CMD_ERROR;
2241 }
2242
2243 if (json_get_int_value_for_key(root, "NodeID", &NodeID))
2244 {
2245 json_value_free(root);
2246 return IPC_CMD_ERROR;
2247 }
2248
2249 json_value_free(root);
2250 jNode = get_node_list_json(NodeID);
2251 write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2252 jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2253 jw_destroy(jNode);
2254 return IPC_CMD_COMPLETE;
2255 }
2256
process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand)2257 static IPC_CMD_PROCESS_RES process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand)
2258 {
2259 int nodeStatus;
2260 int nodeID;
2261 char *message = NULL;
2262 bool ret;
2263
2264 if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2265 return IPC_CMD_ERROR;
2266
2267 ret = parse_node_status_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len, &nodeID, &nodeStatus, &message);
2268
2269 if (ret == false)
2270 {
2271 ereport(NOTICE,
2272 (errmsg("failed to process NODE STATE CHANGE IPC command"),
2273 errdetail("unable to parse JSON data")));
2274 return IPC_CMD_ERROR;
2275 }
2276
2277 if (message)
2278 {
2279 ereport(LOG,
2280 (errmsg("received node status change ipc message"),
2281 errdetail("%s", message)));
2282 pfree(message);
2283 }
2284 if (fire_node_status_event(nodeID, nodeStatus) == false)
2285 return IPC_CMD_ERROR;
2286
2287 return IPC_CMD_COMPLETE;
2288 }
2289
2290 static bool
fire_node_status_event(int nodeID,int nodeStatus)2291 fire_node_status_event(int nodeID, int nodeStatus)
2292 {
2293 WatchdogNode *wdNode = NULL;
2294
2295 if (g_cluster.localNode->pgpool_node_id == nodeID)
2296 {
2297 wdNode = g_cluster.localNode;
2298 }
2299 else
2300 {
2301 int i;
2302
2303 for (i = 0; i < g_cluster.remoteNodeCount; i++)
2304 {
2305 if (nodeID == g_cluster.remoteNodes[i].pgpool_node_id)
2306 {
2307 wdNode = &g_cluster.remoteNodes[i];
2308 break;
2309 }
2310 }
2311 }
2312 if (wdNode == NULL)
2313 {
2314 ereport(LOG,
2315 (errmsg("failed to process node status change event"),
2316 errdetail("invalid Node ID in the event")));
2317 return false;
2318 }
2319
2320 if (nodeStatus == WD_LIFECHECK_NODE_STATUS_DEAD)
2321 {
2322 ereport(DEBUG1,
2323 (errmsg("processing node status changed to DEAD event for node ID:%d", nodeID)));
2324
2325 if (wdNode == g_cluster.localNode)
2326 watchdog_state_machine(WD_EVENT_LOCAL_NODE_LOST, wdNode, NULL, NULL);
2327 else
2328 {
2329 wdNode->node_lost_reason = NODE_LOST_BY_LIFECHECK;
2330 watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
2331 }
2332 }
2333 else if (nodeStatus == WD_LIFECHECK_NODE_STATUS_ALIVE)
2334 {
2335 ereport(DEBUG1,
2336 (errmsg("processing node status changed to ALIVE event for node ID:%d", nodeID)));
2337
2338 if (wdNode == g_cluster.localNode)
2339 watchdog_state_machine(WD_EVENT_LOCAL_NODE_FOUND, wdNode, NULL, NULL);
2340 else
2341 watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
2342 }
2343 else
2344 ereport(LOG,
2345 (errmsg("failed to process node status change event"),
2346 errdetail("invalid event type")));
2347 return true;
2348 }
2349
2350 /*
2351 * Free the failover object
2352 */
2353 static void
remove_failover_object(WDFailoverObject * failoverObj)2354 remove_failover_object(WDFailoverObject * failoverObj)
2355 {
2356 ereport(DEBUG1,
2357 (errmsg("removing failover request from %d nodes with ID:%d", failoverObj->request_count, failoverObj->failoverID)));
2358 g_cluster.wdCurrentFailovers = list_delete_ptr(g_cluster.wdCurrentFailovers, failoverObj);
2359 list_free(failoverObj->requestingNodes);
2360 pfree(failoverObj->nodeList);
2361 pfree(failoverObj);
2362 }
2363
2364
2365 /* if the wdNode is NULL. The function removes all failover objects */
2366 static void
clear_all_failovers(void)2367 clear_all_failovers(void)
2368 {
2369 ListCell *lc;
2370 List *failovers_to_del = list_copy(g_cluster.wdCurrentFailovers);
2371
2372 ereport(DEBUG1,
2373 (errmsg("Removing all failover objects")));
2374
2375 foreach(lc, failovers_to_del)
2376 {
2377 WDFailoverObject *failoverObj = lfirst(lc);
2378
2379 remove_failover_object(failoverObj);
2380 }
2381 list_free(failovers_to_del);
2382 }
2383
2384 /* Remove the over stayed failover objects */
2385 static void
service_expired_failovers(void)2386 service_expired_failovers(void)
2387 {
2388 ListCell *lc;
2389 List *failovers_to_del = NULL;
2390 bool need_to_resign = false;
2391 struct timeval currTime;
2392
2393 if (get_local_node_state() != WD_COORDINATOR)
2394 return;
2395
2396 gettimeofday(&currTime, NULL);
2397
2398 foreach(lc, g_cluster.wdCurrentFailovers)
2399 {
2400 WDFailoverObject *failoverObj = lfirst(lc);
2401
2402 if (failoverObj)
2403 {
2404 if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= FAILOVER_COMMAND_FINISH_TIMEOUT)
2405 {
2406 failovers_to_del = lappend(failovers_to_del, failoverObj);
2407 ereport(DEBUG1,
2408 (errmsg("failover request from %d nodes with ID:%d is expired", failoverObj->request_count, failoverObj->failoverID),
2409 errdetail("marking the failover object for removal")));
2410 if (!need_to_resign && failoverObj->reqKind == NODE_DOWN_REQUEST)
2411 {
2412 ListCell *lc;
2413 /* search the in the requesting node list if we are also the ones
2414 * who think the failover must have been done
2415 */
2416 foreach(lc, failoverObj->requestingNodes)
2417 {
2418 WatchdogNode *reqWdNode = lfirst(lc);
2419 if (g_cluster.localNode == reqWdNode)
2420 {
2421 /* verify if that node requested by us is now quarantined */
2422 int i;
2423 for (i = 0; i < failoverObj->nodesCount; i++)
2424 {
2425 int node_id = failoverObj->nodeList[i];
2426 if (node_id != -1)
2427 {
2428 if (Req_info->primary_node_id == -1 &&
2429 BACKEND_INFO(node_id).quarantine == true &&
2430 BACKEND_INFO(node_id).role == ROLE_PRIMARY)
2431 {
2432 ereport(LOG,
2433 (errmsg("We are not able to build consensus for our primary node failover request, got %d votes only for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
2434 errdetail("resigning from the coordinator")));
2435 need_to_resign = true;
2436 }
2437 }
2438 }
2439 }
2440 }
2441 }
2442 }
2443 }
2444 }
2445
2446 /* delete the failover objects */
2447 foreach(lc, failovers_to_del)
2448 {
2449 WDFailoverObject *failoverObj = lfirst(lc);
2450
2451 remove_failover_object(failoverObj);
2452 }
2453 list_free(failovers_to_del);
2454 if (need_to_resign)
2455 {
2456 /* lower my wd_priority for moment */
2457 g_cluster.localNode->wd_priority = -1;
2458 send_cluster_service_message(NULL, NULL, CLUSTER_IAM_RESIGNING_FROM_LEADER);
2459 set_state(WD_JOINING);
2460 }
2461 }
2462
/*
 * Linear search: report whether 'value' occurs among the first 'count'
 * elements of 'intArray'.  A count of zero or less never touches the array
 * and yields false.
 */
static bool
does_int_array_contains_value(int *intArray, int count, int value)
{
	int		   *cursor = intArray;
	int			remaining;

	for (remaining = count; remaining > 0; remaining--, cursor++)
	{
		if (*cursor == value)
			return true;
	}
	return false;
}
2475
get_failover_object(POOL_REQUEST_KIND reqKind,int nodesCount,int * nodeList)2476 static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList)
2477 {
2478 ListCell *lc;
2479
2480 foreach(lc, g_cluster.wdCurrentFailovers)
2481 {
2482 WDFailoverObject *failoverObj = lfirst(lc);
2483
2484 if (failoverObj)
2485 {
2486 if (failoverObj->reqKind == reqKind && failoverObj->nodesCount == nodesCount)
2487 {
2488 bool equal = true;
2489 int i;
2490
2491 for (i = 0; i < nodesCount; i++)
2492 {
2493 if (does_int_array_contains_value(nodeList, nodesCount, failoverObj->nodeList[i]) == false)
2494 {
2495 equal = false;
2496 break;
2497 }
2498 }
2499 if (equal)
2500 return failoverObj;
2501 }
2502 }
2503 }
2504 return NULL;
2505 }
2506
2507 static void
process_remote_failover_command_on_coordinator(WatchdogNode * wdNode,WDPacketData * pkt)2508 process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt)
2509 {
2510 if (get_local_node_state() != WD_COORDINATOR)
2511 {
2512 /* only lock holder can resign itself */
2513 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
2514 }
2515 else
2516 {
2517 IPC_CMD_PROCESS_RES res;
2518 WDCommandData *ipcCommand = create_command_object(pkt->len);
2519
2520 ipcCommand->sourcePacket.type = pkt->type;
2521 ipcCommand->sourcePacket.len = pkt->len;
2522 ipcCommand->sourcePacket.command_id = pkt->command_id;
2523
2524 if (pkt->len > 0)
2525 memcpy(ipcCommand->sourcePacket.data, pkt->data, pkt->len);
2526
2527 ipcCommand->commandSource = COMMAND_SOURCE_REMOTE;
2528 ipcCommand->sourceWdNode = wdNode;
2529 gettimeofday(&ipcCommand->commandTime, NULL);
2530
2531 ereport(LOG,
2532 (errmsg("watchdog received the failover command from remote pgpool-II node \"%s\"", wdNode->nodeName)));
2533
2534 res = process_failover_command_on_coordinator(ipcCommand);
2535 if (res == IPC_CMD_PROCESSING)
2536 {
2537 MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2538
2539 g_cluster.ipc_commands = lappend(g_cluster.ipc_commands, ipcCommand);
2540 MemoryContextSwitchTo(oldCxt);
2541 ereport(LOG,
2542 (errmsg("failover command from remote pgpool-II node \"%s\" is still processing", wdNode->nodeName),
2543 errdetail("waiting for results...")));
2544 }
2545 else
2546 {
2547 cleanUpIPCCommand(ipcCommand);
2548 }
2549 }
2550 }
2551
2552 static bool
reply_to_failover_command(WDCommandData * ipcCommand,WDFailoverCMDResults cmdResult,unsigned int failoverID)2553 reply_to_failover_command(WDCommandData * ipcCommand, WDFailoverCMDResults cmdResult, unsigned int failoverID)
2554 {
2555 bool ret = false;
2556 JsonNode *jNode = jw_create_with_object(true);
2557
2558 jw_put_int(jNode, WD_FAILOVER_RESULT_KEY, cmdResult);
2559 jw_put_int(jNode, WD_FAILOVER_ID_KEY, failoverID);
2560 /* create the packet */
2561 jw_end_element(jNode);
2562 jw_finish_document(jNode);
2563
2564 ereport(DEBUG2,
2565 (errmsg("replying to failover command with failover ID: %d", failoverID),
2566 errdetail("%.*s", jw_get_json_length(jNode), jw_get_json_string(jNode))));
2567
2568 if (ipcCommand->commandSource == COMMAND_SOURCE_IPC)
2569 {
2570 ret = write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2571 jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2572 }
2573 else if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2574 {
2575 reply_with_message(ipcCommand->sourceWdNode, WD_CMD_REPLY_IN_DATA,
2576 jw_get_json_string(jNode), jw_get_json_length(jNode) + 1,
2577 &ipcCommand->sourcePacket);
2578 }
2579 jw_destroy(jNode);
2580 return ret;
2581 }
2582
2583 /*
2584 * This function process the failover command and decides
2585 * about the execution of failover command.
2586 */
2587
compute_failover_consensus(POOL_REQUEST_KIND reqKind,int * node_id_list,int node_count,unsigned char * flags,WatchdogNode * wdNode)2588 static WDFailoverCMDResults compute_failover_consensus(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, unsigned char *flags, WatchdogNode * wdNode)
2589 {
2590 #ifndef NODE_UP_REQUIRE_CONSENSUS
2591 if (reqKind == NODE_UP_REQUEST)
2592 return FAILOVER_RES_PROCEED;
2593 #endif
2594 #ifndef NODE_DOWN_REQUIRE_CONSENSUS
2595 if (reqKind == NODE_DOWN_REQUEST)
2596 return FAILOVER_RES_PROCEED;
2597 #endif
2598 #ifndef NODE_PROMOTE_REQUIRE_CONSENSUS
2599 if (reqKind == PROMOTE_NODE_REQUEST)
2600 return FAILOVER_RES_PROCEED;
2601 #endif
2602
2603 if (pool_config->failover_when_quorum_exists == false)
2604 {
2605 /* No need for any calculation, We do not need a quorum for failover */
2606 ereport(LOG, (
2607 errmsg("we do not need quorum to hold to proceed with failover"),
2608 errdetail("proceeding with the failover"),
2609 errhint("failover_when_quorum_exists is set to false")));
2610
2611 return FAILOVER_RES_PROCEED;
2612 }
2613 if (*flags & REQ_DETAIL_CONFIRMED)
2614 {
2615 /* Check the request flags, If it asks to bypass the quorum status */
2616 ereport(LOG, (
2617 errmsg("The failover request does not need quorum to hold"),
2618 errdetail("proceeding with the failover"),
2619 errhint("REQ_DETAIL_CONFIRMED")));
2620 return FAILOVER_RES_PROCEED;
2621 }
2622 update_quorum_status();
2623 if (g_cluster.quorum_status < 0)
2624 {
2625 /* quorum is must and it is not present at the moment */
2626 ereport(LOG, (
2627 errmsg("failover requires the quorum to hold, which is not present at the moment"),
2628 errdetail("Rejecting the failover request")));
2629 return FAILOVER_RES_NO_QUORUM;
2630 }
2631
2632 /*
2633 * So we reached here means quorum is present Now come to difficult part of
2634 * ensuring the consensus
2635 */
2636 if (pool_config->failover_require_consensus == true)
2637 {
2638 /* Record the failover. */
2639 bool duplicate = false;
2640 WDFailoverObject *failoverObj = add_failover(reqKind, node_id_list, node_count, wdNode, *flags, &duplicate);
2641
2642 if (failoverObj->request_count < get_minimum_votes_to_resolve_consensus())
2643 {
2644 ereport(LOG, (
2645 errmsg("failover requires the majority vote, waiting for consensus"),
2646 errdetail("failover request noted")));
2647 if (duplicate && !pool_config->allow_multiple_failover_requests_from_node)
2648 return FAILOVER_RES_CONSENSUS_MAY_FAIL;
2649 else
2650 return FAILOVER_RES_BUILDING_CONSENSUS;
2651 }
2652 else
2653 {
2654 /* We have received enough votes for this failover */
2655 ereport(LOG, (
2656 errmsg("we have got the consensus to perform the failover"),
2657 errdetail("%d node(s) voted in the favor", failoverObj->request_count)));
2658 /* restore the flag value to the one from the first call */
2659 *flags = failoverObj->reqFlags;
2660 /* remove this object, It is no longer needed */
2661 remove_failover_object(failoverObj);
2662 return FAILOVER_RES_PROCEED;
2663 }
2664 }
2665 else
2666 {
2667 ereport(LOG, (
2668 errmsg("we do not require majority votes to proceed with failover"),
2669 errdetail("proceeding with the failover"),
2670 errhint("failover_require_consensus is set to false")));
2671 }
2672 return FAILOVER_RES_PROCEED;
2673 }
2674
add_failover(POOL_REQUEST_KIND reqKind,int * node_id_list,int node_count,WatchdogNode * wdNode,unsigned char flags,bool * duplicate)2675 static WDFailoverObject * add_failover(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, WatchdogNode * wdNode,
2676 unsigned char flags, bool *duplicate)
2677 {
2678 MemoryContext oldCxt;
2679
2680 /* Find the failover */
2681 WDFailoverObject *failoverObj = get_failover_object(reqKind, node_count, node_id_list);
2682
2683 *duplicate = false;
2684 if (failoverObj)
2685 {
2686 ListCell *lc;
2687
2688 /* search the node if it is a duplicate request */
2689 foreach(lc, failoverObj->requestingNodes)
2690 {
2691 WatchdogNode *reqWdNode = lfirst(lc);
2692
2693 if (wdNode == reqWdNode)
2694 {
2695 *duplicate = true;
2696 /* The failover request is duplicate */
2697 if (pool_config->allow_multiple_failover_requests_from_node)
2698 {
2699 failoverObj->request_count++;
2700 ereport(LOG, (
2701 errmsg("duplicate failover request from \"%s\" node", wdNode->nodeName),
2702 errdetail("Pgpool-II can send multiple failover requests for same node"),
2703 errhint("allow_multiple_failover_requests_from_node is enabled")));
2704 }
2705 else
2706 {
2707 ereport(LOG, (
2708 errmsg("Duplicate failover request from \"%s\" node", wdNode->nodeName),
2709 errdetail("request ignored")));
2710 }
2711 return failoverObj;
2712 }
2713 }
2714 }
2715 else
2716 {
2717 oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2718 failoverObj = palloc0(sizeof(WDFailoverObject));
2719 failoverObj->reqKind = reqKind;
2720 failoverObj->requestingNodes = NULL;
2721 failoverObj->nodesCount = node_count;
2722 failoverObj->reqFlags = flags;
2723 failoverObj->request_count = 0;
2724 if (node_count > 0)
2725 {
2726 failoverObj->nodeList = palloc(sizeof(int) * node_count);
2727 memcpy(failoverObj->nodeList, node_id_list, sizeof(int) * node_count);
2728 }
2729 failoverObj->failoverID = get_next_commandID();
2730 gettimeofday(&failoverObj->startTime, NULL);
2731 g_cluster.wdCurrentFailovers = lappend(g_cluster.wdCurrentFailovers, failoverObj);
2732 MemoryContextSwitchTo(oldCxt);
2733 }
2734
2735 failoverObj->request_count++;
2736 oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2737 failoverObj->requestingNodes = lappend(failoverObj->requestingNodes, wdNode);
2738 MemoryContextSwitchTo(oldCxt);
2739 return failoverObj;
2740 }
2741
2742 /*
2743 * The function processes all failover commands on leader node
2744 */
process_failover_command_on_coordinator(WDCommandData * ipcCommand)2745 static IPC_CMD_PROCESS_RES process_failover_command_on_coordinator(WDCommandData * ipcCommand)
2746 {
2747 char *func_name;
2748 int node_count = 0;
2749 int *node_id_list = NULL;
2750 bool ret = false;
2751 unsigned char flags;
2752 POOL_REQUEST_KIND reqKind;
2753 WDFailoverCMDResults res;
2754
2755 if (get_local_node_state() != WD_COORDINATOR)
2756 return IPC_CMD_ERROR; /* should never happen */
2757
2758 ret = parse_wd_node_function_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2759 &func_name, &node_id_list, &node_count, &flags);
2760 if (ret == false)
2761 {
2762 ereport(LOG, (
2763 errmsg("failed to process failover command"),
2764 errdetail("unable to parse the command data")));
2765 reply_to_failover_command(ipcCommand, FAILOVER_RES_INVALID_FUNCTION, 0);
2766 return IPC_CMD_COMPLETE;
2767 }
2768
2769 if (strcasecmp(WD_FUNCTION_FAILBACK_REQUEST, func_name) == 0)
2770 reqKind = NODE_UP_REQUEST;
2771 else if (strcasecmp(WD_FUNCTION_DEGENERATE_REQUEST, func_name) == 0)
2772 reqKind = NODE_DOWN_REQUEST;
2773 else if (strcasecmp(WD_FUNCTION_PROMOTE_REQUEST, func_name) == 0)
2774 reqKind = PROMOTE_NODE_REQUEST;
2775 else
2776 {
2777 reply_to_failover_command(ipcCommand, FAILOVER_RES_INVALID_FUNCTION, 0);
2778 return IPC_CMD_COMPLETE;
2779 }
2780
2781 ereport(LOG,
2782 (errmsg("watchdog is processing the failover command [%s] received from %s",
2783 func_name,
2784 ipcCommand->commandSource == COMMAND_SOURCE_IPC ?
2785 "local pgpool-II on IPC interface" : ipcCommand->sourceWdNode->nodeName)));
2786
2787 res = compute_failover_consensus(reqKind, node_id_list, node_count, &flags, ipcCommand->sourceWdNode);
2788
2789 if (res == FAILOVER_RES_PROCEED)
2790 {
2791 /*
2792 * We are allowed to proceed with the failover, now if the command was
2793 * originated by the remote node, Kick the failover function on the
2794 * Pgpool-II main process and inform the remote caller to wait for
2795 * sync
2796 */
2797 if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2798 {
2799 /*
2800 * Set the flag indicating the failover request is originated by
2801 * watchdog
2802 */
2803 flags |= REQ_DETAIL_WATCHDOG;
2804
2805 if (reqKind == NODE_DOWN_REQUEST)
2806 ret = degenerate_backend_set(node_id_list, node_count, flags);
2807 else if (reqKind == NODE_UP_REQUEST)
2808 ret = send_failback_request(node_id_list[0], false, flags);
2809 else if (reqKind == PROMOTE_NODE_REQUEST)
2810 ret = promote_backend(node_id_list[0], flags);
2811
2812 if (ret == true)
2813 reply_to_failover_command(ipcCommand, FAILOVER_RES_WILL_BE_DONE, 0);
2814 else
2815 reply_to_failover_command(ipcCommand, FAILOVER_RES_ERROR, 0);
2816 }
2817 else
2818 {
2819 /*
2820 * It was the request from the local node, Just reply the caller
2821 * to get on with the failover
2822 */
2823 reply_to_failover_command(ipcCommand, FAILOVER_RES_PROCEED, 0);
2824 }
2825 return IPC_CMD_COMPLETE;
2826 }
2827 else if (res == FAILOVER_RES_NO_QUORUM)
2828 {
2829 ereport(LOG,
2830 (errmsg("failover command [%s] request from pgpool-II node \"%s\" is rejected because the watchdog cluster does not hold the quorum",
2831 func_name,
2832 ipcCommand->sourceWdNode->nodeName)));
2833 }
2834 else if (res == FAILOVER_RES_BUILDING_CONSENSUS)
2835 {
2836 ereport(LOG,
2837 (errmsg("failover command [%s] request from pgpool-II node \"%s\" is queued, waiting for the confirmation from other nodes",
2838 func_name,
2839 ipcCommand->sourceWdNode->nodeName)));
2840
2841 /*
2842 * Ask all the nodes to re-send the failover request for the
2843 * quarantined nodes.
2844 */
2845 send_message_of_type(NULL, WD_FAILOVER_WAITING_FOR_CONSENSUS, NULL);
2846
2847 /*
2848 * Also if the command was originated by remote node, check local
2849 * quarantine space as-well
2850 */
2851 if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2852 register_inform_quarantine_nodes_req();
2853 }
2854
2855 reply_to_failover_command(ipcCommand, res, 0);
2856 return IPC_CMD_COMPLETE;
2857 }
2858
process_IPC_failover_command(WDCommandData * ipcCommand)2859 static IPC_CMD_PROCESS_RES process_IPC_failover_command(WDCommandData * ipcCommand)
2860 {
2861 if (is_local_node_true_leader())
2862 {
2863 ereport(LOG,
2864 (errmsg("watchdog received the failover command from local pgpool-II on IPC interface")));
2865 return process_failover_command_on_coordinator(ipcCommand);
2866 }
2867 else if (get_local_node_state() == WD_STANDBY)
2868 {
2869 /* I am a standby node, Just forward the request to coordinator */
2870
2871 wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2872 set_next_commandID_in_message(&ipcCommand->commandPacket);
2873
2874 ipcCommand->sendToNode = WD_LEADER_NODE; /* send the command to
2875 * leader node */
2876 if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2877 {
2878 ereport(LOG,
2879 (errmsg("unable to process the failover command request received on IPC interface"),
2880 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2881 return IPC_CMD_ERROR;
2882 }
2883 else
2884 {
2885 /*
2886 * we need to wait for the result
2887 */
2888 ereport(LOG,
2889 (errmsg("failover request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2890 WD_LEADER_NODE->nodeName),
2891 errdetail("waiting for the reply...")));
2892 return IPC_CMD_PROCESSING;
2893 }
2894 }
2895 else
2896 {
2897 /* we are not in stable state at the moment */
2898 ereport(LOG,
2899 (errmsg("unable to process the failover request received on IPC interface"),
2900 errdetail("this watchdog node has not joined the cluster yet"),
2901 errhint("try again in few seconds")));
2902 }
2903 return IPC_CMD_ERROR;
2904 }
2905
process_IPC_online_recovery(WDCommandData * ipcCommand)2906 static IPC_CMD_PROCESS_RES process_IPC_online_recovery(WDCommandData * ipcCommand)
2907 {
2908 if (get_local_node_state() == WD_STANDBY ||
2909 get_local_node_state() == WD_COORDINATOR)
2910 {
2911 /* save the hassel if I am the only alive node */
2912 if (get_cluster_node_count() == 0)
2913 return IPC_CMD_OK;
2914
2915 wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2916 set_next_commandID_in_message(&ipcCommand->commandPacket);
2917
2918 ipcCommand->sendToNode = NULL; /* command needs to be sent to all
2919 * nodes */
2920 if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2921 {
2922 ereport(LOG,
2923 (errmsg("unable to process the online recovery request received on IPC interface"),
2924 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2925 return IPC_CMD_ERROR;
2926 }
2927 ereport(LOG,
2928 (errmsg("online recovery request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2929 WD_LEADER_NODE->nodeName),
2930 errdetail("waiting for the reply...")));
2931
2932 return IPC_CMD_PROCESSING;
2933 }
2934 /* we are not in any stable state at the moment */
2935
2936 ereport(LOG,
2937 (errmsg("unable to process the online recovery request received on IPC interface"),
2938 errdetail("this watchdog node has not joined the cluster yet"),
2939 errhint("try again in few seconds")));
2940
2941 return IPC_CMD_TRY_AGAIN;
2942 }
2943
process_IPC_data_request_from_leader(WDCommandData * ipcCommand)2944 static IPC_CMD_PROCESS_RES process_IPC_data_request_from_leader(WDCommandData * ipcCommand)
2945 {
2946 /*
2947 * if cluster or myself is not in stable state just return cluster in
2948 * transaction
2949 */
2950 ereport(LOG,
2951 (errmsg("received the get data request from local pgpool-II on IPC interface")));
2952
2953 if (get_local_node_state() == WD_STANDBY)
2954 {
2955 /*
2956 * set the command id in the IPC packet before forwarding it on the
2957 * watchdog socket
2958 */
2959 wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2960 set_next_commandID_in_message(&ipcCommand->commandPacket);
2961
2962 ipcCommand->sendToNode = WD_LEADER_NODE;
2963 if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2964 {
2965 ereport(LOG,
2966 (errmsg("unable to process the get data request received on IPC interface"),
2967 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2968 return IPC_CMD_ERROR;
2969 }
2970 else
2971 {
2972 /*
2973 * we need to wait for the result
2974 */
2975 ereport(LOG,
2976 (errmsg("get data request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2977 WD_LEADER_NODE->nodeName),
2978 errdetail("waiting for the reply...")));
2979
2980 return IPC_CMD_PROCESSING;
2981 }
2982 }
2983 else if (is_local_node_true_leader())
2984 {
2985 /*
2986 * This node is itself a leader node, So send the empty result with OK
2987 * tag
2988 */
2989 return IPC_CMD_OK;
2990 }
2991
2992 /* we are not in any stable state at the moment */
2993 ereport(LOG,
2994 (errmsg("unable to process the get data request received on IPC interface"),
2995 errdetail("this watchdog node has not joined the cluster yet"),
2996 errhint("try again in few seconds")));
2997
2998 return IPC_CMD_TRY_AGAIN;
2999 }
3000
process_IPC_failover_indication(WDCommandData * ipcCommand)3001 static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCommand)
3002 {
3003 WDFailoverCMDResults res = FAILOVER_RES_NOT_ALLOWED;
3004
3005 /*
3006 * if cluster or myself is not in stable state just return cluster in
3007 * transaction
3008 */
3009 ereport(LOG,
3010 (errmsg("received the failover indication from Pgpool-II on IPC interface")));
3011
3012 if (get_local_node_state() == WD_COORDINATOR)
3013 {
3014 int failoverState = -1;
3015
3016 if (ipcCommand->sourcePacket.data == NULL || ipcCommand->sourcePacket.len <= 0)
3017 {
3018 ereport(LOG,
3019 (errmsg("watchdog unable to process failover indication"),
3020 errdetail("invalid command packet")));
3021 res = FAILOVER_RES_INVALID_FUNCTION;
3022 }
3023 else
3024 {
3025 json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
3026
3027 if (root && root->type == json_object)
3028 {
3029 if (json_get_int_value_for_key(root, "FailoverFuncState", &failoverState))
3030 {
3031 ereport(LOG,
3032 (errmsg("unable to process failover indication"),
3033 errdetail("failed to get failover state from json data in command packet")));
3034 res = FAILOVER_RES_INVALID_FUNCTION;
3035 }
3036 }
3037 else
3038 {
3039 ereport(LOG,
3040 (errmsg("unable to process failover indication"),
3041 errdetail("invalid JSON data in command packet")));
3042 res = FAILOVER_RES_INVALID_FUNCTION;
3043 }
3044 if (root)
3045 json_value_free(root);
3046 }
3047
3048 if (failoverState < 0)
3049 {
3050 ereport(LOG,
3051 (errmsg("unable to process failover indication"),
3052 errdetail("invalid JSON data in command packet")));
3053 res = FAILOVER_RES_INVALID_FUNCTION;
3054 }
3055 else if (failoverState == 0) /* start */
3056 {
3057 res = failover_start_indication(ipcCommand);
3058 }
3059 else /* end */
3060 {
3061 res = failover_end_indication(ipcCommand);
3062 }
3063 }
3064 else
3065 {
3066 ereport(LOG,
3067 (errmsg("received the failover indication from Pgpool-II on IPC interface, but only leader can do failover")));
3068 }
3069 reply_to_failover_command(ipcCommand, res, 0);
3070
3071 return IPC_CMD_COMPLETE;
3072 }
3073
3074
3075 /* Failover start basically does nothing fancy, It just sets the failover_in_progress
3076 * flag and inform all nodes that the failover is in progress.
3077 *
3078 * only the local node that is a leader can start the failover.
3079 */
3080 static WDFailoverCMDResults
failover_start_indication(WDCommandData * ipcCommand)3081 failover_start_indication(WDCommandData * ipcCommand)
3082 {
3083 ereport(LOG,
3084 (errmsg("watchdog is informed of failover start by the main process")));
3085
3086 /* only coordinator(leader) node is allowed to process failover */
3087 if (get_local_node_state() == WD_COORDINATOR)
3088 {
3089 /* inform to all nodes about failover start */
3090 send_message_of_type(NULL, WD_FAILOVER_START, NULL);
3091 return FAILOVER_RES_PROCEED;
3092 }
3093 else if (get_local_node_state() == WD_STANDBY)
3094 {
3095 /* The node might be performing the local quarantine operation */
3096 ereport(DEBUG1,
3097 (errmsg("main process is starting the local quarantine operation")));
3098 return FAILOVER_RES_PROCEED;
3099 }
3100 else
3101 {
3102 ereport(LOG,
3103 (errmsg("failed to process failover start request, I am not in stable state")));
3104 }
3105 return FAILOVER_RES_TRANSITION;
3106 }
3107
3108 static WDFailoverCMDResults
failover_end_indication(WDCommandData * ipcCommand)3109 failover_end_indication(WDCommandData * ipcCommand)
3110 {
3111 ereport(LOG,
3112 (errmsg("watchdog is informed of failover end by the main process")));
3113
3114 /* only coordinator(leader) node is allowed to process failover */
3115 if (get_local_node_state() == WD_COORDINATOR)
3116 {
3117 send_message_of_type(NULL, WD_FAILOVER_END, NULL);
3118 return FAILOVER_RES_PROCEED;
3119 }
3120 else if (get_local_node_state() == WD_STANDBY)
3121 {
3122 /* The node might be performing the local quarantine operation */
3123 ereport(DEBUG1,
3124 (errmsg("main process is ending the local quarantine operation")));
3125 return FAILOVER_RES_PROCEED;
3126 }
3127 else
3128 {
3129 ereport(LOG,
3130 (errmsg("failed to process failover start request, I am not in stable state")));
3131 }
3132 return FAILOVER_RES_TRANSITION;
3133 }
3134
parse_node_info_message(WDPacketData * pkt,char ** authkey)3135 static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey)
3136 {
3137 if (pkt == NULL || (pkt->type != WD_ADD_NODE_MESSAGE && pkt->type != WD_INFO_MESSAGE))
3138 return NULL;
3139 if (pkt->data == NULL || pkt->len <= 0)
3140 return NULL;
3141 return get_watchdog_node_from_json(pkt->data, pkt->len, authkey);
3142 }
3143
read_packet(SocketConnection * conn)3144 static WDPacketData * read_packet(SocketConnection * conn)
3145 {
3146 return read_packet_of_type(conn, WD_NO_MESSAGE);
3147 }
3148
read_packet_of_type(SocketConnection * conn,char ensure_type)3149 static WDPacketData * read_packet_of_type(SocketConnection * conn, char ensure_type)
3150 {
3151 char type;
3152 int len;
3153 unsigned int cmd_id;
3154 char *buf;
3155 WDPacketData *pkt = NULL;
3156 int ret;
3157
3158 if (is_socket_connection_connected(conn) == false)
3159 {
3160 ereport(LOG,
3161 (errmsg("error reading from socket connection,socket is not connected")));
3162 return NULL;
3163 }
3164
3165 ret = socket_read(conn->sock, &type, sizeof(char), 1);
3166 if (ret != sizeof(char))
3167 {
3168 close_socket_connection(conn);
3169 return NULL;
3170 }
3171
3172 ereport(DEBUG1,
3173 (errmsg("received watchdog packet type:%c", type)));
3174
3175 if (ensure_type != WD_NO_MESSAGE && ensure_type != type)
3176 {
3177 /* The packet type is not what we want. */
3178 ereport(DEBUG1,
3179 (errmsg("invalid packet type. expecting %c while received %c", ensure_type, type)));
3180 close_socket_connection(conn);
3181 return NULL;
3182 }
3183
3184 ret = socket_read(conn->sock, &cmd_id, sizeof(int), 1);
3185 if (ret != sizeof(int))
3186 {
3187 close_socket_connection(conn);
3188 return NULL;
3189 }
3190 cmd_id = ntohl(cmd_id);
3191
3192 ereport(DEBUG2,
3193 (errmsg("received packet with command id %d from watchdog node ", cmd_id)));
3194
3195 ret = socket_read(conn->sock, &len, sizeof(int), 1);
3196 if (ret != sizeof(int))
3197 {
3198 close_socket_connection(conn);
3199 return NULL;
3200 }
3201
3202 len = ntohl(len);
3203
3204 ereport(DEBUG1,
3205 (errmsg("reading packet type %c of length %d", type, len)));
3206
3207 pkt = get_empty_packet();
3208 set_message_type(pkt, type);
3209 set_message_commandID(pkt, cmd_id);
3210
3211 buf = palloc(len);
3212
3213 ret = socket_read(conn->sock, buf, len, 1);
3214 if (ret != len)
3215 {
3216 close_socket_connection(conn);
3217 free_packet(pkt);
3218 pfree(buf);
3219 return NULL;
3220 }
3221 set_message_data(pkt, buf, len);
3222 return pkt;
3223 }
3224
3225
3226
3227 static void
wd_child_exit(int exit_signo)3228 wd_child_exit(int exit_signo)
3229 {
3230 sigset_t mask;
3231
3232 sigemptyset(&mask);
3233 sigaddset(&mask, SIGTERM);
3234 sigaddset(&mask, SIGINT);
3235 sigaddset(&mask, SIGQUIT);
3236 sigprocmask(SIG_BLOCK, &mask, NULL);
3237 exit(0);
3238 }
3239
3240 static void
wd_child_signal_handler(void)3241 wd_child_signal_handler(void)
3242 {
3243 pid_t pid;
3244 int status;
3245
3246 ereport(DEBUG1,
3247 (errmsg("watchdog process signal handler")));
3248
3249 /* clear SIGCHLD request */
3250 sigchld_request = 0;
3251
3252 while ((pid = pool_waitpid(&status)) > 0)
3253 {
3254 char *exiting_process_name;
3255
3256 if (g_cluster.de_escalation_pid == pid)
3257 {
3258 exiting_process_name = "de-escalation";
3259 g_cluster.de_escalation_pid = 0;
3260 }
3261 else if (g_cluster.escalation_pid == pid)
3262 {
3263 exiting_process_name = "escalation";
3264 g_cluster.escalation_pid = 0;
3265 }
3266 else
3267 exiting_process_name = "unknown";
3268
3269 if (WIFEXITED(status))
3270 {
3271 if (WEXITSTATUS(status) == POOL_EXIT_FATAL)
3272 ereport(LOG,
3273 (errmsg("watchdog %s process with pid: %d exit with FATAL ERROR.", exiting_process_name, pid)));
3274 else if (WEXITSTATUS(status) == POOL_EXIT_NO_RESTART)
3275 ereport(LOG,
3276 (errmsg("watchdog %s process with pid: %d exit with SUCCESS.", exiting_process_name, pid)));
3277 }
3278 else if (WIFSIGNALED(status))
3279 {
3280 /* Child terminated by segmentation fault. Report it */
3281 if (WTERMSIG(status) == SIGSEGV)
3282 ereport(WARNING,
3283 (errmsg("watchdog %s process with pid: %d was terminated by segmentation fault", exiting_process_name, pid)));
3284 else
3285 ereport(LOG,
3286 (errmsg("watchdog %s process with pid: %d exits with status %d by signal %d", exiting_process_name, pid, status, WTERMSIG(status))));
3287 }
3288 else
3289 ereport(LOG,
3290 (errmsg("watchdog %s process with pid: %d exits with status %d", exiting_process_name, pid, status)));
3291 }
3292 }
3293
3294 /* Function invoked when watchdog process is about to exit */
3295 static void
wd_system_will_go_down(int code,Datum arg)3296 wd_system_will_go_down(int code, Datum arg)
3297 {
3298 int i;
3299
3300 ereport(LOG,
3301 (errmsg("Watchdog is shutting down")));
3302
3303 send_cluster_command(NULL, WD_INFORM_I_AM_GOING_DOWN, 0);
3304
3305 if (get_local_node_state() == WD_COORDINATOR)
3306 resign_from_escalated_node();
3307 /* close server socket */
3308 close_socket_connection(&g_cluster.localNode->server_socket);
3309 /* close all node sockets */
3310 for (i = 0; i < g_cluster.remoteNodeCount; i++)
3311 {
3312 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3313
3314 close_socket_connection(&wdNode->client_socket);
3315 close_socket_connection(&wdNode->server_socket);
3316 }
3317 /* close network monitoring socket */
3318 if (g_cluster.network_monitor_sock > 0)
3319 close(g_cluster.network_monitor_sock);
3320 /* wait for sub-processes to exit */
3321 if (g_cluster.de_escalation_pid > 0 || g_cluster.escalation_pid > 0)
3322 {
3323 pid_t wpid;
3324
3325 do
3326 {
3327 wpid = wait(NULL);
3328 } while (wpid > 0 || (wpid == -1 && errno == EINTR));
3329 }
3330 }
3331
3332 static void
close_socket_connection(SocketConnection * conn)3333 close_socket_connection(SocketConnection * conn)
3334 {
3335 if ((conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
3336 || conn->sock_state == WD_SOCK_WAITING_FOR_CONNECT)
3337 {
3338 close(conn->sock);
3339 conn->sock = -1;
3340 conn->sock_state = WD_SOCK_CLOSED;
3341 }
3342 }
3343
3344 static bool
is_socket_connection_connected(SocketConnection * conn)3345 is_socket_connection_connected(SocketConnection * conn)
3346 {
3347 return (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED);
3348 }
3349
3350
3351 static bool
is_node_reachable(WatchdogNode * wdNode)3352 is_node_reachable(WatchdogNode * wdNode)
3353 {
3354 if (is_socket_connection_connected(&wdNode->client_socket))
3355 return true;
3356 if (is_socket_connection_connected(&wdNode->server_socket))
3357 return true;
3358 return false;
3359 }
3360
3361 static bool
is_node_active(WatchdogNode * wdNode)3362 is_node_active(WatchdogNode * wdNode)
3363 {
3364 if (wdNode->state == WD_DEAD || wdNode->state == WD_LOST || wdNode->state == WD_SHUTDOWN)
3365 return false;
3366 return true;
3367 }
3368
3369 static bool
is_node_active_and_reachable(WatchdogNode * wdNode)3370 is_node_active_and_reachable(WatchdogNode * wdNode)
3371 {
3372 if (is_node_active(wdNode))
3373 return is_node_reachable(wdNode);
3374 return false;
3375 }
3376
3377 static int
accept_incoming_connections(fd_set * rmask,int pending_fds_count)3378 accept_incoming_connections(fd_set *rmask, int pending_fds_count)
3379 {
3380 int processed_fds = 0;
3381 int fd;
3382
3383 if (FD_ISSET(g_cluster.localNode->server_socket.sock, rmask))
3384 {
3385 struct sockaddr_in addr;
3386 socklen_t addrlen = sizeof(struct sockaddr_in);
3387
3388 processed_fds++;
3389 fd = accept(g_cluster.localNode->server_socket.sock, (struct sockaddr *) &addr, &addrlen);
3390 if (fd < 0)
3391 {
3392 if (errno == EINTR || errno == 0 || errno == EAGAIN || errno == EWOULDBLOCK)
3393 {
3394 /* nothing to accept now */
3395 ereport(DEBUG2,
3396 (errmsg("Failed to accept incoming watchdog connection, Nothing to accept")));
3397 }
3398 /* accept failed */
3399 ereport(DEBUG1,
3400 (errmsg("Failed to accept incoming watchdog connection")));
3401 }
3402 else
3403 {
3404 MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
3405 SocketConnection *conn = palloc(sizeof(SocketConnection));
3406
3407 conn->sock = fd;
3408 conn->sock_state = WD_SOCK_CONNECTED;
3409 gettimeofday(&conn->tv, NULL);
3410 strncpy(conn->addr, inet_ntoa(addr.sin_addr), sizeof(conn->addr) - 1);
3411 ereport(LOG,
3412 (errmsg("new watchdog node connection is received from \"%s:%d\"", inet_ntoa(addr.sin_addr), addr.sin_port)));
3413 g_cluster.unidentified_socks = lappend(g_cluster.unidentified_socks, conn);
3414 MemoryContextSwitchTo(oldCxt);
3415 }
3416 }
3417
3418 if (processed_fds >= pending_fds_count)
3419 return processed_fds;
3420
3421 if (FD_ISSET(g_cluster.command_server_sock, rmask))
3422 {
3423 struct sockaddr addr;
3424 socklen_t addrlen = sizeof(struct sockaddr);
3425
3426 processed_fds++;
3427
3428 int fd = accept(g_cluster.command_server_sock, &addr, &addrlen);
3429
3430 if (fd < 0)
3431 {
3432 if (errno == EINTR || errno == 0 || errno == EAGAIN || errno == EWOULDBLOCK)
3433 {
3434 /* nothing to accept now */
3435 ereport(WARNING,
3436 (errmsg("failed to accept incoming watchdog IPC connection, Nothing to accept")));
3437 }
3438 /* accept failed */
3439 ereport(WARNING,
3440 (errmsg("failed to accept incoming watchdog IPC connection")));
3441 }
3442 else
3443 {
3444 MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
3445
3446 ereport(LOG,
3447 (errmsg("new IPC connection received")));
3448 g_cluster.ipc_command_socks = lappend_int(g_cluster.ipc_command_socks, fd);
3449 MemoryContextSwitchTo(oldCxt);
3450 }
3451 }
3452
3453 return processed_fds;
3454 }
3455
3456 static int
update_successful_outgoing_cons(fd_set * wmask,int pending_fds_count)3457 update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count)
3458 {
3459 int i;
3460 int count = 0;
3461
3462 for (i = 0; i < g_cluster.remoteNodeCount; i++)
3463 {
3464 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3465
3466 if (wdNode->client_socket.sock > 0 && wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
3467 {
3468 if (FD_ISSET(wdNode->client_socket.sock, wmask))
3469 {
3470 socklen_t lon;
3471 int valopt;
3472
3473 lon = sizeof(int);
3474
3475 gettimeofday(&wdNode->client_socket.tv, NULL);
3476
3477 if (getsockopt(wdNode->client_socket.sock, SOL_SOCKET, SO_ERROR, (void *) (&valopt), &lon) == 0)
3478 {
3479 if (valopt)
3480 {
3481 ereport(DEBUG1,
3482 (errmsg("error in outbound connection to %s:%d", wdNode->hostname, wdNode->wd_port),
3483 errdetail("%s", strerror(valopt))));
3484 close_socket_connection(&wdNode->client_socket);
3485 wdNode->client_socket.sock_state = WD_SOCK_ERROR;
3486 }
3487 else
3488 {
3489 wdNode->client_socket.sock_state = WD_SOCK_CONNECTED;
3490 ereport(LOG,
3491 (errmsg("new outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port)));
3492 /* set socket to blocking again */
3493 socket_unset_nonblock(wdNode->client_socket.sock);
3494 watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
3495 }
3496 }
3497 else
3498 {
3499 ereport(DEBUG1,
3500 (errmsg("error in outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port),
3501 errdetail("getsockopt failed with error \"%m\"")));
3502 close_socket_connection(&wdNode->client_socket);
3503 wdNode->client_socket.sock_state = WD_SOCK_ERROR;
3504
3505 }
3506 count++;
3507 if (count >= pending_fds_count)
3508 break;
3509 }
3510 }
3511 }
3512 return count;
3513 }
3514
3515 static bool
write_packet_to_socket(int sock,WDPacketData * pkt,bool ipcPacket)3516 write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket)
3517 {
3518 int ret = 0;
3519 int command_id,
3520 len;
3521
3522 ereport(DEBUG1,
3523 (errmsg("sending watchdog packet to socket:%d, type:[%c], command ID:%d, data Length:%d", sock, pkt->type,
3524 pkt->command_id, pkt->len)));
3525
3526 print_packet_info(pkt, true);
3527
3528 /* TYPE */
3529 if (write(sock, &pkt->type, 1) < 1)
3530 {
3531 ereport(LOG,
3532 (errmsg("failed to write watchdog packet to socket"),
3533 errdetail("%m")));
3534 return false;
3535 }
3536 if (ipcPacket == false)
3537 {
3538 /* IPC packets does not have command ID field */
3539 command_id = htonl(pkt->command_id);
3540 if (write(sock, &command_id, 4) < 4)
3541 {
3542 ereport(LOG,
3543 (errmsg("failed to write watchdog packet to socket"),
3544 errdetail("%m")));
3545 return false;
3546 }
3547 }
3548 /* data length */
3549 len = htonl(pkt->len);
3550 if (write(sock, &len, 4) < 4)
3551 {
3552 ereport(LOG,
3553 (errmsg("failed to write watchdog packet to socket"),
3554 errdetail("%m")));
3555 return false;
3556 }
3557 /* DATA */
3558 if (pkt->len > 0 && pkt->data)
3559 {
3560 int bytes_send = 0;
3561
3562 do
3563 {
3564 ret = write(sock, pkt->data + bytes_send, (pkt->len - bytes_send));
3565 if (ret <= 0)
3566 {
3567 ereport(LOG,
3568 (errmsg("failed to write watchdog packet to socket"),
3569 errdetail("%m")));
3570 return false;
3571 }
3572 bytes_send += ret;
3573 } while (bytes_send < pkt->len);
3574 }
3575 return true;
3576 }
3577
3578 static void
wd_packet_shallow_copy(WDPacketData * srcPkt,WDPacketData * dstPkt)3579 wd_packet_shallow_copy(WDPacketData * srcPkt, WDPacketData * dstPkt)
3580 {
3581 dstPkt->command_id = srcPkt->command_id;
3582 dstPkt->data = srcPkt->data;
3583 dstPkt->len = srcPkt->len;
3584 dstPkt->type = srcPkt->type;
3585 }
3586
3587 static void
init_wd_packet(WDPacketData * pkt)3588 init_wd_packet(WDPacketData * pkt)
3589 {
3590 pkt->len = 0;
3591 pkt->data = NULL;
3592 }
3593
get_empty_packet(void)3594 static WDPacketData * get_empty_packet(void)
3595 {
3596 WDPacketData *pkt = palloc0(sizeof(WDPacketData));
3597
3598 return pkt;
3599 }
3600
3601 static void
free_packet(WDPacketData * pkt)3602 free_packet(WDPacketData * pkt)
3603 {
3604 if (pkt)
3605 {
3606 if (pkt->data)
3607 pfree(pkt->data);
3608 pfree(pkt);
3609 }
3610 }
3611
3612 static void
set_message_type(WDPacketData * pkt,char type)3613 set_message_type(WDPacketData * pkt, char type)
3614 {
3615 pkt->type = type;
3616 }
3617
3618 static void
set_message_commandID(WDPacketData * pkt,unsigned int commandID)3619 set_message_commandID(WDPacketData * pkt, unsigned int commandID)
3620 {
3621 pkt->command_id = commandID;
3622 }
3623
3624 static void
set_next_commandID_in_message(WDPacketData * pkt)3625 set_next_commandID_in_message(WDPacketData * pkt)
3626 {
3627 set_message_commandID(pkt, get_next_commandID());
3628 }
3629
3630 static void
set_message_data(WDPacketData * pkt,const char * data,int len)3631 set_message_data(WDPacketData * pkt, const char *data, int len)
3632 {
3633 pkt->data = (char *) data;
3634 pkt->len = len;
3635 }
3636
3637 #define nodeIfNull_str(m,v) node&&strlen(node->m)?node->m:v
3638 #define nodeIfNull_int(m,v) node?node->m:v
3639 #define NotSet "Not_Set"
3640
3641 static bool
add_nodeinfo_to_json(JsonNode * jNode,WatchdogNode * node)3642 add_nodeinfo_to_json(JsonNode * jNode, WatchdogNode * node)
3643 {
3644 jw_start_object(jNode, "WatchdogNode");
3645
3646 jw_put_int(jNode, "ID", nodeIfNull_int(pgpool_node_id, -1));
3647 jw_put_int(jNode, "State", nodeIfNull_int(state, -1));
3648 jw_put_int(jNode, "Membership", nodeIfNull_int(membership_status, -1));
3649 jw_put_string(jNode, "MembershipString", node ? wd_cluster_membership_status[node->membership_status] : NotSet);
3650 jw_put_string(jNode, "NodeName", nodeIfNull_str(nodeName, NotSet));
3651 jw_put_string(jNode, "HostName", nodeIfNull_str(hostname, NotSet));
3652 jw_put_string(jNode, "StateName", node ? wd_state_names[node->state] : NotSet);
3653 jw_put_string(jNode, "DelegateIP", nodeIfNull_str(delegate_ip, NotSet));
3654 jw_put_int(jNode, "WdPort", nodeIfNull_int(wd_port, 0));
3655 jw_put_int(jNode, "PgpoolPort", nodeIfNull_int(pgpool_port, 0));
3656 jw_put_int(jNode, "Priority", nodeIfNull_int(wd_priority, 0));
3657
3658 jw_end_element(jNode);
3659
3660 return true;
3661 }
3662
get_node_list_json(int id)3663 static JsonNode * get_node_list_json(int id)
3664 {
3665 int i;
3666 JsonNode *jNode = jw_create_with_object(true);
3667
3668 jw_put_int(jNode, "RemoteNodeCount", g_cluster.remoteNodeCount);
3669 jw_put_int(jNode, "MemberRemoteNodeCount", g_cluster.memberRemoteNodeCount);
3670 jw_put_int(jNode, "NodesRequireForQuorum", get_minimum_votes_to_resolve_consensus());
3671 jw_put_int(jNode, "QuorumStatus", WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
3672 jw_put_int(jNode, "AliveNodeCount", WD_LEADER_NODE ? WD_LEADER_NODE->standby_nodes_count : 0);
3673 jw_put_int(jNode, "Escalated", g_cluster.localNode->escalated);
3674 jw_put_string(jNode, "LeaderNodeName", WD_LEADER_NODE ? WD_LEADER_NODE->nodeName : "Not Set");
3675 jw_put_string(jNode, "LeaderHostName", WD_LEADER_NODE ? WD_LEADER_NODE->hostname : "Not Set");
3676 if (id < 0)
3677 {
3678 jw_put_int(jNode, "NodeCount", g_cluster.remoteNodeCount + 1);
3679
3680 /* add the array */
3681 jw_start_array(jNode, "WatchdogNodes");
3682 /* add the local node info */
3683 add_nodeinfo_to_json(jNode, g_cluster.localNode);
3684 /* add all remote nodes */
3685 for (i = 0; i < g_cluster.remoteNodeCount; i++)
3686 {
3687 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3688
3689 add_nodeinfo_to_json(jNode, wdNode);
3690 }
3691 }
3692 else
3693 {
3694 jw_put_int(jNode, "NodeCount", 1);
3695 /* add the array */
3696 jw_start_array(jNode, "WatchdogNodes");
3697
3698 if (id == g_cluster.localNode->pgpool_node_id)
3699 {
3700 /* add the local node info */
3701 add_nodeinfo_to_json(jNode, g_cluster.localNode);
3702 }
3703 else
3704 {
3705 /* find from remote nodes */
3706 WatchdogNode *wdNodeToAdd = NULL;
3707
3708 for (i = 0; i < g_cluster.remoteNodeCount; i++)
3709 {
3710 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3711
3712 if (wdNode->pgpool_node_id == id)
3713 {
3714 wdNodeToAdd = wdNode;
3715 break;
3716 }
3717 }
3718 add_nodeinfo_to_json(jNode, wdNodeToAdd);
3719 }
3720 }
3721 jw_finish_document(jNode);
3722 return jNode;
3723 }
3724
/*
 * Create a packet of the given type whose payload is the beacon JSON of the
 * local node.  With a replyFor packet its command id is reused; otherwise
 * the next command id is assigned.
 */
static WDPacketData *
get_beacon_message(char type, WDPacketData * replyFor)
{
	WDPacketData *beacon = get_empty_packet();
	char	   *payload = get_beacon_message_json(g_cluster.localNode);

	set_message_type(beacon, type);

	if (replyFor != NULL)
		set_message_commandID(beacon, replyFor->command_id);
	else
		set_next_commandID_in_message(beacon);

	set_message_data(beacon, payload, strlen(payload));
	return beacon;
}
3742
get_addnode_message(void)3743 static WDPacketData * get_addnode_message(void)
3744 {
3745 char authhash[WD_AUTH_HASH_LEN + 1];
3746 WDPacketData *message = get_empty_packet();
3747 bool include_hash = get_authhash_for_node(g_cluster.localNode, authhash);
3748 char *json_data = get_watchdog_node_info_json(g_cluster.localNode, include_hash ? authhash : NULL);
3749
3750 set_message_type(message, WD_ADD_NODE_MESSAGE);
3751 set_next_commandID_in_message(message);
3752 set_message_data(message, json_data, strlen(json_data));
3753 return message;
3754 }
3755
/*
 * Create a WD_INFO_MESSAGE packet carrying the local node information as
 * JSON.  With a replyFor packet its command id is reused; otherwise the
 * next command id is assigned.  The auth hash is passed to the JSON builder
 * only when get_authhash_for_node() reports true.
 */
static WDPacketData *
get_mynode_info_message(WDPacketData * replyFor)
{
	char		authhash[WD_AUTH_HASH_LEN + 1];
	char	   *hash_arg = NULL;
	char	   *payload;
	WDPacketData *info_msg = get_empty_packet();

	if (get_authhash_for_node(g_cluster.localNode, authhash))
		hash_arg = authhash;

	payload = get_watchdog_node_info_json(g_cluster.localNode, hash_arg);

	set_message_type(info_msg, WD_INFO_MESSAGE);

	if (replyFor != NULL)
		set_message_commandID(info_msg, replyFor->command_id);
	else
		set_next_commandID_in_message(info_msg);

	set_message_data(info_msg, payload, strlen(payload));
	return info_msg;
}
3772
/*
 * Create a packet that carries only a type and a command id, no data.
 * With a replyFor packet its command id is reused; otherwise the next
 * command id is assigned.
 */
static WDPacketData *
get_minimum_message(char type, WDPacketData * replyFor)
{
	/* TODO it is a waste of space */
	WDPacketData *bare_msg = get_empty_packet();

	set_message_type(bare_msg, type);

	if (replyFor != NULL)
		set_message_commandID(bare_msg, replyFor->command_id);
	else
		set_next_commandID_in_message(bare_msg);

	return bare_msg;
}
3785
get_wd_IPC_command_from_reply(WDPacketData * pkt)3786 static WDCommandData * get_wd_IPC_command_from_reply(WDPacketData * pkt)
3787 {
3788 return get_wd_command_from_reply(g_cluster.ipc_commands, pkt);
3789 }
get_wd_cluster_command_from_reply(WDPacketData * pkt)3790 static WDCommandData * get_wd_cluster_command_from_reply(WDPacketData * pkt)
3791 {
3792 return get_wd_command_from_reply(g_cluster.clusterCommands, pkt);
3793 }
3794
get_wd_command_from_reply(List * commands,WDPacketData * pkt)3795 static WDCommandData * get_wd_command_from_reply(List *commands, WDPacketData * pkt)
3796 {
3797 ListCell *lc;
3798
3799 if (commands == NULL)
3800 return NULL;
3801
3802 foreach(lc, commands)
3803 {
3804 WDCommandData *ipcCommand = lfirst(lc);
3805
3806 if (ipcCommand)
3807 {
3808 if (ipcCommand->commandPacket.command_id == pkt->command_id)
3809 {
3810 ereport(DEBUG1,
3811 (errmsg("packet %c with command ID %d is reply to the command %c", pkt->type, pkt->command_id,
3812 ipcCommand->commandPacket.type)));
3813 return ipcCommand;
3814 }
3815 }
3816 }
3817 return NULL;
3818 }
3819
get_wd_IPC_command_from_socket(int sock)3820 static WDCommandData * get_wd_IPC_command_from_socket(int sock)
3821 {
3822 ListCell *lc;
3823
3824 foreach(lc, g_cluster.ipc_commands)
3825 {
3826 WDCommandData *ipcCommand = lfirst(lc);
3827
3828 if (ipcCommand)
3829 {
3830 if (ipcCommand->commandSource != COMMAND_SOURCE_IPC)
3831 continue;
3832
3833 if (ipcCommand->sourceIPCSocket == sock)
3834 return ipcCommand;
3835 }
3836 }
3837 return NULL;
3838 }
3839
3840
3841 static void
cleanUpIPCCommand(WDCommandData * ipcCommand)3842 cleanUpIPCCommand(WDCommandData * ipcCommand)
3843 {
3844 /*
3845 * close the socket associated with ipcCommand and remove it from
3846 * ipcSocket list
3847 */
3848 if (ipcCommand->commandSource == COMMAND_SOURCE_IPC &&
3849 ipcCommand->sourceIPCSocket > 0)
3850 {
3851 close(ipcCommand->sourceIPCSocket);
3852 g_cluster.ipc_command_socks = list_delete_int(g_cluster.ipc_command_socks, ipcCommand->sourceIPCSocket);
3853 ipcCommand->sourceIPCSocket = -1;
3854 }
3855 /* Now remove the ipcCommand instance from the command list */
3856 g_cluster.ipc_commands = list_delete_ptr(g_cluster.ipc_commands, ipcCommand);
3857
3858 /*
3859 * Finally the memory part As everything of IPCCommand live inside its own
3860 * memory context. Delete the MemoryContext and we are good
3861 */
3862 MemoryContextDelete(ipcCommand->memoryContext);
3863 }
3864
/*
 * Build the reply packet for a data request received from "wdNode".
 *
 * The payload of "pkt" is handed to parse_data_request_json() to obtain the
 * request type.  When the type is WD_DATE_REQ_PG_BACKEND_DATA the reply is a
 * WD_DATA_MESSAGE carrying the backend node status JSON of the local node
 * and the command id of the request.  In every other case (empty packet,
 * payload that cannot be parsed, unknown request type, no data produced)
 * the reply is a minimal WD_ERROR_MESSAGE built from the request.
 *
 * Returns the reply packet.  This function neither sends nor frees it.
 */
static WDPacketData *
process_data_request(WatchdogNode *wdNode, WDPacketData *pkt)
{
    char       *request_type = NULL;
    char       *data = NULL;
    WDPacketData *replyPkt;

    if (pkt->data == NULL || pkt->len <= 0)
    {
        ereport(WARNING,
                (errmsg("invalid data request packet from watchdog node \"%s\"", wdNode->nodeName),
                 errdetail("no data found in the packet")));

        return get_minimum_message(WD_ERROR_MESSAGE, pkt);
    }

    if (!parse_data_request_json(pkt->data, pkt->len, &request_type))
    {
        /*
         * The packet did carry data here, so report the parse failure
         * rather than repeating the "no data" detail used above.
         */
        ereport(WARNING,
                (errmsg("invalid data request packet from watchdog node \"%s\"", wdNode->nodeName),
                 errdetail("failed to parse the data request JSON")));

        return get_minimum_message(WD_ERROR_MESSAGE, pkt);
    }

    /*
     * Guard against a parser that reports success without handing back a
     * request type; passing NULL to strcasecmp() is undefined behavior.
     */
    if (request_type != NULL &&
        strcasecmp(request_type, WD_DATE_REQ_PG_BACKEND_DATA) == 0)
    {
        data = get_backend_node_status_json(g_cluster.localNode);
    }

    if (data == NULL)
        return get_minimum_message(WD_ERROR_MESSAGE, pkt);

    replyPkt = get_empty_packet();
    set_message_type(replyPkt, WD_DATA_MESSAGE);
    set_message_commandID(replyPkt, pkt->command_id);
    set_message_data(replyPkt, data, strlen(data));

    return replyPkt;
}
3910
3911 static void
cluster_service_message_processor(WatchdogNode * wdNode,WDPacketData * pkt)3912 cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt)
3913 {
3914 if (pkt->type != WD_CLUSTER_SERVICE_MESSAGE)
3915 return;
3916
3917 if (pkt->len != 1 || pkt->data == NULL)
3918 {
3919 ereport(LOG,
3920 (errmsg("node \"%s\" sent an invalid cluster service message", wdNode->nodeName)));
3921 return;
3922 }
3923
3924 switch (pkt->data[0])
3925 {
3926 case CLUSTER_IAM_TRUE_LEADER:
3927 {
3928 /*
3929 * The cluster was in split-brain and remote node thinks it is
3930 * the worthy leader
3931 */
3932 if (get_local_node_state() == WD_COORDINATOR)
3933 {
3934 ereport(LOG,
3935 (errmsg("remote node \"%s\" decided it is the true leader", wdNode->nodeName),
3936 errdetail("re-initializing the local watchdog cluster state because of split-brain")));
3937
3938 send_cluster_service_message(NULL, pkt, CLUSTER_IAM_RESIGNING_FROM_LEADER);
3939 set_state(WD_JOINING);
3940 }
3941 else if (WD_LEADER_NODE != NULL && WD_LEADER_NODE != wdNode)
3942 {
3943 ereport(LOG,
3944 (errmsg("remote node \"%s\" thinks it is a leader/coordinator and I am causing the split-brain," \
3945 " but as per our record \"%s\" is the cluster leader/coordinator",
3946 wdNode->nodeName,
3947 WD_LEADER_NODE->nodeName),
3948 errdetail("restarting the cluster")));
3949 send_cluster_service_message(NULL, pkt, CLUSTER_NEEDS_ELECTION);
3950 set_state(WD_JOINING);
3951 }
3952 }
3953 break;
3954
3955 case CLUSTER_IAM_RESIGNING_FROM_LEADER:
3956 {
3957 if (WD_LEADER_NODE == wdNode)
3958 {
3959 ereport(LOG,
3960 (errmsg("leader/coordinator node \"%s\" decided to resigning from leader, probably because of split-brain",
3961 wdNode->nodeName),
3962 errdetail("re-initializing the local watchdog cluster state")));
3963
3964 set_state(WD_JOINING);
3965 }
3966 else
3967 {
3968 ereport(LOG,
3969 (errmsg("leader/coordinator node \"%s\" decided to resign from leader, probably because of split-brain",
3970 wdNode->nodeName),
3971 errdetail("It was not our coordinator/leader anyway. ignoring the message")));
3972 }
3973 }
3974 break;
3975
3976 case CLUSTER_IN_SPLIT_BRAIN:
3977 {
3978 try_connecting_with_all_unreachable_nodes();
3979 if (get_local_node_state() == WD_COORDINATOR)
3980 {
3981 ereport(LOG,
3982 (errmsg("remote node \"%s\" detected the cluster is in split-brain", wdNode->nodeName),
3983 errdetail("broadcasting the beacon message")));
3984 send_message_of_type(NULL, WD_IAM_COORDINATOR_MESSAGE, NULL);
3985 }
3986 }
3987 break;
3988
3989 case CLUSTER_NEEDS_ELECTION:
3990 {
3991 ereport(LOG,
3992 (errmsg("remote node \"%s\" detected the problem and asking us to rejoin the cluster", wdNode->nodeName)));
3993
3994 set_state(WD_JOINING);
3995 }
3996 break;
3997
3998 case CLUSTER_IAM_NOT_TRUE_LEADER:
3999 {
4000 if (WD_LEADER_NODE == wdNode)
4001 {
4002 ereport(LOG,
4003 (errmsg("leader/coordinator node \"%s\" decided it was not true leader, probably because of split-brain", wdNode->nodeName),
4004 errdetail("re-initializing the local watchdog cluster state")));
4005
4006 set_state(WD_JOINING);
4007 }
4008 else if (get_local_node_state() == WD_COORDINATOR)
4009 {
4010 ereport(LOG,
4011 (errmsg("node \"%s\" was also thinking it was a leader/coordinator and decided to resign", wdNode->nodeName),
4012 errdetail("cluster is recovering from split-brain")));
4013 }
4014 else
4015 {
4016 ereport(LOG,
4017 (errmsg("leader/coordinator node \"%s\" decided to resign from leader, probably because of split-brain",
4018 wdNode->nodeName),
4019 errdetail("but it was not our coordinator/leader anyway. ignoring the message")));
4020 }
4021 }
4022 break;
4023
4024 case CLUSTER_NODE_REQUIRE_TO_RELOAD:
4025 {
4026 watchdog_state_machine(WD_EVENT_WD_STATE_REQUIRE_RELOAD, NULL, NULL, NULL);
4027 }
4028 break;
4029
4030 case CLUSTER_NODE_APPEARING_LOST:
4031 {
4032 ereport(LOG,
4033 (errmsg("remote node \"%s\" is reporting that it has lost us",
4034 wdNode->nodeName)));
4035 wdNode->has_lost_us = true;
4036 watchdog_state_machine(WD_EVENT_I_AM_APPEARING_LOST, wdNode, NULL, NULL);
4037 }
4038 break;
4039
4040 case CLUSTER_NODE_APPEARING_FOUND:
4041 {
4042 ereport(LOG,
4043 (errmsg("remote node \"%s\" is reporting that it has found us again",
4044 wdNode->nodeName)));
4045 wdNode->has_lost_us = false;
4046 watchdog_state_machine(WD_EVENT_I_AM_APPEARING_FOUND, wdNode, NULL, NULL);
4047 }
4048 break;
4049
4050 case CLUSTER_NODE_INVALID_VERSION:
4051 {
4052 /*
4053 * this should never happen means something is seriously wrong
4054 */
4055 ereport(FATAL,
4056 (return_code(POOL_EXIT_FATAL),
4057 errmsg("\"%s\" node has found serious issues in our watchdog messages",
4058 wdNode->nodeName),
4059 errdetail("shutting down")));
4060 }
4061 break;
4062 default:
4063 break;
4064 }
4065 }
4066
4067 static void
wd_execute_cluster_command_processor(WatchdogNode * wdNode,WDPacketData * pkt)4068 wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt)
4069 {
4070 /* get the json for node list */
4071 char *clusterCommand = NULL;
4072 List *args_list = NULL;
4073
4074 if (pkt->type != WD_EXECUTE_COMMAND_REQUEST)
4075 return;
4076
4077 if (pkt->len <= 0 || pkt->data == NULL)
4078 {
4079 ereport(LOG,
4080 (errmsg("node \"%s\" sent an empty execute cluster command message", wdNode->nodeName)));
4081 return;
4082 }
4083
4084 if (!parse_wd_exec_cluster_command_json(pkt->data, pkt->len,
4085 &clusterCommand, &args_list))
4086 {
4087 ereport(LOG,
4088 (errmsg("node \"%s\" sent an invalid JSON data in cluster command message", wdNode->nodeName)));
4089 return;
4090 }
4091
4092 ereport(DEBUG1,
4093 (errmsg("received \"%s\" command from node \"%s\"",clusterCommand, wdNode->nodeName)));
4094 if (strcasecmp(WD_COMMAND_SHUTDOWN_CLUSTER, clusterCommand) == 0)
4095 {
4096 char mode = 's';
4097 ListCell *lc;
4098 foreach(lc, args_list)
4099 {
4100 WDExecCommandArg *wdExecCommandArg = lfirst(lc);
4101 if (strcmp(wdExecCommandArg->arg_name, "mode") == 0)
4102 {
4103 mode = wdExecCommandArg->arg_value[0];
4104 }
4105 else
4106 ereport(LOG,
4107 (errmsg("unsupported argument \"%s\" in shutdown command from remote node \"%s\"", wdExecCommandArg->arg_name, wdNode->nodeName)));
4108 }
4109
4110 ereport(LOG,
4111 (errmsg("processing shutdown command from remote node \"%s\"", wdNode->nodeName)));
4112 terminate_pgpool(mode, false);
4113 }
4114 else if (strcasecmp(WD_COMMAND_RELOAD_CONFIG_CLUSTER, clusterCommand) == 0)
4115 {
4116 ereport(LOG,
4117 (errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName)));
4118 pool_signal_parent(SIGHUP);
4119 }
4120 else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
4121 {
4122 int lock_type = -1;
4123 char *operation = NULL;
4124 if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR)
4125 {
4126 if (list_length(args_list) == 2)
4127 {
4128 ListCell *lc;
4129 foreach(lc, args_list)
4130 {
4131 WDExecCommandArg *wdExecCommandArg = lfirst(lc);
4132 if (strcmp(wdExecCommandArg->arg_name, "StandbyLockType") == 0)
4133 {
4134 lock_type = atoi(wdExecCommandArg->arg_value);
4135 }
4136 else if (strcmp(wdExecCommandArg->arg_name, "LockingOperation") == 0)
4137 {
4138 operation = wdExecCommandArg->arg_value;
4139 }
4140 else
4141 ereport(LOG,
4142 (errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg->arg_name, wdNode->nodeName)));
4143 }
4144 if (lock_type < 0 || operation == NULL)
4145 {
4146 ereport(LOG,
4147 (errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName),
4148 errdetail("command ignored")));
4149 }
4150 else if (lock_type == WD_FOLLOW_PRIMARY_LOCK)
4151 {
4152 ereport(LOG,
4153 (errmsg("processing follow primary looking[%s] request from remote node \"%s\"", operation,wdNode->nodeName)));
4154
4155 if (strcasecmp("acquire", operation) == 0)
4156 pool_acquire_follow_primary_lock(false, true);
4157 else if (strcasecmp("release", operation) == 0)
4158 pool_release_follow_primary_lock(true);
4159 else
4160 ereport(LOG,
4161 (errmsg("invalid looking operaition[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName),
4162 errdetail("command ignored")));
4163 }
4164 else
4165 ereport(LOG,
4166 (errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName)));
4167
4168 }
4169 else
4170 {
4171 ereport(LOG,
4172 (errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"", wdNode->nodeName)));
4173 }
4174 }
4175 else if (get_local_node_state() != WD_STANDBY)
4176 {
4177 ereport(LOG,
4178 (errmsg("invalid node state to execute 'LOCK ON STANDBY' command")));
4179
4180 }
4181 else
4182 {
4183 ereport(LOG,
4184 (errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"),
4185 errdetail("ignoring...")));
4186 }
4187 }
4188 else
4189 {
4190 ereport(WARNING,
4191 (errmsg("received \"%s\" command from node \"%s\" is not supported",clusterCommand, wdNode->nodeName)));
4192 }
4193
4194 if (args_list)
4195 list_free_deep(args_list);
4196 pfree(clusterCommand);
4197 return;
4198 }
4199
4200 static int
standard_packet_processor(WatchdogNode * wdNode,WDPacketData * pkt)4201 standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
4202 {
4203 WDPacketData *replyPkt = NULL;
4204
4205 switch (pkt->type)
4206 {
4207 case WD_FAILOVER_WAITING_FOR_CONSENSUS:
4208 ereport(LOG,
4209 (errmsg("remote node \"%s\" is asking to inform about quarantined backend nodes", wdNode->nodeName)));
4210 register_inform_quarantine_nodes_req();
4211 break;
4212
4213 case WD_EXECUTE_COMMAND_REQUEST:
4214 wd_execute_cluster_command_processor(wdNode, pkt);
4215 break;
4216
4217 case WD_CLUSTER_SERVICE_MESSAGE:
4218 cluster_service_message_processor(wdNode, pkt);
4219 break;
4220
4221 case WD_GET_LEADER_DATA_REQUEST:
4222 replyPkt = process_data_request(wdNode, pkt);
4223 break;
4224
4225 case WD_ASK_FOR_POOL_CONFIG:
4226 {
4227 char *config_data = get_pool_config_json();
4228
4229 if (config_data)
4230 {
4231 replyPkt = get_empty_packet();
4232 set_message_type(replyPkt, WD_POOL_CONFIG_DATA);
4233 set_message_commandID(replyPkt, pkt->command_id);
4234 set_message_data(replyPkt, config_data, strlen(config_data));
4235 }
4236 else
4237 {
4238 replyPkt = get_minimum_message(WD_ERROR_MESSAGE, pkt);
4239
4240 }
4241 }
4242 break;
4243
4244 case WD_POOL_CONFIG_DATA:
4245 {
4246 /* only accept config data if I am the coordinator node */
4247 if (get_local_node_state() == WD_COORDINATOR && pkt->data)
4248 {
4249 POOL_CONFIG *standby_config = get_pool_config_from_json(pkt->data, pkt->len);
4250
4251 if (standby_config)
4252 {
4253 verify_pool_configurations(wdNode, standby_config);
4254 }
4255 }
4256 }
4257 break;
4258
4259 case WD_ADD_NODE_MESSAGE:
4260 case WD_REQ_INFO_MESSAGE:
4261 replyPkt = get_mynode_info_message(pkt);
4262 break;
4263
4264 case WD_INFO_MESSAGE:
4265 {
4266 char *authkey = NULL;
4267 int oldQuorumStatus;
4268 WD_STATES oldNodeState;
4269 WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);
4270
4271 if (tempNode == NULL)
4272 {
4273 ereport(WARNING,
4274 (errmsg("node \"%s\" sent an invalid node info message", wdNode->nodeName)));
4275 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_INVALID_VERSION);
4276 break;
4277 }
4278 oldQuorumStatus = wdNode->quorum_status;
4279 oldNodeState = wdNode->state;
4280 wdNode->state = tempNode->state;
4281 wdNode->startup_time.tv_sec = tempNode->startup_time.tv_sec;
4282 wdNode->wd_priority = tempNode->wd_priority;
4283 strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);
4284
4285 wdNode->current_state_time.tv_sec = tempNode->current_state_time.tv_sec;
4286 wdNode->escalated = tempNode->escalated;
4287 wdNode->standby_nodes_count = tempNode->standby_nodes_count;
4288 wdNode->quorum_status = tempNode->quorum_status;
4289
4290 print_watchdog_node_info(wdNode);
4291
4292 if (authkey)
4293 pfree(authkey);
4294
4295 if (wdNode->state == WD_COORDINATOR)
4296 {
4297 if (WD_LEADER_NODE == NULL)
4298 {
4299 set_cluster_leader_node(wdNode);
4300 }
4301 else if (WD_LEADER_NODE != wdNode)
4302 {
4303 ereport(LOG,
4304 (errmsg("\"%s\" is the coordinator as per our record but \"%s\" is also announcing as a coordinator",
4305 WD_LEADER_NODE->nodeName, wdNode->nodeName),
4306 errdetail("cluster is in the split-brain")));
4307
4308 if (get_local_node_state() != WD_COORDINATOR)
4309 {
4310 /*
4311 * This fight doesn't belong to me broadcast the
4312 * message about cluster in split-brain
4313 */
4314
4315 send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
4316 }
4317 else
4318 {
4319 /*
4320 * okay the contention is between me and the other
4321 * node try to figure out which node is the worthy
4322 * leader
4323 */
4324 ereport(LOG,
4325 (errmsg("I am the coordinator but \"%s\" is also announcing as a coordinator", wdNode->nodeName),
4326 errdetail("trying to figure out the best contender for the leader/coordinator node")));
4327
4328 handle_split_brain(wdNode, pkt);
4329 }
4330 }
4331 else if (WD_LEADER_NODE == wdNode && oldQuorumStatus != wdNode->quorum_status)
4332 {
4333 /* inform Pgpool main about quorum status changes */
4334 register_watchdog_quorum_change_interrupt();
4335 }
4336 }
4337
4338 /*
4339 * if the info message is from leader node. Make sure we are
4340 * in sync with the leader node state
4341 */
4342 else if (WD_LEADER_NODE == wdNode)
4343 {
4344 if (wdNode->state != WD_COORDINATOR)
4345 {
4346 ereport(WARNING,
4347 (errmsg("the coordinator as per our record is not coordinator anymore"),
4348 errdetail("re-initializing the cluster")));
4349 set_state(WD_JOINING);
4350 }
4351 }
4352 pfree(tempNode);
4353
4354 if (oldNodeState == WD_STANDBY && wdNode->state != oldNodeState)
4355 {
4356 standby_node_left_cluster(wdNode);
4357 }
4358 if (oldNodeState == WD_LOST)
4359 {
4360 /*
4361 * We have received the message from lost node
4362 * add it back to cluster if it was not marked by
4363 * life-check
4364 * Node lost by life-check processes can only be
4365 * added back when we get alive notification for the
4366 * node from life-check
4367 */
4368 ereport(LOG,
4369 (errmsg("we have received the NODE INFO message from the node:\"%s\" that was lost",wdNode->nodeName),
4370 errdetail("we had lost this node because of \"%s\"",wd_node_lost_reasons[wdNode->node_lost_reason])));
4371
4372 if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
4373 {
4374 ereport(LOG,
4375 (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
4376 errdetail("only life-check process can mark this node alive again")));
4377 /* restore the node's lost state */
4378 wdNode->state = oldNodeState;
4379 }
4380 else
4381 {
4382 watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
4383 }
4384 }
4385 }
4386 break;
4387
4388 case WD_JOIN_COORDINATOR_MESSAGE:
4389 {
4390 /*
4391 * if I am coordinator reply with accept, otherwise reject
4392 */
4393 if (g_cluster.localNode == WD_LEADER_NODE)
4394 {
4395 replyPkt = get_minimum_message(WD_ACCEPT_MESSAGE, pkt);
4396 }
4397 else
4398 {
4399 replyPkt = get_minimum_message(WD_REJECT_MESSAGE, pkt);
4400 }
4401 }
4402 break;
4403
4404 case WD_IAM_COORDINATOR_MESSAGE:
4405 {
4406 /*
4407 * if the message is received from coordinator reply with
4408 * info, otherwise reject
4409 */
4410 if (WD_LEADER_NODE != NULL && wdNode != WD_LEADER_NODE)
4411 {
4412 ereport(LOG,
4413 (errmsg("\"%s\" is our coordinator node, but \"%s\" is also announcing as a coordinator",
4414 WD_LEADER_NODE->nodeName, wdNode->nodeName),
4415 errdetail("broadcasting the cluster in split-brain message")));
4416
4417 send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
4418 }
4419 else if (WD_LEADER_NODE != NULL)
4420 {
4421 replyPkt = get_mynode_info_message(pkt);
4422 beacon_message_received_from_node(wdNode, pkt);
4423 }
4424 /*
4425 * if (WD_LEADER_NODE == NULL)
4426 * do not reply to beacon if we are not connected to
4427 * any leader node
4428 */
4429 }
4430 break;
4431
4432 default:
4433 break;
4434 }
4435 if (replyPkt)
4436 {
4437 if (send_message_to_node(wdNode, replyPkt) == false)
4438 ereport(LOG,
4439 (errmsg("sending packet to node \"%s\" failed", wdNode->nodeName)));
4440 free_packet(replyPkt);
4441 }
4442 return 1;
4443 }
4444
4445
4446 static bool
send_message_to_connection(SocketConnection * conn,WDPacketData * pkt)4447 send_message_to_connection(SocketConnection * conn, WDPacketData * pkt)
4448 {
4449 if (check_debug_request_kill_all_communication() == true ||
4450 check_debug_request_kill_all_senders() == true)
4451 return false;
4452
4453 if (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
4454 {
4455 if (write_packet_to_socket(conn->sock, pkt, false) == true)
4456 return true;
4457 ereport(DEBUG1,
4458 (errmsg("sending packet failed, closing connection")));
4459 close_socket_connection(conn);
4460 }
4461
4462 return false;
4463 }
4464
4465 static bool
send_message_to_node(WatchdogNode * wdNode,WDPacketData * pkt)4466 send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt)
4467 {
4468 bool ret;
4469
4470 print_packet_node_info(pkt, wdNode, true);
4471
4472 ret = send_message_to_connection(&wdNode->client_socket, pkt);
4473 if (ret == false)
4474 {
4475 ret = send_message_to_connection(&wdNode->server_socket, pkt);
4476 }
4477 if (ret)
4478 {
4479 /* reset the sending error counter */
4480 wdNode->sending_failures_count = 0;
4481 /* we only update the last sent time if reply for packet is expected */
4482 switch (pkt->type)
4483 {
4484 case WD_REMOTE_FAILOVER_REQUEST:
4485 case WD_IPC_FAILOVER_COMMAND:
4486 if (wdNode->last_sent_time.tv_sec <= 0)
4487 gettimeofday(&wdNode->last_sent_time, NULL);
4488 break;
4489 default:
4490 break;
4491 }
4492 }
4493 else
4494 {
4495 wdNode->sending_failures_count++;
4496 ereport(DEBUG1,
4497 (errmsg("sending packet %c to node \"%s\" failed", pkt->type, wdNode->nodeName)));
4498 }
4499 return ret;
4500 }
4501
4502 /*
4503 * If wdNode is NULL message is sent to all nodes
4504 * Returns the number of nodes the message is sent to
4505 */
4506 static int
send_message(WatchdogNode * wdNode,WDPacketData * pkt)4507 send_message(WatchdogNode * wdNode, WDPacketData * pkt)
4508 {
4509 int i,
4510 count = 0;
4511
4512 if (wdNode)
4513 {
4514 if (wdNode == g_cluster.localNode) /* Always return 1 if I myself is
4515 * intended receiver */
4516 return 1;
4517 if (send_message_to_node(wdNode, pkt))
4518 return 1;
4519 return 0;
4520 }
4521 /* NULL means send to all reachable nodes */
4522 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4523 {
4524 wdNode = &(g_cluster.remoteNodes[i]);
4525 if (is_node_reachable(wdNode) && send_message_to_node(wdNode, pkt))
4526 count++;
4527 }
4528 return count;
4529 }
4530
wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand,WatchdogNode * wdLostNode)4531 static IPC_CMD_PROCESS_RES wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand, WatchdogNode * wdLostNode)
4532 {
4533 if (ipcCommand->sendToNode)
4534 {
4535 /* The command was sent to one node only */
4536 if (ipcCommand->sendToNode == wdLostNode)
4537 {
4538 /*
4539 * Fail this command, Since the only node it was sent to is lost
4540 */
4541 ipcCommand->commandStatus = COMMAND_FINISHED_SEND_FAILED;
4542 wd_command_is_complete(ipcCommand);
4543 return IPC_CMD_ERROR;
4544 }
4545 else
4546 {
4547 /* Dont worry this command is fine for now */
4548 return IPC_CMD_PROCESSING;
4549 }
4550 }
4551 else
4552 {
4553 /* search the node that is lost */
4554 int i;
4555
4556 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4557 {
4558 WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
4559
4560 if (nodeResult->wdNode == wdLostNode)
4561 {
4562 if (nodeResult->cmdState == COMMAND_STATE_SENT)
4563 {
4564 ereport(LOG,
4565 (errmsg("remote node \"%s\" lost while IPC command was in progress ", wdLostNode->nodeName)));
4566
4567 /*
4568 * since the node is lost and will be removed from the
4569 * cluster So remove decrement the sent count of command
4570 * and see what is the situation after that
4571 */
4572 nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
4573 ipcCommand->commandSendToCount--;
4574 if (ipcCommand->commandSendToCount <= ipcCommand->commandReplyFromCount)
4575 {
4576 /*
4577 * If we have already received the results from all
4578 * alive nodes finish the command
4579 */
4580 ipcCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
4581 wd_command_is_complete(ipcCommand);
4582 return IPC_CMD_COMPLETE;
4583 }
4584 }
4585 break;
4586 }
4587 }
4588 }
4589 return IPC_CMD_PROCESSING;
4590 }
4591
4592 static void
wd_command_is_complete(WDCommandData * ipcCommand)4593 wd_command_is_complete(WDCommandData * ipcCommand)
4594 {
4595 if (ipcCommand->commandCompleteFunc)
4596 {
4597 ipcCommand->commandCompleteFunc(ipcCommand);
4598 return;
4599 }
4600
4601 /*
4602 * There is not special function for this command use the standard reply
4603 */
4604 if (ipcCommand->commandSource == COMMAND_SOURCE_IPC)
4605 {
4606 char res_type;
4607
4608 switch (ipcCommand->commandStatus)
4609 {
4610 case COMMAND_FINISHED_ALL_REPLIED:
4611 res_type = WD_IPC_CMD_RESULT_OK;
4612 break;
4613 case COMMAND_FINISHED_TIMEOUT:
4614 res_type = WD_IPC_CMD_TIMEOUT;
4615 break;
4616 case COMMAND_FINISHED_NODE_REJECTED:
4617 case COMMAND_FINISHED_SEND_FAILED:
4618 res_type = WD_IPC_CMD_RESULT_BAD;
4619 break;
4620 default:
4621 res_type = WD_IPC_CMD_RESULT_OK;
4622 break;
4623 }
4624 write_ipc_command_with_result_data(ipcCommand, res_type, NULL, 0);
4625 }
4626 else if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
4627 {
4628 char res_type;
4629
4630 if (ipcCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
4631 res_type = WD_ACCEPT_MESSAGE;
4632 else
4633 res_type = WD_REJECT_MESSAGE;
4634
4635 reply_with_minimal_message(ipcCommand->sourceWdNode, res_type, &ipcCommand->commandPacket);
4636 }
4637 }
4638
4639
4640 static void
node_lost_while_ipc_command(WatchdogNode * wdNode)4641 node_lost_while_ipc_command(WatchdogNode * wdNode)
4642 {
4643 List *ipcCommands_to_del = NIL;
4644 ListCell *lc;
4645
4646 foreach(lc, g_cluster.ipc_commands)
4647 {
4648 WDCommandData *ipcCommand = lfirst(lc);
4649 IPC_CMD_PROCESS_RES res = wd_command_processor_for_node_lost_event(ipcCommand, wdNode);
4650
4651 if (res != IPC_CMD_PROCESSING)
4652 {
4653 ipcCommands_to_del = lappend(ipcCommands_to_del, ipcCommand);
4654 }
4655 }
4656 /* delete completed commands */
4657 foreach(lc, ipcCommands_to_del)
4658 {
4659 WDCommandData *ipcCommand = lfirst(lc);
4660
4661 cleanUpIPCCommand(ipcCommand);
4662 }
4663
4664 list_free(ipcCommands_to_del);
4665 }
4666
4667
4668 /*
4669 * The function walks through all command and resends
4670 * the failed message again if it can.
4671 */
4672 static void
service_ipc_commands(void)4673 service_ipc_commands(void)
4674 {
4675 ListCell *lc;
4676
4677 foreach(lc, g_cluster.ipc_commands)
4678 {
4679 WDCommandData *ipcCommand = lfirst(lc);
4680
4681 if (ipcCommand && ipcCommand->commandSendToErrorCount)
4682 {
4683 int i;
4684
4685 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4686 {
4687 WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
4688
4689 if (nodeResult->cmdState == COMMAND_STATE_SEND_ERROR)
4690 {
4691 if (is_node_active_and_reachable(nodeResult->wdNode))
4692 {
4693 ereport(LOG,
4694 (errmsg("remote node \"%s\" is reachable again, resending the command packet ", nodeResult->wdNode->nodeName)));
4695
4696 if (send_message_to_node(nodeResult->wdNode, &ipcCommand->commandPacket) == true)
4697 {
4698 nodeResult->cmdState = COMMAND_STATE_SENT;
4699 ipcCommand->commandSendToErrorCount--;
4700 ipcCommand->commandSendToCount++;
4701 if (ipcCommand->commandSendToErrorCount == 0)
4702 break;
4703 }
4704 }
4705 }
4706 }
4707 }
4708 }
4709 }
4710
4711 static void
service_internal_command(void)4712 service_internal_command(void)
4713 {
4714 int i;
4715 ListCell *lc;
4716 List *finishedCommands = NULL;
4717
4718 if (g_cluster.clusterCommands == NULL)
4719 return;
4720
4721 foreach(lc, g_cluster.clusterCommands)
4722 {
4723 WDCommandData *clusterCommand = lfirst(lc);
4724
4725 if (clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4726 {
4727 /* command needs to be cleaned up */
4728 finishedCommands = lappend(finishedCommands, clusterCommand);
4729 continue;
4730 }
4731
4732 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4733 {
4734 WDCommandNodeResult *nodeResult = &clusterCommand->nodeResults[i];
4735
4736 if (nodeResult->cmdState == COMMAND_STATE_SEND_ERROR)
4737 {
4738 if (is_node_active_and_reachable(nodeResult->wdNode))
4739 {
4740 if (send_message_to_node(nodeResult->wdNode, &clusterCommand->commandPacket) == true)
4741 {
4742 nodeResult->cmdState = COMMAND_STATE_SENT;
4743 clusterCommand->commandSendToCount++;
4744 }
4745 }
4746 }
4747 }
4748 }
4749 /* delete the finished commands */
4750 foreach(lc, finishedCommands)
4751 {
4752 WDCommandData *clusterCommand = lfirst(lc);
4753
4754 g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4755 MemoryContextDelete(clusterCommand->memoryContext);
4756 }
4757
4758 list_free(finishedCommands);
4759 }
4760
4761 /* remove the unreachable nodes from cluster */
4762 static void
service_unreachable_nodes(void)4763 service_unreachable_nodes(void)
4764 {
4765 int i;
4766 struct timeval currTime;
4767
4768 gettimeofday(&currTime, NULL);
4769
4770 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4771 {
4772 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
4773
4774 if (wdNode->state == WD_LOST && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
4775 && pool_config->wd_lost_node_removal_timeout)
4776 {
4777 int lost_seconds = WD_TIME_DIFF_SEC(currTime, wdNode->lost_time);
4778 if (lost_seconds >= pool_config->wd_lost_node_removal_timeout)
4779 {
4780 ereport(LOG,
4781 (errmsg("remote node \"%s\" is lost for %d seconds", wdNode->nodeName,lost_seconds),
4782 errdetail("revoking the node's membership")));
4783 revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_LOST);
4784 }
4785 continue;
4786 }
4787
4788 if (wdNode->state == WD_DEAD && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
4789 && pool_config->wd_initial_node_showup_time)
4790 {
4791 int no_show_seconds = WD_TIME_DIFF_SEC(currTime, g_cluster.localNode->startup_time);
4792 if (no_show_seconds >= pool_config->wd_initial_node_showup_time)
4793 {
4794 ereport(LOG,
4795 (errmsg("remote node \"%s\" didn't showed-up in %d seconds", wdNode->nodeName,no_show_seconds),
4796 errdetail("revoking the node's membership")));
4797 revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_NO_SHOW);
4798 }
4799 continue;
4800 }
4801
4802 if (is_node_active(wdNode) == false)
4803 continue;
4804
4805 if (is_node_reachable(wdNode) || wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
4806 {
4807 /* check if we are waiting for reply from this node */
4808 if (wdNode->last_sent_time.tv_sec > 0)
4809 {
4810 if (WD_TIME_DIFF_SEC(currTime, wdNode->last_sent_time) >= MAX_SECS_WAIT_FOR_REPLY_FROM_NODE)
4811 {
4812 ereport(LOG,
4813 (errmsg("remote node \"%s\" is not replying..", wdNode->nodeName),
4814 errdetail("marking the node as lost")));
4815 /* mark the node as lost */
4816 wdNode->node_lost_reason = NODE_LOST_BY_RECEIVE_TIMEOUT;
4817 watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4818 }
4819 }
4820 else if (wdNode->sending_failures_count > MAX_ALLOWED_SEND_FAILURES)
4821 {
4822 ereport(LOG,
4823 (errmsg("not able to send messages to remote node \"%s\"",wdNode->nodeName),
4824 errdetail("marking the node as lost")));
4825 /* mark the node as lost */
4826 wdNode->node_lost_reason = NODE_LOST_BY_SEND_FAILURE;
4827 watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4828 }
4829 else if (wdNode->missed_beacon_count > MAX_ALLOWED_BEACON_REPLY_MISS)
4830 {
4831 ereport(LOG,
4832 (errmsg("remote node \"%s\" is not responding to our beacon messages",wdNode->nodeName),
4833 errdetail("marking the node as lost")));
4834 /* mark the node as lost */
4835 wdNode->node_lost_reason = NODE_LOST_BY_MISSING_BEACON;
4836 wdNode->missed_beacon_count = 0; /* Reset the counter */
4837 watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4838 }
4839 }
4840 else
4841 {
4842 ereport(LOG,
4843 (errmsg("remote node \"%s\" is not reachable", wdNode->nodeName),
4844 errdetail("marking the node as lost")));
4845 wdNode->node_lost_reason = NODE_LOST_BY_NOT_REACHABLE;
4846 watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4847 }
4848 }
4849 }
4850
4851 static bool
watchdog_internal_command_packet_processor(WatchdogNode * wdNode,WDPacketData * pkt)4852 watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
4853 {
4854 int i;
4855 WDCommandNodeResult *nodeResult = NULL;
4856 WDCommandData *clusterCommand = get_wd_cluster_command_from_reply(pkt);
4857
4858 if (clusterCommand == NULL || clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4859 return false;
4860
4861 if (pkt->type != WD_ERROR_MESSAGE &&
4862 pkt->type != WD_ACCEPT_MESSAGE &&
4863 pkt->type != WD_REJECT_MESSAGE &&
4864 pkt->type != WD_INFO_MESSAGE)
4865 return false;
4866
4867 if (pkt->type == WD_INFO_MESSAGE)
4868 standard_packet_processor(wdNode, pkt);
4869
4870 /* get the result node for */
4871 for (i = 0; i < g_cluster.remoteNodeCount; i++)
4872 {
4873 WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
4874 if (nodeRes->wdNode == wdNode)
4875 {
4876 nodeResult = nodeRes;
4877 break;
4878 }
4879 }
4880 if (nodeResult == NULL)
4881 {
4882 ereport(NOTICE, (errmsg("unable to find node result")));
4883 return true;
4884 }
4885
4886 ereport(DEBUG1,
4887 (errmsg("Watchdog node \"%s\" has replied for command id %d", nodeResult->wdNode->nodeName, pkt->command_id)));
4888
4889 nodeResult->result_type = pkt->type;
4890 nodeResult->cmdState = COMMAND_STATE_REPLIED;
4891 clusterCommand->commandReplyFromCount++;
4892
4893 if (clusterCommand->commandReplyFromCount >= clusterCommand->commandSendToCount)
4894 {
4895 if (pkt->type == WD_REJECT_MESSAGE || pkt->type == WD_ERROR_MESSAGE)
4896 {
4897 ereport(DEBUG1,
4898 (errmsg("command %c with command id %d is finished with COMMAND_FINISHED_NODE_REJECTED", pkt->type, pkt->command_id)));
4899 clusterCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
4900 }
4901 else
4902 {
4903 ereport(DEBUG1,
4904 (errmsg("command %c with command id %d is finished with COMMAND_FINISHED_ALL_REPLIED", pkt->type, pkt->command_id)));
4905 clusterCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
4906 }
4907 watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, wdNode, pkt, clusterCommand);
4908 g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4909 MemoryContextDelete(clusterCommand->memoryContext);
4910 }
4911 else if (pkt->type == WD_REJECT_MESSAGE || pkt->type == WD_ERROR_MESSAGE)
4912 {
4913 /* Error or reject message by any node immediately finishes the command */
4914 ereport(DEBUG1,
4915 (errmsg("command %c with command id %d is finished with COMMAND_FINISHED_NODE_REJECTED", pkt->type, pkt->command_id)));
4916 clusterCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
4917 watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, wdNode, pkt, clusterCommand);
4918 g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4919 MemoryContextDelete(clusterCommand->memoryContext);
4920 }
4921 return true; /* do not process this packet further */
4922 }
4923
4924
4925 static void
check_for_current_command_timeout(void)4926 check_for_current_command_timeout(void)
4927 {
4928 struct timeval currTime;
4929
4930 ListCell *lc;
4931 List *finishedCommands = NULL;
4932
4933 if (g_cluster.clusterCommands == NULL)
4934 return;
4935
4936 gettimeofday(&currTime, NULL);
4937
4938 foreach(lc, g_cluster.clusterCommands)
4939 {
4940 WDCommandData *clusterCommand = lfirst(lc);
4941
4942 if (clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4943 {
4944 /* command needs to be cleaned up */
4945 finishedCommands = lappend(finishedCommands, clusterCommand);
4946 continue;
4947 }
4948 if (WD_TIME_DIFF_SEC(currTime, clusterCommand->commandTime) >= clusterCommand->commandTimeoutSecs)
4949 {
4950 clusterCommand->commandStatus = COMMAND_FINISHED_TIMEOUT;
4951 watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, NULL, NULL, clusterCommand);
4952 finishedCommands = lappend(finishedCommands, clusterCommand);
4953 }
4954 }
4955 /* delete the finished commands */
4956 foreach(lc, finishedCommands)
4957 {
4958 WDCommandData *clusterCommand = lfirst(lc);
4959
4960 g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4961 MemoryContextDelete(clusterCommand->memoryContext);
4962 }
4963
4964 list_free(finishedCommands);
4965 }
4966
4967
4968 /*
4969 * If wdNode is NULL message is sent to all nodes
4970 * Returns the number of nodes the message is sent to
4971 */
4972 static int
issue_watchdog_internal_command(WatchdogNode * wdNode,WDPacketData * pkt,int timeout_sec)4973 issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int timeout_sec)
4974 {
4975 int i;
4976 bool save_message = false;
4977 WDCommandData *clusterCommand;
4978 MemoryContext oldCxt;
4979
4980 clusterCommand = create_command_object(0);
4981
4982 clusterCommand->commandSource = COMMAND_SOURCE_LOCAL;
4983 clusterCommand->sourceWdNode = g_cluster.localNode;
4984 gettimeofday(&clusterCommand->commandTime, NULL);
4985
4986 clusterCommand->commandTimeoutSecs = timeout_sec;
4987 clusterCommand->commandPacket.type = pkt->type;
4988 clusterCommand->commandPacket.command_id = pkt->command_id;
4989 clusterCommand->commandPacket.len = 0;
4990 clusterCommand->commandPacket.data = NULL;
4991
4992 clusterCommand->sendToNode = wdNode;
4993 clusterCommand->commandSendToCount = 0;
4994 clusterCommand->commandReplyFromCount = 0;
4995 clusterCommand->commandStatus = COMMAND_IN_PROGRESS;
4996
4997 allocate_resultNodes_in_command(clusterCommand);
4998
4999 if (wdNode == NULL) /* This is send to all */
5000 {
5001 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5002 {
5003 WDCommandNodeResult *nodeResult = &clusterCommand->nodeResults[i];
5004
5005 clear_command_node_result(nodeResult);
5006 if (is_node_active(nodeResult->wdNode) == false)
5007 {
5008 ereport(DEBUG2,
5009 (errmsg("not sending watchdog internal command packet to DEAD %s", nodeResult->wdNode->nodeName)));
5010 /* Do not send to dead nodes */
5011 nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
5012 }
5013 else
5014 {
5015 if (send_message_to_node(nodeResult->wdNode, pkt) == false)
5016 {
5017 ereport(DEBUG1,
5018 (errmsg("failed to send watchdog internal command packet %s", nodeResult->wdNode->nodeName),
5019 errdetail("saving the packet. will try to resend it if connection recovers")));
5020
5021 /* failed to send. May be try again later */
5022 save_message = true;
5023 nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
5024 }
5025 else
5026 {
5027 nodeResult->cmdState = COMMAND_STATE_SENT;
5028 clusterCommand->commandSendToCount++;
5029 }
5030 }
5031 }
5032 }
5033 if (wdNode)
5034 {
5035 WDCommandNodeResult *nodeResult = NULL;
5036
5037 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5038 {
5039 WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
5040
5041 clear_command_node_result(nodeRes);
5042 if (nodeRes->wdNode == wdNode)
5043 nodeResult = nodeRes;
5044 }
5045 if (nodeResult == NULL)
5046 {
5047 /* should never happen */
5048 ereport(WARNING,
5049 (errmsg("Internal error. Not able to locate node result slot")));
5050 MemoryContextDelete(clusterCommand->memoryContext);
5051 return -1;
5052 }
5053 if (send_message_to_node(nodeResult->wdNode, pkt) == false)
5054 {
5055 /* failed to send. May be try again later */
5056 save_message = true;
5057 nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
5058 }
5059 else
5060 {
5061 nodeResult->cmdState = COMMAND_STATE_SENT;
5062 clusterCommand->commandSendToCount++;
5063 }
5064 }
5065 if (save_message && pkt->len > 0)
5066 {
5067 clusterCommand->commandPacket.data = MemoryContextAlloc(clusterCommand->memoryContext, pkt->len);
5068 memcpy(clusterCommand->commandPacket.data, pkt->data, pkt->len);
5069 clusterCommand->commandPacket.len = pkt->len;
5070 }
5071 ereport(DEBUG2,
5072 (errmsg("new cluster command %c issued with command id %d", pkt->type, pkt->command_id)));
5073
5074 oldCxt = MemoryContextSwitchTo(TopMemoryContext);
5075 g_cluster.clusterCommands = lappend(g_cluster.clusterCommands, clusterCommand);
5076 MemoryContextSwitchTo(oldCxt);
5077
5078 return clusterCommand->commandSendToCount;
5079 }
5080
5081 /*
5082 * Check remote connections except their state are either WD_SHUTDOWN or
5083 * WD_DEAD. If suncceeded in connecting to any of the remote nodes, returns
5084 * true, otherwise false.
5085 */
5086 static bool
service_lost_connections(void)5087 service_lost_connections(void)
5088 {
5089 int i;
5090 struct timeval currTime;
5091 bool ret = false;
5092
5093 gettimeofday(&currTime, NULL);
5094 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5095 {
5096 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5097
5098 if (wdNode->state == WD_SHUTDOWN || wdNode->state == WD_DEAD)
5099 continue;
5100
5101 if (is_socket_connection_connected(&wdNode->client_socket) == false)
5102 {
5103 if (WD_TIME_DIFF_SEC(currTime, wdNode->client_socket.tv) <= MIN_SECS_CONNECTION_RETRY)
5104 continue;
5105
5106 if (wdNode->client_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT)
5107 {
5108 connect_to_node(wdNode);
5109 if (wdNode->client_socket.sock_state == WD_SOCK_CONNECTED)
5110 {
5111 ereport(LOG,
5112 (errmsg("connection to the remote node \"%s\" is restored", wdNode->nodeName)));
5113 watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
5114 ret = true;
5115 }
5116 }
5117 }
5118 }
5119 return ret;
5120 }
5121
5122 /*
5123 * The function only considers the node state.
5124 * All node states count towards the cluster participating nodes
5125 * except the dead and lost nodes.
5126 */
5127 static int
get_cluster_node_count(void)5128 get_cluster_node_count(void)
5129 {
5130 int i;
5131 int count = 0;
5132
5133 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5134 {
5135 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5136
5137 if (wdNode->state == WD_DEAD || wdNode->state == WD_LOST || wdNode->state == WD_SHUTDOWN)
5138 continue;
5139 count++;
5140 }
5141 return count;
5142 }
5143
/*
 * Build a packet for the given message type.
 *
 * Info, add-node and "I am coordinator" messages are produced by their
 * dedicated builders; the remaining supported types are built with
 * get_minimum_message().  replyFor is handed to the builders that accept it.
 *
 * Returns the new packet, or NULL (after logging) for an unsupported type.
 */
static WDPacketData *
get_message_of_type(char type, WDPacketData * replyFor)
{
	switch (type)
	{
		case WD_INFO_MESSAGE:
			return get_mynode_info_message(replyFor);

		case WD_ADD_NODE_MESSAGE:
			return get_addnode_message();

		case WD_IAM_COORDINATOR_MESSAGE:
			return get_beacon_message(WD_IAM_COORDINATOR_MESSAGE, replyFor);

		case WD_FAILOVER_START:
		case WD_FAILOVER_END:
		case WD_REQ_INFO_MESSAGE:
		case WD_STAND_FOR_COORDINATOR_MESSAGE:
		case WD_DECLARE_COORDINATOR_MESSAGE:
		case WD_JOIN_COORDINATOR_MESSAGE:
		case WD_QUORUM_IS_LOST:
		case WD_INFORM_I_AM_GOING_DOWN:
		case WD_ASK_FOR_POOL_CONFIG:
		case WD_FAILOVER_WAITING_FOR_CONSENSUS:
			return get_minimum_message(type, replyFor);

		default:
			break;
	}

	ereport(LOG, (errmsg("invalid message type %c", type)));
	return NULL;
}
5178
5179 static int
send_message_of_type(WatchdogNode * wdNode,char type,WDPacketData * replyFor)5180 send_message_of_type(WatchdogNode * wdNode, char type, WDPacketData * replyFor)
5181 {
5182 int ret = -1;
5183 WDPacketData *pkt = get_message_of_type(type, replyFor);
5184
5185 if (pkt)
5186 {
5187 ret = send_message(wdNode, pkt);
5188 free_packet(pkt);
5189 }
5190 return ret;
5191 }
5192
5193 static int
send_cluster_command(WatchdogNode * wdNode,char type,int timeout_sec)5194 send_cluster_command(WatchdogNode * wdNode, char type, int timeout_sec)
5195 {
5196 int ret = -1;
5197 WDPacketData *pkt = get_message_of_type(type, NULL);
5198
5199 if (pkt)
5200 {
5201 ret = issue_watchdog_internal_command(wdNode, pkt, timeout_sec);
5202 free_packet(pkt);
5203 }
5204 return ret;
5205 }
5206
5207 static bool
reply_with_minimal_message(WatchdogNode * wdNode,char type,WDPacketData * replyFor)5208 reply_with_minimal_message(WatchdogNode * wdNode, char type, WDPacketData * replyFor)
5209 {
5210 WDPacketData *pkt = get_minimum_message(type, replyFor);
5211 int ret = send_message(wdNode, pkt);
5212
5213 free_packet(pkt);
5214 return ret;
5215 }
5216
5217 static bool
send_cluster_service_message(WatchdogNode * wdNode,WDPacketData * replyFor,char message)5218 send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message)
5219 {
5220 /* Check if its a broadcast message */
5221 if (wdNode == NULL)
5222 {
5223 /* see if we have already broadcasted the similar message recently */
5224 if (message == g_cluster.last_bcast_srv_msg)
5225 {
5226 struct timeval currTime;
5227 gettimeofday(&currTime, NULL);
5228 int last_bcast_sec = WD_TIME_DIFF_SEC(currTime, g_cluster.last_bcast_srv_msg_time);
5229 if (last_bcast_sec < MIN_SECS_BETWEEN_BROADCAST_SRV_MSG)
5230 {
5231 /*
5232 * do not broadcast this message
5233 * to prevent flooding
5234 */
5235 ereport(DEBUG4,
5236 (errmsg("not broadcasting cluster service message %c to prevent flooding ",message),
5237 errdetail("last time same message was sent %d seconds ago",last_bcast_sec)));
5238 return true;
5239 }
5240 }
5241 g_cluster.last_bcast_srv_msg = message;
5242 gettimeofday(&g_cluster.last_bcast_srv_msg_time, NULL);
5243 }
5244 return reply_with_message(wdNode, WD_CLUSTER_SERVICE_MESSAGE, &message, 1, replyFor);
5245 }
5246
5247
5248 static bool
reply_with_message(WatchdogNode * wdNode,char type,char * data,int data_len,WDPacketData * replyFor)5249 reply_with_message(WatchdogNode * wdNode, char type, char *data, int data_len, WDPacketData * replyFor)
5250 {
5251 WDPacketData wdPacket;
5252 int ret;
5253
5254 init_wd_packet(&wdPacket);
5255 set_message_type(&wdPacket, type);
5256
5257 if (replyFor == NULL)
5258 set_next_commandID_in_message(&wdPacket);
5259 else
5260 set_message_commandID(&wdPacket, replyFor->command_id);
5261
5262 set_message_data(&wdPacket, data, data_len);
5263 ret = send_message(wdNode, &wdPacket);
5264 return ret;
5265 }
5266
get_local_node_state(void)5267 static inline WD_STATES get_local_node_state(void)
5268 {
5269 return g_cluster.localNode->state;
5270 }
5271
5272 static inline bool
is_local_node_true_leader(void)5273 is_local_node_true_leader(void)
5274 {
5275 return (get_local_node_state() == WD_COORDINATOR && WD_LEADER_NODE == g_cluster.localNode);
5276 }
5277
5278 /*
5279 * returns true if no message is swallowed by the
5280 * processor and no further action is required
5281 */
5282 static bool
wd_commands_packet_processor(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt)5283 wd_commands_packet_processor(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt)
5284 {
5285 WDCommandData *ipcCommand;
5286
5287 if (event != WD_EVENT_PACKET_RCV)
5288 return false;
5289 if (pkt == NULL)
5290 return false;
5291
5292 if (pkt->type == WD_FAILOVER_LOCKING_REQUEST ||
5293 pkt->type == WD_REMOTE_FAILOVER_REQUEST)
5294 {
5295 /* Node is using the older version of Pgpool-II */
5296 ereport(WARNING,
5297 (errmsg("node \"%s\" is using the older version of Pgpool-II", wdNode->nodeName)));
5298 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_INVALID_VERSION);
5299 return true;
5300 }
5301
5302 if (pkt->type == WD_IPC_FAILOVER_COMMAND)
5303 {
5304 process_remote_failover_command_on_coordinator(wdNode, pkt);
5305 return true;
5306 }
5307
5308 if (pkt->type == WD_IPC_ONLINE_RECOVERY_COMMAND)
5309 {
5310 process_remote_online_recovery_command(wdNode, pkt);
5311 return true;
5312 }
5313
5314 if (pkt->type == WD_DATA_MESSAGE)
5315 {
5316 ipcCommand = get_wd_IPC_command_from_reply(pkt);
5317 if (ipcCommand)
5318 {
5319 if (write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK, pkt->data, pkt->len) == false)
5320 ereport(LOG,
5321 (errmsg("failed to forward data message to IPC command socket")));
5322
5323 cleanUpIPCCommand(ipcCommand);
5324 return true; /* do not process this packet further */
5325 }
5326 return false;
5327 }
5328
5329 if (pkt->type == WD_CMD_REPLY_IN_DATA)
5330 {
5331 ipcCommand = get_wd_IPC_command_from_reply(pkt);
5332 if (ipcCommand == NULL)
5333 return false;
5334
5335 /* Just forward the data to IPC socket and finish the command */
5336 if (write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK, pkt->data, pkt->len) == false)
5337 ereport(LOG,
5338 (errmsg("failed to forward data message to IPC command socket")));
5339
5340 /*
5341 * ok we are done, delete this command
5342 */
5343 cleanUpIPCCommand(ipcCommand);
5344 return true; /* do not process this packet further */
5345 }
5346
5347 else if (pkt->type == WD_ACCEPT_MESSAGE ||
5348 pkt->type == WD_REJECT_MESSAGE ||
5349 pkt->type == WD_ERROR_MESSAGE)
5350 {
5351 ipcCommand = get_wd_IPC_command_from_reply(pkt);
5352
5353 if (ipcCommand == NULL)
5354 return false;
5355
5356 if (ipcCommand->commandPacket.type == WD_IPC_FAILOVER_COMMAND)
5357 {
5358 if (pkt->type == WD_ACCEPT_MESSAGE)
5359 reply_to_failover_command(ipcCommand, FAILOVER_RES_PROCEED, 0);
5360 else
5361 reply_to_failover_command(ipcCommand, FAILOVER_RES_LEADER_REJECTED, 0);
5362 return true;
5363 }
5364
5365 else if (ipcCommand->commandPacket.type == WD_IPC_ONLINE_RECOVERY_COMMAND)
5366 {
5367 return reply_is_received_for_pgpool_replicate_command(wdNode, pkt, ipcCommand);
5368 }
5369 }
5370
5371 return false;
5372 }
5373
5374
5375 static void
update_interface_status(void)5376 update_interface_status(void)
5377 {
5378 struct ifaddrs *ifAddrStruct = NULL;
5379 struct ifaddrs *ifa = NULL;
5380 ListCell *lc;
5381
5382 if (g_cluster.wdInterfaceToMonitor == NULL)
5383 return;
5384
5385 getifaddrs(&ifAddrStruct);
5386 for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next)
5387 {
5388 ereport(DEBUG1,
5389 (errmsg("network interface %s having flags %d", ifa->ifa_name, ifa->ifa_flags)));
5390
5391 if (!strncasecmp("lo", ifa->ifa_name, 2))
5392 continue; /* We do not need loop back addresses */
5393
5394 foreach(lc, g_cluster.wdInterfaceToMonitor)
5395 {
5396 WDInterfaceStatus *if_status = lfirst(lc);
5397
5398 if (!strcasecmp(if_status->if_name, ifa->ifa_name))
5399 {
5400 if_status->if_up = is_interface_up(ifa);
5401 break;
5402 }
5403 }
5404 }
5405
5406 if (ifAddrStruct != NULL)
5407 freeifaddrs(ifAddrStruct);
5408
5409 }
5410
5411 static bool
any_interface_available(void)5412 any_interface_available(void)
5413 {
5414 ListCell *lc;
5415
5416 update_interface_status();
5417 /* if interface monitoring is disabled we are good */
5418 if (g_cluster.wdInterfaceToMonitor == NULL)
5419 return true;
5420
5421 foreach(lc, g_cluster.wdInterfaceToMonitor)
5422 {
5423 WDInterfaceStatus *if_status = lfirst(lc);
5424
5425 if (if_status->if_up)
5426 {
5427 ereport(DEBUG1,
5428 (errmsg("network interface \"%s\" is up and we can continue", if_status->if_name)));
5429 return true;
5430 }
5431 }
5432 return false;
5433 }
5434
5435 static int
watchdog_state_machine(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5436 watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5437 {
5438 ereport(DEBUG1,
5439 (errmsg("STATE MACHINE INVOKED WITH EVENT = %s Current State = %s",
5440 wd_event_name[event], wd_state_names[get_local_node_state()])));
5441
5442 if (event == WD_EVENT_REMOTE_NODE_LOST)
5443 {
5444
5445 if (wdNode->state == WD_SHUTDOWN)
5446 {
5447 ereport(LOG,
5448 (errmsg("remote node \"%s\" is shutting down", wdNode->nodeName)));
5449 if (pool_config->wd_remove_shutdown_nodes)
5450 revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_SHUTDOWN);
5451 }
5452 else
5453 {
5454 wdNode->state = WD_LOST;
5455 ereport(LOG,
5456 (errmsg("remote node \"%s\" is lost", wdNode->nodeName)));
5457 /* Inform the node, that it is lost for us */
5458 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_LOST);
5459 }
5460 if (wdNode == WD_LEADER_NODE)
5461 {
5462 ereport(LOG,
5463 (errmsg("watchdog cluster has lost the coordinator node")));
5464 set_cluster_leader_node(NULL);
5465 }
5466
5467 /* close all socket connections to the node */
5468 close_socket_connection(&wdNode->client_socket);
5469 close_socket_connection(&wdNode->server_socket);
5470
5471 /* clear the wait timer on the node */
5472 wdNode->last_sent_time.tv_sec = 0;
5473 wdNode->last_sent_time.tv_usec = 0;
5474 wdNode->sending_failures_count = 0;
5475 node_lost_while_ipc_command(wdNode);
5476 }
5477 else if (event == WD_EVENT_REMOTE_NODE_FOUND)
5478 {
5479 ereport(LOG,
5480 (errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
5481 errdetail("requesting the node info")));
5482 /*
5483 * remove the lost state from the node
5484 * and change it to joining for now
5485 */
5486 wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
5487 wdNode->state = WD_LOADING;
5488 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
5489 /* if this node was kicked out of quorum calculation. add it back */
5490 restore_cluster_membership_of_node(wdNode);
5491 }
5492 else if (event == WD_EVENT_PACKET_RCV)
5493 {
5494 print_packet_node_info(pkt, wdNode, false);
5495 /* update the last receive time */
5496 gettimeofday(&wdNode->last_rcv_time, NULL);
5497
5498 if (pkt->type == WD_INFO_MESSAGE)
5499 {
5500 standard_packet_processor(wdNode, pkt);
5501 }
5502
5503 if (pkt->type == WD_INFORM_I_AM_GOING_DOWN)
5504 {
5505 wdNode->state = WD_SHUTDOWN;
5506 wdNode->node_lost_reason = NODE_LOST_SHUTDOWN;
5507 return watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
5508 }
5509
5510 if (watchdog_internal_command_packet_processor(wdNode, pkt) == true)
5511 {
5512 return 0;
5513 }
5514 }
5515 else if (event == WD_EVENT_NEW_OUTBOUND_CONNECTION)
5516 {
5517 WDPacketData *addPkt = get_addnode_message();
5518
5519 send_message(wdNode, addPkt);
5520 free_packet(addPkt);
5521 }
5522
5523 else if (event == WD_EVENT_NW_IP_IS_REMOVED || event == WD_EVENT_NW_LINK_IS_INACTIVE)
5524 {
5525 List *local_addresses;
5526
5527 /* check if we have an active link */
5528 if (any_interface_available() == false)
5529 {
5530 ereport(WARNING,
5531 (errmsg("network event has occurred and all monitored interfaces are down"),
5532 errdetail("changing the state to in network trouble")));
5533
5534 set_state(WD_IN_NW_TROUBLE);
5535
5536 }
5537 /* check if all IP addresses are lost */
5538 local_addresses = get_all_local_ips();
5539 if (local_addresses == NULL)
5540 {
5541 /*
5542 * We have lost all IP addresses we are in network trouble. Just
5543 * move to in network trouble state
5544 */
5545 ereport(WARNING,
5546 (errmsg("network IP is removed and system has no IP is assigned"),
5547 errdetail("changing the state to in network trouble")));
5548
5549 set_state(WD_IN_NW_TROUBLE);
5550 }
5551 else
5552 {
5553 ListCell *lc;
5554
5555 ereport(DEBUG1,
5556 (errmsg("network IP is removed but system still has a valid IP is assigned")));
5557 foreach(lc, local_addresses)
5558 {
5559 char *ip = lfirst(lc);
5560
5561 ereport(DEBUG1,
5562 (errmsg("IP = %s", ip ? ip : "NULL")));
5563 }
5564
5565 list_free_deep(local_addresses);
5566 local_addresses = NULL;
5567 }
5568 }
5569
5570 else if (event == WD_EVENT_LOCAL_NODE_LOST)
5571 {
5572 ereport(WARNING,
5573 (errmsg("watchdog life-check reported, we are disconnected from the network"),
5574 errdetail("changing the state to LOST")));
5575 set_state(WD_LOST);
5576 }
5577
5578 if (wd_commands_packet_processor(event, wdNode, pkt) == true)
5579 return 0;
5580
5581 switch (get_local_node_state())
5582 {
5583 case WD_LOADING:
5584 watchdog_state_machine_loading(event, wdNode, pkt, clusterCommand);
5585 break;
5586 case WD_JOINING:
5587 watchdog_state_machine_joining(event, wdNode, pkt, clusterCommand);
5588 break;
5589 case WD_INITIALIZING:
5590 watchdog_state_machine_initializing(event, wdNode, pkt, clusterCommand);
5591 break;
5592 case WD_COORDINATOR:
5593 watchdog_state_machine_coordinator(event, wdNode, pkt, clusterCommand);
5594 break;
5595 case WD_PARTICIPATE_IN_ELECTION:
5596 watchdog_state_machine_voting(event, wdNode, pkt, clusterCommand);
5597 break;
5598 case WD_STAND_FOR_COORDINATOR:
5599 watchdog_state_machine_standForCord(event, wdNode, pkt, clusterCommand);
5600 break;
5601 case WD_STANDBY:
5602 watchdog_state_machine_standby(event, wdNode, pkt, clusterCommand);
5603 break;
5604 case WD_LOST:
5605 case WD_IN_NW_TROUBLE:
5606 watchdog_state_machine_nw_error(event, wdNode, pkt, clusterCommand);
5607 break;
5608 case WD_NETWORK_ISOLATION:
5609 watchdog_state_machine_nw_isolation(event, wdNode, pkt, clusterCommand);
5610 break;
5611 default:
5612 /* Should never ever happen */
5613 ereport(WARNING,
5614 (errmsg("invalid watchdog state")));
5615 set_state(WD_LOADING);
5616 break;
5617 }
5618
5619 return 0;
5620 }
5621
5622 /*
5623 * This is the state where the watchdog enters when starting up.
5624 * upon entering this state we sends ADD node message to all reachable
5625 * nodes.
5626 * Wait for 4 seconds if some node rejects us.
5627 */
5628 static int
watchdog_state_machine_loading(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5629 watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5630 {
5631 switch (event)
5632 {
5633 case WD_EVENT_WD_STATE_CHANGED:
5634 {
5635 int i;
5636 WDPacketData *addPkt = get_addnode_message();
5637
5638 /* set the status to ADD_MESSAGE_SEND by hand */
5639 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5640 {
5641 WatchdogNode *wdTmpNode;
5642
5643 wdTmpNode = &(g_cluster.remoteNodes[i]);
5644 if (wdTmpNode->client_socket.sock_state == WD_SOCK_CONNECTED && wdTmpNode->state == WD_DEAD)
5645 {
5646 if (send_message(wdTmpNode, addPkt))
5647 wdTmpNode->state = WD_ADD_MESSAGE_SENT;
5648 }
5649 }
5650 free_packet(addPkt);
5651 set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5652 }
5653 break;
5654
5655 case WD_EVENT_TIMEOUT:
5656 set_state(WD_JOINING);
5657 break;
5658
5659 case WD_EVENT_PACKET_RCV:
5660 {
5661 switch (pkt->type)
5662 {
5663 case WD_STAND_FOR_COORDINATOR_MESSAGE:
5664 {
5665 /*
5666 * We are loading but a node is already contesting
5667 * for coordinator node well we can ignore it but
5668 * then this could eventually mean a lower
5669 * priority node can became a coordinator node. So
5670 * check the priority of the node in stand for
5671 * coordinator state
5672 */
5673 if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5674 {
5675 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5676 set_state(WD_STAND_FOR_COORDINATOR);
5677 }
5678 else
5679 {
5680 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5681 set_state(WD_PARTICIPATE_IN_ELECTION);
5682 }
5683 }
5684 break;
5685
5686 case WD_INFO_MESSAGE:
5687 {
5688 int i;
5689 bool all_replied = true;
5690
5691 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5692 {
5693 wdNode = &(g_cluster.remoteNodes[i]);
5694 if (wdNode->state == WD_ADD_MESSAGE_SENT)
5695 {
5696 all_replied = false;
5697 break;
5698 }
5699 }
5700 if (all_replied)
5701 {
5702 /*
5703 * we are already connected to all configured
5704 * nodes Just move to initializing state
5705 */
5706 set_state(WD_INITIALIZING);
5707 }
5708 }
5709 break;
5710
5711 case WD_REJECT_MESSAGE:
5712 if (wdNode->state == WD_ADD_MESSAGE_SENT || wdNode->state == WD_DEAD)
5713 ereport(FATAL,
5714 (return_code(POOL_EXIT_FATAL),
5715 errmsg("Add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5716 errhint("check the watchdog configurations.")));
5717 break;
5718 default:
5719 standard_packet_processor(wdNode, pkt);
5720 break;
5721 }
5722 }
5723 break;
5724 default:
5725 break;
5726 }
5727 return 0;
5728 }
5729
5730 /*
5731 * This is the intermediate state before going to cluster initialization
5732 * here we update the information of all connected nodes and move to the
5733 * initialization state. moving to this state from loading does not make
5734 * much sence as at loading time we already have updated node informations
5735 */
5736 static int
watchdog_state_machine_joining(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5737 watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5738 {
5739 switch (event)
5740 {
5741 case WD_EVENT_WD_STATE_CHANGED:
5742 set_cluster_leader_node(NULL);
5743 try_connecting_with_all_unreachable_nodes();
5744 send_cluster_command(NULL, WD_REQ_INFO_MESSAGE, 4);
5745 set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5746 break;
5747
5748 case WD_EVENT_TIMEOUT:
5749 set_state(WD_INITIALIZING);
5750 break;
5751
5752 case WD_EVENT_COMMAND_FINISHED:
5753 {
5754 if (clusterCommand->commandPacket.type == WD_REQ_INFO_MESSAGE)
5755 set_state(WD_INITIALIZING);
5756 }
5757 break;
5758
5759 case WD_EVENT_PACKET_RCV:
5760 {
5761 switch (pkt->type)
5762 {
5763 case WD_REJECT_MESSAGE:
5764 if (wdNode->state == WD_ADD_MESSAGE_SENT)
5765 ereport(FATAL,
5766 (return_code(POOL_EXIT_FATAL),
5767 errmsg("add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5768 errhint("check the watchdog configurations.")));
5769 break;
5770
5771 case WD_STAND_FOR_COORDINATOR_MESSAGE:
5772 {
5773 /*
5774 * We are loading but a node is already contesting
5775 * for coordinator node well we can ignore it but
5776 * then this could eventually mean a lower
5777 * priority node can became a coordinator node. So
5778 * check the priority of the node in stand for
5779 * coordinator state
5780 */
5781 if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5782 {
5783 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5784 set_state(WD_STAND_FOR_COORDINATOR);
5785 }
5786 else
5787 {
5788 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5789 set_state(WD_PARTICIPATE_IN_ELECTION);
5790 }
5791 }
5792 break;
5793
5794 default:
5795 standard_packet_processor(wdNode, pkt);
5796 break;
5797 }
5798 }
5799 break;
5800
5801 default:
5802 break;
5803 }
5804
5805 return 0;
5806 }
5807
5808 /*
5809 * This state only works on the local data and does not
5810 * sends any cluster command.
5811 */
5812
5813 static int
watchdog_state_machine_initializing(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5814 watchdog_state_machine_initializing(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5815 {
5816 switch (event)
5817 {
5818 case WD_EVENT_WD_STATE_CHANGED:
5819 /* set 1 sec timeout, save ourself from recursion */
5820 set_timeout(1);
5821 break;
5822
5823 case WD_EVENT_TIMEOUT:
5824 {
5825 /*
5826 * If leader node exists in cluster, Join it otherwise try
5827 * becoming a leader
5828 */
5829 if (WD_LEADER_NODE)
5830 {
5831 /*
5832 * we found the coordinator node in network. Just join the
5833 * network
5834 */
5835 set_state(WD_STANDBY);
5836 }
5837 else if (get_cluster_node_count() == 0)
5838 {
5839 ereport(LOG,
5840 (errmsg("I am the only alive node in the watchdog cluster"),
5841 errhint("skipping stand for coordinator state")));
5842
5843 /*
5844 * I am the alone node in the cluster at the moment skip
5845 * the intermediate steps and jump to the coordinator
5846 * state
5847 */
5848 set_state(WD_COORDINATOR);
5849 }
5850 else
5851 {
5852 int i;
5853
5854 for (i = 0; i < g_cluster.remoteNodeCount; i++)
5855 {
5856 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5857
5858 if (wdNode->state == WD_STAND_FOR_COORDINATOR)
5859 {
5860 set_state(WD_PARTICIPATE_IN_ELECTION);
5861 return 0;
5862 }
5863 }
5864 /* stand for coordinator */
5865 set_state(WD_STAND_FOR_COORDINATOR);
5866 }
5867 }
5868 break;
5869
5870 case WD_EVENT_PACKET_RCV:
5871 {
5872 switch (pkt->type)
5873 {
5874 case WD_REJECT_MESSAGE:
5875 if (wdNode->state == WD_ADD_MESSAGE_SENT)
5876 ereport(FATAL,
5877 (return_code(POOL_EXIT_FATAL),
5878 errmsg("Add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5879 errhint("check the watchdog configurations.")));
5880 break;
5881 default:
5882 standard_packet_processor(wdNode, pkt);
5883 break;
5884 }
5885 }
5886
5887 break;
5888
5889 default:
5890 break;
5891 }
5892 return 0;
5893 }
5894
5895 static int
watchdog_state_machine_standForCord(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5896 watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5897 {
5898 switch (event)
5899 {
5900 case WD_EVENT_WD_STATE_CHANGED:
5901 send_cluster_command(NULL, WD_STAND_FOR_COORDINATOR_MESSAGE, 4);
5902 /* wait for 5 seconds if someone rejects us */
5903 set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5904 break;
5905
5906 case WD_EVENT_COMMAND_FINISHED:
5907 {
5908 if (clusterCommand->commandPacket.type == WD_STAND_FOR_COORDINATOR_MESSAGE)
5909 {
5910 if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
5911 clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
5912 {
5913 set_state(WD_COORDINATOR);
5914 }
5915 else
5916 {
5917 /* command finished with an error */
5918 if (pkt)
5919 {
5920 if (pkt->type == WD_ERROR_MESSAGE)
5921 {
5922 ereport(LOG,
5923 (errmsg("our stand for coordinator request is rejected by node \"%s\"",wdNode->nodeName),
5924 errdetail("we might be in partial network isolation and cluster already have a valid leader"),
5925 errhint("please verify the watchdog life-check and network is working properly")));
5926 set_state(WD_NETWORK_ISOLATION);
5927 }
5928 else if (pkt->type == WD_REJECT_MESSAGE)
5929 {
5930 ereport(LOG,
5931 (errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
5932 set_state(WD_PARTICIPATE_IN_ELECTION);
5933 }
5934 }
5935 else
5936 {
5937 ereport(LOG,
5938 (errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
5939 set_state(WD_JOINING);
5940 }
5941 }
5942 }
5943 }
5944 break;
5945
5946 case WD_EVENT_TIMEOUT:
5947 set_state(WD_COORDINATOR);
5948 break;
5949
5950 case WD_EVENT_PACKET_RCV:
5951 {
5952 switch (pkt->type)
5953 {
5954 case WD_STAND_FOR_COORDINATOR_MESSAGE:
5955 /* decide on base of priority */
5956 if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5957 {
5958 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5959 }
5960 else if (g_cluster.localNode->wd_priority == wdNode->wd_priority)
5961 {
5962 /* decide on base of starting time */
5963 if (g_cluster.localNode->startup_time.tv_sec <= wdNode->startup_time.tv_sec) /* I am older */
5964 {
5965 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5966 }
5967 else
5968 {
5969 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5970 set_state(WD_PARTICIPATE_IN_ELECTION);
5971 }
5972 }
5973 else
5974 {
5975 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5976 set_state(WD_PARTICIPATE_IN_ELECTION);
5977 }
5978 break;
5979
5980 case WD_DECLARE_COORDINATOR_MESSAGE:
5981 {
5982 /*
5983 * meanwhile someone has declared itself
5984 * coordinator
5985 */
5986 if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5987 {
5988 ereport(LOG,
5989 (errmsg("rejecting the declare coordinator request from node \"%s\"", wdNode->nodeName),
5990 errdetail("my wd_priority [%d] is higher than the requesting node's priority [%d]", g_cluster.localNode->wd_priority, wdNode->wd_priority)));
5991 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5992 }
5993 else
5994 {
5995 ereport(LOG,
5996 (errmsg("node \"%s\" has declared itself as a coordinator", wdNode->nodeName)));
5997 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5998 set_state(WD_JOINING);
5999 }
6000 }
6001 break;
6002 default:
6003 standard_packet_processor(wdNode, pkt);
6004 break;
6005 }
6006 }
6007 break;
6008
6009 default:
6010 break;
6011 }
6012 return 0;
6013 }
6014
6015 /*
6016 * Event handler for the coordinator/leader state.
6017 * The function handels all the event received when the local
6018 * node is the leader/coordinator node.
6019 */
6020 static int
watchdog_state_machine_coordinator(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6021 watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6022 {
6023 switch (event)
6024 {
6025 case WD_EVENT_WD_STATE_CHANGED:
6026 {
6027 int i;
6028
6029 send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
6030 set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
6031 update_missed_beacon_count(NULL,true);
6032 ereport(LOG,
6033 (errmsg("I am announcing my self as leader/coordinator watchdog node")));
6034
6035 for (i = 0; i < g_cluster.remoteNodeCount; i++)
6036 {
6037 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
6038
6039 ereport(DEBUG2,
6040 (errmsg("printing all remote node information")));
6041 print_watchdog_node_info(wdNode);
6042 }
6043 /* Also reset my priority as per the original configuration */
6044 g_cluster.localNode->wd_priority = pool_config->wd_priority;
6045 }
6046 break;
6047
6048 case WD_EVENT_COMMAND_FINISHED:
6049 {
6050 if (clusterCommand->commandPacket.type == WD_DECLARE_COORDINATOR_MESSAGE)
6051 {
6052 if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
6053 clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
6054 {
6055 update_cluster_memberships();
6056 update_quorum_status();
6057 reset_lost_timers();
6058 ereport(DEBUG1,
6059 (errmsg("declare coordinator command finished with status:[%s]",
6060 clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ?
6061 "ALL NODES REPLIED" :
6062 "COMMAND TIMED OUT"),
6063 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6064 clusterCommand->commandSendToCount,
6065 clusterCommand->commandReplyFromCount
6066 )));
6067
6068 ereport(LOG,
6069 (errmsg("I am the cluster leader node"),
6070 errdetail("our declare coordinator message is accepted by all nodes")));
6071
6072 set_cluster_leader_node(g_cluster.localNode);
6073 register_watchdog_state_change_interrupt();
6074
6075 /*
6076 * Check if the quorum is present then start the
6077 * escalation process otherwise keep in the
6078 * coordinator state and wait for the quorum
6079 */
6080 if (g_cluster.quorum_status == -1)
6081 {
6082 ereport(LOG,
6083 (errmsg("I am the cluster leader node but we do not have enough nodes in cluster"),
6084 errdetail("waiting for the quorum to start escalation process")));
6085 }
6086 else
6087 {
6088 ereport(LOG,
6089 (errmsg("I am the cluster leader node. Starting escalation process")));
6090 start_escalated_node();
6091 }
6092 }
6093 else
6094 {
6095 /* command is finished but because of error */
6096 ereport(NOTICE,
6097 (errmsg("possible split brain scenario detected by \"%s\" node", wdNode->nodeName),
6098 (errdetail("re-initializing cluster"))));
6099 set_state(WD_JOINING);
6100 }
6101 }
6102
6103 else if (clusterCommand->commandPacket.type == WD_IAM_COORDINATOR_MESSAGE)
6104 {
6105 update_missed_beacon_count(clusterCommand,false);
6106
6107 if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
6108 {
6109 ereport(DEBUG1,
6110 (errmsg("I am the cluster leader node command finished with status:[ALL NODES REPLIED]"),
6111 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6112 clusterCommand->commandSendToCount,
6113 clusterCommand->commandReplyFromCount
6114 )));
6115 }
6116 else if (clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
6117 {
6118 ereport(DEBUG1,
6119 (errmsg("I am the cluster leader node command finished with status:[COMMAND TIMED OUT] which is success"),
6120 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6121 clusterCommand->commandSendToCount,
6122 clusterCommand->commandReplyFromCount
6123 )));
6124 }
6125 else if (clusterCommand->commandStatus == COMMAND_FINISHED_NODE_REJECTED)
6126 {
6127 /*
6128 * one of the node rejected out I am coordinator
6129 * message
6130 */
6131 ereport(LOG,
6132 (errmsg("possible split brain, \"%s\" node has rejected our coordinator beacon", wdNode->nodeName),
6133 (errdetail("removing the node from out standby list"))));
6134
6135 standby_node_left_cluster(wdNode);
6136 }
6137 }
6138 }
6139 break;
6140
6141 case WD_EVENT_CLUSTER_QUORUM_CHANGED:
6142 {
6143 /* make sure we are accepted as leader */
6144 if (WD_LEADER_NODE == g_cluster.localNode)
6145 {
6146 if (g_cluster.quorum_status == -1)
6147 {
6148 ereport(LOG,
6149 (errmsg("We have lost the quorum")));
6150
6151 /*
6152 * We have lost the quorum, stay as a leader node but
6153 * perform de-escalation. As keeping the VIP may
6154 * result in split-brain
6155 */
6156 resign_from_escalated_node();
6157 }
6158 else if (g_cluster.quorum_status >= 0)
6159 {
6160 if (g_cluster.localNode->escalated == false)
6161 {
6162 ereport(LOG,
6163 (errmsg("quorum found"),
6164 errdetail("starting escalation process")));
6165 start_escalated_node();
6166 }
6167 }
6168 /* inform to the cluster about the new quorum status */
6169 send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);
6170 register_watchdog_quorum_change_interrupt();
6171 }
6172 }
6173 break;
6174
6175 case WD_EVENT_NW_IP_IS_REMOVED:
6176 {
6177 /* check if we were holding the virtual IP and it is now lost */
6178 List *local_addresses = get_all_local_ips();
6179
6180 if (local_addresses == NULL)
6181 {
6182 /*
6183 * We have lost all IP addresses we are in network
6184 * trouble. Just move to in network trouble state
6185 */
6186 set_state(WD_IN_NW_TROUBLE);
6187 }
6188 else
6189 {
6190 /*
6191 * We do have some IP addresses assigned so its not a
6192 * total black-out check if we still have the VIP assigned
6193 */
6194 if (g_cluster.clusterLeaderInfo.holding_vip == true)
6195 {
6196 ListCell *lc;
6197 bool vip_exists = false;
6198
6199 foreach(lc, local_addresses)
6200 {
6201 char *ip = lfirst(lc);
6202
6203 if (!strcmp(ip, g_cluster.localNode->delegate_ip))
6204 {
6205 vip_exists = true;
6206 break;
6207 }
6208 }
6209 if (vip_exists == false)
6210 {
6211 /*
6212 * Okay this is the case when only our VIP is lost
6213 * but network interface seems to be working fine
6214 * try to re-acquire the VIP
6215 */
6216 wd_IP_up();
6217 }
6218 }
6219 list_free_deep(local_addresses);
6220 local_addresses = NULL;
6221 }
6222 }
6223 break;
6224
6225 case WD_EVENT_NW_IP_IS_ASSIGNED:
6226 break;
6227
6228 case WD_EVENT_TIMEOUT:
6229 {
6230 if (check_debug_request_do_not_send_beacon() == false)
6231 send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
6232 set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6233 }
6234 break;
6235
6236 case WD_EVENT_I_AM_APPEARING_LOST:
6237 {
6238 /* The remote node has lost us, It would have already marked
6239 * us as lost, So remove it from standby*/
6240 standby_node_left_cluster(wdNode);
6241 }
6242 break;
6243
6244 case WD_EVENT_I_AM_APPEARING_FOUND:
6245 {
6246 /* The remote node has found us again */
6247 if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
6248 {
6249 /*
6250 * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
6251 * which makes the standby nodes to re-send the join leader node
6252 */
6253 ereport(DEBUG1,
6254 (errmsg("asking remote node \"%s\" to rejoin leader", wdNode->nodeName),
6255 errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6256
6257 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
6258 }
6259 else
6260 {
6261 /*
6262 * The node is on older version
6263 * So ask it to re-join the cluster
6264 */
6265 ereport(DEBUG1,
6266 (errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
6267 errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6268 send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6269 }
6270 }
6271 break;
6272
6273 case WD_EVENT_REMOTE_NODE_LOST:
6274 {
6275 standby_node_left_cluster(wdNode);
6276 }
6277 break;
6278
6279 case WD_EVENT_REMOTE_NODE_FOUND:
6280 {
6281 ereport(LOG,
6282 (errmsg("remote node \"%s\" is reachable again", wdNode->nodeName),
6283 errdetail("trying to add it back as a standby")));
6284 wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
6285 /* If I am the cluster leader. Ask for the node info and to re-send the join message */
6286 send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
6287 if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
6288 {
6289 /*
6290 * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
6291 * which makes the standby nodes to re-send the join leader node
6292 */
6293 ereport(DEBUG1,
6294 (errmsg("asking remote node \"%s\" to rejoin leader", wdNode->nodeName),
6295 errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6296
6297 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
6298 }
6299 else
6300 {
6301 /*
6302 * The node is on older version
6303 * So ask it to re-join the cluster
6304 */
6305 ereport(DEBUG1,
6306 (errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
6307 errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6308 send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6309 }
6310 break;
6311 }
6312
6313 case WD_EVENT_PACKET_RCV:
6314 {
6315 switch (pkt->type)
6316 {
6317 case WD_ADD_NODE_MESSAGE:
6318 /* In case we received the ADD node message from
6319 * one of our standby, Remove that standby from
6320 * the list
6321 */
6322 standby_node_left_cluster(wdNode);
6323 standard_packet_processor(wdNode, pkt);
6324 break;
6325
6326 case WD_STAND_FOR_COORDINATOR_MESSAGE:
6327 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6328 break;
6329 case WD_DECLARE_COORDINATOR_MESSAGE:
6330 ereport(NOTICE,
6331 (errmsg("We are coordinator and another node tried a coup")));
6332 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
6333 break;
6334
6335 case WD_IAM_COORDINATOR_MESSAGE:
6336 {
6337 ereport(NOTICE,
6338 (errmsg("We are in split brain, I AM COORDINATOR MESSAGE received from \"%s\" node", wdNode->nodeName)));
6339
6340 if (beacon_message_received_from_node(wdNode, pkt) == true)
6341 {
6342 handle_split_brain(wdNode, pkt);
6343 }
6344 else
6345 {
6346 /*
6347 * we are not able to decide which should be
6348 * the best candidate to stay as
6349 * leader/coordinator node This could also
6350 * happen if the remote node is using the
6351 * older version of Pgpool-II which send the
6352 * empty beacon messages.
6353 */
6354 ereport(LOG,
6355 (errmsg("We are in split brain, and not able to decide the best candidate for leader/coordinator"),
6356 errdetail("re-initializing the local watchdog cluster state")));
6357
6358 send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6359 set_state(WD_JOINING);
6360 }
6361 }
6362 break;
6363
6364 case WD_JOIN_COORDINATOR_MESSAGE:
6365 {
6366 /*
6367 * If the node is marked as lost because of
6368 * life-check, Do not let it join the cluster
6369 */
6370 if (wdNode->state == WD_LOST && wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
6371 {
6372 ereport(LOG,
6373 (errmsg("lost remote node \"%s\" is requesting to join the cluster",wdNode->nodeName),
6374 errdetail("rejecting the request until life-check inform us that it is reachable again")));
6375 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6376 }
6377 else
6378 {
6379 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6380 /* Also get the configurations from the standby node */
6381 send_message_of_type(wdNode,WD_ASK_FOR_POOL_CONFIG,NULL);
6382 standby_node_join_cluster(wdNode);
6383 }
6384 }
6385 break;
6386
6387 default:
6388 standard_packet_processor(wdNode, pkt);
6389 break;
6390 }
6391 }
6392 break;
6393
6394 default:
6395 break;
6396 }
6397 return 0;
6398 }
6399
6400 /*
6401 * We can get into this state if we detect the total
6402 * network blackout, Here we just keep waiting for the
6403 * network to come back, and when it does we re-initialize
6404 * the cluster state.
6405 *
6406 * Note:
6407 *
6408 * All this is very good to detect the network black out or cable unplugged
6409 * scenarios, and moving to the WD_IN_NW_TROUBLE state. Although this state machine
6410 * function can gracefully handle the network black out situation and recovers the
6411 * watchdog node when the network becomes reachable, but there is a problem.
6412 *
6413 * Once the cable on the system is unplugged or when the node gets isolated from the
6414 * cluster there is every likelihood that the backend health-check of the isolated node
6415 * start reporting the backend node failure and the pgpool-II proceeds to perform
6416 * the failover for all attached backend nodes. Since the pgpool-II is yet not
6417 * smart enough to figure out it is because of the network failure of its own
6418 * system and the backend nodes are not actually at fault but, are working properly.
6419 *
6420 * So now when the network gets back the backend status of the node will be different
6421 * and incorrect from the other pgpool-II nodes in the cluster. So the ideal solution
6422 * for the situation is to make the pgpool-II main process aware of the network black out
6423 * and when the network recovers the pgpool-II asks the watchdog to sync again the state of
6424 * all configured backend nodes from the leader pgpool-II node. But to implement this lot
6425 * of time is required, So until that time we are just opting for the easiest solution here
6426 * which is to commit a suicide as soon an the network becomes unreachable
6427 */
6428 static int
watchdog_state_machine_nw_error(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6429 watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6430 {
6431 switch (event)
6432 {
6433 case WD_EVENT_WD_STATE_CHANGED:
6434 /* commit suicide, see above note */
6435 ereport(FATAL,
6436 (return_code(POOL_EXIT_FATAL),
6437 errmsg("system has lost the network")));
6438
6439 set_timeout(2);
6440 break;
6441
6442 case WD_EVENT_PACKET_RCV:
6443
6444 /*
6445 * Okay this is funny because according to us we are in network
6446 * black out but yet we are able to receive the packet. Just check
6447 * may be network is back and we are unable to detect it
6448 */
6449 /* fall through */
6450 case WD_EVENT_TIMEOUT:
6451 case WD_EVENT_NW_IP_IS_ASSIGNED:
6452 {
6453 List *local_addresses = get_all_local_ips();
6454
6455 if (local_addresses == NULL)
6456 {
6457 /*
6458 * How come this is possible ?? but if somehow this
6459 * happens keep in the state and ignore the packet
6460 */
6461 }
6462 else
6463 {
6464 /*
6465 * Seems like the network is back just go on initialize
6466 * the cluster
6467 */
6468 /*
6469 * we might have broken sockets when the network gets
6470 * back. Send the request info message to all nodes to
6471 * confirm socket state
6472 */
6473 WDPacketData *pkt = get_minimum_message(WD_IAM_IN_NW_TROUBLE_MESSAGE, NULL);
6474
6475 send_message(NULL, pkt);
6476 try_connecting_with_all_unreachable_nodes();
6477 pfree(pkt);
6478 list_free_deep(local_addresses);
6479 local_addresses = NULL;
6480 set_state(WD_LOADING);
6481 }
6482 }
6483 break;
6484
6485 default:
6486 break;
6487 }
6488 return 0;
6489 }
6490
6491 /*
6492 * we could end up in tis state if we were connected to the
6493 * leader node as standby and got lost on the leader.
6494 * Here we just wait for BEACON_MESSAGE_INTERVAL_SECONDS
6495 * and retry to join the cluster.
6496 */
6497 static int
watchdog_state_machine_nw_isolation(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6498 watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6499 {
6500 switch (event)
6501 {
6502 case WD_EVENT_WD_STATE_CHANGED:
6503 set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6504 break;
6505
6506 case WD_EVENT_PACKET_RCV:
6507 standard_packet_processor(wdNode, pkt);
6508 break;
6509
6510 case WD_EVENT_REMOTE_NODE_FOUND:
6511 case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
6512 case WD_EVENT_I_AM_APPEARING_FOUND:
6513 case WD_EVENT_TIMEOUT:
6514 /* fall through */
6515 case WD_EVENT_NW_IP_IS_ASSIGNED:
6516 ereport(LOG,
6517 (errmsg("trying again to join the cluster")));
6518 set_state(WD_JOINING);
6519 break;
6520
6521 default:
6522 break;
6523 }
6524 return 0;
6525 }
6526
6527 static bool
beacon_message_received_from_node(WatchdogNode * wdNode,WDPacketData * pkt)6528 beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt)
6529 {
6530 long seconds_since_node_startup;
6531 long seconds_since_current_state;
6532 int quorum_status;
6533 int standby_nodes_count;
6534 bool escalated;
6535 int state;
6536 struct timeval current_time;
6537
6538 gettimeofday(¤t_time, NULL);
6539
6540 if (pkt->data == NULL || pkt->len <= 0)
6541 return false;
6542
6543 if (parse_beacon_message_json(pkt->data, pkt->len,
6544 &state,
6545 &seconds_since_node_startup,
6546 &seconds_since_current_state,
6547 &quorum_status,
6548 &standby_nodes_count,
6549 &escalated) == false)
6550 {
6551 return false;
6552 }
6553
6554 wdNode->current_state_time.tv_sec = current_time.tv_sec - seconds_since_current_state;
6555 wdNode->startup_time.tv_sec = current_time.tv_sec - seconds_since_node_startup;
6556 wdNode->current_state_time.tv_usec = wdNode->startup_time.tv_usec = 0;
6557 wdNode->quorum_status = quorum_status;
6558 wdNode->standby_nodes_count = standby_nodes_count;
6559 wdNode->state = state;
6560 wdNode->escalated = escalated;
6561 return true;
6562 }
6563
6564 /*
6565 * This function decides the best contender for a coordinator/leader node
6566 * when the remote node info states it is a coordinator while
6567 * the local node is also in the leader/coordinator state.
6568 *
6569 * return:
6570 * -1 : remote node is the best candidate to remain as leader
6571 * 0 : both local and remote nodes are not worthy leader or error
6572 * 1 : local node should remain as the leader/coordinator
6573 */
6574 static int
I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode)6575 I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode)
6576 {
6577 if (get_local_node_state() != WD_COORDINATOR)
6578 return 0;
6579 if (otherLeaderNode->state != WD_COORDINATOR)
6580 return 0;
6581
6582 if (otherLeaderNode->current_state_time.tv_sec == 0)
6583 {
6584 ereport(LOG,
6585 (errmsg("not enough data to decide the leader node"),
6586 errdetail("the watchdog node:\"%s\" is using the older version of Pgpool-II", otherLeaderNode->nodeName)));
6587 return 0;
6588 }
6589
6590 /* Decide which node should stay as leader */
6591 if (otherLeaderNode->escalated != g_cluster.localNode->escalated)
6592 {
6593 if (otherLeaderNode->escalated == true && g_cluster.localNode->escalated == false)
6594 {
6595 /* remote node stays as the leader */
6596 ereport(LOG,
6597 (errmsg("remote node:\"%s\" is best suitable to stay as leader because it is escalated and I am not",
6598 otherLeaderNode->nodeName)));
6599 return -1;
6600 }
6601 else
6602 {
6603 /* local node stays as leader */
6604 ereport(LOG,
6605 (errmsg("remote node:\"%s\" should step down from leader because it is not escalated",
6606 otherLeaderNode->nodeName)));
6607 return 1;
6608 }
6609 }
6610 else if (otherLeaderNode->quorum_status != g_cluster.quorum_status)
6611 {
6612 if (otherLeaderNode->quorum_status > g_cluster.quorum_status)
6613 {
6614 /* quorum of remote node is in better state */
6615 ereport(LOG,
6616 (errmsg("remote node:\"%s\" is best suitable to stay as leader because it holds the quorum"
6617 ,otherLeaderNode->nodeName)));
6618
6619 return -1;
6620 }
6621 else
6622 {
6623 /* local node stays as leader */
6624 ereport(LOG,
6625 (errmsg("remote node:\"%s\" should step down from leader because it does not hold the quorum"
6626 ,otherLeaderNode->nodeName)));
6627 return 1;
6628 }
6629 }
6630 else if (otherLeaderNode->standby_nodes_count != g_cluster.clusterLeaderInfo.standby_nodes_count)
6631 {
6632 if (otherLeaderNode->standby_nodes_count > g_cluster.clusterLeaderInfo.standby_nodes_count)
6633 {
6634 /* remote node has more alive nodes */
6635 ereport(LOG,
6636 (errmsg("remote node:\"%s\" is best suitable to stay as leader because it has more connected standby nodes"
6637 ,otherLeaderNode->nodeName)));
6638 return -1;
6639 }
6640 else
6641 {
6642 /* local node stays as leader */
6643 ereport(LOG,
6644 (errmsg("remote node:\"%s\" should step down from leader because we have more connected standby nodes"
6645 ,otherLeaderNode->nodeName)));
6646 return 1;
6647 }
6648 }
6649 else /* decide on which node is the older master */
6650 {
6651 if (otherLeaderNode->current_state_time.tv_sec < g_cluster.localNode->current_state_time.tv_sec)
6652 {
6653 /* remote node has more alive nodes */
6654 ereport(LOG,
6655 (errmsg("remote node:\"%s\" is best suitable to stay as leader because it is the older leader"
6656 ,otherLeaderNode->nodeName)));
6657
6658 return -1;
6659 }
6660 else
6661 {
6662 /* local node should keep the leader status */
6663 ereport(LOG,
6664 (errmsg("remote node:\"%s\" should step down from leader because we are the older leader"
6665 ,otherLeaderNode->nodeName)));
6666
6667 return 1;
6668 }
6669 }
6670 return 0; /* keep the compiler quite */
6671 }
6672
6673 static void
handle_split_brain(WatchdogNode * otherLeaderNode,WDPacketData * pkt)6674 handle_split_brain(WatchdogNode * otherLeaderNode, WDPacketData * pkt)
6675 {
6676 int decide_leader = I_am_leader_and_cluster_in_split_brain(otherLeaderNode);
6677
6678 if (decide_leader == 0)
6679 {
6680 /*
6681 * we are not able to decide which should be the best candidate to
6682 * stay as leader/coordinator node This could also happen if the
6683 * remote node is using the older version of Pgpool-II which send the
6684 * empty beacon messages.
6685 */
6686 ereport(LOG,
6687 (errmsg("We are in split brain, and not able to decide the best candidate for leader/coordinator"),
6688 errdetail("re-initializing the local watchdog cluster state")));
6689 send_cluster_service_message(otherLeaderNode, pkt, CLUSTER_NEEDS_ELECTION);
6690 set_state(WD_JOINING);
6691 }
6692 else if (decide_leader == -1)
6693 {
6694 /* Remote node is the best candidate for the leader node */
6695 ereport(LOG,
6696 (errmsg("We are in split brain, and \"%s\" node is the best candidate for leader/coordinator"
6697 ,otherLeaderNode->nodeName),
6698 errdetail("re-initializing the local watchdog cluster state")));
6699 /* broadcast the message about I am not the true leader node */
6700 send_cluster_service_message(NULL, pkt, CLUSTER_IAM_NOT_TRUE_LEADER);
6701 set_state(WD_JOINING);
6702 }
6703 else
6704 {
6705 /* I am the best candidate for the leader node */
6706 ereport(LOG,
6707 (errmsg("We are in split brain, and I am the best candidate for leader/coordinator"),
6708 errdetail("asking the remote node \"%s\" to step down", otherLeaderNode->nodeName)));
6709 send_cluster_service_message(otherLeaderNode, pkt, CLUSTER_IAM_TRUE_LEADER);
6710 }
6711
6712 }
6713
6714 static void
start_escalated_node(void)6715 start_escalated_node(void)
6716 {
6717 int wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
6718
6719 if (g_cluster.localNode->escalated == true) /* already escalated */
6720 return;
6721
6722 while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0)
6723 {
6724 /*
6725 * de_escalation process was already running and we are escalating
6726 * again. give some time to de-escalation process to exit normally
6727 */
6728 ereport(LOG,
6729 (errmsg("waiting for de-escalation process to exit before starting escalation")));
6730 if (sigchld_request)
6731 wd_child_signal_handler();
6732 sleep(1);
6733 }
6734 if (g_cluster.de_escalation_pid > 0)
6735 ereport(LOG,
6736 (errmsg("de-escalation process does not exited in time."),
6737 errdetail("starting the escalation anyway")));
6738
6739 g_cluster.escalation_pid = fork_escalation_process();
6740 if (g_cluster.escalation_pid > 0)
6741 {
6742 g_cluster.localNode->escalated = true;
6743 set_watchdog_node_escalated();
6744 ereport(LOG,
6745 (errmsg("escalation process started with PID:%d", g_cluster.escalation_pid)));
6746 if (strlen(g_cluster.localNode->delegate_ip) > 0)
6747 g_cluster.clusterLeaderInfo.holding_vip = true;
6748 }
6749 else
6750 {
6751 ereport(LOG,
6752 (errmsg("failed to start escalation process")));
6753 }
6754 }
6755
6756 static void
resign_from_escalated_node(void)6757 resign_from_escalated_node(void)
6758 {
6759 int wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
6760
6761 if (g_cluster.localNode->escalated == false)
6762 return;
6763
6764 while (g_cluster.escalation_pid > 0 && wait_secs-- > 0)
6765 {
6766 /*
6767 * escalation process was already running and we are resigning from
6768 * it. wait for the escalation process to exit normally
6769 */
6770 ereport(LOG,
6771 (errmsg("waiting for escalation process to exit before starting de-escalation")));
6772 if (sigchld_request)
6773 wd_child_signal_handler();
6774 sleep(1);
6775 }
6776 if (g_cluster.escalation_pid > 0)
6777 ereport(LOG,
6778 (errmsg("escalation process does not exited in time"),
6779 errdetail("starting the de-escalation anyway")));
6780 g_cluster.de_escalation_pid = fork_plunging_process();
6781 g_cluster.clusterLeaderInfo.holding_vip = false;
6782 g_cluster.localNode->escalated = false;
6783 reset_watchdog_node_escalated();
6784 }
6785
6786 /*
6787 * state machine function for state participate in elections
6788 */
6789 static int
watchdog_state_machine_voting(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6790 watchdog_state_machine_voting(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6791 {
6792 switch (event)
6793 {
6794 case WD_EVENT_WD_STATE_CHANGED:
6795 set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
6796 break;
6797
6798 case WD_EVENT_TIMEOUT:
6799 set_state(WD_JOINING);
6800 break;
6801
6802 case WD_EVENT_PACKET_RCV:
6803 {
6804 if (pkt == NULL)
6805 {
6806 ereport(LOG,
6807 (errmsg("packet is NULL")));
6808 break;
6809 }
6810 switch (pkt->type)
6811 {
6812 case WD_STAND_FOR_COORDINATOR_MESSAGE:
6813 {
6814 /* Check the node priority */
6815 if (wdNode->wd_priority >= g_cluster.localNode->wd_priority)
6816 {
6817 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6818 }
6819 else
6820 {
6821 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6822 set_state(WD_STAND_FOR_COORDINATOR);
6823 }
6824 }
6825 break;
6826 case WD_IAM_COORDINATOR_MESSAGE:
6827 set_state(WD_JOINING);
6828 break;
6829 case WD_DECLARE_COORDINATOR_MESSAGE:
6830 /* Check the node priority */
6831 if (wdNode->wd_priority >= g_cluster.localNode->wd_priority)
6832 {
6833 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6834 set_state(WD_INITIALIZING);
6835 }
6836 else
6837 {
6838 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6839 set_state(WD_STAND_FOR_COORDINATOR);
6840 }
6841 break;
6842 default:
6843 standard_packet_processor(wdNode, pkt);
6844 break;
6845 }
6846 }
6847 break;
6848
6849 default:
6850 break;
6851 }
6852 return 0;
6853 }
6854
6855 static int
watchdog_state_machine_standby(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6856 watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6857 {
6858 switch (event)
6859 {
6860 case WD_EVENT_WD_STATE_CHANGED:
6861 send_cluster_command(WD_LEADER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
6862 /* Also reset my priority as per the original configuration */
6863 g_cluster.localNode->wd_priority = pool_config->wd_priority;
6864 set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6865 break;
6866
6867 case WD_EVENT_TIMEOUT:
6868 set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6869 break;
6870
6871 case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
6872
6873 ereport(LOG,
6874 (errmsg("re-sending join coordinator message to leader node: \"%s\"", WD_LEADER_NODE->nodeName)));
6875
6876 send_cluster_command(WD_LEADER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
6877 break;
6878
6879 case WD_EVENT_COMMAND_FINISHED:
6880 {
6881 if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
6882 {
6883 if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
6884 clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
6885 {
6886 register_watchdog_state_change_interrupt();
6887
6888 ereport(LOG,
6889 (errmsg("successfully joined the watchdog cluster as standby node"),
6890 errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_LEADER_NODE->nodeName)));
6891 /* broadcast our new state change to the cluster */
6892 send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);
6893
6894 }
6895 else
6896 {
6897 ereport(NOTICE,
6898 (errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
6899 errhint("rejoining the cluster.")));
6900
6901 if (WD_LEADER_NODE->has_lost_us)
6902 {
6903 ereport(LOG,
6904 (errmsg("leader node \"%s\" thinks we are lost, and \"%s\" is not letting us join",WD_LEADER_NODE->nodeName,wdNode->nodeName),
6905 errhint("please verify the watchdog life-check and network is working properly")));
6906 set_state(WD_NETWORK_ISOLATION);
6907 }
6908 else
6909 {
6910 set_state(WD_JOINING);
6911 }
6912 }
6913 }
6914 }
6915 break;
6916
6917 case WD_EVENT_I_AM_APPEARING_LOST:
6918 {
6919 /* The remote node has lost us, and if it
6920 * was our coordinator we might already be
6921 * removed from it's standby list
6922 * So re-Join the cluster
6923 */
6924 if (WD_LEADER_NODE == wdNode)
6925 {
6926 ereport(LOG,
6927 (errmsg("we are lost on the leader node \"%s\"",wdNode->nodeName)));
6928 set_state(WD_JOINING);
6929 }
6930 }
6931 break;
6932
6933 case WD_EVENT_I_AM_APPEARING_FOUND:
6934 {
6935 ereport(DEBUG1,
6936 (errmsg("updating remote node \"%s\" with node info message", wdNode->nodeName)));
6937
6938 send_message_of_type(wdNode, WD_INFO_MESSAGE, NULL);
6939 }
6940 break;
6941
6942 case WD_EVENT_REMOTE_NODE_LOST:
6943 {
6944 /*
6945 * we have lost one remote connected node check if the node
6946 * was coordinator
6947 */
6948 if (WD_LEADER_NODE == NULL)
6949 {
6950 ereport(LOG,
6951 (errmsg("We have lost the cluster leader node \"%s\"", wdNode->nodeName)));
6952 set_state(WD_JOINING);
6953 }
6954 }
6955 break;
6956
6957 case WD_EVENT_PACKET_RCV:
6958 {
6959 switch (pkt->type)
6960 {
6961 case WD_ADD_NODE_MESSAGE:
6962 {
6963 /* In case we received the ADD node message from
6964 * our coordinator. Reset the cluster state
6965 */
6966 if (wdNode == WD_LEADER_NODE)
6967 {
6968 ereport(LOG,
6969 (errmsg("received ADD NODE message from the leader node \"%s\"", wdNode->nodeName),
6970 errdetail("re-joining the cluster")));
6971 set_state(WD_JOINING);
6972 }
6973 standard_packet_processor(wdNode, pkt);
6974 }
6975 break;
6976
6977 case WD_FAILOVER_END:
6978 {
6979 register_backend_state_sync_req_interrupt();
6980 }
6981 break;
6982
6983 case WD_STAND_FOR_COORDINATOR_MESSAGE:
6984 {
6985 if (WD_LEADER_NODE == NULL)
6986 {
6987 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6988 set_state(WD_PARTICIPATE_IN_ELECTION);
6989 }
6990 else
6991 {
6992 ereport(LOG,
6993 (errmsg("We are connected to leader node \"%s\" and another node \"%s\" is trying to become a leader",WD_LEADER_NODE->nodeName, wdNode->nodeName)));
6994 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
6995 /* Ask leader to re-send its node info */
6996 send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);
6997 }
6998 }
6999 break;
7000
7001 case WD_DECLARE_COORDINATOR_MESSAGE:
7002 {
7003 if (wdNode != WD_LEADER_NODE)
7004 {
7005 /*
7006 * we already have a leader node and we got a
7007 * new node trying to be leader
7008 */
7009 ereport(LOG,
7010 (errmsg("We are connected to leader node \"%s\" and another node \"%s\" is trying to declare itself as a leader",WD_LEADER_NODE->nodeName, wdNode->nodeName)));
7011 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7012 /* Ask leader to re-send its node info */
7013 send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);
7014
7015 }
7016 }
7017 break;
7018
7019 case WD_IAM_COORDINATOR_MESSAGE:
7020 {
7021 /*
7022 * if the message is received from coordinator
7023 * reply with info, otherwise reject
7024 */
7025 if (wdNode != WD_LEADER_NODE)
7026 {
7027 ereport(LOG,
7028 (errmsg("\"%s\" is our coordinator node, but \"%s\" is also announcing as a coordinator",
7029 WD_LEADER_NODE->nodeName, wdNode->nodeName),
7030 errdetail("broadcasting the cluster in split-brain message")));
7031
7032 send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
7033 }
7034 else if (check_debug_request_do_not_reply_beacon() == false)
7035 {
7036 send_message_of_type(wdNode, WD_INFO_MESSAGE, pkt);
7037 beacon_message_received_from_node(wdNode, pkt);
7038 }
7039 }
7040 break;
7041
7042 default:
7043 standard_packet_processor(wdNode, pkt);
7044 break;
7045 }
7046 }
7047 break;
7048
7049 default:
7050 break;
7051 }
7052
7053 /*
7054 * before returning from the function make sure that we are connected with
7055 * the leader node
7056 */
7057 if (WD_LEADER_NODE)
7058 {
7059 struct timeval currTime;
7060
7061 gettimeofday(&currTime, NULL);
7062 int last_rcv_sec = WD_TIME_DIFF_SEC(currTime, WD_LEADER_NODE->last_rcv_time);
7063
7064 if (last_rcv_sec >= (3 * BEACON_MESSAGE_INTERVAL_SECONDS))
7065 {
7066 /* we have missed atleast two beacons from leader node */
7067 ereport(WARNING,
7068 (errmsg("we have not received a beacon message from leader node \"%s\" and it has not replied to our info request",
7069 WD_LEADER_NODE->nodeName),
7070 errdetail("re-initializing the cluster")));
7071 set_state(WD_JOINING);
7072 }
7073 else if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
7074 {
7075 /*
7076 * We have not received a last beacon from leader ask for the
7077 * node info from leader node
7078 */
7079 ereport(WARNING,
7080 (errmsg("we have not received a beacon message from leader node \"%s\"",
7081 WD_LEADER_NODE->nodeName),
7082 errdetail("requesting info message from leader node")));
7083 send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);
7084 }
7085 }
7086 return 0;
7087 }
7088
7089
7090 /*
7091 * The function identifies the current quorum state
7092 * quorum values:
7093 * -1:
7094 * quorum is lost or does not exists
7095 * 0:
7096 * The quorum is on the edge. (when participating cluster is configured
7097 * with even number of nodes, and we have exactly 50% nodes
7098 * 1:
7099 * quorum exists
7100 */
7101 static void
update_quorum_status(void)7102 update_quorum_status(void)
7103 {
7104 int quorum_status = g_cluster.quorum_status;
7105
7106 if (g_cluster.clusterLeaderInfo.standby_nodes_count > get_minimum_remote_nodes_required_for_quorum())
7107 {
7108 g_cluster.quorum_status = 1;
7109 }
7110 else if (g_cluster.clusterLeaderInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
7111 {
7112 if (g_cluster.memberRemoteNodeCount % 2 != 0)
7113 {
7114 if (pool_config->enable_consensus_with_half_votes)
7115 g_cluster.quorum_status = 0; /* on the edge */
7116 else
7117 g_cluster.quorum_status = -1;
7118 }
7119 else
7120 g_cluster.quorum_status = 1;
7121 }
7122 else
7123 {
7124 g_cluster.quorum_status = -1;
7125 }
7126 g_cluster.localNode->quorum_status = g_cluster.quorum_status;
7127 if (g_cluster.quorum_status != quorum_status)
7128 {
7129 watchdog_state_machine(WD_EVENT_CLUSTER_QUORUM_CHANGED, NULL, NULL, NULL);
7130 }
7131 }
7132
7133 /*
7134 * returns the minimum number of remote nodes required for quorum
7135 */
7136 static int
get_minimum_remote_nodes_required_for_quorum(void)7137 get_minimum_remote_nodes_required_for_quorum(void)
7138 {
7139 /*
7140 * Even number of remote nodes, That means total number of nodes are odd,
7141 * so minimum quorum is just remote/2.
7142 */
7143 if (g_cluster.memberRemoteNodeCount % 2 == 0)
7144 return (g_cluster.memberRemoteNodeCount / 2);
7145
7146 /*
7147 * Total nodes including self are even, So we return 50% nodes as quorum
7148 * requirements
7149 */
7150 return ((g_cluster.memberRemoteNodeCount - 1) / 2);
7151 }
7152
7153 /*
7154 * returns the minimum number of votes required for consensus
7155 */
7156 static int
get_minimum_votes_to_resolve_consensus(void)7157 get_minimum_votes_to_resolve_consensus(void)
7158 {
7159 /*
7160 * Since get_minimum_remote_nodes_required_for_quorum() returns
7161 * the number of remote nodes required to complete the quorum
7162 * that is always one less than the total number of nodes required
7163 * for the cluster to build quorum or consensus, reason being
7164 * in get_minimum_remote_nodes_required_for_quorum()
7165 * we always consider the local node as a valid pre-casted vote.
7166 * But when it comes to count the number of votes required to build
7167 * consensus for any type of decision, for example for building the
7168 * consensus on backend failover, the local node can vote on either
7169 * side. So it's vote is not explicitly counted and for the consensus
7170 * we actually need one more vote than the total number of remote nodes
7171 * required for the quorum
7172 *
7173 * For example
7174 * If Total nodes in cluster = 4
7175 * remote node will be = 3
7176 * get_minimum_remote_nodes_required_for_quorum() return = 1
7177 * Minimum number of votes required for consensus will be
7178 *
7179 * if(pool_config->enable_consensus_with_half_votes = true)
7180 * (exact 50% n/2) ==> 4/2 = 2
7181 *
7182 * if(pool_config->enable_consensus_with_half_votes = false)
7183 * (exact 50% +1 ==> (n/2)+1) ==> (4/2)+1 = 3
7184 *
7185 */
7186
7187 int required_node_count = get_minimum_remote_nodes_required_for_quorum() + 1;
7188 /*
7189 * When the total number of nodes in the watchdog cluster including the
7190 * local node are even, The number of votes required for the consensus
7191 * depends on the enable_consensus_with_half_votes.
7192 * So for even number of nodes when enable_consensus_with_half_votes is
7193 * not allowed than we would add one more vote than exact 50%
7194 */
7195 if (g_cluster.memberRemoteNodeCount % 2 != 0)
7196 {
7197 if (pool_config->enable_consensus_with_half_votes == false)
7198 required_node_count += 1;
7199 }
7200
7201 return required_node_count;
7202 }
7203
7204 /*
7205 * sets the state of local watchdog node, and fires a state change event
7206 * if the new and old state differs
7207 */
7208 static int
set_state(WD_STATES newState)7209 set_state(WD_STATES newState)
7210 {
7211 WD_STATES oldState = get_local_node_state();
7212
7213 g_cluster.localNode->state = newState;
7214 if (oldState != newState)
7215 {
7216 gettimeofday(&g_cluster.localNode->current_state_time, NULL);
7217
7218 /*
7219 * if we changing from the coordinator state, do the de-escalation if
7220 * required
7221 */
7222 if (oldState == WD_COORDINATOR)
7223 {
7224 resign_from_escalated_node();
7225 clear_standby_nodes_list();
7226 clear_all_failovers();
7227 }
7228
7229 ereport(LOG,
7230 (errmsg("watchdog node state changed from [%s] to [%s]", wd_state_names[oldState], wd_state_names[newState])));
7231 watchdog_state_machine(WD_EVENT_WD_STATE_CHANGED, NULL, NULL, NULL);
7232 /* send out the info message to all nodes */
7233 send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);
7234 }
7235 return 0;
7236 }
7237
7238
7239 static void
allocate_resultNodes_in_command(WDCommandData * ipcCommand)7240 allocate_resultNodes_in_command(WDCommandData * ipcCommand)
7241 {
7242 MemoryContext oldCxt;
7243 int i;
7244
7245 if (ipcCommand->nodeResults != NULL)
7246 return;
7247
7248 oldCxt = MemoryContextSwitchTo(ipcCommand->memoryContext);
7249 ipcCommand->nodeResults = palloc0((sizeof(WDCommandNodeResult) * g_cluster.remoteNodeCount));
7250 for (i = 0; i < g_cluster.remoteNodeCount; i++)
7251 {
7252 ipcCommand->nodeResults[i].wdNode = &g_cluster.remoteNodes[i];
7253 }
7254 MemoryContextSwitchTo(oldCxt);
7255 }
7256
7257
7258 static void
process_remote_online_recovery_command(WatchdogNode * wdNode,WDPacketData * pkt)7259 process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt)
7260 {
7261 char *func_name;
7262 int node_count = 0;
7263 int *node_id_list = NULL;
7264 unsigned char flags;
7265
7266 if (pkt->data == NULL || pkt->len == 0)
7267 {
7268 ereport(LOG,
7269 (errmsg("watchdog is unable to process pgpool online recovery command"),
7270 errdetail("command packet contains no data")));
7271 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7272 return;
7273 }
7274
7275 ereport(LOG,
7276 (errmsg("watchdog received online recovery request from \"%s\"", wdNode->nodeName)));
7277
7278 if (parse_wd_node_function_json(pkt->data, pkt->len, &func_name, &node_id_list, &node_count, &flags))
7279 {
7280 if (strcasecmp(WD_FUNCTION_START_RECOVERY, func_name) == 0)
7281 {
7282 if (*InRecovery != RECOVERY_INIT)
7283 {
7284 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
7285 }
7286 else
7287 {
7288 *InRecovery = RECOVERY_ONLINE;
7289 if (Req_info->conn_counter == 0)
7290 {
7291 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7292 }
7293 else if (pool_config->recovery_timeout <= 0)
7294 {
7295 if (ensure_conn_counter_validity() == 0)
7296 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7297 else
7298 reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
7299 }
7300 else
7301 {
7302 WDFunctionCommandData *wd_func_command;
7303 MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
7304
7305 wd_func_command = palloc(sizeof(WDFunctionCommandData));
7306 wd_func_command->commandType = pkt->type;
7307 wd_func_command->commandID = pkt->command_id;
7308 wd_func_command->funcName = MemoryContextStrdup(TopMemoryContext, func_name);
7309 wd_func_command->wdNode = wdNode;
7310
7311 /* Add this command for timer tick */
7312 add_wd_command_for_timer_events(pool_config->recovery_timeout, true, wd_func_command);
7313
7314 MemoryContextSwitchTo(oldCxt);
7315 }
7316 }
7317 }
7318 else if (strcasecmp(WD_FUNCTION_END_RECOVERY, func_name) == 0)
7319 {
7320 *InRecovery = RECOVERY_INIT;
7321 reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7322 kill(getppid(), SIGUSR2);
7323 }
7324 else
7325 {
7326 ereport(LOG,
7327 (errmsg("watchdog failed to process online recovery request"),
7328 errdetail("invalid command [%s] in online recovery request from \"%s\"", func_name, wdNode->nodeName)));
7329 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7330 }
7331 }
7332 else
7333 {
7334 ereport(LOG,
7335 (errmsg("watchdog failed to process online recovery request"),
7336 errdetail("invalid data in online recovery request from \"%s\"", wdNode->nodeName)));
7337 reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7338 }
7339
7340 if (func_name)
7341 pfree(func_name);
7342 if (node_id_list)
7343 pfree(node_id_list);
7344 }
7345
7346
7347 static bool
reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * ipcCommand)7348 reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * ipcCommand)
7349 {
7350 int i;
7351 WDCommandNodeResult *nodeResult = NULL;
7352
7353 /* get the result node for */
7354 ereport(DEBUG1,
7355 (errmsg("watchdog node \"%s\" has replied for pgpool-II replicate command packet", wdNode->nodeName)));
7356
7357 for (i = 0; i < g_cluster.remoteNodeCount; i++)
7358 {
7359 nodeResult = &ipcCommand->nodeResults[i];
7360 if (nodeResult->wdNode == wdNode)
7361 break;
7362 nodeResult = NULL;
7363 }
7364 if (nodeResult == NULL)
7365 {
7366 ereport(WARNING,
7367 (errmsg("unable to find result node for pgpool-II replicate command packet received from watchdog node \"%s\"", wdNode->nodeName)));
7368 return true;
7369 }
7370
7371 nodeResult->result_type = pkt->type;
7372 nodeResult->cmdState = COMMAND_STATE_REPLIED;
7373 ipcCommand->commandReplyFromCount++;
7374 ereport(DEBUG2,
7375 (errmsg("watchdog node \"%s\" has replied for pgpool-II replicate command packet", wdNode->nodeName),
7376 errdetail("command was sent to %d nodes and %d nodes have replied to it", ipcCommand->commandSendToCount, ipcCommand->commandReplyFromCount)));
7377
7378 if (pkt->type != WD_ACCEPT_MESSAGE)
7379 {
7380 /* reject message from any node finishes the command */
7381 ipcCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
7382 wd_command_is_complete(ipcCommand);
7383 cleanUpIPCCommand(ipcCommand);
7384 }
7385 else if (ipcCommand->commandReplyFromCount >= ipcCommand->commandSendToCount)
7386 {
7387 /*
7388 * we have received results from all nodes analyze the result
7389 */
7390 ipcCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
7391 wd_command_is_complete(ipcCommand);
7392 cleanUpIPCCommand(ipcCommand);
7393 }
7394
7395 /* do not process this packet further */
7396 return true;
7397 }
7398
7399 /*
7400 * return true if want to cancel timer,
7401 */
7402 static bool
process_wd_command_timer_event(bool timer_expired,WDFunctionCommandData * wd_func_command)7403 process_wd_command_timer_event(bool timer_expired, WDFunctionCommandData * wd_func_command)
7404 {
7405 if (wd_func_command->commandType == WD_IPC_ONLINE_RECOVERY_COMMAND)
7406 {
7407 if (wd_func_command->funcName && strcasecmp("START_RECOVERY", wd_func_command->funcName) == 0)
7408 {
7409 if (Req_info->conn_counter == 0)
7410 {
7411 WDPacketData emptyPkt;
7412
7413 emptyPkt.command_id = wd_func_command->commandID;
7414 reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt);
7415 return true;
7416 }
7417 else if (timer_expired)
7418 {
7419 WDPacketData emptyPkt;
7420
7421 emptyPkt.command_id = wd_func_command->commandID;
7422
7423 if (ensure_conn_counter_validity() == 0)
7424 reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt);
7425 else
7426 reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt);
7427 return true;
7428 }
7429 return false;
7430 }
7431 }
7432 /* Just remove the timer. */
7433 return true;
7434 }
7435
7436 static void
process_wd_func_commands_for_timer_events(void)7437 process_wd_func_commands_for_timer_events(void)
7438 {
7439 struct timeval currTime;
7440 ListCell *lc;
7441 List *timers_to_del = NIL;
7442
7443 if (g_cluster.wd_timer_commands == NULL)
7444 return;
7445
7446 gettimeofday(&currTime, NULL);
7447
7448 /*
7449 * Take care online recovery
7450 */
7451 foreach(lc, g_cluster.wd_timer_commands)
7452 {
7453 WDCommandTimerData *timerData = lfirst(lc);
7454
7455 if (timerData)
7456 {
7457 bool del = false;
7458
7459 if (WD_TIME_DIFF_SEC(currTime, timerData->startTime) >= timerData->expire_sec)
7460 {
7461 del = process_wd_command_timer_event(true, timerData->wd_func_command);
7462
7463 }
7464 else if (timerData->need_tics)
7465 {
7466 del = process_wd_command_timer_event(false, timerData->wd_func_command);
7467 }
7468 if (del)
7469 timers_to_del = lappend(timers_to_del, timerData);
7470 }
7471 }
7472 foreach(lc, timers_to_del)
7473 {
7474 g_cluster.wd_timer_commands = list_delete_ptr(g_cluster.wd_timer_commands, lfirst(lc));
7475 }
7476
7477 list_free(timers_to_del);
7478 }
7479
7480 static void
add_wd_command_for_timer_events(unsigned int expire_secs,bool need_tics,WDFunctionCommandData * wd_func_command)7481 add_wd_command_for_timer_events(unsigned int expire_secs, bool need_tics, WDFunctionCommandData * wd_func_command)
7482 {
7483 /* create a new Timer struct */
7484 MemoryContext oldCtx = MemoryContextSwitchTo(TopMemoryContext);
7485 WDCommandTimerData *timerData = palloc(sizeof(WDCommandTimerData));
7486
7487 gettimeofday(&timerData->startTime, NULL);
7488 timerData->expire_sec = expire_secs;
7489 timerData->need_tics = need_tics;
7490 timerData->wd_func_command = wd_func_command;
7491
7492 g_cluster.wd_timer_commands = lappend(g_cluster.wd_timer_commands, timerData);
7493
7494 MemoryContextSwitchTo(oldCtx);
7495
7496 }
7497
/*
 * Compare the integer member `parameter` of a received configuration object
 * with the same member of the local pool_config and emit a WARNING naming
 * both values when they differ.
 *
 * config_obj and wdNode are pointer expressions; they are parenthesized in
 * the expansion so that any pointer-valued expression can be passed.
 * parameter is a member name: it is both accessed and stringified.
 */
#define WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config_obj, wdNode, parameter) \
	do { \
		if ((config_obj)->parameter != pool_config->parameter) \
		{ \
			ereport(WARNING, \
					(errmsg("configurations value for \"%s\" on node \"%s\" is different", #parameter, (wdNode)->nodeName), \
					 errdetail("\"%s\" on this node is %d while on \"%s\" is %d", \
							   #parameter, \
							   pool_config->parameter, \
							   (wdNode)->nodeName, \
							   (config_obj)->parameter))); \
		} \
	} while(0)
/*
 * Compare the boolean member `parameter` of a received configuration object
 * with the same member of the local pool_config and emit a WARNING showing
 * both values as ON/OFF when they differ.
 *
 * config_obj and wdNode are pointer expressions; they are parenthesized in
 * the expansion so that any pointer-valued expression can be passed.
 * parameter is a member name: it is both accessed and stringified.
 */
#define WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config_obj, wdNode, parameter) \
	do { \
		if ((config_obj)->parameter != pool_config->parameter) \
		{ \
			ereport(WARNING, \
					(errmsg("configurations value for \"%s\" on node \"%s\" is different", #parameter, (wdNode)->nodeName), \
					 errdetail("\"%s\" on this node is %s while on \"%s\" is %s", \
							   #parameter, \
							   pool_config->parameter ? "ON" : "OFF", \
							   (wdNode)->nodeName, \
							   (config_obj)->parameter ? "ON" : "OFF"))); \
		} \
	} while(0)
7524
7525 static void
verify_pool_configurations(WatchdogNode * wdNode,POOL_CONFIG * config)7526 verify_pool_configurations(WatchdogNode * wdNode, POOL_CONFIG * config)
7527 {
7528 int i;
7529
7530 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, num_init_children);
7531 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, listen_backlog_multiplier);
7532 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, child_life_time);
7533 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, connection_life_time);
7534 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, child_max_connections);
7535 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, client_idle_limit);
7536 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, max_pool);
7537 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_timeout);
7538 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_period);
7539 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_max_retries);
7540 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_retry_delay);
7541 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, recovery_timeout);
7542 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, search_primary_node_timeout);
7543 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, client_idle_limit_in_recovery);
7544
7545 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replication_mode);
7546 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, enable_pool_hba);
7547 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, load_balance_mode);
7548 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replication_stop_on_mismatch);
7549 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, allow_clear_text_frontend_auth);
7550 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_if_affected_tuples_mismatch);
7551 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_on_backend_error);
7552 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replicate_select);
7553 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, connection_cache);
7554 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, insert_lock);
7555 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, memory_cache_enabled);
7556 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, clear_memqcache_on_escalation);
7557
7558 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_when_quorum_exists);
7559 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_require_consensus);
7560 WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, allow_multiple_failover_requests_from_node);
7561
7562 if (config->backend_desc->num_backends != pool_config->backend_desc->num_backends)
7563 {
7564 ereport(WARNING,
7565 (errmsg("number of configured backends on node \"%s\" are different", wdNode->nodeName),
7566 errdetail("this node has %d backends while on \"%s\" number of configured backends are %d",
7567 pool_config->backend_desc->num_backends,
7568 wdNode->nodeName,
7569 config->backend_desc->num_backends)));
7570 }
7571 for (i = 0; i < pool_config->backend_desc->num_backends; i++)
7572 {
7573 if (strncasecmp(pool_config->backend_desc->backend_info[i].backend_hostname, config->backend_desc->backend_info[i].backend_hostname, sizeof(pool_config->backend_desc->backend_info[i].backend_hostname)))
7574 {
7575 ereport(WARNING,
7576 (errmsg("configurations value for backend[%d] \"hostname\" on node \"%s\" is different", i, wdNode->nodeName),
7577 errdetail("\"backend_hostname%d\" on this node is %s while on \"%s\" is %s",
7578 i,
7579 pool_config->backend_desc->backend_info[i].backend_hostname,
7580 wdNode->nodeName,
7581 config->backend_desc->backend_info[i].backend_hostname)));
7582 }
7583 if (config->backend_desc->backend_info[i].backend_port != pool_config->backend_desc->backend_info[i].backend_port)
7584 {
7585 ereport(WARNING,
7586 (errmsg("configurations value for backend[%d] \"port\" on node \"%s\" is different", i, wdNode->nodeName),
7587 errdetail("\"backend_port%d\" on this node is %d while on \"%s\" is %d",
7588 i,
7589 pool_config->backend_desc->backend_info[i].backend_port,
7590 wdNode->nodeName,
7591 config->backend_desc->backend_info[i].backend_port)));
7592 }
7593 }
7594
7595 if (config->wd_nodes.num_wd != pool_config->wd_nodes.num_wd)
7596 {
7597 ereport(WARNING,
7598 (errmsg("the number of configured watchdog nodes on node \"%s\" are different", wdNode->nodeName),
7599 errdetail("this node has %d watchdog nodes while \"%s\" is configured with %d watchdog nodes",
7600 pool_config->wd_nodes.num_wd,
7601 wdNode->nodeName,
7602 config->wd_nodes.num_wd)));
7603 }
7604 }
7605
7606 static bool
get_authhash_for_node(WatchdogNode * wdNode,char * authhash)7607 get_authhash_for_node(WatchdogNode * wdNode, char *authhash)
7608 {
7609 if (strlen(pool_config->wd_authkey))
7610 {
7611 char nodeStr[WD_MAX_PACKET_STRING + 1];
7612 int len = snprintf(nodeStr, WD_MAX_PACKET_STRING, "state=%d wd_port=%d",
7613 wdNode->state, wdNode->wd_port);
7614
7615
7616 /* calculate hash from packet */
7617 wd_calc_hash(nodeStr, len, authhash);
7618 if (authhash[0] == '\0')
7619 ereport(WARNING,
7620 (errmsg("failed to calculate wd_authkey hash from a send packet")));
7621 return true;
7622 }
7623 return false;
7624 }
7625
7626 static bool
verify_authhash_for_node(WatchdogNode * wdNode,char * authhash)7627 verify_authhash_for_node(WatchdogNode * wdNode, char *authhash)
7628 {
7629 if (strlen(pool_config->wd_authkey))
7630 {
7631 char calculated_authhash[WD_AUTH_HASH_LEN + 1];
7632
7633 char nodeStr[WD_MAX_PACKET_STRING];
7634 int len = snprintf(nodeStr, WD_MAX_PACKET_STRING, "state=%d wd_port=%d",
7635 wdNode->state, wdNode->wd_port);
7636
7637
7638 /* calculate hash from packet */
7639 wd_calc_hash(nodeStr, len, calculated_authhash);
7640 if (calculated_authhash[0] == '\0')
7641 ereport(WARNING,
7642 (errmsg("failed to calculate wd_authkey hash from a receive packet")));
7643 return (strcmp(calculated_authhash, authhash) == 0);
7644 }
7645 /* authkey is not enabled. */
7646 return true;
7647 }
7648
7649 /*
7650 * function authenticates the IPC command by looking for the
7651 * auth key in the JSON data of IPC command.
7652 * For IPC commands coming from outer world the function validates the
7653 * authkey in JSON packet with configured pool_config->wd_authkey.
7654 * if internal_client_only is true then the JSON data must contain the
7655 * shared key present in the pgpool-II shared memory. This can be used
7656 * to restrict certain watchdog IPC functions for outside of pgpool-II
7657 */
7658 static bool
check_IPC_client_authentication(json_value * rootObj,bool internal_client_only)7659 check_IPC_client_authentication(json_value * rootObj, bool internal_client_only)
7660 {
7661 char *packet_auth_key;
7662 unsigned int packet_key;
7663 bool has_shared_key;
7664 unsigned int *shared_key = get_ipc_shared_key();
7665
7666 if (json_get_int_value_for_key(rootObj, WD_IPC_SHARED_KEY, (int *) &packet_key))
7667 {
7668 ereport(DEBUG2,
7669 (errmsg("IPC JSON data packet does not contain shared key")));
7670 has_shared_key = false;
7671 }
7672 else
7673 {
7674 has_shared_key = true;
7675 }
7676
7677 if (internal_client_only)
7678 {
7679
7680 if (shared_key == NULL)
7681 {
7682 ereport(LOG,
7683 (errmsg("shared key not initialized")));
7684 return false;
7685 }
7686
7687 if (has_shared_key == false)
7688 {
7689 ereport(LOG,
7690 (errmsg("invalid JSON data packet"),
7691 errdetail("authentication shared key not found in JSON data")));
7692 return false;
7693 }
7694 /* compare if shared keys match */
7695 if (*shared_key != packet_key)
7696 return false;
7697
7698 /* providing a valid shared key for internal clients is enough */
7699 return true;
7700 }
7701
7702 /* If no authentication is required, no need to look further */
7703 if (g_cluster.ipc_auth_needed == false)
7704 return true;
7705
7706 /* if shared key is provided and it matched, we are good */
7707 if (has_shared_key == true && *shared_key == packet_key)
7708 return true;
7709
7710 /* shared key is out of question validate the authKey values */
7711 packet_auth_key = json_get_string_value_for_key(rootObj, WD_IPC_AUTH_KEY);
7712
7713 if (packet_auth_key == NULL)
7714 {
7715 ereport(DEBUG1,
7716 (errmsg("invalid JSON data packet"),
7717 errdetail("authentication key not found in JSON data")));
7718 return false;
7719 }
7720
7721 /* compare the packet key with configured auth key */
7722 if (strcmp(pool_config->wd_authkey, packet_auth_key) != 0)
7723 return false;
7724 return true;
7725 }
7726
7727 /*
7728 * function to check authentication of IPC command based on the command type
7729 * this one also informs the calling client about the failure
7730 */
7731
7732 static bool
check_and_report_IPC_authentication(WDCommandData * ipcCommand)7733 check_and_report_IPC_authentication(WDCommandData * ipcCommand)
7734 {
7735 json_value *root = NULL;
7736 bool internal_client_only = false;
7737 bool ret;
7738
7739 if (ipcCommand == NULL)
7740 return false; /* should never happen */
7741
7742 /* first identify the command type */
7743 switch (ipcCommand->sourcePacket.type)
7744 {
7745 case WD_NODE_STATUS_CHANGE_COMMAND:
7746 case WD_REGISTER_FOR_NOTIFICATION:
7747 case WD_GET_NODES_LIST_COMMAND:
7748 case WD_GET_RUNTIME_VARIABLE_VALUE:
7749 internal_client_only = false;
7750 break;
7751
7752 case WD_IPC_FAILOVER_COMMAND:
7753 case WD_IPC_ONLINE_RECOVERY_COMMAND:
7754 case WD_EXECUTE_CLUSTER_COMMAND:
7755 case WD_GET_LEADER_DATA_REQUEST:
7756 /* only allowed internally. */
7757 internal_client_only = true;
7758 break;
7759
7760 default:
7761 /* unknown command, ignore it */
7762 return true;
7763 break;
7764 }
7765
7766 if (internal_client_only == false && g_cluster.ipc_auth_needed == false)
7767 {
7768 /* no need to look further */
7769 return true;
7770 }
7771
7772 if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
7773 {
7774 ereport(LOG,
7775 (errmsg("authentication failed"),
7776 errdetail("IPC command contains no data")));
7777 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7778 "authentication failed: invalid data");
7779
7780 return false;
7781 }
7782
7783 root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
7784 /* The root node must be object */
7785 if (root == NULL || root->type != json_object)
7786 {
7787 json_value_free(root);
7788 ereport(LOG,
7789 (errmsg("authentication failed"),
7790 errdetail("IPC command contains an invalid data")));
7791
7792 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7793 "authentication failed: invalid data");
7794
7795 return false;
7796 }
7797
7798 ret = check_IPC_client_authentication(root, internal_client_only);
7799 json_value_free(root);
7800
7801 if (ret == false)
7802 {
7803 ereport(WARNING,
7804 (errmsg("authentication failed"),
7805 errdetail("invalid IPC key")));
7806 ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7807 "authentication failed: invalid KEY");
7808 }
7809 return ret;
7810 }
7811
7812 static void
print_watchdog_node_info(WatchdogNode * wdNode)7813 print_watchdog_node_info(WatchdogNode * wdNode)
7814 {
7815 ereport(DEBUG2,
7816 (errmsg("state: \"%s\" Host: \"%s\" Name: \"%s\" WD Port:%d PP Port: %d priority:%d",
7817 wd_state_names[wdNode->state],
7818 wdNode->hostname
7819 ,wdNode->nodeName
7820 ,wdNode->wd_port
7821 ,wdNode->pgpool_port
7822 ,wdNode->wd_priority)));
7823 }
7824
7825 static void
print_packet_node_info(WDPacketData * pkt,WatchdogNode * wdNode,bool sending)7826 print_packet_node_info(WDPacketData * pkt, WatchdogNode * wdNode, bool sending)
7827 {
7828 int i;
7829 packet_types *pkt_type = NULL;
7830
7831 /*
7832 * save the cpu cycles if our log level would swallow this message
7833 */
7834 if (pool_config->log_min_messages > DEBUG1)
7835 return;
7836
7837 for (i = 0;; i++)
7838 {
7839 if (all_packet_types[i].type == WD_NO_MESSAGE)
7840 break;
7841
7842 if (all_packet_types[i].type == pkt->type)
7843 {
7844 pkt_type = &all_packet_types[i];
7845 break;
7846 }
7847 }
7848
7849 ereport(DEBUG1,
7850 (errmsg("%s packet, watchdog node:[%s] command id:[%d] type:[%s] state:[%s]",
7851 sending ? "sending" : "received",
7852 wdNode->nodeName,
7853 pkt->command_id,
7854 pkt_type ? pkt_type->name : "UNKNOWN",
7855 wd_state_names[get_local_node_state()])));
7856 }
7857
7858 static void
print_packet_info(WDPacketData * pkt,bool sending)7859 print_packet_info(WDPacketData * pkt, bool sending)
7860 {
7861 int i;
7862 packet_types *pkt_type = NULL;
7863
7864 /*
7865 * save the cpu cycles if our log level would swallow this message
7866 */
7867 if (pool_config->log_min_messages > DEBUG2)
7868 return;
7869
7870 for (i = 0;; i++)
7871 {
7872 if (all_packet_types[i].type == WD_NO_MESSAGE)
7873 break;
7874
7875 if (all_packet_types[i].type == pkt->type)
7876 {
7877 pkt_type = &all_packet_types[i];
7878 break;
7879 }
7880 }
7881
7882 ereport(DEBUG2,
7883 (errmsg("%s watchdog packet, command id:[%d] type:[%s] state :[%s]",
7884 sending ? "sending" : "received",
7885 pkt->command_id,
7886 pkt_type ? pkt_type->name : "UNKNOWN",
7887 wd_state_names[get_local_node_state()])));
7888 }
7889
7890 static int
send_command_packet_to_remote_nodes(WDCommandData * ipcCommand,bool source_included)7891 send_command_packet_to_remote_nodes(WDCommandData * ipcCommand, bool source_included)
7892 {
7893 int i;
7894
7895 ipcCommand->commandSendToCount = 0;
7896 ipcCommand->commandReplyFromCount = 0;
7897 ipcCommand->commandSendToErrorCount = 0;
7898 allocate_resultNodes_in_command(ipcCommand);
7899 ereport(DEBUG2,
7900 (errmsg("sending the %c type message to \"%s\"",
7901 ipcCommand->commandPacket.type,
7902 ipcCommand->sendToNode ? ipcCommand->sendToNode->nodeName : "ALL NODES")));
7903 for (i = 0; i < g_cluster.remoteNodeCount; i++)
7904 {
7905 WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
7906
7907 if (ipcCommand->sendToNode != NULL && ipcCommand->sendToNode != nodeResult->wdNode)
7908 {
7909 /*
7910 * The command is intended for specific node and this is not the
7911 * one
7912 */
7913 nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7914 }
7915 else if (source_included == false && ipcCommand->sourceWdNode == nodeResult->wdNode &&
7916 ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
7917 {
7918 ereport(DEBUG1,
7919 (errmsg("not sending the %c type message to command originator node \"%s\"",
7920 ipcCommand->commandPacket.type, nodeResult->wdNode->nodeName)));
7921
7922 /*
7923 * The message is not supposed to be sent to the watchdog node
7924 * that started this command
7925 */
7926 nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7927 }
7928 else if (is_node_active(nodeResult->wdNode) == false)
7929 {
7930 nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7931 }
7932 else if (is_node_reachable(nodeResult->wdNode) == false)
7933 {
7934 nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
7935 ipcCommand->commandSendToErrorCount++;
7936 }
7937 else if (send_message_to_node(nodeResult->wdNode, &ipcCommand->commandPacket) == true)
7938 {
7939 ereport(DEBUG2,
7940 (errmsg("%c type message written to socket for node \"%s\"",
7941 ipcCommand->commandPacket.type, nodeResult->wdNode->nodeName)));
7942
7943 nodeResult->cmdState = COMMAND_STATE_SENT;
7944 ipcCommand->commandSendToCount++;
7945 }
7946 else
7947 {
7948 nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
7949 ipcCommand->commandSendToErrorCount++;
7950 }
7951 }
7952 return ipcCommand->commandSendToCount;
7953 }
7954
7955 static void
set_cluster_leader_node(WatchdogNode * wdNode)7956 set_cluster_leader_node(WatchdogNode * wdNode)
7957 {
7958 if (WD_LEADER_NODE != wdNode)
7959 {
7960 if (wdNode == NULL)
7961 ereport(LOG,
7962 (errmsg("removing the %s node \"%s\" from watchdog cluster leader",
7963 (g_cluster.localNode == WD_LEADER_NODE) ? "local" : "remote",
7964 WD_LEADER_NODE->nodeName)));
7965 else
7966 ereport(LOG,
7967 (errmsg("setting the %s node \"%s\" as watchdog cluster leader",
7968 (g_cluster.localNode == wdNode) ? "local" : "remote",
7969 wdNode->nodeName)));
7970 g_cluster.clusterLeaderInfo.leaderNode = wdNode;
7971 }
7972 }
7973
7974 static WatchdogNode*
getLeaderWatchdogNode(void)7975 getLeaderWatchdogNode(void)
7976 {
7977 return g_cluster.clusterLeaderInfo.leaderNode;
7978 }
7979
7980 static int
update_cluster_memberships(void)7981 update_cluster_memberships(void)
7982 {
7983 int i;
7984 g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
7985 for (i = 0; i < g_cluster.remoteNodeCount; i++)
7986 {
7987 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
7988 if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
7989 g_cluster.memberRemoteNodeCount--;
7990 }
7991 return g_cluster.memberRemoteNodeCount;
7992 }
7993
7994 static int
revoke_cluster_membership_of_node(WatchdogNode * wdNode,WD_NODE_MEMBERSHIP_STATUS revoke_status)7995 revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status)
7996 {
7997 if (wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE)
7998 {
7999 wdNode->membership_status = revoke_status;
8000
8001 ereport(LOG,
8002 (errmsg("revoking the membership of [%s] node:\"%s\" [node_id:%d]",
8003 wd_state_names[wdNode->state], wdNode->nodeName,wdNode->pgpool_node_id),
8004 errdetail("membership revoke reason: \"%s\"",
8005 wd_cluster_membership_status[wdNode->membership_status])));
8006
8007 g_cluster.memberRemoteNodeCount--;
8008 }
8009 return g_cluster.memberRemoteNodeCount;
8010 }
8011
8012 static int
restore_cluster_membership_of_node(WatchdogNode * wdNode)8013 restore_cluster_membership_of_node(WatchdogNode* wdNode)
8014 {
8015 if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
8016 {
8017 ereport(LOG,
8018 (errmsg("Restoring cluster membership of node:\"%s\"",wdNode->nodeName),
8019 errdetail("membership of node was revoked because it was \"%s\"",
8020 wd_cluster_membership_status[wdNode->membership_status])));
8021
8022 wdNode->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
8023 /* reset the lost time on the node */
8024 wdNode->lost_time.tv_sec = 0;
8025 wdNode->lost_time.tv_usec = 0;
8026 g_cluster.memberRemoteNodeCount++;
8027 }
8028 return g_cluster.memberRemoteNodeCount;
8029 }
8030
8031 static void
reset_lost_timers(void)8032 reset_lost_timers(void)
8033 {
8034 int i;
8035 for (i = 0; i < g_cluster.remoteNodeCount; i++)
8036 {
8037 WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
8038 wdNode->lost_time.tv_sec = 0;
8039 wdNode->lost_time.tv_usec = 0;
8040 }
8041 }
8042
8043 static int
standby_node_join_cluster(WatchdogNode * wdNode)8044 standby_node_join_cluster(WatchdogNode * wdNode)
8045 {
8046 if (get_local_node_state() == WD_COORDINATOR)
8047 {
8048 int i;
8049 /* Just rest the lost time stamp*/
8050 /* set the timestamp on node to track for how long this node is lost */
8051 wdNode->lost_time.tv_sec = 0;
8052 wdNode->lost_time.tv_usec = 0;
8053 /* First check if the node is already in the List */
8054 for (i = 0; i < g_cluster.clusterLeaderInfo.standby_nodes_count; i++)
8055 {
8056 WatchdogNode *node = g_cluster.clusterLeaderInfo.standbyNodes[i];
8057
8058 if (node && node == wdNode)
8059 {
8060 /* The node is already in the standby list */
8061 return g_cluster.clusterLeaderInfo.standby_nodes_count;
8062 }
8063 }
8064 /* okay the node is not in the list */
8065 ereport(LOG,
8066 (errmsg("adding watchdog node \"%s\" to the standby list", wdNode->nodeName)));
8067 g_cluster.clusterLeaderInfo.standbyNodes[g_cluster.clusterLeaderInfo.standby_nodes_count] = wdNode;
8068 g_cluster.clusterLeaderInfo.standby_nodes_count++;
8069 }
8070 g_cluster.localNode->standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8071 return g_cluster.clusterLeaderInfo.standby_nodes_count;
8072 }
8073
8074 static int
standby_node_left_cluster(WatchdogNode * wdNode)8075 standby_node_left_cluster(WatchdogNode * wdNode)
8076 {
8077 if (get_local_node_state() == WD_COORDINATOR)
8078 {
8079 int i;
8080 bool removed = false;
8081 int standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8082
8083 for (i = 0; i < standby_nodes_count; i++)
8084 {
8085 WatchdogNode *node = g_cluster.clusterLeaderInfo.standbyNodes[i];
8086
8087 if (node)
8088 {
8089 if (removed)
8090 {
8091 /* move this to previous index */
8092 g_cluster.clusterLeaderInfo.standbyNodes[i - 1] = node;
8093 g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8094 }
8095 else if (node == wdNode)
8096 {
8097 /*
8098 * okay we have found the node in the list.
8099 */
8100 ereport(LOG,
8101 (errmsg("removing watchdog node \"%s\" from the standby list", wdNode->nodeName)));
8102 /* set the timestamp on node to track for how long this node is lost */
8103 gettimeofday(&wdNode->lost_time, NULL);
8104 g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8105 g_cluster.clusterLeaderInfo.standby_nodes_count--;
8106 removed = true;
8107 }
8108 }
8109 }
8110 }
8111 g_cluster.localNode->standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8112 return g_cluster.clusterLeaderInfo.standby_nodes_count;
8113 }
8114
8115 static void
clear_standby_nodes_list(void)8116 clear_standby_nodes_list(void)
8117 {
8118 int i;
8119
8120 ereport(DEBUG1,
8121 (errmsg("removing all watchdog nodes from the standby list"),
8122 errdetail("standby list contains %d nodes", g_cluster.clusterLeaderInfo.standby_nodes_count)));
8123 for (i = 0; i < g_cluster.remoteNodeCount; i++)
8124 {
8125 g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8126 }
8127 g_cluster.clusterLeaderInfo.standby_nodes_count = 0;
8128 g_cluster.localNode->standby_nodes_count = 0;
8129 }
8130
/*
 * Maintain the per-node count of beacons that went unanswered.
 *
 * ipcCommand - the beacon command whose per-node results are examined;
 *              it is not accessed when clear is true
 * clear      - when true, simply zero the counter of every remote node
 *
 * When clear is false and the command is no longer in progress, each remote
 * node is handled according to its result state:
 *   COMMAND_STATE_SENT    - no reply arrived: the counter is incremented for
 *                           a node in WD_STANDBY state (and logged once it
 *                           exceeds 1), and reset for a node in any other
 *                           state
 *   COMMAND_STATE_REPLIED - the counter is reset, logging the recovery if
 *                           beacons had been missed
 * Nothing is updated while the command is still COMMAND_IN_PROGRESS.
 */
static void
update_missed_beacon_count(WDCommandData * ipcCommand, bool clear)
{
	int			idx;

	if (clear)
	{
		for (idx = 0; idx < g_cluster.remoteNodeCount; idx++)
			g_cluster.remoteNodes[idx].missed_beacon_count = 0;
		return;
	}

	for (idx = 0; idx < g_cluster.remoteNodeCount; idx++)
	{
		WDCommandNodeResult *result;
		WatchdogNode *peer;

		/* Results are not final yet; try again once the command finishes */
		if (ipcCommand->commandStatus == COMMAND_IN_PROGRESS)
			return;

		result = &ipcCommand->nodeResults[idx];
		peer = result->wdNode;

		switch (result->cmdState)
		{
			case COMMAND_STATE_SENT:
				/* beacon was sent but this node never answered */
				if (peer->state == WD_STANDBY)
				{
					peer->missed_beacon_count++;
					if (peer->missed_beacon_count > 1)
						ereport(LOG,
								(errmsg("remote node \"%s\" is not replying to our beacons", peer->nodeName),
								 errdetail("missed beacon reply count:%d", peer->missed_beacon_count)));
				}
				else
					peer->missed_beacon_count = 0;
				break;

			case COMMAND_STATE_REPLIED:
				if (peer->missed_beacon_count > 0)
					ereport(LOG,
							(errmsg("remote node \"%s\" is replying again after missing %d beacons", peer->nodeName,
									peer->missed_beacon_count)));
				peer->missed_beacon_count = 0;
				break;

			default:
				/* other states do not affect the counter */
				break;
		}
	}
}
8171
8172 #ifdef WATCHDOG_DEBUG
/*
 * Watchdog debug request file. Each line of the file holds one watchdog
 * debug command; the recognized commands are the names of the defines below.
 * For example, to stop Pgpool-II from replying to the beacon messages sent
 * by the leader node, write DO_NOT_REPLY_TO_BEACON to the
 * watchdog_debug_requests file:
 *
 * echo "DO_NOT_REPLY_TO_BEACON" > pgpool_logdir/watchdog_debug_requests
 */
8182
/*
 * One entry of the debug command table: the textual command as it appears
 * in the debug request file, and the bit it sets in watchdog_debug_command.
 */
typedef struct watchdog_debug_commands
{
	char		command[100];
	unsigned int code;
}			watchdog_debug_commands;

/* Bitmask of the debug commands currently requested (OR of the codes below) */
unsigned int watchdog_debug_command = 0;


#define WATCHDOG_DEBUG_FILE	"watchdog_debug_requests"

/* Debug command codes; each one is a distinct bit of watchdog_debug_command */
#define DO_NOT_REPLY_TO_BEACON 	1
#define DO_NOT_SEND_BEACON 		2
#define KILL_ALL_COMMUNICATION	4
#define KILL_ALL_RECEIVERS		8
#define KILL_ALL_SENDERS		16


/* Command name to code mapping, terminated by an empty name with code 0 */
watchdog_debug_commands wd_debug_commands[] = {
	{"DO_NOT_REPLY_TO_BEACON", DO_NOT_REPLY_TO_BEACON},
	{"DO_NOT_SEND_BEACON", DO_NOT_SEND_BEACON},
	{"KILL_ALL_COMMUNICATION", KILL_ALL_COMMUNICATION},
	{"KILL_ALL_RECEIVERS", KILL_ALL_RECEIVERS},
	{"KILL_ALL_SENDERS", KILL_ALL_SENDERS},
	{"", 0}
};

/*
 * The predicates below report whether the corresponding debug command is
 * currently requested.  The masked value is compared against zero rather
 * than returned directly, so the result is a proper true/false and cannot
 * be truncated on conversion to bool should bool be a narrow integer type.
 */

/* True when KILL_ALL_COMMUNICATION is requested */
static bool
check_debug_request_kill_all_communication(void)
{
	return (watchdog_debug_command & KILL_ALL_COMMUNICATION) != 0;
}

/* True when KILL_ALL_RECEIVERS is requested */
static bool
check_debug_request_kill_all_receivers(void)
{
	return (watchdog_debug_command & KILL_ALL_RECEIVERS) != 0;
}

/* True when KILL_ALL_SENDERS is requested */
static bool
check_debug_request_kill_all_senders(void)
{
	return (watchdog_debug_command & KILL_ALL_SENDERS) != 0;
}

/* True when DO_NOT_SEND_BEACON is requested */
static bool
check_debug_request_do_not_send_beacon(void)
{
	return (watchdog_debug_command & DO_NOT_SEND_BEACON) != 0;
}

/* True when DO_NOT_REPLY_TO_BEACON is requested */
static bool
check_debug_request_do_not_reply_beacon(void)
{
	return (watchdog_debug_command & DO_NOT_REPLY_TO_BEACON) != 0;
}
8237 /*
8238 * Check watchdog debug request options file for debug commands
8239 * each line should contain only one command
8240 *
8241 * Possible commands
8242 * DO_NOT_REPLY_TO_BEACON
8243 * DO_NOT_SEND_BEACON
8244 * KILL_ALL_COMMUNICATION
8245 * KILL_ALL_RECEIVERS
8246 * KILL_ALL_SENDERS
8247 */
8248
8249 static void
load_watchdog_debug_test_option(void)8250 load_watchdog_debug_test_option(void)
8251 {
8252 static char wd_debug_request_file[POOLMAXPATHLEN];
8253 FILE *fd;
8254 int i;
8255 #define MAXLINE 128
8256 char readbuf[MAXLINE];
8257
8258 watchdog_debug_command = 0;
8259
8260 if (wd_debug_request_file[0] == '\0')
8261 {
8262 snprintf(wd_debug_request_file, sizeof(wd_debug_request_file),
8263 "%s/%s", pool_config->logdir, WATCHDOG_DEBUG_FILE);
8264 }
8265
8266 fd = fopen(wd_debug_request_file, "r");
8267 if (!fd)
8268 {
8269 ereport(DEBUG3,
8270 (errmsg("load_watchdog_debug_test_option: failed to open file %s",
8271 wd_debug_request_file),
8272 errdetail("%m")));
8273 return;
8274 }
8275
8276 for (i = 0;; i++)
8277 {
8278 int cmd = 0;
8279 bool valid_command = false;
8280 readbuf[MAXLINE - 1] = '\0';
8281 if (fgets(readbuf, MAXLINE - 1, fd) == 0)
8282 break;
8283 for (cmd =0 ;; cmd++)
8284 {
8285 if (strlen(wd_debug_commands[cmd].command) == 0 || wd_debug_commands[cmd].code == 0)
8286 break;
8287
8288 if (strncasecmp(wd_debug_commands[cmd].command,readbuf,strlen(wd_debug_commands[cmd].command)) == 0)
8289 {
8290 ereport(DEBUG3,
8291 (errmsg("Watchdog DEBUG COMMAND %d: \"%s\" request found",
8292 cmd,wd_debug_commands[cmd].command)));
8293
8294 watchdog_debug_command |= wd_debug_commands[cmd].code;
8295 valid_command = true;
8296 break;
8297 }
8298 }
8299 if (!valid_command)
8300 ereport(WARNING,
8301 (errmsg("%s file contains invalid command",
8302 wd_debug_request_file),
8303 errdetail("\"%s\" not recognized", readbuf)));
8304 }
8305
8306 fclose(fd);
8307 }
8308 #else
/*
 * WATCHDOG_DEBUG is not enabled: no debug command can ever be requested,
 * so every check below unconditionally reports false.
 */

/* Always false without WATCHDOG_DEBUG */
static bool
check_debug_request_do_not_send_beacon(void)
{
	return false;
}

/* Always false without WATCHDOG_DEBUG */
static bool
check_debug_request_do_not_reply_beacon(void)
{
	return false;
}

/* Always false without WATCHDOG_DEBUG */
static bool
check_debug_request_kill_all_communication(void)
{
	return false;
}

/* Always false without WATCHDOG_DEBUG */
static bool
check_debug_request_kill_all_receivers(void)
{
	return false;
}

/* Always false without WATCHDOG_DEBUG */
static bool
check_debug_request_kill_all_senders(void)
{
	return false;
}
8328 #endif
8329