1 /* -*-pgsql-c-*- */
2 /*
3  * $Header$
4  *
5  * pgpool: a language independent connection pool server for PostgreSQL
6  * written by Tatsuo Ishii
7  *
8  * Copyright (c) 2003-2021	PgPool Global Development Group
9  *
10  * Permission to use, copy, modify, and distribute this software and
11  * its documentation for any purpose and without fee is hereby
12  * granted, provided that the above copyright notice appear in all
13  * copies and that both that copyright notice and this permission
14  * notice appear in supporting documentation, and that the name of the
15  * author not be used in advertising or publicity pertaining to
16  * distribution of the software without specific, written prior
17  * permission. The author makes no representations about the
18  * suitability of this software for any purpose.  It is provided "as
19  * is" without express or implied warranty.
20  *
21  * watchdog.c: child process main
22  *
23  */
24 
25 
26 
27 #include <stdio.h>
28 #include <errno.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <sys/time.h>
33 #include <sys/utsname.h>
34 #include <sys/un.h>
35 #include <sys/types.h>
36 #include <sys/socket.h>
37 #include <sys/wait.h>
38 #include <netinet/in.h>
39 #include <netinet/tcp.h>
40 #include <net/if.h>
41 #include <arpa/inet.h>
42 #include <netdb.h>
43 #include <fcntl.h>
44 #include <ctype.h>
45 
46 #include "pool.h"
47 #include "pool_config.h"
48 #include "auth/md5.h"
49 #include "utils/palloc.h"
50 #include "utils/memutils.h"
51 #include "utils/elog.h"
52 #include "utils/json_writer.h"
53 #include "utils/json.h"
54 #include "utils/socket_stream.h"
55 #include "utils/pool_signal.h"
56 #include "utils/ps_status.h"
57 #include "main/pool_internal_comms.h"
58 #include "pcp/recovery.h"
59 
60 #include "watchdog/wd_utils.h"
61 #include "watchdog/watchdog.h"
62 #include "watchdog/wd_json_data.h"
63 #include "watchdog/wd_ipc_defines.h"
64 #include "watchdog/wd_internal_commands.h"
65 #include "parser/stringinfo.h"
66 
/*
 * These defines enable the consensus building feature
 * in watchdog for node failover operations.
 * They could also be moved to the configure script.
 *
 * Each macro is defined without a value, so code can only test it
 * with #ifdef / #if defined().
 */
#define NODE_UP_REQUIRE_CONSENSUS
#define NODE_DOWN_REQUIRE_CONSENSUS
#define NODE_PROMOTE_REQUIRE_CONSENSUS
74 
/*
 * Result code returned by the IPC command processing functions declared
 * below (process_IPC_command(), the process_IPC_*() family and
 * wd_command_processor_for_node_lost_event()).
 *
 * NOTE(review): the exact meaning of each value is decided by the callers,
 * which are not shown next to this declaration; the names suggest
 * "finished", "still in progress", "failed", "succeeded" and "retry later"
 * respectively -- confirm against the processing code.
 */
typedef enum IPC_CMD_PROCESS_RES
{
	IPC_CMD_COMPLETE,
	IPC_CMD_PROCESSING,
	IPC_CMD_ERROR,
	IPC_CMD_OK,
	IPC_CMD_TRY_AGAIN
}			IPC_CMD_PROCESS_RES;
83 
84 
#define MIN_SECS_CONNECTION_RETRY	10	/* time in seconds to wait before
										 * retrying the connection to a node
										 * after it has failed */

#define MAX_SECS_ESC_PROC_EXIT_WAIT 5	/* maximum number of seconds to wait
										 * for the escalation/de-escalation
										 * process to exit normally before
										 * moving on */

#define BEACON_MESSAGE_INTERVAL_SECONDS		10	/* interval in seconds between
												 * beacon messages */

#define	MAX_SECS_WAIT_FOR_REPLY_FROM_NODE	5	/* time in seconds to wait for
												 * the reply from a remote
												 * watchdog node */

#define MAX_ALLOWED_SEND_FAILURES           3	/* number of message send
                                                  * failures that can be
                                                  * tolerated */
#define MAX_ALLOWED_BEACON_REPLY_MISS       3	/* number of missed beacon
                                                  * message replies that can
                                                  * be tolerated */


#define	FAILOVER_COMMAND_FINISH_TIMEOUT		15	/* timeout in seconds to wait
												 * for Pgpool-II to build
												 * consensus for failover */

#define MIN_SECS_BETWEEN_BROADCAST_SRV_MSG 5	/* minimum number of seconds to
												 * wait before broadcasting the
												 * same cluster service message
												 * again */
114 
/*
 * Packet types. Used in WDPacketData->type.
 *
 * Each type is a single character. WD_NO_MESSAGE (0) is not a real packet
 * type; it is used as the last entry of the all_packet_types[] name table.
 */
#define WD_NO_MESSAGE						0
#define WD_ADD_NODE_MESSAGE					'A'
#define WD_REQ_INFO_MESSAGE					'B'
#define WD_DECLARE_COORDINATOR_MESSAGE		'C'
#define WD_DATA_MESSAGE						'D'
#define WD_ERROR_MESSAGE					'E'
#define WD_ACCEPT_MESSAGE					'G'
#define WD_INFO_MESSAGE						'I'
#define WD_JOIN_COORDINATOR_MESSAGE			'J'
#define WD_IAM_COORDINATOR_MESSAGE			'M'
#define WD_IAM_IN_NW_TROUBLE_MESSAGE		'N'
#define WD_QUORUM_IS_LOST					'Q'
#define WD_REJECT_MESSAGE					'R'
#define WD_STAND_FOR_COORDINATOR_MESSAGE	'S'
#define WD_REMOTE_FAILOVER_REQUEST			'V'
#define WD_INFORM_I_AM_GOING_DOWN			'X'
#define WD_ASK_FOR_POOL_CONFIG				'Y'
#define WD_POOL_CONFIG_DATA					'Z'
#define WD_CMD_REPLY_IN_DATA				'-'
#define WD_CLUSTER_SERVICE_MESSAGE			'#'

#define WD_EXECUTE_COMMAND_REQUEST			'!'

/*
 * NOTE(review): these three codes have no entry in all_packet_types[];
 * presumably they are failover indications rather than packet types --
 * confirm against their users.
 */
#define WD_FAILOVER_START					'F'
#define WD_FAILOVER_END						'H'
#define WD_FAILOVER_WAITING_FOR_CONSENSUS	'K'

/*
 * Cluster Service Message Types
 *
 * Several of these letters ('B', 'E', 'M', 'X', 'R', 'V', 'I', 'Y', 'Z',
 * and 'F') are also used by the packet-type and failover codes above, so
 * they form a separate code space and must not be compared against
 * WDPacketData->type. Presumably they travel as the payload of a
 * WD_CLUSTER_SERVICE_MESSAGE packet (see the "message" argument of
 * send_cluster_service_message()) -- confirm.
 */
#define CLUSTER_QUORUM_LOST					'L'
#define CLUSTER_QUORUM_FOUND				'F'
#define CLUSTER_IN_SPLIT_BRAIN				'B'
#define CLUSTER_NEEDS_ELECTION				'E'
#define CLUSTER_IAM_TRUE_LEADER				'M'
#define CLUSTER_IAM_NOT_TRUE_LEADER			'X'
#define CLUSTER_IAM_RESIGNING_FROM_LEADER	'R'
#define CLUSTER_NODE_INVALID_VERSION		'V'
#define CLUSTER_NODE_REQUIRE_TO_RELOAD		'I'
#define CLUSTER_NODE_APPEARING_LOST 		'Y'
#define CLUSTER_NODE_APPEARING_FOUND 		'Z'
158 
/*
 * Shorthand for the current leader node. Expands to a call of
 * getLeaderWatchdogNode(), so it is evaluated on every use.
 */
#define WD_LEADER_NODE getLeaderWatchdogNode()
160 
/*
 * Associates a one-character packet/command type code with a printable
 * name.
 */
typedef struct packet_types
{
	char		type;			/* packet or IPC command type code */
	char		name[100];		/* human readable name of the type */
}			packet_types;

/*
 * Name table for the type codes. It mixes the WD_* packet types defined in
 * this file with the IPC command / result codes that come from the included
 * watchdog headers.
 *
 * The table ends with a {WD_NO_MESSAGE, ""} entry; presumably lookups stop
 * at that entry -- confirm against the lookup code before relying on it.
 */
packet_types all_packet_types[] = {
	{WD_ADD_NODE_MESSAGE, "ADD NODE"},
	{WD_REQ_INFO_MESSAGE, "REQUEST INFO"},
	{WD_DECLARE_COORDINATOR_MESSAGE, "DECLARE COORDINATOR"},
	{WD_DATA_MESSAGE, "DATA"},
	{WD_ERROR_MESSAGE, "ERROR"},
	{WD_ACCEPT_MESSAGE, "ACCEPT"},
	{WD_INFO_MESSAGE, "NODE INFO"},
	{WD_JOIN_COORDINATOR_MESSAGE, "JOIN COORDINATOR"},
	{WD_IAM_COORDINATOR_MESSAGE, "IAM COORDINATOR"},
	{WD_IAM_IN_NW_TROUBLE_MESSAGE, "I AM IN NETWORK TROUBLE"},
	{WD_QUORUM_IS_LOST, "QUORUM IS LOST"},
	{WD_REJECT_MESSAGE, "REJECT"},
	{WD_STAND_FOR_COORDINATOR_MESSAGE, "STAND FOR COORDINATOR"},
	{WD_REMOTE_FAILOVER_REQUEST, "REPLICATE FAILOVER REQUEST"},
	{WD_IPC_ONLINE_RECOVERY_COMMAND, "ONLINE RECOVERY REQUEST"},
	{WD_EXECUTE_CLUSTER_COMMAND, "EXECUTE CLUSTER COMMAND"},
	{WD_IPC_FAILOVER_COMMAND, "FAILOVER FUNCTION COMMAND"},
	{WD_INFORM_I_AM_GOING_DOWN, "INFORM I AM GOING DOWN"},
	{WD_ASK_FOR_POOL_CONFIG, "ASK FOR POOL CONFIG"},
	{WD_POOL_CONFIG_DATA, "CONFIG DATA"},
	{WD_GET_LEADER_DATA_REQUEST, "DATA REQUEST FOR LEADER"},
	{WD_GET_RUNTIME_VARIABLE_VALUE, "GET WD RUNTIME VARIABLE VALUE"},
	{WD_CMD_REPLY_IN_DATA, "COMMAND REPLY IN DATA"},
	{WD_FAILOVER_LOCKING_REQUEST, "FAILOVER LOCKING REQUEST"},
	{WD_FAILOVER_INDICATION, "FAILOVER INDICATION"},
	{WD_CLUSTER_SERVICE_MESSAGE, "CLUSTER SERVICE MESSAGE"},
	{WD_REGISTER_FOR_NOTIFICATION, "REGISTER FOR NOTIFICATION"},
	{WD_NODE_STATUS_CHANGE_COMMAND, "NODE STATUS CHANGE"},
	{WD_GET_NODES_LIST_COMMAND, "GET NODES LIST"},
	{WD_IPC_CMD_CLUSTER_IN_TRAN, "CLUSTER STATE NOT STABLE"},
	{WD_IPC_CMD_RESULT_BAD, "IPC RESPONSE BAD"},
	{WD_IPC_CMD_RESULT_OK, "IPC RESPONSE GOOD"},
	{WD_IPC_CMD_TIMEOUT, "IPC TIMEOUT"},
	{WD_EXECUTE_COMMAND_REQUEST, "WD EXECUTE COMMAND"},
	{WD_NO_MESSAGE, ""}
};
204 
205 
/*
 * Printable names of watchdog events (18 entries).
 *
 * NOTE(review): presumably indexed by a WD_EVENTS value, in which case the
 * order and number of entries must match that enum exactly -- verify against
 * the WD_EVENTS declaration whenever an event is added or reordered.
 */
char	   *wd_event_name[] =
{"STATE CHANGED",
	"TIMEOUT",
	"PACKET RECEIVED",
	"COMMAND FINISHED",
	"NEW OUTBOUND_CONNECTION",
	"NETWORK IP IS REMOVED",
	"NETWORK IP IS ASSIGNED",
	"NETWORK LINK IS INACTIVE",
	"NETWORK LINK IS ACTIVE",
	"THIS NODE LOST",
	"REMOTE NODE LOST",
	"REMOTE NODE FOUND",
	"THIS NODE FOUND",
	"NODE CONNECTION LOST",
	"NODE CONNECTION FOUND",
	"CLUSTER QUORUM STATUS CHANGED",
	"NODE REQUIRE TO RELOAD STATE",
	"I AM APPEARING LOST"
};
226 
/*
 * Printable names of watchdog node states (13 entries).
 *
 * NOTE(review): presumably indexed by a WD_STATES value (WD_DEAD being the
 * first), so the order must match that enum -- verify against its
 * declaration.
 */
char	   *wd_state_names[] = {
	"DEAD",
	"LOADING",
	"JOINING",
	"INITIALIZING",
	"LEADER",
	"PARTICIPATING IN ELECTION",
	"STANDING FOR LEADER",
	"STANDBY",
	"LOST",
	"IN NETWORK TROUBLE",
	"SHUTDOWN",
	"ADD MESSAGE SENT",
	"NETWORK ISOLATION"
};
242 
/*
 * Printable descriptions of why a node was marked as lost (7 entries).
 *
 * NOTE(review): presumably indexed by a "node lost reason" enum declared in
 * the watchdog headers; the order must match it -- confirm.
 */
char *wd_node_lost_reasons[] = {
	"UNKNOWN REASON",
	"REPORTED BY LIFECHECK",
	"SEND MESSAGE FAILURES",
	"MISSING BEACON REPLIES",
	"RECEIVE TIMEOUT",
	"NOT REACHABLE",
	"SHUTDOWN"
};
252 
/*
 * Printable names of cluster membership states (4 entries).
 *
 * NOTE(review): presumably indexed by a WD_NODE_MEMBERSHIP_STATUS value;
 * the order must match that enum -- confirm.
 */
char *wd_cluster_membership_status[] = {
	"MEMBER",
	"REVOKED-SHUTDOWN",
	"REVOKED-NO-SHOW",
	"REVOKED-LOST"
};
/*
 * Command packet definition.
 *
 * In-memory representation of one watchdog packet: a type code, the ID of
 * the command it belongs to, and an optional data payload.
 */
typedef struct WDPacketData
{
	char		type;	/* packet type. e.g. WD_ADD_NODE_MESSAGE. See #define above. */
	int			command_id;	/* command sequence number starting from 1 */
	int			len;		/* presumably the size of "data" in bytes --
							 * confirm against set_message_data() */
	char	   *data;		/* packet payload */
}			WDPacketData;
269 
270 
/*
 * Per-node progress of a command; stored in WDCommandNodeResult.cmdState.
 *
 * NOTE(review): transitions between these states are driven by code not
 * shown with this declaration; the value names are the only description
 * available here.
 */
typedef enum WDNodeCommandState
{
	COMMAND_STATE_INIT,
	COMMAND_STATE_SENT,
	COMMAND_STATE_REPLIED,
	COMMAND_STATE_SEND_ERROR,
	COMMAND_STATE_DO_NOT_SEND
}			WDNodeCommandState;
279 
/*
 * Outcome of a command for one particular node. WDCommandData.nodeResults
 * points to these.
 */
typedef struct WDCommandNodeResult
{
	WatchdogNode *wdNode;		/* node this result belongs to */
	WDNodeCommandState cmdState;	/* progress of the command for wdNode */
	char		result_type;	/* presumably the packet type of the node's
								 * reply -- confirm */
	int			result_data_len;	/* presumably the size of result_data --
									 * confirm */
	char	   *result_data;	/* data received as the result, if any */
}			WDCommandNodeResult;
288 
/*
 * Origin of a command; stored in WDCommandData.commandSource.
 *
 * WDCommandData.sourceIPCSocket is only valid for COMMAND_SOURCE_IPC and
 * WDCommandData.sourceWdNode only for COMMAND_SOURCE_REMOTE.
 */
typedef enum WDCommandSource
{
	COMMAND_SOURCE_IPC,
	COMMAND_SOURCE_LOCAL,
	COMMAND_SOURCE_REMOTE,
	COMMAND_SOURCE_INTERNAL
}			WDCommandSource;
296 
/*
 * Watchdog "function" descriptor.  "function" is not a C-function, it's one
 * of: START_RECOVERY, END_RECOVERY, FAILBACK_REQUEST, DEGENERATE_REQUEST and
 * PROMOTE_REQUEST. See the #define'd function names (they are prefixed by
 * "WD_FUNCTION") in src/include/watchdog/wd_ipc_defines.h for more details.
 */
typedef struct WDFunctionCommandData
{
	char		commandType;	/* type code of the command */
	unsigned int commandID;		/* ID of the command */
	char	   *funcName;	/* function name */
	WatchdogNode *wdNode;		/* node associated with the command */
}			WDFunctionCommandData;
310 
/*
 * Timer bookkeeping for a watchdog function command. The expire_sec,
 * need_tics and wd_func_command fields mirror the parameters of
 * add_wd_command_for_timer_events().
 */
typedef struct WDCommandTimerData
{
	struct timeval startTime;	/* presumably the time the timer was armed --
								 * confirm */
	unsigned int expire_sec;	/* expiry period in seconds */
	bool		need_tics;		/* NOTE(review): semantics are not visible
								 * here; see
								 * process_wd_func_commands_for_timer_events() */
	WDFunctionCommandData *wd_func_command;	/* command the timer is for */
}			WDCommandTimerData;
318 
319 
/*
 * Overall status of a command; stored in WDCommandData.commandStatus.
 * The COMMAND_FINISHED_* values name the reason a command ended.
 */
typedef enum WDCommandStatus
{
	COMMAND_EMPTY,
	COMMAND_IN_PROGRESS,
	COMMAND_FINISHED_TIMEOUT,
	COMMAND_FINISHED_ALL_REPLIED,
	COMMAND_FINISHED_NODE_REJECTED,
	COMMAND_FINISHED_SEND_FAILED
}			WDCommandStatus;
329 
/*
 * Complete state of one watchdog command: the packets involved, where it
 * came from, which node(s) it is sent to, per-node results and counters,
 * and an optional completion callback.
 */
typedef struct WDCommandData
{
	WDPacketData sourcePacket;	/* presumably the packet that triggered the
								 * command -- confirm */
	WDPacketData commandPacket; /* presumably the packet sent out to the
								 * node(s) -- confirm */
	WDCommandNodeResult *nodeResults;	/* per-node results */
	WatchdogNode *sendToNode;	/* NULL means send to all */
	WDCommandStatus commandStatus;	/* overall status of the command */
	unsigned int commandTimeoutSecs;	/* command timeout in seconds */
	struct timeval commandTime; /* presumably the time the command was
								 * issued -- confirm */
	unsigned int commandSendToCount;	/* counters: nodes sent to, send
										 * errors and replies (by name) */
	unsigned int commandSendToErrorCount;
	unsigned int commandReplyFromCount;
	WDCommandSource commandSource;	/* origin of the command */
	int			sourceIPCSocket;	/* Only valid for COMMAND_SOURCE_IPC */
	WatchdogNode *sourceWdNode; /* Only valid for COMMAND_SOURCE_REMOTE */
	char	   *errorMessage;	/* error text associated with the command */
	MemoryContext memoryContext;	/* memory context associated with the
									 * command */
	void		(*commandCompleteFunc) (struct WDCommandData *command);	/* completion
																		 * callback; receives
																		 * this command */
}			WDCommandData;
349 
/*
 * One monitored network interface. Entries are created by
 * wd_initialize_monitoring_interfaces() and kept in
 * g_cluster.wdInterfaceToMonitor.
 */
typedef struct WDInterfaceStatus
{
	char	   *if_name;		/* interface name (a pstrdup'ed copy) */
	unsigned int if_index;		/* interface index as reported by
								 * if_nameindex() / if_nametoindex() */
	bool		if_up;			/* link state; set to true at creation */
}			WDInterfaceStatus;
356 
/*
 * Leader-related state of the cluster as kept by the local node.
 * Note that the struct tag (WDClusterLeader) and the typedef name
 * (WDClusterLeaderInfo) differ.
 */
typedef struct WDClusterLeader
{
	WatchdogNode *leaderNode;	/* current leader node; set to NULL by
								 * wd_cluster_initialize() */
	WatchdogNode **standbyNodes;	/* array of node pointers with room for
									 * remoteNodeCount entries */
	int			standby_nodes_count;	/* starts at 0; presumably the number
										 * of used entries in standbyNodes --
										 * confirm */
	bool		holding_vip;	/* starts as false; NOTE(review): presumably
								 * whether the delegate IP is held --
								 * confirm */
}			WDClusterLeaderInfo;
364 
/*
 * Top-level state of the watchdog process. The single instance is the
 * global g_cluster, filled in by wd_cluster_initialize().
 */
typedef struct wd_cluster
{
	WatchdogNode *localNode;	/* this node */
	WatchdogNode *remoteNodes;	/* array of remoteNodeCount nodes */
	WDClusterLeaderInfo clusterLeaderInfo;	/* leader and standby list */
	int			remoteNodeCount;	/* configured watchdog nodes minus the
									 * local one */
	int			memberRemoteNodeCount; /* no of nodes that count towards quorum and consensus */
	int			quorum_status;	/* initialized to -1 */
	unsigned int nextCommandID; /* command ID counter; see
								 * get_next_commandID() */
	pid_t		escalation_pid; /* initialized to 0 */
	pid_t		de_escalation_pid;	/* initialized to 0 */
	int			command_server_sock;	/* initialized to 0 */
	int			network_monitor_sock;
	bool		clusterInitialized; /* initialized to false */
	bool		ipc_auth_needed;	/* true when wd_authkey is not empty */
	int			current_failover_id;
	struct timeval last_bcast_srv_msg_time;	/* timestamp when the last packet
											 * was broadcast by the local
											 * node */
	char		last_bcast_srv_msg; /* presumably the cluster service message
									 * code broadcast last -- confirm */

	/* All lists below are initialized to NULL (empty list) */
	List	   *unidentified_socks;
	List	   *notify_clients;
	List	   *ipc_command_socks;
	List	   *ipc_commands;
	List	   *clusterCommands;
	List	   *wd_timer_commands;
	List	   *wdInterfaceToMonitor;	/* list of WDInterfaceStatus */
	List	   *wdCurrentFailovers;
}			wd_cluster;
394 
/*
 * Bookkeeping for one failover request. Objects of this type are returned
 * by get_failover_object() and add_failover().
 *
 * NOTE(review): the field descriptions below are derived from the field
 * names and the parameters of add_failover(); confirm against the
 * implementation.
 */
typedef struct WDFailoverObject
{
	int			id;
	POOL_REQUEST_KIND reqKind;	/* kind of failover request */
	unsigned char reqFlags;		/* request flags */
	int			nodesCount;		/* presumably the number of entries in
								 * nodeList */
	unsigned int failoverID;
	int		   *nodeList;		/* node IDs the request is for */
	List	   *requestingNodes;	/* nodes that issued this request */
	int			request_count;
	struct timeval startTime;
	int			state;
}			WDFailoverObject;
408 
/*
 * WATCHDOG_DEBUG is defined only when the build defines WATCHDOG_DEBUG_OPTS
 * with a value greater than 0.
 */
#ifdef WATCHDOG_DEBUG_OPTS
#if WATCHDOG_DEBUG_OPTS > 0
#define WATCHDOG_DEBUG
#endif
#endif

/* The debug-request checks are declared in every build. */
static bool check_debug_request_do_not_send_beacon(void);
static bool check_debug_request_do_not_reply_beacon(void);
static bool check_debug_request_kill_all_communication(void);
static bool check_debug_request_kill_all_receivers(void);
static bool check_debug_request_kill_all_senders(void);


/* The debug test-option loader is declared only in WATCHDOG_DEBUG builds. */
#ifdef WATCHDOG_DEBUG
static void load_watchdog_debug_test_option(void);
#endif
425 
/*
 * Forward declarations of file-local helpers: failover request tracking
 * (WDFailoverObject) and failover consensus.
 */
static void process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt);
static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList);
static bool does_int_array_contains_value(int *intArray, int count, int value);
static void clear_all_failovers(void);
static void remove_failover_object(WDFailoverObject * failoverObj);
static void service_expired_failovers(void);
static WDFailoverObject * add_failover(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, WatchdogNode * wdNode,
									   unsigned char flags, bool *duplicate);
static WDFailoverCMDResults compute_failover_consensus(POOL_REQUEST_KIND reqKind, int *node_id_list,
													   int node_count, unsigned char *flags, WatchdogNode * wdNode);

/* Forward declarations: helpers operating on a WDCommandData */
static int	send_command_packet_to_remote_nodes(WDCommandData * ipcCommand, bool source_included);
static void wd_command_is_complete(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand, WatchdogNode * wdLostNode);
440 
/*
 * Signal flags. Both are volatile sig_atomic_t, the type that can be
 * written safely from a signal handler.
 * NOTE(review): presumably set by watchdog_signal_handler() and consumed by
 * check_signals() -- confirm against the definitions.
 */
volatile sig_atomic_t reload_config_signal = 0;
volatile sig_atomic_t sigchld_request = 0;

/* Forward declarations: signal handling and process exit */
static void check_signals(void);
static void wd_child_signal_handler(void);
static RETSIGTYPE watchdog_signal_handler(int sig);
static void FileUnlink(int code, Datum path);
static void wd_child_exit(int exit_signo);
449 
/* Forward declarations: cluster initialization and node connections */
static void wd_cluster_initialize(void);
static void wd_initialize_monitoring_interfaces(void);
static int	wd_create_client_socket(char *hostname, int port, bool *connected);
static int	connect_with_all_configured_nodes(void);
static void try_connecting_with_all_unreachable_nodes(void);
static bool connect_to_node(WatchdogNode * wdNode);
static bool is_socket_connection_connected(SocketConnection * conn);

static void service_unreachable_nodes(void);

/* Forward declarations: node status predicates */
static void allocate_resultNodes_in_command(WDCommandData * ipcCommand);
static bool is_node_active_and_reachable(WatchdogNode * wdNode);
static bool is_node_active(WatchdogNode * wdNode);
static bool is_node_reachable(WatchdogNode * wdNode);

/* Forward declarations: fd_set handling for the main loop */
static int	update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count);
static int	prepare_fds(fd_set *rmask, fd_set *wmask, fd_set *emask);

/* Forward declarations: WDPacketData field setters and release */
static void set_next_commandID_in_message(WDPacketData * pkt);
static void set_message_commandID(WDPacketData * pkt, unsigned int commandID);
static void set_message_data(WDPacketData * pkt, const char *data, int len);
static void set_message_type(WDPacketData * pkt, char type);
static void free_packet(WDPacketData * pkt);

/* Forward declarations: functions that return a WDPacketData */
static WDPacketData * get_empty_packet(void);
static WDPacketData * read_packet_of_type(SocketConnection * conn, char ensure_type);
static WDPacketData * read_packet(SocketConnection * conn);
static WDPacketData * get_message_of_type(char type, WDPacketData * replyFor);
static WDPacketData * get_addnode_message(void);
static WDPacketData * get_beacon_message(char type, WDPacketData * replyFor);
static WDPacketData * get_mynode_info_message(WDPacketData * replyFor);
static WDPacketData * get_minimum_message(char type, WDPacketData * replyFor);
483 
/* Forward declarations: internal commands and periodic servicing */
static int	issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int timeout_sec);
static void check_for_current_command_timeout(void);
static bool watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt);
static bool service_lost_connections(void);
static void service_ipc_commands(void);
static void service_internal_command(void);

/* Forward declarations: command IDs, node info parsing, quorum */
static unsigned int get_next_commandID(void);
static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey);
static void update_quorum_status(void);
static int	get_minimum_remote_nodes_required_for_quorum(void);
static int	get_minimum_votes_to_resolve_consensus(void);

/* Forward declarations: socket level I/O */
static bool write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket);
static int	read_sockets(fd_set *rmask, int pending_fds_count);
static void set_timeout(unsigned int sec);
static int	wd_create_command_server_socket(void);
static void close_socket_connection(SocketConnection * conn);
static bool send_message_to_connection(SocketConnection * conn, WDPacketData * pkt);

/* Forward declarations: sending messages and replies to watchdog nodes */
static int	send_message(WatchdogNode * wdNode, WDPacketData * pkt);
static bool send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt);
static bool reply_with_minimal_message(WatchdogNode * wdNode, char type, WDPacketData * replyFor);
static bool reply_with_message(WatchdogNode * wdNode, char type, char *data, int data_len, WDPacketData * replyFor);
static int	send_cluster_command(WatchdogNode * wdNode, char type, int timeout_sec);
static int	send_message_of_type(WatchdogNode * wdNode, char type, WDPacketData * replyFor);

static bool send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message);


static int	accept_incoming_connections(fd_set *rmask, int pending_fds_count);

/* Forward declarations: processors for received packets */
static int	standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt);
static void cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt);
static int	get_cluster_node_count(void);
static void clear_command_node_result(WDCommandNodeResult * nodeResult);
520 
/* Forward declarations: local node state access */
static inline bool is_local_node_true_leader(void);
static inline WD_STATES get_local_node_state(void);
static int	set_state(WD_STATES newState);

/*
 * Forward declarations: state machine handlers. Every handler shares the
 * same signature: the event, the node and packet involved, and the cluster
 * command the event relates to.
 * NOTE(review): presumably watchdog_state_machine() dispatches to the
 * per-state handlers -- confirm against its definition.
 */
static int	watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_voting(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_initializing(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int	watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
static int watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);

/* Forward declarations: split-brain handling and beacon reception */
static int	I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode);
static void handle_split_brain(WatchdogNode * otherLeaderNode, WDPacketData * pkt);
static bool beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt);
539 
/* Forward declarations: IPC socket handling */
static void cleanUpIPCCommand(WDCommandData * ipcCommand);
static bool read_ipc_socket_and_process(int socket, bool *remove_socket);

/* Forward declarations: JSON node list, escalation, packet utilities */
static JsonNode * get_node_list_json(int id);
static bool add_nodeinfo_to_json(JsonNode * jNode, WatchdogNode * node);
static bool fire_node_status_event(int nodeID, int nodeStatus);
static void resign_from_escalated_node(void);
static void start_escalated_node(void);
static void init_wd_packet(WDPacketData * pkt);
static void wd_packet_shallow_copy(WDPacketData * srcPkt, WDPacketData * dstPkt);
static bool wd_commands_packet_processor(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt);

/* Forward declarations: lookups that return a WDCommandData */
static WDCommandData * get_wd_command_from_reply(List *commands, WDPacketData * pkt);
static WDCommandData * get_wd_cluster_command_from_reply(WDPacketData * pkt);
static WDCommandData * get_wd_IPC_command_from_reply(WDPacketData * pkt);
static WDCommandData * get_wd_IPC_command_from_socket(int sock);

/*
 * Forward declarations: IPC command processors. Each takes the command
 * being processed and returns an IPC_CMD_PROCESS_RES.
 */
static IPC_CMD_PROCESS_RES process_IPC_command(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_nodeList_command(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_online_recovery(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_data_request_from_leader(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_failover_command(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_failover_command_on_coordinator(WDCommandData * ipcCommand);
static IPC_CMD_PROCESS_RES process_IPC_execute_cluster_command(WDCommandData * ipcCommand);

static bool write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char *data, int len);

/* Forward declarations: timer driven watchdog function commands */
static void process_wd_func_commands_for_timer_events(void);
static void add_wd_command_for_timer_events(unsigned int expire_secs, bool need_tics, WDFunctionCommandData * wd_func_command);
static bool reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * ipcCommand);

static void process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt);

/* Forward declarations: failover start/end indications */
static WDFailoverCMDResults failover_end_indication(WDCommandData * ipcCommand);
static WDFailoverCMDResults failover_start_indication(WDCommandData * ipcCommand);
/* Forward declarations: shutdown hook and configuration verification */
static void wd_system_will_go_down(int code, Datum arg);
static void verify_pool_configurations(WatchdogNode * wdNode, POOL_CONFIG * config);

/* Forward declarations: per-node authentication hash */
static bool get_authhash_for_node(WatchdogNode * wdNode, char *authhash);
static bool verify_authhash_for_node(WatchdogNode * wdNode, char *authhash);

/* Forward declarations: process startup and IPC authentication */
static void print_watchdog_node_info(WatchdogNode * wdNode);
static int	wd_create_recv_socket(int port);
static void wd_check_config(void);
static pid_t watchdog_main(void);
static pid_t fork_watchdog_child(void);
static bool check_IPC_client_authentication(json_value * rootObj, bool internal_client_only);
static bool check_and_report_IPC_authentication(WDCommandData * ipcCommand);

/* Forward declarations: packet logging and interface status */
static void print_packet_node_info(WDPacketData * pkt, WatchdogNode * wdNode, bool sending);
static void print_packet_info(WDPacketData * pkt, bool sending);
static void update_interface_status(void);
static bool any_interface_available(void);
static WDPacketData * process_data_request(WatchdogNode * wdNode, WDPacketData * pkt);

/* Forward declarations: leader/standby bookkeeping and cluster membership */
static WatchdogNode * getLeaderWatchdogNode(void);
static void set_cluster_leader_node(WatchdogNode * wdNode);
static void clear_standby_nodes_list(void);
static int	standby_node_left_cluster(WatchdogNode * wdNode);
static int	standby_node_join_cluster(WatchdogNode * wdNode);
static void reset_lost_timers(void);
static int update_cluster_memberships(void);
static int revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status);
static int restore_cluster_membership_of_node(WatchdogNode* wdNode);
static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
static void wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt);
610 
/* global variables */

/* State of the watchdog process; populated by wd_cluster_initialize(). */
wd_cluster	g_cluster;
/* Wall-clock time of the most recent set_timeout() call. */
struct timeval g_tm_set_time;
/* Timeout in seconds passed to the most recent set_timeout() call. */
int			g_timeout_sec = 0;
615 
616 static unsigned int
get_next_commandID(void)617 get_next_commandID(void)
618 {
619 	return ++g_cluster.nextCommandID;
620 }
621 
622 static void
set_timeout(unsigned int sec)623 set_timeout(unsigned int sec)
624 {
625 	g_timeout_sec = sec;
626 	gettimeofday(&g_tm_set_time, NULL);
627 }
628 
629 pid_t
initialize_watchdog(void)630 initialize_watchdog(void)
631 {
632 	if (!pool_config->use_watchdog)
633 		return -1;
634 	/* check pool_config data related to watchdog */
635 	wd_check_config();
636 	return fork_watchdog_child();
637 }
638 
639 static void
wd_check_config(void)640 wd_check_config(void)
641 {
642 	if (pool_config->wd_nodes.num_wd == 0)
643 		ereport(ERROR,
644 				(errmsg("invalid watchdog configuration. no watchdog nodes configured")));
645 
646 	if (strlen(pool_config->wd_authkey) > MAX_PASSWORD_SIZE)
647 		ereport(ERROR,
648 				(errmsg("invalid watchdog configuration. wd_authkey length can't be larger than %d",
649 						MAX_PASSWORD_SIZE)));
650 	if (pool_config->wd_lifecheck_method == LIFECHECK_BY_HB)
651 	{
652 		if (pool_config->num_hb_dest_if <= 0)
653 			ereport(ERROR,
654 					(errmsg("invalid lifecheck configuration. no heartbeat interfaces defined")));
655 	}
656 }
657 
658 static void
wd_initialize_monitoring_interfaces(void)659 wd_initialize_monitoring_interfaces(void)
660 {
661 	g_cluster.wdInterfaceToMonitor = NULL;
662 
663 	if (pool_config->num_wd_monitoring_interfaces_list <= 0)
664 	{
665 		ereport(LOG,
666 				(errmsg("interface monitoring is disabled in watchdog")));
667 		return;
668 	}
669 
670 	if (strcasecmp("any", pool_config->wd_monitoring_interfaces_list[0]) == 0)
671 	{
672 		struct if_nameindex *if_ni,
673 				   *idx;
674 
675 		ereport(LOG,
676 				(errmsg("ensure availability on any interface")));
677 
678 		if_ni = if_nameindex();
679 		if (if_ni == NULL)
680 		{
681 			ereport(ERROR,
682 					(errmsg("initializing watchdog failed. unable to get network interface information")));
683 		}
684 
685 		for (idx = if_ni; !(idx->if_index == 0 && idx->if_name == NULL); idx++)
686 		{
687 			WDInterfaceStatus *if_status;
688 
689 			ereport(DEBUG1,
690 					(errmsg("interface name %s at index %d", idx->if_name, idx->if_index)));
691 			if (strncasecmp("lo", idx->if_name, 2) == 0)
692 			{
693 				/* ignoring local interface */
694 				continue;
695 			}
696 			if_status = palloc(sizeof(WDInterfaceStatus));
697 			if_status->if_name = pstrdup(idx->if_name);
698 			if_status->if_index = idx->if_index;
699 			if_status->if_up = true;	/* start with optimism */
700 			g_cluster.wdInterfaceToMonitor = lappend(g_cluster.wdInterfaceToMonitor, if_status);
701 		}
702 		if_freenameindex(if_ni);
703 	}
704 	else
705 	{
706 		WDInterfaceStatus *if_status;
707 		char	   *if_name;
708 		int			i;
709 		unsigned int if_idx;
710 
711 		for (i = 0; i < pool_config->num_wd_monitoring_interfaces_list; i++)
712 		{
713 			if_name = pool_config->wd_monitoring_interfaces_list[i];
714 			/* ignore leading spaces */
715 			while (*if_name && isspace(*if_name))
716 				if_name++;
717 
718 			if_idx = if_nametoindex(if_name);
719 			if (if_idx == 0)
720 				ereport(ERROR,
721 						(errmsg("initializing watchdog failed. invalid interface name \"%s\"", pool_config->wd_monitoring_interfaces_list[0])));
722 
723 			ereport(DEBUG1,
724 					(errmsg("adding monitoring interface [%d] name %s index %d", i, if_name, if_idx)));
725 
726 			if_status = palloc(sizeof(WDInterfaceStatus));
727 			if_status->if_name = pstrdup(if_name);
728 			if_status->if_index = if_idx;
729 			if_status->if_up = true;	/* start with optimism */
730 			g_cluster.wdInterfaceToMonitor = lappend(g_cluster.wdInterfaceToMonitor, if_status);
731 		}
732 	}
733 }
734 
735 static void
wd_cluster_initialize(void)736 wd_cluster_initialize(void)
737 {
738 	int			i = 0;
739 	int			pgpool_node_id = pool_config->pgpool_node_id;
740 
741 	if (pool_config->wd_nodes.num_wd <= 0)
742 	{
743 		/* should also have upper limit??? */
744 		ereport(ERROR,
745 				(errmsg("initializing watchdog failed. no watchdog nodes configured")));
746 	}
747 
748 	/* initialize local node settings */
749 	g_cluster.localNode = palloc0(sizeof(WatchdogNode));
750 	g_cluster.localNode->wd_port = pool_config->wd_nodes.wd_node_info[pgpool_node_id].wd_port;
751 	g_cluster.localNode->pgpool_port = pool_config->wd_nodes.wd_node_info[pgpool_node_id].pgpool_port;
752 	g_cluster.localNode->wd_priority = pool_config->wd_priority;
753 	g_cluster.localNode->pgpool_node_id = pool_config->pgpool_node_id;
754 	gettimeofday(&g_cluster.localNode->startup_time, NULL);
755 
756 	strncpy(g_cluster.localNode->hostname, pool_config->wd_nodes.wd_node_info[pgpool_node_id].hostname, sizeof(g_cluster.localNode->hostname) - 1);
757 	strncpy(g_cluster.localNode->delegate_ip, pool_config->delegate_IP, sizeof(g_cluster.localNode->delegate_ip) - 1);
758 	/* Assign the node name */
759 	{
760 		struct utsname unameData;
761 
762 		uname(&unameData);
763 		snprintf(g_cluster.localNode->nodeName, sizeof(g_cluster.localNode->nodeName), "%s:%d %s %s",
764 				 pool_config->wd_nodes.wd_node_info[pgpool_node_id].hostname,
765 				 pool_config->wd_nodes.wd_node_info[pgpool_node_id].pgpool_port,
766 				 unameData.sysname,
767 				 unameData.nodename);
768 		/* should also have upper limit??? */
769 		ereport(LOG,
770 				(errmsg("setting the local watchdog node name to \"%s\"", g_cluster.localNode->nodeName)));
771 	}
772 
773 	/* initialize remote nodes */
774 	g_cluster.remoteNodeCount = pool_config->wd_nodes.num_wd - 1;
775 	g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
776 	if (g_cluster.remoteNodeCount == 0)
777 		ereport(ERROR,
778                 (errmsg("invalid watchdog configuration. other pgpools setting is not defined")));
779 	ereport(LOG,
780 			(errmsg("watchdog cluster is configured with %d remote nodes", g_cluster.remoteNodeCount)));
781 	g_cluster.remoteNodes = palloc0((sizeof(WatchdogNode) * g_cluster.remoteNodeCount));
782 	int idx = 0;
783 	for (i = 0; i < pool_config->wd_nodes.num_wd; i++)
784 	{
785 		if (i == pool_config->pgpool_node_id)
786 			continue;
787 
788 		g_cluster.remoteNodes[idx].wd_port = pool_config->wd_nodes.wd_node_info[i].wd_port;
789 		g_cluster.remoteNodes[idx].pgpool_node_id = i;
790 		g_cluster.remoteNodes[idx].pgpool_port = pool_config->wd_nodes.wd_node_info[i].pgpool_port;
791 		strcpy(g_cluster.remoteNodes[idx].hostname, pool_config->wd_nodes.wd_node_info[i].hostname);
792 		g_cluster.remoteNodes[idx].delegate_ip[0] = '\0'; /* this will be
793 														 * populated by remote
794 														 * node */
795 
796 		ereport(LOG,
797 				(errmsg("watchdog remote node:%d on %s:%d", idx, g_cluster.remoteNodes[idx].hostname, g_cluster.remoteNodes[idx].wd_port)));
798 
799 		idx++;
800 	}
801 
802 	g_cluster.clusterLeaderInfo.leaderNode = NULL;
803 	g_cluster.clusterLeaderInfo.standbyNodes = palloc0(sizeof(WatchdogNode *) * g_cluster.remoteNodeCount);
804 	g_cluster.clusterLeaderInfo.standby_nodes_count = 0;
805 	g_cluster.clusterLeaderInfo.holding_vip = false;
806 	g_cluster.quorum_status = -1;
807 	g_cluster.nextCommandID = 1;
808 	g_cluster.clusterInitialized = false;
809 	g_cluster.escalation_pid = 0;
810 	g_cluster.de_escalation_pid = 0;
811 	g_cluster.unidentified_socks = NULL;
812 	g_cluster.command_server_sock = 0;
813 	g_cluster.notify_clients = NULL;
814 	g_cluster.ipc_command_socks = NULL;
815 	g_cluster.wd_timer_commands = NULL;
816 	g_cluster.wdCurrentFailovers = NULL;
817 	g_cluster.ipc_commands = NULL;
818 	g_cluster.localNode->state = WD_DEAD;
819 	g_cluster.clusterCommands = NULL;
820 	g_cluster.ipc_auth_needed = strlen(pool_config->wd_authkey) ? true : false;
821 
822 	g_cluster.localNode->escalated = get_watchdog_node_escalation_state();
823 
824 	wd_initialize_monitoring_interfaces();
825 	if (g_cluster.ipc_auth_needed)
826 	{
827 #ifndef USE_SSL
828 		ereport(LOG,
829 				(errmsg("watchdog is configured to use authentication, but pgpool-II is built without SSL support"),
830 				 errdetail("The authentication method used by pgpool-II without the SSL support is known to be weak")));
831 #endif
832 	}
833 	if (get_watchdog_process_needs_cleanup())
834 	{
835 		ereport(LOG,
836 				(errmsg("watchdog is recovering from the crash of watchdog process")));
837 
838 		/*
839 		 * If we are recovering from crash or abnormal termination de-escalate
840 		 * the node if it was coordinator when it crashed
841 		 */
842 		resign_from_escalated_node();
843 	}
844 }
845 
846 static void
clear_command_node_result(WDCommandNodeResult * nodeResult)847 clear_command_node_result(WDCommandNodeResult * nodeResult)
848 {
849 	nodeResult->result_type = WD_NO_MESSAGE;
850 	nodeResult->result_data = NULL;
851 	nodeResult->result_data_len = 0;
852 	nodeResult->cmdState = COMMAND_STATE_INIT;
853 }
854 
855 static int
wd_create_recv_socket(int port)856 wd_create_recv_socket(int port)
857 {
858 	size_t		len = 0;
859 	struct sockaddr_in addr;
860 	int			one = 1;
861 	int			sock = -1;
862 	int			saved_errno;
863 
864 	if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
865 	{
866 		/* socket create failed */
867 		ereport(ERROR,
868 				(errmsg("failed to create watchdog receive socket"),
869 				 errdetail("create socket failed with reason: \"%m\"")));
870 	}
871 
872 	socket_set_nonblock(sock);
873 
874 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one)) == -1)
875 	{
876 		/* setsockopt(SO_REUSEADDR) failed */
877 		saved_errno = errno;
878 		close(sock);
879 		ereport(ERROR,
880 				(errmsg("failed to create watchdog receive socket"),
881 				 errdetail("setsockopt(SO_REUSEADDR) failed with reason: \"%s\"", strerror(saved_errno))));
882 	}
883 	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)
884 	{
885 		/* setsockopt(TCP_NODELAY) failed */
886 		saved_errno = errno;
887 		close(sock);
888 		ereport(ERROR,
889 				(errmsg("failed to create watchdog receive socket"),
890 				 errdetail("setsockopt(TCP_NODELAY) failed with reason: \"%s\"", strerror(saved_errno))));
891 	}
892 	if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) == -1)
893 	{
894 		/* setsockopt(SO_KEEPALIVE) failed */
895 		saved_errno = errno;
896 		close(sock);
897 		ereport(ERROR,
898 				(errmsg("failed to create watchdog receive socket"),
899 				 errdetail("setsockopt(SO_KEEPALIVE) failed with reason: \"%s\"", strerror(saved_errno))));
900 	}
901 
902 	addr.sin_family = AF_INET;
903 	addr.sin_addr.s_addr = htonl(INADDR_ANY);
904 	addr.sin_port = htons(port);
905 	len = sizeof(struct sockaddr_in);
906 
907 	if (bind(sock, (struct sockaddr *) &addr, len) < 0)
908 	{
909 		/* bind failed */
910 		saved_errno = errno;
911 		close(sock);
912 		ereport(ERROR,
913 				(errmsg("failed to create watchdog receive socket"),
914 				 errdetail("bind on \"TCP:%d\" failed with reason: \"%s\"", port, strerror(saved_errno))));
915 	}
916 
917 	if (listen(sock, MAX_WATCHDOG_NUM * 2) < 0)
918 	{
919 		/* listen failed */
920 		saved_errno = errno;
921 		close(sock);
922 		ereport(ERROR,
923 				(errmsg("failed to create watchdog receive socket"),
924 				 errdetail("listen failed with reason: \"%s\"", strerror(saved_errno))));
925 	}
926 
927 	return sock;
928 }
929 
930 
931 
932 /*
933  * creates a socket in non blocking mode and connects it to the hostname and port
934  * the out parameter connected is set to true if the connection is successful
935  */
936 static int
wd_create_client_socket(char * hostname,int port,bool * connected)937 wd_create_client_socket(char *hostname, int port, bool *connected)
938 {
939 	int			sock;
940 	int			one = 1;
941 	size_t		len = 0;
942 	struct sockaddr_in addr;
943 	struct hostent *hp;
944 
945 	*connected = false;
946 	/* create socket */
947 	if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
948 	{
949 		/* socket create failed */
950 		ereport(LOG,
951 				(errmsg("create socket failed with reason: \"%m\"")));
952 		return -1;
953 	}
954 
955 	/* set socket option */
956 	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)
957 	{
958 		close(sock);
959 		ereport(LOG,
960 				(errmsg("failed to set socket options"),
961 				 errdetail("setsockopt(TCP_NODELAY) failed with error: \"%m\"")));
962 		return -1;
963 	}
964 	if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one)) == -1)
965 	{
966 		ereport(LOG,
967 				(errmsg("failed to set socket options"),
968 				 errdetail("setsockopt(SO_KEEPALIVE) failed with error: \"%m\"")));
969 		close(sock);
970 		return -1;
971 	}
972 	/* set sockaddr_in */
973 	memset(&addr, 0, sizeof(addr));
974 	addr.sin_family = AF_INET;
975 	hp = gethostbyname(hostname);
976 	if ((hp == NULL) || (hp->h_addrtype != AF_INET))
977 	{
978 		hp = gethostbyaddr(hostname, strlen(hostname), AF_INET);
979 		if ((hp == NULL) || (hp->h_addrtype != AF_INET))
980 		{
981 			ereport(LOG,
982 					(errmsg("failed to get host address for \"%s\"", hostname),
983 					 errdetail("gethostbyaddr failed with error: \"%s\"", hstrerror(h_errno))));
984 			close(sock);
985 			return -1;
986 		}
987 	}
988 	memmove((char *) &(addr.sin_addr), (char *) hp->h_addr, hp->h_length);
989 	addr.sin_port = htons(port);
990 	len = sizeof(struct sockaddr_in);
991 
992 	/* set socket to non blocking */
993 	socket_set_nonblock(sock);
994 
995 	if (connect(sock, (struct sockaddr *) &addr, len) < 0)
996 	{
997 		if (errno == EINPROGRESS)
998 		{
999 			return sock;
1000 		}
1001 		if (errno == EISCONN)
1002 		{
1003 			socket_unset_nonblock(sock);
1004 			*connected = true;
1005 			return sock;
1006 		}
1007 		ereport(LOG,
1008 				(errmsg("connect on socket failed"),
1009 				 errdetail("connect failed with error: \"%m\"")));
1010 		close(sock);
1011 		return -1;
1012 	}
1013 	/* set socket to blocking again */
1014 	socket_unset_nonblock(sock);
1015 	*connected = true;
1016 	return sock;
1017 }
1018 
1019 /* returns the number of successful connections */
1020 static int
connect_with_all_configured_nodes(void)1021 connect_with_all_configured_nodes(void)
1022 {
1023 	int			connect_count = 0;
1024 	int			i;
1025 
1026 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
1027 	{
1028 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1029 
1030 		if (connect_to_node(wdNode))
1031 			connect_count++;
1032 	}
1033 	return connect_count;
1034 }
1035 
1036 /*
1037  * Function tries to connect with nodes which have both sockets
1038  * disconnected
1039  */
1040 static void
try_connecting_with_all_unreachable_nodes(void)1041 try_connecting_with_all_unreachable_nodes(void)
1042 {
1043 	int			i;
1044 
1045 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
1046 	{
1047 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1048 
1049 		if (wdNode->client_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT && wdNode->client_socket.sock_state != WD_SOCK_CONNECTED &&
1050 			wdNode->server_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT && wdNode->server_socket.sock_state != WD_SOCK_CONNECTED)
1051 		{
1052 			if (wdNode->state == WD_SHUTDOWN)
1053 				continue;
1054 			connect_to_node(wdNode);
1055 			if (wdNode->client_socket.sock_state == WD_SOCK_CONNECTED)
1056 			{
1057 				ereport(LOG,
1058 						(errmsg("connection to the remote node \"%s\" is restored", wdNode->nodeName)));
1059 				watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
1060 			}
1061 		}
1062 	}
1063 }
1064 
1065 /*
1066  * returns true if the connection is in progress or connected successfully
1067  * false is returned in case of failure
1068  */
1069 static bool
connect_to_node(WatchdogNode * wdNode)1070 connect_to_node(WatchdogNode * wdNode)
1071 {
1072 	bool		connected = false;
1073 
1074 	wdNode->client_socket.sock = wd_create_client_socket(wdNode->hostname, wdNode->wd_port, &connected);
1075 	gettimeofday(&wdNode->client_socket.tv, NULL);
1076 	if (wdNode->client_socket.sock <= 0)
1077 	{
1078 		wdNode->client_socket.sock_state = WD_SOCK_ERROR;
1079 		ereport(DEBUG1,
1080 				(errmsg("outbound connection to \"%s:%d\" failed", wdNode->hostname, wdNode->wd_port)));
1081 	}
1082 	else
1083 	{
1084 		if (connected)
1085 			wdNode->client_socket.sock_state = WD_SOCK_CONNECTED;
1086 		else
1087 			wdNode->client_socket.sock_state = WD_SOCK_WAITING_FOR_CONNECT;
1088 	}
1089 	return (wdNode->client_socket.sock_state != WD_SOCK_ERROR);
1090 }
1091 
1092 /* signal handler for SIGHUP and SIGCHLD handler */
watchdog_signal_handler(int sig)1093 static RETSIGTYPE watchdog_signal_handler(int sig)
1094 {
1095 	if (sig == SIGHUP)
1096 		reload_config_signal = 1;
1097 	else if (sig == SIGCHLD)
1098 		sigchld_request = 1;
1099 }
1100 
1101 static void
check_signals(void)1102 check_signals(void)
1103 {
1104 	/* reload config file signal? */
1105 	if (reload_config_signal)
1106 	{
1107 		MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
1108 
1109 		pool_get_config(get_config_file_name(), CFGCXT_RELOAD);
1110 		MemoryContextSwitchTo(oldContext);
1111 		reload_config_signal = 0;
1112 	}
1113 	else if (sigchld_request)
1114 	{
1115 		wd_child_signal_handler();
1116 	}
1117 }
1118 
1119 
1120 /*
1121  * fork a child for watchdog
1122  */
1123 static pid_t
fork_watchdog_child(void)1124 fork_watchdog_child(void)
1125 {
1126 	pid_t		pid;
1127 
1128 	pid = fork();
1129 
1130 	if (pid == 0)
1131 	{
1132 		on_exit_reset();
1133 
1134 		SetProcessGlobalVariables(PT_WATCHDOG);
1135 
1136 		/* call watchdog child main */
1137 		POOL_SETMASK(&UnBlockSig);
1138 		watchdog_main();
1139 	}
1140 	else if (pid == -1)
1141 	{
1142 		ereport(FATAL,
1143 				(return_code(POOL_EXIT_FATAL),
1144 				 errmsg("fork() failed"),
1145 				 errdetail("%m")));
1146 	}
1147 
1148 	return pid;
1149 }
1150 
1151 /* Never returns */
1152 static int
watchdog_main(void)1153 watchdog_main(void)
1154 {
1155 	fd_set		rmask;
1156 	fd_set		wmask;
1157 	fd_set		emask;
1158 	const int	select_timeout = 1;
1159 	struct timeval tv,
1160 				ref_time;
1161 
1162 	volatile int fd;
1163 	sigjmp_buf	local_sigjmp_buf;
1164 
1165 	pool_signal(SIGTERM, wd_child_exit);
1166 	pool_signal(SIGINT, wd_child_exit);
1167 	pool_signal(SIGQUIT, wd_child_exit);
1168 	pool_signal(SIGHUP, watchdog_signal_handler);
1169 	pool_signal(SIGCHLD, watchdog_signal_handler);
1170 	pool_signal(SIGUSR1, SIG_IGN);
1171 	pool_signal(SIGUSR2, SIG_IGN);
1172 	pool_signal(SIGPIPE, SIG_IGN);
1173 	pool_signal(SIGALRM, SIG_IGN);
1174 
1175 	init_ps_display("", "", "", "");
1176 
1177 	/* Create per loop iteration memory context */
1178 	ProcessLoopContext = AllocSetContextCreate(TopMemoryContext,
1179 											   "wd_child_main_loop",
1180 											   ALLOCSET_DEFAULT_MINSIZE,
1181 											   ALLOCSET_DEFAULT_INITSIZE,
1182 											   ALLOCSET_DEFAULT_MAXSIZE);
1183 
1184 	MemoryContextSwitchTo(TopMemoryContext);
1185 
1186 	set_ps_display("watchdog", false);
1187 
1188 	/* initialize all the local structures for watchdog */
1189 	wd_cluster_initialize();
1190 	/* create a server socket for incoming watchdog connections */
1191 	g_cluster.localNode->server_socket.sock = wd_create_recv_socket(g_cluster.localNode->wd_port);
1192 	g_cluster.localNode->server_socket.sock_state = WD_SOCK_CONNECTED;
1193 	/* open the command server */
1194 	g_cluster.command_server_sock = wd_create_command_server_socket();
1195 
1196 	/* try connecting to all watchdog nodes */
1197 	g_cluster.network_monitor_sock = create_monitoring_socket();
1198 
1199 	if (any_interface_available() == false)
1200 	{
1201 		ereport(FATAL,
1202 				(return_code(POOL_EXIT_FATAL),
1203 				 errmsg("no valid network interface is active"),
1204 				 errdetail("watchdog requires at least one valid network interface to continue"),
1205 				 errhint("you can disable interface checking by setting wd_monitoring_interfaces_list = '' in pgpool config")));
1206 	}
1207 
1208 	connect_with_all_configured_nodes();
1209 
1210 	/* set the initial state of local node */
1211 	set_state(WD_LOADING);
1212 
1213 	/*
1214 	 * install the callback for the preparation of system exit
1215 	 */
1216 	on_system_exit(wd_system_will_go_down, (Datum) NULL);
1217 
1218 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1219 	{
1220 		/* Since not using PG_TRY, must reset error stack by hand */
1221 		if (fd > 0)
1222 			close(fd);
1223 
1224 		error_context_stack = NULL;
1225 
1226 		EmitErrorReport();
1227 		MemoryContextSwitchTo(TopMemoryContext);
1228 		FlushErrorState();
1229 	}
1230 
1231 	/* We can now handle ereport(ERROR) */
1232 	PG_exception_stack = &local_sigjmp_buf;
1233 	reset_watchdog_process_needs_cleanup();
1234 	/* watchdog child loop */
1235 	for (;;)
1236 	{
1237 		int			fd_max,
1238 					select_ret;
1239 		bool		timeout_event = false;
1240 
1241 		MemoryContextSwitchTo(ProcessLoopContext);
1242 		MemoryContextResetAndDeleteChildren(ProcessLoopContext);
1243 
1244 		/* take care config reload request and SIGCHLD */
1245 		check_signals();
1246 
1247 		/*
1248 		 * Establish all accepting socket descriptors and wait for
1249 		 * incoming/outcoming events for up to 1 second.
1250 		 */
1251 		fd_max = prepare_fds(&rmask, &wmask, &emask);
1252 		tv.tv_sec = select_timeout;
1253 		tv.tv_usec = 0;
1254 		select_ret = select(fd_max + 1, &rmask, &wmask, &emask, &tv);
1255 
1256 		gettimeofday(&ref_time, NULL);
1257 
1258 		if (g_timeout_sec > 0)
1259 		{
1260 			if (WD_TIME_DIFF_SEC(ref_time, g_tm_set_time) >= g_timeout_sec)
1261 			{
1262 				timeout_event = true;
1263 				g_timeout_sec = 0;
1264 			}
1265 		}
1266 #ifdef WATCHDOG_DEBUG
1267 		load_watchdog_debug_test_option();
1268 #endif
1269 		/* process events */
1270 		if (select_ret > 0)
1271 		{
1272 			int			processed_fds = 0;
1273 
1274 			processed_fds += accept_incoming_connections(&rmask, (select_ret - processed_fds));
1275 			processed_fds += update_successful_outgoing_cons(&wmask, (select_ret - processed_fds));
1276 			processed_fds += read_sockets(&rmask, (select_ret - processed_fds));
1277 		}
1278 
1279 		/*
1280 		 * Take care online recovery
1281 		 */
1282 		if (WD_TIME_DIFF_SEC(ref_time, g_tm_set_time) >= 1)
1283 		{
1284 			process_wd_func_commands_for_timer_events();
1285 		}
1286 
1287 		if (timeout_event)
1288 		{
1289 			g_timeout_sec = 0;
1290 			watchdog_state_machine(WD_EVENT_TIMEOUT, NULL, NULL, NULL);
1291 		}
1292 
1293 		check_for_current_command_timeout();
1294 
1295 		/*
1296 		 * If any of connections to remote nodes are established, send
1297 		 * commands to the remote nodes.
1298 		 */
1299 		if (service_lost_connections() == true)
1300 		{
1301 			service_internal_command();
1302 			service_ipc_commands();
1303 		}
1304 
1305 		/*
1306 		 * Remove the unreachable nodes from cluster
1307 		 */
1308 		service_unreachable_nodes();
1309 
1310 		/*
1311 		 * If I am the leader, update the quorum status.
1312 		 */
1313 		if (get_local_node_state() == WD_COORDINATOR)
1314 		{
1315 			update_quorum_status();
1316 		}
1317 
1318 		/*
1319 		 * Remove any expired failover command (had spent over 15 seconds
1320 		 * (FAILOVER_COMMAND_FINISH_TIMEOUT)
1321 		 */
1322 		service_expired_failovers();
1323 	}
1324 	return 0;
1325 }
1326 
1327 static int
wd_create_command_server_socket(void)1328 wd_create_command_server_socket(void)
1329 {
1330 	size_t		len = 0;
1331 	struct sockaddr_un addr;
1332 	int			sock = -1;
1333 
1334 	/* We use unix domain stream sockets for the purpose */
1335 	if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
1336 	{
1337 		/* socket create failed */
1338 		ereport(FATAL,
1339 				(return_code(POOL_EXIT_FATAL),
1340 				 errmsg("failed to create watchdog command server socket"),
1341 				 errdetail("create socket failed with reason: \"%m\"")));
1342 	}
1343 	memset((char *) &addr, 0, sizeof(addr));
1344 	addr.sun_family = AF_UNIX;
1345 	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", get_watchdog_ipc_address());
1346 	len = sizeof(struct sockaddr_un);
1347 
1348 	ereport(INFO,
1349 			(errmsg("IPC socket path: \"%s\"", get_watchdog_ipc_address())));
1350 
1351 	/* Delete any pre-existing socket file to avoid failure at bind() time */
1352 	unlink(addr.sun_path);
1353 
1354 	if (bind(sock, (struct sockaddr *) &addr, len) == -1)
1355 	{
1356 		int			saved_errno = errno;
1357 
1358 		close(sock);
1359 		unlink(addr.sun_path);
1360 		ereport(FATAL,
1361 				(return_code(POOL_EXIT_FATAL),
1362 				 errmsg("failed to create watchdog command server socket"),
1363 				 errdetail("bind on \"%s\" failed with reason: \"%s\"", addr.sun_path, strerror(saved_errno))));
1364 	}
1365 
1366 	if (listen(sock, 5) < 0)
1367 	{
1368 		/* listen failed */
1369 		int			saved_errno = errno;
1370 
1371 		close(sock);
1372 		unlink(addr.sun_path);
1373 		ereport(FATAL,
1374 				(return_code(POOL_EXIT_FATAL),
1375 				 errmsg("failed to create watchdog command server socket"),
1376 				 errdetail("listen failed with reason: \"%s\"", strerror(saved_errno))));
1377 	}
1378 	on_proc_exit(FileUnlink, (Datum) pstrdup(addr.sun_path));
1379 	return sock;
1380 }
1381 
1382 static void
FileUnlink(int code,Datum path)1383 FileUnlink(int code, Datum path)
1384 {
1385 	char	   *filePath = (char *) path;
1386 
1387 	unlink(filePath);
1388 }
1389 
1390 
1391 /*
1392  * sets all the valid watchdog cluster descriptors to the fd_set.
1393  returns the fd_max */
1394 static int
prepare_fds(fd_set * rmask,fd_set * wmask,fd_set * emask)1395 prepare_fds(fd_set *rmask, fd_set *wmask, fd_set *emask)
1396 {
1397 	int			i;
1398 	ListCell   *lc;
1399 	int			fd_max = g_cluster.localNode->server_socket.sock;
1400 
1401 	FD_ZERO(rmask);
1402 	FD_ZERO(wmask);
1403 	FD_ZERO(emask);
1404 
1405 	/* local node server socket will set the read and exception fds */
1406 	FD_SET(g_cluster.localNode->server_socket.sock, rmask);
1407 	FD_SET(g_cluster.localNode->server_socket.sock, emask);
1408 
1409 	/* command server socket will set the read and exception fds */
1410 	FD_SET(g_cluster.command_server_sock, rmask);
1411 	FD_SET(g_cluster.command_server_sock, emask);
1412 	if (fd_max < g_cluster.command_server_sock)
1413 		fd_max = g_cluster.command_server_sock;
1414 
1415 	FD_SET(g_cluster.network_monitor_sock, rmask);
1416 	if (fd_max < g_cluster.network_monitor_sock)
1417 		fd_max = g_cluster.network_monitor_sock;
1418 
1419 	/*
1420 	 * set write fdset for all waiting for connection sockets, while already
1421 	 * connected will be only be waiting for read
1422 	 */
1423 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
1424 	{
1425 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1426 
1427 		if (wdNode->client_socket.sock > 0)
1428 		{
1429 			if (fd_max < wdNode->client_socket.sock)
1430 				fd_max = wdNode->client_socket.sock;
1431 
1432 			FD_SET(wdNode->client_socket.sock, emask);
1433 
1434 			if (wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
1435 				FD_SET(wdNode->client_socket.sock, wmask);
1436 			else
1437 				FD_SET(wdNode->client_socket.sock, rmask);
1438 		}
1439 		if (wdNode->server_socket.sock > 0)
1440 		{
1441 			if (fd_max < wdNode->server_socket.sock)
1442 				fd_max = wdNode->server_socket.sock;
1443 
1444 			FD_SET(wdNode->server_socket.sock, emask);
1445 			FD_SET(wdNode->server_socket.sock, rmask);
1446 		}
1447 	}
1448 
1449 	/*
1450 	 * I know this is getting complex but we need to add all incoming
1451 	 * unassigned connection sockets these one will go for reading
1452 	 */
1453 	foreach(lc, g_cluster.unidentified_socks)
1454 	{
1455 		SocketConnection *conn = lfirst(lc);
1456 		int			ui_sock = conn->sock;
1457 
1458 		if (ui_sock > 0)
1459 		{
1460 			FD_SET(ui_sock, rmask);
1461 			FD_SET(ui_sock, emask);
1462 			if (fd_max < ui_sock)
1463 				fd_max = ui_sock;
1464 		}
1465 	}
1466 
1467 	/* Add the notification connected clients */
1468 	foreach(lc, g_cluster.notify_clients)
1469 	{
1470 		int			ui_sock = lfirst_int(lc);
1471 
1472 		if (ui_sock > 0)
1473 		{
1474 			FD_SET(ui_sock, rmask);
1475 			FD_SET(ui_sock, emask);
1476 			if (fd_max < ui_sock)
1477 				fd_max = ui_sock;
1478 		}
1479 	}
1480 
1481 	/* Finally Add the command IPC sockets */
1482 	foreach(lc, g_cluster.ipc_command_socks)
1483 	{
1484 		int			ui_sock = lfirst_int(lc);
1485 
1486 		if (ui_sock > 0)
1487 		{
1488 			FD_SET(ui_sock, rmask);
1489 			FD_SET(ui_sock, emask);
1490 			if (fd_max < ui_sock)
1491 				fd_max = ui_sock;
1492 		}
1493 	}
1494 
1495 	return fd_max;
1496 }
1497 
1498 static int
read_sockets(fd_set * rmask,int pending_fds_count)1499 read_sockets(fd_set *rmask, int pending_fds_count)
1500 {
1501 	int			i,
1502 				count = 0;
1503 	List	   *socks_to_del = NIL;
1504 	ListCell   *lc;
1505 
1506 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
1507 	{
1508 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
1509 
1510 		if (is_socket_connection_connected(&wdNode->client_socket))
1511 		{
1512 			if (FD_ISSET(wdNode->client_socket.sock, rmask))
1513 			{
1514 				ereport(DEBUG2,
1515 						(errmsg("client socket of %s is ready for reading", wdNode->nodeName)));
1516 
1517 				WDPacketData *pkt = read_packet(&wdNode->client_socket);
1518 
1519 				if (pkt)
1520 				{
1521 					if (check_debug_request_kill_all_communication() == false &&
1522 						check_debug_request_kill_all_receivers() == false)
1523 					{
1524 						watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1525 						/* since a packet is received reset last sent time */
1526 						wdNode->last_sent_time.tv_sec = 0;
1527 						wdNode->last_sent_time.tv_usec = 0;
1528 					}
1529 					free_packet(pkt);
1530 				}
1531 				else
1532 				{
1533 					ereport(LOG,
1534 							(errmsg("client socket of %s is closed", wdNode->nodeName)));
1535 				}
1536 
1537 				count++;
1538 				if (count >= pending_fds_count)
1539 					return count;
1540 			}
1541 		}
1542 		if (is_socket_connection_connected(&wdNode->server_socket))
1543 		{
1544 			if (FD_ISSET(wdNode->server_socket.sock, rmask))
1545 			{
1546 				ereport(DEBUG2,
1547 						(errmsg("server socket of %s is ready for reading", wdNode->nodeName)));
1548 				WDPacketData *pkt = read_packet(&wdNode->server_socket);
1549 
1550 				if (pkt)
1551 				{
1552 					if (check_debug_request_kill_all_communication() == false &&
1553 						check_debug_request_kill_all_receivers() == false)
1554 					{
1555 						watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1556 						/* since a packet is received reset last sent time */
1557 						wdNode->last_sent_time.tv_sec = 0;
1558 						wdNode->last_sent_time.tv_usec = 0;
1559 					}
1560 					free_packet(pkt);
1561 				}
1562 				else
1563 				{
1564 					ereport(LOG,
1565 							(errmsg("outbound socket of %s is closed", wdNode->nodeName)));
1566 				}
1567 
1568 				count++;
1569 				if (count >= pending_fds_count)
1570 					return count;
1571 			}
1572 		}
1573 	}
1574 
1575 	foreach(lc, g_cluster.unidentified_socks)
1576 	{
1577 		SocketConnection *conn = lfirst(lc);
1578 
1579 		if (conn->sock > 0 && FD_ISSET(conn->sock, rmask))
1580 		{
1581 			WDPacketData *pkt;
1582 
1583 			ereport(DEBUG2,
1584 					(errmsg("un-identified socket %d is ready for reading", conn->sock)));
1585 			/* we only entertain ADD NODE messages from unidentified sockets */
1586 			pkt = read_packet_of_type(conn, WD_ADD_NODE_MESSAGE);
1587 			if (pkt)
1588 			{
1589 				struct timeval 	previous_startup_time;
1590 				char	   *authkey = NULL;
1591 				WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);
1592 
1593 				if (tempNode)
1594 				{
1595 					WatchdogNode *wdNode;
1596 					bool		found = false;
1597 					bool		authenticated = false;
1598 
1599 					if (tempNode->pgpool_node_id == pool_config->pgpool_node_id)
1600 					{
1601 						ereport(ERROR,
1602 								(errmsg("the pgpool node id configured on node \"%s\" cannot be same as local node", tempNode->nodeName),
1603 								 errdetail("this node id is \"%d\" while local node is \"%d\"",
1604 										   tempNode->pgpool_node_id,
1605 										   pool_config->pgpool_node_id)));
1606 					}
1607 
1608 					print_watchdog_node_info(tempNode);
1609 					authenticated = verify_authhash_for_node(tempNode, authkey);
1610 					ereport(DEBUG1,
1611 							(errmsg("ADD NODE MESSAGE from hostname:\"%s\" port:%d pgpool_port:%d", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port)));
1612 					/* verify this node */
1613 					if (authenticated)
1614 					{
1615 						WD_STATES oldNodeState = WD_DEAD;
1616 						for (i = 0; i < g_cluster.remoteNodeCount; i++)
1617 						{
1618 							wdNode = &(g_cluster.remoteNodes[i]);
1619 
1620 							if ((wdNode->wd_port == tempNode->wd_port && wdNode->pgpool_port == tempNode->pgpool_port &&
1621 								wdNode->pgpool_node_id == tempNode->pgpool_node_id) &&
1622 								((strcmp(wdNode->hostname, conn->addr) == 0) || (strcmp(wdNode->hostname, tempNode->hostname) == 0)))
1623 							{
1624 								/* We have found the match */
1625 								found = true;
1626 								previous_startup_time.tv_sec = wdNode->startup_time.tv_sec;
1627 								oldNodeState = wdNode->state;
1628 
1629 								close_socket_connection(&wdNode->server_socket);
1630 								strlcpy(wdNode->delegate_ip, tempNode->delegate_ip, WD_MAX_HOST_NAMELEN);
1631 								strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);
1632 								strlcpy(wdNode->pgp_version, tempNode->pgp_version, MAX_VERSION_STR_LEN);
1633 								wdNode->state = tempNode->state;
1634 								wdNode->wd_data_major_version = tempNode->wd_data_major_version;
1635 								wdNode->wd_data_minor_version = tempNode->wd_data_minor_version;
1636 								wdNode->startup_time.tv_sec = tempNode->startup_time.tv_sec;
1637 								wdNode->wd_priority = tempNode->wd_priority;
1638 								wdNode->server_socket = *conn;
1639 								wdNode->server_socket.sock_state = WD_SOCK_CONNECTED;
1640 								if (tempNode->current_state_time.tv_sec)
1641 								{
1642 									wdNode->current_state_time.tv_sec = tempNode->current_state_time.tv_sec;
1643 									wdNode->escalated = tempNode->escalated;
1644 									wdNode->standby_nodes_count = tempNode->standby_nodes_count;
1645 									wdNode->quorum_status = tempNode->quorum_status;
1646 								}
1647 								break;
1648 							}
1649 						}
1650 						if (found)
1651 						{
1652 							restore_cluster_membership_of_node(wdNode);
1653 							/* reply with node info message */
1654 							ereport(LOG,
1655 									(errmsg("new node joined the cluster hostname:\"%s\" port:%d pgpool_port:%d", wdNode->hostname,
1656 											wdNode->wd_port,
1657 											wdNode->pgpool_port),
1658 									 errdetail("Pgpool-II version:\"%s\" watchdog messaging version: %d.%d",
1659 											   wdNode->pgp_version,
1660 											   wdNode->wd_data_major_version,
1661 											   wdNode->wd_data_minor_version)));
1662 
1663 							if (oldNodeState == WD_SHUTDOWN)
1664 							{
1665 								ereport(LOG,
1666 										(errmsg("The newly joined node:\"%s\" had left the cluster because it was shutdown",wdNode->nodeName)));
1667 								watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1668 
1669 							}
1670 							else if (oldNodeState == WD_LOST)
1671 							{
1672 								ereport(LOG,
1673 										(errmsg("The newly joined node:\"%s\" had left the cluster because it was lost",wdNode->nodeName),
1674 										 errdetail("lost reason was \"%s\" and startup time diff = %d",
1675 												   wd_node_lost_reasons[wdNode->node_lost_reason],
1676 												   abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)))));
1677 
1678 								if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 &&
1679 									wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
1680 								{
1681 									ereport(LOG,
1682 										(errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
1683 											 errdetail("only lifecheck process can mark this node alive again")));
1684 									/* restore the node's lost state */
1685 									wdNode->state = oldNodeState;
1686 								}
1687 								else
1688 									watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
1689 
1690 							}
1691 
1692 						}
1693 						else
1694 							ereport(NOTICE,
1695 									(errmsg("add node from hostname:\"%s\" port:%d pgpool_port:%d rejected.", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port),
1696 									 errdetail("verify the other watchdog node configurations")));
1697 					}
1698 					else
1699 					{
1700 						ereport(NOTICE,
1701 								(errmsg("authentication failed for add node from hostname:\"%s\" port:%d pgpool_port:%d", tempNode->hostname, tempNode->wd_port, tempNode->pgpool_port),
1702 								 errdetail("make sure wd_authkey configuration is same on all nodes")));
1703 					}
1704 
1705 					if (found == false || authenticated == false)
1706 					{
1707 						/*
1708 						 * reply with reject message, We do not need to go to
1709 						 * state processor
1710 						 */
1711 						/* For now, create a empty temp node. */
1712 						WatchdogNode tmpNode;
1713 
1714 						tmpNode.client_socket = *conn;
1715 						tmpNode.client_socket.sock_state = WD_SOCK_CONNECTED;
1716 						tmpNode.server_socket.sock = -1;
1717 						tmpNode.server_socket.sock_state = WD_SOCK_UNINITIALIZED;
1718 						reply_with_minimal_message(&tmpNode, WD_REJECT_MESSAGE, pkt);
1719 						close_socket_connection(conn);
1720 					}
1721 					pfree(tempNode);
1722 				}
1723 				else
1724 				{
1725 					/*
1726 					 * Probably some invalid data in the add message
1727 					 */
1728 					WatchdogNode tmpNode;
1729 
1730 					ereport(LOG,
1731 							(errmsg("unable to parse the add node message")));
1732 					tmpNode.client_socket = *conn;
1733 					tmpNode.client_socket.sock_state = WD_SOCK_CONNECTED;
1734 					tmpNode.server_socket.sock = -1;
1735 					tmpNode.server_socket.sock_state = WD_SOCK_UNINITIALIZED;
1736 					reply_with_minimal_message(&tmpNode, WD_REJECT_MESSAGE, pkt);
1737 					close_socket_connection(conn);
1738 				}
1739 				if (authkey)
1740 					pfree(authkey);
1741 				free_packet(pkt);
1742 				count++;
1743 			}
1744 			socks_to_del = lappend(socks_to_del, conn);
1745 			count++;
1746 			if (count >= pending_fds_count)
1747 				break;
1748 		}
1749 	}
1750 
1751 	/* delete all the sockets from unidentified list which are now identified */
1752 	foreach(lc, socks_to_del)
1753 	{
1754 		g_cluster.unidentified_socks = list_delete_ptr(g_cluster.unidentified_socks, lfirst(lc));
1755 	}
1756 
1757 	list_free_deep(socks_to_del);
1758 	socks_to_del = NULL;
1759 
1760 	if (count >= pending_fds_count)
1761 		return count;
1762 
1763 	foreach(lc, g_cluster.ipc_command_socks)
1764 	{
1765 		int			command_sock = lfirst_int(lc);
1766 
1767 		if (command_sock > 0 && FD_ISSET(command_sock, rmask))
1768 		{
1769 			bool		remove_sock = false;
1770 
1771 			read_ipc_socket_and_process(command_sock, &remove_sock);
1772 			if (remove_sock)
1773 			{
1774 				/* Also locate the command if it has this socket */
1775 				WDCommandData *ipcCommand = get_wd_IPC_command_from_socket(command_sock);
1776 
1777 				if (ipcCommand)
1778 				{
1779 					/*
1780 					 * special case we want to remove the socket from
1781 					 * ipc_command_sock list manually, so mark the issuing
1782 					 * socket of ipcCommand to invalid value
1783 					 */
1784 					ipcCommand->sourceIPCSocket = -1;
1785 				}
1786 				close(command_sock);
1787 				socks_to_del = lappend_int(socks_to_del, command_sock);
1788 			}
1789 			count++;
1790 			if (count >= pending_fds_count)
1791 				break;
1792 		}
1793 	}
1794 	/* delete all the sockets from unidentified list which are now identified */
1795 	foreach(lc, socks_to_del)
1796 	{
1797 		g_cluster.ipc_command_socks = list_delete_int(g_cluster.ipc_command_socks, lfirst_int(lc));
1798 	}
1799 
1800 	list_free(socks_to_del);
1801 	socks_to_del = NULL;
1802 
1803 	if (count >= pending_fds_count)
1804 		return count;
1805 
1806 	foreach(lc, g_cluster.notify_clients)
1807 	{
1808 		int			notify_sock = lfirst_int(lc);
1809 
1810 		if (notify_sock > 0 && FD_ISSET(notify_sock, rmask))
1811 		{
1812 			bool		remove_sock = false;
1813 
1814 			read_ipc_socket_and_process(notify_sock, &remove_sock);
1815 			if (remove_sock)
1816 			{
1817 				close(notify_sock);
1818 				socks_to_del = lappend_int(socks_to_del, notify_sock);
1819 			}
1820 			count++;
1821 			if (count >= pending_fds_count)
1822 				break;
1823 		}
1824 	}
1825 	/* delete all the sockets from unidentified list which are now identified */
1826 	foreach(lc, socks_to_del)
1827 	{
1828 		g_cluster.notify_clients = list_delete_int(g_cluster.notify_clients, lfirst_int(lc));
1829 	}
1830 
1831 	list_free(socks_to_del);
1832 	socks_to_del = NULL;
1833 
1834 
1835 	/* Finally check if something waits us on interface monitoring socket */
1836 	if (g_cluster.network_monitor_sock > 0 && FD_ISSET(g_cluster.network_monitor_sock, rmask))
1837 	{
1838 		bool		deleted;
1839 		bool		link_event;
1840 
1841 		if (read_interface_change_event(g_cluster.network_monitor_sock, &link_event, &deleted))
1842 		{
1843 			ereport(DEBUG1,
1844 					(errmsg("network event received"),
1845 					 errdetail("deleted = %s Link change event = %s",
1846 							   deleted ? "YES" : "NO",
1847 							   link_event ? "YES" : "NO")));
1848 			if (link_event)
1849 			{
1850 				if (deleted)
1851 					watchdog_state_machine(WD_EVENT_NW_LINK_IS_INACTIVE, NULL, NULL, NULL);
1852 				else
1853 					watchdog_state_machine(WD_EVENT_NW_LINK_IS_ACTIVE, NULL, NULL, NULL);
1854 			}
1855 			else
1856 			{
1857 				if (deleted)
1858 					watchdog_state_machine(WD_EVENT_NW_IP_IS_REMOVED, NULL, NULL, NULL);
1859 				else
1860 					watchdog_state_machine(WD_EVENT_NW_IP_IS_ASSIGNED, NULL, NULL, NULL);
1861 			}
1862 		}
1863 		count++;
1864 	}
1865 	return count;
1866 }
1867 
1868 static bool
write_ipc_command_with_result_data(WDCommandData * ipcCommand,char type,char * data,int len)1869 write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char *data, int len)
1870 {
1871 	WDPacketData pkt;
1872 
1873 	pkt.data = data;
1874 	pkt.len = len;
1875 	pkt.type = type;
1876 	pkt.command_id = 0;			/* command Id is not used in IPC packets */
1877 
1878 	if (ipcCommand == NULL || ipcCommand->commandSource != COMMAND_SOURCE_IPC || ipcCommand->sourceIPCSocket <= 0)
1879 	{
1880 		ereport(DEBUG1,
1881 				(errmsg("not replying to IPC, Invalid IPC command.")));
1882 		return false;
1883 	}
1884 	/* DEBUG AID */
1885 	if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE &&
1886 		(check_debug_request_kill_all_senders() ||
1887 		check_debug_request_kill_all_communication()))
1888 		return false;
1889 
1890 	return write_packet_to_socket(ipcCommand->sourceIPCSocket, &pkt, true);
1891 }
1892 
create_command_object(int packet_data_length)1893 static WDCommandData * create_command_object(int packet_data_length)
1894 {
1895 	MemoryContext mCxt,
1896 				oldCxt;
1897 	WDCommandData *wdCommand;
1898 
1899 	/* wd command lives in its own memory context */
1900 	mCxt = AllocSetContextCreate(TopMemoryContext,
1901 								 "WDCommand",
1902 								 ALLOCSET_SMALL_MINSIZE,
1903 								 ALLOCSET_SMALL_INITSIZE,
1904 								 ALLOCSET_SMALL_MAXSIZE);
1905 	oldCxt = MemoryContextSwitchTo(mCxt);
1906 
1907 	wdCommand = palloc0(sizeof(WDCommandData));
1908 	wdCommand->memoryContext = mCxt;
1909 	if (packet_data_length > 0)
1910 		wdCommand->sourcePacket.data = palloc(packet_data_length);
1911 	wdCommand->commandPacket.type = WD_NO_MESSAGE;
1912 	wdCommand->sourcePacket.type = WD_NO_MESSAGE;
1913 	MemoryContextSwitchTo(oldCxt);
1914 	return wdCommand;
1915 }
1916 
1917 static bool
read_ipc_socket_and_process(int sock,bool * remove_socket)1918 read_ipc_socket_and_process(int sock, bool *remove_socket)
1919 {
1920 	char		type;
1921 	int			data_len,
1922 				ret;
1923 	WDCommandData *ipcCommand;
1924 	IPC_CMD_PROCESS_RES res;
1925 
1926 	*remove_socket = true;
1927 
1928 	/* 1st byte is command type */
1929 	ret = socket_read(sock, &type, sizeof(char), 0);
1930 	if (ret == 0)				/* remote end has closed the connection */
1931 		return false;
1932 
1933 	if (ret != sizeof(char))
1934 	{
1935 		ereport(WARNING,
1936 				(errmsg("error reading from IPC socket"),
1937 				 errdetail("read from socket failed with error \"%m\"")));
1938 		return false;
1939 	}
1940 
1941 	/* We should have data length */
1942 	ret = socket_read(sock, &data_len, sizeof(int), 0);
1943 	if (ret != sizeof(int))
1944 	{
1945 		ereport(WARNING,
1946 				(errmsg("error reading from IPC socket"),
1947 				 errdetail("read from socket failed with error \"%m\"")));
1948 		return false;
1949 	}
1950 
1951 	data_len = ntohl(data_len);
1952 	/* see if we have enough information to process this command */
1953 	ipcCommand = create_command_object(data_len);
1954 	ipcCommand->sourceIPCSocket = sock;
1955 	ipcCommand->commandSource = COMMAND_SOURCE_IPC;
1956 	ipcCommand->sourceWdNode = g_cluster.localNode;
1957 	ipcCommand->sourcePacket.type = type;
1958 	ipcCommand->sourcePacket.len = data_len;
1959 	gettimeofday(&ipcCommand->commandTime, NULL);
1960 
1961 	if (data_len > 0)
1962 	{
1963 		if (socket_read(sock, ipcCommand->sourcePacket.data, data_len, 0) <= 0)
1964 		{
1965 			ereport(LOG,
1966 					(errmsg("error reading IPC from socket"),
1967 					 errdetail("read from socket failed with error \"%m\"")));
1968 			return false;
1969 		}
1970 	}
1971 
1972 	res = process_IPC_command(ipcCommand);
1973 	if (res == IPC_CMD_PROCESSING)
1974 	{
1975 		/*
1976 		 * The command still needs further processing store it in the list
1977 		 */
1978 		MemoryContext oldCxt;
1979 
1980 		*remove_socket = false;
1981 		oldCxt = MemoryContextSwitchTo(TopMemoryContext);
1982 		g_cluster.ipc_commands = lappend(g_cluster.ipc_commands, ipcCommand);
1983 		MemoryContextSwitchTo(oldCxt);
1984 		return true;
1985 	}
1986 	else if (res != IPC_CMD_COMPLETE)
1987 	{
1988 		char		res_type;
1989 		char	   *data = NULL;
1990 		int			data_len = 0;
1991 
1992 		switch (res)
1993 		{
1994 			case IPC_CMD_TRY_AGAIN:
1995 				res_type = WD_IPC_CMD_CLUSTER_IN_TRAN;
1996 				break;
1997 			case IPC_CMD_ERROR:
1998 				ereport(NOTICE,
1999 						(errmsg("IPC command returned error")));
2000 				res_type = WD_IPC_CMD_RESULT_BAD;
2001 				break;
2002 			case IPC_CMD_OK:
2003 				res_type = WD_IPC_CMD_RESULT_OK;
2004 				break;
2005 			default:
2006 				res_type = WD_IPC_CMD_RESULT_BAD;
2007 				ereport(NOTICE,
2008 						(errmsg("unexpected IPC processing result")));
2009 				break;
2010 		}
2011 		if (ipcCommand->errorMessage)
2012 		{
2013 			data = get_wd_simple_message_json(ipcCommand->errorMessage);
2014 			data_len = strlen(data) + 1;
2015 		}
2016 
2017 		if (write_ipc_command_with_result_data(ipcCommand, res_type, data, data_len))
2018 		{
2019 			ereport(NOTICE,
2020 					(errmsg("error writing to IPC socket")));
2021 		}
2022 		if (data)
2023 			pfree(data);
2024 	}
2025 
2026 	/*
2027 	 * Delete the Command structure, it is as simple as to delete the memory
2028 	 * context
2029 	 */
2030 	MemoryContextDelete(ipcCommand->memoryContext);
2031 	return (res != IPC_CMD_ERROR);
2032 }
2033 
process_IPC_command(WDCommandData * ipcCommand)2034 static IPC_CMD_PROCESS_RES process_IPC_command(WDCommandData * ipcCommand)
2035 {
2036 	/* authenticate the client first */
2037 	if (check_and_report_IPC_authentication(ipcCommand) == false)
2038 	{
2039 		/* authentication error is already reported to the caller */
2040 		return IPC_CMD_ERROR;
2041 	}
2042 
2043 	switch (ipcCommand->sourcePacket.type)
2044 	{
2045 
2046 		case WD_NODE_STATUS_CHANGE_COMMAND:
2047 			return process_IPC_nodeStatusChange_command(ipcCommand);
2048 			break;
2049 
2050 		case WD_REGISTER_FOR_NOTIFICATION:
2051 			/* Add this socket to the notify socket list */
2052 			g_cluster.notify_clients = lappend_int(g_cluster.notify_clients, ipcCommand->sourceIPCSocket);
2053 			/* The command is completed successfully */
2054 			return IPC_CMD_COMPLETE;
2055 			break;
2056 
2057 		case WD_GET_NODES_LIST_COMMAND:
2058 			return process_IPC_nodeList_command(ipcCommand);
2059 			break;
2060 
2061 		case WD_IPC_FAILOVER_COMMAND:
2062 			return process_IPC_failover_command(ipcCommand);
2063 
2064 		case WD_IPC_ONLINE_RECOVERY_COMMAND:
2065 			return process_IPC_online_recovery(ipcCommand);
2066 			break;
2067 
2068 		case WD_FAILOVER_INDICATION:
2069 			return process_IPC_failover_indication(ipcCommand);
2070 			break;
2071 
2072 		case WD_GET_LEADER_DATA_REQUEST:
2073 			return process_IPC_data_request_from_leader(ipcCommand);
2074 			break;
2075 
2076 		case WD_GET_RUNTIME_VARIABLE_VALUE:
2077 			return process_IPC_get_runtime_variable_value_request(ipcCommand);
2078 			break;
2079 
2080 		case WD_EXECUTE_CLUSTER_COMMAND:
2081 			return process_IPC_execute_cluster_command(ipcCommand);
2082 			break;
2083 
2084 		default:
2085 			ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext, "unknown IPC command type");
2086 			break;
2087 	}
2088 	return IPC_CMD_ERROR;
2089 }
2090 
2091 static IPC_CMD_PROCESS_RES
process_IPC_execute_cluster_command(WDCommandData * ipcCommand)2092 process_IPC_execute_cluster_command(WDCommandData * ipcCommand)
2093 {
2094 	/* get the json for node list */
2095 	char 	*clusterCommand = NULL;
2096 	List 	*args_list = NULL;
2097 
2098 	if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2099 		return IPC_CMD_ERROR;
2100 
2101 	if (!parse_wd_exec_cluster_command_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2102 									   &clusterCommand, &args_list))
2103 	{
2104 		goto ERROR_EXIT;
2105 	}
2106 	if (strcasecmp(WD_COMMAND_SHUTDOWN_CLUSTER, clusterCommand) == 0)
2107 	{
2108 		ereport(LOG,
2109 				(errmsg("Watchdog has received shutdown cluster command from IPC channel")));
2110 	}
2111 	else if (strcasecmp(WD_COMMAND_RELOAD_CONFIG_CLUSTER, clusterCommand) == 0)
2112 	{
2113 		ereport(LOG,
2114 				(errmsg("Watchdog has received reload config cluster command from IPC channel")));
2115 	}
2116 	else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
2117 	{
2118 		ereport(LOG,
2119 				(errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel")));
2120 		if (get_local_node_state() != WD_COORDINATOR)
2121 		{
2122 			ereport(LOG,
2123 					(errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node")));
2124 			goto ERROR_EXIT;
2125 		}
2126 	}
2127 	else
2128 	{
2129 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2130 													   "unknown cluster command requested");
2131 		goto ERROR_EXIT;
2132 	}
2133 
2134 	/*
2135 	 * Just broadcast the execute command request to destination node
2136 	 * Processing the command on the local node is the responsibility of caller
2137 	 * process
2138 	 */
2139 	reply_with_message(NULL, WD_EXECUTE_COMMAND_REQUEST,
2140 					   ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2141 					   NULL);
2142 
2143 	if (args_list)
2144 		list_free_deep(args_list);
2145 
2146 	pfree(clusterCommand);
2147 	return IPC_CMD_OK;
2148 
2149 ERROR_EXIT:
2150 	if (args_list)
2151 		list_free_deep(args_list);
2152 	if (clusterCommand)
2153 		pfree(clusterCommand);
2154 	return IPC_CMD_ERROR;
2155 }
2156 
process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand)2157 static IPC_CMD_PROCESS_RES process_IPC_get_runtime_variable_value_request(WDCommandData * ipcCommand)
2158 {
2159 	/* get the json for node list */
2160 	JsonNode   *jNode = NULL;
2161 	char	   *requestVarName = NULL;
2162 
2163 	if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2164 		return IPC_CMD_ERROR;
2165 
2166 	json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
2167 
2168 	/* The root node must be object */
2169 	if (root == NULL || root->type != json_object)
2170 	{
2171 		json_value_free(root);
2172 		ereport(NOTICE,
2173 				(errmsg("failed to process get local variable IPC command"),
2174 				 errdetail("unable to parse JSON data")));
2175 		return IPC_CMD_ERROR;
2176 	}
2177 
2178 	requestVarName = json_get_string_value_for_key(root, WD_JSON_KEY_VARIABLE_NAME);
2179 
2180 	if (requestVarName == NULL)
2181 	{
2182 		json_value_free(root);
2183 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2184 													   "requested variable name is null");
2185 		return IPC_CMD_ERROR;
2186 	}
2187 
2188 	jNode = jw_create_with_object(true);
2189 
2190 	if (strcasecmp(WD_RUNTIME_VAR_WD_STATE, requestVarName) == 0)
2191 	{
2192 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_INT);
2193 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, g_cluster.localNode->state);
2194 	}
2195 	else if (strcasecmp(WD_RUNTIME_VAR_QUORUM_STATE, requestVarName) == 0)
2196 	{
2197 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_INT);
2198 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
2199 	}
2200 	else if (strcasecmp(WD_RUNTIME_VAR_ESCALATION_STATE, requestVarName) == 0)
2201 	{
2202 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA_TYPE, VALUE_DATA_TYPE_BOOL);
2203 		jw_put_int(jNode, WD_JSON_KEY_VALUE_DATA, g_cluster.localNode->escalated);
2204 	}
2205 	else
2206 	{
2207 		json_value_free(root);
2208 		jw_destroy(jNode);
2209 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
2210 													   "unknown variable requested");
2211 		return IPC_CMD_ERROR;
2212 	}
2213 
2214 	jw_finish_document(jNode);
2215 	json_value_free(root);
2216 	write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2217 									   jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2218 	jw_destroy(jNode);
2219 	return IPC_CMD_COMPLETE;
2220 }
2221 
process_IPC_nodeList_command(WDCommandData * ipcCommand)2222 static IPC_CMD_PROCESS_RES process_IPC_nodeList_command(WDCommandData * ipcCommand)
2223 {
2224 	/* get the json for node list */
2225 	JsonNode   *jNode = NULL;
2226 	int			NodeID = -1;
2227 
2228 	if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2229 		return IPC_CMD_ERROR;
2230 
2231 	json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
2232 
2233 	/* The root node must be object */
2234 	if (root == NULL || root->type != json_object)
2235 	{
2236 		json_value_free(root);
2237 		ereport(NOTICE,
2238 				(errmsg("failed to process GET NODE LIST IPC command"),
2239 				 errdetail("unable to parse json data")));
2240 		return IPC_CMD_ERROR;
2241 	}
2242 
2243 	if (json_get_int_value_for_key(root, "NodeID", &NodeID))
2244 	{
2245 		json_value_free(root);
2246 		return IPC_CMD_ERROR;
2247 	}
2248 
2249 	json_value_free(root);
2250 	jNode = get_node_list_json(NodeID);
2251 	write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2252 									   jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2253 	jw_destroy(jNode);
2254 	return IPC_CMD_COMPLETE;
2255 }
2256 
process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand)2257 static IPC_CMD_PROCESS_RES process_IPC_nodeStatusChange_command(WDCommandData * ipcCommand)
2258 {
2259 	int			nodeStatus;
2260 	int			nodeID;
2261 	char	   *message = NULL;
2262 	bool		ret;
2263 
2264 	if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
2265 		return IPC_CMD_ERROR;
2266 
2267 	ret = parse_node_status_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len, &nodeID, &nodeStatus, &message);
2268 
2269 	if (ret == false)
2270 	{
2271 		ereport(NOTICE,
2272 				(errmsg("failed to process NODE STATE CHANGE IPC command"),
2273 				 errdetail("unable to parse JSON data")));
2274 		return IPC_CMD_ERROR;
2275 	}
2276 
2277 	if (message)
2278 	{
2279 		ereport(LOG,
2280 				(errmsg("received node status change ipc message"),
2281 				 errdetail("%s", message)));
2282 		pfree(message);
2283 	}
2284 	if (fire_node_status_event(nodeID, nodeStatus) == false)
2285 		return IPC_CMD_ERROR;
2286 
2287 	return IPC_CMD_COMPLETE;
2288 }
2289 
2290 static bool
fire_node_status_event(int nodeID,int nodeStatus)2291 fire_node_status_event(int nodeID, int nodeStatus)
2292 {
2293 	WatchdogNode *wdNode = NULL;
2294 
2295 	if (g_cluster.localNode->pgpool_node_id == nodeID)
2296 	{
2297 		wdNode = g_cluster.localNode;
2298 	}
2299 	else
2300 	{
2301 		int			i;
2302 
2303 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
2304 		{
2305 			if (nodeID == g_cluster.remoteNodes[i].pgpool_node_id)
2306 			{
2307 				wdNode = &g_cluster.remoteNodes[i];
2308 				break;
2309 			}
2310 		}
2311 	}
2312 	if (wdNode == NULL)
2313 	{
2314 		ereport(LOG,
2315 				(errmsg("failed to process node status change event"),
2316 				 errdetail("invalid Node ID in the event")));
2317 		return false;
2318 	}
2319 
2320 	if (nodeStatus == WD_LIFECHECK_NODE_STATUS_DEAD)
2321 	{
2322 		ereport(DEBUG1,
2323 				(errmsg("processing node status changed to DEAD event for node ID:%d", nodeID)));
2324 
2325 		if (wdNode == g_cluster.localNode)
2326 			watchdog_state_machine(WD_EVENT_LOCAL_NODE_LOST, wdNode, NULL, NULL);
2327 		else
2328 		{
2329 			wdNode->node_lost_reason = NODE_LOST_BY_LIFECHECK;
2330 			watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
2331 		}
2332 	}
2333 	else if (nodeStatus == WD_LIFECHECK_NODE_STATUS_ALIVE)
2334 	{
2335 		ereport(DEBUG1,
2336 				(errmsg("processing node status changed to ALIVE event for node ID:%d", nodeID)));
2337 
2338 		if (wdNode == g_cluster.localNode)
2339 			watchdog_state_machine(WD_EVENT_LOCAL_NODE_FOUND, wdNode, NULL, NULL);
2340 		else
2341 			watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
2342 	}
2343 	else
2344 		ereport(LOG,
2345 				(errmsg("failed to process node status change event"),
2346 				 errdetail("invalid event type")));
2347 	return true;
2348 }
2349 
2350 /*
2351  * Free the failover object
2352  */
2353 static void
remove_failover_object(WDFailoverObject * failoverObj)2354 remove_failover_object(WDFailoverObject * failoverObj)
2355 {
2356 	ereport(DEBUG1,
2357 			(errmsg("removing failover request from %d nodes with ID:%d", failoverObj->request_count, failoverObj->failoverID)));
2358 	g_cluster.wdCurrentFailovers = list_delete_ptr(g_cluster.wdCurrentFailovers, failoverObj);
2359 	list_free(failoverObj->requestingNodes);
2360 	pfree(failoverObj->nodeList);
2361 	pfree(failoverObj);
2362 }
2363 
2364 
2365 /* if the wdNode is NULL. The function removes all failover objects */
2366 static void
clear_all_failovers(void)2367 clear_all_failovers(void)
2368 {
2369 	ListCell   *lc;
2370 	List	   *failovers_to_del = list_copy(g_cluster.wdCurrentFailovers);
2371 
2372 	ereport(DEBUG1,
2373 			(errmsg("Removing all failover objects")));
2374 
2375 	foreach(lc, failovers_to_del)
2376 	{
2377 		WDFailoverObject *failoverObj = lfirst(lc);
2378 
2379 		remove_failover_object(failoverObj);
2380 	}
2381 	list_free(failovers_to_del);
2382 }
2383 
2384 /* Remove the over stayed failover objects */
2385 static void
service_expired_failovers(void)2386 service_expired_failovers(void)
2387 {
2388 	ListCell   *lc;
2389 	List	   *failovers_to_del = NULL;
2390 	bool		need_to_resign = false;
2391 	struct timeval currTime;
2392 
2393 	if (get_local_node_state() != WD_COORDINATOR)
2394 		return;
2395 
2396 	gettimeofday(&currTime, NULL);
2397 
2398 	foreach(lc, g_cluster.wdCurrentFailovers)
2399 	{
2400 		WDFailoverObject *failoverObj = lfirst(lc);
2401 
2402 		if (failoverObj)
2403 		{
2404 			if (WD_TIME_DIFF_SEC(currTime, failoverObj->startTime) >= FAILOVER_COMMAND_FINISH_TIMEOUT)
2405 			{
2406 				failovers_to_del = lappend(failovers_to_del, failoverObj);
2407 				ereport(DEBUG1,
2408 					(errmsg("failover request from %d nodes with ID:%d is expired", failoverObj->request_count, failoverObj->failoverID),
2409 						 errdetail("marking the failover object for removal")));
2410 				if (!need_to_resign && failoverObj->reqKind == NODE_DOWN_REQUEST)
2411 				{
2412 					ListCell   *lc;
2413 					/* search the in the requesting node list if we are also the ones
2414 					 * who think the failover must have been done
2415 					 */
2416 					foreach(lc, failoverObj->requestingNodes)
2417 					{
2418 						WatchdogNode *reqWdNode = lfirst(lc);
2419 						if (g_cluster.localNode == reqWdNode)
2420 						{
2421 							/* verify if that node requested by us is now quarantined */
2422 							int	 i;
2423 							for (i = 0; i < failoverObj->nodesCount; i++)
2424 							{
2425 								int node_id = failoverObj->nodeList[i];
2426 								if (node_id != -1)
2427 								{
2428 									if (Req_info->primary_node_id == -1 &&
2429 										BACKEND_INFO(node_id).quarantine == true &&
2430 										BACKEND_INFO(node_id).role == ROLE_PRIMARY)
2431 									{
2432 										ereport(LOG,
2433 												(errmsg("We are not able to build consensus for our primary node failover request, got %d votes only for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
2434 												 errdetail("resigning from the coordinator")));
2435 										need_to_resign = true;
2436 									}
2437 								}
2438 							}
2439 						}
2440 					}
2441 				}
2442 			}
2443 		}
2444 	}
2445 
2446 	/* delete the failover objects */
2447 	foreach(lc, failovers_to_del)
2448 	{
2449 		WDFailoverObject *failoverObj = lfirst(lc);
2450 
2451 		remove_failover_object(failoverObj);
2452 	}
2453 	list_free(failovers_to_del);
2454 	if (need_to_resign)
2455 	{
2456 		/* lower my wd_priority for moment */
2457 		g_cluster.localNode->wd_priority = -1;
2458 		send_cluster_service_message(NULL, NULL, CLUSTER_IAM_RESIGNING_FROM_LEADER);
2459 		set_state(WD_JOINING);
2460 	}
2461 }
2462 
/*
 * Linear search: report whether 'value' occurs among the first 'count'
 * elements of 'intArray'.  A count of zero or less never matches and the
 * array is not touched in that case.
 */
static bool
does_int_array_contains_value(int *intArray, int count, int value)
{
	int			pos = 0;

	while (pos < count)
	{
		if (intArray[pos] == value)
			return true;
		pos++;
	}
	return false;
}
2475 
get_failover_object(POOL_REQUEST_KIND reqKind,int nodesCount,int * nodeList)2476 static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList)
2477 {
2478 	ListCell   *lc;
2479 
2480 	foreach(lc, g_cluster.wdCurrentFailovers)
2481 	{
2482 		WDFailoverObject *failoverObj = lfirst(lc);
2483 
2484 		if (failoverObj)
2485 		{
2486 			if (failoverObj->reqKind == reqKind && failoverObj->nodesCount == nodesCount)
2487 			{
2488 				bool		equal = true;
2489 				int			i;
2490 
2491 				for (i = 0; i < nodesCount; i++)
2492 				{
2493 					if (does_int_array_contains_value(nodeList, nodesCount, failoverObj->nodeList[i]) == false)
2494 					{
2495 						equal = false;
2496 						break;
2497 					}
2498 				}
2499 				if (equal)
2500 					return failoverObj;
2501 			}
2502 		}
2503 	}
2504 	return NULL;
2505 }
2506 
2507 static void
process_remote_failover_command_on_coordinator(WatchdogNode * wdNode,WDPacketData * pkt)2508 process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt)
2509 {
2510 	if (get_local_node_state() != WD_COORDINATOR)
2511 	{
2512 		/* only lock holder can resign itself */
2513 		reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
2514 	}
2515 	else
2516 	{
2517 		IPC_CMD_PROCESS_RES res;
2518 		WDCommandData *ipcCommand = create_command_object(pkt->len);
2519 
2520 		ipcCommand->sourcePacket.type = pkt->type;
2521 		ipcCommand->sourcePacket.len = pkt->len;
2522 		ipcCommand->sourcePacket.command_id = pkt->command_id;
2523 
2524 		if (pkt->len > 0)
2525 			memcpy(ipcCommand->sourcePacket.data, pkt->data, pkt->len);
2526 
2527 		ipcCommand->commandSource = COMMAND_SOURCE_REMOTE;
2528 		ipcCommand->sourceWdNode = wdNode;
2529 		gettimeofday(&ipcCommand->commandTime, NULL);
2530 
2531 		ereport(LOG,
2532 				(errmsg("watchdog received the failover command from remote pgpool-II node \"%s\"", wdNode->nodeName)));
2533 
2534 		res = process_failover_command_on_coordinator(ipcCommand);
2535 		if (res == IPC_CMD_PROCESSING)
2536 		{
2537 			MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2538 
2539 			g_cluster.ipc_commands = lappend(g_cluster.ipc_commands, ipcCommand);
2540 			MemoryContextSwitchTo(oldCxt);
2541 			ereport(LOG,
2542 					(errmsg("failover command from remote pgpool-II node \"%s\" is still processing", wdNode->nodeName),
2543 					 errdetail("waiting for results...")));
2544 		}
2545 		else
2546 		{
2547 			cleanUpIPCCommand(ipcCommand);
2548 		}
2549 	}
2550 }
2551 
2552 static bool
reply_to_failover_command(WDCommandData * ipcCommand,WDFailoverCMDResults cmdResult,unsigned int failoverID)2553 reply_to_failover_command(WDCommandData * ipcCommand, WDFailoverCMDResults cmdResult, unsigned int failoverID)
2554 {
2555 	bool		ret = false;
2556 	JsonNode   *jNode = jw_create_with_object(true);
2557 
2558 	jw_put_int(jNode, WD_FAILOVER_RESULT_KEY, cmdResult);
2559 	jw_put_int(jNode, WD_FAILOVER_ID_KEY, failoverID);
2560 	/* create the packet */
2561 	jw_end_element(jNode);
2562 	jw_finish_document(jNode);
2563 
2564 	ereport(DEBUG2,
2565 			(errmsg("replying to failover command with failover ID: %d", failoverID),
2566 			 errdetail("%.*s", jw_get_json_length(jNode), jw_get_json_string(jNode))));
2567 
2568 	if (ipcCommand->commandSource == COMMAND_SOURCE_IPC)
2569 	{
2570 		ret = write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK,
2571 												 jw_get_json_string(jNode), jw_get_json_length(jNode) + 1);
2572 	}
2573 	else if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2574 	{
2575 		reply_with_message(ipcCommand->sourceWdNode, WD_CMD_REPLY_IN_DATA,
2576 						   jw_get_json_string(jNode), jw_get_json_length(jNode) + 1,
2577 						   &ipcCommand->sourcePacket);
2578 	}
2579 	jw_destroy(jNode);
2580 	return ret;
2581 }
2582 
2583 /*
2584  * This function process the failover command and decides
2585  * about the execution of failover command.
2586  */
2587 
compute_failover_consensus(POOL_REQUEST_KIND reqKind,int * node_id_list,int node_count,unsigned char * flags,WatchdogNode * wdNode)2588 static WDFailoverCMDResults compute_failover_consensus(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, unsigned char *flags, WatchdogNode * wdNode)
2589 {
2590 #ifndef NODE_UP_REQUIRE_CONSENSUS
2591 	if (reqKind == NODE_UP_REQUEST)
2592 		return FAILOVER_RES_PROCEED;
2593 #endif
2594 #ifndef NODE_DOWN_REQUIRE_CONSENSUS
2595 	if (reqKind == NODE_DOWN_REQUEST)
2596 		return FAILOVER_RES_PROCEED;
2597 #endif
2598 #ifndef NODE_PROMOTE_REQUIRE_CONSENSUS
2599 	if (reqKind == PROMOTE_NODE_REQUEST)
2600 		return FAILOVER_RES_PROCEED;
2601 #endif
2602 
2603 	if (pool_config->failover_when_quorum_exists == false)
2604 	{
2605 		/* No need for any calculation, We do not need a quorum for failover */
2606 		ereport(LOG, (
2607 					  errmsg("we do not need quorum to hold to proceed with failover"),
2608 					  errdetail("proceeding with the failover"),
2609 					  errhint("failover_when_quorum_exists is set to false")));
2610 
2611 		return FAILOVER_RES_PROCEED;
2612 	}
2613 	if (*flags & REQ_DETAIL_CONFIRMED)
2614 	{
2615 		/* Check the request flags, If it asks to bypass the quorum status */
2616 		ereport(LOG, (
2617 					  errmsg("The failover request does not need quorum to hold"),
2618 					  errdetail("proceeding with the failover"),
2619 					  errhint("REQ_DETAIL_CONFIRMED")));
2620 		return FAILOVER_RES_PROCEED;
2621 	}
2622 	update_quorum_status();
2623 	if (g_cluster.quorum_status < 0)
2624 	{
2625 		/* quorum is must and it is not present at the moment */
2626 		ereport(LOG, (
2627 					  errmsg("failover requires the quorum to hold, which is not present at the moment"),
2628 					  errdetail("Rejecting the failover request")));
2629 		return FAILOVER_RES_NO_QUORUM;
2630 	}
2631 
2632 	/*
2633 	 * So we reached here means quorum is present Now come to difficult part of
2634 	 * ensuring the consensus
2635 	 */
2636 	if (pool_config->failover_require_consensus == true)
2637 	{
2638 		/* Record the failover. */
2639 		bool		duplicate = false;
2640 		WDFailoverObject *failoverObj = add_failover(reqKind, node_id_list, node_count, wdNode, *flags, &duplicate);
2641 
2642 		if (failoverObj->request_count < get_minimum_votes_to_resolve_consensus())
2643 		{
2644 			ereport(LOG, (
2645 						  errmsg("failover requires the majority vote, waiting for consensus"),
2646 						  errdetail("failover request noted")));
2647 			if (duplicate && !pool_config->allow_multiple_failover_requests_from_node)
2648 				return FAILOVER_RES_CONSENSUS_MAY_FAIL;
2649 			else
2650 				return FAILOVER_RES_BUILDING_CONSENSUS;
2651 		}
2652 		else
2653 		{
2654 			/* We have received enough votes for this failover */
2655 			ereport(LOG, (
2656 						  errmsg("we have got the consensus to perform the failover"),
2657 						  errdetail("%d node(s) voted in the favor", failoverObj->request_count)));
2658 			/* restore the flag value to the one from the first call */
2659 			*flags = failoverObj->reqFlags;
2660 			/* remove this object, It is no longer needed */
2661 			remove_failover_object(failoverObj);
2662 			return FAILOVER_RES_PROCEED;
2663 		}
2664 	}
2665 	else
2666 	{
2667 		ereport(LOG, (
2668 					  errmsg("we do not require majority votes to proceed with failover"),
2669 					  errdetail("proceeding with the failover"),
2670 					  errhint("failover_require_consensus is set to false")));
2671 	}
2672 	return FAILOVER_RES_PROCEED;
2673 }
2674 
add_failover(POOL_REQUEST_KIND reqKind,int * node_id_list,int node_count,WatchdogNode * wdNode,unsigned char flags,bool * duplicate)2675 static WDFailoverObject * add_failover(POOL_REQUEST_KIND reqKind, int *node_id_list, int node_count, WatchdogNode * wdNode,
2676 									   unsigned char flags, bool *duplicate)
2677 {
2678 	MemoryContext oldCxt;
2679 
2680 	/* Find the failover */
2681 	WDFailoverObject *failoverObj = get_failover_object(reqKind, node_count, node_id_list);
2682 
2683 	*duplicate = false;
2684 	if (failoverObj)
2685 	{
2686 		ListCell   *lc;
2687 
2688 		/* search the node if it is a duplicate request */
2689 		foreach(lc, failoverObj->requestingNodes)
2690 		{
2691 			WatchdogNode *reqWdNode = lfirst(lc);
2692 
2693 			if (wdNode == reqWdNode)
2694 			{
2695 				*duplicate = true;
2696 				/* The failover request is duplicate */
2697 				if (pool_config->allow_multiple_failover_requests_from_node)
2698 				{
2699 					failoverObj->request_count++;
2700 					ereport(LOG, (
2701 								  errmsg("duplicate failover request from \"%s\" node", wdNode->nodeName),
2702 								  errdetail("Pgpool-II can send multiple failover requests for same node"),
2703 								  errhint("allow_multiple_failover_requests_from_node is enabled")));
2704 				}
2705 				else
2706 				{
2707 					ereport(LOG, (
2708 								  errmsg("Duplicate failover request from \"%s\" node", wdNode->nodeName),
2709 								  errdetail("request ignored")));
2710 				}
2711 				return failoverObj;
2712 			}
2713 		}
2714 	}
2715 	else
2716 	{
2717 		oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2718 		failoverObj = palloc0(sizeof(WDFailoverObject));
2719 		failoverObj->reqKind = reqKind;
2720 		failoverObj->requestingNodes = NULL;
2721 		failoverObj->nodesCount = node_count;
2722 		failoverObj->reqFlags = flags;
2723 		failoverObj->request_count = 0;
2724 		if (node_count > 0)
2725 		{
2726 			failoverObj->nodeList = palloc(sizeof(int) * node_count);
2727 			memcpy(failoverObj->nodeList, node_id_list, sizeof(int) * node_count);
2728 		}
2729 		failoverObj->failoverID = get_next_commandID();
2730 		gettimeofday(&failoverObj->startTime, NULL);
2731 		g_cluster.wdCurrentFailovers = lappend(g_cluster.wdCurrentFailovers, failoverObj);
2732 		MemoryContextSwitchTo(oldCxt);
2733 	}
2734 
2735 	failoverObj->request_count++;
2736 	oldCxt = MemoryContextSwitchTo(TopMemoryContext);
2737 	failoverObj->requestingNodes = lappend(failoverObj->requestingNodes, wdNode);
2738 	MemoryContextSwitchTo(oldCxt);
2739 	return failoverObj;
2740 }
2741 
2742 /*
2743  * The function processes all failover commands on leader node
2744  */
process_failover_command_on_coordinator(WDCommandData * ipcCommand)2745 static IPC_CMD_PROCESS_RES process_failover_command_on_coordinator(WDCommandData * ipcCommand)
2746 {
2747 	char	   *func_name;
2748 	int			node_count = 0;
2749 	int		   *node_id_list = NULL;
2750 	bool		ret = false;
2751 	unsigned char flags;
2752 	POOL_REQUEST_KIND reqKind;
2753 	WDFailoverCMDResults res;
2754 
2755 	if (get_local_node_state() != WD_COORDINATOR)
2756 		return IPC_CMD_ERROR;	/* should never happen */
2757 
2758 	ret = parse_wd_node_function_json(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len,
2759 									  &func_name, &node_id_list, &node_count, &flags);
2760 	if (ret == false)
2761 	{
2762 		ereport(LOG, (
2763 					  errmsg("failed to process failover command"),
2764 					  errdetail("unable to parse the command data")));
2765 		reply_to_failover_command(ipcCommand, FAILOVER_RES_INVALID_FUNCTION, 0);
2766 		return IPC_CMD_COMPLETE;
2767 	}
2768 
2769 	if (strcasecmp(WD_FUNCTION_FAILBACK_REQUEST, func_name) == 0)
2770 		reqKind = NODE_UP_REQUEST;
2771 	else if (strcasecmp(WD_FUNCTION_DEGENERATE_REQUEST, func_name) == 0)
2772 		reqKind = NODE_DOWN_REQUEST;
2773 	else if (strcasecmp(WD_FUNCTION_PROMOTE_REQUEST, func_name) == 0)
2774 		reqKind = PROMOTE_NODE_REQUEST;
2775 	else
2776 	{
2777 		reply_to_failover_command(ipcCommand, FAILOVER_RES_INVALID_FUNCTION, 0);
2778 		return IPC_CMD_COMPLETE;
2779 	}
2780 
2781 	ereport(LOG,
2782 			(errmsg("watchdog is processing the failover command [%s] received from %s",
2783 					func_name,
2784 					ipcCommand->commandSource == COMMAND_SOURCE_IPC ?
2785 					"local pgpool-II on IPC interface" : ipcCommand->sourceWdNode->nodeName)));
2786 
2787 	res = compute_failover_consensus(reqKind, node_id_list, node_count, &flags, ipcCommand->sourceWdNode);
2788 
2789 	if (res == FAILOVER_RES_PROCEED)
2790 	{
2791 		/*
2792 		 * We are allowed to proceed with the failover, now if the command was
2793 		 * originated by the remote node, Kick the failover function on the
2794 		 * Pgpool-II main process and inform the remote caller to wait for
2795 		 * sync
2796 		 */
2797 		if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2798 		{
2799 			/*
2800 			 * Set the flag indicating the failover request is originated by
2801 			 * watchdog
2802 			 */
2803 			flags |= REQ_DETAIL_WATCHDOG;
2804 
2805 			if (reqKind == NODE_DOWN_REQUEST)
2806 				ret = degenerate_backend_set(node_id_list, node_count, flags);
2807 			else if (reqKind == NODE_UP_REQUEST)
2808 				ret = send_failback_request(node_id_list[0], false, flags);
2809 			else if (reqKind == PROMOTE_NODE_REQUEST)
2810 				ret = promote_backend(node_id_list[0], flags);
2811 
2812 			if (ret == true)
2813 				reply_to_failover_command(ipcCommand, FAILOVER_RES_WILL_BE_DONE, 0);
2814 			else
2815 				reply_to_failover_command(ipcCommand, FAILOVER_RES_ERROR, 0);
2816 		}
2817 		else
2818 		{
2819 			/*
2820 			 * It was the request from the local node, Just reply the caller
2821 			 * to get on with the failover
2822 			 */
2823 			reply_to_failover_command(ipcCommand, FAILOVER_RES_PROCEED, 0);
2824 		}
2825 		return IPC_CMD_COMPLETE;
2826 	}
2827 	else if (res == FAILOVER_RES_NO_QUORUM)
2828 	{
2829 		ereport(LOG,
2830 				(errmsg("failover command [%s] request from pgpool-II node \"%s\" is rejected because the watchdog cluster does not hold the quorum",
2831 						func_name,
2832 						ipcCommand->sourceWdNode->nodeName)));
2833 	}
2834 	else if (res == FAILOVER_RES_BUILDING_CONSENSUS)
2835 	{
2836 		ereport(LOG,
2837 				(errmsg("failover command [%s] request from pgpool-II node \"%s\" is queued, waiting for the confirmation from other nodes",
2838 						func_name,
2839 						ipcCommand->sourceWdNode->nodeName)));
2840 
2841 		/*
2842 		 * Ask all the nodes to re-send the failover request for the
2843 		 * quarantined nodes.
2844 		 */
2845 		send_message_of_type(NULL, WD_FAILOVER_WAITING_FOR_CONSENSUS, NULL);
2846 
2847 		/*
2848 		 * Also if the command was originated by remote node, check local
2849 		 * quarantine space as-well
2850 		 */
2851 		if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
2852 			register_inform_quarantine_nodes_req();
2853 	}
2854 
2855 	reply_to_failover_command(ipcCommand, res, 0);
2856 	return IPC_CMD_COMPLETE;
2857 }
2858 
process_IPC_failover_command(WDCommandData * ipcCommand)2859 static IPC_CMD_PROCESS_RES process_IPC_failover_command(WDCommandData * ipcCommand)
2860 {
2861 	if (is_local_node_true_leader())
2862 	{
2863 		ereport(LOG,
2864 				(errmsg("watchdog received the failover command from local pgpool-II on IPC interface")));
2865 		return process_failover_command_on_coordinator(ipcCommand);
2866 	}
2867 	else if (get_local_node_state() == WD_STANDBY)
2868 	{
2869 		/* I am a standby node, Just forward the request to coordinator */
2870 
2871 		wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2872 		set_next_commandID_in_message(&ipcCommand->commandPacket);
2873 
2874 		ipcCommand->sendToNode = WD_LEADER_NODE;	/* send the command to
2875 													 * leader node */
2876 		if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2877 		{
2878 			ereport(LOG,
2879 					(errmsg("unable to process the failover command request received on IPC interface"),
2880 					 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2881 			return IPC_CMD_ERROR;
2882 		}
2883 		else
2884 		{
2885 			/*
2886 			 * we need to wait for the result
2887 			 */
2888 			ereport(LOG,
2889 					(errmsg("failover request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2890 							WD_LEADER_NODE->nodeName),
2891 					 errdetail("waiting for the reply...")));
2892 			return IPC_CMD_PROCESSING;
2893 		}
2894 	}
2895 	else
2896 	{
2897 		/* we are not in stable state at the moment */
2898 		ereport(LOG,
2899 				(errmsg("unable to process the failover request received on IPC interface"),
2900 				 errdetail("this watchdog node has not joined the cluster yet"),
2901 				 errhint("try again in few seconds")));
2902 	}
2903 	return IPC_CMD_ERROR;
2904 }
2905 
process_IPC_online_recovery(WDCommandData * ipcCommand)2906 static IPC_CMD_PROCESS_RES process_IPC_online_recovery(WDCommandData * ipcCommand)
2907 {
2908 	if (get_local_node_state() == WD_STANDBY ||
2909 		get_local_node_state() == WD_COORDINATOR)
2910 	{
2911 		/* save the hassel if I am the only alive node */
2912 		if (get_cluster_node_count() == 0)
2913 			return IPC_CMD_OK;
2914 
2915 		wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2916 		set_next_commandID_in_message(&ipcCommand->commandPacket);
2917 
2918 		ipcCommand->sendToNode = NULL;	/* command needs to be sent to all
2919 										 * nodes */
2920 		if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2921 		{
2922 			ereport(LOG,
2923 					(errmsg("unable to process the online recovery request received on IPC interface"),
2924 					 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2925 			return IPC_CMD_ERROR;
2926 		}
2927 		ereport(LOG,
2928 				(errmsg("online recovery request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2929 						WD_LEADER_NODE->nodeName),
2930 				 errdetail("waiting for the reply...")));
2931 
2932 		return IPC_CMD_PROCESSING;
2933 	}
2934 	/* we are not in any stable state at the moment */
2935 
2936 	ereport(LOG,
2937 			(errmsg("unable to process the online recovery request received on IPC interface"),
2938 			 errdetail("this watchdog node has not joined the cluster yet"),
2939 			 errhint("try again in few seconds")));
2940 
2941 	return IPC_CMD_TRY_AGAIN;
2942 }
2943 
process_IPC_data_request_from_leader(WDCommandData * ipcCommand)2944 static IPC_CMD_PROCESS_RES process_IPC_data_request_from_leader(WDCommandData * ipcCommand)
2945 {
2946 	/*
2947 	 * if cluster or myself is not in stable state just return cluster in
2948 	 * transaction
2949 	 */
2950 	ereport(LOG,
2951 			(errmsg("received the get data request from local pgpool-II on IPC interface")));
2952 
2953 	if (get_local_node_state() == WD_STANDBY)
2954 	{
2955 		/*
2956 		 * set the command id in the IPC packet before forwarding it on the
2957 		 * watchdog socket
2958 		 */
2959 		wd_packet_shallow_copy(&ipcCommand->sourcePacket, &ipcCommand->commandPacket);
2960 		set_next_commandID_in_message(&ipcCommand->commandPacket);
2961 
2962 		ipcCommand->sendToNode = WD_LEADER_NODE;
2963 		if (send_command_packet_to_remote_nodes(ipcCommand, true) <= 0)
2964 		{
2965 			ereport(LOG,
2966 					(errmsg("unable to process the get data request received on IPC interface"),
2967 					 errdetail("failed to forward the request to the leader watchdog node \"%s\"", WD_LEADER_NODE->nodeName)));
2968 			return IPC_CMD_ERROR;
2969 		}
2970 		else
2971 		{
2972 			/*
2973 			 * we need to wait for the result
2974 			 */
2975 			ereport(LOG,
2976 					(errmsg("get data request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node \"%s\"",
2977 							WD_LEADER_NODE->nodeName),
2978 					 errdetail("waiting for the reply...")));
2979 
2980 			return IPC_CMD_PROCESSING;
2981 		}
2982 	}
2983 	else if (is_local_node_true_leader())
2984 	{
2985 		/*
2986 		 * This node is itself a leader node, So send the empty result with OK
2987 		 * tag
2988 		 */
2989 		return IPC_CMD_OK;
2990 	}
2991 
2992 	/* we are not in any stable state at the moment */
2993 	ereport(LOG,
2994 			(errmsg("unable to process the get data request received on IPC interface"),
2995 			 errdetail("this watchdog node has not joined the cluster yet"),
2996 			 errhint("try again in few seconds")));
2997 
2998 	return IPC_CMD_TRY_AGAIN;
2999 }
3000 
process_IPC_failover_indication(WDCommandData * ipcCommand)3001 static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCommand)
3002 {
3003 	WDFailoverCMDResults res = FAILOVER_RES_NOT_ALLOWED;
3004 
3005 	/*
3006 	 * if cluster or myself is not in stable state just return cluster in
3007 	 * transaction
3008 	 */
3009 	ereport(LOG,
3010 			(errmsg("received the failover indication from Pgpool-II on IPC interface")));
3011 
3012 	if (get_local_node_state() == WD_COORDINATOR)
3013 	{
3014 		int			failoverState = -1;
3015 
3016 		if (ipcCommand->sourcePacket.data == NULL || ipcCommand->sourcePacket.len <= 0)
3017 		{
3018 			ereport(LOG,
3019 					(errmsg("watchdog unable to process failover indication"),
3020 					 errdetail("invalid command packet")));
3021 			res = FAILOVER_RES_INVALID_FUNCTION;
3022 		}
3023 		else
3024 		{
3025 			json_value *root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
3026 
3027 			if (root && root->type == json_object)
3028 			{
3029 				if (json_get_int_value_for_key(root, "FailoverFuncState", &failoverState))
3030 				{
3031 					ereport(LOG,
3032 							(errmsg("unable to process failover indication"),
3033 							 errdetail("failed to get failover state from json data in command packet")));
3034 					res = FAILOVER_RES_INVALID_FUNCTION;
3035 				}
3036 			}
3037 			else
3038 			{
3039 				ereport(LOG,
3040 						(errmsg("unable to process failover indication"),
3041 						 errdetail("invalid JSON data in command packet")));
3042 				res = FAILOVER_RES_INVALID_FUNCTION;
3043 			}
3044 			if (root)
3045 				json_value_free(root);
3046 		}
3047 
3048 		if (failoverState < 0)
3049 		{
3050 			ereport(LOG,
3051 					(errmsg("unable to process failover indication"),
3052 					 errdetail("invalid JSON data in command packet")));
3053 			res = FAILOVER_RES_INVALID_FUNCTION;
3054 		}
3055 		else if (failoverState == 0)	/* start */
3056 		{
3057 			res = failover_start_indication(ipcCommand);
3058 		}
3059 		else					/* end */
3060 		{
3061 			res = failover_end_indication(ipcCommand);
3062 		}
3063 	}
3064 	else
3065 	{
3066 		ereport(LOG,
3067 				(errmsg("received the failover indication from Pgpool-II on IPC interface, but only leader can do failover")));
3068 	}
3069 	reply_to_failover_command(ipcCommand, res, 0);
3070 
3071 	return IPC_CMD_COMPLETE;
3072 }
3073 
3074 
3075 /* Failover start basically does nothing fancy, It just sets the failover_in_progress
3076  * flag and inform all nodes that the failover is in progress.
3077  *
3078  * only the local node that is a leader can start the failover.
3079  */
3080 static WDFailoverCMDResults
failover_start_indication(WDCommandData * ipcCommand)3081 failover_start_indication(WDCommandData * ipcCommand)
3082 {
3083 	ereport(LOG,
3084 			(errmsg("watchdog is informed of failover start by the main process")));
3085 
3086 	/* only coordinator(leader) node is allowed to process failover */
3087 	if (get_local_node_state() == WD_COORDINATOR)
3088 	{
3089 		/* inform to all nodes about failover start */
3090 		send_message_of_type(NULL, WD_FAILOVER_START, NULL);
3091 		return FAILOVER_RES_PROCEED;
3092 	}
3093 	else if (get_local_node_state() == WD_STANDBY)
3094 	{
3095 		/* The node might be performing the local quarantine operation */
3096 		ereport(DEBUG1,
3097 				(errmsg("main process is starting the local quarantine operation")));
3098 		return FAILOVER_RES_PROCEED;
3099 	}
3100 	else
3101 	{
3102 		ereport(LOG,
3103 				(errmsg("failed to process failover start request, I am not in stable state")));
3104 	}
3105 	return FAILOVER_RES_TRANSITION;
3106 }
3107 
3108 static WDFailoverCMDResults
failover_end_indication(WDCommandData * ipcCommand)3109 failover_end_indication(WDCommandData * ipcCommand)
3110 {
3111 	ereport(LOG,
3112 			(errmsg("watchdog is informed of failover end by the main process")));
3113 
3114 	/* only coordinator(leader) node is allowed to process failover */
3115 	if (get_local_node_state() == WD_COORDINATOR)
3116 	{
3117 		send_message_of_type(NULL, WD_FAILOVER_END, NULL);
3118 		return FAILOVER_RES_PROCEED;
3119 	}
3120 	else if (get_local_node_state() == WD_STANDBY)
3121 	{
3122 		/* The node might be performing the local quarantine operation */
3123 		ereport(DEBUG1,
3124 				(errmsg("main process is ending the local quarantine operation")));
3125 		return FAILOVER_RES_PROCEED;
3126 	}
3127 	else
3128 	{
3129 		ereport(LOG,
3130 				(errmsg("failed to process failover start request, I am not in stable state")));
3131 	}
3132 	return FAILOVER_RES_TRANSITION;
3133 }
3134 
parse_node_info_message(WDPacketData * pkt,char ** authkey)3135 static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey)
3136 {
3137 	if (pkt == NULL || (pkt->type != WD_ADD_NODE_MESSAGE && pkt->type != WD_INFO_MESSAGE))
3138 		return NULL;
3139 	if (pkt->data == NULL || pkt->len <= 0)
3140 		return NULL;
3141 	return get_watchdog_node_from_json(pkt->data, pkt->len, authkey);
3142 }
3143 
read_packet(SocketConnection * conn)3144 static WDPacketData * read_packet(SocketConnection * conn)
3145 {
3146 	return read_packet_of_type(conn, WD_NO_MESSAGE);
3147 }
3148 
read_packet_of_type(SocketConnection * conn,char ensure_type)3149 static WDPacketData * read_packet_of_type(SocketConnection * conn, char ensure_type)
3150 {
3151 	char		type;
3152 	int			len;
3153 	unsigned int cmd_id;
3154 	char	   *buf;
3155 	WDPacketData *pkt = NULL;
3156 	int			ret;
3157 
3158 	if (is_socket_connection_connected(conn) == false)
3159 	{
3160 		ereport(LOG,
3161 				(errmsg("error reading from socket connection,socket is not connected")));
3162 		return NULL;
3163 	}
3164 
3165 	ret = socket_read(conn->sock, &type, sizeof(char), 1);
3166 	if (ret != sizeof(char))
3167 	{
3168 		close_socket_connection(conn);
3169 		return NULL;
3170 	}
3171 
3172 	ereport(DEBUG1,
3173 			(errmsg("received watchdog packet type:%c", type)));
3174 
3175 	if (ensure_type != WD_NO_MESSAGE && ensure_type != type)
3176 	{
3177 		/* The packet type is not what we want. */
3178 		ereport(DEBUG1,
3179 				(errmsg("invalid packet type. expecting %c while received %c", ensure_type, type)));
3180 		close_socket_connection(conn);
3181 		return NULL;
3182 	}
3183 
3184 	ret = socket_read(conn->sock, &cmd_id, sizeof(int), 1);
3185 	if (ret != sizeof(int))
3186 	{
3187 		close_socket_connection(conn);
3188 		return NULL;
3189 	}
3190 	cmd_id = ntohl(cmd_id);
3191 
3192 	ereport(DEBUG2,
3193 			(errmsg("received packet with command id %d from watchdog node ", cmd_id)));
3194 
3195 	ret = socket_read(conn->sock, &len, sizeof(int), 1);
3196 	if (ret != sizeof(int))
3197 	{
3198 		close_socket_connection(conn);
3199 		return NULL;
3200 	}
3201 
3202 	len = ntohl(len);
3203 
3204 	ereport(DEBUG1,
3205 			(errmsg("reading packet type %c of length %d", type, len)));
3206 
3207 	pkt = get_empty_packet();
3208 	set_message_type(pkt, type);
3209 	set_message_commandID(pkt, cmd_id);
3210 
3211 	buf = palloc(len);
3212 
3213 	ret = socket_read(conn->sock, buf, len, 1);
3214 	if (ret != len)
3215 	{
3216 		close_socket_connection(conn);
3217 		free_packet(pkt);
3218 		pfree(buf);
3219 		return NULL;
3220 	}
3221 	set_message_data(pkt, buf, len);
3222 	return pkt;
3223 }
3224 
3225 
3226 
3227 static void
wd_child_exit(int exit_signo)3228 wd_child_exit(int exit_signo)
3229 {
3230 	sigset_t	mask;
3231 
3232 	sigemptyset(&mask);
3233 	sigaddset(&mask, SIGTERM);
3234 	sigaddset(&mask, SIGINT);
3235 	sigaddset(&mask, SIGQUIT);
3236 	sigprocmask(SIG_BLOCK, &mask, NULL);
3237 	exit(0);
3238 }
3239 
3240 static void
wd_child_signal_handler(void)3241 wd_child_signal_handler(void)
3242 {
3243 	pid_t		pid;
3244 	int			status;
3245 
3246 	ereport(DEBUG1,
3247 			(errmsg("watchdog process signal handler")));
3248 
3249 	/* clear SIGCHLD request */
3250 	sigchld_request = 0;
3251 
3252 	while ((pid = pool_waitpid(&status)) > 0)
3253 	{
3254 		char	   *exiting_process_name;
3255 
3256 		if (g_cluster.de_escalation_pid == pid)
3257 		{
3258 			exiting_process_name = "de-escalation";
3259 			g_cluster.de_escalation_pid = 0;
3260 		}
3261 		else if (g_cluster.escalation_pid == pid)
3262 		{
3263 			exiting_process_name = "escalation";
3264 			g_cluster.escalation_pid = 0;
3265 		}
3266 		else
3267 			exiting_process_name = "unknown";
3268 
3269 		if (WIFEXITED(status))
3270 		{
3271 			if (WEXITSTATUS(status) == POOL_EXIT_FATAL)
3272 				ereport(LOG,
3273 						(errmsg("watchdog %s process with pid: %d exit with FATAL ERROR.", exiting_process_name, pid)));
3274 			else if (WEXITSTATUS(status) == POOL_EXIT_NO_RESTART)
3275 				ereport(LOG,
3276 						(errmsg("watchdog %s process with pid: %d exit with SUCCESS.", exiting_process_name, pid)));
3277 		}
3278 		else if (WIFSIGNALED(status))
3279 		{
3280 			/* Child terminated by segmentation fault. Report it */
3281 			if (WTERMSIG(status) == SIGSEGV)
3282 				ereport(WARNING,
3283 						(errmsg("watchdog %s process with pid: %d was terminated by segmentation fault", exiting_process_name, pid)));
3284 			else
3285 				ereport(LOG,
3286 						(errmsg("watchdog %s process with pid: %d exits with status %d by signal %d", exiting_process_name, pid, status, WTERMSIG(status))));
3287 		}
3288 		else
3289 			ereport(LOG,
3290 					(errmsg("watchdog %s process with pid: %d exits with status %d", exiting_process_name, pid, status)));
3291 	}
3292 }
3293 
3294 /* Function invoked when watchdog process is about to exit */
3295 static void
wd_system_will_go_down(int code,Datum arg)3296 wd_system_will_go_down(int code, Datum arg)
3297 {
3298 	int			i;
3299 
3300 	ereport(LOG,
3301 			(errmsg("Watchdog is shutting down")));
3302 
3303 	send_cluster_command(NULL, WD_INFORM_I_AM_GOING_DOWN, 0);
3304 
3305 	if (get_local_node_state() == WD_COORDINATOR)
3306 		resign_from_escalated_node();
3307 	/* close server socket */
3308 	close_socket_connection(&g_cluster.localNode->server_socket);
3309 	/* close all node sockets */
3310 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
3311 	{
3312 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3313 
3314 		close_socket_connection(&wdNode->client_socket);
3315 		close_socket_connection(&wdNode->server_socket);
3316 	}
3317 	/* close network monitoring socket */
3318 	if (g_cluster.network_monitor_sock > 0)
3319 		close(g_cluster.network_monitor_sock);
3320 	/* wait for sub-processes to exit */
3321 	if (g_cluster.de_escalation_pid > 0 || g_cluster.escalation_pid > 0)
3322 	{
3323 		pid_t		wpid;
3324 
3325 		do
3326 		{
3327 			wpid = wait(NULL);
3328 		} while (wpid > 0 || (wpid == -1 && errno == EINTR));
3329 	}
3330 }
3331 
3332 static void
close_socket_connection(SocketConnection * conn)3333 close_socket_connection(SocketConnection * conn)
3334 {
3335 	if ((conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
3336 		|| conn->sock_state == WD_SOCK_WAITING_FOR_CONNECT)
3337 	{
3338 		close(conn->sock);
3339 		conn->sock = -1;
3340 		conn->sock_state = WD_SOCK_CLOSED;
3341 	}
3342 }
3343 
3344 static bool
is_socket_connection_connected(SocketConnection * conn)3345 is_socket_connection_connected(SocketConnection * conn)
3346 {
3347 	return (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED);
3348 }
3349 
3350 
3351 static bool
is_node_reachable(WatchdogNode * wdNode)3352 is_node_reachable(WatchdogNode * wdNode)
3353 {
3354 	if (is_socket_connection_connected(&wdNode->client_socket))
3355 		return true;
3356 	if (is_socket_connection_connected(&wdNode->server_socket))
3357 		return true;
3358 	return false;
3359 }
3360 
3361 static bool
is_node_active(WatchdogNode * wdNode)3362 is_node_active(WatchdogNode * wdNode)
3363 {
3364 	if (wdNode->state == WD_DEAD || wdNode->state == WD_LOST || wdNode->state == WD_SHUTDOWN)
3365 		return false;
3366 	return true;
3367 }
3368 
3369 static bool
is_node_active_and_reachable(WatchdogNode * wdNode)3370 is_node_active_and_reachable(WatchdogNode * wdNode)
3371 {
3372 	if (is_node_active(wdNode))
3373 		return is_node_reachable(wdNode);
3374 	return false;
3375 }
3376 
3377 static int
accept_incoming_connections(fd_set * rmask,int pending_fds_count)3378 accept_incoming_connections(fd_set *rmask, int pending_fds_count)
3379 {
3380 	int			processed_fds = 0;
3381 	int			fd;
3382 
3383 	if (FD_ISSET(g_cluster.localNode->server_socket.sock, rmask))
3384 	{
3385 		struct sockaddr_in addr;
3386 		socklen_t	addrlen = sizeof(struct sockaddr_in);
3387 
3388 		processed_fds++;
3389 		fd = accept(g_cluster.localNode->server_socket.sock, (struct sockaddr *) &addr, &addrlen);
3390 		if (fd < 0)
3391 		{
3392 			if (errno == EINTR || errno == 0 || errno == EAGAIN || errno == EWOULDBLOCK)
3393 			{
3394 				/* nothing to accept now */
3395 				ereport(DEBUG2,
3396 						(errmsg("Failed to accept incoming watchdog connection, Nothing to accept")));
3397 			}
3398 			/* accept failed */
3399 			ereport(DEBUG1,
3400 					(errmsg("Failed to accept incoming watchdog connection")));
3401 		}
3402 		else
3403 		{
3404 			MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
3405 			SocketConnection *conn = palloc(sizeof(SocketConnection));
3406 
3407 			conn->sock = fd;
3408 			conn->sock_state = WD_SOCK_CONNECTED;
3409 			gettimeofday(&conn->tv, NULL);
3410 			strncpy(conn->addr, inet_ntoa(addr.sin_addr), sizeof(conn->addr) - 1);
3411 			ereport(LOG,
3412 					(errmsg("new watchdog node connection is received from \"%s:%d\"", inet_ntoa(addr.sin_addr), addr.sin_port)));
3413 			g_cluster.unidentified_socks = lappend(g_cluster.unidentified_socks, conn);
3414 			MemoryContextSwitchTo(oldCxt);
3415 		}
3416 	}
3417 
3418 	if (processed_fds >= pending_fds_count)
3419 		return processed_fds;
3420 
3421 	if (FD_ISSET(g_cluster.command_server_sock, rmask))
3422 	{
3423 		struct sockaddr addr;
3424 		socklen_t	addrlen = sizeof(struct sockaddr);
3425 
3426 		processed_fds++;
3427 
3428 		int			fd = accept(g_cluster.command_server_sock, &addr, &addrlen);
3429 
3430 		if (fd < 0)
3431 		{
3432 			if (errno == EINTR || errno == 0 || errno == EAGAIN || errno == EWOULDBLOCK)
3433 			{
3434 				/* nothing to accept now */
3435 				ereport(WARNING,
3436 						(errmsg("failed to accept incoming watchdog IPC connection, Nothing to accept")));
3437 			}
3438 			/* accept failed */
3439 			ereport(WARNING,
3440 					(errmsg("failed to accept incoming watchdog IPC connection")));
3441 		}
3442 		else
3443 		{
3444 			MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
3445 
3446 			ereport(LOG,
3447 					(errmsg("new IPC connection received")));
3448 			g_cluster.ipc_command_socks = lappend_int(g_cluster.ipc_command_socks, fd);
3449 			MemoryContextSwitchTo(oldCxt);
3450 		}
3451 	}
3452 
3453 	return processed_fds;
3454 }
3455 
3456 static int
update_successful_outgoing_cons(fd_set * wmask,int pending_fds_count)3457 update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count)
3458 {
3459 	int			i;
3460 	int			count = 0;
3461 
3462 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
3463 	{
3464 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3465 
3466 		if (wdNode->client_socket.sock > 0 && wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
3467 		{
3468 			if (FD_ISSET(wdNode->client_socket.sock, wmask))
3469 			{
3470 				socklen_t	lon;
3471 				int			valopt;
3472 
3473 				lon = sizeof(int);
3474 
3475 				gettimeofday(&wdNode->client_socket.tv, NULL);
3476 
3477 				if (getsockopt(wdNode->client_socket.sock, SOL_SOCKET, SO_ERROR, (void *) (&valopt), &lon) == 0)
3478 				{
3479 					if (valopt)
3480 					{
3481 						ereport(DEBUG1,
3482 								(errmsg("error in outbound connection to %s:%d", wdNode->hostname, wdNode->wd_port),
3483 								 errdetail("%s", strerror(valopt))));
3484 						close_socket_connection(&wdNode->client_socket);
3485 						wdNode->client_socket.sock_state = WD_SOCK_ERROR;
3486 					}
3487 					else
3488 					{
3489 						wdNode->client_socket.sock_state = WD_SOCK_CONNECTED;
3490 						ereport(LOG,
3491 								(errmsg("new outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port)));
3492 						/* set socket to blocking again */
3493 						socket_unset_nonblock(wdNode->client_socket.sock);
3494 						watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
3495 					}
3496 				}
3497 				else
3498 				{
3499 					ereport(DEBUG1,
3500 							(errmsg("error in outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port),
3501 							 errdetail("getsockopt failed with error \"%m\"")));
3502 					close_socket_connection(&wdNode->client_socket);
3503 					wdNode->client_socket.sock_state = WD_SOCK_ERROR;
3504 
3505 				}
3506 				count++;
3507 				if (count >= pending_fds_count)
3508 					break;
3509 			}
3510 		}
3511 	}
3512 	return count;
3513 }
3514 
3515 static bool
write_packet_to_socket(int sock,WDPacketData * pkt,bool ipcPacket)3516 write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket)
3517 {
3518 	int			ret = 0;
3519 	int			command_id,
3520 				len;
3521 
3522 	ereport(DEBUG1,
3523 			(errmsg("sending watchdog packet to socket:%d, type:[%c], command ID:%d, data Length:%d", sock, pkt->type,
3524 					pkt->command_id, pkt->len)));
3525 
3526 	print_packet_info(pkt, true);
3527 
3528 	/* TYPE */
3529 	if (write(sock, &pkt->type, 1) < 1)
3530 	{
3531 		ereport(LOG,
3532 				(errmsg("failed to write watchdog packet to socket"),
3533 				 errdetail("%m")));
3534 		return false;
3535 	}
3536 	if (ipcPacket == false)
3537 	{
3538 		/* IPC packets does not have command ID field */
3539 		command_id = htonl(pkt->command_id);
3540 		if (write(sock, &command_id, 4) < 4)
3541 		{
3542 			ereport(LOG,
3543 					(errmsg("failed to write watchdog packet to socket"),
3544 					 errdetail("%m")));
3545 			return false;
3546 		}
3547 	}
3548 	/* data length */
3549 	len = htonl(pkt->len);
3550 	if (write(sock, &len, 4) < 4)
3551 	{
3552 		ereport(LOG,
3553 				(errmsg("failed to write watchdog packet to socket"),
3554 				 errdetail("%m")));
3555 		return false;
3556 	}
3557 	/* DATA */
3558 	if (pkt->len > 0 && pkt->data)
3559 	{
3560 		int			bytes_send = 0;
3561 
3562 		do
3563 		{
3564 			ret = write(sock, pkt->data + bytes_send, (pkt->len - bytes_send));
3565 			if (ret <= 0)
3566 			{
3567 				ereport(LOG,
3568 						(errmsg("failed to write watchdog packet to socket"),
3569 						 errdetail("%m")));
3570 				return false;
3571 			}
3572 			bytes_send += ret;
3573 		} while (bytes_send < pkt->len);
3574 	}
3575 	return true;
3576 }
3577 
3578 static void
wd_packet_shallow_copy(WDPacketData * srcPkt,WDPacketData * dstPkt)3579 wd_packet_shallow_copy(WDPacketData * srcPkt, WDPacketData * dstPkt)
3580 {
3581 	dstPkt->command_id = srcPkt->command_id;
3582 	dstPkt->data = srcPkt->data;
3583 	dstPkt->len = srcPkt->len;
3584 	dstPkt->type = srcPkt->type;
3585 }
3586 
3587 static void
init_wd_packet(WDPacketData * pkt)3588 init_wd_packet(WDPacketData * pkt)
3589 {
3590 	pkt->len = 0;
3591 	pkt->data = NULL;
3592 }
3593 
get_empty_packet(void)3594 static WDPacketData * get_empty_packet(void)
3595 {
3596 	WDPacketData *pkt = palloc0(sizeof(WDPacketData));
3597 
3598 	return pkt;
3599 }
3600 
3601 static void
free_packet(WDPacketData * pkt)3602 free_packet(WDPacketData * pkt)
3603 {
3604 	if (pkt)
3605 	{
3606 		if (pkt->data)
3607 			pfree(pkt->data);
3608 		pfree(pkt);
3609 	}
3610 }
3611 
3612 static void
set_message_type(WDPacketData * pkt,char type)3613 set_message_type(WDPacketData * pkt, char type)
3614 {
3615 	pkt->type = type;
3616 }
3617 
3618 static void
set_message_commandID(WDPacketData * pkt,unsigned int commandID)3619 set_message_commandID(WDPacketData * pkt, unsigned int commandID)
3620 {
3621 	pkt->command_id = commandID;
3622 }
3623 
3624 static void
set_next_commandID_in_message(WDPacketData * pkt)3625 set_next_commandID_in_message(WDPacketData * pkt)
3626 {
3627 	set_message_commandID(pkt, get_next_commandID());
3628 }
3629 
3630 static void
set_message_data(WDPacketData * pkt,const char * data,int len)3631 set_message_data(WDPacketData * pkt, const char *data, int len)
3632 {
3633 	pkt->data = (char *) data;
3634 	pkt->len = len;
3635 }
3636 
/*
 * Helpers that read member m of the WatchdogNode pointed to by a variable
 * named "node" in the expansion scope, falling back to v:
 *   nodeIfNull_str - v is used when node is NULL or the string member is empty
 *   nodeIfNull_int - v is used when node is NULL
 * The expansions are fully parenthesized so they can be embedded in larger
 * expressions without operator-precedence surprises.
 */
#define nodeIfNull_str(m,v) ((node && strlen(node->m)) ? node->m : (v))
#define nodeIfNull_int(m,v) (node ? node->m : (v))
#define NotSet "Not_Set"
3640 
3641 static bool
add_nodeinfo_to_json(JsonNode * jNode,WatchdogNode * node)3642 add_nodeinfo_to_json(JsonNode * jNode, WatchdogNode * node)
3643 {
3644 	jw_start_object(jNode, "WatchdogNode");
3645 
3646 	jw_put_int(jNode, "ID", nodeIfNull_int(pgpool_node_id, -1));
3647 	jw_put_int(jNode, "State", nodeIfNull_int(state, -1));
3648 	jw_put_int(jNode, "Membership", nodeIfNull_int(membership_status, -1));
3649 	jw_put_string(jNode, "MembershipString", node ? wd_cluster_membership_status[node->membership_status] : NotSet);
3650 	jw_put_string(jNode, "NodeName", nodeIfNull_str(nodeName, NotSet));
3651 	jw_put_string(jNode, "HostName", nodeIfNull_str(hostname, NotSet));
3652 	jw_put_string(jNode, "StateName", node ? wd_state_names[node->state] : NotSet);
3653 	jw_put_string(jNode, "DelegateIP", nodeIfNull_str(delegate_ip, NotSet));
3654 	jw_put_int(jNode, "WdPort", nodeIfNull_int(wd_port, 0));
3655 	jw_put_int(jNode, "PgpoolPort", nodeIfNull_int(pgpool_port, 0));
3656 	jw_put_int(jNode, "Priority", nodeIfNull_int(wd_priority, 0));
3657 
3658 	jw_end_element(jNode);
3659 
3660 	return true;
3661 }
3662 
get_node_list_json(int id)3663 static JsonNode * get_node_list_json(int id)
3664 {
3665 	int			i;
3666 	JsonNode   *jNode = jw_create_with_object(true);
3667 
3668 	jw_put_int(jNode, "RemoteNodeCount", g_cluster.remoteNodeCount);
3669 	jw_put_int(jNode, "MemberRemoteNodeCount", g_cluster.memberRemoteNodeCount);
3670 	jw_put_int(jNode, "NodesRequireForQuorum", get_minimum_votes_to_resolve_consensus());
3671 	jw_put_int(jNode, "QuorumStatus", WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
3672 	jw_put_int(jNode, "AliveNodeCount", WD_LEADER_NODE ? WD_LEADER_NODE->standby_nodes_count : 0);
3673 	jw_put_int(jNode, "Escalated", g_cluster.localNode->escalated);
3674 	jw_put_string(jNode, "LeaderNodeName", WD_LEADER_NODE ? WD_LEADER_NODE->nodeName : "Not Set");
3675 	jw_put_string(jNode, "LeaderHostName", WD_LEADER_NODE ? WD_LEADER_NODE->hostname : "Not Set");
3676 	if (id < 0)
3677 	{
3678 		jw_put_int(jNode, "NodeCount", g_cluster.remoteNodeCount + 1);
3679 
3680 		/* add the array */
3681 		jw_start_array(jNode, "WatchdogNodes");
3682 		/* add the local node info */
3683 		add_nodeinfo_to_json(jNode, g_cluster.localNode);
3684 		/* add all remote nodes */
3685 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
3686 		{
3687 			WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3688 
3689 			add_nodeinfo_to_json(jNode, wdNode);
3690 		}
3691 	}
3692 	else
3693 	{
3694 		jw_put_int(jNode, "NodeCount", 1);
3695 		/* add the array */
3696 		jw_start_array(jNode, "WatchdogNodes");
3697 
3698 		if (id == g_cluster.localNode->pgpool_node_id)
3699 		{
3700 			/* add the local node info */
3701 			add_nodeinfo_to_json(jNode, g_cluster.localNode);
3702 		}
3703 		else
3704 		{
3705 			/* find from remote nodes */
3706 			WatchdogNode *wdNodeToAdd = NULL;
3707 
3708 			for (i = 0; i < g_cluster.remoteNodeCount; i++)
3709 			{
3710 				WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
3711 
3712 				if (wdNode->pgpool_node_id == id)
3713 				{
3714 					wdNodeToAdd = wdNode;
3715 					break;
3716 				}
3717 			}
3718 			add_nodeinfo_to_json(jNode, wdNodeToAdd);
3719 		}
3720 	}
3721 	jw_finish_document(jNode);
3722 	return jNode;
3723 }
3724 
/*
 * Create a packet of the given type whose payload is the beacon JSON of the
 * local node.
 *
 * When replyFor is given the new packet carries its command id, otherwise
 * the next command id is assigned.
 */
static WDPacketData *
get_beacon_message(char type, WDPacketData * replyFor)
{
	WDPacketData *beacon = get_empty_packet();
	char	   *payload = get_beacon_message_json(g_cluster.localNode);

	set_message_type(beacon, type);

	if (replyFor != NULL)
		set_message_commandID(beacon, replyFor->command_id);
	else
		set_next_commandID_in_message(beacon);

	set_message_data(beacon, payload, strlen(payload));
	return beacon;
}
3742 
get_addnode_message(void)3743 static WDPacketData * get_addnode_message(void)
3744 {
3745 	char		authhash[WD_AUTH_HASH_LEN + 1];
3746 	WDPacketData *message = get_empty_packet();
3747 	bool		include_hash = get_authhash_for_node(g_cluster.localNode, authhash);
3748 	char	   *json_data = get_watchdog_node_info_json(g_cluster.localNode, include_hash ? authhash : NULL);
3749 
3750 	set_message_type(message, WD_ADD_NODE_MESSAGE);
3751 	set_next_commandID_in_message(message);
3752 	set_message_data(message, json_data, strlen(json_data));
3753 	return message;
3754 }
3755 
/*
 * Create a WD_INFO_MESSAGE packet carrying the node info JSON of the local
 * node.  The auth hash is passed on to the JSON builder only when
 * get_authhash_for_node() reports success.
 *
 * When replyFor is given the new packet carries its command id, otherwise
 * the next command id is assigned.
 */
static WDPacketData *
get_mynode_info_message(WDPacketData * replyFor)
{
	char		hash_buf[WD_AUTH_HASH_LEN + 1];
	char	   *hash_arg = NULL;
	char	   *node_json;
	WDPacketData *pkt = get_empty_packet();

	if (get_authhash_for_node(g_cluster.localNode, hash_buf))
		hash_arg = hash_buf;

	node_json = get_watchdog_node_info_json(g_cluster.localNode, hash_arg);

	set_message_type(pkt, WD_INFO_MESSAGE);

	if (replyFor != NULL)
		set_message_commandID(pkt, replyFor->command_id);
	else
		set_next_commandID_in_message(pkt);

	set_message_data(pkt, node_json, strlen(node_json));
	return pkt;
}
3772 
/*
 * Create a packet that has only a type and a command id, without payload.
 *
 * When replyFor is given the new packet carries its command id, otherwise
 * the next command id is assigned.
 */
static WDPacketData *
get_minimum_message(char type, WDPacketData * replyFor)
{
	/* TODO it is a waste of space */
	WDPacketData *pkt = get_empty_packet();

	set_message_type(pkt, type);

	if (replyFor != NULL)
		set_message_commandID(pkt, replyFor->command_id);
	else
		set_next_commandID_in_message(pkt);

	return pkt;
}
3785 
get_wd_IPC_command_from_reply(WDPacketData * pkt)3786 static WDCommandData * get_wd_IPC_command_from_reply(WDPacketData * pkt)
3787 {
3788 	return get_wd_command_from_reply(g_cluster.ipc_commands, pkt);
3789 }
get_wd_cluster_command_from_reply(WDPacketData * pkt)3790 static WDCommandData * get_wd_cluster_command_from_reply(WDPacketData * pkt)
3791 {
3792 	return get_wd_command_from_reply(g_cluster.clusterCommands, pkt);
3793 }
3794 
get_wd_command_from_reply(List * commands,WDPacketData * pkt)3795 static WDCommandData * get_wd_command_from_reply(List *commands, WDPacketData * pkt)
3796 {
3797 	ListCell   *lc;
3798 
3799 	if (commands == NULL)
3800 		return NULL;
3801 
3802 	foreach(lc, commands)
3803 	{
3804 		WDCommandData *ipcCommand = lfirst(lc);
3805 
3806 		if (ipcCommand)
3807 		{
3808 			if (ipcCommand->commandPacket.command_id == pkt->command_id)
3809 			{
3810 				ereport(DEBUG1,
3811 						(errmsg("packet %c with command ID %d is reply to the command %c", pkt->type, pkt->command_id,
3812 								ipcCommand->commandPacket.type)));
3813 				return ipcCommand;
3814 			}
3815 		}
3816 	}
3817 	return NULL;
3818 }
3819 
get_wd_IPC_command_from_socket(int sock)3820 static WDCommandData * get_wd_IPC_command_from_socket(int sock)
3821 {
3822 	ListCell   *lc;
3823 
3824 	foreach(lc, g_cluster.ipc_commands)
3825 	{
3826 		WDCommandData *ipcCommand = lfirst(lc);
3827 
3828 		if (ipcCommand)
3829 		{
3830 			if (ipcCommand->commandSource != COMMAND_SOURCE_IPC)
3831 				continue;
3832 
3833 			if (ipcCommand->sourceIPCSocket == sock)
3834 				return ipcCommand;
3835 		}
3836 	}
3837 	return NULL;
3838 }
3839 
3840 
3841 static void
cleanUpIPCCommand(WDCommandData * ipcCommand)3842 cleanUpIPCCommand(WDCommandData * ipcCommand)
3843 {
3844 	/*
3845 	 * close the socket associated with ipcCommand and remove it from
3846 	 * ipcSocket list
3847 	 */
3848 	if (ipcCommand->commandSource == COMMAND_SOURCE_IPC &&
3849 		ipcCommand->sourceIPCSocket > 0)
3850 	{
3851 		close(ipcCommand->sourceIPCSocket);
3852 		g_cluster.ipc_command_socks = list_delete_int(g_cluster.ipc_command_socks, ipcCommand->sourceIPCSocket);
3853 		ipcCommand->sourceIPCSocket = -1;
3854 	}
3855 	/* Now remove the ipcCommand instance from the command list */
3856 	g_cluster.ipc_commands = list_delete_ptr(g_cluster.ipc_commands, ipcCommand);
3857 
3858 	/*
3859 	 * Finally the memory part As everything of IPCCommand live inside its own
3860 	 * memory context. Delete the MemoryContext and we are good
3861 	 */
3862 	MemoryContextDelete(ipcCommand->memoryContext);
3863 }
3864 
/*
 * Build the reply packet for a data request received from wdNode.
 *
 * The request payload is JSON naming the requested data.  Currently only
 * WD_DATE_REQ_PG_BACKEND_DATA is served, answered with the backend node
 * status JSON of the local node in a WD_DATA_MESSAGE packet.
 *
 * Returns a newly allocated reply packet carrying the command id of pkt.
 * A WD_ERROR_MESSAGE packet is returned when the request has no payload,
 * cannot be parsed, asks for an unknown data type, or no data could be
 * produced.
 */
static WDPacketData *
process_data_request(WatchdogNode * wdNode, WDPacketData * pkt)
{
	char	   *request_type;
	char	   *data = NULL;
	WDPacketData *replyPkt = NULL;

	if (pkt->data == NULL || pkt->len <= 0)
	{
		ereport(WARNING,
				(errmsg("invalid data request packet from watchdog node \"%s\"", wdNode->nodeName),
				 errdetail("no data found in the packet")));

		return get_minimum_message(WD_ERROR_MESSAGE, pkt);
	}

	if (!parse_data_request_json(pkt->data, pkt->len, &request_type))
	{
		/* the packet has a payload, but it is not a usable request */
		ereport(WARNING,
				(errmsg("invalid data request packet from watchdog node \"%s\"", wdNode->nodeName),
				 errdetail("failed to parse the request type from the packet data")));

		return get_minimum_message(WD_ERROR_MESSAGE, pkt);
	}

	if (strcasecmp(request_type, WD_DATE_REQ_PG_BACKEND_DATA) == 0)
	{
		data = get_backend_node_status_json(g_cluster.localNode);
	}

	if (data)
	{
		replyPkt = get_empty_packet();
		set_message_type(replyPkt, WD_DATA_MESSAGE);
		set_message_commandID(replyPkt, pkt->command_id);
		set_message_data(replyPkt, data, strlen(data));
	}
	else
	{
		/* unknown request type or no data available */
		replyPkt = get_minimum_message(WD_ERROR_MESSAGE, pkt);
	}

	return replyPkt;
}
3910 
/*
 * Handle a WD_CLUSTER_SERVICE_MESSAGE packet received from wdNode.
 *
 * Packets of any other type are ignored.  The payload must be exactly one
 * byte, which selects the cluster service sub-message handled by the switch
 * below; anything else is logged as invalid and dropped.  Unknown
 * sub-message codes are silently ignored.
 */
static void
cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt)
{
	if (pkt->type != WD_CLUSTER_SERVICE_MESSAGE)
		return;

	/* the sub-message code is the single payload byte */
	if (pkt->len != 1 || pkt->data == NULL)
	{
		ereport(LOG,
				(errmsg("node \"%s\" sent an invalid cluster service message", wdNode->nodeName)));
		return;
	}

	switch (pkt->data[0])
	{
		case CLUSTER_IAM_TRUE_LEADER:
			{
				/*
				 * The cluster was in split-brain and remote node thinks it is
				 * the worthy leader
				 */
				if (get_local_node_state() == WD_COORDINATOR)
				{
					ereport(LOG,
							(errmsg("remote node \"%s\" decided it is the true leader", wdNode->nodeName),
							 errdetail("re-initializing the local watchdog cluster state because of split-brain")));

					/* announce the resignation, then go back to joining */
					send_cluster_service_message(NULL, pkt, CLUSTER_IAM_RESIGNING_FROM_LEADER);
					set_state(WD_JOINING);
				}
				else if (WD_LEADER_NODE != NULL && WD_LEADER_NODE != wdNode)
				{
					/* our recorded leader is a third node: ask for an election */
					ereport(LOG,
							(errmsg("remote node \"%s\" thinks it is a leader/coordinator and I am causing the split-brain," \
									" but as per our record \"%s\" is the cluster leader/coordinator",
									wdNode->nodeName,
									WD_LEADER_NODE->nodeName),
							 errdetail("restarting the cluster")));
					send_cluster_service_message(NULL, pkt, CLUSTER_NEEDS_ELECTION);
					set_state(WD_JOINING);
				}
			}
			break;

		case CLUSTER_IAM_RESIGNING_FROM_LEADER:
			{
				/* only react when the resigning node is our recorded leader */
				if (WD_LEADER_NODE == wdNode)
				{
					ereport(LOG,
							(errmsg("leader/coordinator node \"%s\" decided to resigning from leader, probably because of split-brain",
									wdNode->nodeName),
							 errdetail("re-initializing the local watchdog cluster state")));

					set_state(WD_JOINING);
				}
				else
				{
					ereport(LOG,
							(errmsg("leader/coordinator node \"%s\" decided to resign from leader, probably because of split-brain",
									wdNode->nodeName),
							 errdetail("It was not our coordinator/leader anyway. ignoring the message")));
				}
			}
			break;

		case CLUSTER_IN_SPLIT_BRAIN:
			{
				/*
				 * Retry the unreachable nodes first; when the local node is
				 * the coordinator it additionally sends out the
				 * WD_IAM_COORDINATOR_MESSAGE beacon.
				 */
				try_connecting_with_all_unreachable_nodes();
				if (get_local_node_state() == WD_COORDINATOR)
				{
					ereport(LOG,
							(errmsg("remote node \"%s\" detected the cluster is in split-brain", wdNode->nodeName),
							 errdetail("broadcasting the beacon message")));
					send_message_of_type(NULL, WD_IAM_COORDINATOR_MESSAGE, NULL);
				}
			}
			break;

		case CLUSTER_NEEDS_ELECTION:
			{
				/* unconditionally go back to the joining state */
				ereport(LOG,
						(errmsg("remote node \"%s\" detected the problem and asking us to rejoin the cluster", wdNode->nodeName)));

				set_state(WD_JOINING);
			}
			break;

		case CLUSTER_IAM_NOT_TRUE_LEADER:
			{
				/* state only changes when the sender was our recorded leader */
				if (WD_LEADER_NODE == wdNode)
				{
					ereport(LOG,
							(errmsg("leader/coordinator node \"%s\" decided it was not true leader, probably because of split-brain", wdNode->nodeName),
							 errdetail("re-initializing the local watchdog cluster state")));

					set_state(WD_JOINING);
				}
				else if (get_local_node_state() == WD_COORDINATOR)
				{
					ereport(LOG,
							(errmsg("node \"%s\" was also thinking it was a leader/coordinator and decided to resign", wdNode->nodeName),
							 errdetail("cluster is recovering from split-brain")));
				}
				else
				{
					ereport(LOG,
							(errmsg("leader/coordinator node \"%s\" decided to resign from leader, probably because of split-brain",
									wdNode->nodeName),
							 errdetail("but it was not our coordinator/leader anyway. ignoring the message")));
				}
			}
			break;

		case CLUSTER_NODE_REQUIRE_TO_RELOAD:
		{
			/* hand the reload request over to the state machine */
			watchdog_state_machine(WD_EVENT_WD_STATE_REQUIRE_RELOAD, NULL, NULL, NULL);
		}
			break;

		case CLUSTER_NODE_APPEARING_LOST:
		{
			/* remember that this remote node lost us, then raise the event */
			ereport(LOG,
				(errmsg("remote node \"%s\" is reporting that it has lost us",
							wdNode->nodeName)));
			wdNode->has_lost_us = true;
			watchdog_state_machine(WD_EVENT_I_AM_APPEARING_LOST, wdNode, NULL, NULL);
		}
			break;

		case CLUSTER_NODE_APPEARING_FOUND:
		{
			/* clear the "lost us" flag of this remote node, then raise the event */
			ereport(LOG,
				(errmsg("remote node \"%s\" is reporting that it has found us again",
							wdNode->nodeName)));
			wdNode->has_lost_us = false;
			watchdog_state_machine(WD_EVENT_I_AM_APPEARING_FOUND, wdNode, NULL, NULL);
		}
			break;

		case CLUSTER_NODE_INVALID_VERSION:
			{
				/*
				 * this should never happen means something is seriously wrong
				 */
				ereport(FATAL,
						(return_code(POOL_EXIT_FATAL),
						 errmsg("\"%s\" node has found serious issues in our watchdog messages",
								wdNode->nodeName),
						 errdetail("shutting down")));
			}
			break;
		default:
			break;
	}
}
4066 
4067 static void
wd_execute_cluster_command_processor(WatchdogNode * wdNode,WDPacketData * pkt)4068 wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt)
4069 {
4070 	/* get the json for node list */
4071 	char 	*clusterCommand = NULL;
4072 	List 	*args_list = NULL;
4073 
4074 	if (pkt->type != WD_EXECUTE_COMMAND_REQUEST)
4075 		return;
4076 
4077 	if (pkt->len <= 0 || pkt->data == NULL)
4078 	{
4079 		ereport(LOG,
4080 				(errmsg("node \"%s\" sent an empty execute cluster command message", wdNode->nodeName)));
4081 		return;
4082 	}
4083 
4084 	if (!parse_wd_exec_cluster_command_json(pkt->data, pkt->len,
4085 									   &clusterCommand, &args_list))
4086 	{
4087 		ereport(LOG,
4088 				(errmsg("node \"%s\" sent an invalid JSON data in cluster command message", wdNode->nodeName)));
4089 		return;
4090 	}
4091 
4092 	ereport(DEBUG1,
4093 			(errmsg("received \"%s\" command from node \"%s\"",clusterCommand, wdNode->nodeName)));
4094 	if (strcasecmp(WD_COMMAND_SHUTDOWN_CLUSTER, clusterCommand) == 0)
4095 	{
4096 		char mode = 's';
4097 		ListCell   *lc;
4098 		foreach(lc, args_list)
4099 		{
4100 			WDExecCommandArg *wdExecCommandArg = lfirst(lc);
4101 			if (strcmp(wdExecCommandArg->arg_name, "mode") == 0)
4102 			{
4103 				mode = wdExecCommandArg->arg_value[0];
4104 			}
4105 			else
4106 				ereport(LOG,
4107 						(errmsg("unsupported argument \"%s\" in shutdown command from remote node \"%s\"", wdExecCommandArg->arg_name, wdNode->nodeName)));
4108 		}
4109 
4110 		ereport(LOG,
4111 				(errmsg("processing shutdown command from remote node \"%s\"", wdNode->nodeName)));
4112 		terminate_pgpool(mode, false);
4113 	}
4114 	else if (strcasecmp(WD_COMMAND_RELOAD_CONFIG_CLUSTER, clusterCommand) == 0)
4115 	{
4116 		ereport(LOG,
4117 				(errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName)));
4118 		pool_signal_parent(SIGHUP);
4119 	}
4120 	else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
4121 	{
4122 		int lock_type = -1;
4123 		char *operation = NULL;
4124 		if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR)
4125 		{
4126 			if (list_length(args_list) == 2)
4127 			{
4128 				ListCell   *lc;
4129 				foreach(lc, args_list)
4130 				{
4131 					WDExecCommandArg *wdExecCommandArg = lfirst(lc);
4132 					if (strcmp(wdExecCommandArg->arg_name, "StandbyLockType") == 0)
4133 					{
4134 						lock_type = atoi(wdExecCommandArg->arg_value);
4135 					}
4136 					else if (strcmp(wdExecCommandArg->arg_name, "LockingOperation") == 0)
4137 					{
4138 						operation = wdExecCommandArg->arg_value;
4139 					}
4140 					else
4141 						ereport(LOG,
4142 								(errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg->arg_name, wdNode->nodeName)));
4143 				}
4144 				if (lock_type < 0 || operation == NULL)
4145 				{
4146 					ereport(LOG,
4147 							(errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName),
4148 							 errdetail("command ignored")));
4149 				}
4150 				else if (lock_type == WD_FOLLOW_PRIMARY_LOCK)
4151 				{
4152 					ereport(LOG,
4153 							(errmsg("processing follow primary looking[%s] request from remote node \"%s\"", operation,wdNode->nodeName)));
4154 
4155 					if (strcasecmp("acquire", operation) == 0)
4156 						pool_acquire_follow_primary_lock(false, true);
4157 					else if (strcasecmp("release", operation) == 0)
4158 						pool_release_follow_primary_lock(true);
4159 					else
4160 						ereport(LOG,
4161 								(errmsg("invalid looking operaition[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName),
4162 								 errdetail("command ignored")));
4163 				}
4164 				else
4165 					ereport(LOG,
4166 							(errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName)));
4167 
4168 			}
4169 			else
4170 			{
4171 				ereport(LOG,
4172 						(errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"",  wdNode->nodeName)));
4173 			}
4174 		}
4175 		else if (get_local_node_state() != WD_STANDBY)
4176 		{
4177 			ereport(LOG,
4178 					(errmsg("invalid node state to execute 'LOCK ON STANDBY' command")));
4179 
4180 		}
4181 		else
4182 		{
4183 			ereport(LOG,
4184 					(errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"),
4185 					 errdetail("ignoring...")));
4186 		}
4187 	}
4188 	else
4189 	{
4190 		ereport(WARNING,
4191 				(errmsg("received \"%s\" command from node \"%s\" is not supported",clusterCommand, wdNode->nodeName)));
4192 	}
4193 
4194 	if (args_list)
4195 		list_free_deep(args_list);
4196 	pfree(clusterCommand);
4197 	return;
4198 }
4199 
/*
 * Default handling of a packet received from the remote node wdNode.
 *
 * Dispatches on the packet type.  Some types only trigger local actions,
 * others produce a reply packet; when a reply was built it is sent back to
 * wdNode with send_message_to_node() and then released with free_packet().
 * Packet types not listed in the switch are ignored.
 *
 * Always returns 1.
 */
static int
standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
{
	WDPacketData *replyPkt = NULL;

	switch (pkt->type)
	{
		case WD_FAILOVER_WAITING_FOR_CONSENSUS:
			ereport(LOG,
					(errmsg("remote node \"%s\" is asking to inform about quarantined backend nodes", wdNode->nodeName)));
			register_inform_quarantine_nodes_req();
			break;

		case WD_EXECUTE_COMMAND_REQUEST:
			wd_execute_cluster_command_processor(wdNode, pkt);
			break;

		case WD_CLUSTER_SERVICE_MESSAGE:
			cluster_service_message_processor(wdNode, pkt);
			break;

		case WD_GET_LEADER_DATA_REQUEST:
			replyPkt = process_data_request(wdNode, pkt);
			break;

		case WD_ASK_FOR_POOL_CONFIG:
			{
				/* reply with the pool config JSON, or an error packet if none */
				char	   *config_data = get_pool_config_json();

				if (config_data)
				{
					replyPkt = get_empty_packet();
					set_message_type(replyPkt, WD_POOL_CONFIG_DATA);
					set_message_commandID(replyPkt, pkt->command_id);
					set_message_data(replyPkt, config_data, strlen(config_data));
				}
				else
				{
					replyPkt = get_minimum_message(WD_ERROR_MESSAGE, pkt);

				}
			}
			break;

		case WD_POOL_CONFIG_DATA:
			{
				/* only accept config data if I am the coordinator node */
				if (get_local_node_state() == WD_COORDINATOR && pkt->data)
				{
					POOL_CONFIG *standby_config = get_pool_config_from_json(pkt->data, pkt->len);

					if (standby_config)
					{
						verify_pool_configurations(wdNode, standby_config);
					}
				}
			}
			break;

		case WD_ADD_NODE_MESSAGE:
		case WD_REQ_INFO_MESSAGE:
			/* both are answered with the local node's info message */
			replyPkt = get_mynode_info_message(pkt);
			break;

		case WD_INFO_MESSAGE:
			{
				char	   *authkey = NULL;
				int			oldQuorumStatus;
				WD_STATES	oldNodeState;
				WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);

				if (tempNode == NULL)
				{
					ereport(WARNING,
							(errmsg("node \"%s\" sent an invalid node info message", wdNode->nodeName)));
					send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_INVALID_VERSION);
					break;
				}

				/*
				 * Remember the previous quorum status and state, they are
				 * compared against the new values further down, then copy
				 * the received values into our record of the node.
				 */
				oldQuorumStatus = wdNode->quorum_status;
				oldNodeState = wdNode->state;
				wdNode->state = tempNode->state;
				wdNode->startup_time.tv_sec = tempNode->startup_time.tv_sec;
				wdNode->wd_priority = tempNode->wd_priority;
				strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);

				wdNode->current_state_time.tv_sec = tempNode->current_state_time.tv_sec;
				wdNode->escalated = tempNode->escalated;
				wdNode->standby_nodes_count = tempNode->standby_nodes_count;
				wdNode->quorum_status = tempNode->quorum_status;

				print_watchdog_node_info(wdNode);

				/* the auth key returned by the parser is not used here */
				if (authkey)
					pfree(authkey);

				if (wdNode->state == WD_COORDINATOR)
				{
					if (WD_LEADER_NODE == NULL)
					{
						/* no leader on record yet: adopt the sender */
						set_cluster_leader_node(wdNode);
					}
					else if (WD_LEADER_NODE != wdNode)
					{
						ereport(LOG,
								(errmsg("\"%s\" is the coordinator as per our record but \"%s\" is also announcing as a coordinator",
										WD_LEADER_NODE->nodeName, wdNode->nodeName),
								 errdetail("cluster is in the split-brain")));

						if (get_local_node_state() != WD_COORDINATOR)
						{
							/*
							 * This fight doesn't belong to me broadcast the
							 * message about cluster in split-brain
							 */

							send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
						}
						else
						{
							/*
							 * okay the contention is between me and the other
							 * node try to figure out which node is the worthy
							 * leader
							 */
							ereport(LOG,
									(errmsg("I am the coordinator but \"%s\" is also announcing as a coordinator", wdNode->nodeName),
									 errdetail("trying to figure out the best contender for the leader/coordinator node")));

							handle_split_brain(wdNode, pkt);
						}
					}
					else if (WD_LEADER_NODE == wdNode && oldQuorumStatus != wdNode->quorum_status)
					{
						/* inform Pgpool main about quorum status changes */
						register_watchdog_quorum_change_interrupt();
					}
				}

				/*
				 * if the info message is from leader node. Make sure we are
				 * in sync with the leader node state
				 */
				else if (WD_LEADER_NODE == wdNode)
				{
					if (wdNode->state != WD_COORDINATOR)
					{
						ereport(WARNING,
								(errmsg("the coordinator as per our record is not coordinator anymore"),
								 errdetail("re-initializing the cluster")));
						set_state(WD_JOINING);
					}
				}
				pfree(tempNode);

				/* a node that was a standby and changed state left the cluster */
				if (oldNodeState == WD_STANDBY && wdNode->state != oldNodeState)
				{
					standby_node_left_cluster(wdNode);
				}
				if (oldNodeState == WD_LOST)
				{
					/*
					 * We have received the message from lost node
					 * add it back to cluster if it was not marked by
					 * life-check
					 * Node lost by life-check processes can only be
					 * added back when we get alive notification for the
					 * node from life-check
					 */
					ereport(LOG,
						(errmsg("we have received the NODE INFO message from the node:\"%s\" that was lost",wdNode->nodeName),
						 errdetail("we had lost this node because of \"%s\"",wd_node_lost_reasons[wdNode->node_lost_reason])));

					if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
					{
						ereport(LOG,
							(errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
								 errdetail("only life-check process can mark this node alive again")));
						/* restore the node's lost state */
						wdNode->state = oldNodeState;
					}
					else
					{
						watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
					}
				}
			}
			break;

		case WD_JOIN_COORDINATOR_MESSAGE:
			{
				/*
				 * if I am coordinator reply with accept, otherwise reject
				 */
				if (g_cluster.localNode == WD_LEADER_NODE)
				{
					replyPkt = get_minimum_message(WD_ACCEPT_MESSAGE, pkt);
				}
				else
				{
					replyPkt = get_minimum_message(WD_REJECT_MESSAGE, pkt);
				}
			}
			break;

		case WD_IAM_COORDINATOR_MESSAGE:
			{
				/*
				 * if the message is received from coordinator reply with
				 * info, otherwise reject
				 */
				if (WD_LEADER_NODE != NULL && wdNode != WD_LEADER_NODE)
				{
					ereport(LOG,
							(errmsg("\"%s\" is our coordinator node, but \"%s\" is also announcing as a coordinator",
									WD_LEADER_NODE->nodeName, wdNode->nodeName),
							 errdetail("broadcasting the cluster in split-brain message")));

					send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
				}
				else if (WD_LEADER_NODE != NULL)
				{
					/* beacon from our recorded leader: answer with our info */
					replyPkt = get_mynode_info_message(pkt);
					beacon_message_received_from_node(wdNode, pkt);
				}
				/*
				 * if (WD_LEADER_NODE == NULL)
				 * do not reply to beacon if we are not connected to
				 * any leader node
				 */
			}
			break;

		default:
			break;
	}
	/* deliver the reply, if one was produced, and release it */
	if (replyPkt)
	{
		if (send_message_to_node(wdNode, replyPkt) == false)
			ereport(LOG,
					(errmsg("sending packet to node \"%s\" failed", wdNode->nodeName)));
		free_packet(replyPkt);
	}
	return 1;
}
4444 
4445 
4446 static bool
send_message_to_connection(SocketConnection * conn,WDPacketData * pkt)4447 send_message_to_connection(SocketConnection * conn, WDPacketData * pkt)
4448 {
4449 	if (check_debug_request_kill_all_communication() == true ||
4450 		check_debug_request_kill_all_senders() == true)
4451 		return false;
4452 
4453 	if (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
4454 	{
4455 		if (write_packet_to_socket(conn->sock, pkt, false) == true)
4456 			return true;
4457 		ereport(DEBUG1,
4458 				(errmsg("sending packet failed, closing connection")));
4459 		close_socket_connection(conn);
4460 	}
4461 
4462 	return false;
4463 }
4464 
4465 static bool
send_message_to_node(WatchdogNode * wdNode,WDPacketData * pkt)4466 send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt)
4467 {
4468 	bool		ret;
4469 
4470 	print_packet_node_info(pkt, wdNode, true);
4471 
4472 	ret = send_message_to_connection(&wdNode->client_socket, pkt);
4473 	if (ret == false)
4474 	{
4475 		ret = send_message_to_connection(&wdNode->server_socket, pkt);
4476 	}
4477 	if (ret)
4478 	{
4479 		/* reset the sending error counter */
4480 		wdNode->sending_failures_count = 0;
4481 		/* we only update the last sent time if reply for packet is expected */
4482 		switch (pkt->type)
4483 		{
4484 			case WD_REMOTE_FAILOVER_REQUEST:
4485 			case WD_IPC_FAILOVER_COMMAND:
4486 				if (wdNode->last_sent_time.tv_sec <= 0)
4487 					gettimeofday(&wdNode->last_sent_time, NULL);
4488 				break;
4489 			default:
4490 				break;
4491 		}
4492 	}
4493 	else
4494 	{
4495 		wdNode->sending_failures_count++;
4496 		ereport(DEBUG1,
4497 				(errmsg("sending packet %c to node \"%s\" failed", pkt->type, wdNode->nodeName)));
4498 	}
4499 	return ret;
4500 }
4501 
4502 /*
4503  * If wdNode is NULL message is sent to all nodes
4504  * Returns the number of nodes the message is sent to
4505  */
4506 static int
send_message(WatchdogNode * wdNode,WDPacketData * pkt)4507 send_message(WatchdogNode * wdNode, WDPacketData * pkt)
4508 {
4509 	int			i,
4510 				count = 0;
4511 
4512 	if (wdNode)
4513 	{
4514 		if (wdNode == g_cluster.localNode)	/* Always return 1 if I myself is
4515 											 * intended receiver */
4516 			return 1;
4517 		if (send_message_to_node(wdNode, pkt))
4518 			return 1;
4519 		return 0;
4520 	}
4521 	/* NULL means send to all reachable nodes */
4522 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
4523 	{
4524 		wdNode = &(g_cluster.remoteNodes[i]);
4525 		if (is_node_reachable(wdNode) && send_message_to_node(wdNode, pkt))
4526 			count++;
4527 	}
4528 	return count;
4529 }
4530 
wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand,WatchdogNode * wdLostNode)4531 static IPC_CMD_PROCESS_RES wd_command_processor_for_node_lost_event(WDCommandData * ipcCommand, WatchdogNode * wdLostNode)
4532 {
4533 	if (ipcCommand->sendToNode)
4534 	{
4535 		/* The command was sent to one node only */
4536 		if (ipcCommand->sendToNode == wdLostNode)
4537 		{
4538 			/*
4539 			 * Fail this command, Since the only node it was sent to is lost
4540 			 */
4541 			ipcCommand->commandStatus = COMMAND_FINISHED_SEND_FAILED;
4542 			wd_command_is_complete(ipcCommand);
4543 			return IPC_CMD_ERROR;
4544 		}
4545 		else
4546 		{
4547 			/* Dont worry this command is fine for now */
4548 			return IPC_CMD_PROCESSING;
4549 		}
4550 	}
4551 	else
4552 	{
4553 		/* search the node that is lost */
4554 		int			i;
4555 
4556 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
4557 		{
4558 			WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
4559 
4560 			if (nodeResult->wdNode == wdLostNode)
4561 			{
4562 				if (nodeResult->cmdState == COMMAND_STATE_SENT)
4563 				{
4564 					ereport(LOG,
4565 							(errmsg("remote node \"%s\" lost while IPC command was in progress ", wdLostNode->nodeName)));
4566 
4567 					/*
4568 					 * since the node is lost and will be removed from the
4569 					 * cluster So remove decrement the sent count of command
4570 					 * and see what is the situation after that
4571 					 */
4572 					nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
4573 					ipcCommand->commandSendToCount--;
4574 					if (ipcCommand->commandSendToCount <= ipcCommand->commandReplyFromCount)
4575 					{
4576 						/*
4577 						 * If we have already received the results from all
4578 						 * alive nodes finish the command
4579 						 */
4580 						ipcCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
4581 						wd_command_is_complete(ipcCommand);
4582 						return IPC_CMD_COMPLETE;
4583 					}
4584 				}
4585 				break;
4586 			}
4587 		}
4588 	}
4589 	return IPC_CMD_PROCESSING;
4590 }
4591 
4592 static void
wd_command_is_complete(WDCommandData * ipcCommand)4593 wd_command_is_complete(WDCommandData * ipcCommand)
4594 {
4595 	if (ipcCommand->commandCompleteFunc)
4596 	{
4597 		ipcCommand->commandCompleteFunc(ipcCommand);
4598 		return;
4599 	}
4600 
4601 	/*
4602 	 * There is not special function for this command use the standard reply
4603 	 */
4604 	if (ipcCommand->commandSource == COMMAND_SOURCE_IPC)
4605 	{
4606 		char		res_type;
4607 
4608 		switch (ipcCommand->commandStatus)
4609 		{
4610 			case COMMAND_FINISHED_ALL_REPLIED:
4611 				res_type = WD_IPC_CMD_RESULT_OK;
4612 				break;
4613 			case COMMAND_FINISHED_TIMEOUT:
4614 				res_type = WD_IPC_CMD_TIMEOUT;
4615 				break;
4616 			case COMMAND_FINISHED_NODE_REJECTED:
4617 			case COMMAND_FINISHED_SEND_FAILED:
4618 				res_type = WD_IPC_CMD_RESULT_BAD;
4619 				break;
4620 			default:
4621 				res_type = WD_IPC_CMD_RESULT_OK;
4622 				break;
4623 		}
4624 		write_ipc_command_with_result_data(ipcCommand, res_type, NULL, 0);
4625 	}
4626 	else if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
4627 	{
4628 		char		res_type;
4629 
4630 		if (ipcCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
4631 			res_type = WD_ACCEPT_MESSAGE;
4632 		else
4633 			res_type = WD_REJECT_MESSAGE;
4634 
4635 		reply_with_minimal_message(ipcCommand->sourceWdNode, res_type, &ipcCommand->commandPacket);
4636 	}
4637 }
4638 
4639 
4640 static void
node_lost_while_ipc_command(WatchdogNode * wdNode)4641 node_lost_while_ipc_command(WatchdogNode * wdNode)
4642 {
4643 	List	   *ipcCommands_to_del = NIL;
4644 	ListCell   *lc;
4645 
4646 	foreach(lc, g_cluster.ipc_commands)
4647 	{
4648 		WDCommandData *ipcCommand = lfirst(lc);
4649 		IPC_CMD_PROCESS_RES res = wd_command_processor_for_node_lost_event(ipcCommand, wdNode);
4650 
4651 		if (res != IPC_CMD_PROCESSING)
4652 		{
4653 			ipcCommands_to_del = lappend(ipcCommands_to_del, ipcCommand);
4654 		}
4655 	}
4656 	/* delete completed commands */
4657 	foreach(lc, ipcCommands_to_del)
4658 	{
4659 		WDCommandData *ipcCommand = lfirst(lc);
4660 
4661 		cleanUpIPCCommand(ipcCommand);
4662 	}
4663 
4664 	list_free(ipcCommands_to_del);
4665 }
4666 
4667 
4668 /*
4669  * The function walks through all command and resends
4670  * the failed message again if it can.
4671  */
4672 static void
service_ipc_commands(void)4673 service_ipc_commands(void)
4674 {
4675 	ListCell   *lc;
4676 
4677 	foreach(lc, g_cluster.ipc_commands)
4678 	{
4679 		WDCommandData *ipcCommand = lfirst(lc);
4680 
4681 		if (ipcCommand && ipcCommand->commandSendToErrorCount)
4682 		{
4683 			int			i;
4684 
4685 			for (i = 0; i < g_cluster.remoteNodeCount; i++)
4686 			{
4687 				WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
4688 
4689 				if (nodeResult->cmdState == COMMAND_STATE_SEND_ERROR)
4690 				{
4691 					if (is_node_active_and_reachable(nodeResult->wdNode))
4692 					{
4693 						ereport(LOG,
4694 								(errmsg("remote node \"%s\" is reachable again, resending the command packet ", nodeResult->wdNode->nodeName)));
4695 
4696 						if (send_message_to_node(nodeResult->wdNode, &ipcCommand->commandPacket) == true)
4697 						{
4698 							nodeResult->cmdState = COMMAND_STATE_SENT;
4699 							ipcCommand->commandSendToErrorCount--;
4700 							ipcCommand->commandSendToCount++;
4701 							if (ipcCommand->commandSendToErrorCount == 0)
4702 								break;
4703 						}
4704 					}
4705 				}
4706 			}
4707 		}
4708 	}
4709 }
4710 
4711 static void
service_internal_command(void)4712 service_internal_command(void)
4713 {
4714 	int			i;
4715 	ListCell   *lc;
4716 	List	   *finishedCommands = NULL;
4717 
4718 	if (g_cluster.clusterCommands == NULL)
4719 		return;
4720 
4721 	foreach(lc, g_cluster.clusterCommands)
4722 	{
4723 		WDCommandData *clusterCommand = lfirst(lc);
4724 
4725 		if (clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4726 		{
4727 			/* command needs to be cleaned up */
4728 			finishedCommands = lappend(finishedCommands, clusterCommand);
4729 			continue;
4730 		}
4731 
4732 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
4733 		{
4734 			WDCommandNodeResult *nodeResult = &clusterCommand->nodeResults[i];
4735 
4736 			if (nodeResult->cmdState == COMMAND_STATE_SEND_ERROR)
4737 			{
4738 				if (is_node_active_and_reachable(nodeResult->wdNode))
4739 				{
4740 					if (send_message_to_node(nodeResult->wdNode, &clusterCommand->commandPacket) == true)
4741 					{
4742 						nodeResult->cmdState = COMMAND_STATE_SENT;
4743 						clusterCommand->commandSendToCount++;
4744 					}
4745 				}
4746 			}
4747 		}
4748 	}
4749 	/* delete the finished commands */
4750 	foreach(lc, finishedCommands)
4751 	{
4752 		WDCommandData *clusterCommand = lfirst(lc);
4753 
4754 		g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4755 		MemoryContextDelete(clusterCommand->memoryContext);
4756 	}
4757 
4758 	list_free(finishedCommands);
4759 }
4760 
4761 /* remove the unreachable nodes from cluster */
4762 static void
service_unreachable_nodes(void)4763 service_unreachable_nodes(void)
4764 {
4765 	int			i;
4766 	struct timeval currTime;
4767 
4768 	gettimeofday(&currTime, NULL);
4769 
4770 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
4771 	{
4772 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
4773 
4774 		if (wdNode->state == WD_LOST && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
4775 			&& pool_config->wd_lost_node_removal_timeout)
4776 		{
4777 			int lost_seconds = WD_TIME_DIFF_SEC(currTime, wdNode->lost_time);
4778 			if (lost_seconds >= pool_config->wd_lost_node_removal_timeout)
4779 			{
4780 				ereport(LOG,
4781 						(errmsg("remote node \"%s\" is lost for %d seconds", wdNode->nodeName,lost_seconds),
4782 						 errdetail("revoking the node's membership")));
4783 				revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_LOST);
4784 			}
4785 			continue;
4786 		}
4787 
4788 		if (wdNode->state == WD_DEAD && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
4789 			&& pool_config->wd_initial_node_showup_time)
4790 		{
4791 			int no_show_seconds = WD_TIME_DIFF_SEC(currTime, g_cluster.localNode->startup_time);
4792 			if (no_show_seconds >= pool_config->wd_initial_node_showup_time)
4793 			{
4794 				ereport(LOG,
4795 						(errmsg("remote node \"%s\" didn't showed-up in %d seconds", wdNode->nodeName,no_show_seconds),
4796 						 errdetail("revoking the node's membership")));
4797 				revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_NO_SHOW);
4798 			}
4799 			continue;
4800 		}
4801 
4802 		if (is_node_active(wdNode) == false)
4803 			continue;
4804 
4805 		if (is_node_reachable(wdNode) || wdNode->client_socket.sock_state == WD_SOCK_WAITING_FOR_CONNECT)
4806 		{
4807 			/* check if we are waiting for reply from this node */
4808 			if (wdNode->last_sent_time.tv_sec > 0)
4809 			{
4810 				if (WD_TIME_DIFF_SEC(currTime, wdNode->last_sent_time) >= MAX_SECS_WAIT_FOR_REPLY_FROM_NODE)
4811 				{
4812 					ereport(LOG,
4813 							(errmsg("remote node \"%s\" is not replying..", wdNode->nodeName),
4814 							 errdetail("marking the node as lost")));
4815 					/* mark the node as lost */
4816 					wdNode->node_lost_reason = NODE_LOST_BY_RECEIVE_TIMEOUT;
4817 					watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4818 				}
4819 			}
4820 			else if (wdNode->sending_failures_count > MAX_ALLOWED_SEND_FAILURES)
4821 			{
4822 				ereport(LOG,
4823 						(errmsg("not able to send messages to remote node \"%s\"",wdNode->nodeName),
4824 						 errdetail("marking the node as lost")));
4825 				/* mark the node as lost */
4826 				wdNode->node_lost_reason = NODE_LOST_BY_SEND_FAILURE;
4827 				watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4828 			}
4829 			else if (wdNode->missed_beacon_count > MAX_ALLOWED_BEACON_REPLY_MISS)
4830 			{
4831 				ereport(LOG,
4832 						(errmsg("remote node \"%s\" is not responding to our beacon messages",wdNode->nodeName),
4833 						 errdetail("marking the node as lost")));
4834 				/* mark the node as lost */
4835 				wdNode->node_lost_reason = NODE_LOST_BY_MISSING_BEACON;
4836 				wdNode->missed_beacon_count = 0; /* Reset the counter */
4837 				watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4838 			}
4839 		}
4840 		else
4841 		{
4842 			ereport(LOG,
4843 					(errmsg("remote node \"%s\" is not reachable", wdNode->nodeName),
4844 					 errdetail("marking the node as lost")));
4845 			wdNode->node_lost_reason = NODE_LOST_BY_NOT_REACHABLE;
4846 			watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
4847 		}
4848 	}
4849 }
4850 
4851 static bool
watchdog_internal_command_packet_processor(WatchdogNode * wdNode,WDPacketData * pkt)4852 watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
4853 {
4854 	int			i;
4855 	WDCommandNodeResult *nodeResult = NULL;
4856 	WDCommandData *clusterCommand = get_wd_cluster_command_from_reply(pkt);
4857 
4858 	if (clusterCommand == NULL || clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4859 		return false;
4860 
4861 	if (pkt->type != WD_ERROR_MESSAGE &&
4862 		pkt->type != WD_ACCEPT_MESSAGE &&
4863 		pkt->type != WD_REJECT_MESSAGE &&
4864 		pkt->type != WD_INFO_MESSAGE)
4865 		return false;
4866 
4867 	if (pkt->type == WD_INFO_MESSAGE)
4868 		standard_packet_processor(wdNode, pkt);
4869 
4870 	/* get the result node for */
4871 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
4872 	{
4873 		WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
4874 		if (nodeRes->wdNode == wdNode)
4875 		{
4876 			nodeResult = nodeRes;
4877 			break;
4878 		}
4879 	}
4880 	if (nodeResult == NULL)
4881 	{
4882 		ereport(NOTICE, (errmsg("unable to find node result")));
4883 		return true;
4884 	}
4885 
4886 	ereport(DEBUG1,
4887 			(errmsg("Watchdog node \"%s\" has replied for command id %d", nodeResult->wdNode->nodeName, pkt->command_id)));
4888 
4889 	nodeResult->result_type = pkt->type;
4890 	nodeResult->cmdState = COMMAND_STATE_REPLIED;
4891 	clusterCommand->commandReplyFromCount++;
4892 
4893 	if (clusterCommand->commandReplyFromCount >= clusterCommand->commandSendToCount)
4894 	{
4895 		if (pkt->type == WD_REJECT_MESSAGE || pkt->type == WD_ERROR_MESSAGE)
4896 		{
4897 			ereport(DEBUG1,
4898 					(errmsg("command %c with command id %d is finished with COMMAND_FINISHED_NODE_REJECTED", pkt->type, pkt->command_id)));
4899 			clusterCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
4900 		}
4901 		else
4902 		{
4903 			ereport(DEBUG1,
4904 					(errmsg("command %c with command id %d is finished with COMMAND_FINISHED_ALL_REPLIED", pkt->type, pkt->command_id)));
4905 			clusterCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
4906 		}
4907 		watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, wdNode, pkt, clusterCommand);
4908 		g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4909 		MemoryContextDelete(clusterCommand->memoryContext);
4910 	}
4911 	else if (pkt->type == WD_REJECT_MESSAGE || pkt->type == WD_ERROR_MESSAGE)
4912 	{
4913 		/* Error or reject message by any node immediately finishes the command */
4914 		ereport(DEBUG1,
4915 				(errmsg("command %c with command id %d is finished with COMMAND_FINISHED_NODE_REJECTED", pkt->type, pkt->command_id)));
4916 		clusterCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
4917 		watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, wdNode, pkt, clusterCommand);
4918 		g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4919 		MemoryContextDelete(clusterCommand->memoryContext);
4920 	}
4921 	return true;				/* do not process this packet further */
4922 }
4923 
4924 
4925 static void
check_for_current_command_timeout(void)4926 check_for_current_command_timeout(void)
4927 {
4928 	struct timeval currTime;
4929 
4930 	ListCell   *lc;
4931 	List	   *finishedCommands = NULL;
4932 
4933 	if (g_cluster.clusterCommands == NULL)
4934 		return;
4935 
4936 	gettimeofday(&currTime, NULL);
4937 
4938 	foreach(lc, g_cluster.clusterCommands)
4939 	{
4940 		WDCommandData *clusterCommand = lfirst(lc);
4941 
4942 		if (clusterCommand->commandStatus != COMMAND_IN_PROGRESS)
4943 		{
4944 			/* command needs to be cleaned up */
4945 			finishedCommands = lappend(finishedCommands, clusterCommand);
4946 			continue;
4947 		}
4948 		if (WD_TIME_DIFF_SEC(currTime, clusterCommand->commandTime) >= clusterCommand->commandTimeoutSecs)
4949 		{
4950 			clusterCommand->commandStatus = COMMAND_FINISHED_TIMEOUT;
4951 			watchdog_state_machine(WD_EVENT_COMMAND_FINISHED, NULL, NULL, clusterCommand);
4952 			finishedCommands = lappend(finishedCommands, clusterCommand);
4953 		}
4954 	}
4955 	/* delete the finished commands */
4956 	foreach(lc, finishedCommands)
4957 	{
4958 		WDCommandData *clusterCommand = lfirst(lc);
4959 
4960 		g_cluster.clusterCommands = list_delete_ptr(g_cluster.clusterCommands, clusterCommand);
4961 		MemoryContextDelete(clusterCommand->memoryContext);
4962 	}
4963 
4964 	list_free(finishedCommands);
4965 }
4966 
4967 
4968 /*
4969  * If wdNode is NULL message is sent to all nodes
4970  * Returns the number of nodes the message is sent to
4971  */
4972 static int
issue_watchdog_internal_command(WatchdogNode * wdNode,WDPacketData * pkt,int timeout_sec)4973 issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int timeout_sec)
4974 {
4975 	int			i;
4976 	bool		save_message = false;
4977 	WDCommandData *clusterCommand;
4978 	MemoryContext oldCxt;
4979 
4980 	clusterCommand = create_command_object(0);
4981 
4982 	clusterCommand->commandSource = COMMAND_SOURCE_LOCAL;
4983 	clusterCommand->sourceWdNode = g_cluster.localNode;
4984 	gettimeofday(&clusterCommand->commandTime, NULL);
4985 
4986 	clusterCommand->commandTimeoutSecs = timeout_sec;
4987 	clusterCommand->commandPacket.type = pkt->type;
4988 	clusterCommand->commandPacket.command_id = pkt->command_id;
4989 	clusterCommand->commandPacket.len = 0;
4990 	clusterCommand->commandPacket.data = NULL;
4991 
4992 	clusterCommand->sendToNode = wdNode;
4993 	clusterCommand->commandSendToCount = 0;
4994 	clusterCommand->commandReplyFromCount = 0;
4995 	clusterCommand->commandStatus = COMMAND_IN_PROGRESS;
4996 
4997 	allocate_resultNodes_in_command(clusterCommand);
4998 
4999 	if (wdNode == NULL)			/* This is send to all */
5000 	{
5001 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
5002 		{
5003 			WDCommandNodeResult *nodeResult = &clusterCommand->nodeResults[i];
5004 
5005 			clear_command_node_result(nodeResult);
5006 			if (is_node_active(nodeResult->wdNode) == false)
5007 			{
5008 				ereport(DEBUG2,
5009 						(errmsg("not sending watchdog internal command packet to DEAD %s", nodeResult->wdNode->nodeName)));
5010 				/* Do not send to dead nodes */
5011 				nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
5012 			}
5013 			else
5014 			{
5015 				if (send_message_to_node(nodeResult->wdNode, pkt) == false)
5016 				{
5017 					ereport(DEBUG1,
5018 							(errmsg("failed to send watchdog internal command packet %s", nodeResult->wdNode->nodeName),
5019 							 errdetail("saving the packet. will try to resend it if connection recovers")));
5020 
5021 					/* failed to send. May be try again later */
5022 					save_message = true;
5023 					nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
5024 				}
5025 				else
5026 				{
5027 					nodeResult->cmdState = COMMAND_STATE_SENT;
5028 					clusterCommand->commandSendToCount++;
5029 				}
5030 			}
5031 		}
5032 	}
5033 	if (wdNode)
5034 	{
5035 		WDCommandNodeResult *nodeResult = NULL;
5036 
5037 		for (i = 0; i < g_cluster.remoteNodeCount; i++)
5038 		{
5039 			WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
5040 
5041 			clear_command_node_result(nodeRes);
5042 			if (nodeRes->wdNode == wdNode)
5043 				nodeResult = nodeRes;
5044 		}
5045 		if (nodeResult == NULL)
5046 		{
5047 			/* should never happen */
5048 			ereport(WARNING,
5049 					(errmsg("Internal error. Not able to locate node result slot")));
5050 			MemoryContextDelete(clusterCommand->memoryContext);
5051 			return -1;
5052 		}
5053 		if (send_message_to_node(nodeResult->wdNode, pkt) == false)
5054 		{
5055 			/* failed to send. May be try again later */
5056 			save_message = true;
5057 			nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
5058 		}
5059 		else
5060 		{
5061 			nodeResult->cmdState = COMMAND_STATE_SENT;
5062 			clusterCommand->commandSendToCount++;
5063 		}
5064 	}
5065 	if (save_message && pkt->len > 0)
5066 	{
5067 		clusterCommand->commandPacket.data = MemoryContextAlloc(clusterCommand->memoryContext, pkt->len);
5068 		memcpy(clusterCommand->commandPacket.data, pkt->data, pkt->len);
5069 		clusterCommand->commandPacket.len = pkt->len;
5070 	}
5071 	ereport(DEBUG2,
5072 			(errmsg("new cluster command %c issued with command id %d", pkt->type, pkt->command_id)));
5073 
5074 	oldCxt = MemoryContextSwitchTo(TopMemoryContext);
5075 	g_cluster.clusterCommands = lappend(g_cluster.clusterCommands, clusterCommand);
5076 	MemoryContextSwitchTo(oldCxt);
5077 
5078 	return clusterCommand->commandSendToCount;
5079 }
5080 
5081 /*
5082  * Check remote connections except their state are either WD_SHUTDOWN or
5083  * WD_DEAD. If suncceeded in connecting to any of the remote nodes, returns
5084  * true, otherwise false.
5085  */
5086 static bool
service_lost_connections(void)5087 service_lost_connections(void)
5088 {
5089 	int			i;
5090 	struct timeval currTime;
5091 	bool		ret = false;
5092 
5093 	gettimeofday(&currTime, NULL);
5094 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
5095 	{
5096 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5097 
5098 		if (wdNode->state == WD_SHUTDOWN || wdNode->state == WD_DEAD)
5099 			continue;
5100 
5101 		if (is_socket_connection_connected(&wdNode->client_socket) == false)
5102 		{
5103 			if (WD_TIME_DIFF_SEC(currTime, wdNode->client_socket.tv) <= MIN_SECS_CONNECTION_RETRY)
5104 				continue;
5105 
5106 			if (wdNode->client_socket.sock_state != WD_SOCK_WAITING_FOR_CONNECT)
5107 			{
5108 				connect_to_node(wdNode);
5109 				if (wdNode->client_socket.sock_state == WD_SOCK_CONNECTED)
5110 				{
5111 					ereport(LOG,
5112 							(errmsg("connection to the remote node \"%s\" is restored", wdNode->nodeName)));
5113 					watchdog_state_machine(WD_EVENT_NEW_OUTBOUND_CONNECTION, wdNode, NULL, NULL);
5114 					ret = true;
5115 				}
5116 			}
5117 		}
5118 	}
5119 	return ret;
5120 }
5121 
5122 /*
5123  * The function only considers the node state.
5124  * All node states count towards the cluster participating nodes
5125  * except the dead and lost nodes.
5126  */
5127 static int
get_cluster_node_count(void)5128 get_cluster_node_count(void)
5129 {
5130 	int			i;
5131 	int			count = 0;
5132 
5133 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
5134 	{
5135 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5136 
5137 		if (wdNode->state == WD_DEAD || wdNode->state == WD_LOST || wdNode->state == WD_SHUTDOWN)
5138 			continue;
5139 		count++;
5140 	}
5141 	return count;
5142 }
5143 
/*
 * Build the packet for the given message type.
 *
 * type:     message type of the packet to create.
 * replyFor: packet this message replies to; passed on to the message
 *           builders that take it.
 *
 * Returns the packet produced by the matching builder, or NULL (after
 * logging) when the type is not one this function can build.
 */
static WDPacketData * get_message_of_type(char type, WDPacketData * replyFor)
{
	switch (type)
	{
			/* messages with a dedicated builder */
		case WD_INFO_MESSAGE:
			return get_mynode_info_message(replyFor);

		case WD_ADD_NODE_MESSAGE:
			return get_addnode_message();

		case WD_IAM_COORDINATOR_MESSAGE:
			return get_beacon_message(WD_IAM_COORDINATOR_MESSAGE, replyFor);

			/* messages built by the minimum message builder */
		case WD_FAILOVER_START:
		case WD_FAILOVER_END:
		case WD_REQ_INFO_MESSAGE:
		case WD_STAND_FOR_COORDINATOR_MESSAGE:
		case WD_DECLARE_COORDINATOR_MESSAGE:
		case WD_JOIN_COORDINATOR_MESSAGE:
		case WD_QUORUM_IS_LOST:
		case WD_INFORM_I_AM_GOING_DOWN:
		case WD_ASK_FOR_POOL_CONFIG:
		case WD_FAILOVER_WAITING_FOR_CONSENSUS:
			return get_minimum_message(type, replyFor);

		default:
			break;
	}

	ereport(LOG, (errmsg("invalid message type %c", type)));
	return NULL;
}
5178 
5179 static int
send_message_of_type(WatchdogNode * wdNode,char type,WDPacketData * replyFor)5180 send_message_of_type(WatchdogNode * wdNode, char type, WDPacketData * replyFor)
5181 {
5182 	int			ret = -1;
5183 	WDPacketData *pkt = get_message_of_type(type, replyFor);
5184 
5185 	if (pkt)
5186 	{
5187 		ret = send_message(wdNode, pkt);
5188 		free_packet(pkt);
5189 	}
5190 	return ret;
5191 }
5192 
5193 static int
send_cluster_command(WatchdogNode * wdNode,char type,int timeout_sec)5194 send_cluster_command(WatchdogNode * wdNode, char type, int timeout_sec)
5195 {
5196 	int			ret = -1;
5197 	WDPacketData *pkt = get_message_of_type(type, NULL);
5198 
5199 	if (pkt)
5200 	{
5201 		ret = issue_watchdog_internal_command(wdNode, pkt, timeout_sec);
5202 		free_packet(pkt);
5203 	}
5204 	return ret;
5205 }
5206 
5207 static bool
reply_with_minimal_message(WatchdogNode * wdNode,char type,WDPacketData * replyFor)5208 reply_with_minimal_message(WatchdogNode * wdNode, char type, WDPacketData * replyFor)
5209 {
5210 	WDPacketData *pkt = get_minimum_message(type, replyFor);
5211 	int			ret = send_message(wdNode, pkt);
5212 
5213 	free_packet(pkt);
5214 	return ret;
5215 }
5216 
5217 static bool
send_cluster_service_message(WatchdogNode * wdNode,WDPacketData * replyFor,char message)5218 send_cluster_service_message(WatchdogNode * wdNode, WDPacketData * replyFor, char message)
5219 {
5220 	/* Check if its a broadcast message */
5221 	if (wdNode == NULL)
5222 	{
5223 		/* see if we have already broadcasted the similar message recently */
5224 		if (message == g_cluster.last_bcast_srv_msg)
5225 		{
5226 			struct timeval currTime;
5227 			gettimeofday(&currTime, NULL);
5228 			int	 last_bcast_sec = WD_TIME_DIFF_SEC(currTime, g_cluster.last_bcast_srv_msg_time);
5229 			if (last_bcast_sec < MIN_SECS_BETWEEN_BROADCAST_SRV_MSG)
5230 			{
5231 				/*
5232 				 * do not broadcast this message
5233 				 * to prevent flooding
5234 				 */
5235 				ereport(DEBUG4,
5236 						(errmsg("not broadcasting cluster service message %c to prevent flooding ",message),
5237 						 errdetail("last time same message was sent %d seconds ago",last_bcast_sec)));
5238 				return true;
5239 			}
5240 		}
5241 		g_cluster.last_bcast_srv_msg = message;
5242 		gettimeofday(&g_cluster.last_bcast_srv_msg_time, NULL);
5243 	}
5244 	return reply_with_message(wdNode, WD_CLUSTER_SERVICE_MESSAGE, &message, 1, replyFor);
5245 }
5246 
5247 
5248 static bool
reply_with_message(WatchdogNode * wdNode,char type,char * data,int data_len,WDPacketData * replyFor)5249 reply_with_message(WatchdogNode * wdNode, char type, char *data, int data_len, WDPacketData * replyFor)
5250 {
5251 	WDPacketData wdPacket;
5252 	int			ret;
5253 
5254 	init_wd_packet(&wdPacket);
5255 	set_message_type(&wdPacket, type);
5256 
5257 	if (replyFor == NULL)
5258 		set_next_commandID_in_message(&wdPacket);
5259 	else
5260 		set_message_commandID(&wdPacket, replyFor->command_id);
5261 
5262 	set_message_data(&wdPacket, data, data_len);
5263 	ret = send_message(wdNode, &wdPacket);
5264 	return ret;
5265 }
5266 
get_local_node_state(void)5267 static inline WD_STATES get_local_node_state(void)
5268 {
5269 	return g_cluster.localNode->state;
5270 }
5271 
5272 static inline bool
is_local_node_true_leader(void)5273 is_local_node_true_leader(void)
5274 {
5275 	return (get_local_node_state() == WD_COORDINATOR && WD_LEADER_NODE == g_cluster.localNode);
5276 }
5277 
5278 /*
5279  * returns true if no message is swallowed by the
5280  * processor and no further action is required
5281  */
5282 static bool
wd_commands_packet_processor(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt)5283 wd_commands_packet_processor(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt)
5284 {
5285 	WDCommandData *ipcCommand;
5286 
5287 	if (event != WD_EVENT_PACKET_RCV)
5288 		return false;
5289 	if (pkt == NULL)
5290 		return false;
5291 
5292 	if (pkt->type == WD_FAILOVER_LOCKING_REQUEST ||
5293 		pkt->type == WD_REMOTE_FAILOVER_REQUEST)
5294 	{
5295 		/* Node is using the older version of Pgpool-II */
5296 		ereport(WARNING,
5297 				(errmsg("node \"%s\" is using the older version of Pgpool-II", wdNode->nodeName)));
5298 		send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_INVALID_VERSION);
5299 		return true;
5300 	}
5301 
5302 	if (pkt->type == WD_IPC_FAILOVER_COMMAND)
5303 	{
5304 		process_remote_failover_command_on_coordinator(wdNode, pkt);
5305 		return true;
5306 	}
5307 
5308 	if (pkt->type == WD_IPC_ONLINE_RECOVERY_COMMAND)
5309 	{
5310 		process_remote_online_recovery_command(wdNode, pkt);
5311 		return true;
5312 	}
5313 
5314 	if (pkt->type == WD_DATA_MESSAGE)
5315 	{
5316 		ipcCommand = get_wd_IPC_command_from_reply(pkt);
5317 		if (ipcCommand)
5318 		{
5319 			if (write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK, pkt->data, pkt->len) == false)
5320 				ereport(LOG,
5321 						(errmsg("failed to forward data message to IPC command socket")));
5322 
5323 			cleanUpIPCCommand(ipcCommand);
5324 			return true;		/* do not process this packet further */
5325 		}
5326 		return false;
5327 	}
5328 
5329 	if (pkt->type == WD_CMD_REPLY_IN_DATA)
5330 	{
5331 		ipcCommand = get_wd_IPC_command_from_reply(pkt);
5332 		if (ipcCommand == NULL)
5333 			return false;
5334 
5335 		/* Just forward the data to IPC socket and finish the command */
5336 		if (write_ipc_command_with_result_data(ipcCommand, WD_IPC_CMD_RESULT_OK, pkt->data, pkt->len) == false)
5337 			ereport(LOG,
5338 					(errmsg("failed to forward data message to IPC command socket")));
5339 
5340 		/*
5341 		 * ok we are done, delete this command
5342 		 */
5343 		cleanUpIPCCommand(ipcCommand);
5344 		return true;			/* do not process this packet further */
5345 	}
5346 
5347 	else if (pkt->type == WD_ACCEPT_MESSAGE ||
5348 			 pkt->type == WD_REJECT_MESSAGE ||
5349 			 pkt->type == WD_ERROR_MESSAGE)
5350 	{
5351 		ipcCommand = get_wd_IPC_command_from_reply(pkt);
5352 
5353 		if (ipcCommand == NULL)
5354 			return false;
5355 
5356 		if (ipcCommand->commandPacket.type == WD_IPC_FAILOVER_COMMAND)
5357 		{
5358 			if (pkt->type == WD_ACCEPT_MESSAGE)
5359 				reply_to_failover_command(ipcCommand, FAILOVER_RES_PROCEED, 0);
5360 			else
5361 				reply_to_failover_command(ipcCommand, FAILOVER_RES_LEADER_REJECTED, 0);
5362 			return true;
5363 		}
5364 
5365 		else if (ipcCommand->commandPacket.type == WD_IPC_ONLINE_RECOVERY_COMMAND)
5366 		{
5367 			return reply_is_received_for_pgpool_replicate_command(wdNode, pkt, ipcCommand);
5368 		}
5369 	}
5370 
5371 	return false;
5372 }
5373 
5374 
5375 static void
update_interface_status(void)5376 update_interface_status(void)
5377 {
5378 	struct ifaddrs *ifAddrStruct = NULL;
5379 	struct ifaddrs *ifa = NULL;
5380 	ListCell   *lc;
5381 
5382 	if (g_cluster.wdInterfaceToMonitor == NULL)
5383 		return;
5384 
5385 	getifaddrs(&ifAddrStruct);
5386 	for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next)
5387 	{
5388 		ereport(DEBUG1,
5389 				(errmsg("network interface %s having flags %d", ifa->ifa_name, ifa->ifa_flags)));
5390 
5391 		if (!strncasecmp("lo", ifa->ifa_name, 2))
5392 			continue;			/* We do not need loop back addresses */
5393 
5394 		foreach(lc, g_cluster.wdInterfaceToMonitor)
5395 		{
5396 			WDInterfaceStatus *if_status = lfirst(lc);
5397 
5398 			if (!strcasecmp(if_status->if_name, ifa->ifa_name))
5399 			{
5400 				if_status->if_up = is_interface_up(ifa);
5401 				break;
5402 			}
5403 		}
5404 	}
5405 
5406 	if (ifAddrStruct != NULL)
5407 		freeifaddrs(ifAddrStruct);
5408 
5409 }
5410 
5411 static bool
any_interface_available(void)5412 any_interface_available(void)
5413 {
5414 	ListCell   *lc;
5415 
5416 	update_interface_status();
5417 	/* if interface monitoring is disabled we are good */
5418 	if (g_cluster.wdInterfaceToMonitor == NULL)
5419 		return true;
5420 
5421 	foreach(lc, g_cluster.wdInterfaceToMonitor)
5422 	{
5423 		WDInterfaceStatus *if_status = lfirst(lc);
5424 
5425 		if (if_status->if_up)
5426 		{
5427 			ereport(DEBUG1,
5428 					(errmsg("network interface \"%s\" is up and we can continue", if_status->if_name)));
5429 			return true;
5430 		}
5431 	}
5432 	return false;
5433 }
5434 
5435 static int
watchdog_state_machine(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5436 watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5437 {
5438 	ereport(DEBUG1,
5439 			(errmsg("STATE MACHINE INVOKED WITH EVENT = %s Current State = %s",
5440 					wd_event_name[event], wd_state_names[get_local_node_state()])));
5441 
5442 	if (event == WD_EVENT_REMOTE_NODE_LOST)
5443 	{
5444 
5445 		if (wdNode->state == WD_SHUTDOWN)
5446 		{
5447 			ereport(LOG,
5448 					(errmsg("remote node \"%s\" is shutting down", wdNode->nodeName)));
5449 			if (pool_config->wd_remove_shutdown_nodes)
5450 				revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_SHUTDOWN);
5451 		}
5452 		else
5453 		{
5454 			wdNode->state = WD_LOST;
5455 			ereport(LOG,
5456 					(errmsg("remote node \"%s\" is lost", wdNode->nodeName)));
5457 			/* Inform the node, that it is lost for us */
5458 			 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_LOST);
5459 		}
5460 		if (wdNode == WD_LEADER_NODE)
5461 		{
5462 			ereport(LOG,
5463 					(errmsg("watchdog cluster has lost the coordinator node")));
5464 			set_cluster_leader_node(NULL);
5465 		}
5466 
5467 		/* close all socket connections to the node */
5468 		close_socket_connection(&wdNode->client_socket);
5469 		close_socket_connection(&wdNode->server_socket);
5470 
5471 		/* clear the wait timer on the node */
5472 		wdNode->last_sent_time.tv_sec = 0;
5473 		wdNode->last_sent_time.tv_usec = 0;
5474 		wdNode->sending_failures_count = 0;
5475 		node_lost_while_ipc_command(wdNode);
5476 	}
5477 	else if (event == WD_EVENT_REMOTE_NODE_FOUND)
5478 	{
5479 		ereport(LOG,
5480 				(errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
5481 				 errdetail("requesting the node info")));
5482 		/*
5483 		 * remove the lost state from the node
5484 		 * and change it to joining for now
5485 		 */
5486 		wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
5487 		wdNode->state = WD_LOADING;
5488 		send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
5489 		/* if this node was kicked out of quorum calculation. add it back */
5490 		restore_cluster_membership_of_node(wdNode);
5491 	}
5492 	else if (event == WD_EVENT_PACKET_RCV)
5493 	{
5494 		print_packet_node_info(pkt, wdNode, false);
5495 		/* update the last receive time */
5496 		gettimeofday(&wdNode->last_rcv_time, NULL);
5497 
5498 		if (pkt->type == WD_INFO_MESSAGE)
5499 		{
5500 			standard_packet_processor(wdNode, pkt);
5501 		}
5502 
5503 		if (pkt->type == WD_INFORM_I_AM_GOING_DOWN)
5504 		{
5505 			wdNode->state = WD_SHUTDOWN;
5506 			wdNode->node_lost_reason = NODE_LOST_SHUTDOWN;
5507 			return watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
5508 		}
5509 
5510 		if (watchdog_internal_command_packet_processor(wdNode, pkt) == true)
5511 		{
5512 			return 0;
5513 		}
5514 	}
5515 	else if (event == WD_EVENT_NEW_OUTBOUND_CONNECTION)
5516 	{
5517 		WDPacketData *addPkt = get_addnode_message();
5518 
5519 		send_message(wdNode, addPkt);
5520 		free_packet(addPkt);
5521 	}
5522 
5523 	else if (event == WD_EVENT_NW_IP_IS_REMOVED || event == WD_EVENT_NW_LINK_IS_INACTIVE)
5524 	{
5525 		List	   *local_addresses;
5526 
5527 		/* check if we have an active link */
5528 		if (any_interface_available() == false)
5529 		{
5530 			ereport(WARNING,
5531 					(errmsg("network event has occurred and all monitored interfaces are down"),
5532 					 errdetail("changing the state to in network trouble")));
5533 
5534 			set_state(WD_IN_NW_TROUBLE);
5535 
5536 		}
5537 		/* check if all IP addresses are lost */
5538 		local_addresses = get_all_local_ips();
5539 		if (local_addresses == NULL)
5540 		{
5541 			/*
5542 			 * We have lost all IP addresses we are in network trouble. Just
5543 			 * move to in network trouble state
5544 			 */
5545 			ereport(WARNING,
5546 					(errmsg("network IP is removed and system has no IP is assigned"),
5547 					 errdetail("changing the state to in network trouble")));
5548 
5549 			set_state(WD_IN_NW_TROUBLE);
5550 		}
5551 		else
5552 		{
5553 			ListCell   *lc;
5554 
5555 			ereport(DEBUG1,
5556 					(errmsg("network IP is removed but system still has a valid IP is assigned")));
5557 			foreach(lc, local_addresses)
5558 			{
5559 				char	   *ip = lfirst(lc);
5560 
5561 				ereport(DEBUG1,
5562 						(errmsg("IP = %s", ip ? ip : "NULL")));
5563 			}
5564 
5565 			list_free_deep(local_addresses);
5566 			local_addresses = NULL;
5567 		}
5568 	}
5569 
5570 	else if (event == WD_EVENT_LOCAL_NODE_LOST)
5571 	{
5572 		ereport(WARNING,
5573 				(errmsg("watchdog life-check reported, we are disconnected from the network"),
5574 				 errdetail("changing the state to LOST")));
5575 		set_state(WD_LOST);
5576 	}
5577 
5578 	if (wd_commands_packet_processor(event, wdNode, pkt) == true)
5579 		return 0;
5580 
5581 	switch (get_local_node_state())
5582 	{
5583 		case WD_LOADING:
5584 			watchdog_state_machine_loading(event, wdNode, pkt, clusterCommand);
5585 			break;
5586 		case WD_JOINING:
5587 			watchdog_state_machine_joining(event, wdNode, pkt, clusterCommand);
5588 			break;
5589 		case WD_INITIALIZING:
5590 			watchdog_state_machine_initializing(event, wdNode, pkt, clusterCommand);
5591 			break;
5592 		case WD_COORDINATOR:
5593 			watchdog_state_machine_coordinator(event, wdNode, pkt, clusterCommand);
5594 			break;
5595 		case WD_PARTICIPATE_IN_ELECTION:
5596 			watchdog_state_machine_voting(event, wdNode, pkt, clusterCommand);
5597 			break;
5598 		case WD_STAND_FOR_COORDINATOR:
5599 			watchdog_state_machine_standForCord(event, wdNode, pkt, clusterCommand);
5600 			break;
5601 		case WD_STANDBY:
5602 			watchdog_state_machine_standby(event, wdNode, pkt, clusterCommand);
5603 			break;
5604 		case WD_LOST:
5605 		case WD_IN_NW_TROUBLE:
5606 			watchdog_state_machine_nw_error(event, wdNode, pkt, clusterCommand);
5607 			break;
5608 		case WD_NETWORK_ISOLATION:
5609 			watchdog_state_machine_nw_isolation(event, wdNode, pkt, clusterCommand);
5610 			break;
5611 		default:
5612 			/* Should never ever happen */
5613 			ereport(WARNING,
5614 					(errmsg("invalid watchdog state")));
5615 			set_state(WD_LOADING);
5616 			break;
5617 	}
5618 
5619 	return 0;
5620 }
5621 
5622 /*
5623  * This is the state where the watchdog enters when starting up.
5624  * upon entering this state we sends ADD node message to all reachable
5625  * nodes.
5626  * Wait for 4 seconds if some node rejects us.
5627  */
5628 static int
watchdog_state_machine_loading(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5629 watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5630 {
5631 	switch (event)
5632 	{
5633 		case WD_EVENT_WD_STATE_CHANGED:
5634 			{
5635 				int			i;
5636 				WDPacketData *addPkt = get_addnode_message();
5637 
5638 				/* set the status to ADD_MESSAGE_SEND by hand */
5639 				for (i = 0; i < g_cluster.remoteNodeCount; i++)
5640 				{
5641 					WatchdogNode *wdTmpNode;
5642 
5643 					wdTmpNode = &(g_cluster.remoteNodes[i]);
5644 					if (wdTmpNode->client_socket.sock_state == WD_SOCK_CONNECTED && wdTmpNode->state == WD_DEAD)
5645 					{
5646 						if (send_message(wdTmpNode, addPkt))
5647 							wdTmpNode->state = WD_ADD_MESSAGE_SENT;
5648 					}
5649 				}
5650 				free_packet(addPkt);
5651 				set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5652 			}
5653 			break;
5654 
5655 		case WD_EVENT_TIMEOUT:
5656 			set_state(WD_JOINING);
5657 			break;
5658 
5659 		case WD_EVENT_PACKET_RCV:
5660 			{
5661 				switch (pkt->type)
5662 				{
5663 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
5664 						{
5665 							/*
5666 							 * We are loading but a node is already contesting
5667 							 * for coordinator node well we can ignore it but
5668 							 * then this could eventually mean a lower
5669 							 * priority node can became a coordinator node. So
5670 							 * check the priority of the node in stand for
5671 							 * coordinator state
5672 							 */
5673 							if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5674 							{
5675 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5676 								set_state(WD_STAND_FOR_COORDINATOR);
5677 							}
5678 							else
5679 							{
5680 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5681 								set_state(WD_PARTICIPATE_IN_ELECTION);
5682 							}
5683 						}
5684 						break;
5685 
5686 					case WD_INFO_MESSAGE:
5687 						{
5688 							int			i;
5689 							bool		all_replied = true;
5690 
5691 							for (i = 0; i < g_cluster.remoteNodeCount; i++)
5692 							{
5693 								wdNode = &(g_cluster.remoteNodes[i]);
5694 								if (wdNode->state == WD_ADD_MESSAGE_SENT)
5695 								{
5696 									all_replied = false;
5697 									break;
5698 								}
5699 							}
5700 							if (all_replied)
5701 							{
5702 								/*
5703 								 * we are already connected to all configured
5704 								 * nodes Just move to initializing state
5705 								 */
5706 								set_state(WD_INITIALIZING);
5707 							}
5708 						}
5709 						break;
5710 
5711 					case WD_REJECT_MESSAGE:
5712 						if (wdNode->state == WD_ADD_MESSAGE_SENT || wdNode->state == WD_DEAD)
5713 							ereport(FATAL,
5714 									(return_code(POOL_EXIT_FATAL),
5715 									 errmsg("Add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5716 									 errhint("check the watchdog configurations.")));
5717 						break;
5718 					default:
5719 						standard_packet_processor(wdNode, pkt);
5720 						break;
5721 				}
5722 			}
5723 			break;
5724 		default:
5725 			break;
5726 	}
5727 	return 0;
5728 }
5729 
5730 /*
5731  * This is the intermediate state before going to cluster initialization
5732  * here we update the information of all connected nodes and move to the
5733  * initialization state. moving to this state from loading does not make
5734  * much sence as at loading time we already have updated node informations
5735  */
5736 static int
watchdog_state_machine_joining(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5737 watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5738 {
5739 	switch (event)
5740 	{
5741 		case WD_EVENT_WD_STATE_CHANGED:
5742 			set_cluster_leader_node(NULL);
5743 			try_connecting_with_all_unreachable_nodes();
5744 			send_cluster_command(NULL, WD_REQ_INFO_MESSAGE, 4);
5745 			set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5746 			break;
5747 
5748 		case WD_EVENT_TIMEOUT:
5749 			set_state(WD_INITIALIZING);
5750 			break;
5751 
5752 		case WD_EVENT_COMMAND_FINISHED:
5753 			{
5754 				if (clusterCommand->commandPacket.type == WD_REQ_INFO_MESSAGE)
5755 					set_state(WD_INITIALIZING);
5756 			}
5757 			break;
5758 
5759 		case WD_EVENT_PACKET_RCV:
5760 			{
5761 				switch (pkt->type)
5762 				{
5763 					case WD_REJECT_MESSAGE:
5764 						if (wdNode->state == WD_ADD_MESSAGE_SENT)
5765 							ereport(FATAL,
5766 									(return_code(POOL_EXIT_FATAL),
5767 									 errmsg("add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5768 									 errhint("check the watchdog configurations.")));
5769 						break;
5770 
5771 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
5772 						{
5773 							/*
5774 							 * We are loading but a node is already contesting
5775 							 * for coordinator node well we can ignore it but
5776 							 * then this could eventually mean a lower
5777 							 * priority node can became a coordinator node. So
5778 							 * check the priority of the node in stand for
5779 							 * coordinator state
5780 							 */
5781 							if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5782 							{
5783 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5784 								set_state(WD_STAND_FOR_COORDINATOR);
5785 							}
5786 							else
5787 							{
5788 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5789 								set_state(WD_PARTICIPATE_IN_ELECTION);
5790 							}
5791 						}
5792 						break;
5793 
5794 					default:
5795 						standard_packet_processor(wdNode, pkt);
5796 						break;
5797 				}
5798 			}
5799 			break;
5800 
5801 		default:
5802 			break;
5803 	}
5804 
5805 	return 0;
5806 }
5807 
5808 /*
5809  * This state only works on the local data and does not
5810  * sends any cluster command.
5811  */
5812 
5813 static int
watchdog_state_machine_initializing(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5814 watchdog_state_machine_initializing(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5815 {
5816 	switch (event)
5817 	{
5818 		case WD_EVENT_WD_STATE_CHANGED:
5819 			/* set 1 sec timeout, save ourself from recursion */
5820 			set_timeout(1);
5821 			break;
5822 
5823 		case WD_EVENT_TIMEOUT:
5824 			{
5825 				/*
5826 				 * If leader node exists in cluster, Join it otherwise try
5827 				 * becoming a leader
5828 				 */
5829 				if (WD_LEADER_NODE)
5830 				{
5831 					/*
5832 					 * we found the coordinator node in network. Just join the
5833 					 * network
5834 					 */
5835 					set_state(WD_STANDBY);
5836 				}
5837 				else if (get_cluster_node_count() == 0)
5838 				{
5839 					ereport(LOG,
5840 							(errmsg("I am the only alive node in the watchdog cluster"),
5841 							 errhint("skipping stand for coordinator state")));
5842 
5843 					/*
5844 					 * I am the alone node in the cluster at the moment skip
5845 					 * the intermediate steps and jump to the coordinator
5846 					 * state
5847 					 */
5848 					set_state(WD_COORDINATOR);
5849 				}
5850 				else
5851 				{
5852 					int			i;
5853 
5854 					for (i = 0; i < g_cluster.remoteNodeCount; i++)
5855 					{
5856 						WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
5857 
5858 						if (wdNode->state == WD_STAND_FOR_COORDINATOR)
5859 						{
5860 							set_state(WD_PARTICIPATE_IN_ELECTION);
5861 							return 0;
5862 						}
5863 					}
5864 					/* stand for coordinator */
5865 					set_state(WD_STAND_FOR_COORDINATOR);
5866 				}
5867 			}
5868 			break;
5869 
5870 		case WD_EVENT_PACKET_RCV:
5871 			{
5872 				switch (pkt->type)
5873 				{
5874 					case WD_REJECT_MESSAGE:
5875 						if (wdNode->state == WD_ADD_MESSAGE_SENT)
5876 							ereport(FATAL,
5877 									(return_code(POOL_EXIT_FATAL),
5878 									 errmsg("Add to watchdog cluster request is rejected by node \"%s:%d\"", wdNode->hostname, wdNode->wd_port),
5879 									 errhint("check the watchdog configurations.")));
5880 						break;
5881 					default:
5882 						standard_packet_processor(wdNode, pkt);
5883 						break;
5884 				}
5885 			}
5886 
5887 			break;
5888 
5889 		default:
5890 			break;
5891 	}
5892 	return 0;
5893 }
5894 
5895 static int
watchdog_state_machine_standForCord(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)5896 watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
5897 {
5898 	switch (event)
5899 	{
5900 		case WD_EVENT_WD_STATE_CHANGED:
5901 			send_cluster_command(NULL, WD_STAND_FOR_COORDINATOR_MESSAGE, 4);
5902 			/* wait for 5 seconds if someone rejects us */
5903 			set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
5904 			break;
5905 
5906 		case WD_EVENT_COMMAND_FINISHED:
5907 			{
5908 				if (clusterCommand->commandPacket.type == WD_STAND_FOR_COORDINATOR_MESSAGE)
5909 				{
5910 					if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
5911 						clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
5912 					{
5913 						set_state(WD_COORDINATOR);
5914 					}
5915 					else
5916 					{
5917 						/* command finished with an error */
5918 						if (pkt)
5919 						{
5920 							if (pkt->type == WD_ERROR_MESSAGE)
5921 							{
5922 								ereport(LOG,
5923 										(errmsg("our stand for coordinator request is rejected by node \"%s\"",wdNode->nodeName),
5924 										 errdetail("we might be in partial network isolation and cluster already have a valid leader"),
5925 										 errhint("please verify the watchdog life-check and network is working properly")));
5926 								set_state(WD_NETWORK_ISOLATION);
5927 							}
5928 							else if (pkt->type == WD_REJECT_MESSAGE)
5929 							{
5930 								ereport(LOG,
5931 										(errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
5932 								set_state(WD_PARTICIPATE_IN_ELECTION);
5933 							}
5934 						}
5935 						else
5936 						{
5937 							ereport(LOG,
5938 									(errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
5939 							set_state(WD_JOINING);
5940 						}
5941 					}
5942 				}
5943 			}
5944 			break;
5945 
5946 		case WD_EVENT_TIMEOUT:
5947 			set_state(WD_COORDINATOR);
5948 			break;
5949 
5950 		case WD_EVENT_PACKET_RCV:
5951 			{
5952 				switch (pkt->type)
5953 				{
5954 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
5955 						/* decide on base of priority */
5956 						if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5957 						{
5958 							reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5959 						}
5960 						else if (g_cluster.localNode->wd_priority == wdNode->wd_priority)
5961 						{
5962 							/* decide on base of starting time */
5963 							if (g_cluster.localNode->startup_time.tv_sec <= wdNode->startup_time.tv_sec)	/* I am older */
5964 							{
5965 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5966 							}
5967 							else
5968 							{
5969 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5970 								set_state(WD_PARTICIPATE_IN_ELECTION);
5971 							}
5972 						}
5973 						else
5974 						{
5975 							reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5976 							set_state(WD_PARTICIPATE_IN_ELECTION);
5977 						}
5978 						break;
5979 
5980 					case WD_DECLARE_COORDINATOR_MESSAGE:
5981 						{
5982 							/*
5983 							 * meanwhile someone has declared itself
5984 							 * coordinator
5985 							 */
5986 							if (g_cluster.localNode->wd_priority > wdNode->wd_priority)
5987 							{
5988 								ereport(LOG,
5989 										(errmsg("rejecting the declare coordinator request from node \"%s\"", wdNode->nodeName),
5990 										 errdetail("my wd_priority [%d] is higher than the requesting node's priority [%d]", g_cluster.localNode->wd_priority, wdNode->wd_priority)));
5991 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
5992 							}
5993 							else
5994 							{
5995 								ereport(LOG,
5996 										(errmsg("node \"%s\" has declared itself as a coordinator", wdNode->nodeName)));
5997 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
5998 								set_state(WD_JOINING);
5999 							}
6000 						}
6001 						break;
6002 					default:
6003 						standard_packet_processor(wdNode, pkt);
6004 						break;
6005 				}
6006 			}
6007 			break;
6008 
6009 		default:
6010 			break;
6011 	}
6012 	return 0;
6013 }
6014 
6015 /*
6016  * Event handler for the coordinator/leader state.
6017  * The function handels all the event received when the local
6018  * node is the leader/coordinator node.
6019  */
6020 static int
watchdog_state_machine_coordinator(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6021 watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6022 {
6023 	switch (event)
6024 	{
6025 		case WD_EVENT_WD_STATE_CHANGED:
6026 			{
6027 				int			i;
6028 
6029 				send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
6030 				set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
6031 				update_missed_beacon_count(NULL,true);
6032 				ereport(LOG,
6033 						(errmsg("I am announcing my self as leader/coordinator watchdog node")));
6034 
6035 				for (i = 0; i < g_cluster.remoteNodeCount; i++)
6036 				{
6037 					WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
6038 
6039 					ereport(DEBUG2,
6040 							(errmsg("printing all remote node information")));
6041 					print_watchdog_node_info(wdNode);
6042 				}
6043 				/* Also reset my priority as per the original configuration */
6044 				g_cluster.localNode->wd_priority = pool_config->wd_priority;
6045 			}
6046 			break;
6047 
6048 		case WD_EVENT_COMMAND_FINISHED:
6049 			{
6050 				if (clusterCommand->commandPacket.type == WD_DECLARE_COORDINATOR_MESSAGE)
6051 				{
6052 					if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
6053 						clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
6054 					{
6055 						update_cluster_memberships();
6056 						update_quorum_status();
6057 						reset_lost_timers();
6058 						ereport(DEBUG1,
6059 								(errmsg("declare coordinator command finished with status:[%s]",
6060 										clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ?
6061 										"ALL NODES REPLIED" :
6062 										"COMMAND TIMED OUT"),
6063 								 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6064 										   clusterCommand->commandSendToCount,
6065 										   clusterCommand->commandReplyFromCount
6066 										   )));
6067 
6068 						ereport(LOG,
6069 								(errmsg("I am the cluster leader node"),
6070 								 errdetail("our declare coordinator message is accepted by all nodes")));
6071 
6072 						set_cluster_leader_node(g_cluster.localNode);
6073 						register_watchdog_state_change_interrupt();
6074 
6075 						/*
6076 						 * Check if the quorum is present then start the
6077 						 * escalation process otherwise keep in the
6078 						 * coordinator state and wait for the quorum
6079 						 */
6080 						if (g_cluster.quorum_status == -1)
6081 						{
6082 							ereport(LOG,
6083 									(errmsg("I am the cluster leader node but we do not have enough nodes in cluster"),
6084 									 errdetail("waiting for the quorum to start escalation process")));
6085 						}
6086 						else
6087 						{
6088 							ereport(LOG,
6089 									(errmsg("I am the cluster leader node. Starting escalation process")));
6090 							start_escalated_node();
6091 						}
6092 					}
6093 					else
6094 					{
6095 						/* command is finished but because of error */
6096 						ereport(NOTICE,
6097 								(errmsg("possible split brain scenario detected by \"%s\" node", wdNode->nodeName),
6098 								 (errdetail("re-initializing cluster"))));
6099 						set_state(WD_JOINING);
6100 					}
6101 				}
6102 
6103 				else if (clusterCommand->commandPacket.type == WD_IAM_COORDINATOR_MESSAGE)
6104 				{
6105 					update_missed_beacon_count(clusterCommand,false);
6106 
6107 					if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
6108 					{
6109 						ereport(DEBUG1,
6110 								(errmsg("I am the cluster leader node command finished with status:[ALL NODES REPLIED]"),
6111 								 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6112 										   clusterCommand->commandSendToCount,
6113 										   clusterCommand->commandReplyFromCount
6114 										   )));
6115 					}
6116 					else if (clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
6117 					{
6118 						ereport(DEBUG1,
6119 								(errmsg("I am the cluster leader node command finished with status:[COMMAND TIMED OUT] which is success"),
6120 								 errdetail("The command was sent to %d nodes and %d nodes replied to it",
6121 										   clusterCommand->commandSendToCount,
6122 										   clusterCommand->commandReplyFromCount
6123 										   )));
6124 					}
6125 					else if (clusterCommand->commandStatus == COMMAND_FINISHED_NODE_REJECTED)
6126 					{
6127 						/*
6128 						 * one of the node rejected out I am coordinator
6129 						 * message
6130 						 */
6131 						ereport(LOG,
6132 								(errmsg("possible split brain, \"%s\" node has rejected our coordinator beacon", wdNode->nodeName),
6133 								 (errdetail("removing the node from out standby list"))));
6134 
6135 						standby_node_left_cluster(wdNode);
6136 					}
6137 				}
6138 			}
6139 			break;
6140 
6141 		case WD_EVENT_CLUSTER_QUORUM_CHANGED:
6142 			{
6143 				/* make sure we are accepted as leader */
6144 				if (WD_LEADER_NODE == g_cluster.localNode)
6145 				{
6146 					if (g_cluster.quorum_status == -1)
6147 					{
6148 						ereport(LOG,
6149 								(errmsg("We have lost the quorum")));
6150 
6151 						/*
6152 						 * We have lost the quorum, stay as a leader node but
6153 						 * perform de-escalation. As keeping the VIP may
6154 						 * result in split-brain
6155 						 */
6156 						resign_from_escalated_node();
6157 					}
6158 					else if (g_cluster.quorum_status >= 0)
6159 					{
6160 						if (g_cluster.localNode->escalated == false)
6161 						{
6162 							ereport(LOG,
6163 									(errmsg("quorum found"),
6164 									 errdetail("starting escalation process")));
6165 							start_escalated_node();
6166 						}
6167 					}
6168 					/* inform to the cluster about the new quorum status */
6169 					send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);
6170 					register_watchdog_quorum_change_interrupt();
6171 				}
6172 			}
6173 			break;
6174 
6175 		case WD_EVENT_NW_IP_IS_REMOVED:
6176 			{
6177 				/* check if we were holding the virtual IP and it is now lost */
6178 				List	   *local_addresses = get_all_local_ips();
6179 
6180 				if (local_addresses == NULL)
6181 				{
6182 					/*
6183 					 * We have lost all IP addresses we are in network
6184 					 * trouble. Just move to in network trouble state
6185 					 */
6186 					set_state(WD_IN_NW_TROUBLE);
6187 				}
6188 				else
6189 				{
6190 					/*
6191 					 * We do have some IP addresses assigned so its not a
6192 					 * total black-out check if we still have the VIP assigned
6193 					 */
6194 					if (g_cluster.clusterLeaderInfo.holding_vip == true)
6195 					{
6196 						ListCell   *lc;
6197 						bool		vip_exists = false;
6198 
6199 						foreach(lc, local_addresses)
6200 						{
6201 							char	   *ip = lfirst(lc);
6202 
6203 							if (!strcmp(ip, g_cluster.localNode->delegate_ip))
6204 							{
6205 								vip_exists = true;
6206 								break;
6207 							}
6208 						}
6209 						if (vip_exists == false)
6210 						{
6211 							/*
6212 							 * Okay this is the case when only our VIP is lost
6213 							 * but network interface seems to be working fine
6214 							 * try to re-acquire the VIP
6215 							 */
6216 							wd_IP_up();
6217 						}
6218 					}
6219 					list_free_deep(local_addresses);
6220 					local_addresses = NULL;
6221 				}
6222 			}
6223 			break;
6224 
6225 		case WD_EVENT_NW_IP_IS_ASSIGNED:
6226 			break;
6227 
6228 		case WD_EVENT_TIMEOUT:
6229 			{
6230 				if (check_debug_request_do_not_send_beacon() == false)
6231 					send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
6232 				set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6233 			}
6234 			break;
6235 
6236 		case WD_EVENT_I_AM_APPEARING_LOST:
6237 		{
6238 			/* The remote node has lost us, It would have already marked
6239 			 * us as lost, So remove it from standby*/
6240 			standby_node_left_cluster(wdNode);
6241 		}
6242 			break;
6243 
6244 		case WD_EVENT_I_AM_APPEARING_FOUND:
6245 		{
6246 			/* The remote node has found us again */
6247 			if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
6248 			{
6249 				/*
6250 				 * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
6251 				 * which makes the standby nodes to re-send the join leader node
6252 				 */
6253 				ereport(DEBUG1,
6254 					(errmsg("asking remote node \"%s\" to rejoin leader", wdNode->nodeName),
6255 						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6256 
6257 				send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
6258 			}
6259 			else
6260 			{
6261 				/*
6262 				 * The node is on older version
6263 				 * So ask it to re-join the cluster
6264 				 */
6265 				ereport(DEBUG1,
6266 					(errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
6267 						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6268 				send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6269 			}
6270 		}
6271 			break;
6272 
6273 		case WD_EVENT_REMOTE_NODE_LOST:
6274 			{
6275 				standby_node_left_cluster(wdNode);
6276 			}
6277 			break;
6278 
6279 		case WD_EVENT_REMOTE_NODE_FOUND:
6280 		{
6281 			ereport(LOG,
6282 				(errmsg("remote node \"%s\" is reachable again", wdNode->nodeName),
6283 					errdetail("trying to add it back as a standby")));
6284 			wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
6285 			/* If I am the cluster leader. Ask for the node info and to re-send the join message */
6286 			send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
6287 			if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
6288 			{
6289 				/*
6290 				 * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
6291 				 * which makes the standby nodes to re-send the join leader node
6292 				 */
6293 				ereport(DEBUG1,
6294 					(errmsg("asking remote node \"%s\" to rejoin leader", wdNode->nodeName),
6295 						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6296 
6297 				send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
6298 			}
6299 			else
6300 			{
6301 				/*
6302 				 * The node is on older version
6303 				 * So ask it to re-join the cluster
6304 				 */
6305 				ereport(DEBUG1,
6306 					(errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
6307 						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
6308 				send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6309 			}
6310 			break;
6311 		}
6312 
6313 		case WD_EVENT_PACKET_RCV:
6314 			{
6315 				switch (pkt->type)
6316 				{
6317 					case WD_ADD_NODE_MESSAGE:
6318 						/* In case we received the ADD node message from
6319 						 * one of our standby, Remove that standby from
6320 						 * the list
6321 						 */
6322 						standby_node_left_cluster(wdNode);
6323 						standard_packet_processor(wdNode, pkt);
6324 						break;
6325 
6326 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
6327 						reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6328 						break;
6329 					case WD_DECLARE_COORDINATOR_MESSAGE:
6330 						ereport(NOTICE,
6331 								(errmsg("We are coordinator and another node tried a coup")));
6332 						reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
6333 						break;
6334 
6335 					case WD_IAM_COORDINATOR_MESSAGE:
6336 						{
6337 							ereport(NOTICE,
6338 									(errmsg("We are in split brain, I AM COORDINATOR MESSAGE received from \"%s\" node", wdNode->nodeName)));
6339 
6340 							if (beacon_message_received_from_node(wdNode, pkt) == true)
6341 							{
6342 								handle_split_brain(wdNode, pkt);
6343 							}
6344 							else
6345 							{
6346 								/*
6347 								 * we are not able to decide which should be
6348 								 * the best candidate to stay as
6349 								 * leader/coordinator node This could also
6350 								 * happen if the remote node is using the
6351 								 * older version of Pgpool-II which send the
6352 								 * empty beacon messages.
6353 								 */
6354 								ereport(LOG,
6355 										(errmsg("We are in split brain, and not able to decide the best candidate for leader/coordinator"),
6356 										 errdetail("re-initializing the local watchdog cluster state")));
6357 
6358 								send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
6359 								set_state(WD_JOINING);
6360 							}
6361 						}
6362 						break;
6363 
6364 					case WD_JOIN_COORDINATOR_MESSAGE:
6365 						{
6366 							/*
6367 							 * If the node is marked as lost because of
6368 							 * life-check, Do not let it join the cluster
6369 							 */
6370 							if (wdNode->state == WD_LOST && wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
6371 							{
6372 								ereport(LOG,
6373 										(errmsg("lost remote node \"%s\" is requesting to join the cluster",wdNode->nodeName),
6374 										 errdetail("rejecting the request until life-check inform us that it is reachable again")));
6375 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6376 							}
6377 							else
6378 							{
6379 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6380 								/* Also get the configurations from the standby node */
6381 								send_message_of_type(wdNode,WD_ASK_FOR_POOL_CONFIG,NULL);
6382 								standby_node_join_cluster(wdNode);
6383 							}
6384 						}
6385 						break;
6386 
6387 					default:
6388 						standard_packet_processor(wdNode, pkt);
6389 						break;
6390 				}
6391 			}
6392 			break;
6393 
6394 		default:
6395 			break;
6396 	}
6397 	return 0;
6398 }
6399 
6400 /*
6401  * We can get into this state if we detect the total
6402  * network blackout, Here we just keep waiting for the
6403  * network to come back, and when it does we re-initialize
6404  * the cluster state.
6405  *
6406  * Note:
6407  *
6408  * All this is very good to detect the network black out or cable unplugged
6409  * scenarios, and moving to the WD_IN_NW_TROUBLE state. Although this state machine
6410  * function can gracefully handle the network black out situation and recovers the
6411  * watchdog node when the network becomes reachable, but there is a problem.
6412  *
6413  * Once the cable on the system is unplugged or when the node gets isolated from the
6414  * cluster there is every likelihood that the backend health-check of the isolated node
6415  * start reporting the backend node failure and the pgpool-II proceeds to perform
6416  * the failover for all attached backend nodes, because pgpool-II is not yet
6417  * smart enough to figure out that the failures are caused by the network failure
6418  * of its own system and that the backend nodes are actually working properly.
6419  *
6420  * So now when the network gets back the backend status of the node will be different
6421  * and incorrect from the other pgpool-II nodes in the cluster. So the ideal solution
6422  * for the situation is to make the pgpool-II main process aware of the network black out
6423  * and when the network recovers the pgpool-II asks the watchdog to sync again the state of
6424  * all configured backend nodes from the leader pgpool-II node. But to implement this lot
6425  * of time is required, So until that time we are just opting for the easiest solution here
6426  * which is to commit a suicide as soon an the network becomes unreachable
6427  */
6428 static int
watchdog_state_machine_nw_error(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6429 watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6430 {
6431 	switch (event)
6432 	{
6433 		case WD_EVENT_WD_STATE_CHANGED:
6434 			/* commit suicide, see above note */
6435 			ereport(FATAL,
6436 					(return_code(POOL_EXIT_FATAL),
6437 					 errmsg("system has lost the network")));
6438 
6439 			set_timeout(2);
6440 			break;
6441 
6442 		case WD_EVENT_PACKET_RCV:
6443 
6444 			/*
6445 			 * Okay this is funny because according to us we are in network
6446 			 * black out but yet we are able to receive the packet. Just check
6447 			 * may be network is back and we are unable to detect it
6448 			 */
6449 			/* fall through */
6450 		case WD_EVENT_TIMEOUT:
6451 		case WD_EVENT_NW_IP_IS_ASSIGNED:
6452 			{
6453 				List	   *local_addresses = get_all_local_ips();
6454 
6455 				if (local_addresses == NULL)
6456 				{
6457 					/*
6458 					 * How come this is possible ?? but if somehow this
6459 					 * happens keep in the state and ignore the packet
6460 					 */
6461 				}
6462 				else
6463 				{
6464 					/*
6465 					 * Seems like the network is back just go on initialize
6466 					 * the cluster
6467 					 */
6468 					/*
6469 					 * we might have broken sockets when the network gets
6470 					 * back. Send the request info message to all nodes to
6471 					 * confirm socket state
6472 					 */
6473 					WDPacketData *pkt = get_minimum_message(WD_IAM_IN_NW_TROUBLE_MESSAGE, NULL);
6474 
6475 					send_message(NULL, pkt);
6476 					try_connecting_with_all_unreachable_nodes();
6477 					pfree(pkt);
6478 					list_free_deep(local_addresses);
6479 					local_addresses = NULL;
6480 					set_state(WD_LOADING);
6481 				}
6482 			}
6483 			break;
6484 
6485 		default:
6486 			break;
6487 	}
6488 	return 0;
6489 }
6490 
6491 /*
6492  * we could end up in tis state if we were connected to the
6493  * leader node as standby and got lost on the leader.
6494  * Here we just wait for BEACON_MESSAGE_INTERVAL_SECONDS
6495  * and retry to join the cluster.
6496  */
6497 static int
watchdog_state_machine_nw_isolation(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6498 watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6499 {
6500 	switch (event)
6501 	{
6502 		case WD_EVENT_WD_STATE_CHANGED:
6503 			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
6504 			break;
6505 
6506 		case WD_EVENT_PACKET_RCV:
6507 			standard_packet_processor(wdNode, pkt);
6508 			break;
6509 
6510 		case WD_EVENT_REMOTE_NODE_FOUND:
6511 		case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
6512 		case WD_EVENT_I_AM_APPEARING_FOUND:
6513 		case WD_EVENT_TIMEOUT:
6514 			/* fall through */
6515 		case WD_EVENT_NW_IP_IS_ASSIGNED:
6516 			ereport(LOG,
6517 				(errmsg("trying again to join the cluster")));
6518 			set_state(WD_JOINING);
6519 			break;
6520 
6521 		default:
6522 			break;
6523 	}
6524 	return 0;
6525 }
6526 
6527 static bool
beacon_message_received_from_node(WatchdogNode * wdNode,WDPacketData * pkt)6528 beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt)
6529 {
6530 	long		seconds_since_node_startup;
6531 	long		seconds_since_current_state;
6532 	int			quorum_status;
6533 	int			standby_nodes_count;
6534 	bool		escalated;
6535 	int			state;
6536 	struct timeval current_time;
6537 
6538 	gettimeofday(&current_time, NULL);
6539 
6540 	if (pkt->data == NULL || pkt->len <= 0)
6541 		return false;
6542 
6543 	if (parse_beacon_message_json(pkt->data, pkt->len,
6544 								  &state,
6545 								  &seconds_since_node_startup,
6546 								  &seconds_since_current_state,
6547 								  &quorum_status,
6548 								  &standby_nodes_count,
6549 								  &escalated) == false)
6550 	{
6551 		return false;
6552 	}
6553 
6554 	wdNode->current_state_time.tv_sec = current_time.tv_sec - seconds_since_current_state;
6555 	wdNode->startup_time.tv_sec = current_time.tv_sec - seconds_since_node_startup;
6556 	wdNode->current_state_time.tv_usec = wdNode->startup_time.tv_usec = 0;
6557 	wdNode->quorum_status = quorum_status;
6558 	wdNode->standby_nodes_count = standby_nodes_count;
6559 	wdNode->state = state;
6560 	wdNode->escalated = escalated;
6561 	return true;
6562 }
6563 
6564 /*
6565  * This function decides the best contender for a coordinator/leader node
6566  * when the remote node info states it is a coordinator while
6567  * the local node is also in the leader/coordinator state.
6568  *
6569  * return:
6570  * -1 : remote node is the best candidate to remain as leader
6571  *  0 : both local and remote nodes are not worthy leader or error
6572  *  1 : local node should remain as the leader/coordinator
6573  */
6574 static int
I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode)6575 I_am_leader_and_cluster_in_split_brain(WatchdogNode * otherLeaderNode)
6576 {
6577 	if (get_local_node_state() != WD_COORDINATOR)
6578 		return 0;
6579 	if (otherLeaderNode->state != WD_COORDINATOR)
6580 		return 0;
6581 
6582 	if (otherLeaderNode->current_state_time.tv_sec == 0)
6583 	{
6584 		ereport(LOG,
6585 				(errmsg("not enough data to decide the leader node"),
6586 				 errdetail("the watchdog node:\"%s\" is using the older version of Pgpool-II", otherLeaderNode->nodeName)));
6587 		return 0;
6588 	}
6589 
6590 	/* Decide which node should stay as leader */
6591 	if (otherLeaderNode->escalated != g_cluster.localNode->escalated)
6592 	{
6593 		if (otherLeaderNode->escalated == true && g_cluster.localNode->escalated == false)
6594 		{
6595 			/* remote node stays as the leader */
6596 			ereport(LOG,
6597 					(errmsg("remote node:\"%s\" is best suitable to stay as leader because it is escalated and I am not",
6598 							otherLeaderNode->nodeName)));
6599 			return -1;
6600 		}
6601 		else
6602 		{
6603 			/* local node stays as leader */
6604 			ereport(LOG,
6605 					(errmsg("remote node:\"%s\" should step down from leader because it is not escalated",
6606 							otherLeaderNode->nodeName)));
6607 			return 1;
6608 		}
6609 	}
6610 	else if (otherLeaderNode->quorum_status != g_cluster.quorum_status)
6611 	{
6612 		if (otherLeaderNode->quorum_status > g_cluster.quorum_status)
6613 		{
6614 			/* quorum of remote node is in better state */
6615 			ereport(LOG,
6616 					(errmsg("remote node:\"%s\" is best suitable to stay as leader because it holds the quorum"
6617 							,otherLeaderNode->nodeName)));
6618 
6619 			return -1;
6620 		}
6621 		else
6622 		{
6623 			/* local node stays as leader */
6624 			ereport(LOG,
6625 					(errmsg("remote node:\"%s\" should step down from leader because it does not hold the quorum"
6626 							,otherLeaderNode->nodeName)));
6627 			return 1;
6628 		}
6629 	}
6630 	else if (otherLeaderNode->standby_nodes_count != g_cluster.clusterLeaderInfo.standby_nodes_count)
6631 	{
6632 		if (otherLeaderNode->standby_nodes_count > g_cluster.clusterLeaderInfo.standby_nodes_count)
6633 		{
6634 			/* remote node has more alive nodes */
6635 			ereport(LOG,
6636 					(errmsg("remote node:\"%s\" is best suitable to stay as leader because it has more connected standby nodes"
6637 							,otherLeaderNode->nodeName)));
6638 			return -1;
6639 		}
6640 		else
6641 		{
6642 			/* local node stays as leader */
6643 			ereport(LOG,
6644 					(errmsg("remote node:\"%s\" should step down from leader because we have more connected standby nodes"
6645 							,otherLeaderNode->nodeName)));
6646 			return 1;
6647 		}
6648 	}
6649 	else						/* decide on which node is the older master */
6650 	{
6651 		if (otherLeaderNode->current_state_time.tv_sec < g_cluster.localNode->current_state_time.tv_sec)
6652 		{
6653 			/* remote node has more alive nodes */
6654 			ereport(LOG,
6655 					(errmsg("remote node:\"%s\" is best suitable to stay as leader because it is the older leader"
6656 							,otherLeaderNode->nodeName)));
6657 
6658 			return -1;
6659 		}
6660 		else
6661 		{
6662 			/* local node should keep the leader status */
6663 			ereport(LOG,
6664 					(errmsg("remote node:\"%s\" should step down from leader because we are the older leader"
6665 							,otherLeaderNode->nodeName)));
6666 
6667 			return 1;
6668 		}
6669 	}
6670 	return 0;					/* keep the compiler quite */
6671 }
6672 
6673 static void
handle_split_brain(WatchdogNode * otherLeaderNode,WDPacketData * pkt)6674 handle_split_brain(WatchdogNode * otherLeaderNode, WDPacketData * pkt)
6675 {
6676 	int			decide_leader = I_am_leader_and_cluster_in_split_brain(otherLeaderNode);
6677 
6678 	if (decide_leader == 0)
6679 	{
6680 		/*
6681 		 * we are not able to decide which should be the best candidate to
6682 		 * stay as leader/coordinator node This could also happen if the
6683 		 * remote node is using the older version of Pgpool-II which send the
6684 		 * empty beacon messages.
6685 		 */
6686 		ereport(LOG,
6687 				(errmsg("We are in split brain, and not able to decide the best candidate for leader/coordinator"),
6688 				 errdetail("re-initializing the local watchdog cluster state")));
6689 		send_cluster_service_message(otherLeaderNode, pkt, CLUSTER_NEEDS_ELECTION);
6690 		set_state(WD_JOINING);
6691 	}
6692 	else if (decide_leader == -1)
6693 	{
6694 		/* Remote node is the best candidate for the leader node */
6695 		ereport(LOG,
6696 				(errmsg("We are in split brain, and \"%s\" node is the best candidate for leader/coordinator"
6697 						,otherLeaderNode->nodeName),
6698 				 errdetail("re-initializing the local watchdog cluster state")));
6699 		/* broadcast the message about I am not the true leader node */
6700 		send_cluster_service_message(NULL, pkt, CLUSTER_IAM_NOT_TRUE_LEADER);
6701 		set_state(WD_JOINING);
6702 	}
6703 	else
6704 	{
6705 		/* I am the best candidate for the leader node */
6706 		ereport(LOG,
6707 				(errmsg("We are in split brain, and I am the best candidate for leader/coordinator"),
6708 				 errdetail("asking the remote node \"%s\" to step down", otherLeaderNode->nodeName)));
6709 		send_cluster_service_message(otherLeaderNode, pkt, CLUSTER_IAM_TRUE_LEADER);
6710 	}
6711 
6712 }
6713 
6714 static void
start_escalated_node(void)6715 start_escalated_node(void)
6716 {
6717 	int			wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
6718 
6719 	if (g_cluster.localNode->escalated == true) /* already escalated */
6720 		return;
6721 
6722 	while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0)
6723 	{
6724 		/*
6725 		 * de_escalation process was already running and we are escalating
6726 		 * again. give some time to de-escalation process to exit normally
6727 		 */
6728 		ereport(LOG,
6729 				(errmsg("waiting for de-escalation process to exit before starting escalation")));
6730 		if (sigchld_request)
6731 			wd_child_signal_handler();
6732 		sleep(1);
6733 	}
6734 	if (g_cluster.de_escalation_pid > 0)
6735 		ereport(LOG,
6736 				(errmsg("de-escalation process does not exited in time."),
6737 				 errdetail("starting the escalation anyway")));
6738 
6739 	g_cluster.escalation_pid = fork_escalation_process();
6740 	if (g_cluster.escalation_pid > 0)
6741 	{
6742 		g_cluster.localNode->escalated = true;
6743 		set_watchdog_node_escalated();
6744 		ereport(LOG,
6745 				(errmsg("escalation process started with PID:%d", g_cluster.escalation_pid)));
6746 		if (strlen(g_cluster.localNode->delegate_ip) > 0)
6747 			g_cluster.clusterLeaderInfo.holding_vip = true;
6748 	}
6749 	else
6750 	{
6751 		ereport(LOG,
6752 				(errmsg("failed to start escalation process")));
6753 	}
6754 }
6755 
6756 static void
resign_from_escalated_node(void)6757 resign_from_escalated_node(void)
6758 {
6759 	int			wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
6760 
6761 	if (g_cluster.localNode->escalated == false)
6762 		return;
6763 
6764 	while (g_cluster.escalation_pid > 0 && wait_secs-- > 0)
6765 	{
6766 		/*
6767 		 * escalation process was already running and we are resigning from
6768 		 * it. wait for the escalation process to exit normally
6769 		 */
6770 		ereport(LOG,
6771 				(errmsg("waiting for escalation process to exit before starting de-escalation")));
6772 		if (sigchld_request)
6773 			wd_child_signal_handler();
6774 		sleep(1);
6775 	}
6776 	if (g_cluster.escalation_pid > 0)
6777 		ereport(LOG,
6778 				(errmsg("escalation process does not exited in time"),
6779 				 errdetail("starting the de-escalation anyway")));
6780 	g_cluster.de_escalation_pid = fork_plunging_process();
6781 	g_cluster.clusterLeaderInfo.holding_vip = false;
6782 	g_cluster.localNode->escalated = false;
6783 	reset_watchdog_node_escalated();
6784 }
6785 
6786 /*
6787  * state machine function for state participate in elections
6788  */
6789 static int
watchdog_state_machine_voting(WD_EVENTS event,WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * clusterCommand)6790 watchdog_state_machine_voting(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
6791 {
6792 	switch (event)
6793 	{
6794 		case WD_EVENT_WD_STATE_CHANGED:
6795 			set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
6796 			break;
6797 
6798 		case WD_EVENT_TIMEOUT:
6799 			set_state(WD_JOINING);
6800 			break;
6801 
6802 		case WD_EVENT_PACKET_RCV:
6803 			{
6804 				if (pkt == NULL)
6805 				{
6806 					ereport(LOG,
6807 							(errmsg("packet is NULL")));
6808 					break;
6809 				}
6810 				switch (pkt->type)
6811 				{
6812 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
6813 						{
6814 							/* Check the node priority */
6815 							if (wdNode->wd_priority >= g_cluster.localNode->wd_priority)
6816 							{
6817 								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6818 							}
6819 							else
6820 							{
6821 								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6822 								set_state(WD_STAND_FOR_COORDINATOR);
6823 							}
6824 						}
6825 						break;
6826 					case WD_IAM_COORDINATOR_MESSAGE:
6827 						set_state(WD_JOINING);
6828 						break;
6829 					case WD_DECLARE_COORDINATOR_MESSAGE:
6830 						/* Check the node priority */
6831 						if (wdNode->wd_priority >= g_cluster.localNode->wd_priority)
6832 						{
6833 							reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
6834 							set_state(WD_INITIALIZING);
6835 						}
6836 						else
6837 						{
6838 							reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
6839 							set_state(WD_STAND_FOR_COORDINATOR);
6840 						}
6841 						break;
6842 					default:
6843 						standard_packet_processor(wdNode, pkt);
6844 						break;
6845 				}
6846 			}
6847 			break;
6848 
6849 		default:
6850 			break;
6851 	}
6852 	return 0;
6853 }
6854 
/*
 * State machine function for the standby state, i.e. the local node is
 * (or is trying to become) a standby of the current leader node.
 *
 * On entering the state a join-coordinator command is sent to the leader.
 * While in the state the function reacts to the result of that command, to
 * lost/found notifications and to packets from other nodes.  After the
 * event has been handled, and whenever a leader node is known, the time
 * since the last data received from the leader is checked: after two beacon
 * intervals the leader is asked for its info, after three the node goes
 * back to joining.
 *
 * wdNode, pkt and clusterCommand are only dereferenced for the events that
 * use them.
 *
 * NOTE(review): several branches dereference WD_LEADER_NODE without a NULL
 * check although other branches of this function explicitly test it against
 * NULL -- confirm the leader is always set when those branches run.
 *
 * Always returns 0.
 */
static int
watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
{
	switch (event)
	{
		case WD_EVENT_WD_STATE_CHANGED:
			/* ask the leader to accept us (5 is presumably a timeout in seconds -- confirm) */
			send_cluster_command(WD_LEADER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
			/* Also reset my priority as per the original configuration */
			g_cluster.localNode->wd_priority = pool_config->wd_priority;
			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
			break;

		case WD_EVENT_TIMEOUT:
			/* re-arm the timer so the leader liveness check below keeps running */
			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
			break;

		case WD_EVENT_WD_STATE_REQUIRE_RELOAD:

			/* NOTE(review): WD_LEADER_NODE is dereferenced here without a NULL check */
			ereport(LOG,
					(errmsg("re-sending join coordinator message to leader node: \"%s\"", WD_LEADER_NODE->nodeName)));

			send_cluster_command(WD_LEADER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
			break;

		case WD_EVENT_COMMAND_FINISHED:
		{
			/* only the outcome of our own join request is of interest */
			if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
			{
				/*
				 * Note that a timed-out command is handled the same way as
				 * one that every node replied to: the join is considered
				 * accepted.
				 */
				if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
					clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
				{
					register_watchdog_state_change_interrupt();

					ereport(LOG,
							(errmsg("successfully joined the watchdog cluster as standby node"),
							 errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_LEADER_NODE->nodeName)));
					/* broadcast our new state change to the cluster */
					send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);

				}
				else
				{
					ereport(NOTICE,
							(errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
							 errhint("rejoining the cluster.")));

					/*
					 * If the leader has marked us as lost, wait in the
					 * network-isolation state before retrying; otherwise
					 * rejoin right away.
					 */
					if (WD_LEADER_NODE->has_lost_us)
					{
						ereport(LOG,
								(errmsg("leader node \"%s\" thinks we are lost, and \"%s\" is not letting us join",WD_LEADER_NODE->nodeName,wdNode->nodeName),
								 errhint("please verify the watchdog life-check and network is working properly")));
						set_state(WD_NETWORK_ISOLATION);
					}
					else
					{
						set_state(WD_JOINING);
					}
				}
			}
		}
			break;

		case WD_EVENT_I_AM_APPEARING_LOST:
		{
			/*
			 * The remote node has lost us, and if it was our coordinator
			 * we might already be removed from its standby list.
			 * So re-join the cluster.
			 */
			if (WD_LEADER_NODE == wdNode)
			{
				ereport(LOG,
						(errmsg("we are lost on the leader node \"%s\"",wdNode->nodeName)));
				set_state(WD_JOINING);
			}
		}
			break;

		case WD_EVENT_I_AM_APPEARING_FOUND:
			{
				/* the remote node sees us again: refresh its view of us */
				ereport(DEBUG1,
					(errmsg("updating remote node \"%s\" with node info message", wdNode->nodeName)));

				send_message_of_type(wdNode, WD_INFO_MESSAGE, NULL);
			}
			break;

		case WD_EVENT_REMOTE_NODE_LOST:
			{
				/*
				 * We have lost one remote connected node; check whether the
				 * node was the coordinator.  The test relies on the leader
				 * pointer being NULL at this point (presumably cleared by
				 * the caller when the lost node was the leader -- confirm).
				 */
				if (WD_LEADER_NODE == NULL)
				{
					ereport(LOG,
							(errmsg("We have lost the cluster leader node \"%s\"", wdNode->nodeName)));
					set_state(WD_JOINING);
				}
			}
			break;

		case WD_EVENT_PACKET_RCV:
			{
				/* NOTE(review): pkt is not checked for NULL before use here */
				switch (pkt->type)
				{
					case WD_ADD_NODE_MESSAGE:
					{
						/*
						 * In case we received the ADD node message from
						 * our coordinator, reset the cluster state.  The
						 * packet is still passed to the standard processor
						 * afterwards in either case.
						 */
						if (wdNode == WD_LEADER_NODE)
						{
							ereport(LOG,
									(errmsg("received ADD NODE message from the leader node \"%s\"", wdNode->nodeName),
									 errdetail("re-joining the cluster")));
							set_state(WD_JOINING);
						}
						standard_packet_processor(wdNode, pkt);
					}
						break;

					case WD_FAILOVER_END:
						{
							/* a failover has ended: request a backend state sync */
							register_backend_state_sync_req_interrupt();
						}
						break;

					case WD_STAND_FOR_COORDINATOR_MESSAGE:
						{
							if (WD_LEADER_NODE == NULL)
							{
								/* no leader known: take part in the election */
								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
								set_state(WD_PARTICIPATE_IN_ELECTION);
							}
							else
							{
								ereport(LOG,
										(errmsg("We are connected to leader node \"%s\" and another node \"%s\" is trying to become a leader",WD_LEADER_NODE->nodeName, wdNode->nodeName)));
								reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
								/* Ask leader to re-send its node info */
								send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);
							}
						}
						break;

					case WD_DECLARE_COORDINATOR_MESSAGE:
						{
							/*
							 * NOTE(review): if WD_LEADER_NODE is NULL this
							 * condition is true and the log message below
							 * dereferences it.
							 */
							if (wdNode != WD_LEADER_NODE)
							{
								/*
								 * we already have a leader node and we got a
								 * new node trying to be leader
								 */
								ereport(LOG,
										(errmsg("We are connected to leader node \"%s\" and another node \"%s\" is trying to declare itself as a leader",WD_LEADER_NODE->nodeName, wdNode->nodeName)));
								reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
								/* Ask leader to re-send its node info */
								send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);

							}
						}
						break;

					case WD_IAM_COORDINATOR_MESSAGE:
						{
							/*
							 * If the beacon comes from our coordinator,
							 * reply with our info and record the beacon
							 * data (unless the debug request to not reply
							 * to beacons is set); if it comes from any
							 * other node, report a split-brain.
							 *
							 * NOTE(review): same NULL-leader concern as in
							 * the WD_DECLARE_COORDINATOR_MESSAGE case.
							 */
							if (wdNode != WD_LEADER_NODE)
							{
								ereport(LOG,
										(errmsg("\"%s\" is our coordinator node, but \"%s\" is also announcing as a coordinator",
												WD_LEADER_NODE->nodeName, wdNode->nodeName),
										 errdetail("broadcasting the cluster in split-brain message")));

								send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
							}
							else if (check_debug_request_do_not_reply_beacon() == false)
							{
								send_message_of_type(wdNode, WD_INFO_MESSAGE, pkt);
								beacon_message_received_from_node(wdNode, pkt);
							}
						}
						break;

					default:
						standard_packet_processor(wdNode, pkt);
						break;
				}
			}
			break;

		default:
			break;
	}

	/*
	 * before returning from the function make sure that we are connected with
	 * the leader node
	 */
	if (WD_LEADER_NODE)
	{
		struct timeval currTime;

		gettimeofday(&currTime, NULL);
		/* seconds elapsed since the leader's last_rcv_time */
		int			last_rcv_sec = WD_TIME_DIFF_SEC(currTime, WD_LEADER_NODE->last_rcv_time);

		if (last_rcv_sec >= (3 * BEACON_MESSAGE_INTERVAL_SECONDS))
		{
			/* we have missed at least two beacons from leader node */
			ereport(WARNING,
					(errmsg("we have not received a beacon message from leader node \"%s\" and it has not replied to our info request",
							WD_LEADER_NODE->nodeName),
					 errdetail("re-initializing the cluster")));
			set_state(WD_JOINING);
		}
		else if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
		{
			/*
			 * We have not received the last beacon from the leader; ask the
			 * leader node for its node info
			 */
			ereport(WARNING,
					(errmsg("we have not received a beacon message from leader node \"%s\"",
							WD_LEADER_NODE->nodeName),
					 errdetail("requesting info message from leader node")));
			send_message_of_type(WD_LEADER_NODE, WD_REQ_INFO_MESSAGE, NULL);
		}
	}
	return 0;
}
7088 
7089 
7090 /*
7091  * The function identifies the current quorum state
7092  * quorum values:
7093  * -1:
7094  *     quorum is lost or does not exists
7095  * 0:
7096  *     The quorum is on the edge. (when participating cluster is configured
7097  *     with even number of nodes, and we have exactly 50% nodes
7098  * 1:
7099  *     quorum exists
7100  */
7101 static void
update_quorum_status(void)7102 update_quorum_status(void)
7103 {
7104 	int			quorum_status = g_cluster.quorum_status;
7105 
7106 	if (g_cluster.clusterLeaderInfo.standby_nodes_count > get_minimum_remote_nodes_required_for_quorum())
7107 	{
7108 		g_cluster.quorum_status = 1;
7109 	}
7110 	else if (g_cluster.clusterLeaderInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
7111 	{
7112 		if (g_cluster.memberRemoteNodeCount % 2 != 0)
7113 		{
7114 			if (pool_config->enable_consensus_with_half_votes)
7115 				g_cluster.quorum_status = 0;	/* on the edge */
7116 			else
7117 				g_cluster.quorum_status = -1;
7118 		}
7119 		else
7120 			g_cluster.quorum_status = 1;
7121 	}
7122 	else
7123 	{
7124 		g_cluster.quorum_status = -1;
7125 	}
7126 	g_cluster.localNode->quorum_status = g_cluster.quorum_status;
7127 	if (g_cluster.quorum_status != quorum_status)
7128 	{
7129 		watchdog_state_machine(WD_EVENT_CLUSTER_QUORUM_CHANGED, NULL, NULL, NULL);
7130 	}
7131 }
7132 
7133 /*
7134  * returns the minimum number of remote nodes required for quorum
7135  */
7136 static int
get_minimum_remote_nodes_required_for_quorum(void)7137 get_minimum_remote_nodes_required_for_quorum(void)
7138 {
7139 	/*
7140 	 * Even number of remote nodes, That means total number of nodes are odd,
7141 	 * so minimum quorum is just remote/2.
7142 	 */
7143 	if (g_cluster.memberRemoteNodeCount % 2 == 0)
7144 			return (g_cluster.memberRemoteNodeCount / 2);
7145 
7146 	/*
7147 	 * Total nodes including self are even, So we return 50% nodes as quorum
7148 	 * requirements
7149 	 */
7150 	return ((g_cluster.memberRemoteNodeCount - 1) / 2);
7151 }
7152 
7153 /*
7154  * returns the minimum number of votes required for consensus
7155  */
7156 static int
get_minimum_votes_to_resolve_consensus(void)7157 get_minimum_votes_to_resolve_consensus(void)
7158 {
7159 	/*
7160 	 * Since get_minimum_remote_nodes_required_for_quorum() returns
7161 	 * the number of remote nodes required to complete the quorum
7162 	 * that is always one less than the total number of nodes required
7163 	 * for the cluster to build quorum or consensus, reason being
7164 	 * in get_minimum_remote_nodes_required_for_quorum()
7165 	 * we always consider the local node as a valid pre-casted vote.
7166 	 * But when it comes to count the number of votes required to build
7167 	 * consensus for any type of decision, for example for building the
7168 	 * consensus on backend failover, the local node can vote on either
7169 	 * side. So it's vote is not explicitly counted and for the consensus
7170 	 * we actually need one more vote than the total number of remote nodes
7171 	 * required for the quorum
7172 	 *
7173 	 * For example
7174 	 * If Total nodes in cluster = 4
7175 	 * 		remote node will be = 3
7176 	 * 		get_minimum_remote_nodes_required_for_quorum() return = 1
7177 	 *		Minimum number of votes required for consensus will be
7178 	 *
7179 	 *		if(pool_config->enable_consensus_with_half_votes = true)
7180 	 *			(exact 50% n/2) ==> 4/2 = 2
7181 	 *
7182 	 *		if(pool_config->enable_consensus_with_half_votes = false)
7183 	 *			(exact 50% +1 ==> (n/2)+1) ==> (4/2)+1 = 3
7184 	 *
7185 	 */
7186 
7187 	int required_node_count = get_minimum_remote_nodes_required_for_quorum()  + 1;
7188 	/*
7189 	 * When the total number of nodes in the watchdog cluster including the
7190 	 * local node are even, The number of votes required for the consensus
7191 	 * depends on the enable_consensus_with_half_votes.
7192 	 * So for even number of nodes when enable_consensus_with_half_votes is
7193 	 * not allowed than we would add one more vote than exact 50%
7194 	 */
7195 	if (g_cluster.memberRemoteNodeCount % 2 != 0)
7196 	{
7197 		if (pool_config->enable_consensus_with_half_votes == false)
7198 			required_node_count += 1;
7199 	}
7200 
7201 	return required_node_count;
7202 }
7203 
7204 /*
7205  * sets the state of local watchdog node, and fires a state change event
7206  * if the new and old state differs
7207  */
7208 static int
set_state(WD_STATES newState)7209 set_state(WD_STATES newState)
7210 {
7211 	WD_STATES	oldState = get_local_node_state();
7212 
7213 	g_cluster.localNode->state = newState;
7214 	if (oldState != newState)
7215 	{
7216 		gettimeofday(&g_cluster.localNode->current_state_time, NULL);
7217 
7218 		/*
7219 		 * if we changing from the coordinator state, do the de-escalation if
7220 		 * required
7221 		 */
7222 		if (oldState == WD_COORDINATOR)
7223 		{
7224 			resign_from_escalated_node();
7225 			clear_standby_nodes_list();
7226 			clear_all_failovers();
7227 		}
7228 
7229 		ereport(LOG,
7230 				(errmsg("watchdog node state changed from [%s] to [%s]", wd_state_names[oldState], wd_state_names[newState])));
7231 		watchdog_state_machine(WD_EVENT_WD_STATE_CHANGED, NULL, NULL, NULL);
7232 		/* send out the info message to all nodes */
7233 		send_message_of_type(NULL, WD_INFO_MESSAGE, NULL);
7234 	}
7235 	return 0;
7236 }
7237 
7238 
7239 static void
allocate_resultNodes_in_command(WDCommandData * ipcCommand)7240 allocate_resultNodes_in_command(WDCommandData * ipcCommand)
7241 {
7242 	MemoryContext oldCxt;
7243 	int			i;
7244 
7245 	if (ipcCommand->nodeResults != NULL)
7246 		return;
7247 
7248 	oldCxt = MemoryContextSwitchTo(ipcCommand->memoryContext);
7249 	ipcCommand->nodeResults = palloc0((sizeof(WDCommandNodeResult) * g_cluster.remoteNodeCount));
7250 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
7251 	{
7252 		ipcCommand->nodeResults[i].wdNode = &g_cluster.remoteNodes[i];
7253 	}
7254 	MemoryContextSwitchTo(oldCxt);
7255 }
7256 
7257 
7258 static void
process_remote_online_recovery_command(WatchdogNode * wdNode,WDPacketData * pkt)7259 process_remote_online_recovery_command(WatchdogNode * wdNode, WDPacketData * pkt)
7260 {
7261 	char	   *func_name;
7262 	int			node_count = 0;
7263 	int		   *node_id_list = NULL;
7264 	unsigned char flags;
7265 
7266 	if (pkt->data == NULL || pkt->len == 0)
7267 	{
7268 		ereport(LOG,
7269 				(errmsg("watchdog is unable to process pgpool online recovery command"),
7270 				 errdetail("command packet contains no data")));
7271 		reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7272 		return;
7273 	}
7274 
7275 	ereport(LOG,
7276 			(errmsg("watchdog received online recovery request from \"%s\"", wdNode->nodeName)));
7277 
7278 	if (parse_wd_node_function_json(pkt->data, pkt->len, &func_name, &node_id_list, &node_count, &flags))
7279 	{
7280 		if (strcasecmp(WD_FUNCTION_START_RECOVERY, func_name) == 0)
7281 		{
7282 			if (*InRecovery != RECOVERY_INIT)
7283 			{
7284 				reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
7285 			}
7286 			else
7287 			{
7288 				*InRecovery = RECOVERY_ONLINE;
7289 				if (Req_info->conn_counter == 0)
7290 				{
7291 					reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7292 				}
7293 				else if (pool_config->recovery_timeout <= 0)
7294 				{
7295 					if (ensure_conn_counter_validity() == 0)
7296 						reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7297 					else
7298 						reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
7299 				}
7300 				else
7301 				{
7302 					WDFunctionCommandData *wd_func_command;
7303 					MemoryContext oldCxt = MemoryContextSwitchTo(TopMemoryContext);
7304 
7305 					wd_func_command = palloc(sizeof(WDFunctionCommandData));
7306 					wd_func_command->commandType = pkt->type;
7307 					wd_func_command->commandID = pkt->command_id;
7308 					wd_func_command->funcName = MemoryContextStrdup(TopMemoryContext, func_name);
7309 					wd_func_command->wdNode = wdNode;
7310 
7311 					/* Add this command for timer tick */
7312 					add_wd_command_for_timer_events(pool_config->recovery_timeout, true, wd_func_command);
7313 
7314 					MemoryContextSwitchTo(oldCxt);
7315 				}
7316 			}
7317 		}
7318 		else if (strcasecmp(WD_FUNCTION_END_RECOVERY, func_name) == 0)
7319 		{
7320 			*InRecovery = RECOVERY_INIT;
7321 			reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
7322 			kill(getppid(), SIGUSR2);
7323 		}
7324 		else
7325 		{
7326 			ereport(LOG,
7327 					(errmsg("watchdog failed to process online recovery request"),
7328 					 errdetail("invalid command [%s] in online recovery request from \"%s\"", func_name, wdNode->nodeName)));
7329 			reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7330 		}
7331 	}
7332 	else
7333 	{
7334 		ereport(LOG,
7335 				(errmsg("watchdog failed to process online recovery request"),
7336 				 errdetail("invalid data in online recovery request from \"%s\"", wdNode->nodeName)));
7337 		reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
7338 	}
7339 
7340 	if (func_name)
7341 		pfree(func_name);
7342 	if (node_id_list)
7343 		pfree(node_id_list);
7344 }
7345 
7346 
7347 static bool
reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode,WDPacketData * pkt,WDCommandData * ipcCommand)7348 reply_is_received_for_pgpool_replicate_command(WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * ipcCommand)
7349 {
7350 	int			i;
7351 	WDCommandNodeResult *nodeResult = NULL;
7352 
7353 	/* get the result node for */
7354 	ereport(DEBUG1,
7355 			(errmsg("watchdog node \"%s\" has replied for pgpool-II replicate command packet", wdNode->nodeName)));
7356 
7357 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
7358 	{
7359 		nodeResult = &ipcCommand->nodeResults[i];
7360 		if (nodeResult->wdNode == wdNode)
7361 			break;
7362 		nodeResult = NULL;
7363 	}
7364 	if (nodeResult == NULL)
7365 	{
7366 		ereport(WARNING,
7367 				(errmsg("unable to find result node for pgpool-II replicate command packet received from watchdog node \"%s\"", wdNode->nodeName)));
7368 		return true;
7369 	}
7370 
7371 	nodeResult->result_type = pkt->type;
7372 	nodeResult->cmdState = COMMAND_STATE_REPLIED;
7373 	ipcCommand->commandReplyFromCount++;
7374 	ereport(DEBUG2,
7375 			(errmsg("watchdog node \"%s\" has replied for pgpool-II replicate command packet", wdNode->nodeName),
7376 			 errdetail("command was sent to %d nodes and %d nodes have replied to it", ipcCommand->commandSendToCount, ipcCommand->commandReplyFromCount)));
7377 
7378 	if (pkt->type != WD_ACCEPT_MESSAGE)
7379 	{
7380 		/* reject message from any node finishes the command */
7381 		ipcCommand->commandStatus = COMMAND_FINISHED_NODE_REJECTED;
7382 		wd_command_is_complete(ipcCommand);
7383 		cleanUpIPCCommand(ipcCommand);
7384 	}
7385 	else if (ipcCommand->commandReplyFromCount >= ipcCommand->commandSendToCount)
7386 	{
7387 		/*
7388 		 * we have received results from all nodes analyze the result
7389 		 */
7390 		ipcCommand->commandStatus = COMMAND_FINISHED_ALL_REPLIED;
7391 		wd_command_is_complete(ipcCommand);
7392 		cleanUpIPCCommand(ipcCommand);
7393 	}
7394 
7395 	/* do not process this packet further */
7396 	return true;
7397 }
7398 
7399 /*
7400  * return true if want to cancel timer,
7401  */
7402 static bool
process_wd_command_timer_event(bool timer_expired,WDFunctionCommandData * wd_func_command)7403 process_wd_command_timer_event(bool timer_expired, WDFunctionCommandData * wd_func_command)
7404 {
7405 	if (wd_func_command->commandType == WD_IPC_ONLINE_RECOVERY_COMMAND)
7406 	{
7407 		if (wd_func_command->funcName && strcasecmp("START_RECOVERY", wd_func_command->funcName) == 0)
7408 		{
7409 			if (Req_info->conn_counter == 0)
7410 			{
7411 				WDPacketData emptyPkt;
7412 
7413 				emptyPkt.command_id = wd_func_command->commandID;
7414 				reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt);
7415 				return true;
7416 			}
7417 			else if (timer_expired)
7418 			{
7419 				WDPacketData emptyPkt;
7420 
7421 				emptyPkt.command_id = wd_func_command->commandID;
7422 
7423 				if (ensure_conn_counter_validity() == 0)
7424 					reply_with_minimal_message(wd_func_command->wdNode, WD_ACCEPT_MESSAGE, &emptyPkt);
7425 				else
7426 					reply_with_minimal_message(wd_func_command->wdNode, WD_REJECT_MESSAGE, &emptyPkt);
7427 				return true;
7428 			}
7429 			return false;
7430 		}
7431 	}
7432 	/* Just remove the timer. */
7433 	return true;
7434 }
7435 
7436 static void
process_wd_func_commands_for_timer_events(void)7437 process_wd_func_commands_for_timer_events(void)
7438 {
7439 	struct timeval currTime;
7440 	ListCell   *lc;
7441 	List	   *timers_to_del = NIL;
7442 
7443 	if (g_cluster.wd_timer_commands == NULL)
7444 		return;
7445 
7446 	gettimeofday(&currTime, NULL);
7447 
7448 	/*
7449 	 * Take care online recovery
7450 	 */
7451 	foreach(lc, g_cluster.wd_timer_commands)
7452 	{
7453 		WDCommandTimerData *timerData = lfirst(lc);
7454 
7455 		if (timerData)
7456 		{
7457 			bool		del = false;
7458 
7459 			if (WD_TIME_DIFF_SEC(currTime, timerData->startTime) >= timerData->expire_sec)
7460 			{
7461 				del = process_wd_command_timer_event(true, timerData->wd_func_command);
7462 
7463 			}
7464 			else if (timerData->need_tics)
7465 			{
7466 				del = process_wd_command_timer_event(false, timerData->wd_func_command);
7467 			}
7468 			if (del)
7469 				timers_to_del = lappend(timers_to_del, timerData);
7470 		}
7471 	}
7472 	foreach(lc, timers_to_del)
7473 	{
7474 		g_cluster.wd_timer_commands = list_delete_ptr(g_cluster.wd_timer_commands, lfirst(lc));
7475 	}
7476 
7477 	list_free(timers_to_del);
7478 }
7479 
7480 static void
add_wd_command_for_timer_events(unsigned int expire_secs,bool need_tics,WDFunctionCommandData * wd_func_command)7481 add_wd_command_for_timer_events(unsigned int expire_secs, bool need_tics, WDFunctionCommandData * wd_func_command)
7482 {
7483 	/* create a new Timer struct */
7484 	MemoryContext oldCtx = MemoryContextSwitchTo(TopMemoryContext);
7485 	WDCommandTimerData *timerData = palloc(sizeof(WDCommandTimerData));
7486 
7487 	gettimeofday(&timerData->startTime, NULL);
7488 	timerData->expire_sec = expire_secs;
7489 	timerData->need_tics = need_tics;
7490 	timerData->wd_func_command = wd_func_command;
7491 
7492 	g_cluster.wd_timer_commands = lappend(g_cluster.wd_timer_commands, timerData);
7493 
7494 	MemoryContextSwitchTo(oldCtx);
7495 
7496 }
7497 
/*
 * Helpers that compare one configuration member received from a remote
 * watchdog node with the same member of the local pool_config and emit a
 * WARNING when the two differ.
 *
 *   config_obj - pointer to the configuration received from the remote node
 *   wdNode     - pointer to the remote node (its nodeName is logged)
 *   parameter  - name of the struct member to compare; it is also
 *                stringified for the log message
 *
 * The _INT variant prints both values with %d, the _BOOL variant prints
 * them as ON/OFF.  The pointer arguments are parenthesized so that any
 * pointer-valued expression can be passed safely.
 */
#define WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config_obj, wdNode, parameter) \
do { \
	if ((config_obj)->parameter != pool_config->parameter) \
	{ \
		ereport(WARNING, \
			(errmsg("configurations value for \"%s\" on node \"%s\" is different", #parameter, (wdNode)->nodeName), \
				errdetail("\"%s\" on this node is %d while on \"%s\" is %d", \
				   #parameter, \
				   pool_config->parameter, \
				   (wdNode)->nodeName, \
				   (config_obj)->parameter))); \
	} \
} while(0)
#define WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config_obj,wdNode, parameter) \
do { \
	if ((config_obj)->parameter != pool_config->parameter) \
	{ \
		ereport(WARNING, \
			(errmsg("configurations value for \"%s\" on node \"%s\" is different", #parameter, (wdNode)->nodeName), \
				errdetail("\"%s\" on this node is %s while on \"%s\" is %s", \
					#parameter, \
					pool_config->parameter?"ON":"OFF", \
					(wdNode)->nodeName, \
					(config_obj)->parameter?"ON":"OFF"))); \
	} \
} while(0)
7524 
7525 static void
verify_pool_configurations(WatchdogNode * wdNode,POOL_CONFIG * config)7526 verify_pool_configurations(WatchdogNode * wdNode, POOL_CONFIG * config)
7527 {
7528 	int			i;
7529 
7530 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, num_init_children);
7531 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, listen_backlog_multiplier);
7532 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, child_life_time);
7533 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, connection_life_time);
7534 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, child_max_connections);
7535 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, client_idle_limit);
7536 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, max_pool);
7537 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_timeout);
7538 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_period);
7539 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_max_retries);
7540 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, health_check_retry_delay);
7541 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, recovery_timeout);
7542 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, search_primary_node_timeout);
7543 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_INT(config, wdNode, client_idle_limit_in_recovery);
7544 
7545 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replication_mode);
7546 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, enable_pool_hba);
7547 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, load_balance_mode);
7548 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replication_stop_on_mismatch);
7549 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, allow_clear_text_frontend_auth);
7550 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_if_affected_tuples_mismatch);
7551 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_on_backend_error);
7552 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, replicate_select);
7553 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, connection_cache);
7554 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, insert_lock);
7555 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, memory_cache_enabled);
7556 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, clear_memqcache_on_escalation);
7557 
7558 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_when_quorum_exists);
7559 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, failover_require_consensus);
7560 	WD_VERIFY_RECEIVED_CONFIG_PARAMETER_VAL_BOOL(config, wdNode, allow_multiple_failover_requests_from_node);
7561 
7562 	if (config->backend_desc->num_backends != pool_config->backend_desc->num_backends)
7563 	{
7564 		ereport(WARNING,
7565 				(errmsg("number of configured backends on node \"%s\" are different", wdNode->nodeName),
7566 				 errdetail("this node has %d backends while on \"%s\" number of configured backends are %d",
7567 						   pool_config->backend_desc->num_backends,
7568 						   wdNode->nodeName,
7569 						   config->backend_desc->num_backends)));
7570 	}
7571 	for (i = 0; i < pool_config->backend_desc->num_backends; i++)
7572 	{
7573 		if (strncasecmp(pool_config->backend_desc->backend_info[i].backend_hostname, config->backend_desc->backend_info[i].backend_hostname, sizeof(pool_config->backend_desc->backend_info[i].backend_hostname)))
7574 		{
7575 			ereport(WARNING,
7576 					(errmsg("configurations value for backend[%d] \"hostname\" on node \"%s\" is different", i, wdNode->nodeName),
7577 					 errdetail("\"backend_hostname%d\" on this node is %s while on \"%s\" is %s",
7578 							   i,
7579 							   pool_config->backend_desc->backend_info[i].backend_hostname,
7580 							   wdNode->nodeName,
7581 							   config->backend_desc->backend_info[i].backend_hostname)));
7582 		}
7583 		if (config->backend_desc->backend_info[i].backend_port != pool_config->backend_desc->backend_info[i].backend_port)
7584 		{
7585 			ereport(WARNING,
7586 					(errmsg("configurations value for backend[%d] \"port\" on node \"%s\" is different", i, wdNode->nodeName),
7587 					 errdetail("\"backend_port%d\" on this node is %d while on \"%s\" is %d",
7588 							   i,
7589 							   pool_config->backend_desc->backend_info[i].backend_port,
7590 							   wdNode->nodeName,
7591 							   config->backend_desc->backend_info[i].backend_port)));
7592 		}
7593 	}
7594 
7595 	if (config->wd_nodes.num_wd != pool_config->wd_nodes.num_wd)
7596 	{
7597 		ereport(WARNING,
7598 				(errmsg("the number of configured watchdog nodes on node \"%s\" are different", wdNode->nodeName),
7599 				 errdetail("this node has %d watchdog nodes while \"%s\" is configured with %d watchdog nodes",
7600 						   pool_config->wd_nodes.num_wd,
7601 						   wdNode->nodeName,
7602 						   config->wd_nodes.num_wd)));
7603 	}
7604 }
7605 
7606 static bool
get_authhash_for_node(WatchdogNode * wdNode,char * authhash)7607 get_authhash_for_node(WatchdogNode * wdNode, char *authhash)
7608 {
7609 	if (strlen(pool_config->wd_authkey))
7610 	{
7611 		char		nodeStr[WD_MAX_PACKET_STRING + 1];
7612 		int			len = snprintf(nodeStr, WD_MAX_PACKET_STRING, "state=%d wd_port=%d",
7613 								   wdNode->state, wdNode->wd_port);
7614 
7615 
7616 		/* calculate hash from packet */
7617 		wd_calc_hash(nodeStr, len, authhash);
7618 		if (authhash[0] == '\0')
7619 			ereport(WARNING,
7620 					(errmsg("failed to calculate wd_authkey hash from a send packet")));
7621 		return true;
7622 	}
7623 	return false;
7624 }
7625 
7626 static bool
verify_authhash_for_node(WatchdogNode * wdNode,char * authhash)7627 verify_authhash_for_node(WatchdogNode * wdNode, char *authhash)
7628 {
7629 	if (strlen(pool_config->wd_authkey))
7630 	{
7631 		char		calculated_authhash[WD_AUTH_HASH_LEN + 1];
7632 
7633 		char		nodeStr[WD_MAX_PACKET_STRING];
7634 		int			len = snprintf(nodeStr, WD_MAX_PACKET_STRING, "state=%d wd_port=%d",
7635 								   wdNode->state, wdNode->wd_port);
7636 
7637 
7638 		/* calculate hash from packet */
7639 		wd_calc_hash(nodeStr, len, calculated_authhash);
7640 		if (calculated_authhash[0] == '\0')
7641 			ereport(WARNING,
7642 					(errmsg("failed to calculate wd_authkey hash from a receive packet")));
7643 		return (strcmp(calculated_authhash, authhash) == 0);
7644 	}
7645 	/* authkey is not enabled. */
7646 	return true;
7647 }
7648 
7649 /*
7650  * function authenticates the IPC command by looking for the
7651  * auth key in the JSON data of IPC command.
7652  * For IPC commands coming from outer world the function validates the
7653  * authkey in JSON packet with configured pool_config->wd_authkey.
7654  * if internal_client_only is true then the JSON data must contain the
7655  * shared key present in the pgpool-II shared memory. This can be used
7656  * to restrict certain watchdog IPC functions for outside of pgpool-II
7657  */
7658 static bool
check_IPC_client_authentication(json_value * rootObj,bool internal_client_only)7659 check_IPC_client_authentication(json_value * rootObj, bool internal_client_only)
7660 {
7661 	char	   *packet_auth_key;
7662 	unsigned int packet_key;
7663 	bool		has_shared_key;
7664 	unsigned int *shared_key = get_ipc_shared_key();
7665 
7666 	if (json_get_int_value_for_key(rootObj, WD_IPC_SHARED_KEY, (int *) &packet_key))
7667 	{
7668 		ereport(DEBUG2,
7669 				(errmsg("IPC JSON data packet does not contain shared key")));
7670 		has_shared_key = false;
7671 	}
7672 	else
7673 	{
7674 		has_shared_key = true;
7675 	}
7676 
7677 	if (internal_client_only)
7678 	{
7679 
7680 		if (shared_key == NULL)
7681 		{
7682 			ereport(LOG,
7683 					(errmsg("shared key not initialized")));
7684 			return false;
7685 		}
7686 
7687 		if (has_shared_key == false)
7688 		{
7689 			ereport(LOG,
7690 					(errmsg("invalid JSON data packet"),
7691 					 errdetail("authentication shared key not found in JSON data")));
7692 			return false;
7693 		}
7694 		/* compare if shared keys match */
7695 		if (*shared_key != packet_key)
7696 			return false;
7697 
7698 		/* providing a valid shared key for internal clients is enough */
7699 		return true;
7700 	}
7701 
7702 	/* If no authentication is required, no need to look further */
7703 	if (g_cluster.ipc_auth_needed == false)
7704 		return true;
7705 
7706 	/* if shared key is provided and it matched, we are good */
7707 	if (has_shared_key == true && *shared_key == packet_key)
7708 		return true;
7709 
7710 	/* shared key is out of question validate the authKey values */
7711 	packet_auth_key = json_get_string_value_for_key(rootObj, WD_IPC_AUTH_KEY);
7712 
7713 	if (packet_auth_key == NULL)
7714 	{
7715 		ereport(DEBUG1,
7716 				(errmsg("invalid JSON data packet"),
7717 				 errdetail("authentication key not found in JSON data")));
7718 		return false;
7719 	}
7720 
7721 	/* compare the packet key with configured auth key */
7722 	if (strcmp(pool_config->wd_authkey, packet_auth_key) != 0)
7723 		return false;
7724 	return true;
7725 }
7726 
7727 /*
7728  * function to check authentication of IPC command based on the command type
7729  * this one also informs the calling client about the failure
7730  */
7731 
7732 static bool
check_and_report_IPC_authentication(WDCommandData * ipcCommand)7733 check_and_report_IPC_authentication(WDCommandData * ipcCommand)
7734 {
7735 	json_value *root = NULL;
7736 	bool		internal_client_only = false;
7737 	bool		ret;
7738 
7739 	if (ipcCommand == NULL)
7740 		return false;			/* should never happen */
7741 
7742 	/* first identify the command type */
7743 	switch (ipcCommand->sourcePacket.type)
7744 	{
7745 		case WD_NODE_STATUS_CHANGE_COMMAND:
7746 		case WD_REGISTER_FOR_NOTIFICATION:
7747 		case WD_GET_NODES_LIST_COMMAND:
7748 		case WD_GET_RUNTIME_VARIABLE_VALUE:
7749 			internal_client_only = false;
7750 			break;
7751 
7752 		case WD_IPC_FAILOVER_COMMAND:
7753 		case WD_IPC_ONLINE_RECOVERY_COMMAND:
7754 		case WD_EXECUTE_CLUSTER_COMMAND:
7755 		case WD_GET_LEADER_DATA_REQUEST:
7756 			/* only allowed internally. */
7757 			internal_client_only = true;
7758 			break;
7759 
7760 		default:
7761 			/* unknown command, ignore it */
7762 			return true;
7763 			break;
7764 	}
7765 
7766 	if (internal_client_only == false && g_cluster.ipc_auth_needed == false)
7767 	{
7768 		/* no need to look further */
7769 		return true;
7770 	}
7771 
7772 	if (ipcCommand->sourcePacket.len <= 0 || ipcCommand->sourcePacket.data == NULL)
7773 	{
7774 		ereport(LOG,
7775 				(errmsg("authentication failed"),
7776 				 errdetail("IPC command contains no data")));
7777 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7778 													   "authentication failed: invalid data");
7779 
7780 		return false;
7781 	}
7782 
7783 	root = json_parse(ipcCommand->sourcePacket.data, ipcCommand->sourcePacket.len);
7784 	/* The root node must be object */
7785 	if (root == NULL || root->type != json_object)
7786 	{
7787 		json_value_free(root);
7788 		ereport(LOG,
7789 				(errmsg("authentication failed"),
7790 				 errdetail("IPC command contains an invalid data")));
7791 
7792 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7793 													   "authentication failed: invalid data");
7794 
7795 		return false;
7796 	}
7797 
7798 	ret = check_IPC_client_authentication(root, internal_client_only);
7799 	json_value_free(root);
7800 
7801 	if (ret == false)
7802 	{
7803 		ereport(WARNING,
7804 				(errmsg("authentication failed"),
7805 				 errdetail("invalid IPC key")));
7806 		ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
7807 													   "authentication failed: invalid KEY");
7808 	}
7809 	return ret;
7810 }
7811 
7812 static void
print_watchdog_node_info(WatchdogNode * wdNode)7813 print_watchdog_node_info(WatchdogNode * wdNode)
7814 {
7815 	ereport(DEBUG2,
7816 			(errmsg("state: \"%s\" Host: \"%s\" Name: \"%s\" WD Port:%d PP Port: %d priority:%d",
7817 					wd_state_names[wdNode->state],
7818 					wdNode->hostname
7819 					,wdNode->nodeName
7820 					,wdNode->wd_port
7821 					,wdNode->pgpool_port
7822 					,wdNode->wd_priority)));
7823 }
7824 
7825 static void
print_packet_node_info(WDPacketData * pkt,WatchdogNode * wdNode,bool sending)7826 print_packet_node_info(WDPacketData * pkt, WatchdogNode * wdNode, bool sending)
7827 {
7828 	int			i;
7829 	packet_types *pkt_type = NULL;
7830 
7831 	/*
7832 	 * save the cpu cycles if our log level would swallow this message
7833 	 */
7834 	if (pool_config->log_min_messages > DEBUG1)
7835 		return;
7836 
7837 	for (i = 0;; i++)
7838 	{
7839 		if (all_packet_types[i].type == WD_NO_MESSAGE)
7840 			break;
7841 
7842 		if (all_packet_types[i].type == pkt->type)
7843 		{
7844 			pkt_type = &all_packet_types[i];
7845 			break;
7846 		}
7847 	}
7848 
7849 	ereport(DEBUG1,
7850 			(errmsg("%s packet, watchdog node:[%s] command id:[%d] type:[%s] state:[%s]",
7851 					sending ? "sending" : "received",
7852 					wdNode->nodeName,
7853 					pkt->command_id,
7854 					pkt_type ? pkt_type->name : "UNKNOWN",
7855 					wd_state_names[get_local_node_state()])));
7856 }
7857 
7858 static void
print_packet_info(WDPacketData * pkt,bool sending)7859 print_packet_info(WDPacketData * pkt, bool sending)
7860 {
7861 	int			i;
7862 	packet_types *pkt_type = NULL;
7863 
7864 	/*
7865 	 * save the cpu cycles if our log level would swallow this message
7866 	 */
7867 	if (pool_config->log_min_messages > DEBUG2)
7868 		return;
7869 
7870 	for (i = 0;; i++)
7871 	{
7872 		if (all_packet_types[i].type == WD_NO_MESSAGE)
7873 			break;
7874 
7875 		if (all_packet_types[i].type == pkt->type)
7876 		{
7877 			pkt_type = &all_packet_types[i];
7878 			break;
7879 		}
7880 	}
7881 
7882 	ereport(DEBUG2,
7883 			(errmsg("%s watchdog packet, command id:[%d] type:[%s] state :[%s]",
7884 					sending ? "sending" : "received",
7885 					pkt->command_id,
7886 					pkt_type ? pkt_type->name : "UNKNOWN",
7887 					wd_state_names[get_local_node_state()])));
7888 }
7889 
7890 static int
send_command_packet_to_remote_nodes(WDCommandData * ipcCommand,bool source_included)7891 send_command_packet_to_remote_nodes(WDCommandData * ipcCommand, bool source_included)
7892 {
7893 	int			i;
7894 
7895 	ipcCommand->commandSendToCount = 0;
7896 	ipcCommand->commandReplyFromCount = 0;
7897 	ipcCommand->commandSendToErrorCount = 0;
7898 	allocate_resultNodes_in_command(ipcCommand);
7899 	ereport(DEBUG2,
7900 			(errmsg("sending the %c type message to \"%s\"",
7901 					ipcCommand->commandPacket.type,
7902 					ipcCommand->sendToNode ? ipcCommand->sendToNode->nodeName : "ALL NODES")));
7903 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
7904 	{
7905 		WDCommandNodeResult *nodeResult = &ipcCommand->nodeResults[i];
7906 
7907 		if (ipcCommand->sendToNode != NULL && ipcCommand->sendToNode != nodeResult->wdNode)
7908 		{
7909 			/*
7910 			 * The command is intended for specific node and this is not the
7911 			 * one
7912 			 */
7913 			nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7914 		}
7915 		else if (source_included == false && ipcCommand->sourceWdNode == nodeResult->wdNode &&
7916 				 ipcCommand->commandSource == COMMAND_SOURCE_REMOTE)
7917 		{
7918 			ereport(DEBUG1,
7919 					(errmsg("not sending the %c type message to command originator node \"%s\"",
7920 							ipcCommand->commandPacket.type, nodeResult->wdNode->nodeName)));
7921 
7922 			/*
7923 			 * The message is not supposed to be sent to the watchdog node
7924 			 * that started this command
7925 			 */
7926 			nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7927 		}
7928 		else if (is_node_active(nodeResult->wdNode) == false)
7929 		{
7930 			nodeResult->cmdState = COMMAND_STATE_DO_NOT_SEND;
7931 		}
7932 		else if (is_node_reachable(nodeResult->wdNode) == false)
7933 		{
7934 			nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
7935 			ipcCommand->commandSendToErrorCount++;
7936 		}
7937 		else if (send_message_to_node(nodeResult->wdNode, &ipcCommand->commandPacket) == true)
7938 		{
7939 			ereport(DEBUG2,
7940 					(errmsg("%c type message written to socket for node \"%s\"",
7941 							ipcCommand->commandPacket.type, nodeResult->wdNode->nodeName)));
7942 
7943 			nodeResult->cmdState = COMMAND_STATE_SENT;
7944 			ipcCommand->commandSendToCount++;
7945 		}
7946 		else
7947 		{
7948 			nodeResult->cmdState = COMMAND_STATE_SEND_ERROR;
7949 			ipcCommand->commandSendToErrorCount++;
7950 		}
7951 	}
7952 	return ipcCommand->commandSendToCount;
7953 }
7954 
7955 static void
set_cluster_leader_node(WatchdogNode * wdNode)7956 set_cluster_leader_node(WatchdogNode * wdNode)
7957 {
7958 	if (WD_LEADER_NODE != wdNode)
7959 	{
7960 		if (wdNode == NULL)
7961 			ereport(LOG,
7962 					(errmsg("removing the %s node \"%s\" from watchdog cluster leader",
7963 							(g_cluster.localNode == WD_LEADER_NODE) ? "local" : "remote",
7964 							WD_LEADER_NODE->nodeName)));
7965 		else
7966 			ereport(LOG,
7967 					(errmsg("setting the %s node \"%s\" as watchdog cluster leader",
7968 							(g_cluster.localNode == wdNode) ? "local" : "remote",
7969 							wdNode->nodeName)));
7970 		g_cluster.clusterLeaderInfo.leaderNode = wdNode;
7971 	}
7972 }
7973 
7974 static WatchdogNode*
getLeaderWatchdogNode(void)7975 getLeaderWatchdogNode(void)
7976 {
7977 	return g_cluster.clusterLeaderInfo.leaderNode;
7978 }
7979 
7980 static int
update_cluster_memberships(void)7981 update_cluster_memberships(void)
7982 {
7983 	int i;
7984 	g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
7985 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
7986 	{
7987 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
7988 		if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
7989 			g_cluster.memberRemoteNodeCount--;
7990 	}
7991 	return g_cluster.memberRemoteNodeCount;
7992 }
7993 
7994 static int
revoke_cluster_membership_of_node(WatchdogNode * wdNode,WD_NODE_MEMBERSHIP_STATUS revoke_status)7995 revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status)
7996 {
7997 	if (wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE)
7998 	{
7999 		wdNode->membership_status = revoke_status;
8000 
8001 		ereport(LOG,
8002 				(errmsg("revoking the membership of [%s] node:\"%s\" [node_id:%d]",
8003 							wd_state_names[wdNode->state], wdNode->nodeName,wdNode->pgpool_node_id),
8004 				 errdetail("membership revoke reason: \"%s\"",
8005 						   wd_cluster_membership_status[wdNode->membership_status])));
8006 
8007 		g_cluster.memberRemoteNodeCount--;
8008 	}
8009 	return g_cluster.memberRemoteNodeCount;
8010 }
8011 
8012 static int
restore_cluster_membership_of_node(WatchdogNode * wdNode)8013 restore_cluster_membership_of_node(WatchdogNode* wdNode)
8014 {
8015 	if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
8016 	{
8017 		ereport(LOG,
8018 				(errmsg("Restoring cluster membership of node:\"%s\"",wdNode->nodeName),
8019 				 errdetail("membership of node was revoked because it was \"%s\"",
8020 				   wd_cluster_membership_status[wdNode->membership_status])));
8021 
8022 		wdNode->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
8023 		/* reset the lost time on the node */
8024 		wdNode->lost_time.tv_sec = 0;
8025 		wdNode->lost_time.tv_usec = 0;
8026 		g_cluster.memberRemoteNodeCount++;
8027 	}
8028 	return g_cluster.memberRemoteNodeCount;
8029 }
8030 
8031 static void
reset_lost_timers(void)8032 reset_lost_timers(void)
8033 {
8034 	int i;
8035 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
8036 	{
8037 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
8038 		wdNode->lost_time.tv_sec = 0;
8039 		wdNode->lost_time.tv_usec = 0;
8040 	}
8041 }
8042 
8043 static int
standby_node_join_cluster(WatchdogNode * wdNode)8044 standby_node_join_cluster(WatchdogNode * wdNode)
8045 {
8046 	if (get_local_node_state() == WD_COORDINATOR)
8047 	{
8048 		int			i;
8049 		/* Just rest the lost time stamp*/
8050 		/* set the timestamp on node to track for how long this node is lost */
8051 		wdNode->lost_time.tv_sec = 0;
8052 		wdNode->lost_time.tv_usec = 0;
8053 		/* First check if the node is already in the List */
8054 		for (i = 0; i < g_cluster.clusterLeaderInfo.standby_nodes_count; i++)
8055 		{
8056 			WatchdogNode *node = g_cluster.clusterLeaderInfo.standbyNodes[i];
8057 
8058 			if (node && node == wdNode)
8059 			{
8060 				/* The node is already in the standby list */
8061 				return g_cluster.clusterLeaderInfo.standby_nodes_count;
8062 			}
8063 		}
8064 		/* okay the node is not in the list */
8065 		ereport(LOG,
8066 				(errmsg("adding watchdog node \"%s\" to the standby list", wdNode->nodeName)));
8067 		g_cluster.clusterLeaderInfo.standbyNodes[g_cluster.clusterLeaderInfo.standby_nodes_count] = wdNode;
8068 		g_cluster.clusterLeaderInfo.standby_nodes_count++;
8069 	}
8070 	g_cluster.localNode->standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8071 	return g_cluster.clusterLeaderInfo.standby_nodes_count;
8072 }
8073 
8074 static int
standby_node_left_cluster(WatchdogNode * wdNode)8075 standby_node_left_cluster(WatchdogNode * wdNode)
8076 {
8077 	if (get_local_node_state() == WD_COORDINATOR)
8078 	{
8079 		int			i;
8080 		bool		removed = false;
8081 		int			standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8082 
8083 		for (i = 0; i < standby_nodes_count; i++)
8084 		{
8085 			WatchdogNode *node = g_cluster.clusterLeaderInfo.standbyNodes[i];
8086 
8087 			if (node)
8088 			{
8089 				if (removed)
8090 				{
8091 					/* move this to previous index */
8092 					g_cluster.clusterLeaderInfo.standbyNodes[i - 1] = node;
8093 					g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8094 				}
8095 				else if (node == wdNode)
8096 				{
8097 					/*
8098 					 * okay we have found the node in the list.
8099 					 */
8100 					ereport(LOG,
8101 							(errmsg("removing watchdog node \"%s\" from the standby list", wdNode->nodeName)));
8102 					/* set the timestamp on node to track for how long this node is lost */
8103 					gettimeofday(&wdNode->lost_time, NULL);
8104 					g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8105 					g_cluster.clusterLeaderInfo.standby_nodes_count--;
8106 					removed = true;
8107 				}
8108 			}
8109 		}
8110 	}
8111 	g_cluster.localNode->standby_nodes_count = g_cluster.clusterLeaderInfo.standby_nodes_count;
8112 	return g_cluster.clusterLeaderInfo.standby_nodes_count;
8113 }
8114 
8115 static void
clear_standby_nodes_list(void)8116 clear_standby_nodes_list(void)
8117 {
8118 	int			i;
8119 
8120 	ereport(DEBUG1,
8121 			(errmsg("removing all watchdog nodes from the standby list"),
8122 			 errdetail("standby list contains %d nodes", g_cluster.clusterLeaderInfo.standby_nodes_count)));
8123 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
8124 	{
8125 		g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
8126 	}
8127 	g_cluster.clusterLeaderInfo.standby_nodes_count = 0;
8128 	g_cluster.localNode->standby_nodes_count = 0;
8129 }
8130 
/*
 * Maintain the missed_beacon_count of the remote nodes.
 *
 * ipcCommand - command whose per-node results are inspected; it is not
 *              touched when clear is true
 * clear      - when true, simply reset the counter of every remote node
 *
 * When clear is false and the command is no longer COMMAND_IN_PROGRESS,
 * each node result is examined:
 *   - state COMMAND_STATE_SENT: the counter is incremented for a node in
 *     WD_STANDBY state (and logged once it exceeds 1), and reset for a
 *     node in any other state;
 *   - state COMMAND_STATE_REPLIED: the counter is reset, with a log
 *     message if beacons had been missed before.
 */
static void
update_missed_beacon_count(WDCommandData* ipcCommand, bool clear)
{
	int			idx;

	if (clear)
	{
		for (idx = 0; idx < g_cluster.remoteNodeCount; idx++)
			g_cluster.remoteNodes[idx].missed_beacon_count = 0;
		return;
	}

	for (idx = 0; idx < g_cluster.remoteNodeCount; idx++)
	{
		WDCommandNodeResult *result;

		/* results are meaningful only once the command has finished */
		if (ipcCommand->commandStatus == COMMAND_IN_PROGRESS)
			return;

		result = &ipcCommand->nodeResults[idx];

		if (result->cmdState == COMMAND_STATE_SENT)
		{
			if (result->wdNode->state != WD_STANDBY)
			{
				result->wdNode->missed_beacon_count = 0;
			}
			else
			{
				result->wdNode->missed_beacon_count++;
				if (result->wdNode->missed_beacon_count > 1)
					ereport(LOG,
						(errmsg("remote node \"%s\" is not replying to our beacons",result->wdNode->nodeName),
						 errdetail("missed beacon reply count:%d",result->wdNode->missed_beacon_count)));
			}
		}
		else if (result->cmdState == COMMAND_STATE_REPLIED)
		{
			if (result->wdNode->missed_beacon_count > 0)
				ereport(LOG,
						(errmsg("remote node \"%s\" is replying again after missing %d beacons",result->wdNode->nodeName,
								result->wdNode->missed_beacon_count)));
			result->wdNode->missed_beacon_count = 0;
		}
	}
}
8171 
8172 #ifdef WATCHDOG_DEBUG
8173 /*
8174  * Node down request file. In the file, each line consists of watchdog
8175  * debug command.  The possible commands are same as the defines below
8176  * for example to stop Pgpool-II from sending the reply to beacon messages
8177  * from the leader node write DO_NOT_REPLY_TO_BEACON in watchdog_debug_requests
8178  *
8179  *
8180  * echo "DO_NOT_REPLY_TO_BEACON" > pgpool_logdir/watchdog_debug_requests
8181  */
8182 
/* One row of the debug command table: a command name and its flag bit. */
typedef struct watchdog_debug_commands
{
	char		command[100];	/* command name, as written in the request file */
	unsigned int code;			/* bit OR-ed into watchdog_debug_command */
}			watchdog_debug_commands;

/* Bitwise OR of the codes of all debug requests currently in effect. */
unsigned int watchdog_debug_command = 0;


/* File name of the debug request file, looked up under pool_config->logdir. */
#define WATCHDOG_DEBUG_FILE	"watchdog_debug_requests"

/* Debug request flags; each request occupies a bit of its own. */
#define DO_NOT_REPLY_TO_BEACON 	0x01
#define DO_NOT_SEND_BEACON 		0x02
#define KILL_ALL_COMMUNICATION	0x04
#define KILL_ALL_RECEIVERS		0x08
#define KILL_ALL_SENDERS		0x10


/*
 * Table mapping each command name to its flag. The entry with an empty
 * name and a zero code marks the end of the table.
 */
watchdog_debug_commands wd_debug_commands[] = {
	{"DO_NOT_REPLY_TO_BEACON", DO_NOT_REPLY_TO_BEACON},
	{"DO_NOT_SEND_BEACON", DO_NOT_SEND_BEACON},
	{"KILL_ALL_COMMUNICATION", KILL_ALL_COMMUNICATION},
	{"KILL_ALL_RECEIVERS", KILL_ALL_RECEIVERS},
	{"KILL_ALL_SENDERS", KILL_ALL_SENDERS},
	{"", 0}
};
8209 
8210 static bool
check_debug_request_kill_all_communication(void)8211 check_debug_request_kill_all_communication(void)
8212 {
8213 	return (watchdog_debug_command & KILL_ALL_COMMUNICATION);
8214 }
8215 static bool
check_debug_request_kill_all_receivers(void)8216 check_debug_request_kill_all_receivers(void)
8217 {
8218 	return (watchdog_debug_command & KILL_ALL_RECEIVERS);
8219 }
8220 static bool
check_debug_request_kill_all_senders(void)8221 check_debug_request_kill_all_senders(void)
8222 {
8223 	return (watchdog_debug_command & KILL_ALL_SENDERS);
8224 }
8225 
8226 static bool
check_debug_request_do_not_send_beacon(void)8227 check_debug_request_do_not_send_beacon(void)
8228 {
8229 	return (watchdog_debug_command & DO_NOT_SEND_BEACON);
8230 }
8231 
8232 static bool
check_debug_request_do_not_reply_beacon(void)8233 check_debug_request_do_not_reply_beacon(void)
8234 {
8235 	return (watchdog_debug_command & DO_NOT_REPLY_TO_BEACON);
8236 }
8237 /*
8238  * Check watchdog debug request options file for debug commands
8239  * each line should contain only one command
8240  *
8241  * Possible commands
8242  * 		DO_NOT_REPLY_TO_BEACON
8243  *		DO_NOT_SEND_BEACON
8244  *		KILL_ALL_COMMUNICATION
8245  *		KILL_ALL_RECEIVERS
8246  *		KILL_ALL_SENDERS
8247  */
8248 
8249 static void
load_watchdog_debug_test_option(void)8250 load_watchdog_debug_test_option(void)
8251 {
8252 	static char wd_debug_request_file[POOLMAXPATHLEN];
8253 	FILE	   *fd;
8254 	int			i;
8255 #define MAXLINE 128
8256 	char		readbuf[MAXLINE];
8257 
8258 	watchdog_debug_command = 0;
8259 
8260 	if (wd_debug_request_file[0] == '\0')
8261 	{
8262 		snprintf(wd_debug_request_file, sizeof(wd_debug_request_file),
8263 				 "%s/%s", pool_config->logdir, WATCHDOG_DEBUG_FILE);
8264 	}
8265 
8266 	fd = fopen(wd_debug_request_file, "r");
8267 	if (!fd)
8268 	{
8269 		ereport(DEBUG3,
8270 				(errmsg("load_watchdog_debug_test_option: failed to open file %s",
8271 						wd_debug_request_file),
8272 				 errdetail("%m")));
8273 		return;
8274 	}
8275 
8276 	for (i = 0;; i++)
8277 	{
8278 		int cmd = 0;
8279 		bool valid_command = false;
8280 		readbuf[MAXLINE - 1] = '\0';
8281 		if (fgets(readbuf, MAXLINE - 1, fd) == 0)
8282 			break;
8283 		for (cmd =0 ;; cmd++)
8284 		{
8285 			if (strlen(wd_debug_commands[cmd].command) == 0 || wd_debug_commands[cmd].code == 0)
8286 				break;
8287 
8288 			if (strncasecmp(wd_debug_commands[cmd].command,readbuf,strlen(wd_debug_commands[cmd].command)) == 0)
8289 			{
8290 				ereport(DEBUG3,
8291 						(errmsg("Watchdog DEBUG COMMAND %d: \"%s\" request found",
8292 								cmd,wd_debug_commands[cmd].command)));
8293 
8294 				watchdog_debug_command |= wd_debug_commands[cmd].code;
8295 				valid_command = true;
8296 				break;
8297 			}
8298 		}
8299 		if (!valid_command)
8300 			ereport(WARNING,
8301 				(errmsg("%s file contains invalid command",
8302 							wd_debug_request_file),
8303 					 errdetail("\"%s\" not recognized", readbuf)));
8304 	}
8305 
8306 	fclose(fd);
8307 }
8308 #else
8309 /*
8310  * All these command checks return false when WATCHDOG_DEBUG is
8311  * not enabled
8312  */
/* Stub for builds without WATCHDOG_DEBUG: this request is never reported. */
static bool
check_debug_request_do_not_send_beacon(void)
{
	return false;
}
/* Stub for builds without WATCHDOG_DEBUG: this request is never reported. */
static bool
check_debug_request_do_not_reply_beacon(void)
{
	return false;
}
/* Stub for builds without WATCHDOG_DEBUG: this request is never reported. */
static bool
check_debug_request_kill_all_communication(void)
{
	return false;
}
/* Stub for builds without WATCHDOG_DEBUG: this request is never reported. */
static bool
check_debug_request_kill_all_receivers(void)
{
	return false;
}
/* Stub for builds without WATCHDOG_DEBUG: this request is never reported. */
static bool
check_debug_request_kill_all_senders(void)
{
	return false;
}
8328 #endif
8329