1 /*
2  * repmgr-action-node.c
3  *
4  * Implements actions available for any kind of node
5  *
6  * Copyright (c) 2ndQuadrant, 2010-2020
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <sys/stat.h>
23 #include <dirent.h>
24 
25 #include "repmgr.h"
26 #include "controldata.h"
27 #include "dirutil.h"
28 #include "dbutils.h"
29 #include "compat.h"
30 
31 #include "repmgr-client-global.h"
32 #include "repmgr-action-node.h"
33 #include "repmgr-action-standby.h"
34 
35 static bool copy_file(const char *src_file, const char *dest_file);
36 static void format_archive_dir(PQExpBufferData *archive_dir);
37 static t_server_action parse_server_action(const char *action);
38 
39 static void exit_optformat_error(const char *error, int errcode);
40 
41 static void _do_node_service_list_actions(t_server_action action);
42 static void _do_node_status_is_shutdown_cleanly(void);
43 static void _do_node_archive_config(void);
44 static void _do_node_restore_config(void);
45 
46 static void do_node_check_replication_connection(void);
47 static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
48 static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
49 static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
50 static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
51 static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
52 static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
53 static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
54 static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
55 static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
56 static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode);
57 
58 /*
59  * NODE STATUS
60  *
61  * Can only be run on the local node, as it needs to be able to
62  * read the data directory.
63  *
64  * Parameters:
65  *   --is-shutdown-cleanly (for internal use only)
66  *   --csv
67  */
68 
69 void
do_node_status(void)70 do_node_status(void)
71 {
72 	PGconn	   *conn = NULL;
73 
74 	t_node_info node_info = T_NODE_INFO_INITIALIZER;
75 	char		cluster_size[MAXLEN];
76 	PQExpBufferData output;
77 
78 	KeyValueList node_status = {NULL, NULL};
79 	KeyValueListCell *cell = NULL;
80 	NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
81 
82 	ItemList	warnings = {NULL, NULL};
83 	RecoveryType recovery_type = RECTYPE_UNKNOWN;
84 	ReplInfo	replication_info;
85 	t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER;
86 
87 	char		data_dir[MAXPGPATH] = "";
88 	char		server_version_str[MAXVERSIONSTR] = "";
89 
90 	/*
91 	 * A database connection is *not* required for this check
92 	 */
93 	if (runtime_options.is_shutdown_cleanly == true)
94 	{
95 		return _do_node_status_is_shutdown_cleanly();
96 	}
97 
98 	init_replication_info(&replication_info);
99 
100 
101 	/* config file required, so we should have "conninfo" and "data_directory" */
102 	conn = establish_db_connection(config_file_options.conninfo, true);
103 	strncpy(data_dir, config_file_options.data_directory, MAXPGPATH);
104 
105 	(void)get_server_version(conn, server_version_str);
106 
107 	/* check node exists  */
108 
109 	if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
110 	{
111 		log_error(_("no record found for node %i"), config_file_options.node_id);
112 		PQfinish(conn);
113 		exit(ERR_BAD_CONFIG);
114 	}
115 
116 	if (get_cluster_size(conn, cluster_size) == false)
117 		strncpy(cluster_size, _("unknown"), MAXLEN);
118 
119 	recovery_type = get_recovery_type(conn);
120 
121 	get_node_replication_stats(conn, &node_info);
122 
123 	key_value_list_set(&node_status,
124 					   "PostgreSQL version",
125 					   server_version_str);
126 
127 	key_value_list_set(&node_status,
128 					   "Total data size",
129 					   cluster_size);
130 
131 	key_value_list_set(&node_status,
132 					   "Conninfo",
133 					   node_info.conninfo);
134 
135 	if (runtime_options.verbose == true)
136 	{
137 		uint64		local_system_identifier = get_system_identifier(config_file_options.data_directory);
138 
139 		if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
140 		{
141 			key_value_list_set(&node_status,
142 							   "System identifier",
143 							   "unknown");
144 			item_list_append_format(&warnings,
145 									_("unable to retrieve system identifier from pg_control"));
146 		}
147 		else
148 		{
149 			key_value_list_set_format(&node_status,
150 									  "System identifier",
151 									  "%lu", local_system_identifier);
152 		}
153 	}
154 
155 	key_value_list_set(&node_status,
156 					   "Role",
157 					   get_node_type_string(node_info.type));
158 
159 	switch (node_info.type)
160 	{
161 		case PRIMARY:
162 			if (recovery_type == RECTYPE_STANDBY)
163 			{
164 				item_list_append(&warnings,
165 								 _("- node is registered as primary but running as standby"));
166 			}
167 			break;
168 		case STANDBY:
169 			if (recovery_type == RECTYPE_PRIMARY)
170 			{
171 				item_list_append(&warnings,
172 								 _("- node is registered as standby but running as primary"));
173 			}
174 			break;
175 		default:
176 			break;
177 	}
178 
179 	if (guc_set(conn, "archive_mode", "=", "off"))
180 	{
181 		key_value_list_set(&node_status,
182 						   "WAL archiving",
183 						   "off");
184 
185 		key_value_list_set(&node_status,
186 						   "Archive command",
187 						   "(none)");
188 	}
189 	else
190 	{
191 		/* "archive_mode" is not "off", i.e. one of "on", "always" */
192 		bool		enabled = true;
193 		PQExpBufferData archiving_status;
194 		char		archive_command[MAXLEN] = "";
195 
196 		initPQExpBuffer(&archiving_status);
197 
198 		/*
199 		 * if the node is a standby, and "archive_mode" is "on", archiving will
200 		 * actually be disabled.
201 		 */
202 		if (recovery_type == RECTYPE_STANDBY)
203 		{
204 			if (guc_set(conn, "archive_mode", "=", "on"))
205 				enabled = false;
206 		}
207 
208 		if (enabled == true)
209 		{
210 			appendPQExpBufferStr(&archiving_status, "enabled");
211 		}
212 		else
213 		{
214 			appendPQExpBufferStr(&archiving_status, "disabled");
215 		}
216 
217 		if (enabled == false && recovery_type == RECTYPE_STANDBY)
218 		{
219 			if (PQserverVersion(conn) >= 90500)
220 			{
221 				appendPQExpBufferStr(&archiving_status,
222 									 " (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
223 			}
224 			else
225 			{
226 				appendPQExpBufferStr(&archiving_status,
227 									 " (\"archive_mode\" has no effect on standbys)");
228 			}
229 		}
230 
231 		key_value_list_set(&node_status,
232 						   "WAL archiving",
233 						   archiving_status.data);
234 
235 		termPQExpBuffer(&archiving_status);
236 
237 		get_pg_setting(conn, "archive_command", archive_command);
238 
239 		key_value_list_set(&node_status,
240 						   "Archive command",
241 						   archive_command);
242 	}
243 
244 	{
245 		int			ready_files;
246 
247 		ready_files = get_ready_archive_files(conn, data_dir);
248 
249 		if (ready_files == ARCHIVE_STATUS_DIR_ERROR)
250 		{
251 			item_list_append_format(&warnings,
252 									"- unable to check archive_status directory\n");
253 		}
254 		else
255 		{
256 			if (runtime_options.output_mode == OM_CSV)
257 			{
258 				key_value_list_set_format(&node_status,
259 										  "WALs pending archiving",
260 										  "%i",
261 										  ready_files);
262 			}
263 			else
264 			{
265 				key_value_list_set_format(&node_status,
266 										  "WALs pending archiving",
267 										  "%i pending files",
268 										  ready_files);
269 			}
270 		}
271 
272 		if (guc_set(conn, "archive_mode", "=", "off"))
273 		{
274 			key_value_list_set_output_mode(&node_status, "WALs pending archiving", OM_CSV);
275 		}
276 
277 	}
278 
279 
280 	if (node_info.max_wal_senders >= 0)
281 	{
282 		/* In CSV mode, raw values supplied as well */
283 		key_value_list_set_format(&node_status,
284 								  "Replication connections",
285 								  "%i (of maximal %i)",
286 								  node_info.attached_wal_receivers,
287 								  node_info.max_wal_senders);
288 	}
289 	else if (node_info.max_wal_senders == 0)
290 	{
291 		key_value_list_set_format(&node_status,
292 								  "Replication connections",
293 								  "disabled");
294 	}
295 
296 	/* check for attached nodes */
297 	{
298 		NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
299 		NodeInfoListCell *node_cell = NULL;
300 		ItemList	missing_nodes = {NULL, NULL};
301 		int			missing_nodes_count = 0;
302 		int			expected_nodes_count = 0;
303 
304 		get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
305 
306 		/* if a witness node is present, we'll need to remove this from the total */
307 		expected_nodes_count = downstream_nodes.node_count;
308 
309 		for (node_cell = downstream_nodes.head; node_cell; node_cell = node_cell->next)
310 		{
311 			/* skip witness server */
312 			if (node_cell->node_info->type == WITNESS)
313 			{
314 				expected_nodes_count --;
315 				continue;
316 			}
317 
318 			if (is_downstream_node_attached(conn, node_cell->node_info->node_name, NULL) != NODE_ATTACHED)
319 			{
320 				missing_nodes_count++;
321 				item_list_append_format(&missing_nodes,
322 										"%s (ID: %i)",
323 										node_cell->node_info->node_name,
324 										node_cell->node_info->node_id);
325 			}
326 		}
327 
328 		if (missing_nodes_count)
329 		{
330 			ItemListCell *missing_cell = NULL;
331 
332 			item_list_append_format(&warnings,
333 									_("- %i of %i downstream nodes not attached:"),
334 									missing_nodes_count,
335 									expected_nodes_count);
336 
337 			for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
338 			{
339 				item_list_append_format(&warnings,
340 										"  - %s\n", missing_cell->string);
341 			}
342 		}
343 	}
344 
345 	if (node_info.max_replication_slots == 0)
346 	{
347 		key_value_list_set(&node_status,
348 						   "Replication slots",
349 						   "disabled");
350 	}
351 	else
352 	{
353 		PQExpBufferData slotinfo;
354 
355 		/*
356 		 * check for missing replication slots - we do this regardless of
357 		 * what "max_replication_slots" is set to, in case the downstream
358 		 * node was configured with "use_replication_slots=true" and is
359 		 * expecting a replication slot to be available
360 		 */
361 		get_downstream_nodes_with_missing_slot(conn,
362 											   config_file_options.node_id,
363 											   &missing_slots);
364 
365 		if (missing_slots.node_count > 0)
366 		{
367 			NodeInfoListCell *missing_slot_cell = NULL;
368 
369 			item_list_append_format(&warnings,
370 									_("- replication slots missing for following %i node(s):"),
371 									missing_slots.node_count);
372 
373 			for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
374 			{
375 				item_list_append_format(&warnings,
376 										_("  - %s (ID: %i, slot name: \"%s\")"),
377 										missing_slot_cell->node_info->node_name,
378 										missing_slot_cell->node_info->node_id,
379 										missing_slot_cell->node_info->slot_name);
380 			}
381 		}
382 
383 		initPQExpBuffer(&slotinfo);
384 
385 		appendPQExpBuffer(&slotinfo,
386 						  "%i physical (of maximal %i; %i missing)",
387 						  node_info.active_replication_slots + node_info.inactive_replication_slots,
388 						  node_info.max_replication_slots,
389 						  missing_slots.node_count);
390 
391 		if (node_info.inactive_replication_slots > 0)
392 		{
393 			KeyValueList inactive_replication_slots = {NULL, NULL};
394 			KeyValueListCell *cell = NULL;
395 
396 			(void) get_inactive_replication_slots(conn, &inactive_replication_slots);
397 
398 			appendPQExpBuffer(&slotinfo,
399 							  "; %i inactive",
400 							  node_info.inactive_replication_slots);
401 
402 			item_list_append_format(&warnings,
403 									_("- node has %i inactive physical replication slots"),
404 									node_info.inactive_replication_slots);
405 
406 			for (cell = inactive_replication_slots.head; cell; cell = cell->next)
407 			{
408 				item_list_append_format(&warnings,
409 										"  - %s", cell->key);
410 			}
411 
412 			key_value_list_free(&inactive_replication_slots);
413 		}
414 
415 		key_value_list_set(&node_status,
416 						   "Replication slots",
417 						   slotinfo.data);
418 
419 		termPQExpBuffer(&slotinfo);
420 	}
421 
422 
423 	if (node_info.type == STANDBY)
424 	{
425 		key_value_list_set_format(&node_status,
426 								  "Upstream node",
427 								  "%s (ID: %i)",
428 								  node_info.upstream_node_name,
429 								  node_info.upstream_node_id);
430 
431 		get_replication_info(conn, node_info.type, &replication_info);
432 
433 		key_value_list_set_format(&node_status,
434 								  "Replication lag",
435 								  "%i seconds",
436 								  replication_info.replication_lag_time);
437 
438 		key_value_list_set_format(&node_status,
439 								  "Last received LSN",
440 								  "%X/%X", format_lsn(replication_info.last_wal_receive_lsn));
441 
442 		key_value_list_set_format(&node_status,
443 								  "Last replayed LSN",
444 								  "%X/%X", format_lsn(replication_info.last_wal_replay_lsn));
445 	}
446 	else
447 	{
448 		key_value_list_set(&node_status,
449 						   "Upstream node",
450 						   "(none)");
451 		key_value_list_set_output_mode(&node_status,
452 									   "Upstream node",
453 									   OM_CSV);
454 
455 		key_value_list_set(&node_status,
456 						   "Replication lag",
457 						   "n/a");
458 
459 		key_value_list_set(&node_status,
460 						   "Last received LSN",
461 						   "(none)");
462 
463 		key_value_list_set_output_mode(&node_status,
464 									   "Last received LSN",
465 									   OM_CSV);
466 
467 		key_value_list_set(&node_status,
468 						   "Last replayed LSN",
469 						   "(none)");
470 
471 		key_value_list_set_output_mode(&node_status,
472 									   "Last replayed LSN",
473 									   OM_CSV);
474 	}
475 
476 
477 	parse_recovery_conf(data_dir, &recovery_conf);
478 
479 	/* format output */
480 	initPQExpBuffer(&output);
481 
482 	if (runtime_options.output_mode == OM_CSV)
483 	{
484 		appendPQExpBuffer(&output,
485 						  "\"Node name\",\"%s\"\n",
486 						  node_info.node_name);
487 
488 		appendPQExpBuffer(&output,
489 						  "\"Node ID\",\"%i\"\n",
490 						  node_info.node_id);
491 
492 		for (cell = node_status.head; cell; cell = cell->next)
493 		{
494 			appendPQExpBuffer(&output,
495 							  "\"%s\",\"%s\"\n",
496 							  cell->key, cell->value);
497 		}
498 
499 		/* we'll add the raw data as well */
500 		appendPQExpBuffer(&output,
501 						  "\"max_wal_senders\",%i\n",
502 						  node_info.max_wal_senders);
503 
504 		appendPQExpBuffer(&output,
505 						  "\"occupied_wal_senders\",%i\n",
506 						  node_info.attached_wal_receivers);
507 
508 		appendPQExpBuffer(&output,
509 						  "\"max_replication_slots\",%i\n",
510 						  node_info.max_replication_slots);
511 
512 		appendPQExpBuffer(&output,
513 						  "\"active_replication_slots\",%i\n",
514 						  node_info.active_replication_slots);
515 
516 		/* output inactive slot information */
517 		appendPQExpBuffer(&output,
518 						  "\"inactive_replication_slots\",%i",
519 						  node_info.inactive_replication_slots);
520 
521 		if (node_info.inactive_replication_slots)
522 		{
523 			KeyValueList inactive_replication_slots = {NULL, NULL};
524 			KeyValueListCell *cell = NULL;
525 
526 			(void) get_inactive_replication_slots(conn, &inactive_replication_slots);
527 			for (cell = inactive_replication_slots.head; cell; cell = cell->next)
528 			{
529 				appendPQExpBuffer(&output,
530 								  ",\"%s\"", cell->key);
531 			}
532 
533 			key_value_list_free(&inactive_replication_slots);
534 		}
535 
536 		/* output missing slot information */
537 
538 		appendPQExpBufferChar(&output, '\n');
539 		appendPQExpBuffer(&output,
540 						  "\"missing_replication_slots\",%i",
541 						  missing_slots.node_count);
542 
543 		if (missing_slots.node_count > 0)
544 		{
545 			NodeInfoListCell *missing_slot_cell = NULL;
546 
547 			for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
548 			{
549 				appendPQExpBuffer(&output,
550 								  ",\"%s\"", missing_slot_cell->node_info->slot_name);
551 			}
552 		}
553 
554 	}
555 	else
556 	{
557 		appendPQExpBuffer(&output,
558 						  "Node \"%s\":\n",
559 						  node_info.node_name);
560 
561 		for (cell = node_status.head; cell; cell = cell->next)
562 		{
563 			if (cell->output_mode == OM_NOT_SET)
564 				appendPQExpBuffer(&output,
565 								  "\t%s: %s\n",
566 								  cell->key, cell->value);
567 		}
568 	}
569 
570 	puts(output.data);
571 
572 	termPQExpBuffer(&output);
573 
574 	if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode == OM_TEXT)
575 	{
576 		log_warning(_("following issue(s) were detected:"));
577 		print_item_list(&warnings);
578 		log_hint(_("execute \"repmgr node check\" for more details"));
579 	}
580 
581 	clear_node_info_list(&missing_slots);
582 	key_value_list_free(&node_status);
583 	item_list_free(&warnings);
584 	PQfinish(conn);
585 
586 	/*
587 	 * If warnings were noted, even if they're not displayed (e.g. in --csv node),
588 	 * that means something's not right so we need to emit a non-zero exit code.
589 	 */
590 	if (warnings.head != NULL)
591 	{
592 		exit(ERR_NODE_STATUS);
593 	}
594 
595 	return;
596 }
597 
598 
599 /*
600  * Returns information about the running state of the node.
601  * For internal use during "standby switchover".
602  *
603  * Returns "longopt" output:
604  *
605  * --status=(RUNNING|SHUTDOWN|UNCLEAN_SHUTDOWN|UNKNOWN)
606  * --last-checkpoint=...
607  */
608 
609 static void
_do_node_status_is_shutdown_cleanly(void)610 _do_node_status_is_shutdown_cleanly(void)
611 {
612 	PGPing		ping_status;
613 	PQExpBufferData output;
614 
615 	DBState		db_state;
616 	XLogRecPtr	checkPoint = InvalidXLogRecPtr;
617 
618 	NodeStatus	node_status = NODE_STATUS_UNKNOWN;
619 
620 	initPQExpBuffer(&output);
621 
622 	appendPQExpBufferStr(&output,
623 					  "--state=");
624 
625 	/* sanity-check we're dealing with a PostgreSQL directory */
626 	if (is_pg_dir(config_file_options.data_directory) == false)
627 	{
628 		appendPQExpBufferStr(&output, "UNKNOWN");
629 		printf("%s\n", output.data);
630 		termPQExpBuffer(&output);
631 		return;
632 	}
633 
634 	ping_status = PQping(config_file_options.conninfo);
635 
636 	switch (ping_status)
637 	{
638 		case PQPING_OK:
639 			node_status = NODE_STATUS_UP;
640 			break;
641 		case PQPING_REJECT:
642 			node_status = NODE_STATUS_UP;
643 			break;
644 		case PQPING_NO_ATTEMPT:
645 		case PQPING_NO_RESPONSE:
646 			/* status not yet clear */
647 			break;
648 	}
649 
650 	/* check what pg_control says */
651 
652 	if (get_db_state(config_file_options.data_directory, &db_state) == false)
653 	{
654 		/*
655 		 * Unable to retrieve the database state from pg_control
656 		 */
657 		node_status = NODE_STATUS_UNKNOWN;
658 		log_verbose(LOG_DEBUG, "unable to determine db state");
659 		goto return_state;
660 	}
661 
662 	log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
663 
664 	if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
665 	{
666 		if (node_status != NODE_STATUS_UP)
667 		{
668 			node_status = NODE_STATUS_UNCLEAN_SHUTDOWN;
669 		}
670 		/* server is still responding but shutting down */
671 		else if (db_state == DB_SHUTDOWNING)
672 		{
673 			node_status = NODE_STATUS_SHUTTING_DOWN;
674 		}
675 	}
676 
677 	checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
678 
679 	if (checkPoint == InvalidXLogRecPtr)
680 	{
681 		/* unable to read pg_control, don't know what's happening */
682 		node_status = NODE_STATUS_UNKNOWN;
683 	}
684 	else if (node_status == NODE_STATUS_UNKNOWN)
685 	{
686 		/*
687 		 * if still "UNKNOWN" at this point, then the node must be cleanly shut
688 		 * down
689 		 */
690 		node_status = NODE_STATUS_DOWN;
691 	}
692 
693 
694 return_state:
695 
696 	log_verbose(LOG_DEBUG, "node status determined as: %s",
697 				print_node_status(node_status));
698 
699 	appendPQExpBuffer(&output,
700 					  "%s", print_node_status(node_status));
701 
702 	if (node_status == NODE_STATUS_DOWN)
703 	{
704 		appendPQExpBuffer(&output,
705 						  " --last-checkpoint-lsn=%X/%X",
706 						  format_lsn(checkPoint));
707 	}
708 
709 	printf("%s\n", output.data);
710 	termPQExpBuffer(&output);
711 	return;
712 }
713 
714 static void
exit_optformat_error(const char * error,int errcode)715 exit_optformat_error(const char *error, int errcode)
716 {
717 	PQExpBufferData output;
718 
719 	Assert(runtime_options.output_mode == OM_OPTFORMAT);
720 
721 	initPQExpBuffer(&output);
722 
723 	appendPQExpBuffer(&output,
724 					  "--error=%s",
725 					  error);
726 
727 	printf("%s\n", output.data);
728 
729 	termPQExpBuffer(&output);
730 
731 	exit(errcode);
732 }
733 
734 /*
735  * Configuration file required
736  */
737 void
do_node_check(void)738 do_node_check(void)
739 {
740 	PGconn	   *conn = NULL;
741 	PQExpBufferData output;
742 
743 	t_node_info node_info = T_NODE_INFO_INITIALIZER;
744 
745 	CheckStatus return_code;
746 	CheckStatusList status_list = {NULL, NULL};
747 	CheckStatusListCell *cell = NULL;
748 
749 	bool			issue_detected = false;
750 	bool			exit_on_connection_error = true;
751 
752 	/* for internal use */
753 	if (runtime_options.has_passfile == true)
754 	{
755 		return_code = has_passfile() ? 0 : 1;
756 
757 		exit(return_code);
758 	}
759 
760 	/* for use by "standby switchover" */
761 	if (runtime_options.replication_connection == true)
762 	{
763 		do_node_check_replication_connection();
764 		exit(SUCCESS);
765 	}
766 
767 	if (runtime_options.db_connection == true)
768 	{
769 		exit_on_connection_error = false;
770 	}
771 
772 	/*
773 	 * If --optformat was provided, we'll assume this is a remote invocation
774 	 * and instead of exiting with an error, we'll return an error string to
775 	 * so the remote invoker will know what's happened.
776 	 */
777 	if (runtime_options.output_mode == OM_OPTFORMAT)
778 	{
779 		exit_on_connection_error = false;
780 	}
781 
782 
783 	if (config_file_options.conninfo[0] != '\0')
784 	{
785 		t_conninfo_param_list node_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
786 		char	   *errmsg = NULL;
787 		bool		parse_success = false;
788 
789 		initialize_conninfo_params(&node_conninfo, false);
790 
791 		parse_success = parse_conninfo_string(config_file_options.conninfo,
792 											  &node_conninfo,
793 											  &errmsg, false);
794 
795 		if (parse_success == false)
796 		{
797 			if (runtime_options.output_mode == OM_OPTFORMAT)
798 			{
799 				exit_optformat_error("CONNINFO_PARSE",
800 									 ERR_BAD_CONFIG);
801 			}
802 
803 			log_error(_("unable to parse conninfo string \"%s\" for local node"),
804 					  config_file_options.conninfo);
805 			log_detail("%s", errmsg);
806 
807 			exit(ERR_BAD_CONFIG);
808 		}
809 
810 		/*
811 		 * If --superuser option provided, attempt to connect as the specified user
812 		 */
813 
814 		if (runtime_options.superuser[0] != '\0')
815 		{
816 			conn = establish_db_connection_with_replacement_param(
817 				config_file_options.conninfo,
818 				"user",
819 				runtime_options.superuser,
820 				exit_on_connection_error);
821 		}
822 		else
823 		{
824 			conn = establish_db_connection_by_params(&node_conninfo, exit_on_connection_error);
825 		}
826 	}
827 	else
828 	{
829 		conn = establish_db_connection_by_params(&source_conninfo, exit_on_connection_error);
830 	}
831 
832 
833 	/*
834 	 * --db-connection option provided
835 	 */
836 	if (runtime_options.db_connection == true)
837 	{
838 		return_code = do_node_check_db_connection(conn, runtime_options.output_mode);
839 		PQfinish(conn);
840 		exit(return_code);
841 	}
842 
843 	/*
844 	 * If we've reached here, and the connection is invalid, then --optformat was provided
845 	 */
846 	if (PQstatus(conn) != CONNECTION_OK)
847 	{
848 		exit_optformat_error("DB_CONNECTION",
849 							 ERR_DB_CONN);
850 	}
851 
852 	if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
853 	{
854 		log_error(_("no record found for node %i"), config_file_options.node_id);
855 		PQfinish(conn);
856 		exit(ERR_BAD_CONFIG);
857 	}
858 
859 	/* add replication statistics to node record */
860 	get_node_replication_stats(conn, &node_info);
861 
862 	/*
863 	 * handle specific checks ======================
864 	 */
865 	if (runtime_options.archive_ready == true)
866 	{
867 		return_code = do_node_check_archive_ready(conn,
868 												  runtime_options.output_mode,
869 												  NULL);
870 		PQfinish(conn);
871 		exit(return_code);
872 	}
873 
874 	if (runtime_options.upstream == true)
875 	{
876 		return_code = do_node_check_upstream(conn,
877 											 runtime_options.output_mode,
878 											 &node_info,
879 											 NULL);
880 		PQfinish(conn);
881 		exit(return_code);
882 	}
883 
884 	if (runtime_options.downstream == true)
885 	{
886 		return_code = do_node_check_downstream(conn,
887 											   runtime_options.output_mode,
888 											   &node_info,
889 											   NULL);
890 		PQfinish(conn);
891 		exit(return_code);
892 	}
893 
894 	if (runtime_options.replication_lag == true)
895 	{
896 		return_code = do_node_check_replication_lag(conn,
897 													runtime_options.output_mode,
898 													&node_info,
899 													NULL);
900 		PQfinish(conn);
901 		exit(return_code);
902 	}
903 
904 	if (runtime_options.role == true)
905 	{
906 		return_code = do_node_check_role(conn,
907 										 runtime_options.output_mode,
908 										 &node_info,
909 										 NULL);
910 		PQfinish(conn);
911 		exit(return_code);
912 	}
913 
914 	if (runtime_options.slots == true)
915 	{
916 		return_code = do_node_check_slots(conn,
917 										  runtime_options.output_mode,
918 										  &node_info,
919 										  NULL);
920 		PQfinish(conn);
921 		exit(return_code);
922 	}
923 
924 	if (runtime_options.missing_slots == true)
925 	{
926 		return_code = do_node_check_missing_slots(conn,
927 												  runtime_options.output_mode,
928 												  &node_info,
929 												  NULL);
930 		PQfinish(conn);
931 		exit(return_code);
932 	}
933 
934 	if (runtime_options.data_directory_config == true)
935 	{
936 		return_code = do_node_check_data_directory(conn,
937 												   runtime_options.output_mode,
938 												   &node_info,
939 												   NULL);
940 		PQfinish(conn);
941 		exit(return_code);
942 	}
943 
944 	if (runtime_options.replication_config_owner == true)
945 	{
946 		return_code = do_node_check_replication_config_owner(conn,
947 													   runtime_options.output_mode,
948 													   &node_info,
949 													   NULL);
950 		PQfinish(conn);
951 		exit(return_code);
952 	}
953 
954 
955 	if (runtime_options.output_mode == OM_NAGIOS)
956 	{
957 		log_error(_("--nagios can only be used with a specific check"));
958 		log_hint(_("execute \"repmgr node --help\" for details"));
959 		PQfinish(conn);
960 		exit(ERR_BAD_CONFIG);
961 	}
962 
963 	/* output general overview */
964 
965 	initPQExpBuffer(&output);
966 
967 	/* order functions are called is also output order */
968 	if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
969 		issue_detected = true;
970 
971 	if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
972 		issue_detected = true;
973 
974 	if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
975 		issue_detected = true;
976 
977 	if (do_node_check_upstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
978 		issue_detected = true;
979 
980 	if (do_node_check_downstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
981 		issue_detected = true;
982 
983 	if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
984 		issue_detected = true;
985 
986 	if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
987 		issue_detected = true;
988 
989 	if (do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
990 		issue_detected = true;
991 
992 	if (runtime_options.output_mode == OM_CSV)
993 	{
994 		appendPQExpBuffer(&output,
995 						  "\"Node name\",\"%s\"\n",
996 						  node_info.node_name);
997 
998 		appendPQExpBuffer(&output,
999 						  "\"Node ID\",\"%i\"\n",
1000 						  node_info.node_id);
1001 
1002 		for (cell = status_list.head; cell; cell = cell->next)
1003 		{
1004 			appendPQExpBuffer(&output,
1005 							  "\"%s\",\"%s\"",
1006 							  cell->item,
1007 							  output_check_status(cell->status));
1008 
1009 			if (strlen(cell->details))
1010 			{
1011 				appendPQExpBuffer(&output,
1012 								  ",\"%s\"",
1013 								  cell->details);
1014 			}
1015 			appendPQExpBufferChar(&output, '\n');
1016 		}
1017 	}
1018 	else
1019 	{
1020 		appendPQExpBuffer(&output,
1021 						  "Node \"%s\":\n",
1022 						  node_info.node_name);
1023 
1024 		for (cell = status_list.head; cell; cell = cell->next)
1025 		{
1026 			appendPQExpBuffer(&output,
1027 							  "\t%s: %s",
1028 							  cell->item,
1029 							  output_check_status(cell->status));
1030 
1031 			if (strlen(cell->details))
1032 			{
1033 				appendPQExpBuffer(&output,
1034 								  " (%s)",
1035 								  cell->details);
1036 			}
1037 			appendPQExpBufferChar(&output, '\n');
1038 		}
1039 	}
1040 
1041 
1042 	printf("%s", output.data);
1043 	termPQExpBuffer(&output);
1044 	check_status_list_free(&status_list);
1045 
1046 	PQfinish(conn);
1047 
1048 	if (issue_detected == true)
1049 	{
1050 		exit(ERR_NODE_STATUS);
1051 	}
1052 }
1053 
1054 
1055 static void
do_node_check_replication_connection(void)1056 do_node_check_replication_connection(void)
1057 {
1058 	PGconn *local_conn = NULL;
1059 	PGconn *repl_conn = NULL;
1060 	t_node_info node_record = T_NODE_INFO_INITIALIZER;
1061 	RecordStatus record_status = RECORD_NOT_FOUND;
1062 	PQExpBufferData output;
1063 
1064 
1065 	initPQExpBuffer(&output);
1066 	appendPQExpBufferStr(&output,
1067 						 "--connection=");
1068 
1069 	if (runtime_options.remote_node_id == UNKNOWN_NODE_ID)
1070 	{
1071 		appendPQExpBufferStr(&output, "UNKNOWN");
1072 		printf("%s\n", output.data);
1073 		termPQExpBuffer(&output);
1074 		return;
1075 	}
1076 
1077 	/* retrieve remote node record from local database */
1078 	local_conn = establish_db_connection(config_file_options.conninfo, false);
1079 
1080 	if (PQstatus(local_conn) != CONNECTION_OK)
1081 	{
1082 		appendPQExpBufferStr(&output, "CONNECTION_ERROR");
1083 		printf("%s\n", output.data);
1084 		termPQExpBuffer(&output);
1085 		return;
1086 	}
1087 
1088 	record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
1089 	PQfinish(local_conn);
1090 
1091 	if (record_status != RECORD_FOUND)
1092 	{
1093 		appendPQExpBufferStr(&output, "UNKNOWN");
1094 		printf("%s\n", output.data);
1095 		termPQExpBuffer(&output);
1096 		return;
1097 	}
1098 
1099 	repl_conn = establish_replication_connection_from_conninfo(node_record.conninfo,
1100 															   node_record.repluser);
1101 
1102 	if (PQstatus(repl_conn) != CONNECTION_OK)
1103 	{
1104 		appendPQExpBufferStr(&output, "BAD");
1105 		printf("%s\n", output.data);
1106 		termPQExpBuffer(&output);
1107 		return;
1108 	}
1109 
1110 	PQfinish(repl_conn);
1111 
1112 	appendPQExpBufferStr(&output, "OK");
1113 	printf("%s\n", output.data);
1114 	termPQExpBuffer(&output);
1115 
1116 	return;
1117 }
1118 
1119 
1120 
1121 static CheckStatus
do_node_check_archive_ready(PGconn * conn,OutputMode mode,CheckStatusList * list_output)1122 do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
1123 {
1124 	int			ready_archive_files = 0;
1125 	CheckStatus status = CHECK_STATUS_UNKNOWN;
1126 	PQExpBufferData details;
1127 
1128 	if (mode == OM_CSV && list_output == NULL)
1129 	{
1130 		log_error(_("--csv output not provided with --archive-ready option"));
1131 		PQfinish(conn);
1132 		exit(ERR_BAD_CONFIG);
1133 	}
1134 
1135 	initPQExpBuffer(&details);
1136 
1137 	ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
1138 
1139 	if (ready_archive_files > config_file_options.archive_ready_critical)
1140 	{
1141 		status = CHECK_STATUS_CRITICAL;
1142 
1143 		switch (mode)
1144 		{
1145 			case OM_OPTFORMAT:
1146 				appendPQExpBuffer(&details,
1147 								  "--files=%i --threshold=%i",
1148 								  ready_archive_files, config_file_options.archive_ready_critical);
1149 				break;
1150 			case OM_NAGIOS:
1151 				appendPQExpBuffer(&details,
1152 								  "%i pending archive ready files | files=%i;%i;%i",
1153 								  ready_archive_files,
1154 								  ready_archive_files,
1155 								  config_file_options.archive_ready_warning,
1156 								  config_file_options.archive_ready_critical);
1157 				break;
1158 			case OM_TEXT:
1159 				appendPQExpBuffer(&details,
1160 								  "%i pending archive ready files, critical threshold: %i",
1161 								  ready_archive_files, config_file_options.archive_ready_critical);
1162 				break;
1163 
1164 			default:
1165 				break;
1166 		}
1167 	}
1168 	else if (ready_archive_files > config_file_options.archive_ready_warning)
1169 	{
1170 		status = CHECK_STATUS_WARNING;
1171 
1172 		switch (mode)
1173 		{
1174 			case OM_OPTFORMAT:
1175 				appendPQExpBuffer(&details,
1176 								  "--files=%i --threshold=%i",
1177 								  ready_archive_files, config_file_options.archive_ready_warning);
1178 				break;
1179 			case OM_NAGIOS:
1180 				appendPQExpBuffer(&details,
1181 								  "%i pending archive ready files | files=%i;%i;%i",
1182 								  ready_archive_files,
1183 								  ready_archive_files,
1184 								  config_file_options.archive_ready_warning,
1185 								  config_file_options.archive_ready_critical);
1186 
1187 				break;
1188 			case OM_TEXT:
1189 				appendPQExpBuffer(&details,
1190 								  "%i pending archive ready files (threshold: %i)",
1191 								  ready_archive_files, config_file_options.archive_ready_warning);
1192 				break;
1193 
1194 			default:
1195 				break;
1196 		}
1197 	}
1198 	else if (ready_archive_files < 0)
1199 	{
1200 		status = CHECK_STATUS_UNKNOWN;
1201 
1202 		switch (mode)
1203 		{
1204 			case OM_OPTFORMAT:
1205 				break;
1206 			case OM_NAGIOS:
1207 			case OM_TEXT:
1208 				appendPQExpBufferStr(&details,
1209 									 "unable to check archive_status directory");
1210 				break;
1211 
1212 			default:
1213 				break;
1214 		}
1215 	}
1216 	else
1217 	{
1218 		status = CHECK_STATUS_OK;
1219 
1220 		switch (mode)
1221 		{
1222 			case OM_OPTFORMAT:
1223 				appendPQExpBuffer(&details,
1224 								  "--files=%i", ready_archive_files);
1225 				break;
1226 			case OM_NAGIOS:
1227 				appendPQExpBuffer(&details,
1228 								  "%i pending archive ready files | files=%i;%i;%i",
1229 								  ready_archive_files,
1230 								  ready_archive_files,
1231 								  config_file_options.archive_ready_warning,
1232 								  config_file_options.archive_ready_critical);
1233 				break;
1234 			case OM_TEXT:
1235 				appendPQExpBuffer(&details,
1236 								  "%i pending archive ready files", ready_archive_files);
1237 				break;
1238 
1239 			default:
1240 				break;
1241 		}
1242 	}
1243 
1244 	switch (mode)
1245 	{
1246 		case OM_OPTFORMAT:
1247 			{
1248 				printf("--status=%s %s\n",
1249 					   output_check_status(status),
1250 					   details.data);
1251 			}
1252 			break;
1253 		case OM_NAGIOS:
1254 			printf("REPMGR_ARCHIVE_READY %s: %s\n",
1255 				   output_check_status(status),
1256 				   details.data);
1257 			break;
1258 		case OM_CSV:
1259 		case OM_TEXT:
1260 			if (list_output != NULL)
1261 			{
1262 				check_status_list_set(list_output,
1263 									  "WAL archiving",
1264 									  status,
1265 									  details.data);
1266 			}
1267 			else
1268 			{
1269 				printf("%s (%s)\n",
1270 					   output_check_status(status),
1271 					   details.data);
1272 			}
1273 		default:
1274 			break;
1275 	}
1276 
1277 	termPQExpBuffer(&details);
1278 	return status;
1279 }
1280 
1281 
1282 static CheckStatus
do_node_check_downstream(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1283 do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1284 {
1285 	NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
1286 	NodeInfoListCell *cell = NULL;
1287 	int			missing_nodes_count = 0;
1288 	int			expected_nodes_count = 0;
1289 	CheckStatus status = CHECK_STATUS_OK;
1290 	ItemList	missing_nodes = {NULL, NULL};
1291 	ItemList	attached_nodes = {NULL, NULL};
1292 	PQExpBufferData details;
1293 
1294 	if (mode == OM_CSV && list_output == NULL)
1295 	{
1296 		log_error(_("--csv output not provided with --downstream option"));
1297 		PQfinish(conn);
1298 		exit(ERR_BAD_CONFIG);
1299 	}
1300 
1301 	initPQExpBuffer(&details);
1302 
1303 	get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
1304 
1305 	/* if a witness node is present, we'll need to remove this from the total */
1306 	expected_nodes_count = downstream_nodes.node_count;
1307 
1308 	for (cell = downstream_nodes.head; cell; cell = cell->next)
1309 	{
1310 		/* skip witness server */
1311 		if (cell->node_info->type == WITNESS)
1312 		{
1313 			expected_nodes_count --;
1314 			continue;
1315 		}
1316 
1317 		if (is_downstream_node_attached(conn, cell->node_info->node_name, NULL) != NODE_ATTACHED)
1318 		{
1319 			missing_nodes_count++;
1320 			item_list_append_format(&missing_nodes,
1321 									"%s (ID: %i)",
1322 									cell->node_info->node_name,
1323 									cell->node_info->node_id);
1324 		}
1325 		else
1326 		{
1327 			item_list_append_format(&attached_nodes,
1328 									"%s (ID: %i)",
1329 									cell->node_info->node_name,
1330 									cell->node_info->node_id);
1331 		}
1332 	}
1333 
1334 	if (node_info->type == WITNESS)
1335 	{
1336 		/* witness is not connecting to any upstream */
1337 		appendPQExpBufferStr(&details,
1338 							 _("N/A - node is a witness"));
1339 	}
1340 	else if (missing_nodes_count == 0)
1341 	{
1342 		if (expected_nodes_count == 0)
1343 			appendPQExpBufferStr(&details,
1344 								 "this node has no downstream nodes");
1345 		else
1346 			appendPQExpBuffer(&details,
1347 							  "%i of %i downstream nodes attached",
1348 							  expected_nodes_count - missing_nodes_count,
1349 							  expected_nodes_count);
1350 	}
1351 	else
1352 	{
1353 		ItemListCell *missing_cell = NULL;
1354 		bool		first = true;
1355 
1356 		status = CHECK_STATUS_CRITICAL;
1357 
1358 		appendPQExpBuffer(&details,
1359 						  "%i of %i downstream nodes not attached",
1360 						  missing_nodes_count,
1361 						  expected_nodes_count);
1362 
1363 		if (mode != OM_NAGIOS)
1364 		{
1365 			appendPQExpBufferStr(&details, "; missing: ");
1366 
1367 			for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
1368 			{
1369 				if (first == false)
1370 					appendPQExpBufferStr(&details,
1371 										 ", ");
1372 				else
1373 					first = false;
1374 
1375 				if (first == false)
1376 					appendPQExpBufferStr(&details, missing_cell->string);
1377 			}
1378 		}
1379 	}
1380 
1381 	switch (mode)
1382 	{
1383 		case OM_NAGIOS:
1384 			{
1385 				printf("REPMGR_DOWNSTREAM_SERVERS %s: %s | ",
1386 					   output_check_status(status),
1387 					   details.data);
1388 
1389 				if (missing_nodes_count)
1390 				{
1391 					ItemListCell *missing_cell = NULL;
1392 					bool		first = true;
1393 
1394 					printf("missing: ");
1395 					for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
1396 					{
1397 						if (first == false)
1398 							printf(", ");
1399 						else
1400 							first = false;
1401 
1402 						if (first == false)
1403 							printf("%s", missing_cell->string);
1404 					}
1405 				}
1406 
1407 				if (expected_nodes_count - missing_nodes_count)
1408 				{
1409 					ItemListCell *attached_cell = NULL;
1410 					bool		first = true;
1411 
1412 					if (missing_nodes_count)
1413 						printf("; ");
1414 					printf("attached: ");
1415 					for (attached_cell = attached_nodes.head; attached_cell; attached_cell = attached_cell->next)
1416 					{
1417 						if (first == false)
1418 							printf(", ");
1419 						else
1420 							first = false;
1421 
1422 						if (first == false)
1423 							printf("%s", attached_cell->string);
1424 					}
1425 				}
1426 				printf("\n");
1427 
1428 			}
1429 			break;
1430 		case OM_CSV:
1431 		case OM_TEXT:
1432 			if (list_output != NULL)
1433 			{
1434 				check_status_list_set(list_output,
1435 									  "Downstream servers",
1436 									  status,
1437 									  details.data);
1438 			}
1439 			else
1440 			{
1441 				printf("%s (%s)\n",
1442 					   output_check_status(status),
1443 					   details.data);
1444 			}
1445 		default:
1446 			break;
1447 
1448 	}
1449 	termPQExpBuffer(&details);
1450 	clear_node_info_list(&downstream_nodes);
1451 	return status;
1452 }
1453 
1454 
1455 static CheckStatus
do_node_check_upstream(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1456 do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1457 {
1458 	PGconn	   *upstream_conn = NULL;
1459 	t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
1460 	PQExpBufferData details;
1461 
1462 	CheckStatus status = CHECK_STATUS_OK;
1463 
1464 	if (mode == OM_CSV && list_output == NULL)
1465 	{
1466 		log_error(_("--csv output not provided with --upstream option"));
1467 		PQfinish(conn);
1468 		exit(ERR_BAD_CONFIG);
1469 	}
1470 
1471 	initPQExpBuffer(&details);
1472 
1473 	if (node_info->type == WITNESS)
1474 	{
1475 		/* witness is not connecting to any upstream */
1476 		appendPQExpBufferStr(&details,
1477 							 _("N/A - node is a witness"));
1478 	}
1479 	else if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND)
1480 	{
1481 		if (get_recovery_type(conn) == RECTYPE_STANDBY)
1482 		{
1483 			appendPQExpBuffer(&details,
1484 							  _("node \"%s\" (ID: %i) is a standby but no upstream record found"),
1485 							  node_info->node_name,
1486 							  node_info->node_id);
1487 			status = CHECK_STATUS_CRITICAL;
1488 		}
1489 		else
1490 		{
1491 			appendPQExpBufferStr(&details,
1492 								 _("N/A - node is primary"));
1493 		}
1494 	}
1495 	else
1496 	{
1497 		upstream_conn = establish_db_connection(upstream_node_info.conninfo, true);
1498 
1499 		/* check our node is connected */
1500 		if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) != NODE_ATTACHED)
1501 		{
1502 			appendPQExpBuffer(&details,
1503 							  _("node \"%s\" (ID: %i) is not attached to expected upstream node \"%s\" (ID: %i)"),
1504 							  node_info->node_name,
1505 							  node_info->node_id,
1506 							  upstream_node_info.node_name,
1507 							  upstream_node_info.node_id);
1508 			status = CHECK_STATUS_CRITICAL;
1509 		}
1510 		else
1511 		{
1512 			appendPQExpBuffer(&details,
1513 							  _("node \"%s\" (ID: %i) is attached to expected upstream node \"%s\" (ID: %i)"),
1514 							  node_info->node_name,
1515 							  node_info->node_id,
1516 							  upstream_node_info.node_name,
1517 							  upstream_node_info.node_id);
1518 		}
1519 	}
1520 
1521 	switch (mode)
1522 	{
1523 		case OM_NAGIOS:
1524 			{
1525 				printf("REPMGR_UPSTREAM_SERVER %s: %s | ",
1526 					   output_check_status(status),
1527 					   details.data);
1528 			}
1529 			break;
1530 		case OM_TEXT:
1531 			if (list_output != NULL)
1532 			{
1533 				check_status_list_set(list_output,
1534 									  "Upstream connection",
1535 									  status,
1536 									  details.data);
1537 			}
1538 			else
1539 			{
1540 				printf("%s (%s)\n",
1541 					   output_check_status(status),
1542 					   details.data);
1543 			}
1544 		default:
1545 			break;
1546 	}
1547 
1548 	termPQExpBuffer(&details);
1549 
1550 	return status;
1551 }
1552 
1553 
1554 static CheckStatus
do_node_check_replication_lag(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1555 do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1556 {
1557 	CheckStatus status = CHECK_STATUS_OK;
1558 	int			lag_seconds = 0;
1559 	PQExpBufferData details;
1560 
1561 	if (mode == OM_CSV && list_output == NULL)
1562 	{
1563 		log_error(_("--csv output not provided with --replication-lag option"));
1564 		PQfinish(conn);
1565 		exit(ERR_BAD_CONFIG);
1566 	}
1567 
1568 	initPQExpBuffer(&details);
1569 
1570 	if (node_info->recovery_type == RECTYPE_PRIMARY)
1571 	{
1572 		switch (mode)
1573 		{
1574 			case OM_OPTFORMAT:
1575 				appendPQExpBufferStr(&details,
1576 									 "--lag=0");
1577 				break;
1578 			case OM_NAGIOS:
1579 				appendPQExpBuffer(&details,
1580 								  "0 seconds | lag=0;%i;%i",
1581 								  config_file_options.replication_lag_warning,
1582 								  config_file_options.replication_lag_critical);
1583 				break;
1584 			case OM_TEXT:
1585 				if (node_info->type == WITNESS)
1586 				{
1587 					appendPQExpBufferStr(&details,
1588 										 "N/A - node is witness");
1589 				}
1590 				else
1591 				{
1592 					appendPQExpBufferStr(&details,
1593 										 "N/A - node is primary");
1594 				}
1595 				break;
1596 			default:
1597 				break;
1598 		}
1599 	}
1600 	else
1601 	{
1602 		lag_seconds = get_replication_lag_seconds(conn);
1603 
1604 		log_debug("lag seconds: %i", lag_seconds);
1605 
1606 		if (lag_seconds >= config_file_options.replication_lag_critical)
1607 		{
1608 			status = CHECK_STATUS_CRITICAL;
1609 
1610 			switch (mode)
1611 			{
1612 				case OM_OPTFORMAT:
1613 					appendPQExpBuffer(&details,
1614 									  "--lag=%i --threshold=%i",
1615 									  lag_seconds, config_file_options.replication_lag_critical);
1616 					break;
1617 				case OM_NAGIOS:
1618 					appendPQExpBuffer(&details,
1619 									  "%i seconds | lag=%i;%i;%i",
1620 									  lag_seconds,
1621 									  lag_seconds,
1622 									  config_file_options.replication_lag_warning,
1623 									  config_file_options.replication_lag_critical);
1624 					break;
1625 				case OM_TEXT:
1626 					appendPQExpBuffer(&details,
1627 									  "%i seconds, critical threshold: %i)",
1628 									  lag_seconds, config_file_options.replication_lag_critical);
1629 					break;
1630 
1631 				default:
1632 					break;
1633 			}
1634 		}
1635 		else if (lag_seconds > config_file_options.replication_lag_warning)
1636 		{
1637 			status = CHECK_STATUS_WARNING;
1638 
1639 			switch (mode)
1640 			{
1641 				case OM_OPTFORMAT:
1642 					appendPQExpBuffer(&details,
1643 									  "--lag=%i --threshold=%i",
1644 									  lag_seconds, config_file_options.replication_lag_warning);
1645 					break;
1646 				case OM_NAGIOS:
1647 					appendPQExpBuffer(&details,
1648 									  "%i seconds | lag=%i;%i;%i",
1649 									  lag_seconds,
1650 									  lag_seconds,
1651 									  config_file_options.replication_lag_warning,
1652 									  config_file_options.replication_lag_critical);
1653 					break;
1654 				case OM_TEXT:
1655 					appendPQExpBuffer(&details,
1656 									  "%i seconds, warning threshold: %i)",
1657 									  lag_seconds, config_file_options.replication_lag_warning);
1658 					break;
1659 
1660 				default:
1661 					break;
1662 			}
1663 		}
1664 		else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
1665 		{
1666 			status = CHECK_STATUS_UNKNOWN;
1667 
1668 			switch (mode)
1669 			{
1670 				case OM_OPTFORMAT:
1671 					break;
1672 				case OM_NAGIOS:
1673 				case OM_TEXT:
1674 					appendPQExpBufferStr(&details,
1675 										 "unable to query replication lag");
1676 					break;
1677 
1678 				default:
1679 					break;
1680 			}
1681 		}
1682 		else
1683 		{
1684 			status = CHECK_STATUS_OK;
1685 
1686 			switch (mode)
1687 			{
1688 				case OM_OPTFORMAT:
1689 					appendPQExpBuffer(&details,
1690 									  "--lag=%i",
1691 									  lag_seconds);
1692 					break;
1693 				case OM_NAGIOS:
1694 					appendPQExpBuffer(&details,
1695 									  "%i seconds | lag=%i;%i;%i",
1696 									  lag_seconds,
1697 									  lag_seconds,
1698 									  config_file_options.replication_lag_warning,
1699 									  config_file_options.replication_lag_critical);
1700 					break;
1701 				case OM_TEXT:
1702 					appendPQExpBuffer(&details,
1703 									  "%i seconds",
1704 									  lag_seconds);
1705 					break;
1706 
1707 				default:
1708 					break;
1709 			}
1710 		}
1711 	}
1712 
1713 	switch (mode)
1714 	{
1715 		case OM_OPTFORMAT:
1716 			printf("--status=%s %s\n",
1717 				   output_check_status(status),
1718 				   details.data);
1719 			break;
1720 		case OM_NAGIOS:
1721 			printf("REPMGR_REPLICATION_LAG %s: %s\n",
1722 				   output_check_status(status),
1723 				   details.data);
1724 			break;
1725 		case OM_CSV:
1726 		case OM_TEXT:
1727 			if (list_output != NULL)
1728 			{
1729 				check_status_list_set(list_output,
1730 									  "Replication lag",
1731 									  status,
1732 									  details.data);
1733 			}
1734 			else
1735 			{
1736 				printf("%s (%s)\n",
1737 					   output_check_status(status),
1738 					   details.data);
1739 			}
1740 		default:
1741 			break;
1742 	}
1743 
1744 	termPQExpBuffer(&details);
1745 
1746 	return status;
1747 }
1748 
1749 
1750 static CheckStatus
do_node_check_role(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1751 do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1752 {
1753 
1754 	CheckStatus status = CHECK_STATUS_OK;
1755 	PQExpBufferData details;
1756 	RecoveryType recovery_type = get_recovery_type(conn);
1757 
1758 	if (mode == OM_CSV && list_output == NULL)
1759 	{
1760 		log_error(_("--csv output not provided with --role option"));
1761 		PQfinish(conn);
1762 		exit(ERR_BAD_CONFIG);
1763 	}
1764 
1765 	initPQExpBuffer(&details);
1766 
1767 	switch (node_info->type)
1768 	{
1769 		case PRIMARY:
1770 			if (recovery_type == RECTYPE_STANDBY)
1771 			{
1772 				status = CHECK_STATUS_CRITICAL;
1773 				appendPQExpBufferStr(&details,
1774 									 _("node is registered as primary but running as standby"));
1775 			}
1776 			else
1777 			{
1778 				appendPQExpBufferStr(&details,
1779 									 _("node is primary"));
1780 			}
1781 			break;
1782 		case STANDBY:
1783 			if (recovery_type == RECTYPE_PRIMARY)
1784 			{
1785 				status = CHECK_STATUS_CRITICAL;
1786 				appendPQExpBufferStr(&details,
1787 									 _("node is registered as standby but running as primary"));
1788 			}
1789 			else
1790 			{
1791 				appendPQExpBufferStr(&details,
1792 									 _("node is standby"));
1793 			}
1794 			break;
1795 		case WITNESS:
1796 			if (recovery_type == RECTYPE_STANDBY)
1797 			{
1798 				status = CHECK_STATUS_CRITICAL;
1799 				appendPQExpBufferStr(&details,
1800 									 _("node is registered as witness but running as standby"));
1801 			}
1802 			else
1803 			{
1804 				appendPQExpBufferStr(&details,
1805 									 _("node is witness"));
1806 			}
1807 			break;
1808 		default:
1809 			break;
1810 	}
1811 
1812 	switch (mode)
1813 	{
1814 		case OM_NAGIOS:
1815 			printf("REPMGR_SERVER_ROLE %s: %s\n",
1816 				   output_check_status(status),
1817 				   details.data);
1818 			break;
1819 		case OM_CSV:
1820 		case OM_TEXT:
1821 			if (list_output != NULL)
1822 			{
1823 				check_status_list_set(list_output,
1824 									  "Server role",
1825 									  status,
1826 									  details.data);
1827 			}
1828 			else
1829 			{
1830 				printf("%s (%s)\n",
1831 					   output_check_status(status),
1832 					   details.data);
1833 			}
1834 		default:
1835 			break;
1836 	}
1837 
1838 	termPQExpBuffer(&details);
1839 	return status;
1840 
1841 }
1842 
1843 
1844 static CheckStatus
do_node_check_slots(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1845 do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1846 {
1847 	CheckStatus status = CHECK_STATUS_OK;
1848 	PQExpBufferData details;
1849 
1850 	if (mode == OM_CSV && list_output == NULL)
1851 	{
1852 		log_error(_("--csv output not provided with --slots option"));
1853 		PQfinish(conn);
1854 		exit(ERR_BAD_CONFIG);
1855 	}
1856 
1857 	initPQExpBuffer(&details);
1858 
1859 	if (node_info->total_replication_slots == 0)
1860 	{
1861 		appendPQExpBufferStr(&details,
1862 							 _("node has no physical replication slots"));
1863 	}
1864 	else if (node_info->inactive_replication_slots == 0)
1865 	{
1866 		appendPQExpBuffer(&details,
1867 						  _("%i of %i physical replication slots are active"),
1868 						  node_info->total_replication_slots,
1869 						  node_info->total_replication_slots);
1870 	}
1871 	else if (node_info->inactive_replication_slots > 0)
1872 	{
1873 		status = CHECK_STATUS_CRITICAL;
1874 
1875 		appendPQExpBuffer(&details,
1876 						  _("%i of %i physical replication slots are inactive"),
1877 						  node_info->inactive_replication_slots,
1878 						  node_info->total_replication_slots);
1879 	}
1880 
1881 	switch (mode)
1882 	{
1883 		case OM_NAGIOS:
1884 			printf("REPMGR_INACTIVE_SLOTS %s: %s | slots=%i;%i\n",
1885 				   output_check_status(status),
1886 				   details.data,
1887 				   node_info->total_replication_slots,
1888 				   node_info->inactive_replication_slots);
1889 			break;
1890 		case OM_CSV:
1891 		case OM_TEXT:
1892 			if (list_output != NULL)
1893 			{
1894 				check_status_list_set(list_output,
1895 									  "Replication slots",
1896 									  status,
1897 									  details.data);
1898 			}
1899 			else
1900 			{
1901 				printf("%s (%s)\n",
1902 					   output_check_status(status),
1903 					   details.data);
1904 			}
1905 		default:
1906 			break;
1907 	}
1908 
1909 	termPQExpBuffer(&details);
1910 	return status;
1911 }
1912 
1913 
1914 static CheckStatus
do_node_check_missing_slots(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1915 do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1916 {
1917 	CheckStatus status = CHECK_STATUS_OK;
1918 	PQExpBufferData details;
1919 	NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
1920 
1921 	if (mode == OM_CSV && list_output == NULL)
1922 	{
1923 		log_error(_("--csv output not provided with --missing-slots option"));
1924 		PQfinish(conn);
1925 		exit(ERR_BAD_CONFIG);
1926 	}
1927 
1928 	initPQExpBuffer(&details);
1929 
1930 	get_downstream_nodes_with_missing_slot(conn,
1931 										   config_file_options.node_id,
1932 										   &missing_slots);
1933 
1934 	if (missing_slots.node_count == 0)
1935 	{
1936 		appendPQExpBufferStr(&details,
1937 							 _("node has no missing physical replication slots"));
1938 	}
1939 	else
1940 	{
1941 		NodeInfoListCell *missing_slot_cell = NULL;
1942 		bool first_element = true;
1943 
1944 		status = CHECK_STATUS_CRITICAL;
1945 
1946 		appendPQExpBuffer(&details,
1947 						  _("%i physical replication slots are missing"),
1948 						  missing_slots.node_count);
1949 
1950 		if (missing_slots.node_count)
1951 		{
1952 			appendPQExpBufferStr(&details, ": ");
1953 
1954 			for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
1955 			{
1956 				if (first_element == true)
1957 				{
1958 					first_element = false;
1959 				}
1960 				else
1961 				{
1962 					appendPQExpBufferStr(&details, ", ");
1963 				}
1964 
1965 				appendPQExpBufferStr(&details, missing_slot_cell->node_info->slot_name);
1966 			}
1967 		}
1968 	}
1969 
1970 	switch (mode)
1971 	{
1972 		case OM_NAGIOS:
1973 		{
1974 			printf("REPMGR_MISSING_SLOTS %s: %s | missing_slots=%i",
1975 				   output_check_status(status),
1976 				   details.data,
1977 				   missing_slots.node_count);
1978 
1979 			if (missing_slots.node_count)
1980 			{
1981 				NodeInfoListCell *missing_slot_cell = NULL;
1982 				bool first_element = true;
1983 
1984 				printf(";");
1985 
1986 				for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
1987 				{
1988 					if (first_element == true)
1989 					{
1990 						first_element = false;
1991 					}
1992 					else
1993 					{
1994 						printf(",");
1995 					}
1996 					printf("%s", missing_slot_cell->node_info->slot_name);
1997 				}
1998 			}
1999 			printf("\n");
2000 			break;
2001 		}
2002 		case OM_CSV:
2003 		case OM_TEXT:
2004 			if (list_output != NULL)
2005 			{
2006 				check_status_list_set(list_output,
2007 									  "Missing physical replication slots",
2008 									  status,
2009 									  details.data);
2010 			}
2011 			else
2012 			{
2013 				printf("%s (%s)\n",
2014 					   output_check_status(status),
2015 					   details.data);
2016 			}
2017 		default:
2018 			break;
2019 	}
2020 
2021 	clear_node_info_list(&missing_slots);
2022 
2023 	termPQExpBuffer(&details);
2024 	return status;
2025 }
2026 
2027 
2028 CheckStatus
do_node_check_data_directory(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)2029 do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
2030 {
2031 	CheckStatus status = CHECK_STATUS_OK;
2032 	char actual_data_directory[MAXPGPATH] = "";
2033 	PQExpBufferData details;
2034 
2035 	if (mode == OM_CSV && list_output == NULL)
2036 	{
2037 		log_error(_("--csv output not provided with --data-directory-config option"));
2038 		PQfinish(conn);
2039 		exit(ERR_BAD_CONFIG);
2040 	}
2041 
2042 	initPQExpBuffer(&details);
2043 
2044 	/*
2045 	 * Check actual data directory matches that in repmgr.conf; note this requires
2046 	 * a superuser connection
2047 	 */
2048 	if (connection_has_pg_monitor_role(conn, "pg_read_all_settings") == true)
2049 	{
2050 		/* we expect to have a database connection */
2051 		if (get_pg_setting(conn, "data_directory", actual_data_directory) == false)
2052 		{
2053 			appendPQExpBuffer(&details,
2054 							  _("unable to determine current \"data_directory\""));
2055 			status = CHECK_STATUS_UNKNOWN;
2056 		}
2057 
2058 		if (strncmp(actual_data_directory, config_file_options.data_directory, MAXPGPATH) != 0)
2059 		{
2060 			if (mode != OM_NAGIOS)
2061 			{
2062 				appendPQExpBuffer(&details,
2063 								  _("configured \"data_directory\" is \"%s\"; "),
2064 								  config_file_options.data_directory);
2065 			}
2066 
2067 			appendPQExpBuffer(&details,
2068 							  "actual data directory is \"%s\"",
2069 							  actual_data_directory);
2070 
2071 			status = CHECK_STATUS_CRITICAL;
2072 		}
2073 		else
2074 		{
2075 			appendPQExpBuffer(&details,
2076 							  _("configured \"data_directory\" is \"%s\""),
2077 							  config_file_options.data_directory);
2078 		}
2079 	}
2080 	/*
2081 	 * If no superuser connection available, sanity-check that the configuration directory looks
2082 	 * like a PostgreSQL directory and hope it's the right one.
2083 	 */
2084 	else
2085 	{
2086 		if (mode == OM_TEXT)
2087 		{
2088 			log_info(_("connection is not a superuser connection, falling back to simple check"));
2089 
2090 			if (PQserverVersion(conn) >= 100000)
2091 			{
2092 				log_hint(_("provide a superuser with -S/--superuser, or add the \"%s\" user to role \"pg_read_all_settings\" or \"pg_monitor\""),
2093 						   PQuser(conn));
2094 			}
2095 		}
2096 
2097 		if (is_pg_dir(config_file_options.data_directory) == false)
2098 		{
2099 			if (mode == OM_NAGIOS)
2100 			{
2101 				appendPQExpBufferStr(&details,
2102 								  _("configured \"data_directory\" is not a PostgreSQL data directory"));
2103 			}
2104 			else
2105 			{
2106 				appendPQExpBuffer(&details,
2107 								  _("configured \"data_directory\" \"%s\" is not a PostgreSQL data directory"),
2108 								  actual_data_directory);
2109 			}
2110 
2111 			status = CHECK_STATUS_CRITICAL;
2112 		}
2113 		else
2114 		{
2115 			appendPQExpBuffer(&details,
2116 							  _("configured \"data_directory\" is \"%s\""),
2117 							  config_file_options.data_directory);
2118 		}
2119 	}
2120 
2121 	switch (mode)
2122 	{
2123 		case OM_OPTFORMAT:
2124 			printf("--configured-data-directory=%s\n",
2125 				   output_check_status(status));
2126 			break;
2127 		case OM_NAGIOS:
2128 			printf("REPMGR_DATA_DIRECTORY %s: %s",
2129 				   output_check_status(status),
2130 				   config_file_options.data_directory);
2131 
2132 			if (status == CHECK_STATUS_CRITICAL)
2133 			{
2134 				printf(" | %s", details.data);
2135 			}
2136 			puts("");
2137 			break;
2138 		case OM_CSV:
2139 		case OM_TEXT:
2140 			if (list_output != NULL)
2141 			{
2142 				check_status_list_set(list_output,
2143 									  "Configured data directory",
2144 									  status,
2145 									  details.data);
2146 			}
2147 			else
2148 			{
2149 				printf("%s (%s)\n",
2150 					   output_check_status(status),
2151 					   details.data);
2152 			}
2153 		default:
2154 			break;
2155 	}
2156 
2157 	termPQExpBuffer(&details);
2158 
2159 	return status;
2160 }
2161 
2162 /*
2163  * This is not included in the general list output
2164  */
2165 static
do_node_check_replication_config_owner(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)2166 CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
2167 {
2168 	CheckStatus status = CHECK_STATUS_OK;
2169 
2170 	PQExpBufferData errmsg;
2171 	PQExpBufferData details;
2172 
2173 	if (mode != OM_OPTFORMAT)
2174 	{
2175 		log_error(_("--replication-config-owner option can only be used with --optformat"));
2176 		PQfinish(conn);
2177 		exit(ERR_BAD_CONFIG);
2178 	}
2179 
2180 	initPQExpBuffer(&errmsg);
2181 	initPQExpBuffer(&details);
2182 
2183 	if (check_replication_config_owner(PQserverVersion(conn),
2184 									   config_file_options.data_directory,
2185 									   &errmsg, &details) == false)
2186 	{
2187 		status = CHECK_STATUS_CRITICAL;
2188 	}
2189 
2190 	printf("--replication-config-owner=%s\n",
2191 		   output_check_status(status));
2192 
2193 	return status;
2194 }
2195 
2196 
2197 /*
2198  * This is not included in the general list output
2199  */
2200 static CheckStatus
do_node_check_db_connection(PGconn * conn,OutputMode mode)2201 do_node_check_db_connection(PGconn *conn, OutputMode mode)
2202 {
2203 	CheckStatus status = CHECK_STATUS_OK;
2204 	PQExpBufferData details;
2205 
2206 	if (mode == OM_CSV)
2207 	{
2208 		log_error(_("--csv output not provided with --db-connection option"));
2209 		PQfinish(conn);
2210 		exit(ERR_BAD_CONFIG);
2211 	}
2212 
2213 	/* This check is for configuration diagnostics only */
2214 	if (mode == OM_NAGIOS)
2215 	{
2216 		log_error(_("--nagios output not provided with --db-connection option"));
2217 		PQfinish(conn);
2218 		exit(ERR_BAD_CONFIG);
2219 	}
2220 
2221 	initPQExpBuffer(&details);
2222 
2223 	if (PQstatus(conn) != CONNECTION_OK)
2224 	{
2225 		t_conninfo_param_list conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
2226 		int c;
2227 
2228 		status = CHECK_STATUS_CRITICAL;
2229 		initialize_conninfo_params(&conninfo, false);
2230 		conn_to_param_list(conn, &conninfo);
2231 
2232 		appendPQExpBufferStr(&details,
2233 							 "connection parameters used:");
2234 		for (c = 0; c < conninfo.size && conninfo.keywords[c] != NULL; c++)
2235 		{
2236 			if (conninfo.values[c] != NULL && conninfo.values[c][0] != '\0')
2237 			{
2238 				appendPQExpBuffer(&details,
2239 								  " %s=%s",
2240 								  conninfo.keywords[c], conninfo.values[c]);
2241 			}
2242 		}
2243 
2244 	}
2245 
2246 	if (mode == OM_OPTFORMAT)
2247 	{
2248 		printf("--db-connection=%s\n",
2249 			   output_check_status(status));
2250 	}
2251 	else if (mode == OM_TEXT)
2252 	{
2253 		printf("%s (%s)\n",
2254 			   output_check_status(status),
2255 			   details.data);
2256 	}
2257 	termPQExpBuffer(&details);
2258 
2259 	return status;
2260 }
2261 
2262 
2263 void
do_node_service(void)2264 do_node_service(void)
2265 {
2266 	t_server_action action = ACTION_UNKNOWN;
2267 	char		data_dir[MAXPGPATH] = "";
2268 	char		command[MAXLEN] = "";
2269 	PQExpBufferData output;
2270 
2271 	action = parse_server_action(runtime_options.action);
2272 
2273 	if (action == ACTION_UNKNOWN)
2274 	{
2275 		log_error(_("unknown value \"%s\" provided for parameter --action"),
2276 				  runtime_options.action);
2277 		log_hint(_("valid values are \"start\", \"stop\", \"restart\", \"reload\" and \"promote\""));
2278 		exit(ERR_BAD_CONFIG);
2279 	}
2280 
2281 	if (runtime_options.list_actions == true)
2282 	{
2283 		return _do_node_service_list_actions(action);
2284 	}
2285 
2286 
2287 	if (data_dir_required_for_action(action))
2288 	{
2289 		get_node_config_directory(data_dir);
2290 
2291 		if (data_dir[0] == '\0')
2292 		{
2293 			log_error(_("unable to determine data directory for action"));
2294 			exit(ERR_BAD_CONFIG);
2295 		}
2296 	}
2297 
2298 
2299 	if ((action == ACTION_STOP || action == ACTION_RESTART) && runtime_options.checkpoint == true)
2300 	{
2301 		PGconn	   *conn = NULL;
2302 
2303 		if (config_file_options.conninfo[0] != '\0')
2304 		{
2305 			/*
2306 			 * If --superuser option provided, attempt to connect as the specified user
2307 			 */
2308 			if (runtime_options.superuser[0] != '\0')
2309 			{
2310 				conn = establish_db_connection_with_replacement_param(
2311 					config_file_options.conninfo,
2312 					"user",
2313 					runtime_options.superuser,
2314 					true);
2315 			}
2316 			else
2317 			{
2318 				conn = establish_db_connection(config_file_options.conninfo, true);
2319 			}
2320 		}
2321 		else
2322 		{
2323 			conn = establish_db_connection_by_params(&source_conninfo, true);
2324 		}
2325 
2326 		if (is_superuser_connection(conn, NULL) == false)
2327 		{
2328 			if (runtime_options.dry_run == true)
2329 			{
2330 				log_warning(_("a CHECKPOINT would be issued here but no superuser connection is available"));
2331 			}
2332 			else
2333 			{
2334 				log_warning(_("a superuser connection is required to issue a CHECKPOINT"));
2335 			}
2336 
2337 			log_hint(_("provide a superuser with -S/--superuser"));
2338 		}
2339 		else
2340 		{
2341 			if (runtime_options.dry_run == true)
2342 			{
2343 				log_info(_("a CHECKPOINT would be issued here"));
2344 			}
2345 			else
2346 			{
2347 
2348 				log_notice(_("issuing CHECKPOINT on node \"%s\" (ID: %i) "),
2349 						   config_file_options.node_name,
2350 						   config_file_options.node_id);
2351 
2352 				checkpoint(conn);
2353 			}
2354 		}
2355 
2356 		PQfinish(conn);
2357 	}
2358 
2359 	get_server_action(action, command, data_dir);
2360 
2361 	if (runtime_options.dry_run == true)
2362 	{
2363 		log_info(_("would execute server command \"%s\""), command);
2364 		return;
2365 	}
2366 
2367 	/*
2368 	 * log level is "DETAIL" here as this command is intended to be executed
2369 	 * by another repmgr process (e.g. during standby switchover); that repmgr
2370 	 * should emit a "NOTICE" about the intent of the command.
2371 	 */
2372 	log_detail(_("executing server command \"%s\""), command);
2373 
2374 	initPQExpBuffer(&output);
2375 
2376 	if (local_command(command, &output) == false)
2377 	{
2378 		termPQExpBuffer(&output);
2379 		exit(ERR_LOCAL_COMMAND);
2380 	}
2381 
2382 	termPQExpBuffer(&output);
2383 }
2384 
2385 
2386 static void
_do_node_service_list_actions(t_server_action action)2387 _do_node_service_list_actions(t_server_action action)
2388 {
2389 	char		command[MAXLEN] = "";
2390 
2391 	char		data_dir[MAXPGPATH] = "";
2392 
2393 	bool		data_dir_required = false;
2394 
2395 	/* do we need to provide a data directory for any of the actions? */
2396 	if (data_dir_required_for_action(ACTION_START))
2397 		data_dir_required = true;
2398 
2399 	if (data_dir_required_for_action(ACTION_STOP))
2400 		data_dir_required = true;
2401 
2402 	if (data_dir_required_for_action(ACTION_RESTART))
2403 		data_dir_required = true;
2404 
2405 	if (data_dir_required_for_action(ACTION_RELOAD))
2406 		data_dir_required = true;
2407 
2408 	if (data_dir_required_for_action(ACTION_PROMOTE))
2409 		data_dir_required = true;
2410 
2411 	if (data_dir_required == true)
2412 	{
2413 		get_node_config_directory(data_dir);
2414 	}
2415 
2416 	/* show command for specific action only */
2417 	if (action != ACTION_NONE)
2418 	{
2419 		get_server_action(action, command, data_dir);
2420 		printf("%s\n", command);
2421 		return;
2422 	}
2423 
2424 	puts(_("Following commands would be executed for each action:"));
2425 	puts("");
2426 
2427 	get_server_action(ACTION_START, command, data_dir);
2428 	printf("    start: \"%s\"\n", command);
2429 
2430 	get_server_action(ACTION_STOP, command, data_dir);
2431 	printf("     stop: \"%s\"\n", command);
2432 
2433 	get_server_action(ACTION_RESTART, command, data_dir);
2434 	printf("  restart: \"%s\"\n", command);
2435 
2436 	get_server_action(ACTION_RELOAD, command, data_dir);
2437 	printf("   reload: \"%s\"\n", command);
2438 
2439 	get_server_action(ACTION_PROMOTE, command, data_dir);
2440 	printf("  promote: \"%s\"\n", command);
2441 
2442 	puts("");
2443 
2444 }
2445 
2446 
2447 static t_server_action
parse_server_action(const char * action_name)2448 parse_server_action(const char *action_name)
2449 {
2450 	if (action_name[0] == '\0')
2451 		return ACTION_NONE;
2452 
2453 	if (strcasecmp(action_name, "start") == 0)
2454 		return ACTION_START;
2455 
2456 	if (strcasecmp(action_name, "stop") == 0)
2457 		return ACTION_STOP;
2458 
2459 	if (strcasecmp(action_name, "restart") == 0)
2460 		return ACTION_RESTART;
2461 
2462 	if (strcasecmp(action_name, "reload") == 0)
2463 		return ACTION_RELOAD;
2464 
2465 	if (strcasecmp(action_name, "promote") == 0)
2466 		return ACTION_PROMOTE;
2467 
2468 	return ACTION_UNKNOWN;
2469 }
2470 
2471 
2472 
2473 /*
2474  * Rejoin a dormant (shut down) node to the replication cluster; this
2475  * is typically a former primary which needs to be demoted to a standby.
2476  *
2477  * Note that "repmgr node rejoin" is also executed by
2478  * "repmgr standby switchover" after promoting the new primary.
2479  *
2480  * Parameters:
2481  *   --dry-run
2482  *   --force-rewind[=VALUE]
2483  *   --config-files
2484  *   --config-archive-dir
2485  *   -W/--no-wait
2486  */
2487 void
do_node_rejoin(void)2488 do_node_rejoin(void)
2489 {
2490 	PGconn	   *upstream_conn = NULL;
2491 	RecoveryType primary_recovery_type = RECTYPE_UNKNOWN;
2492 	PGconn	   *primary_conn = NULL;
2493 
2494 	DBState		db_state;
2495 	PGPing		status;
2496 	bool		is_shutdown = true;
2497 	int			server_version_num = UNKNOWN_SERVER_VERSION_NUM;
2498 	bool		hide_standby_signal = false;
2499 
2500 	PQExpBufferData command;
2501 	PQExpBufferData command_output;
2502 	PQExpBufferData follow_output;
2503 	struct stat statbuf;
2504 	t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
2505 	t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
2506 
2507 	bool		success = true;
2508 	int			follow_error_code = SUCCESS;
2509 
2510 	/* check node is not actually running */
2511 	status = PQping(config_file_options.conninfo);
2512 
2513 	switch (status)
2514 	{
2515 		case PQPING_NO_ATTEMPT:
2516 			log_error(_("unable to determine status of server"));
2517 			exit(ERR_BAD_CONFIG);
2518 		case PQPING_OK:
2519 			is_shutdown = false;
2520 			break;
2521 		case PQPING_REJECT:
2522 			is_shutdown = false;
2523 			break;
2524 		case PQPING_NO_RESPONSE:
2525 			/* status not yet clear */
2526 			break;
2527 	}
2528 
2529 	if (get_db_state(config_file_options.data_directory, &db_state) == false)
2530 	{
2531 		log_error(_("unable to determine database state from pg_control"));
2532 		exit(ERR_BAD_CONFIG);
2533 	}
2534 
2535 	if (is_shutdown == false)
2536 	{
2537 		log_error(_("database is still running in state \"%s\""),
2538 				  describe_db_state(db_state));
2539 		log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node"));
2540 		exit(ERR_REJOIN_FAIL);
2541 	}
2542 
2543 	/*
2544 	 * Server version number required to determine whether pg_rewind will run
2545 	 * crash recovery (Pg 13 and later).
2546 	 */
2547 	server_version_num = get_pg_version(config_file_options.data_directory, NULL);
2548 
2549 	if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
2550 	{
2551 		/* This is very unlikely to happen */
2552 		log_error(_("unable to determine database version"));
2553 		exit(ERR_BAD_CONFIG);
2554 	}
2555 
2556 	log_verbose(LOG_DEBUG, "server version number is: %i", server_version_num);
2557 
2558 	/* check if cleanly shut down */
2559 	if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
2560 	{
2561 		if (db_state == DB_SHUTDOWNING)
2562 		{
2563 			log_error(_("database is still shutting down"));
2564 		}
2565 		else if (server_version_num >= 130000 && runtime_options.force_rewind_used == true)
2566 		{
2567 			log_warning(_("database is not shut down cleanly"));
2568 			log_detail(_("--force-rewind provided, pg_rewind will automatically perform recovery"));
2569 
2570 			/*
2571 			 * If pg_rewind is executed, the first change it will make
2572 			 * is to start the server in single user mode, which will fail
2573 			 * in the presence of "standby.signal", so we'll "hide" it
2574 			 * (actually delete and recreate).
2575 			 */
2576 			hide_standby_signal = true;
2577 		}
2578 		else
2579 		{
2580 			/*
2581 			 * If the database was not shut down cleanly, it *might* rejoin correctly
2582 			 * after starting up and recovering, but better to ensure the database
2583 			 * can recover before trying anything else.
2584 			 */
2585 			log_error(_("database is not shut down cleanly"));
2586 
2587 			if (server_version_num >= 130000)
2588 			{
2589 				log_hint(_("provide --force-rewind to run recovery"));
2590 			}
2591 			else
2592 			{
2593 				if (runtime_options.force_rewind_used == true)
2594 				{
2595 					log_detail(_("pg_rewind will not be able to run"));
2596 				}
2597 				log_hint(_("database should be restarted then shut down cleanly after crash recovery completes"));
2598 			}
2599 
2600 			exit(ERR_REJOIN_FAIL);
2601 		}
2602 	}
2603 
2604 	/* check provided upstream connection */
2605 	upstream_conn = establish_db_connection_by_params(&source_conninfo, true);
2606 
2607 	if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
2608 	{
2609 		log_error(_("unable to retrieve primary node record"));
2610 		log_hint(_("check the provided database connection string is for a \"repmgr\" database"));
2611 		PQfinish(upstream_conn);
2612 		exit(ERR_BAD_CONFIG);
2613 	}
2614 
2615 	/* connect to registered primary and check it's not in recovery */
2616 	primary_conn = establish_db_connection(primary_node_record.conninfo, false);
2617 
2618 	if (PQstatus(primary_conn) != CONNECTION_OK)
2619 	{
2620 		RecoveryType upstream_recovery_type = get_recovery_type(upstream_conn);
2621 
2622 		log_error(_("unable to connect to current registered primary \"%s\" (ID: %i)"),
2623 				  primary_node_record.node_name,
2624 				  primary_node_record.node_id);
2625 		log_detail(_("registered primary node conninfo is: \"%s\""),
2626 				   primary_node_record.conninfo);
2627 		/*
2628 		 * Catch case where provided upstream is not in recovery, but is also
2629 		 * not registered as primary
2630 		 */
2631 
2632 		if (upstream_recovery_type == RECTYPE_PRIMARY)
2633 		{
2634 			log_warning(_("provided upstream connection string is for a server which is not in recovery, but not registered as primary"));
2635 			log_hint(_("fix repmgr metadata configuration before continuing"));
2636 		}
2637 
2638 		PQfinish(upstream_conn);
2639 		exit(ERR_BAD_CONFIG);
2640 	}
2641 
2642 	PQfinish(upstream_conn);
2643 
2644 	primary_recovery_type = get_recovery_type(primary_conn);
2645 
2646 	if (primary_recovery_type != RECTYPE_PRIMARY)
2647 	{
2648 		log_error(_("primary server is registered as node \"%s\" (ID: %i), but server is not a primary"),
2649 				  primary_node_record.node_name,
2650 				  primary_node_record.node_id);
2651 		/* TODO: hint about checking cluster */
2652 		PQfinish(primary_conn);
2653 
2654 		exit(ERR_BAD_CONFIG);
2655 	}
2656 
2657 	/*
2658 	 * Fetch the local node record - we'll need this later, and it acts as an
2659 	 * additional sanity-check that the node is known to the primary.
2660 	 */
2661 	if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND)
2662 	{
2663 		log_error(_("unable to retrieve node record for the local node"));
2664 		log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
2665 				 primary_node_record.node_name,
2666 				 primary_node_record.node_id);
2667 
2668 		PQfinish(primary_conn);
2669 		exit(ERR_BAD_CONFIG);
2670 	}
2671 
2672 	/*
2673 	 * Sanity-check replication slot availability
2674 	 */
2675 	if (config_file_options.use_replication_slots)
2676 	{
2677 		bool slots_available = check_replication_slots_available(primary_node_record.node_id,
2678 																 primary_conn);
2679 		if (slots_available == false)
2680 		{
2681 			PQfinish(primary_conn);
2682 			exit(ERR_BAD_CONFIG);
2683 		}
2684 	}
2685 
2686 
2687 	/*
2688 	 * sanity-check that it will actually be possible to stream from the new upstream
2689 	 */
2690 	{
2691 		bool can_rejoin;
2692 		TimeLineID tli = get_min_recovery_end_timeline(config_file_options.data_directory);
2693 		XLogRecPtr min_recovery_location = get_min_recovery_location(config_file_options.data_directory);
2694 
2695 		/*
2696 		 * It's possible this was a former primary, so the minRecoveryPoint*
2697 		 * fields may be empty.
2698 		 */
2699 
2700 		if (min_recovery_location == InvalidXLogRecPtr)
2701 			min_recovery_location = get_latest_checkpoint_location(config_file_options.data_directory);
2702 		if (tli == 0)
2703 			tli = get_timeline(config_file_options.data_directory);
2704 
2705 		can_rejoin = check_node_can_attach(tli,
2706 										   min_recovery_location,
2707 										   primary_conn,
2708 										   &primary_node_record,
2709 										   true);
2710 
2711 		if (can_rejoin == false)
2712 		{
2713 			PQfinish(primary_conn);
2714 			exit(ERR_REJOIN_FAIL);
2715 		}
2716 	}
2717 
2718 
2719 	/*
2720 	 * --force-rewind specified - check prerequisites, and attempt to execute
2721   	 * (if --dry-run provided, just output the command which would be executed)
2722 	 */
2723 
2724 	if (runtime_options.force_rewind_used == true)
2725 	{
2726 		PQExpBufferData msg;
2727 		PQExpBufferData	filebuf;
2728 		int				ret;
2729 
2730 		/*
2731 		 * Check that pg_rewind can be used
2732 		 */
2733 
2734 		initPQExpBuffer(&msg);
2735 
2736 		if (can_use_pg_rewind(primary_conn, config_file_options.data_directory, &msg) == false)
2737 		{
2738 			log_error(_("--force-rewind specified but pg_rewind cannot be used"));
2739 			log_detail("%s", msg.data);
2740 			termPQExpBuffer(&msg);
2741 			PQfinish(primary_conn);
2742 
2743 			exit(ERR_BAD_CONFIG);
2744 		}
2745 
2746 		appendPQExpBufferStr(&msg,
2747 							 _("prerequisites for using pg_rewind are met"));
2748 
2749 		if (runtime_options.dry_run == true)
2750 		{
2751 			log_info("%s", msg.data);
2752 		}
2753 		else
2754 		{
2755 			log_verbose(LOG_INFO, "%s", msg.data);
2756 		}
2757 		termPQExpBuffer(&msg);
2758 
2759 		/*
2760 		 * Archive requested configuration files.
2761 		 *
2762 		 * In --dry-run mode this acts as a check that the files can be archived, though
2763 		 * errors will only be logged; any copied files will be deleted and --dry-run
2764 		 * execution will continue.
2765 		 */
2766 		_do_node_archive_config();
2767 
2768 		/* execute pg_rewind */
2769 		initPQExpBuffer(&command);
2770 
2771 		if (runtime_options.force_rewind_path[0] != '\0')
2772 		{
2773 			appendPQExpBuffer(&command,
2774 							  "%s -D ",
2775 							  runtime_options.force_rewind_path);
2776 		}
2777 		else
2778 		{
2779 			make_pg_path(&command, "pg_rewind");
2780 			appendPQExpBufferStr(&command,
2781 								 " -D ");
2782 		}
2783 
2784 		appendShellString(&command,
2785 						  config_file_options.data_directory);
2786 
2787 		appendPQExpBuffer(&command,
2788 						  " --source-server='%s'",
2789 						  primary_node_record.conninfo);
2790 
2791 		if (runtime_options.dry_run == true)
2792 		{
2793 			log_info(_("pg_rewind would now be executed"));
2794 			log_detail(_("pg_rewind command is:\n  %s"),
2795 						 command.data);
2796 		}
2797 		else
2798 		{
2799 			log_notice(_("executing pg_rewind"));
2800 			log_detail(_("pg_rewind command is \"%s\""),
2801 					   command.data);
2802 
2803 			/*
2804 			 * In Pg13 and later, pg_rewind will attempt to start up a server which
2805 			 * was not cleanly shut down in single user mode. This will fail if
2806 			 * "standby.signal" is present. We'll remove it and restore it after
2807 			 * pg_rewind runs.
2808 			 */
2809 			if (hide_standby_signal == true)
2810 			{
2811 				char	    standby_signal_file_path[MAXPGPATH] = "";
2812 
2813 				log_notice(_("temporarily removing \"standby.signal\""));
2814 				log_detail(_("this is required so pg_rewind can fix the unclean shutdown"));
2815 
2816 				make_standby_signal_path(standby_signal_file_path);
2817 
2818 				if (unlink(standby_signal_file_path) < 0 && errno != ENOENT)
2819 				{
2820 					log_error(_("unable to remove \"standby.signal\" file in data directory \"%s\""),
2821 							  standby_signal_file_path);
2822 					log_detail("%s", strerror(errno));
2823 					exit(ERR_REJOIN_FAIL);
2824 				}
2825 			}
2826 
2827 			initPQExpBuffer(&command_output);
2828 
2829 			ret = local_command(command.data,
2830 								&command_output);
2831 
2832 			termPQExpBuffer(&command);
2833 
2834 			if (hide_standby_signal == true)
2835 			{
2836 				/*
2837 				 * Restore standby.signal if we previously removed it, regardless
2838 				 * of whether the pg_rewind operation failed.
2839 				 */
2840 				log_notice(_("recreating \"standby.signal\""));
2841 				write_standby_signal();
2842 			}
2843 
2844 			if (ret == false)
2845 			{
2846 				log_error(_("pg_rewind execution failed"));
2847 				log_detail("%s", command_output.data);
2848 
2849 				termPQExpBuffer(&command_output);
2850 
2851 				exit(ERR_REJOIN_FAIL);
2852 			}
2853 
2854 			termPQExpBuffer(&command_output);
2855 
2856 			/* Restore any previously archived config files */
2857 			_do_node_restore_config();
2858 
2859 			initPQExpBuffer(&filebuf);
2860 
2861 			/* remove any recovery.done file copied in by pg_rewind */
2862 			appendPQExpBuffer(&filebuf,
2863 							  "%s/recovery.done",
2864 							  config_file_options.data_directory);
2865 
2866 			if (stat(filebuf.data, &statbuf) == 0)
2867 			{
2868 				log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
2869 
2870 				if (unlink(filebuf.data) == -1)
2871 				{
2872 					log_warning(_("unable to delete \"%s\""),
2873 								filebuf.data);
2874 					log_detail("%s", strerror(errno));
2875 				}
2876 			}
2877 			termPQExpBuffer(&filebuf);
2878 
2879 			/*
2880 			 * Delete any replication slots copied in by pg_rewind.
2881 			 *
2882 			 * TODO:
2883 			 *  - from PostgreSQL 11, this will be handled by pg_rewind, so
2884 			 *    we can skip this step from that version; see commit
2885 			 *    266b6acb312fc440c1c1a2036aa9da94916beac6
2886 			 *  - possibly delete contents of various other directories
2887 			 *    as per the above commit for pre-PostgreSQL 11
2888 			 */
2889 			{
2890 				PQExpBufferData slotdir_path;
2891 				DIR			  *slotdir;
2892 				struct dirent *slotdir_ent;
2893 
2894 				initPQExpBuffer(&slotdir_path);
2895 
2896 				appendPQExpBuffer(&slotdir_path,
2897 								  "%s/pg_replslot",
2898 								  config_file_options.data_directory);
2899 
2900 				slotdir = opendir(slotdir_path.data);
2901 
2902 				if (slotdir == NULL)
2903 				{
2904 					log_warning(_("unable to open replication slot directory \"%s\""),
2905 								slotdir_path.data);
2906 					log_detail("%s", strerror(errno));
2907 				}
2908 				else
2909 				{
2910 					while ((slotdir_ent = readdir(slotdir)) != NULL) {
2911 						struct stat statbuf;
2912 						PQExpBufferData slotdir_ent_path;
2913 
2914 						if (strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
2915 							continue;
2916 
2917 						initPQExpBuffer(&slotdir_ent_path);
2918 
2919 						appendPQExpBuffer(&slotdir_ent_path,
2920 										  "%s/%s",
2921 										  slotdir_path.data,
2922 										  slotdir_ent->d_name);
2923 
2924 						if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
2925 						{
2926 							termPQExpBuffer(&slotdir_ent_path);
2927 							continue;
2928 						}
2929 
2930 						log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
2931 						if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
2932 						{
2933 							log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
2934 							log_detail("%s", strerror(errno));
2935 							log_hint(_("directory may need to be manually removed"));
2936 						}
2937 
2938 						termPQExpBuffer(&slotdir_ent_path);
2939 					}
2940 
2941 					closedir(slotdir);
2942 				}
2943 				termPQExpBuffer(&slotdir_path);
2944 			}
2945 		}
2946 	}
2947 
2948 	if (runtime_options.dry_run == true)
2949 	{
2950 		log_info(_("prerequisites for executing NODE REJOIN are met"));
2951 		exit(SUCCESS);
2952 	}
2953 
2954 	initPQExpBuffer(&follow_output);
2955 
2956 	/*
2957 	 * do_standby_follow_internal() can handle situations where the follow
2958 	 * target is not the primary, so requires database handles to both
2959 	 * (even if they point to the same node). For the time being,
2960 	 * "node rejoin" will only attach a standby to the primary.
2961 	 */
2962 	success = do_standby_follow_internal(primary_conn,
2963 										 primary_conn,
2964 										 &primary_node_record,
2965 										 &follow_output,
2966 										 ERR_REJOIN_FAIL,
2967 										 &follow_error_code);
2968 
2969 	if (success == false)
2970 	{
2971 		log_error(_("NODE REJOIN failed"));
2972 
2973 		if (strlen(follow_output.data))
2974 			log_detail("%s", follow_output.data);
2975 
2976 		create_event_notification(primary_conn,
2977 								  &config_file_options,
2978 								  config_file_options.node_id,
2979 								  "node_rejoin",
2980 								  success,
2981 								  follow_output.data);
2982 
2983 		PQfinish(primary_conn);
2984 
2985 		termPQExpBuffer(&follow_output);
2986 		exit(follow_error_code);
2987 	}
2988 
2989 	/*
2990 	 * Actively check that node actually started and connected to primary,
2991 	 * if not exit with ERR_REJOIN_FAIL.
2992 	 *
2993 	 * This check can be overridden with -W/--no-wait, in which case a one-time
2994 	 * check will be carried out.
2995 	 */
2996 	if (runtime_options.no_wait == false)
2997 	{
2998 		standy_join_status join_success = check_standby_join(primary_conn,
2999 															 &primary_node_record,
3000 															 &local_node_record);
3001 
3002 		create_event_notification(primary_conn,
3003 								  &config_file_options,
3004 								  config_file_options.node_id,
3005 								  "node_rejoin",
3006 								  join_success == JOIN_SUCCESS ? true : false,
3007 								  follow_output.data);
3008 
3009 		if (join_success != JOIN_SUCCESS)
3010 		{
3011 			termPQExpBuffer(&follow_output);
3012 			log_error(_("NODE REJOIN failed"));
3013 
3014 			if (join_success == JOIN_FAIL_NO_PING) {
3015 				log_detail(_("local node \"%s\" did not become available start after %i seconds"),
3016 						   config_file_options.node_name,
3017 						   config_file_options.node_rejoin_timeout);
3018 			}
3019 			else {
3020 				log_detail(_("no active record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
3021 						   config_file_options.node_name,
3022 						   primary_node_record.node_name);
3023 			}
3024 			log_hint(_("check the PostgreSQL log on the local node"));
3025 
3026 			exit(ERR_REJOIN_FAIL);
3027 		}
3028 	}
3029 	else
3030 	{
3031 		/* -W/--no-wait provided - check once */
3032 		NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL);
3033 		if (node_attached == NODE_ATTACHED)
3034 			success = true;
3035 	}
3036 
3037 	/*
3038 	 * Handle replication slots:
3039 	 *  - if a slot for the new upstream exists, delete that
3040 	 *  - warn about any other inactive replication slots
3041 	 */
3042 	if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots)
3043 	{
3044 		PGconn	   *local_conn = NULL;
3045 		local_conn = establish_db_connection(config_file_options.conninfo, false);
3046 
3047 		if (PQstatus(local_conn) != CONNECTION_OK)
3048 		{
3049 			log_warning(_("unable to connect to local node to check replication slot status"));
3050 			log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary"));
3051 		}
3052 		else
3053 		{
3054 			KeyValueList inactive_replication_slots = {NULL, NULL};
3055 			KeyValueListCell *cell = NULL;
3056 			int inactive_count = 0;
3057 			PQExpBufferData slotinfo;
3058 
3059 			drop_replication_slot_if_exists(local_conn,
3060 											config_file_options.node_id,
3061 											primary_node_record.slot_name);
3062 
3063 			(void) get_inactive_replication_slots(local_conn, &inactive_replication_slots);
3064 
3065 			initPQExpBuffer(&slotinfo);
3066 			for (cell = inactive_replication_slots.head; cell; cell = cell->next)
3067 			{
3068 				appendPQExpBuffer(&slotinfo,
3069 								  "  - %s (%s)", cell->key, cell->value);
3070 				inactive_count++;
3071 			}
3072 
3073 			if (inactive_count > 0)
3074 			{
3075 				log_warning(_("%i inactive replication slots detected"), inactive_count);
3076 				log_detail(_("inactive replication slots:\n%s"), slotinfo.data);
3077 				log_hint(_("these replication slots may need to be removed manually"));
3078 			}
3079 
3080 			termPQExpBuffer(&slotinfo);
3081 
3082 			PQfinish(local_conn);
3083 		}
3084 	}
3085 
3086 	if (success == true)
3087 	{
3088 		log_notice(_("NODE REJOIN successful"));
3089 		log_detail("%s", follow_output.data);
3090 	}
3091 	else
3092 	{
3093 		/*
3094 		 * if we reach here, no record found in upstream node's pg_stat_replication
3095 		 */
3096 		log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
3097 		log_hint(_("you will need to manually check the node's replication status"));
3098 	}
3099 	termPQExpBuffer(&follow_output);
3100 
3101 	return;
3102 }
3103 
3104 
3105 /*
3106  * Currently for testing purposes only, not documented;
3107  * use at own risk!
3108  */
3109 
3110 void
do_node_control(void)3111 do_node_control(void)
3112 {
3113 	PGconn	   *conn = NULL;
3114 	pid_t	    wal_receiver_pid = UNKNOWN_PID;
3115 	conn = establish_db_connection(config_file_options.conninfo, true);
3116 
3117 	if (runtime_options.disable_wal_receiver == true)
3118 	{
3119 		wal_receiver_pid = disable_wal_receiver(conn);
3120 
3121 		PQfinish(conn);
3122 
3123 		if (wal_receiver_pid == UNKNOWN_PID)
3124 			exit(ERR_BAD_CONFIG);
3125 
3126 		exit(SUCCESS);
3127 	}
3128 
3129 	if (runtime_options.enable_wal_receiver == true)
3130 	{
3131 		wal_receiver_pid = enable_wal_receiver(conn, true);
3132 
3133 		PQfinish(conn);
3134 
3135 		if (wal_receiver_pid == UNKNOWN_PID)
3136 			exit(ERR_BAD_CONFIG);
3137 
3138 		exit(SUCCESS);
3139 	}
3140 
3141 	log_error(_("no option provided"));
3142 
3143 	PQfinish(conn);
3144 }
3145 
3146 
3147 /*
3148  * For "internal" use by `node rejoin` on the local node when
3149  * called by "standby switchover" from the remote node.
3150  *
3151  * This archives any configuration files in the data directory, which may be
3152  * overwritten by pg_rewind.
3153  *
3154  * Requires configuration file, optionally --config-archive-dir
3155  */
3156 static void
_do_node_archive_config(void)3157 _do_node_archive_config(void)
3158 {
3159 	PQExpBufferData		archive_dir;
3160 	struct stat statbuf;
3161 	struct dirent *arcdir_ent;
3162 	DIR		   *arcdir;
3163 
3164 	KeyValueList config_files = {NULL, NULL};
3165 	KeyValueListCell *cell = NULL;
3166 	int			copied_count = 0;
3167 
3168 	initPQExpBuffer(&archive_dir);
3169 	format_archive_dir(&archive_dir);
3170 
3171 	/* sanity-check directory path */
3172 	if (stat(archive_dir.data, &statbuf) == -1)
3173 	{
3174 		if (errno != ENOENT)
3175 		{
3176 			log_error(_("error encountered when checking archive directory \"%s\""),
3177 					  archive_dir.data);
3178 			log_detail("%s", strerror(errno));
3179 			termPQExpBuffer(&archive_dir);
3180 			exit(ERR_BAD_CONFIG);
3181 		}
3182 
3183 		/* attempt to create and open the directory */
3184 		if (mkdir(archive_dir.data, S_IRWXU) != 0 && errno != EEXIST)
3185 		{
3186 			log_error(_("unable to create temporary archive directory \"%s\""),
3187 					  archive_dir.data);
3188 			log_detail("%s", strerror(errno));
3189 			termPQExpBuffer(&archive_dir);
3190 			exit(ERR_BAD_CONFIG);
3191 		}
3192 
3193 		if (runtime_options.dry_run == true)
3194 		{
3195 			log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data);
3196 		}
3197 	}
3198 	else if (!S_ISDIR(statbuf.st_mode))
3199 	{
3200 		log_error(_("\"%s\" exists but is not a directory"),
3201 				  archive_dir.data);
3202 		termPQExpBuffer(&archive_dir);
3203 		exit(ERR_BAD_CONFIG);
3204 	}
3205 
3206 	arcdir = opendir(archive_dir.data);
3207 
3208 	/* always attempt to open the directory */
3209 	if (arcdir == NULL)
3210 	{
3211 		log_error(_("unable to open archive directory \"%s\""),
3212 				  archive_dir.data);
3213 		log_detail("%s", strerror(errno));
3214 		termPQExpBuffer(&archive_dir);
3215 		exit(ERR_BAD_CONFIG);
3216 	}
3217 
3218 	if (runtime_options.dry_run == false)
3219 	{
3220 
3221 		/*
3222 		 * attempt to remove any existing files in the directory
3223 		 * TODO: collate problem files into list
3224 		 */
3225 		while ((arcdir_ent = readdir(arcdir)) != NULL)
3226 		{
3227 			PQExpBufferData arcdir_ent_path;
3228 
3229 			initPQExpBuffer(&arcdir_ent_path);
3230 
3231 			appendPQExpBuffer(&arcdir_ent_path,
3232 							  "%s/%s",
3233 							  archive_dir.data,
3234 							  arcdir_ent->d_name);
3235 
3236 			if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
3237 			{
3238 				termPQExpBuffer(&arcdir_ent_path);
3239 				continue;
3240 			}
3241 
3242 			if (unlink(arcdir_ent_path.data) == -1)
3243 			{
3244 				log_error(_("unable to delete file in temporary archive directory"));
3245 				log_detail(_("file is:  \"%s\""), arcdir_ent_path.data);
3246 				log_detail("%s", strerror(errno));
3247 				closedir(arcdir);
3248 				termPQExpBuffer(&arcdir_ent_path);
3249 				exit(ERR_BAD_CONFIG);
3250 			}
3251 
3252 			termPQExpBuffer(&arcdir_ent_path);
3253 		}
3254 	}
3255 
3256 	closedir(arcdir);
3257 
3258 
3259 	/*
3260 	 * extract list of config files from --config-files
3261 	 */
3262 	{
3263 		int			i = 0;
3264 		int			j = 0;
3265 		int			config_file_len = strlen(runtime_options.config_files);
3266 
3267 		char		filenamebuf[MAXPGPATH] = "";
3268 		PQExpBufferData		pathbuf;
3269 
3270 		for (j = 0; j < config_file_len; j++)
3271 		{
3272 			if (runtime_options.config_files[j] == ',')
3273 			{
3274 				int			filename_len = j - i;
3275 
3276 				if (filename_len >= MAXPGPATH)
3277 					filename_len = MAXPGPATH - 1;
3278 
3279 				strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
3280 
3281 				filenamebuf[filename_len] = '\0';
3282 
3283 				initPQExpBuffer(&pathbuf);
3284 
3285 				appendPQExpBuffer(&pathbuf,
3286 								  "%s/%s",
3287 								  config_file_options.data_directory,
3288 								  filenamebuf);
3289 
3290 				key_value_list_set(&config_files,
3291 								   filenamebuf,
3292 								   pathbuf.data);
3293 				termPQExpBuffer(&pathbuf);
3294 				i = j + 1;
3295 			}
3296 		}
3297 
3298 		if (i < config_file_len)
3299 		{
3300 			int			filename_len = config_file_len - i;
3301 
3302 			strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
3303 
3304 			filenamebuf[filename_len] = '\0';
3305 
3306 			initPQExpBuffer(&pathbuf);
3307 			appendPQExpBuffer(&pathbuf,
3308 							  "%s/%s",
3309 							  config_file_options.data_directory,
3310 							  filenamebuf);
3311 
3312 			key_value_list_set(&config_files,
3313 							   filenamebuf,
3314 							   pathbuf.data);
3315 			termPQExpBuffer(&pathbuf);
3316 		}
3317 	}
3318 
3319 
3320 	for (cell = config_files.head; cell; cell = cell->next)
3321 	{
3322 		PQExpBufferData dest_file;
3323 
3324 		initPQExpBuffer(&dest_file);
3325 
3326 		appendPQExpBuffer(&dest_file,
3327 						  "%s/%s",
3328 						  archive_dir.data,
3329 						  cell->key);
3330 
3331 		if (stat(cell->value, &statbuf) == -1)
3332 		{
3333 			log_warning(_("specified file \"%s\" not found, skipping"),
3334 						cell->value);
3335 		}
3336 		else
3337 		{
3338 			if (runtime_options.dry_run == true)
3339 			{
3340 				log_info("file \"%s\" would be copied to \"%s\"",
3341 						 cell->key, dest_file.data);
3342 				copied_count++;
3343 			}
3344 			else
3345 			{
3346 				log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
3347 							cell->key, dest_file.data);
3348 				copy_file(cell->value, dest_file.data);
3349 				copied_count++;
3350 			}
3351 		}
3352 
3353 		termPQExpBuffer(&dest_file);
3354 	}
3355 
3356 	if (runtime_options.dry_run == true)
3357 	{
3358 		log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""),
3359 					copied_count, archive_dir.data);
3360 	}
3361 	else
3362 	{
3363 		log_verbose(LOG_INFO, _("%i files copied to \"%s\""),
3364 					copied_count, archive_dir.data);
3365 	}
3366 
3367 	if (runtime_options.dry_run == true)
3368 	{
3369 		/*
3370 		 * Delete directory in --dry-run mode  - it should be empty unless it's been
3371 		 * interfered with for some reason, in which case manual intervention is
3372 		 * required
3373 		 */
3374 		if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
3375 		{
3376 			log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
3377 			log_detail("%s", strerror(errno));
3378 			log_hint(_("directory may need to be manually removed"));
3379 		}
3380 		else
3381 		{
3382 			log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data);
3383 		}
3384 	}
3385 
3386 	termPQExpBuffer(&archive_dir);
3387 }
3388 
3389 
3390 /*
3391  * Intended mainly for "internal" use by `standby switchover`, which
3392  * calls this on the target server to restore any configuration files
3393  * to the data directory, which may have been overwritten by an operation
3394  * like pg_rewind
3395  *
3396  * Not designed to be called if the instance is running, but does
3397  * not currently check.
3398  *
3399  * Requires -D/--pgdata, optionally --config-archive-dir
3400  *
3401  * Removes --config-archive-dir after successful copy
3402  */
3403 
3404 static void
_do_node_restore_config(void)3405 _do_node_restore_config(void)
3406 {
3407 	PQExpBufferData		archive_dir;
3408 
3409 	DIR		   *arcdir;
3410 	struct dirent *arcdir_ent;
3411 	int			copied_count = 0;
3412 	bool		copy_ok = true;
3413 
3414 	initPQExpBuffer(&archive_dir);
3415 
3416 	format_archive_dir(&archive_dir);
3417 
3418 	arcdir = opendir(archive_dir.data);
3419 
3420 	if (arcdir == NULL)
3421 	{
3422 		log_error(_("unable to open archive directory \"%s\""),
3423 				  archive_dir.data);
3424 		log_detail("%s", strerror(errno));
3425 		termPQExpBuffer(&archive_dir);
3426 		exit(ERR_BAD_CONFIG);
3427 	}
3428 
3429 	while ((arcdir_ent = readdir(arcdir)) != NULL)
3430 	{
3431 		struct stat statbuf;
3432 		PQExpBufferData		src_file_path;
3433 		PQExpBufferData		dest_file_path;
3434 
3435 		initPQExpBuffer(&src_file_path);
3436 
3437 		appendPQExpBuffer(&src_file_path,
3438 						  "%s/%s",
3439 						  archive_dir.data,
3440 						  arcdir_ent->d_name);
3441 
3442 		/* skip non-files */
3443 		if (stat(src_file_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
3444 		{
3445 			termPQExpBuffer(&src_file_path);
3446 			continue;
3447 		}
3448 
3449 		initPQExpBuffer(&dest_file_path);
3450 
3451 		appendPQExpBuffer(&dest_file_path,
3452 						  "%s/%s",
3453 						  config_file_options.data_directory,
3454 						  arcdir_ent->d_name);
3455 
3456 		log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
3457 					src_file_path.data, dest_file_path.data);
3458 
3459 		if (copy_file(src_file_path.data, dest_file_path.data) == false)
3460 		{
3461 			copy_ok = false;
3462 			log_warning(_("unable to copy \"%s\" to \"%s\""),
3463 						arcdir_ent->d_name, runtime_options.data_dir);
3464 		}
3465 		else
3466 		{
3467 			unlink(src_file_path.data);
3468 			copied_count++;
3469 		}
3470 
3471 		termPQExpBuffer(&dest_file_path);
3472 		termPQExpBuffer(&src_file_path);
3473 	}
3474 
3475 	closedir(arcdir);
3476 
3477 	log_notice(_("%i files copied to %s"),
3478 			   copied_count,
3479 			   config_file_options.data_directory);
3480 
3481 	if (copy_ok == false)
3482 	{
3483 		log_warning(_("unable to copy all files from \"%s\""), archive_dir.data);
3484 	}
3485 	else
3486 	{
3487 		/*
3488 		 * Finally, delete directory - it should be empty unless it's been
3489 		 * interfered with for some reason, in which case manual intervention is
3490 		 * required
3491 		 */
3492 		if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
3493 		{
3494 			log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
3495 			log_detail("%s", strerror(errno));
3496 			log_hint(_("directory may need to be manually removed"));
3497 		}
3498 		else
3499 		{
3500 			log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
3501 		}
3502 	}
3503 
3504 	termPQExpBuffer(&archive_dir);
3505 
3506 	return;
3507 }
3508 
3509 
3510 static void
format_archive_dir(PQExpBufferData * archive_dir)3511 format_archive_dir(PQExpBufferData *archive_dir)
3512 {
3513 	appendPQExpBuffer(archive_dir,
3514 					  "%s/repmgr-config-archive-%s",
3515 					  runtime_options.config_archive_dir,
3516 					  config_file_options.node_name);
3517 
3518 	log_verbose(LOG_DEBUG, "using archive directory \"%s\"", archive_dir->data);
3519 }
3520 
3521 
/*
 * Copy the contents of "src_file" to "dest_file".
 *
 * The destination is created (or truncated if it already exists) and an
 * attempt is made to restrict its permissions to owner read/write.
 *
 * Returns true only if the whole file was read and written successfully;
 * returns false if either file could not be opened, or if a read, write or
 * flush error occurred. On failure the destination may be left in place
 * with partial contents.
 */
static bool
copy_file(const char *src_file, const char *dest_file)
{
	FILE	   *ptr_old = NULL;
	FILE	   *ptr_new = NULL;
	char		buf[8192];
	size_t		bytes_read = 0;
	bool		success = true;

	ptr_old = fopen(src_file, "r");

	if (ptr_old == NULL)
		return false;

	ptr_new = fopen(dest_file, "w");

	if (ptr_new == NULL)
	{
		fclose(ptr_old);
		return false;
	}

	/* best effort only: a failure to tighten permissions is not fatal */
	(void) chmod(dest_file, S_IRUSR | S_IWUSR);

	/*
	 * Copy in chunks. fread() returns 0 both at end-of-file and on error, so
	 * the loop always terminates; the two cases are told apart with ferror()
	 * below. (Looping on feof() alone would spin forever on a read error,
	 * e.g. if "src_file" is a directory.)
	 */
	while ((bytes_read = fread(buf, 1, sizeof(buf), ptr_old)) > 0)
	{
		if (fwrite(buf, 1, bytes_read, ptr_new) != bytes_read)
		{
			success = false;
			break;
		}
	}

	if (ferror(ptr_old))
		success = false;

	/* fclose() flushes buffered output, so its result matters here */
	if (fclose(ptr_new) != 0)
		success = false;

	fclose(ptr_old);

	return success;
}
3563 
3564 
3565 void
do_node_help(void)3566 do_node_help(void)
3567 {
3568 	print_help_header();
3569 
3570 	printf(_("Usage:\n"));
3571 	printf(_("    %s [OPTIONS] node status\n"), progname());
3572 	printf(_("    %s [OPTIONS] node check\n"), progname());
3573 	printf(_("    %s [OPTIONS] node rejoin\n"), progname());
3574 	printf(_("    %s [OPTIONS] node service\n"), progname());
3575 	puts("");
3576 
3577 	printf(_("NODE STATUS\n"));
3578 	puts("");
3579 	printf(_("  \"node status\" displays an overview of a node's basic information and replication status.\n"));
3580 	puts("");
3581 	printf(_("  Configuration file required, runs on local node only.\n"));
3582 	puts("");
3583 	printf(_("    --csv                 emit output as CSV\n"));
3584 	puts("");
3585 
3586 	printf(_("NODE CHECK\n"));
3587 	puts("");
3588 	printf(_("  \"node check\" performs some health checks on a node from a replication perspective.\n"));
3589 	puts("");
3590 	printf(_("  Configuration file required, runs on local node only.\n"));
3591 	puts("");
3592 	printf(_("  Connection options:\n"));
3593 	printf(_("    -S, --superuser=USERNAME  superuser to use, if repmgr user is not superuser\n"));
3594 	puts("");
3595 	printf(_("  Output options:\n"));
3596 	printf(_("    --csv                     emit output as CSV (not available for individual check output)\n"));
3597 	printf(_("    --nagios                  emit output in Nagios format (individual check output only)\n"));
3598 	puts("");
3599 	printf(_("  Following options check an individual status:\n"));
3600 	printf(_("    --archive-ready           number of WAL files ready for archiving\n"));
3601 	printf(_("    --downstream              whether all downstream nodes are connected\n"));
3602 	printf(_("    --uptream                 whether the node is connected to its upstream\n"));
3603 	printf(_("    --replication-lag         replication lag in seconds (standbys only)\n"));
3604 	printf(_("    --role                    check node has expected role\n"));
3605 	printf(_("    --slots                   check for inactive replication slots\n"));
3606 	printf(_("    --missing-slots           check for missing replication slots\n"));
3607 	printf(_("    --data-directory-config   check repmgr's data directory configuration\n"));
3608 
3609 	puts("");
3610 
3611 	printf(_("NODE REJOIN\n"));
3612 	puts("");
3613 	printf(_("  \"node rejoin\" enables a dormant (stopped) node to be rejoined to the replication cluster.\n"));
3614 	puts("");
3615 	printf(_("  Configuration file required, runs on local node only.\n"));
3616 	puts("");
3617 	printf(_("    --dry-run               check that the prerequisites are met for rejoining the node\n" \
3618 			 "                              (including usability of \"pg_rewind\" if requested)\n"));
3619 	printf(_("    --force-rewind[=VALUE]  execute \"pg_rewind\" if necessary\n"));
3620 	printf(_("                              (9.3 and 9.4 - provide full \"pg_rewind\" path)\n"));
3621 
3622 	printf(_("    --config-files          comma-separated list of configuration files to retain\n" \
3623 			 "                            after executing \"pg_rewind\"\n"));
3624 	printf(_("    --config-archive-dir    directory to temporarily store retained configuration files\n" \
3625 			 "                              (default: /tmp)\n"));
3626 	printf(_("    -W, --no-wait           don't wait for the node to rejoin cluster\n"));
3627 	puts("");
3628 
3629 	printf(_("NODE SERVICE\n"));
3630 	puts("");
3631 	printf(_("  \"node service\" executes a system service command to stop/start/restart/reload a node\n" \
3632 			 "                   or optionally display which command would be executed\n"));
3633 	puts("");
3634 	printf(_("  Configuration file required, runs on local node only.\n"));
3635 	puts("");
3636 	printf(_("    --dry-run                 show what action would be performed, but don't execute it\n"));
3637 	printf(_("    --action                  action to perform (one of \"start\", \"stop\", \"restart\" or \"reload\")\n"));
3638 	printf(_("    --list-actions            show what command would be performed for each action\n"));
3639 	printf(_("    --checkpoint              issue a CHECKPOINT before stopping or restarting the node\n"));
3640 	printf(_("    -S, --superuser=USERNAME  superuser to use, if repmgr user is not superuser\n"));
3641 
3642 	puts("");
3643 
3644 	printf(_("%s home page: <%s>\n"), "repmgr", REPMGR_URL);
3645 }
3646