1 /*
2 * repmgr-action-node.c
3 *
4 * Implements actions available for any kind of node
5 *
6 * Copyright (c) 2ndQuadrant, 2010-2020
7 *
8 * This program is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <sys/stat.h>
23 #include <dirent.h>
24
25 #include "repmgr.h"
26 #include "controldata.h"
27 #include "dirutil.h"
28 #include "dbutils.h"
29 #include "compat.h"
30
31 #include "repmgr-client-global.h"
32 #include "repmgr-action-node.h"
33 #include "repmgr-action-standby.h"
34
35 static bool copy_file(const char *src_file, const char *dest_file);
36 static void format_archive_dir(PQExpBufferData *archive_dir);
37 static t_server_action parse_server_action(const char *action);
38
39 static void exit_optformat_error(const char *error, int errcode);
40
41 static void _do_node_service_list_actions(t_server_action action);
42 static void _do_node_status_is_shutdown_cleanly(void);
43 static void _do_node_archive_config(void);
44 static void _do_node_restore_config(void);
45
46 static void do_node_check_replication_connection(void);
47 static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
48 static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
49 static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
50 static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
51 static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
52 static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
53 static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
54 static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
55 static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
56 static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode);
57
58 /*
59 * NODE STATUS
60 *
61 * Can only be run on the local node, as it needs to be able to
62 * read the data directory.
63 *
64 * Parameters:
65 * --is-shutdown-cleanly (for internal use only)
66 * --csv
67 */
68
69 void
do_node_status(void)70 do_node_status(void)
71 {
72 PGconn *conn = NULL;
73
74 t_node_info node_info = T_NODE_INFO_INITIALIZER;
75 char cluster_size[MAXLEN];
76 PQExpBufferData output;
77
78 KeyValueList node_status = {NULL, NULL};
79 KeyValueListCell *cell = NULL;
80 NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
81
82 ItemList warnings = {NULL, NULL};
83 RecoveryType recovery_type = RECTYPE_UNKNOWN;
84 ReplInfo replication_info;
85 t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER;
86
87 char data_dir[MAXPGPATH] = "";
88 char server_version_str[MAXVERSIONSTR] = "";
89
90 /*
91 * A database connection is *not* required for this check
92 */
93 if (runtime_options.is_shutdown_cleanly == true)
94 {
95 return _do_node_status_is_shutdown_cleanly();
96 }
97
98 init_replication_info(&replication_info);
99
100
101 /* config file required, so we should have "conninfo" and "data_directory" */
102 conn = establish_db_connection(config_file_options.conninfo, true);
103 strncpy(data_dir, config_file_options.data_directory, MAXPGPATH);
104
105 (void)get_server_version(conn, server_version_str);
106
107 /* check node exists */
108
109 if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
110 {
111 log_error(_("no record found for node %i"), config_file_options.node_id);
112 PQfinish(conn);
113 exit(ERR_BAD_CONFIG);
114 }
115
116 if (get_cluster_size(conn, cluster_size) == false)
117 strncpy(cluster_size, _("unknown"), MAXLEN);
118
119 recovery_type = get_recovery_type(conn);
120
121 get_node_replication_stats(conn, &node_info);
122
123 key_value_list_set(&node_status,
124 "PostgreSQL version",
125 server_version_str);
126
127 key_value_list_set(&node_status,
128 "Total data size",
129 cluster_size);
130
131 key_value_list_set(&node_status,
132 "Conninfo",
133 node_info.conninfo);
134
135 if (runtime_options.verbose == true)
136 {
137 uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
138
139 if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
140 {
141 key_value_list_set(&node_status,
142 "System identifier",
143 "unknown");
144 item_list_append_format(&warnings,
145 _("unable to retrieve system identifier from pg_control"));
146 }
147 else
148 {
149 key_value_list_set_format(&node_status,
150 "System identifier",
151 "%lu", local_system_identifier);
152 }
153 }
154
155 key_value_list_set(&node_status,
156 "Role",
157 get_node_type_string(node_info.type));
158
159 switch (node_info.type)
160 {
161 case PRIMARY:
162 if (recovery_type == RECTYPE_STANDBY)
163 {
164 item_list_append(&warnings,
165 _("- node is registered as primary but running as standby"));
166 }
167 break;
168 case STANDBY:
169 if (recovery_type == RECTYPE_PRIMARY)
170 {
171 item_list_append(&warnings,
172 _("- node is registered as standby but running as primary"));
173 }
174 break;
175 default:
176 break;
177 }
178
179 if (guc_set(conn, "archive_mode", "=", "off"))
180 {
181 key_value_list_set(&node_status,
182 "WAL archiving",
183 "off");
184
185 key_value_list_set(&node_status,
186 "Archive command",
187 "(none)");
188 }
189 else
190 {
191 /* "archive_mode" is not "off", i.e. one of "on", "always" */
192 bool enabled = true;
193 PQExpBufferData archiving_status;
194 char archive_command[MAXLEN] = "";
195
196 initPQExpBuffer(&archiving_status);
197
198 /*
199 * if the node is a standby, and "archive_mode" is "on", archiving will
200 * actually be disabled.
201 */
202 if (recovery_type == RECTYPE_STANDBY)
203 {
204 if (guc_set(conn, "archive_mode", "=", "on"))
205 enabled = false;
206 }
207
208 if (enabled == true)
209 {
210 appendPQExpBufferStr(&archiving_status, "enabled");
211 }
212 else
213 {
214 appendPQExpBufferStr(&archiving_status, "disabled");
215 }
216
217 if (enabled == false && recovery_type == RECTYPE_STANDBY)
218 {
219 if (PQserverVersion(conn) >= 90500)
220 {
221 appendPQExpBufferStr(&archiving_status,
222 " (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
223 }
224 else
225 {
226 appendPQExpBufferStr(&archiving_status,
227 " (\"archive_mode\" has no effect on standbys)");
228 }
229 }
230
231 key_value_list_set(&node_status,
232 "WAL archiving",
233 archiving_status.data);
234
235 termPQExpBuffer(&archiving_status);
236
237 get_pg_setting(conn, "archive_command", archive_command);
238
239 key_value_list_set(&node_status,
240 "Archive command",
241 archive_command);
242 }
243
244 {
245 int ready_files;
246
247 ready_files = get_ready_archive_files(conn, data_dir);
248
249 if (ready_files == ARCHIVE_STATUS_DIR_ERROR)
250 {
251 item_list_append_format(&warnings,
252 "- unable to check archive_status directory\n");
253 }
254 else
255 {
256 if (runtime_options.output_mode == OM_CSV)
257 {
258 key_value_list_set_format(&node_status,
259 "WALs pending archiving",
260 "%i",
261 ready_files);
262 }
263 else
264 {
265 key_value_list_set_format(&node_status,
266 "WALs pending archiving",
267 "%i pending files",
268 ready_files);
269 }
270 }
271
272 if (guc_set(conn, "archive_mode", "=", "off"))
273 {
274 key_value_list_set_output_mode(&node_status, "WALs pending archiving", OM_CSV);
275 }
276
277 }
278
279
280 if (node_info.max_wal_senders >= 0)
281 {
282 /* In CSV mode, raw values supplied as well */
283 key_value_list_set_format(&node_status,
284 "Replication connections",
285 "%i (of maximal %i)",
286 node_info.attached_wal_receivers,
287 node_info.max_wal_senders);
288 }
289 else if (node_info.max_wal_senders == 0)
290 {
291 key_value_list_set_format(&node_status,
292 "Replication connections",
293 "disabled");
294 }
295
296 /* check for attached nodes */
297 {
298 NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
299 NodeInfoListCell *node_cell = NULL;
300 ItemList missing_nodes = {NULL, NULL};
301 int missing_nodes_count = 0;
302 int expected_nodes_count = 0;
303
304 get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
305
306 /* if a witness node is present, we'll need to remove this from the total */
307 expected_nodes_count = downstream_nodes.node_count;
308
309 for (node_cell = downstream_nodes.head; node_cell; node_cell = node_cell->next)
310 {
311 /* skip witness server */
312 if (node_cell->node_info->type == WITNESS)
313 {
314 expected_nodes_count --;
315 continue;
316 }
317
318 if (is_downstream_node_attached(conn, node_cell->node_info->node_name, NULL) != NODE_ATTACHED)
319 {
320 missing_nodes_count++;
321 item_list_append_format(&missing_nodes,
322 "%s (ID: %i)",
323 node_cell->node_info->node_name,
324 node_cell->node_info->node_id);
325 }
326 }
327
328 if (missing_nodes_count)
329 {
330 ItemListCell *missing_cell = NULL;
331
332 item_list_append_format(&warnings,
333 _("- %i of %i downstream nodes not attached:"),
334 missing_nodes_count,
335 expected_nodes_count);
336
337 for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
338 {
339 item_list_append_format(&warnings,
340 " - %s\n", missing_cell->string);
341 }
342 }
343 }
344
345 if (node_info.max_replication_slots == 0)
346 {
347 key_value_list_set(&node_status,
348 "Replication slots",
349 "disabled");
350 }
351 else
352 {
353 PQExpBufferData slotinfo;
354
355 /*
356 * check for missing replication slots - we do this regardless of
357 * what "max_replication_slots" is set to, in case the downstream
358 * node was configured with "use_replication_slots=true" and is
359 * expecting a replication slot to be available
360 */
361 get_downstream_nodes_with_missing_slot(conn,
362 config_file_options.node_id,
363 &missing_slots);
364
365 if (missing_slots.node_count > 0)
366 {
367 NodeInfoListCell *missing_slot_cell = NULL;
368
369 item_list_append_format(&warnings,
370 _("- replication slots missing for following %i node(s):"),
371 missing_slots.node_count);
372
373 for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
374 {
375 item_list_append_format(&warnings,
376 _(" - %s (ID: %i, slot name: \"%s\")"),
377 missing_slot_cell->node_info->node_name,
378 missing_slot_cell->node_info->node_id,
379 missing_slot_cell->node_info->slot_name);
380 }
381 }
382
383 initPQExpBuffer(&slotinfo);
384
385 appendPQExpBuffer(&slotinfo,
386 "%i physical (of maximal %i; %i missing)",
387 node_info.active_replication_slots + node_info.inactive_replication_slots,
388 node_info.max_replication_slots,
389 missing_slots.node_count);
390
391 if (node_info.inactive_replication_slots > 0)
392 {
393 KeyValueList inactive_replication_slots = {NULL, NULL};
394 KeyValueListCell *cell = NULL;
395
396 (void) get_inactive_replication_slots(conn, &inactive_replication_slots);
397
398 appendPQExpBuffer(&slotinfo,
399 "; %i inactive",
400 node_info.inactive_replication_slots);
401
402 item_list_append_format(&warnings,
403 _("- node has %i inactive physical replication slots"),
404 node_info.inactive_replication_slots);
405
406 for (cell = inactive_replication_slots.head; cell; cell = cell->next)
407 {
408 item_list_append_format(&warnings,
409 " - %s", cell->key);
410 }
411
412 key_value_list_free(&inactive_replication_slots);
413 }
414
415 key_value_list_set(&node_status,
416 "Replication slots",
417 slotinfo.data);
418
419 termPQExpBuffer(&slotinfo);
420 }
421
422
423 if (node_info.type == STANDBY)
424 {
425 key_value_list_set_format(&node_status,
426 "Upstream node",
427 "%s (ID: %i)",
428 node_info.upstream_node_name,
429 node_info.upstream_node_id);
430
431 get_replication_info(conn, node_info.type, &replication_info);
432
433 key_value_list_set_format(&node_status,
434 "Replication lag",
435 "%i seconds",
436 replication_info.replication_lag_time);
437
438 key_value_list_set_format(&node_status,
439 "Last received LSN",
440 "%X/%X", format_lsn(replication_info.last_wal_receive_lsn));
441
442 key_value_list_set_format(&node_status,
443 "Last replayed LSN",
444 "%X/%X", format_lsn(replication_info.last_wal_replay_lsn));
445 }
446 else
447 {
448 key_value_list_set(&node_status,
449 "Upstream node",
450 "(none)");
451 key_value_list_set_output_mode(&node_status,
452 "Upstream node",
453 OM_CSV);
454
455 key_value_list_set(&node_status,
456 "Replication lag",
457 "n/a");
458
459 key_value_list_set(&node_status,
460 "Last received LSN",
461 "(none)");
462
463 key_value_list_set_output_mode(&node_status,
464 "Last received LSN",
465 OM_CSV);
466
467 key_value_list_set(&node_status,
468 "Last replayed LSN",
469 "(none)");
470
471 key_value_list_set_output_mode(&node_status,
472 "Last replayed LSN",
473 OM_CSV);
474 }
475
476
477 parse_recovery_conf(data_dir, &recovery_conf);
478
479 /* format output */
480 initPQExpBuffer(&output);
481
482 if (runtime_options.output_mode == OM_CSV)
483 {
484 appendPQExpBuffer(&output,
485 "\"Node name\",\"%s\"\n",
486 node_info.node_name);
487
488 appendPQExpBuffer(&output,
489 "\"Node ID\",\"%i\"\n",
490 node_info.node_id);
491
492 for (cell = node_status.head; cell; cell = cell->next)
493 {
494 appendPQExpBuffer(&output,
495 "\"%s\",\"%s\"\n",
496 cell->key, cell->value);
497 }
498
499 /* we'll add the raw data as well */
500 appendPQExpBuffer(&output,
501 "\"max_wal_senders\",%i\n",
502 node_info.max_wal_senders);
503
504 appendPQExpBuffer(&output,
505 "\"occupied_wal_senders\",%i\n",
506 node_info.attached_wal_receivers);
507
508 appendPQExpBuffer(&output,
509 "\"max_replication_slots\",%i\n",
510 node_info.max_replication_slots);
511
512 appendPQExpBuffer(&output,
513 "\"active_replication_slots\",%i\n",
514 node_info.active_replication_slots);
515
516 /* output inactive slot information */
517 appendPQExpBuffer(&output,
518 "\"inactive_replication_slots\",%i",
519 node_info.inactive_replication_slots);
520
521 if (node_info.inactive_replication_slots)
522 {
523 KeyValueList inactive_replication_slots = {NULL, NULL};
524 KeyValueListCell *cell = NULL;
525
526 (void) get_inactive_replication_slots(conn, &inactive_replication_slots);
527 for (cell = inactive_replication_slots.head; cell; cell = cell->next)
528 {
529 appendPQExpBuffer(&output,
530 ",\"%s\"", cell->key);
531 }
532
533 key_value_list_free(&inactive_replication_slots);
534 }
535
536 /* output missing slot information */
537
538 appendPQExpBufferChar(&output, '\n');
539 appendPQExpBuffer(&output,
540 "\"missing_replication_slots\",%i",
541 missing_slots.node_count);
542
543 if (missing_slots.node_count > 0)
544 {
545 NodeInfoListCell *missing_slot_cell = NULL;
546
547 for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
548 {
549 appendPQExpBuffer(&output,
550 ",\"%s\"", missing_slot_cell->node_info->slot_name);
551 }
552 }
553
554 }
555 else
556 {
557 appendPQExpBuffer(&output,
558 "Node \"%s\":\n",
559 node_info.node_name);
560
561 for (cell = node_status.head; cell; cell = cell->next)
562 {
563 if (cell->output_mode == OM_NOT_SET)
564 appendPQExpBuffer(&output,
565 "\t%s: %s\n",
566 cell->key, cell->value);
567 }
568 }
569
570 puts(output.data);
571
572 termPQExpBuffer(&output);
573
574 if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode == OM_TEXT)
575 {
576 log_warning(_("following issue(s) were detected:"));
577 print_item_list(&warnings);
578 log_hint(_("execute \"repmgr node check\" for more details"));
579 }
580
581 clear_node_info_list(&missing_slots);
582 key_value_list_free(&node_status);
583 item_list_free(&warnings);
584 PQfinish(conn);
585
586 /*
587 * If warnings were noted, even if they're not displayed (e.g. in --csv node),
588 * that means something's not right so we need to emit a non-zero exit code.
589 */
590 if (warnings.head != NULL)
591 {
592 exit(ERR_NODE_STATUS);
593 }
594
595 return;
596 }
597
598
/*
 * Returns information about the running state of the node.
 * For internal use during "standby switchover".
 *
 * Returns "longopt" output:
 *
 * --state=(RUNNING|SHUTDOWN|UNCLEAN_SHUTDOWN|UNKNOWN)
 * --last-checkpoint-lsn=...
 *
 * "--last-checkpoint-lsn" is only emitted when the node was determined
 * to be shut down.
 */
608
609 static void
_do_node_status_is_shutdown_cleanly(void)610 _do_node_status_is_shutdown_cleanly(void)
611 {
612 PGPing ping_status;
613 PQExpBufferData output;
614
615 DBState db_state;
616 XLogRecPtr checkPoint = InvalidXLogRecPtr;
617
618 NodeStatus node_status = NODE_STATUS_UNKNOWN;
619
620 initPQExpBuffer(&output);
621
622 appendPQExpBufferStr(&output,
623 "--state=");
624
625 /* sanity-check we're dealing with a PostgreSQL directory */
626 if (is_pg_dir(config_file_options.data_directory) == false)
627 {
628 appendPQExpBufferStr(&output, "UNKNOWN");
629 printf("%s\n", output.data);
630 termPQExpBuffer(&output);
631 return;
632 }
633
634 ping_status = PQping(config_file_options.conninfo);
635
636 switch (ping_status)
637 {
638 case PQPING_OK:
639 node_status = NODE_STATUS_UP;
640 break;
641 case PQPING_REJECT:
642 node_status = NODE_STATUS_UP;
643 break;
644 case PQPING_NO_ATTEMPT:
645 case PQPING_NO_RESPONSE:
646 /* status not yet clear */
647 break;
648 }
649
650 /* check what pg_control says */
651
652 if (get_db_state(config_file_options.data_directory, &db_state) == false)
653 {
654 /*
655 * Unable to retrieve the database state from pg_control
656 */
657 node_status = NODE_STATUS_UNKNOWN;
658 log_verbose(LOG_DEBUG, "unable to determine db state");
659 goto return_state;
660 }
661
662 log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
663
664 if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
665 {
666 if (node_status != NODE_STATUS_UP)
667 {
668 node_status = NODE_STATUS_UNCLEAN_SHUTDOWN;
669 }
670 /* server is still responding but shutting down */
671 else if (db_state == DB_SHUTDOWNING)
672 {
673 node_status = NODE_STATUS_SHUTTING_DOWN;
674 }
675 }
676
677 checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
678
679 if (checkPoint == InvalidXLogRecPtr)
680 {
681 /* unable to read pg_control, don't know what's happening */
682 node_status = NODE_STATUS_UNKNOWN;
683 }
684 else if (node_status == NODE_STATUS_UNKNOWN)
685 {
686 /*
687 * if still "UNKNOWN" at this point, then the node must be cleanly shut
688 * down
689 */
690 node_status = NODE_STATUS_DOWN;
691 }
692
693
694 return_state:
695
696 log_verbose(LOG_DEBUG, "node status determined as: %s",
697 print_node_status(node_status));
698
699 appendPQExpBuffer(&output,
700 "%s", print_node_status(node_status));
701
702 if (node_status == NODE_STATUS_DOWN)
703 {
704 appendPQExpBuffer(&output,
705 " --last-checkpoint-lsn=%X/%X",
706 format_lsn(checkPoint));
707 }
708
709 printf("%s\n", output.data);
710 termPQExpBuffer(&output);
711 return;
712 }
713
714 static void
exit_optformat_error(const char * error,int errcode)715 exit_optformat_error(const char *error, int errcode)
716 {
717 PQExpBufferData output;
718
719 Assert(runtime_options.output_mode == OM_OPTFORMAT);
720
721 initPQExpBuffer(&output);
722
723 appendPQExpBuffer(&output,
724 "--error=%s",
725 error);
726
727 printf("%s\n", output.data);
728
729 termPQExpBuffer(&output);
730
731 exit(errcode);
732 }
733
734 /*
735 * Configuration file required
736 */
737 void
do_node_check(void)738 do_node_check(void)
739 {
740 PGconn *conn = NULL;
741 PQExpBufferData output;
742
743 t_node_info node_info = T_NODE_INFO_INITIALIZER;
744
745 CheckStatus return_code;
746 CheckStatusList status_list = {NULL, NULL};
747 CheckStatusListCell *cell = NULL;
748
749 bool issue_detected = false;
750 bool exit_on_connection_error = true;
751
752 /* for internal use */
753 if (runtime_options.has_passfile == true)
754 {
755 return_code = has_passfile() ? 0 : 1;
756
757 exit(return_code);
758 }
759
760 /* for use by "standby switchover" */
761 if (runtime_options.replication_connection == true)
762 {
763 do_node_check_replication_connection();
764 exit(SUCCESS);
765 }
766
767 if (runtime_options.db_connection == true)
768 {
769 exit_on_connection_error = false;
770 }
771
772 /*
773 * If --optformat was provided, we'll assume this is a remote invocation
774 * and instead of exiting with an error, we'll return an error string to
775 * so the remote invoker will know what's happened.
776 */
777 if (runtime_options.output_mode == OM_OPTFORMAT)
778 {
779 exit_on_connection_error = false;
780 }
781
782
783 if (config_file_options.conninfo[0] != '\0')
784 {
785 t_conninfo_param_list node_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
786 char *errmsg = NULL;
787 bool parse_success = false;
788
789 initialize_conninfo_params(&node_conninfo, false);
790
791 parse_success = parse_conninfo_string(config_file_options.conninfo,
792 &node_conninfo,
793 &errmsg, false);
794
795 if (parse_success == false)
796 {
797 if (runtime_options.output_mode == OM_OPTFORMAT)
798 {
799 exit_optformat_error("CONNINFO_PARSE",
800 ERR_BAD_CONFIG);
801 }
802
803 log_error(_("unable to parse conninfo string \"%s\" for local node"),
804 config_file_options.conninfo);
805 log_detail("%s", errmsg);
806
807 exit(ERR_BAD_CONFIG);
808 }
809
810 /*
811 * If --superuser option provided, attempt to connect as the specified user
812 */
813
814 if (runtime_options.superuser[0] != '\0')
815 {
816 conn = establish_db_connection_with_replacement_param(
817 config_file_options.conninfo,
818 "user",
819 runtime_options.superuser,
820 exit_on_connection_error);
821 }
822 else
823 {
824 conn = establish_db_connection_by_params(&node_conninfo, exit_on_connection_error);
825 }
826 }
827 else
828 {
829 conn = establish_db_connection_by_params(&source_conninfo, exit_on_connection_error);
830 }
831
832
833 /*
834 * --db-connection option provided
835 */
836 if (runtime_options.db_connection == true)
837 {
838 return_code = do_node_check_db_connection(conn, runtime_options.output_mode);
839 PQfinish(conn);
840 exit(return_code);
841 }
842
843 /*
844 * If we've reached here, and the connection is invalid, then --optformat was provided
845 */
846 if (PQstatus(conn) != CONNECTION_OK)
847 {
848 exit_optformat_error("DB_CONNECTION",
849 ERR_DB_CONN);
850 }
851
852 if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
853 {
854 log_error(_("no record found for node %i"), config_file_options.node_id);
855 PQfinish(conn);
856 exit(ERR_BAD_CONFIG);
857 }
858
859 /* add replication statistics to node record */
860 get_node_replication_stats(conn, &node_info);
861
862 /*
863 * handle specific checks ======================
864 */
865 if (runtime_options.archive_ready == true)
866 {
867 return_code = do_node_check_archive_ready(conn,
868 runtime_options.output_mode,
869 NULL);
870 PQfinish(conn);
871 exit(return_code);
872 }
873
874 if (runtime_options.upstream == true)
875 {
876 return_code = do_node_check_upstream(conn,
877 runtime_options.output_mode,
878 &node_info,
879 NULL);
880 PQfinish(conn);
881 exit(return_code);
882 }
883
884 if (runtime_options.downstream == true)
885 {
886 return_code = do_node_check_downstream(conn,
887 runtime_options.output_mode,
888 &node_info,
889 NULL);
890 PQfinish(conn);
891 exit(return_code);
892 }
893
894 if (runtime_options.replication_lag == true)
895 {
896 return_code = do_node_check_replication_lag(conn,
897 runtime_options.output_mode,
898 &node_info,
899 NULL);
900 PQfinish(conn);
901 exit(return_code);
902 }
903
904 if (runtime_options.role == true)
905 {
906 return_code = do_node_check_role(conn,
907 runtime_options.output_mode,
908 &node_info,
909 NULL);
910 PQfinish(conn);
911 exit(return_code);
912 }
913
914 if (runtime_options.slots == true)
915 {
916 return_code = do_node_check_slots(conn,
917 runtime_options.output_mode,
918 &node_info,
919 NULL);
920 PQfinish(conn);
921 exit(return_code);
922 }
923
924 if (runtime_options.missing_slots == true)
925 {
926 return_code = do_node_check_missing_slots(conn,
927 runtime_options.output_mode,
928 &node_info,
929 NULL);
930 PQfinish(conn);
931 exit(return_code);
932 }
933
934 if (runtime_options.data_directory_config == true)
935 {
936 return_code = do_node_check_data_directory(conn,
937 runtime_options.output_mode,
938 &node_info,
939 NULL);
940 PQfinish(conn);
941 exit(return_code);
942 }
943
944 if (runtime_options.replication_config_owner == true)
945 {
946 return_code = do_node_check_replication_config_owner(conn,
947 runtime_options.output_mode,
948 &node_info,
949 NULL);
950 PQfinish(conn);
951 exit(return_code);
952 }
953
954
955 if (runtime_options.output_mode == OM_NAGIOS)
956 {
957 log_error(_("--nagios can only be used with a specific check"));
958 log_hint(_("execute \"repmgr node --help\" for details"));
959 PQfinish(conn);
960 exit(ERR_BAD_CONFIG);
961 }
962
963 /* output general overview */
964
965 initPQExpBuffer(&output);
966
967 /* order functions are called is also output order */
968 if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
969 issue_detected = true;
970
971 if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
972 issue_detected = true;
973
974 if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
975 issue_detected = true;
976
977 if (do_node_check_upstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
978 issue_detected = true;
979
980 if (do_node_check_downstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
981 issue_detected = true;
982
983 if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
984 issue_detected = true;
985
986 if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
987 issue_detected = true;
988
989 if (do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
990 issue_detected = true;
991
992 if (runtime_options.output_mode == OM_CSV)
993 {
994 appendPQExpBuffer(&output,
995 "\"Node name\",\"%s\"\n",
996 node_info.node_name);
997
998 appendPQExpBuffer(&output,
999 "\"Node ID\",\"%i\"\n",
1000 node_info.node_id);
1001
1002 for (cell = status_list.head; cell; cell = cell->next)
1003 {
1004 appendPQExpBuffer(&output,
1005 "\"%s\",\"%s\"",
1006 cell->item,
1007 output_check_status(cell->status));
1008
1009 if (strlen(cell->details))
1010 {
1011 appendPQExpBuffer(&output,
1012 ",\"%s\"",
1013 cell->details);
1014 }
1015 appendPQExpBufferChar(&output, '\n');
1016 }
1017 }
1018 else
1019 {
1020 appendPQExpBuffer(&output,
1021 "Node \"%s\":\n",
1022 node_info.node_name);
1023
1024 for (cell = status_list.head; cell; cell = cell->next)
1025 {
1026 appendPQExpBuffer(&output,
1027 "\t%s: %s",
1028 cell->item,
1029 output_check_status(cell->status));
1030
1031 if (strlen(cell->details))
1032 {
1033 appendPQExpBuffer(&output,
1034 " (%s)",
1035 cell->details);
1036 }
1037 appendPQExpBufferChar(&output, '\n');
1038 }
1039 }
1040
1041
1042 printf("%s", output.data);
1043 termPQExpBuffer(&output);
1044 check_status_list_free(&status_list);
1045
1046 PQfinish(conn);
1047
1048 if (issue_detected == true)
1049 {
1050 exit(ERR_NODE_STATUS);
1051 }
1052 }
1053
1054
1055 static void
do_node_check_replication_connection(void)1056 do_node_check_replication_connection(void)
1057 {
1058 PGconn *local_conn = NULL;
1059 PGconn *repl_conn = NULL;
1060 t_node_info node_record = T_NODE_INFO_INITIALIZER;
1061 RecordStatus record_status = RECORD_NOT_FOUND;
1062 PQExpBufferData output;
1063
1064
1065 initPQExpBuffer(&output);
1066 appendPQExpBufferStr(&output,
1067 "--connection=");
1068
1069 if (runtime_options.remote_node_id == UNKNOWN_NODE_ID)
1070 {
1071 appendPQExpBufferStr(&output, "UNKNOWN");
1072 printf("%s\n", output.data);
1073 termPQExpBuffer(&output);
1074 return;
1075 }
1076
1077 /* retrieve remote node record from local database */
1078 local_conn = establish_db_connection(config_file_options.conninfo, false);
1079
1080 if (PQstatus(local_conn) != CONNECTION_OK)
1081 {
1082 appendPQExpBufferStr(&output, "CONNECTION_ERROR");
1083 printf("%s\n", output.data);
1084 termPQExpBuffer(&output);
1085 return;
1086 }
1087
1088 record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
1089 PQfinish(local_conn);
1090
1091 if (record_status != RECORD_FOUND)
1092 {
1093 appendPQExpBufferStr(&output, "UNKNOWN");
1094 printf("%s\n", output.data);
1095 termPQExpBuffer(&output);
1096 return;
1097 }
1098
1099 repl_conn = establish_replication_connection_from_conninfo(node_record.conninfo,
1100 node_record.repluser);
1101
1102 if (PQstatus(repl_conn) != CONNECTION_OK)
1103 {
1104 appendPQExpBufferStr(&output, "BAD");
1105 printf("%s\n", output.data);
1106 termPQExpBuffer(&output);
1107 return;
1108 }
1109
1110 PQfinish(repl_conn);
1111
1112 appendPQExpBufferStr(&output, "OK");
1113 printf("%s\n", output.data);
1114 termPQExpBuffer(&output);
1115
1116 return;
1117 }
1118
1119
1120
1121 static CheckStatus
do_node_check_archive_ready(PGconn * conn,OutputMode mode,CheckStatusList * list_output)1122 do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
1123 {
1124 int ready_archive_files = 0;
1125 CheckStatus status = CHECK_STATUS_UNKNOWN;
1126 PQExpBufferData details;
1127
1128 if (mode == OM_CSV && list_output == NULL)
1129 {
1130 log_error(_("--csv output not provided with --archive-ready option"));
1131 PQfinish(conn);
1132 exit(ERR_BAD_CONFIG);
1133 }
1134
1135 initPQExpBuffer(&details);
1136
1137 ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
1138
1139 if (ready_archive_files > config_file_options.archive_ready_critical)
1140 {
1141 status = CHECK_STATUS_CRITICAL;
1142
1143 switch (mode)
1144 {
1145 case OM_OPTFORMAT:
1146 appendPQExpBuffer(&details,
1147 "--files=%i --threshold=%i",
1148 ready_archive_files, config_file_options.archive_ready_critical);
1149 break;
1150 case OM_NAGIOS:
1151 appendPQExpBuffer(&details,
1152 "%i pending archive ready files | files=%i;%i;%i",
1153 ready_archive_files,
1154 ready_archive_files,
1155 config_file_options.archive_ready_warning,
1156 config_file_options.archive_ready_critical);
1157 break;
1158 case OM_TEXT:
1159 appendPQExpBuffer(&details,
1160 "%i pending archive ready files, critical threshold: %i",
1161 ready_archive_files, config_file_options.archive_ready_critical);
1162 break;
1163
1164 default:
1165 break;
1166 }
1167 }
1168 else if (ready_archive_files > config_file_options.archive_ready_warning)
1169 {
1170 status = CHECK_STATUS_WARNING;
1171
1172 switch (mode)
1173 {
1174 case OM_OPTFORMAT:
1175 appendPQExpBuffer(&details,
1176 "--files=%i --threshold=%i",
1177 ready_archive_files, config_file_options.archive_ready_warning);
1178 break;
1179 case OM_NAGIOS:
1180 appendPQExpBuffer(&details,
1181 "%i pending archive ready files | files=%i;%i;%i",
1182 ready_archive_files,
1183 ready_archive_files,
1184 config_file_options.archive_ready_warning,
1185 config_file_options.archive_ready_critical);
1186
1187 break;
1188 case OM_TEXT:
1189 appendPQExpBuffer(&details,
1190 "%i pending archive ready files (threshold: %i)",
1191 ready_archive_files, config_file_options.archive_ready_warning);
1192 break;
1193
1194 default:
1195 break;
1196 }
1197 }
1198 else if (ready_archive_files < 0)
1199 {
1200 status = CHECK_STATUS_UNKNOWN;
1201
1202 switch (mode)
1203 {
1204 case OM_OPTFORMAT:
1205 break;
1206 case OM_NAGIOS:
1207 case OM_TEXT:
1208 appendPQExpBufferStr(&details,
1209 "unable to check archive_status directory");
1210 break;
1211
1212 default:
1213 break;
1214 }
1215 }
1216 else
1217 {
1218 status = CHECK_STATUS_OK;
1219
1220 switch (mode)
1221 {
1222 case OM_OPTFORMAT:
1223 appendPQExpBuffer(&details,
1224 "--files=%i", ready_archive_files);
1225 break;
1226 case OM_NAGIOS:
1227 appendPQExpBuffer(&details,
1228 "%i pending archive ready files | files=%i;%i;%i",
1229 ready_archive_files,
1230 ready_archive_files,
1231 config_file_options.archive_ready_warning,
1232 config_file_options.archive_ready_critical);
1233 break;
1234 case OM_TEXT:
1235 appendPQExpBuffer(&details,
1236 "%i pending archive ready files", ready_archive_files);
1237 break;
1238
1239 default:
1240 break;
1241 }
1242 }
1243
1244 switch (mode)
1245 {
1246 case OM_OPTFORMAT:
1247 {
1248 printf("--status=%s %s\n",
1249 output_check_status(status),
1250 details.data);
1251 }
1252 break;
1253 case OM_NAGIOS:
1254 printf("REPMGR_ARCHIVE_READY %s: %s\n",
1255 output_check_status(status),
1256 details.data);
1257 break;
1258 case OM_CSV:
1259 case OM_TEXT:
1260 if (list_output != NULL)
1261 {
1262 check_status_list_set(list_output,
1263 "WAL archiving",
1264 status,
1265 details.data);
1266 }
1267 else
1268 {
1269 printf("%s (%s)\n",
1270 output_check_status(status),
1271 details.data);
1272 }
1273 default:
1274 break;
1275 }
1276
1277 termPQExpBuffer(&details);
1278 return status;
1279 }
1280
1281
1282 static CheckStatus
do_node_check_downstream(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1283 do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1284 {
1285 NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
1286 NodeInfoListCell *cell = NULL;
1287 int missing_nodes_count = 0;
1288 int expected_nodes_count = 0;
1289 CheckStatus status = CHECK_STATUS_OK;
1290 ItemList missing_nodes = {NULL, NULL};
1291 ItemList attached_nodes = {NULL, NULL};
1292 PQExpBufferData details;
1293
1294 if (mode == OM_CSV && list_output == NULL)
1295 {
1296 log_error(_("--csv output not provided with --downstream option"));
1297 PQfinish(conn);
1298 exit(ERR_BAD_CONFIG);
1299 }
1300
1301 initPQExpBuffer(&details);
1302
1303 get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
1304
1305 /* if a witness node is present, we'll need to remove this from the total */
1306 expected_nodes_count = downstream_nodes.node_count;
1307
1308 for (cell = downstream_nodes.head; cell; cell = cell->next)
1309 {
1310 /* skip witness server */
1311 if (cell->node_info->type == WITNESS)
1312 {
1313 expected_nodes_count --;
1314 continue;
1315 }
1316
1317 if (is_downstream_node_attached(conn, cell->node_info->node_name, NULL) != NODE_ATTACHED)
1318 {
1319 missing_nodes_count++;
1320 item_list_append_format(&missing_nodes,
1321 "%s (ID: %i)",
1322 cell->node_info->node_name,
1323 cell->node_info->node_id);
1324 }
1325 else
1326 {
1327 item_list_append_format(&attached_nodes,
1328 "%s (ID: %i)",
1329 cell->node_info->node_name,
1330 cell->node_info->node_id);
1331 }
1332 }
1333
1334 if (node_info->type == WITNESS)
1335 {
1336 /* witness is not connecting to any upstream */
1337 appendPQExpBufferStr(&details,
1338 _("N/A - node is a witness"));
1339 }
1340 else if (missing_nodes_count == 0)
1341 {
1342 if (expected_nodes_count == 0)
1343 appendPQExpBufferStr(&details,
1344 "this node has no downstream nodes");
1345 else
1346 appendPQExpBuffer(&details,
1347 "%i of %i downstream nodes attached",
1348 expected_nodes_count - missing_nodes_count,
1349 expected_nodes_count);
1350 }
1351 else
1352 {
1353 ItemListCell *missing_cell = NULL;
1354 bool first = true;
1355
1356 status = CHECK_STATUS_CRITICAL;
1357
1358 appendPQExpBuffer(&details,
1359 "%i of %i downstream nodes not attached",
1360 missing_nodes_count,
1361 expected_nodes_count);
1362
1363 if (mode != OM_NAGIOS)
1364 {
1365 appendPQExpBufferStr(&details, "; missing: ");
1366
1367 for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
1368 {
1369 if (first == false)
1370 appendPQExpBufferStr(&details,
1371 ", ");
1372 else
1373 first = false;
1374
1375 if (first == false)
1376 appendPQExpBufferStr(&details, missing_cell->string);
1377 }
1378 }
1379 }
1380
1381 switch (mode)
1382 {
1383 case OM_NAGIOS:
1384 {
1385 printf("REPMGR_DOWNSTREAM_SERVERS %s: %s | ",
1386 output_check_status(status),
1387 details.data);
1388
1389 if (missing_nodes_count)
1390 {
1391 ItemListCell *missing_cell = NULL;
1392 bool first = true;
1393
1394 printf("missing: ");
1395 for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
1396 {
1397 if (first == false)
1398 printf(", ");
1399 else
1400 first = false;
1401
1402 if (first == false)
1403 printf("%s", missing_cell->string);
1404 }
1405 }
1406
1407 if (expected_nodes_count - missing_nodes_count)
1408 {
1409 ItemListCell *attached_cell = NULL;
1410 bool first = true;
1411
1412 if (missing_nodes_count)
1413 printf("; ");
1414 printf("attached: ");
1415 for (attached_cell = attached_nodes.head; attached_cell; attached_cell = attached_cell->next)
1416 {
1417 if (first == false)
1418 printf(", ");
1419 else
1420 first = false;
1421
1422 if (first == false)
1423 printf("%s", attached_cell->string);
1424 }
1425 }
1426 printf("\n");
1427
1428 }
1429 break;
1430 case OM_CSV:
1431 case OM_TEXT:
1432 if (list_output != NULL)
1433 {
1434 check_status_list_set(list_output,
1435 "Downstream servers",
1436 status,
1437 details.data);
1438 }
1439 else
1440 {
1441 printf("%s (%s)\n",
1442 output_check_status(status),
1443 details.data);
1444 }
1445 default:
1446 break;
1447
1448 }
1449 termPQExpBuffer(&details);
1450 clear_node_info_list(&downstream_nodes);
1451 return status;
1452 }
1453
1454
1455 static CheckStatus
do_node_check_upstream(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1456 do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1457 {
1458 PGconn *upstream_conn = NULL;
1459 t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
1460 PQExpBufferData details;
1461
1462 CheckStatus status = CHECK_STATUS_OK;
1463
1464 if (mode == OM_CSV && list_output == NULL)
1465 {
1466 log_error(_("--csv output not provided with --upstream option"));
1467 PQfinish(conn);
1468 exit(ERR_BAD_CONFIG);
1469 }
1470
1471 initPQExpBuffer(&details);
1472
1473 if (node_info->type == WITNESS)
1474 {
1475 /* witness is not connecting to any upstream */
1476 appendPQExpBufferStr(&details,
1477 _("N/A - node is a witness"));
1478 }
1479 else if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND)
1480 {
1481 if (get_recovery_type(conn) == RECTYPE_STANDBY)
1482 {
1483 appendPQExpBuffer(&details,
1484 _("node \"%s\" (ID: %i) is a standby but no upstream record found"),
1485 node_info->node_name,
1486 node_info->node_id);
1487 status = CHECK_STATUS_CRITICAL;
1488 }
1489 else
1490 {
1491 appendPQExpBufferStr(&details,
1492 _("N/A - node is primary"));
1493 }
1494 }
1495 else
1496 {
1497 upstream_conn = establish_db_connection(upstream_node_info.conninfo, true);
1498
1499 /* check our node is connected */
1500 if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) != NODE_ATTACHED)
1501 {
1502 appendPQExpBuffer(&details,
1503 _("node \"%s\" (ID: %i) is not attached to expected upstream node \"%s\" (ID: %i)"),
1504 node_info->node_name,
1505 node_info->node_id,
1506 upstream_node_info.node_name,
1507 upstream_node_info.node_id);
1508 status = CHECK_STATUS_CRITICAL;
1509 }
1510 else
1511 {
1512 appendPQExpBuffer(&details,
1513 _("node \"%s\" (ID: %i) is attached to expected upstream node \"%s\" (ID: %i)"),
1514 node_info->node_name,
1515 node_info->node_id,
1516 upstream_node_info.node_name,
1517 upstream_node_info.node_id);
1518 }
1519 }
1520
1521 switch (mode)
1522 {
1523 case OM_NAGIOS:
1524 {
1525 printf("REPMGR_UPSTREAM_SERVER %s: %s | ",
1526 output_check_status(status),
1527 details.data);
1528 }
1529 break;
1530 case OM_TEXT:
1531 if (list_output != NULL)
1532 {
1533 check_status_list_set(list_output,
1534 "Upstream connection",
1535 status,
1536 details.data);
1537 }
1538 else
1539 {
1540 printf("%s (%s)\n",
1541 output_check_status(status),
1542 details.data);
1543 }
1544 default:
1545 break;
1546 }
1547
1548 termPQExpBuffer(&details);
1549
1550 return status;
1551 }
1552
1553
1554 static CheckStatus
do_node_check_replication_lag(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1555 do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1556 {
1557 CheckStatus status = CHECK_STATUS_OK;
1558 int lag_seconds = 0;
1559 PQExpBufferData details;
1560
1561 if (mode == OM_CSV && list_output == NULL)
1562 {
1563 log_error(_("--csv output not provided with --replication-lag option"));
1564 PQfinish(conn);
1565 exit(ERR_BAD_CONFIG);
1566 }
1567
1568 initPQExpBuffer(&details);
1569
1570 if (node_info->recovery_type == RECTYPE_PRIMARY)
1571 {
1572 switch (mode)
1573 {
1574 case OM_OPTFORMAT:
1575 appendPQExpBufferStr(&details,
1576 "--lag=0");
1577 break;
1578 case OM_NAGIOS:
1579 appendPQExpBuffer(&details,
1580 "0 seconds | lag=0;%i;%i",
1581 config_file_options.replication_lag_warning,
1582 config_file_options.replication_lag_critical);
1583 break;
1584 case OM_TEXT:
1585 if (node_info->type == WITNESS)
1586 {
1587 appendPQExpBufferStr(&details,
1588 "N/A - node is witness");
1589 }
1590 else
1591 {
1592 appendPQExpBufferStr(&details,
1593 "N/A - node is primary");
1594 }
1595 break;
1596 default:
1597 break;
1598 }
1599 }
1600 else
1601 {
1602 lag_seconds = get_replication_lag_seconds(conn);
1603
1604 log_debug("lag seconds: %i", lag_seconds);
1605
1606 if (lag_seconds >= config_file_options.replication_lag_critical)
1607 {
1608 status = CHECK_STATUS_CRITICAL;
1609
1610 switch (mode)
1611 {
1612 case OM_OPTFORMAT:
1613 appendPQExpBuffer(&details,
1614 "--lag=%i --threshold=%i",
1615 lag_seconds, config_file_options.replication_lag_critical);
1616 break;
1617 case OM_NAGIOS:
1618 appendPQExpBuffer(&details,
1619 "%i seconds | lag=%i;%i;%i",
1620 lag_seconds,
1621 lag_seconds,
1622 config_file_options.replication_lag_warning,
1623 config_file_options.replication_lag_critical);
1624 break;
1625 case OM_TEXT:
1626 appendPQExpBuffer(&details,
1627 "%i seconds, critical threshold: %i)",
1628 lag_seconds, config_file_options.replication_lag_critical);
1629 break;
1630
1631 default:
1632 break;
1633 }
1634 }
1635 else if (lag_seconds > config_file_options.replication_lag_warning)
1636 {
1637 status = CHECK_STATUS_WARNING;
1638
1639 switch (mode)
1640 {
1641 case OM_OPTFORMAT:
1642 appendPQExpBuffer(&details,
1643 "--lag=%i --threshold=%i",
1644 lag_seconds, config_file_options.replication_lag_warning);
1645 break;
1646 case OM_NAGIOS:
1647 appendPQExpBuffer(&details,
1648 "%i seconds | lag=%i;%i;%i",
1649 lag_seconds,
1650 lag_seconds,
1651 config_file_options.replication_lag_warning,
1652 config_file_options.replication_lag_critical);
1653 break;
1654 case OM_TEXT:
1655 appendPQExpBuffer(&details,
1656 "%i seconds, warning threshold: %i)",
1657 lag_seconds, config_file_options.replication_lag_warning);
1658 break;
1659
1660 default:
1661 break;
1662 }
1663 }
1664 else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
1665 {
1666 status = CHECK_STATUS_UNKNOWN;
1667
1668 switch (mode)
1669 {
1670 case OM_OPTFORMAT:
1671 break;
1672 case OM_NAGIOS:
1673 case OM_TEXT:
1674 appendPQExpBufferStr(&details,
1675 "unable to query replication lag");
1676 break;
1677
1678 default:
1679 break;
1680 }
1681 }
1682 else
1683 {
1684 status = CHECK_STATUS_OK;
1685
1686 switch (mode)
1687 {
1688 case OM_OPTFORMAT:
1689 appendPQExpBuffer(&details,
1690 "--lag=%i",
1691 lag_seconds);
1692 break;
1693 case OM_NAGIOS:
1694 appendPQExpBuffer(&details,
1695 "%i seconds | lag=%i;%i;%i",
1696 lag_seconds,
1697 lag_seconds,
1698 config_file_options.replication_lag_warning,
1699 config_file_options.replication_lag_critical);
1700 break;
1701 case OM_TEXT:
1702 appendPQExpBuffer(&details,
1703 "%i seconds",
1704 lag_seconds);
1705 break;
1706
1707 default:
1708 break;
1709 }
1710 }
1711 }
1712
1713 switch (mode)
1714 {
1715 case OM_OPTFORMAT:
1716 printf("--status=%s %s\n",
1717 output_check_status(status),
1718 details.data);
1719 break;
1720 case OM_NAGIOS:
1721 printf("REPMGR_REPLICATION_LAG %s: %s\n",
1722 output_check_status(status),
1723 details.data);
1724 break;
1725 case OM_CSV:
1726 case OM_TEXT:
1727 if (list_output != NULL)
1728 {
1729 check_status_list_set(list_output,
1730 "Replication lag",
1731 status,
1732 details.data);
1733 }
1734 else
1735 {
1736 printf("%s (%s)\n",
1737 output_check_status(status),
1738 details.data);
1739 }
1740 default:
1741 break;
1742 }
1743
1744 termPQExpBuffer(&details);
1745
1746 return status;
1747 }
1748
1749
1750 static CheckStatus
do_node_check_role(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1751 do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1752 {
1753
1754 CheckStatus status = CHECK_STATUS_OK;
1755 PQExpBufferData details;
1756 RecoveryType recovery_type = get_recovery_type(conn);
1757
1758 if (mode == OM_CSV && list_output == NULL)
1759 {
1760 log_error(_("--csv output not provided with --role option"));
1761 PQfinish(conn);
1762 exit(ERR_BAD_CONFIG);
1763 }
1764
1765 initPQExpBuffer(&details);
1766
1767 switch (node_info->type)
1768 {
1769 case PRIMARY:
1770 if (recovery_type == RECTYPE_STANDBY)
1771 {
1772 status = CHECK_STATUS_CRITICAL;
1773 appendPQExpBufferStr(&details,
1774 _("node is registered as primary but running as standby"));
1775 }
1776 else
1777 {
1778 appendPQExpBufferStr(&details,
1779 _("node is primary"));
1780 }
1781 break;
1782 case STANDBY:
1783 if (recovery_type == RECTYPE_PRIMARY)
1784 {
1785 status = CHECK_STATUS_CRITICAL;
1786 appendPQExpBufferStr(&details,
1787 _("node is registered as standby but running as primary"));
1788 }
1789 else
1790 {
1791 appendPQExpBufferStr(&details,
1792 _("node is standby"));
1793 }
1794 break;
1795 case WITNESS:
1796 if (recovery_type == RECTYPE_STANDBY)
1797 {
1798 status = CHECK_STATUS_CRITICAL;
1799 appendPQExpBufferStr(&details,
1800 _("node is registered as witness but running as standby"));
1801 }
1802 else
1803 {
1804 appendPQExpBufferStr(&details,
1805 _("node is witness"));
1806 }
1807 break;
1808 default:
1809 break;
1810 }
1811
1812 switch (mode)
1813 {
1814 case OM_NAGIOS:
1815 printf("REPMGR_SERVER_ROLE %s: %s\n",
1816 output_check_status(status),
1817 details.data);
1818 break;
1819 case OM_CSV:
1820 case OM_TEXT:
1821 if (list_output != NULL)
1822 {
1823 check_status_list_set(list_output,
1824 "Server role",
1825 status,
1826 details.data);
1827 }
1828 else
1829 {
1830 printf("%s (%s)\n",
1831 output_check_status(status),
1832 details.data);
1833 }
1834 default:
1835 break;
1836 }
1837
1838 termPQExpBuffer(&details);
1839 return status;
1840
1841 }
1842
1843
1844 static CheckStatus
do_node_check_slots(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1845 do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1846 {
1847 CheckStatus status = CHECK_STATUS_OK;
1848 PQExpBufferData details;
1849
1850 if (mode == OM_CSV && list_output == NULL)
1851 {
1852 log_error(_("--csv output not provided with --slots option"));
1853 PQfinish(conn);
1854 exit(ERR_BAD_CONFIG);
1855 }
1856
1857 initPQExpBuffer(&details);
1858
1859 if (node_info->total_replication_slots == 0)
1860 {
1861 appendPQExpBufferStr(&details,
1862 _("node has no physical replication slots"));
1863 }
1864 else if (node_info->inactive_replication_slots == 0)
1865 {
1866 appendPQExpBuffer(&details,
1867 _("%i of %i physical replication slots are active"),
1868 node_info->total_replication_slots,
1869 node_info->total_replication_slots);
1870 }
1871 else if (node_info->inactive_replication_slots > 0)
1872 {
1873 status = CHECK_STATUS_CRITICAL;
1874
1875 appendPQExpBuffer(&details,
1876 _("%i of %i physical replication slots are inactive"),
1877 node_info->inactive_replication_slots,
1878 node_info->total_replication_slots);
1879 }
1880
1881 switch (mode)
1882 {
1883 case OM_NAGIOS:
1884 printf("REPMGR_INACTIVE_SLOTS %s: %s | slots=%i;%i\n",
1885 output_check_status(status),
1886 details.data,
1887 node_info->total_replication_slots,
1888 node_info->inactive_replication_slots);
1889 break;
1890 case OM_CSV:
1891 case OM_TEXT:
1892 if (list_output != NULL)
1893 {
1894 check_status_list_set(list_output,
1895 "Replication slots",
1896 status,
1897 details.data);
1898 }
1899 else
1900 {
1901 printf("%s (%s)\n",
1902 output_check_status(status),
1903 details.data);
1904 }
1905 default:
1906 break;
1907 }
1908
1909 termPQExpBuffer(&details);
1910 return status;
1911 }
1912
1913
1914 static CheckStatus
do_node_check_missing_slots(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)1915 do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
1916 {
1917 CheckStatus status = CHECK_STATUS_OK;
1918 PQExpBufferData details;
1919 NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
1920
1921 if (mode == OM_CSV && list_output == NULL)
1922 {
1923 log_error(_("--csv output not provided with --missing-slots option"));
1924 PQfinish(conn);
1925 exit(ERR_BAD_CONFIG);
1926 }
1927
1928 initPQExpBuffer(&details);
1929
1930 get_downstream_nodes_with_missing_slot(conn,
1931 config_file_options.node_id,
1932 &missing_slots);
1933
1934 if (missing_slots.node_count == 0)
1935 {
1936 appendPQExpBufferStr(&details,
1937 _("node has no missing physical replication slots"));
1938 }
1939 else
1940 {
1941 NodeInfoListCell *missing_slot_cell = NULL;
1942 bool first_element = true;
1943
1944 status = CHECK_STATUS_CRITICAL;
1945
1946 appendPQExpBuffer(&details,
1947 _("%i physical replication slots are missing"),
1948 missing_slots.node_count);
1949
1950 if (missing_slots.node_count)
1951 {
1952 appendPQExpBufferStr(&details, ": ");
1953
1954 for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
1955 {
1956 if (first_element == true)
1957 {
1958 first_element = false;
1959 }
1960 else
1961 {
1962 appendPQExpBufferStr(&details, ", ");
1963 }
1964
1965 appendPQExpBufferStr(&details, missing_slot_cell->node_info->slot_name);
1966 }
1967 }
1968 }
1969
1970 switch (mode)
1971 {
1972 case OM_NAGIOS:
1973 {
1974 printf("REPMGR_MISSING_SLOTS %s: %s | missing_slots=%i",
1975 output_check_status(status),
1976 details.data,
1977 missing_slots.node_count);
1978
1979 if (missing_slots.node_count)
1980 {
1981 NodeInfoListCell *missing_slot_cell = NULL;
1982 bool first_element = true;
1983
1984 printf(";");
1985
1986 for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
1987 {
1988 if (first_element == true)
1989 {
1990 first_element = false;
1991 }
1992 else
1993 {
1994 printf(",");
1995 }
1996 printf("%s", missing_slot_cell->node_info->slot_name);
1997 }
1998 }
1999 printf("\n");
2000 break;
2001 }
2002 case OM_CSV:
2003 case OM_TEXT:
2004 if (list_output != NULL)
2005 {
2006 check_status_list_set(list_output,
2007 "Missing physical replication slots",
2008 status,
2009 details.data);
2010 }
2011 else
2012 {
2013 printf("%s (%s)\n",
2014 output_check_status(status),
2015 details.data);
2016 }
2017 default:
2018 break;
2019 }
2020
2021 clear_node_info_list(&missing_slots);
2022
2023 termPQExpBuffer(&details);
2024 return status;
2025 }
2026
2027
2028 CheckStatus
do_node_check_data_directory(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)2029 do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
2030 {
2031 CheckStatus status = CHECK_STATUS_OK;
2032 char actual_data_directory[MAXPGPATH] = "";
2033 PQExpBufferData details;
2034
2035 if (mode == OM_CSV && list_output == NULL)
2036 {
2037 log_error(_("--csv output not provided with --data-directory-config option"));
2038 PQfinish(conn);
2039 exit(ERR_BAD_CONFIG);
2040 }
2041
2042 initPQExpBuffer(&details);
2043
2044 /*
2045 * Check actual data directory matches that in repmgr.conf; note this requires
2046 * a superuser connection
2047 */
2048 if (connection_has_pg_monitor_role(conn, "pg_read_all_settings") == true)
2049 {
2050 /* we expect to have a database connection */
2051 if (get_pg_setting(conn, "data_directory", actual_data_directory) == false)
2052 {
2053 appendPQExpBuffer(&details,
2054 _("unable to determine current \"data_directory\""));
2055 status = CHECK_STATUS_UNKNOWN;
2056 }
2057
2058 if (strncmp(actual_data_directory, config_file_options.data_directory, MAXPGPATH) != 0)
2059 {
2060 if (mode != OM_NAGIOS)
2061 {
2062 appendPQExpBuffer(&details,
2063 _("configured \"data_directory\" is \"%s\"; "),
2064 config_file_options.data_directory);
2065 }
2066
2067 appendPQExpBuffer(&details,
2068 "actual data directory is \"%s\"",
2069 actual_data_directory);
2070
2071 status = CHECK_STATUS_CRITICAL;
2072 }
2073 else
2074 {
2075 appendPQExpBuffer(&details,
2076 _("configured \"data_directory\" is \"%s\""),
2077 config_file_options.data_directory);
2078 }
2079 }
2080 /*
2081 * If no superuser connection available, sanity-check that the configuration directory looks
2082 * like a PostgreSQL directory and hope it's the right one.
2083 */
2084 else
2085 {
2086 if (mode == OM_TEXT)
2087 {
2088 log_info(_("connection is not a superuser connection, falling back to simple check"));
2089
2090 if (PQserverVersion(conn) >= 100000)
2091 {
2092 log_hint(_("provide a superuser with -S/--superuser, or add the \"%s\" user to role \"pg_read_all_settings\" or \"pg_monitor\""),
2093 PQuser(conn));
2094 }
2095 }
2096
2097 if (is_pg_dir(config_file_options.data_directory) == false)
2098 {
2099 if (mode == OM_NAGIOS)
2100 {
2101 appendPQExpBufferStr(&details,
2102 _("configured \"data_directory\" is not a PostgreSQL data directory"));
2103 }
2104 else
2105 {
2106 appendPQExpBuffer(&details,
2107 _("configured \"data_directory\" \"%s\" is not a PostgreSQL data directory"),
2108 actual_data_directory);
2109 }
2110
2111 status = CHECK_STATUS_CRITICAL;
2112 }
2113 else
2114 {
2115 appendPQExpBuffer(&details,
2116 _("configured \"data_directory\" is \"%s\""),
2117 config_file_options.data_directory);
2118 }
2119 }
2120
2121 switch (mode)
2122 {
2123 case OM_OPTFORMAT:
2124 printf("--configured-data-directory=%s\n",
2125 output_check_status(status));
2126 break;
2127 case OM_NAGIOS:
2128 printf("REPMGR_DATA_DIRECTORY %s: %s",
2129 output_check_status(status),
2130 config_file_options.data_directory);
2131
2132 if (status == CHECK_STATUS_CRITICAL)
2133 {
2134 printf(" | %s", details.data);
2135 }
2136 puts("");
2137 break;
2138 case OM_CSV:
2139 case OM_TEXT:
2140 if (list_output != NULL)
2141 {
2142 check_status_list_set(list_output,
2143 "Configured data directory",
2144 status,
2145 details.data);
2146 }
2147 else
2148 {
2149 printf("%s (%s)\n",
2150 output_check_status(status),
2151 details.data);
2152 }
2153 default:
2154 break;
2155 }
2156
2157 termPQExpBuffer(&details);
2158
2159 return status;
2160 }
2161
2162 /*
2163 * This is not included in the general list output
2164 */
2165 static
do_node_check_replication_config_owner(PGconn * conn,OutputMode mode,t_node_info * node_info,CheckStatusList * list_output)2166 CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
2167 {
2168 CheckStatus status = CHECK_STATUS_OK;
2169
2170 PQExpBufferData errmsg;
2171 PQExpBufferData details;
2172
2173 if (mode != OM_OPTFORMAT)
2174 {
2175 log_error(_("--replication-config-owner option can only be used with --optformat"));
2176 PQfinish(conn);
2177 exit(ERR_BAD_CONFIG);
2178 }
2179
2180 initPQExpBuffer(&errmsg);
2181 initPQExpBuffer(&details);
2182
2183 if (check_replication_config_owner(PQserverVersion(conn),
2184 config_file_options.data_directory,
2185 &errmsg, &details) == false)
2186 {
2187 status = CHECK_STATUS_CRITICAL;
2188 }
2189
2190 printf("--replication-config-owner=%s\n",
2191 output_check_status(status));
2192
2193 return status;
2194 }
2195
2196
2197 /*
2198 * This is not included in the general list output
2199 */
2200 static CheckStatus
do_node_check_db_connection(PGconn * conn,OutputMode mode)2201 do_node_check_db_connection(PGconn *conn, OutputMode mode)
2202 {
2203 CheckStatus status = CHECK_STATUS_OK;
2204 PQExpBufferData details;
2205
2206 if (mode == OM_CSV)
2207 {
2208 log_error(_("--csv output not provided with --db-connection option"));
2209 PQfinish(conn);
2210 exit(ERR_BAD_CONFIG);
2211 }
2212
2213 /* This check is for configuration diagnostics only */
2214 if (mode == OM_NAGIOS)
2215 {
2216 log_error(_("--nagios output not provided with --db-connection option"));
2217 PQfinish(conn);
2218 exit(ERR_BAD_CONFIG);
2219 }
2220
2221 initPQExpBuffer(&details);
2222
2223 if (PQstatus(conn) != CONNECTION_OK)
2224 {
2225 t_conninfo_param_list conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
2226 int c;
2227
2228 status = CHECK_STATUS_CRITICAL;
2229 initialize_conninfo_params(&conninfo, false);
2230 conn_to_param_list(conn, &conninfo);
2231
2232 appendPQExpBufferStr(&details,
2233 "connection parameters used:");
2234 for (c = 0; c < conninfo.size && conninfo.keywords[c] != NULL; c++)
2235 {
2236 if (conninfo.values[c] != NULL && conninfo.values[c][0] != '\0')
2237 {
2238 appendPQExpBuffer(&details,
2239 " %s=%s",
2240 conninfo.keywords[c], conninfo.values[c]);
2241 }
2242 }
2243
2244 }
2245
2246 if (mode == OM_OPTFORMAT)
2247 {
2248 printf("--db-connection=%s\n",
2249 output_check_status(status));
2250 }
2251 else if (mode == OM_TEXT)
2252 {
2253 printf("%s (%s)\n",
2254 output_check_status(status),
2255 details.data);
2256 }
2257 termPQExpBuffer(&details);
2258
2259 return status;
2260 }
2261
2262
2263 void
do_node_service(void)2264 do_node_service(void)
2265 {
2266 t_server_action action = ACTION_UNKNOWN;
2267 char data_dir[MAXPGPATH] = "";
2268 char command[MAXLEN] = "";
2269 PQExpBufferData output;
2270
2271 action = parse_server_action(runtime_options.action);
2272
2273 if (action == ACTION_UNKNOWN)
2274 {
2275 log_error(_("unknown value \"%s\" provided for parameter --action"),
2276 runtime_options.action);
2277 log_hint(_("valid values are \"start\", \"stop\", \"restart\", \"reload\" and \"promote\""));
2278 exit(ERR_BAD_CONFIG);
2279 }
2280
2281 if (runtime_options.list_actions == true)
2282 {
2283 return _do_node_service_list_actions(action);
2284 }
2285
2286
2287 if (data_dir_required_for_action(action))
2288 {
2289 get_node_config_directory(data_dir);
2290
2291 if (data_dir[0] == '\0')
2292 {
2293 log_error(_("unable to determine data directory for action"));
2294 exit(ERR_BAD_CONFIG);
2295 }
2296 }
2297
2298
2299 if ((action == ACTION_STOP || action == ACTION_RESTART) && runtime_options.checkpoint == true)
2300 {
2301 PGconn *conn = NULL;
2302
2303 if (config_file_options.conninfo[0] != '\0')
2304 {
2305 /*
2306 * If --superuser option provided, attempt to connect as the specified user
2307 */
2308 if (runtime_options.superuser[0] != '\0')
2309 {
2310 conn = establish_db_connection_with_replacement_param(
2311 config_file_options.conninfo,
2312 "user",
2313 runtime_options.superuser,
2314 true);
2315 }
2316 else
2317 {
2318 conn = establish_db_connection(config_file_options.conninfo, true);
2319 }
2320 }
2321 else
2322 {
2323 conn = establish_db_connection_by_params(&source_conninfo, true);
2324 }
2325
2326 if (is_superuser_connection(conn, NULL) == false)
2327 {
2328 if (runtime_options.dry_run == true)
2329 {
2330 log_warning(_("a CHECKPOINT would be issued here but no superuser connection is available"));
2331 }
2332 else
2333 {
2334 log_warning(_("a superuser connection is required to issue a CHECKPOINT"));
2335 }
2336
2337 log_hint(_("provide a superuser with -S/--superuser"));
2338 }
2339 else
2340 {
2341 if (runtime_options.dry_run == true)
2342 {
2343 log_info(_("a CHECKPOINT would be issued here"));
2344 }
2345 else
2346 {
2347
2348 log_notice(_("issuing CHECKPOINT on node \"%s\" (ID: %i) "),
2349 config_file_options.node_name,
2350 config_file_options.node_id);
2351
2352 checkpoint(conn);
2353 }
2354 }
2355
2356 PQfinish(conn);
2357 }
2358
2359 get_server_action(action, command, data_dir);
2360
2361 if (runtime_options.dry_run == true)
2362 {
2363 log_info(_("would execute server command \"%s\""), command);
2364 return;
2365 }
2366
2367 /*
2368 * log level is "DETAIL" here as this command is intended to be executed
2369 * by another repmgr process (e.g. during standby switchover); that repmgr
2370 * should emit a "NOTICE" about the intent of the command.
2371 */
2372 log_detail(_("executing server command \"%s\""), command);
2373
2374 initPQExpBuffer(&output);
2375
2376 if (local_command(command, &output) == false)
2377 {
2378 termPQExpBuffer(&output);
2379 exit(ERR_LOCAL_COMMAND);
2380 }
2381
2382 termPQExpBuffer(&output);
2383 }
2384
2385
2386 static void
_do_node_service_list_actions(t_server_action action)2387 _do_node_service_list_actions(t_server_action action)
2388 {
2389 char command[MAXLEN] = "";
2390
2391 char data_dir[MAXPGPATH] = "";
2392
2393 bool data_dir_required = false;
2394
2395 /* do we need to provide a data directory for any of the actions? */
2396 if (data_dir_required_for_action(ACTION_START))
2397 data_dir_required = true;
2398
2399 if (data_dir_required_for_action(ACTION_STOP))
2400 data_dir_required = true;
2401
2402 if (data_dir_required_for_action(ACTION_RESTART))
2403 data_dir_required = true;
2404
2405 if (data_dir_required_for_action(ACTION_RELOAD))
2406 data_dir_required = true;
2407
2408 if (data_dir_required_for_action(ACTION_PROMOTE))
2409 data_dir_required = true;
2410
2411 if (data_dir_required == true)
2412 {
2413 get_node_config_directory(data_dir);
2414 }
2415
2416 /* show command for specific action only */
2417 if (action != ACTION_NONE)
2418 {
2419 get_server_action(action, command, data_dir);
2420 printf("%s\n", command);
2421 return;
2422 }
2423
2424 puts(_("Following commands would be executed for each action:"));
2425 puts("");
2426
2427 get_server_action(ACTION_START, command, data_dir);
2428 printf(" start: \"%s\"\n", command);
2429
2430 get_server_action(ACTION_STOP, command, data_dir);
2431 printf(" stop: \"%s\"\n", command);
2432
2433 get_server_action(ACTION_RESTART, command, data_dir);
2434 printf(" restart: \"%s\"\n", command);
2435
2436 get_server_action(ACTION_RELOAD, command, data_dir);
2437 printf(" reload: \"%s\"\n", command);
2438
2439 get_server_action(ACTION_PROMOTE, command, data_dir);
2440 printf(" promote: \"%s\"\n", command);
2441
2442 puts("");
2443
2444 }
2445
2446
2447 static t_server_action
parse_server_action(const char * action_name)2448 parse_server_action(const char *action_name)
2449 {
2450 if (action_name[0] == '\0')
2451 return ACTION_NONE;
2452
2453 if (strcasecmp(action_name, "start") == 0)
2454 return ACTION_START;
2455
2456 if (strcasecmp(action_name, "stop") == 0)
2457 return ACTION_STOP;
2458
2459 if (strcasecmp(action_name, "restart") == 0)
2460 return ACTION_RESTART;
2461
2462 if (strcasecmp(action_name, "reload") == 0)
2463 return ACTION_RELOAD;
2464
2465 if (strcasecmp(action_name, "promote") == 0)
2466 return ACTION_PROMOTE;
2467
2468 return ACTION_UNKNOWN;
2469 }
2470
2471
2472
2473 /*
2474 * Rejoin a dormant (shut down) node to the replication cluster; this
2475 * is typically a former primary which needs to be demoted to a standby.
2476 *
2477 * Note that "repmgr node rejoin" is also executed by
2478 * "repmgr standby switchover" after promoting the new primary.
2479 *
2480 * Parameters:
2481 * --dry-run
2482 * --force-rewind[=VALUE]
2483 * --config-files
2484 * --config-archive-dir
2485 * -W/--no-wait
2486 */
2487 void
do_node_rejoin(void)2488 do_node_rejoin(void)
2489 {
2490 PGconn *upstream_conn = NULL;
2491 RecoveryType primary_recovery_type = RECTYPE_UNKNOWN;
2492 PGconn *primary_conn = NULL;
2493
2494 DBState db_state;
2495 PGPing status;
2496 bool is_shutdown = true;
2497 int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
2498 bool hide_standby_signal = false;
2499
2500 PQExpBufferData command;
2501 PQExpBufferData command_output;
2502 PQExpBufferData follow_output;
2503 struct stat statbuf;
2504 t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
2505 t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
2506
2507 bool success = true;
2508 int follow_error_code = SUCCESS;
2509
2510 /* check node is not actually running */
2511 status = PQping(config_file_options.conninfo);
2512
2513 switch (status)
2514 {
2515 case PQPING_NO_ATTEMPT:
2516 log_error(_("unable to determine status of server"));
2517 exit(ERR_BAD_CONFIG);
2518 case PQPING_OK:
2519 is_shutdown = false;
2520 break;
2521 case PQPING_REJECT:
2522 is_shutdown = false;
2523 break;
2524 case PQPING_NO_RESPONSE:
2525 /* status not yet clear */
2526 break;
2527 }
2528
2529 if (get_db_state(config_file_options.data_directory, &db_state) == false)
2530 {
2531 log_error(_("unable to determine database state from pg_control"));
2532 exit(ERR_BAD_CONFIG);
2533 }
2534
2535 if (is_shutdown == false)
2536 {
2537 log_error(_("database is still running in state \"%s\""),
2538 describe_db_state(db_state));
2539 log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node"));
2540 exit(ERR_REJOIN_FAIL);
2541 }
2542
2543 /*
2544 * Server version number required to determine whether pg_rewind will run
2545 * crash recovery (Pg 13 and later).
2546 */
2547 server_version_num = get_pg_version(config_file_options.data_directory, NULL);
2548
2549 if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
2550 {
2551 /* This is very unlikely to happen */
2552 log_error(_("unable to determine database version"));
2553 exit(ERR_BAD_CONFIG);
2554 }
2555
2556 log_verbose(LOG_DEBUG, "server version number is: %i", server_version_num);
2557
2558 /* check if cleanly shut down */
2559 if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
2560 {
2561 if (db_state == DB_SHUTDOWNING)
2562 {
2563 log_error(_("database is still shutting down"));
2564 }
2565 else if (server_version_num >= 130000 && runtime_options.force_rewind_used == true)
2566 {
2567 log_warning(_("database is not shut down cleanly"));
2568 log_detail(_("--force-rewind provided, pg_rewind will automatically perform recovery"));
2569
2570 /*
2571 * If pg_rewind is executed, the first change it will make
2572 * is to start the server in single user mode, which will fail
2573 * in the presence of "standby.signal", so we'll "hide" it
2574 * (actually delete and recreate).
2575 */
2576 hide_standby_signal = true;
2577 }
2578 else
2579 {
2580 /*
2581 * If the database was not shut down cleanly, it *might* rejoin correctly
2582 * after starting up and recovering, but better to ensure the database
2583 * can recover before trying anything else.
2584 */
2585 log_error(_("database is not shut down cleanly"));
2586
2587 if (server_version_num >= 130000)
2588 {
2589 log_hint(_("provide --force-rewind to run recovery"));
2590 }
2591 else
2592 {
2593 if (runtime_options.force_rewind_used == true)
2594 {
2595 log_detail(_("pg_rewind will not be able to run"));
2596 }
2597 log_hint(_("database should be restarted then shut down cleanly after crash recovery completes"));
2598 }
2599
2600 exit(ERR_REJOIN_FAIL);
2601 }
2602 }
2603
2604 /* check provided upstream connection */
2605 upstream_conn = establish_db_connection_by_params(&source_conninfo, true);
2606
2607 if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
2608 {
2609 log_error(_("unable to retrieve primary node record"));
2610 log_hint(_("check the provided database connection string is for a \"repmgr\" database"));
2611 PQfinish(upstream_conn);
2612 exit(ERR_BAD_CONFIG);
2613 }
2614
2615 /* connect to registered primary and check it's not in recovery */
2616 primary_conn = establish_db_connection(primary_node_record.conninfo, false);
2617
2618 if (PQstatus(primary_conn) != CONNECTION_OK)
2619 {
2620 RecoveryType upstream_recovery_type = get_recovery_type(upstream_conn);
2621
2622 log_error(_("unable to connect to current registered primary \"%s\" (ID: %i)"),
2623 primary_node_record.node_name,
2624 primary_node_record.node_id);
2625 log_detail(_("registered primary node conninfo is: \"%s\""),
2626 primary_node_record.conninfo);
2627 /*
2628 * Catch case where provided upstream is not in recovery, but is also
2629 * not registered as primary
2630 */
2631
2632 if (upstream_recovery_type == RECTYPE_PRIMARY)
2633 {
2634 log_warning(_("provided upstream connection string is for a server which is not in recovery, but not registered as primary"));
2635 log_hint(_("fix repmgr metadata configuration before continuing"));
2636 }
2637
2638 PQfinish(upstream_conn);
2639 exit(ERR_BAD_CONFIG);
2640 }
2641
2642 PQfinish(upstream_conn);
2643
2644 primary_recovery_type = get_recovery_type(primary_conn);
2645
2646 if (primary_recovery_type != RECTYPE_PRIMARY)
2647 {
2648 log_error(_("primary server is registered as node \"%s\" (ID: %i), but server is not a primary"),
2649 primary_node_record.node_name,
2650 primary_node_record.node_id);
2651 /* TODO: hint about checking cluster */
2652 PQfinish(primary_conn);
2653
2654 exit(ERR_BAD_CONFIG);
2655 }
2656
2657 /*
2658 * Fetch the local node record - we'll need this later, and it acts as an
2659 * additional sanity-check that the node is known to the primary.
2660 */
2661 if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND)
2662 {
2663 log_error(_("unable to retrieve node record for the local node"));
2664 log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
2665 primary_node_record.node_name,
2666 primary_node_record.node_id);
2667
2668 PQfinish(primary_conn);
2669 exit(ERR_BAD_CONFIG);
2670 }
2671
2672 /*
2673 * Sanity-check replication slot availability
2674 */
2675 if (config_file_options.use_replication_slots)
2676 {
2677 bool slots_available = check_replication_slots_available(primary_node_record.node_id,
2678 primary_conn);
2679 if (slots_available == false)
2680 {
2681 PQfinish(primary_conn);
2682 exit(ERR_BAD_CONFIG);
2683 }
2684 }
2685
2686
2687 /*
2688 * sanity-check that it will actually be possible to stream from the new upstream
2689 */
2690 {
2691 bool can_rejoin;
2692 TimeLineID tli = get_min_recovery_end_timeline(config_file_options.data_directory);
2693 XLogRecPtr min_recovery_location = get_min_recovery_location(config_file_options.data_directory);
2694
2695 /*
2696 * It's possible this was a former primary, so the minRecoveryPoint*
2697 * fields may be empty.
2698 */
2699
2700 if (min_recovery_location == InvalidXLogRecPtr)
2701 min_recovery_location = get_latest_checkpoint_location(config_file_options.data_directory);
2702 if (tli == 0)
2703 tli = get_timeline(config_file_options.data_directory);
2704
2705 can_rejoin = check_node_can_attach(tli,
2706 min_recovery_location,
2707 primary_conn,
2708 &primary_node_record,
2709 true);
2710
2711 if (can_rejoin == false)
2712 {
2713 PQfinish(primary_conn);
2714 exit(ERR_REJOIN_FAIL);
2715 }
2716 }
2717
2718
2719 /*
2720 * --force-rewind specified - check prerequisites, and attempt to execute
2721 * (if --dry-run provided, just output the command which would be executed)
2722 */
2723
2724 if (runtime_options.force_rewind_used == true)
2725 {
2726 PQExpBufferData msg;
2727 PQExpBufferData filebuf;
2728 int ret;
2729
2730 /*
2731 * Check that pg_rewind can be used
2732 */
2733
2734 initPQExpBuffer(&msg);
2735
2736 if (can_use_pg_rewind(primary_conn, config_file_options.data_directory, &msg) == false)
2737 {
2738 log_error(_("--force-rewind specified but pg_rewind cannot be used"));
2739 log_detail("%s", msg.data);
2740 termPQExpBuffer(&msg);
2741 PQfinish(primary_conn);
2742
2743 exit(ERR_BAD_CONFIG);
2744 }
2745
2746 appendPQExpBufferStr(&msg,
2747 _("prerequisites for using pg_rewind are met"));
2748
2749 if (runtime_options.dry_run == true)
2750 {
2751 log_info("%s", msg.data);
2752 }
2753 else
2754 {
2755 log_verbose(LOG_INFO, "%s", msg.data);
2756 }
2757 termPQExpBuffer(&msg);
2758
2759 /*
2760 * Archive requested configuration files.
2761 *
2762 * In --dry-run mode this acts as a check that the files can be archived, though
2763 * errors will only be logged; any copied files will be deleted and --dry-run
2764 * execution will continue.
2765 */
2766 _do_node_archive_config();
2767
2768 /* execute pg_rewind */
2769 initPQExpBuffer(&command);
2770
2771 if (runtime_options.force_rewind_path[0] != '\0')
2772 {
2773 appendPQExpBuffer(&command,
2774 "%s -D ",
2775 runtime_options.force_rewind_path);
2776 }
2777 else
2778 {
2779 make_pg_path(&command, "pg_rewind");
2780 appendPQExpBufferStr(&command,
2781 " -D ");
2782 }
2783
2784 appendShellString(&command,
2785 config_file_options.data_directory);
2786
2787 appendPQExpBuffer(&command,
2788 " --source-server='%s'",
2789 primary_node_record.conninfo);
2790
2791 if (runtime_options.dry_run == true)
2792 {
2793 log_info(_("pg_rewind would now be executed"));
2794 log_detail(_("pg_rewind command is:\n %s"),
2795 command.data);
2796 }
2797 else
2798 {
2799 log_notice(_("executing pg_rewind"));
2800 log_detail(_("pg_rewind command is \"%s\""),
2801 command.data);
2802
2803 /*
2804 * In Pg13 and later, pg_rewind will attempt to start up a server which
2805 * was not cleanly shut down in single user mode. This will fail if
2806 * "standby.signal" is present. We'll remove it and restore it after
2807 * pg_rewind runs.
2808 */
2809 if (hide_standby_signal == true)
2810 {
2811 char standby_signal_file_path[MAXPGPATH] = "";
2812
2813 log_notice(_("temporarily removing \"standby.signal\""));
2814 log_detail(_("this is required so pg_rewind can fix the unclean shutdown"));
2815
2816 make_standby_signal_path(standby_signal_file_path);
2817
2818 if (unlink(standby_signal_file_path) < 0 && errno != ENOENT)
2819 {
2820 log_error(_("unable to remove \"standby.signal\" file in data directory \"%s\""),
2821 standby_signal_file_path);
2822 log_detail("%s", strerror(errno));
2823 exit(ERR_REJOIN_FAIL);
2824 }
2825 }
2826
2827 initPQExpBuffer(&command_output);
2828
2829 ret = local_command(command.data,
2830 &command_output);
2831
2832 termPQExpBuffer(&command);
2833
2834 if (hide_standby_signal == true)
2835 {
2836 /*
2837 * Restore standby.signal if we previously removed it, regardless
2838 * of whether the pg_rewind operation failed.
2839 */
2840 log_notice(_("recreating \"standby.signal\""));
2841 write_standby_signal();
2842 }
2843
2844 if (ret == false)
2845 {
2846 log_error(_("pg_rewind execution failed"));
2847 log_detail("%s", command_output.data);
2848
2849 termPQExpBuffer(&command_output);
2850
2851 exit(ERR_REJOIN_FAIL);
2852 }
2853
2854 termPQExpBuffer(&command_output);
2855
2856 /* Restore any previously archived config files */
2857 _do_node_restore_config();
2858
2859 initPQExpBuffer(&filebuf);
2860
2861 /* remove any recovery.done file copied in by pg_rewind */
2862 appendPQExpBuffer(&filebuf,
2863 "%s/recovery.done",
2864 config_file_options.data_directory);
2865
2866 if (stat(filebuf.data, &statbuf) == 0)
2867 {
2868 log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
2869
2870 if (unlink(filebuf.data) == -1)
2871 {
2872 log_warning(_("unable to delete \"%s\""),
2873 filebuf.data);
2874 log_detail("%s", strerror(errno));
2875 }
2876 }
2877 termPQExpBuffer(&filebuf);
2878
2879 /*
2880 * Delete any replication slots copied in by pg_rewind.
2881 *
2882 * TODO:
2883 * - from PostgreSQL 11, this will be handled by pg_rewind, so
2884 * we can skip this step from that version; see commit
2885 * 266b6acb312fc440c1c1a2036aa9da94916beac6
2886 * - possibly delete contents of various other directories
2887 * as per the above commit for pre-PostgreSQL 11
2888 */
2889 {
2890 PQExpBufferData slotdir_path;
2891 DIR *slotdir;
2892 struct dirent *slotdir_ent;
2893
2894 initPQExpBuffer(&slotdir_path);
2895
2896 appendPQExpBuffer(&slotdir_path,
2897 "%s/pg_replslot",
2898 config_file_options.data_directory);
2899
2900 slotdir = opendir(slotdir_path.data);
2901
2902 if (slotdir == NULL)
2903 {
2904 log_warning(_("unable to open replication slot directory \"%s\""),
2905 slotdir_path.data);
2906 log_detail("%s", strerror(errno));
2907 }
2908 else
2909 {
2910 while ((slotdir_ent = readdir(slotdir)) != NULL) {
2911 struct stat statbuf;
2912 PQExpBufferData slotdir_ent_path;
2913
2914 if (strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
2915 continue;
2916
2917 initPQExpBuffer(&slotdir_ent_path);
2918
2919 appendPQExpBuffer(&slotdir_ent_path,
2920 "%s/%s",
2921 slotdir_path.data,
2922 slotdir_ent->d_name);
2923
2924 if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
2925 {
2926 termPQExpBuffer(&slotdir_ent_path);
2927 continue;
2928 }
2929
2930 log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
2931 if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
2932 {
2933 log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
2934 log_detail("%s", strerror(errno));
2935 log_hint(_("directory may need to be manually removed"));
2936 }
2937
2938 termPQExpBuffer(&slotdir_ent_path);
2939 }
2940
2941 closedir(slotdir);
2942 }
2943 termPQExpBuffer(&slotdir_path);
2944 }
2945 }
2946 }
2947
2948 if (runtime_options.dry_run == true)
2949 {
2950 log_info(_("prerequisites for executing NODE REJOIN are met"));
2951 exit(SUCCESS);
2952 }
2953
2954 initPQExpBuffer(&follow_output);
2955
2956 /*
2957 * do_standby_follow_internal() can handle situations where the follow
2958 * target is not the primary, so requires database handles to both
2959 * (even if they point to the same node). For the time being,
2960 * "node rejoin" will only attach a standby to the primary.
2961 */
2962 success = do_standby_follow_internal(primary_conn,
2963 primary_conn,
2964 &primary_node_record,
2965 &follow_output,
2966 ERR_REJOIN_FAIL,
2967 &follow_error_code);
2968
2969 if (success == false)
2970 {
2971 log_error(_("NODE REJOIN failed"));
2972
2973 if (strlen(follow_output.data))
2974 log_detail("%s", follow_output.data);
2975
2976 create_event_notification(primary_conn,
2977 &config_file_options,
2978 config_file_options.node_id,
2979 "node_rejoin",
2980 success,
2981 follow_output.data);
2982
2983 PQfinish(primary_conn);
2984
2985 termPQExpBuffer(&follow_output);
2986 exit(follow_error_code);
2987 }
2988
2989 /*
2990 * Actively check that node actually started and connected to primary,
2991 * if not exit with ERR_REJOIN_FAIL.
2992 *
2993 * This check can be overridden with -W/--no-wait, in which case a one-time
2994 * check will be carried out.
2995 */
2996 if (runtime_options.no_wait == false)
2997 {
2998 standy_join_status join_success = check_standby_join(primary_conn,
2999 &primary_node_record,
3000 &local_node_record);
3001
3002 create_event_notification(primary_conn,
3003 &config_file_options,
3004 config_file_options.node_id,
3005 "node_rejoin",
3006 join_success == JOIN_SUCCESS ? true : false,
3007 follow_output.data);
3008
3009 if (join_success != JOIN_SUCCESS)
3010 {
3011 termPQExpBuffer(&follow_output);
3012 log_error(_("NODE REJOIN failed"));
3013
3014 if (join_success == JOIN_FAIL_NO_PING) {
3015 log_detail(_("local node \"%s\" did not become available start after %i seconds"),
3016 config_file_options.node_name,
3017 config_file_options.node_rejoin_timeout);
3018 }
3019 else {
3020 log_detail(_("no active record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
3021 config_file_options.node_name,
3022 primary_node_record.node_name);
3023 }
3024 log_hint(_("check the PostgreSQL log on the local node"));
3025
3026 exit(ERR_REJOIN_FAIL);
3027 }
3028 }
3029 else
3030 {
3031 /* -W/--no-wait provided - check once */
3032 NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL);
3033 if (node_attached == NODE_ATTACHED)
3034 success = true;
3035 }
3036
3037 /*
3038 * Handle replication slots:
3039 * - if a slot for the new upstream exists, delete that
3040 * - warn about any other inactive replication slots
3041 */
3042 if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots)
3043 {
3044 PGconn *local_conn = NULL;
3045 local_conn = establish_db_connection(config_file_options.conninfo, false);
3046
3047 if (PQstatus(local_conn) != CONNECTION_OK)
3048 {
3049 log_warning(_("unable to connect to local node to check replication slot status"));
3050 log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary"));
3051 }
3052 else
3053 {
3054 KeyValueList inactive_replication_slots = {NULL, NULL};
3055 KeyValueListCell *cell = NULL;
3056 int inactive_count = 0;
3057 PQExpBufferData slotinfo;
3058
3059 drop_replication_slot_if_exists(local_conn,
3060 config_file_options.node_id,
3061 primary_node_record.slot_name);
3062
3063 (void) get_inactive_replication_slots(local_conn, &inactive_replication_slots);
3064
3065 initPQExpBuffer(&slotinfo);
3066 for (cell = inactive_replication_slots.head; cell; cell = cell->next)
3067 {
3068 appendPQExpBuffer(&slotinfo,
3069 " - %s (%s)", cell->key, cell->value);
3070 inactive_count++;
3071 }
3072
3073 if (inactive_count > 0)
3074 {
3075 log_warning(_("%i inactive replication slots detected"), inactive_count);
3076 log_detail(_("inactive replication slots:\n%s"), slotinfo.data);
3077 log_hint(_("these replication slots may need to be removed manually"));
3078 }
3079
3080 termPQExpBuffer(&slotinfo);
3081
3082 PQfinish(local_conn);
3083 }
3084 }
3085
3086 if (success == true)
3087 {
3088 log_notice(_("NODE REJOIN successful"));
3089 log_detail("%s", follow_output.data);
3090 }
3091 else
3092 {
3093 /*
3094 * if we reach here, no record found in upstream node's pg_stat_replication
3095 */
3096 log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
3097 log_hint(_("you will need to manually check the node's replication status"));
3098 }
3099 termPQExpBuffer(&follow_output);
3100
3101 return;
3102 }
3103
3104
3105 /*
3106 * Currently for testing purposes only, not documented;
3107 * use at own risk!
3108 */
3109
3110 void
do_node_control(void)3111 do_node_control(void)
3112 {
3113 PGconn *conn = NULL;
3114 pid_t wal_receiver_pid = UNKNOWN_PID;
3115 conn = establish_db_connection(config_file_options.conninfo, true);
3116
3117 if (runtime_options.disable_wal_receiver == true)
3118 {
3119 wal_receiver_pid = disable_wal_receiver(conn);
3120
3121 PQfinish(conn);
3122
3123 if (wal_receiver_pid == UNKNOWN_PID)
3124 exit(ERR_BAD_CONFIG);
3125
3126 exit(SUCCESS);
3127 }
3128
3129 if (runtime_options.enable_wal_receiver == true)
3130 {
3131 wal_receiver_pid = enable_wal_receiver(conn, true);
3132
3133 PQfinish(conn);
3134
3135 if (wal_receiver_pid == UNKNOWN_PID)
3136 exit(ERR_BAD_CONFIG);
3137
3138 exit(SUCCESS);
3139 }
3140
3141 log_error(_("no option provided"));
3142
3143 PQfinish(conn);
3144 }
3145
3146
3147 /*
3148 * For "internal" use by `node rejoin` on the local node when
3149 * called by "standby switchover" from the remote node.
3150 *
3151 * This archives any configuration files in the data directory, which may be
3152 * overwritten by pg_rewind.
3153 *
3154 * Requires configuration file, optionally --config-archive-dir
3155 */
3156 static void
_do_node_archive_config(void)3157 _do_node_archive_config(void)
3158 {
3159 PQExpBufferData archive_dir;
3160 struct stat statbuf;
3161 struct dirent *arcdir_ent;
3162 DIR *arcdir;
3163
3164 KeyValueList config_files = {NULL, NULL};
3165 KeyValueListCell *cell = NULL;
3166 int copied_count = 0;
3167
3168 initPQExpBuffer(&archive_dir);
3169 format_archive_dir(&archive_dir);
3170
3171 /* sanity-check directory path */
3172 if (stat(archive_dir.data, &statbuf) == -1)
3173 {
3174 if (errno != ENOENT)
3175 {
3176 log_error(_("error encountered when checking archive directory \"%s\""),
3177 archive_dir.data);
3178 log_detail("%s", strerror(errno));
3179 termPQExpBuffer(&archive_dir);
3180 exit(ERR_BAD_CONFIG);
3181 }
3182
3183 /* attempt to create and open the directory */
3184 if (mkdir(archive_dir.data, S_IRWXU) != 0 && errno != EEXIST)
3185 {
3186 log_error(_("unable to create temporary archive directory \"%s\""),
3187 archive_dir.data);
3188 log_detail("%s", strerror(errno));
3189 termPQExpBuffer(&archive_dir);
3190 exit(ERR_BAD_CONFIG);
3191 }
3192
3193 if (runtime_options.dry_run == true)
3194 {
3195 log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data);
3196 }
3197 }
3198 else if (!S_ISDIR(statbuf.st_mode))
3199 {
3200 log_error(_("\"%s\" exists but is not a directory"),
3201 archive_dir.data);
3202 termPQExpBuffer(&archive_dir);
3203 exit(ERR_BAD_CONFIG);
3204 }
3205
3206 arcdir = opendir(archive_dir.data);
3207
3208 /* always attempt to open the directory */
3209 if (arcdir == NULL)
3210 {
3211 log_error(_("unable to open archive directory \"%s\""),
3212 archive_dir.data);
3213 log_detail("%s", strerror(errno));
3214 termPQExpBuffer(&archive_dir);
3215 exit(ERR_BAD_CONFIG);
3216 }
3217
3218 if (runtime_options.dry_run == false)
3219 {
3220
3221 /*
3222 * attempt to remove any existing files in the directory
3223 * TODO: collate problem files into list
3224 */
3225 while ((arcdir_ent = readdir(arcdir)) != NULL)
3226 {
3227 PQExpBufferData arcdir_ent_path;
3228
3229 initPQExpBuffer(&arcdir_ent_path);
3230
3231 appendPQExpBuffer(&arcdir_ent_path,
3232 "%s/%s",
3233 archive_dir.data,
3234 arcdir_ent->d_name);
3235
3236 if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
3237 {
3238 termPQExpBuffer(&arcdir_ent_path);
3239 continue;
3240 }
3241
3242 if (unlink(arcdir_ent_path.data) == -1)
3243 {
3244 log_error(_("unable to delete file in temporary archive directory"));
3245 log_detail(_("file is: \"%s\""), arcdir_ent_path.data);
3246 log_detail("%s", strerror(errno));
3247 closedir(arcdir);
3248 termPQExpBuffer(&arcdir_ent_path);
3249 exit(ERR_BAD_CONFIG);
3250 }
3251
3252 termPQExpBuffer(&arcdir_ent_path);
3253 }
3254 }
3255
3256 closedir(arcdir);
3257
3258
3259 /*
3260 * extract list of config files from --config-files
3261 */
3262 {
3263 int i = 0;
3264 int j = 0;
3265 int config_file_len = strlen(runtime_options.config_files);
3266
3267 char filenamebuf[MAXPGPATH] = "";
3268 PQExpBufferData pathbuf;
3269
3270 for (j = 0; j < config_file_len; j++)
3271 {
3272 if (runtime_options.config_files[j] == ',')
3273 {
3274 int filename_len = j - i;
3275
3276 if (filename_len >= MAXPGPATH)
3277 filename_len = MAXPGPATH - 1;
3278
3279 strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
3280
3281 filenamebuf[filename_len] = '\0';
3282
3283 initPQExpBuffer(&pathbuf);
3284
3285 appendPQExpBuffer(&pathbuf,
3286 "%s/%s",
3287 config_file_options.data_directory,
3288 filenamebuf);
3289
3290 key_value_list_set(&config_files,
3291 filenamebuf,
3292 pathbuf.data);
3293 termPQExpBuffer(&pathbuf);
3294 i = j + 1;
3295 }
3296 }
3297
3298 if (i < config_file_len)
3299 {
3300 int filename_len = config_file_len - i;
3301
3302 strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
3303
3304 filenamebuf[filename_len] = '\0';
3305
3306 initPQExpBuffer(&pathbuf);
3307 appendPQExpBuffer(&pathbuf,
3308 "%s/%s",
3309 config_file_options.data_directory,
3310 filenamebuf);
3311
3312 key_value_list_set(&config_files,
3313 filenamebuf,
3314 pathbuf.data);
3315 termPQExpBuffer(&pathbuf);
3316 }
3317 }
3318
3319
3320 for (cell = config_files.head; cell; cell = cell->next)
3321 {
3322 PQExpBufferData dest_file;
3323
3324 initPQExpBuffer(&dest_file);
3325
3326 appendPQExpBuffer(&dest_file,
3327 "%s/%s",
3328 archive_dir.data,
3329 cell->key);
3330
3331 if (stat(cell->value, &statbuf) == -1)
3332 {
3333 log_warning(_("specified file \"%s\" not found, skipping"),
3334 cell->value);
3335 }
3336 else
3337 {
3338 if (runtime_options.dry_run == true)
3339 {
3340 log_info("file \"%s\" would be copied to \"%s\"",
3341 cell->key, dest_file.data);
3342 copied_count++;
3343 }
3344 else
3345 {
3346 log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
3347 cell->key, dest_file.data);
3348 copy_file(cell->value, dest_file.data);
3349 copied_count++;
3350 }
3351 }
3352
3353 termPQExpBuffer(&dest_file);
3354 }
3355
3356 if (runtime_options.dry_run == true)
3357 {
3358 log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""),
3359 copied_count, archive_dir.data);
3360 }
3361 else
3362 {
3363 log_verbose(LOG_INFO, _("%i files copied to \"%s\""),
3364 copied_count, archive_dir.data);
3365 }
3366
3367 if (runtime_options.dry_run == true)
3368 {
3369 /*
3370 * Delete directory in --dry-run mode - it should be empty unless it's been
3371 * interfered with for some reason, in which case manual intervention is
3372 * required
3373 */
3374 if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
3375 {
3376 log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
3377 log_detail("%s", strerror(errno));
3378 log_hint(_("directory may need to be manually removed"));
3379 }
3380 else
3381 {
3382 log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data);
3383 }
3384 }
3385
3386 termPQExpBuffer(&archive_dir);
3387 }
3388
3389
3390 /*
3391 * Intended mainly for "internal" use by `standby switchover`, which
3392 * calls this on the target server to restore any configuration files
3393 * to the data directory, which may have been overwritten by an operation
3394 * like pg_rewind
3395 *
3396 * Not designed to be called if the instance is running, but does
3397 * not currently check.
3398 *
3399 * Requires -D/--pgdata, optionally --config-archive-dir
3400 *
3401 * Removes --config-archive-dir after successful copy
3402 */
3403
3404 static void
_do_node_restore_config(void)3405 _do_node_restore_config(void)
3406 {
3407 PQExpBufferData archive_dir;
3408
3409 DIR *arcdir;
3410 struct dirent *arcdir_ent;
3411 int copied_count = 0;
3412 bool copy_ok = true;
3413
3414 initPQExpBuffer(&archive_dir);
3415
3416 format_archive_dir(&archive_dir);
3417
3418 arcdir = opendir(archive_dir.data);
3419
3420 if (arcdir == NULL)
3421 {
3422 log_error(_("unable to open archive directory \"%s\""),
3423 archive_dir.data);
3424 log_detail("%s", strerror(errno));
3425 termPQExpBuffer(&archive_dir);
3426 exit(ERR_BAD_CONFIG);
3427 }
3428
3429 while ((arcdir_ent = readdir(arcdir)) != NULL)
3430 {
3431 struct stat statbuf;
3432 PQExpBufferData src_file_path;
3433 PQExpBufferData dest_file_path;
3434
3435 initPQExpBuffer(&src_file_path);
3436
3437 appendPQExpBuffer(&src_file_path,
3438 "%s/%s",
3439 archive_dir.data,
3440 arcdir_ent->d_name);
3441
3442 /* skip non-files */
3443 if (stat(src_file_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
3444 {
3445 termPQExpBuffer(&src_file_path);
3446 continue;
3447 }
3448
3449 initPQExpBuffer(&dest_file_path);
3450
3451 appendPQExpBuffer(&dest_file_path,
3452 "%s/%s",
3453 config_file_options.data_directory,
3454 arcdir_ent->d_name);
3455
3456 log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
3457 src_file_path.data, dest_file_path.data);
3458
3459 if (copy_file(src_file_path.data, dest_file_path.data) == false)
3460 {
3461 copy_ok = false;
3462 log_warning(_("unable to copy \"%s\" to \"%s\""),
3463 arcdir_ent->d_name, runtime_options.data_dir);
3464 }
3465 else
3466 {
3467 unlink(src_file_path.data);
3468 copied_count++;
3469 }
3470
3471 termPQExpBuffer(&dest_file_path);
3472 termPQExpBuffer(&src_file_path);
3473 }
3474
3475 closedir(arcdir);
3476
3477 log_notice(_("%i files copied to %s"),
3478 copied_count,
3479 config_file_options.data_directory);
3480
3481 if (copy_ok == false)
3482 {
3483 log_warning(_("unable to copy all files from \"%s\""), archive_dir.data);
3484 }
3485 else
3486 {
3487 /*
3488 * Finally, delete directory - it should be empty unless it's been
3489 * interfered with for some reason, in which case manual intervention is
3490 * required
3491 */
3492 if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
3493 {
3494 log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
3495 log_detail("%s", strerror(errno));
3496 log_hint(_("directory may need to be manually removed"));
3497 }
3498 else
3499 {
3500 log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
3501 }
3502 }
3503
3504 termPQExpBuffer(&archive_dir);
3505
3506 return;
3507 }
3508
3509
3510 static void
format_archive_dir(PQExpBufferData * archive_dir)3511 format_archive_dir(PQExpBufferData *archive_dir)
3512 {
3513 appendPQExpBuffer(archive_dir,
3514 "%s/repmgr-config-archive-%s",
3515 runtime_options.config_archive_dir,
3516 config_file_options.node_name);
3517
3518 log_verbose(LOG_DEBUG, "using archive directory \"%s\"", archive_dir->data);
3519 }
3520
3521
/*
 * Copy the contents of "src_file" to "dest_file", creating or truncating
 * the destination; the destination's permissions are set to owner
 * read/write only.
 *
 * Returns true if the entire file was copied, false if either file could
 * not be opened or a read or write error occurred (in which case the
 * destination may contain partial data).
 */
static bool
copy_file(const char *src_file, const char *dest_file)
{
	FILE	   *ptr_old;
	FILE	   *ptr_new;
	int			c;
	bool		copy_ok = true;

	ptr_old = fopen(src_file, "r");

	if (ptr_old == NULL)
		return false;

	ptr_new = fopen(dest_file, "w");

	if (ptr_new == NULL)
	{
		fclose(ptr_old);
		return false;
	}

	/* best effort only - a failure to change the mode is not treated as fatal */
	chmod(dest_file, S_IRUSR | S_IWUSR);

	/*
	 * Loop on fgetc()'s return value rather than feof(): on a read error
	 * (e.g. the source is a directory) fgetc() returns EOF without the
	 * end-of-file indicator ever being set, so a feof()-only loop would
	 * never terminate.
	 */
	while ((c = fgetc(ptr_old)) != EOF)
	{
		if (fputc(c, ptr_new) == EOF)
		{
			copy_ok = false;
			break;
		}
	}

	/* distinguish a read error from a genuine end-of-file */
	if (ferror(ptr_old))
		copy_ok = false;

	/* buffered data is flushed on close, so a failure here means lost data */
	if (fclose(ptr_new) != 0)
		copy_ok = false;

	fclose(ptr_old);

	return copy_ok;
}
3563
3564
3565 void
do_node_help(void)3566 do_node_help(void)
3567 {
3568 print_help_header();
3569
3570 printf(_("Usage:\n"));
3571 printf(_(" %s [OPTIONS] node status\n"), progname());
3572 printf(_(" %s [OPTIONS] node check\n"), progname());
3573 printf(_(" %s [OPTIONS] node rejoin\n"), progname());
3574 printf(_(" %s [OPTIONS] node service\n"), progname());
3575 puts("");
3576
3577 printf(_("NODE STATUS\n"));
3578 puts("");
3579 printf(_(" \"node status\" displays an overview of a node's basic information and replication status.\n"));
3580 puts("");
3581 printf(_(" Configuration file required, runs on local node only.\n"));
3582 puts("");
3583 printf(_(" --csv emit output as CSV\n"));
3584 puts("");
3585
3586 printf(_("NODE CHECK\n"));
3587 puts("");
3588 printf(_(" \"node check\" performs some health checks on a node from a replication perspective.\n"));
3589 puts("");
3590 printf(_(" Configuration file required, runs on local node only.\n"));
3591 puts("");
3592 printf(_(" Connection options:\n"));
3593 printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n"));
3594 puts("");
3595 printf(_(" Output options:\n"));
3596 printf(_(" --csv emit output as CSV (not available for individual check output)\n"));
3597 printf(_(" --nagios emit output in Nagios format (individual check output only)\n"));
3598 puts("");
3599 printf(_(" Following options check an individual status:\n"));
3600 printf(_(" --archive-ready number of WAL files ready for archiving\n"));
3601 printf(_(" --downstream whether all downstream nodes are connected\n"));
3602 printf(_(" --uptream whether the node is connected to its upstream\n"));
3603 printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
3604 printf(_(" --role check node has expected role\n"));
3605 printf(_(" --slots check for inactive replication slots\n"));
3606 printf(_(" --missing-slots check for missing replication slots\n"));
3607 printf(_(" --data-directory-config check repmgr's data directory configuration\n"));
3608
3609 puts("");
3610
3611 printf(_("NODE REJOIN\n"));
3612 puts("");
3613 printf(_(" \"node rejoin\" enables a dormant (stopped) node to be rejoined to the replication cluster.\n"));
3614 puts("");
3615 printf(_(" Configuration file required, runs on local node only.\n"));
3616 puts("");
3617 printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
3618 " (including usability of \"pg_rewind\" if requested)\n"));
3619 printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n"));
3620 printf(_(" (9.3 and 9.4 - provide full \"pg_rewind\" path)\n"));
3621
3622 printf(_(" --config-files comma-separated list of configuration files to retain\n" \
3623 " after executing \"pg_rewind\"\n"));
3624 printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
3625 " (default: /tmp)\n"));
3626 printf(_(" -W, --no-wait don't wait for the node to rejoin cluster\n"));
3627 puts("");
3628
3629 printf(_("NODE SERVICE\n"));
3630 puts("");
3631 printf(_(" \"node service\" executes a system service command to stop/start/restart/reload a node\n" \
3632 " or optionally display which command would be executed\n"));
3633 puts("");
3634 printf(_(" Configuration file required, runs on local node only.\n"));
3635 puts("");
3636 printf(_(" --dry-run show what action would be performed, but don't execute it\n"));
3637 printf(_(" --action action to perform (one of \"start\", \"stop\", \"restart\" or \"reload\")\n"));
3638 printf(_(" --list-actions show what command would be performed for each action\n"));
3639 printf(_(" --checkpoint issue a CHECKPOINT before stopping or restarting the node\n"));
3640 printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n"));
3641
3642 puts("");
3643
3644 printf(_("%s home page: <%s>\n"), "repmgr", REPMGR_URL);
3645 }
3646