1 /* -*-pgsql-c-*- */
2 /*
3 * $Header$
4 *
5 * pgpool: a language independent connection pool server for PostgreSQL
6 * written by Tatsuo Ishii
7 *
8 * Copyright (c) 2003-2014 PgPool Global Development Group
9 *
10 * Permission to use, copy, modify, and distribute this software and
11 * its documentation for any purpose and without fee is hereby
12 * granted, provided that the above copyright notice appear in all
13 * copies and that both that copyright notice and this permission
14 * notice appear in supporting documentation, and that the name of the
15 * author not be used in advertising or publicity pertaining to
16 * distribution of the software without specific, written prior
17 * permission. The author makes no representations about the
18 * suitability of this software for any purpose. It is provided "as
19 * is" without express or implied warranty.
20 *
21 * recovery.c: online recovery process
22 *
23 */
24
25 #include "config.h"
26
27 #include <unistd.h>
28 #include <string.h>
29 #include "utils/elog.h"
30
31 #include "pool.h"
32 #include "pool_config.h"
33
34 #include "libpq-fe.h"
35
36 #include "watchdog/wd_ipc_commands.h"
37
38 #define WAIT_RETRY_COUNT (pool_config->recovery_timeout / 3)
39
40 #define FIRST_STAGE 0
41 #define SECOND_STAGE 1
42
43 static void exec_checkpoint(PGconn *conn);
44 static void exec_recovery(PGconn *conn, BackendInfo *master_backend, BackendInfo *recovery_backend, char stage);
45 static void exec_remote_start(PGconn *conn, BackendInfo *backend);
46 static PGconn *connect_backend_libpq(BackendInfo *backend);
47 static void check_postmaster_started(BackendInfo *backend);
48
49 static char recovery_command[1024];
50
51 extern volatile sig_atomic_t pcp_worker_wakeup_request;
52
53 /*
54 * Start online recovery.
55 * "recovery_node" is the node to be recovered.
56 * Master or primary node is chosen in this function.
57 */
void start_recovery(int recovery_node)
{
	int node_id;
	BackendInfo *backend;			/* master/primary node: the recovery source */
	BackendInfo *recovery_backend;	/* node being recovered */
	PGconn *conn;					/* libpq connection to the master/primary */
	int failback_wait_count;
#define FAILBACK_WAIT_MAX_RETRY 5		/* 5 seconds should be enough for failback operation */

	ereport(LOG,
		(errmsg("starting recovering node %d", recovery_node)));

	/* Validate the target node: must exist, be configured, and be detached. */
	if ( (recovery_node < 0) || (recovery_node >= pool_config->backend_desc->num_backends) )
		ereport(ERROR,
				(errmsg("node recovery failed, node id: %d is not valid", recovery_node)));

	if (*(my_backend_status[(recovery_node)]) == CON_UNUSED)
		ereport(ERROR,
				(errmsg("node recovery failed, node id: %d is unused", recovery_node)));

	if (VALID_BACKEND(recovery_node))
		ereport(ERROR,
				(errmsg("node recovery failed, node id: %d is alive", recovery_node)));

	/* select master/primary node */
	node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID;
	backend = &pool_config->backend_desc->backend_info[node_id];

	/* get node info to be recovered */
	recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];

	conn = connect_backend_libpq(backend);
	if (conn == NULL)
		ereport(ERROR,
			(errmsg("node recovery failed, unable to connect to master node: %d ", node_id)));

	/*
	 * From here on ereport(ERROR) raised by any step lands in PG_CATCH,
	 * which closes the connection before re-throwing.
	 */
	PG_TRY();
	{
		/* 1st stage */
		if (REPLICATION)
		{
			exec_checkpoint(conn);
			ereport(LOG,
				(errmsg("node recovery, CHECKPOINT in the 1st stage done")));
		}

		exec_recovery(conn, backend, recovery_backend, FIRST_STAGE);

		ereport(LOG,
			(errmsg("node recovery, 1st stage is done")));

		if (REPLICATION)
		{
			ereport(LOG,
				(errmsg("node recovery, starting 2nd stage")));

			/* 2nd stage */
			*InRecovery = RECOVERY_ONLINE;
			if (pool_config->use_watchdog)
			{
				/* announce start recovery */
				if (COMMAND_OK != wd_start_recovery())
					ereport(ERROR,
							(errmsg("node recovery failed, failed to send start recovery packet")));
			}

			/*
			 * The 2nd stage must run with no live client connections, so
			 * wait until they drain (locally and in the other pgpools).
			 */
			if (wait_connection_closed() != 0)
				ereport(ERROR,
						(errmsg("node recovery failed, waiting connection closed in the other pgpools timeout")));

			ereport(LOG,
				(errmsg("node recovery, all connections from clients have been closed")));

			exec_checkpoint(conn);

			ereport(LOG,
					(errmsg("node recovery"),
					 errdetail("CHECKPOINT in the 2nd stage done")));

			exec_recovery(conn, backend, recovery_backend, SECOND_STAGE);
		}

		exec_remote_start(conn, recovery_backend);

		check_postmaster_started(recovery_backend);

		ereport(LOG,
			(errmsg("node recovery, node: %d restarted", recovery_node)));

		/*
		 * reset failover completion flag. this is necessary since
		 * previous failover/failback will set the flag to 1.
		 */
		pcp_worker_wakeup_request = 0;

		/* send failback request to pgpool parent */
		send_failback_request(recovery_node,false, false);

		/*
		 * Wait for failback to complete: the parent signals completion via
		 * SIGUSR2, whose handler sets pcp_worker_wakeup_request.
		 */
		failback_wait_count = 0;
		while (!pcp_worker_wakeup_request)
		{
			struct timeval t = {1, 0};
			/* polling SIGUSR2 signal every 1 sec */
			select(0, NULL, NULL, NULL, &t);
			failback_wait_count++;
			if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY)
			{
				ereport(LOG,
						(errmsg("node recovery"),
							 errdetail("waiting for wake up request is timeout(%d seconds)",
									   FAILBACK_WAIT_MAX_RETRY)));

				break;
			}
		}
		pcp_worker_wakeup_request = 0;
	}
	PG_CATCH();
	{
		/* always release the libpq connection before propagating the error */
		PQfinish(conn);
		PG_RE_THROW();
	}
	PG_END_TRY();

	PQfinish(conn);

	ereport(LOG,
		(errmsg("recovery done")));
}
188
189 /*
190 * Notice all children finishing recovery.
191 */
finish_recovery(void)192 void finish_recovery(void)
193 {
194 /* announce end recovery */
195 if (pool_config->use_watchdog && *InRecovery != RECOVERY_INIT)
196 {
197 wd_end_recovery();
198 }
199
200 *InRecovery = RECOVERY_INIT;
201 pool_signal_parent(SIGUSR2);
202 }
203
204 /*
205 * Execute CHECKPOINT
206 */
/*
 * Execute CHECKPOINT on the node behind "conn".
 *
 * Raises ERROR (via ereport, a non-local exit) when the command fails;
 * the PGresult is released before the exit so it cannot leak.
 */
static void exec_checkpoint(PGconn *conn)
{
	PGresult *result;

	ereport(DEBUG1,
		(errmsg("recovery execute checkpoint, start checkpoint")));

	result = PQexec(conn, "CHECKPOINT");
	if(PQresultStatus(result) != PGRES_COMMAND_OK)
	{
		/* free the result first: ereport(ERROR) longjmps and would leak it */
		PQclear(result);
		ereport(ERROR,
			(errmsg("executing recovery, execute CHECKPOINT failed")));
	}
	PQclear(result);

	ereport(DEBUG1,
		(errmsg("recovery execute checkpoint, finish checkpoint")));
}
222
223 /*
224 * Call pgpool_recovery() function.
225 */
/*
 * Run the configured recovery script via the pgpool_recovery() SQL
 * function on the master/primary node.
 *
 * conn             - libpq connection to the master/primary
 * master_backend   - backend serving as the recovery source
 * recovery_backend - backend being recovered
 * stage            - FIRST_STAGE or SECOND_STAGE; selects which
 *                    configured command (recovery_1st/2nd_stage_command)
 *                    to run
 *
 * Silently returns when no script is configured for the stage.  Raises
 * ERROR on failure; every PGresult is freed before the non-local exit
 * so it cannot leak.
 */
static void exec_recovery(PGconn *conn, BackendInfo *master_backend, BackendInfo *recovery_backend, char stage)
{
	PGresult *result;
	char *hostname;
	char *script;

	/* An empty host or a Unix-domain socket path means "localhost" */
	if (strlen(recovery_backend->backend_hostname) == 0 || *(recovery_backend->backend_hostname) == '/')
		hostname = "localhost";
	else
		hostname = recovery_backend->backend_hostname;

	script = (stage == FIRST_STAGE) ?
		pool_config->recovery_1st_stage_command : pool_config->recovery_2nd_stage_command;

	if (script == NULL || strlen(script) == 0)
	{
		/* do not execute script */
		return;
	}

	/*
	 * Execute recovery command
	 */
	snprintf(recovery_command,
			 sizeof(recovery_command),
			 "SELECT pgpool_recovery('%s', '%s', '%s', '%d')",
			 script,
			 hostname,
			 recovery_backend->backend_data_directory,
			 master_backend->backend_port);

	ereport(LOG,
		(errmsg("executing recovery"),
			 errdetail("starting recovery command: \"%s\"", recovery_command)));

	ereport(LOG,
		(errmsg("executing recovery"),
			 errdetail("disabling statement_timeout")));

	/* the recovery script may legitimately run longer than statement_timeout */
	result = PQexec(conn, "SET statement_timeout To 0");
	if(PQresultStatus(result) != PGRES_COMMAND_OK)
	{
		/* free the result first: ereport(ERROR) longjmps and would leak it */
		PQclear(result);
		ereport(ERROR,
			(errmsg("executing recovery, SET STATEMENT_TIMEOUT failed at \"%s\"",
					(stage == FIRST_STAGE) ? "1st stage" : "2nd stage")));
	}
	PQclear(result);

	ereport(DEBUG1,
		(errmsg("executing recovery, start recovery")));

	result = PQexec(conn, recovery_command);
	if(PQresultStatus(result) != PGRES_TUPLES_OK)
	{
		/* free the result first: ereport(ERROR) longjmps and would leak it */
		PQclear(result);
		ereport(ERROR,
			(errmsg("executing recovery, execution of command failed at \"%s\"",
					(stage == FIRST_STAGE) ? "1st stage" : "2nd stage"),
			 errdetail("command:\"%s\"",script)));
	}
	PQclear(result);

	ereport(DEBUG1,
		(errmsg("executing recovery, finish recovery")));
}
288
289 /*
290 * Call pgpool_remote_start() function.
291 */
/*
 * Restart the recovered node's postmaster by calling the
 * pgpool_remote_start() SQL function on the master/primary node.
 *
 * Raises ERROR on failure.  The error text is copied out of the
 * PGresult before freeing it: the original code reported it with
 * ereport(ERROR), which longjmps past PQclear and leaked the result.
 */
static void exec_remote_start(PGconn *conn, BackendInfo *backend)
{
	PGresult *result;
	char *hostname;

	/* An empty host or a Unix-domain socket path means "localhost" */
	if (strlen(backend->backend_hostname) == 0 || *(backend->backend_hostname) == '/')
		hostname = "localhost";
	else
		hostname = backend->backend_hostname;

	snprintf(recovery_command, sizeof(recovery_command),
			 "SELECT pgpool_remote_start('%s', '%s')",
			 hostname,
			 backend->backend_data_directory);

	ereport(DEBUG1,
		(errmsg("executing remote start"),
			 errdetail("start pgpool_remote_start")));

	result = PQexec(conn, recovery_command);
	if(PQresultStatus(result) != PGRES_TUPLES_OK)
	{
		char errbuf[1024];

		/*
		 * PQresultErrorMessage() points into "result"; copy it before
		 * freeing, then free before ereport(ERROR) longjmps.
		 */
		snprintf(errbuf, sizeof(errbuf), "%s", PQresultErrorMessage(result));
		PQclear(result);
		ereport(ERROR,
			(errmsg("executing remote start failed with error: \"%s\"", errbuf)));
	}

	PQclear(result);

	ereport(DEBUG1,
		(errmsg("executing remote start"),
			 errdetail("finish pgpool_remote_start")));
}
322
323 /*
324 * Check postmaster is started.
325 */
/*
 * Try to log in to the postmaster once with the given database.
 * Returns 1 when the connection attempt succeeded (CONNECTION_OK),
 * 0 otherwise.  The attempt and its outcome are logged either way;
 * "retry" is only used for the log message.
 */
static int attempt_login(BackendInfo *backend, const char *port_str,
						 const char *dbname, int retry)
{
	PGconn *conn;
	ConnStatusType r;

	ereport(LOG,
		(errmsg("checking if postmaster is started"),
			 errdetail("trying to connect to postmaster on hostname:%s database:%s user:%s (retry %d times)",
					   backend->backend_hostname, dbname, pool_config->recovery_user, retry)));

	conn = PQsetdbLogin(backend->backend_hostname,
						port_str,
						NULL,
						NULL,
						dbname,
						pool_config->recovery_user,
						pool_config->recovery_password);

	r = PQstatus(conn);
	PQfinish(conn);
	if (r == CONNECTION_OK)
		return 1;

	ereport(LOG,
		(errmsg("checking if postmaster is started"),
			 errdetail("failed to connect to postmaster on hostname:%s database:%s user:%s",
					   backend->backend_hostname, dbname, pool_config->recovery_user)));
	return 0;
}

/*
 * Check postmaster is started.
 *
 * First tries the "postgres" database a few times, then falls back to
 * "template1" for up to WAIT_RETRY_COUNT attempts (derived from
 * recovery_timeout).  Returns normally on success; raises ERROR when
 * the postmaster never became reachable.
 */
static void check_postmaster_started(BackendInfo *backend)
{
	int i;
	char port_str[16];
	char *dbname;

	snprintf(port_str, sizeof(port_str),"%d", backend->backend_port);

	/*
	 * First we try with "postgres" database.
	 */
	dbname = "postgres";
	i = 0;

	do {
		if (attempt_login(backend, port_str, dbname, i))
			return;
		sleep(3);
	} while (i++ < 3);	/* XXX Hard coded retry (9 seconds) */

	/*
	 * Retry with "template1" database.
	 */
	dbname = "template1";
	i = 0;

	do {
		if (attempt_login(backend, port_str, dbname, i))
			return;
		if (WAIT_RETRY_COUNT != 0)
			sleep(3);
	} while (i++ < WAIT_RETRY_COUNT);

	ereport(ERROR,
		(errmsg("recovery is checking if postmaster is started"),
			 errdetail("postmaster on hostname:\"%s\" database:\"%s\" user:\"%s\" failed to start in %d second",
					   backend->backend_hostname, dbname, pool_config->recovery_user, pool_config->recovery_timeout)));
}
410
connect_backend_libpq(BackendInfo * backend)411 static PGconn *connect_backend_libpq(BackendInfo *backend)
412 {
413 char port_str[16];
414 PGconn *conn;
415
416 snprintf(port_str, sizeof(port_str),
417 "%d", backend->backend_port);
418 conn = PQsetdbLogin(backend->backend_hostname,
419 port_str,
420 NULL,
421 NULL,
422 "template1",
423 pool_config->recovery_user,
424 pool_config->recovery_password);
425
426 if (PQstatus(conn) != CONNECTION_OK)
427 {
428 PQfinish(conn);
429 return NULL;
430 }
431 return conn;
432 }
433
434 /*
435 * Wait all connections are closed.
436 */
wait_connection_closed(void)437 int wait_connection_closed(void)
438 {
439 int i = 0;
440
441 do {
442
443 if (Req_info->conn_counter == 0)
444 return 0;
445
446 if (WAIT_RETRY_COUNT != 0)
447 sleep(3);
448
449 } while (i++ < WAIT_RETRY_COUNT);
450 ereport(LOG,
451 (errmsg("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout)));
452 return ensure_conn_counter_validity();
453 }
454
ensure_conn_counter_validity(void)455 int ensure_conn_counter_validity(void)
456 {
457 /*
458 * recovery_timeout was expired. Before returning with failure status,
459 * let's check if this is caused by the malformed conn_counter. If a child
460 * process abnormally exits (killed by SIGKILL or SEGFAULT, for example),
461 * then conn_counter is not decremented at process exit, thus it will
462 * never be returning to 0. This could be detected by checking if
463 * client_idle_limit_in_recovery is enabled and less value than
464 * recovery_timeout because all clients must be kicked out by the time
465 * when client_idle_limit_in_recovery is expired. If so, we should reset
466 * conn_counter to 0 also.
467 *
468 * See bug 431 for more info.
469 */
470 if (pool_config->client_idle_limit_in_recovery == -1 ||
471 (pool_config->client_idle_limit_in_recovery > 0 &&
472 pool_config->recovery_timeout >= pool_config->client_idle_limit_in_recovery))
473 {
474 ereport(LOG,
475 (errmsg("wait_connection_closed: mulformed conn_counter (%d) detected. reset it to 0",
476 Req_info->conn_counter)));
477 Req_info->conn_counter = 0;
478 return 0;
479 }
480 return 1;
481 }
482