1 /* -*-pgsql-c-*- */
2 /*
3  * $Header$
4  *
5  * pgpool: a language independent connection pool server for PostgreSQL
6  * written by Tatsuo Ishii
7  *
8  * Copyright (c) 2003-2014	PgPool Global Development Group
9  *
10  * Permission to use, copy, modify, and distribute this software and
11  * its documentation for any purpose and without fee is hereby
12  * granted, provided that the above copyright notice appear in all
13  * copies and that both that copyright notice and this permission
14  * notice appear in supporting documentation, and that the name of the
15  * author not be used in advertising or publicity pertaining to
16  * distribution of the software without specific, written prior
17  * permission. The author makes no representations about the
18  * suitability of this software for any purpose.  It is provided "as
19  * is" without express or implied warranty.
20  *
21  * recovery.c: online recovery process
22  *
23  */
24 
25 #include "config.h"
26 
27 #include <unistd.h>
28 #include <string.h>
29 #include "utils/elog.h"
30 
31 #include "pool.h"
32 #include "pool_config.h"
33 
34 #include "libpq-fe.h"
35 
36 #include "watchdog/wd_ipc_commands.h"
37 
38 #define WAIT_RETRY_COUNT (pool_config->recovery_timeout / 3)
39 
40 #define FIRST_STAGE 0
41 #define SECOND_STAGE 1
42 
43 static void exec_checkpoint(PGconn *conn);
44 static void exec_recovery(PGconn *conn, BackendInfo *master_backend, BackendInfo *recovery_backend, char stage);
45 static void exec_remote_start(PGconn *conn, BackendInfo *backend);
46 static PGconn *connect_backend_libpq(BackendInfo *backend);
47 static void check_postmaster_started(BackendInfo *backend);
48 
49 static char recovery_command[1024];
50 
51 extern volatile sig_atomic_t pcp_worker_wakeup_request;
52 
53 /*
54  * Start online recovery.
55  * "recovery_node" is the node to be recovered.
56  * Master or primary node is chosen in this function.
57  */
start_recovery(int recovery_node)58 void start_recovery(int recovery_node)
59 {
60 	int node_id;
61 	BackendInfo *backend;
62 	BackendInfo *recovery_backend;
63 	PGconn *conn;
64 	int failback_wait_count;
65 #define FAILBACK_WAIT_MAX_RETRY 5		/* 5 seconds should be enough for failback operation */
66 
67 	ereport(LOG,
68 		(errmsg("starting recovering node %d", recovery_node)));
69 
70 	if ( (recovery_node < 0) || (recovery_node >= pool_config->backend_desc->num_backends) )
71 		ereport(ERROR,
72 				(errmsg("node recovery failed, node id: %d is not valid", recovery_node)));
73 
74 	if (*(my_backend_status[(recovery_node)]) == CON_UNUSED)
75 		ereport(ERROR,
76 				(errmsg("node recovery failed, node id: %d is unused", recovery_node)));
77 
78 	if (VALID_BACKEND(recovery_node))
79 		ereport(ERROR,
80 				(errmsg("node recovery failed, node id: %d is alive", recovery_node)));
81 
82 	/* select master/primary node */
83 	node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID;
84 	backend = &pool_config->backend_desc->backend_info[node_id];
85 
86 	/* get node info to be recovered */
87 	recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];
88 
89 	conn = connect_backend_libpq(backend);
90 	if (conn == NULL)
91 		ereport(ERROR,
92 				(errmsg("node recovery failed, unable to connect to master node: %d ", node_id)));
93 
94 	PG_TRY();
95 	{
96 		/* 1st stage */
97 		if (REPLICATION)
98 		{
99 			exec_checkpoint(conn);
100 			ereport(LOG,
101 				(errmsg("node recovery, CHECKPOINT in the 1st stage done")));
102 		}
103 
104 		exec_recovery(conn, backend, recovery_backend, FIRST_STAGE);
105 
106 		ereport(LOG,
107 			(errmsg("node recovery, 1st stage is done")));
108 
109 		if (REPLICATION)
110 		{
111 			ereport(LOG,
112 				(errmsg("node recovery, starting 2nd stage")));
113 
114 			/* 2nd stage */
115 			*InRecovery = RECOVERY_ONLINE;
116 			if (pool_config->use_watchdog)
117 			{
118 				/* announce start recovery */
119 				if (COMMAND_OK != wd_start_recovery())
120 					ereport(ERROR,
121 							(errmsg("node recovery failed, failed to send start recovery packet")));
122 			}
123 
124 			if (wait_connection_closed() != 0)
125 				ereport(ERROR,
126 						(errmsg("node recovery failed, waiting connection closed in the other pgpools timeout")));
127 
128 			ereport(LOG,
129 				(errmsg("node recovery, all connections from clients have been closed")));
130 
131 			exec_checkpoint(conn);
132 
133 			ereport(LOG,
134 				(errmsg("node recovery"),
135 					 errdetail("CHECKPOINT in the 2nd stage done")));
136 
137 			exec_recovery(conn, backend, recovery_backend, SECOND_STAGE);
138 		}
139 
140 		exec_remote_start(conn, recovery_backend);
141 
142 		check_postmaster_started(recovery_backend);
143 
144 		ereport(LOG,
145 			(errmsg("node recovery, node: %d restarted", recovery_node)));
146 
147 		/*
148 		 * reset failover completion flag.  this is necessary since
149 		 * previous failover/failback will set the flag to 1.
150 		 */
151 		pcp_worker_wakeup_request = 0;
152 
153 		/* send failback request to pgpool parent */
154 		send_failback_request(recovery_node,false, false);
155 
156 		/* wait for failback */
157 		failback_wait_count = 0;
158 		while (!pcp_worker_wakeup_request)
159 		{
160 			struct timeval t = {1, 0};
161 			/* polling SIGUSR2 signal every 1 sec */
162 			select(0, NULL, NULL, NULL, &t);
163 			failback_wait_count++;
164 			if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY)
165 			{
166 				ereport(LOG,
167 					(errmsg("node recovery"),
168 						errdetail("waiting for wake up request is timeout(%d seconds)",
169 							   FAILBACK_WAIT_MAX_RETRY)));
170 
171 				break;
172 			}
173 		}
174 		pcp_worker_wakeup_request = 0;
175 	}
176 	PG_CATCH();
177 	{
178 		PQfinish(conn);
179 		PG_RE_THROW();
180 	}
181 	PG_END_TRY();
182 
183 	PQfinish(conn);
184 
185 	ereport(LOG,
186 			(errmsg("recovery done")));
187 }
188 
189 /*
190  * Notice all children finishing recovery.
191  */
finish_recovery(void)192 void finish_recovery(void)
193 {
194 	/* announce end recovery */
195 	if (pool_config->use_watchdog && *InRecovery != RECOVERY_INIT)
196 	{
197 		wd_end_recovery();
198 	}
199 
200 	*InRecovery = RECOVERY_INIT;
201 	pool_signal_parent(SIGUSR2);
202 }
203 
204 /*
205  * Execute CHECKPOINT
206  */
exec_checkpoint(PGconn * conn)207 static void exec_checkpoint(PGconn *conn)
208 {
209 	PGresult *result;
210 	ereport(DEBUG1,
211 		(errmsg("recovery execute checkpoint, start checkpoint")));
212 
213 	result = PQexec(conn, "CHECKPOINT");
214 	if(PQresultStatus(result) !=  PGRES_COMMAND_OK)
215 		ereport(ERROR,
216 				(errmsg("executing recovery, execute CHECKPOINT failed")));
217 	PQclear(result);
218 
219 	ereport(DEBUG1,
220 		(errmsg("recovery execute checkpoint, finish checkpoint")));
221 }
222 
223 /*
224  * Call pgpool_recovery() function.
225  */
exec_recovery(PGconn * conn,BackendInfo * master_backend,BackendInfo * recovery_backend,char stage)226 static void exec_recovery(PGconn *conn, BackendInfo *master_backend, BackendInfo *recovery_backend, char stage)
227 {
228 	PGresult *result;
229 	char *hostname;
230 	char *script;
231 
232 	if (strlen(recovery_backend->backend_hostname) == 0 || *(recovery_backend->backend_hostname) == '/')
233 		hostname = "localhost";
234 	else
235 		hostname = recovery_backend->backend_hostname;
236 
237 	script = (stage == FIRST_STAGE) ?
238 		pool_config->recovery_1st_stage_command : pool_config->recovery_2nd_stage_command;
239 
240 	if (script == NULL || strlen(script) == 0)
241 	{
242 		/* do not execute script */
243 		return;
244 	}
245 
246 	/*
247 	 * Execute recovery command
248 	 */
249 	snprintf(recovery_command,
250 			 sizeof(recovery_command),
251 			 "SELECT pgpool_recovery('%s', '%s', '%s', '%d')",
252 			 script,
253 			 hostname,
254 			 recovery_backend->backend_data_directory,
255 			 master_backend->backend_port);
256 
257 	ereport(LOG,
258 		(errmsg("executing recovery"),
259 			 errdetail("starting recovery command: \"%s\"", recovery_command)));
260 
261 	ereport(LOG,
262 		(errmsg("executing recovery"),
263 			 errdetail("disabling statement_timeout")));
264 
265 	result = PQexec(conn, "SET statement_timeout To 0");
266 	if(PQresultStatus(result) !=  PGRES_COMMAND_OK)
267 		ereport(ERROR,
268 				(errmsg("executing recovery, SET STATEMENT_TIMEOUT failed at \"%s\"",
269 						(stage == FIRST_STAGE) ? "1st stage" : "2nd stage")));
270 
271 	PQclear(result);
272 
273 	ereport(DEBUG1,
274 		(errmsg("executing recovery, start recovery")));
275 
276 	result = PQexec(conn, recovery_command);
277 	if(PQresultStatus(result) !=  PGRES_TUPLES_OK)
278 		ereport(ERROR,
279 				(errmsg("executing recovery, execution of command failed at \"%s\"",
280 						(stage == FIRST_STAGE) ? "1st stage" : "2nd stage"),
281 				 errdetail("command:\"%s\"",script)));
282 
283 	PQclear(result);
284 
285 	ereport(DEBUG1,
286 		(errmsg("executing recovery, finish recovery")));
287 }
288 
289 /*
290  * Call pgpool_remote_start() function.
291  */
exec_remote_start(PGconn * conn,BackendInfo * backend)292 static void exec_remote_start(PGconn *conn, BackendInfo *backend)
293 {
294 	PGresult *result;
295 	char *hostname;
296 
297 	if (strlen(backend->backend_hostname) == 0 || *(backend->backend_hostname) == '/')
298 		hostname = "localhost";
299 	else
300 		hostname = backend->backend_hostname;
301 
302 	snprintf(recovery_command, sizeof(recovery_command),
303 			 "SELECT pgpool_remote_start('%s', '%s')",
304 			 hostname,
305 			 backend->backend_data_directory);
306 
307 	ereport(DEBUG1,
308 		(errmsg("executing remote start"),
309 			 errdetail("start pgpool_remote_start")));
310 
311 	result = PQexec(conn, recovery_command);
312 	if(PQresultStatus(result) !=  PGRES_TUPLES_OK)
313 		ereport(ERROR,
314 			(errmsg("executing remote start failed with error: \"%s\"",PQresultErrorMessage(result))));
315 
316 	PQclear(result);
317 
318 	ereport(DEBUG1,
319 		(errmsg("executing remote start"),
320 			 errdetail("finish pgpool_remote_start")));
321 }
322 
323 /*
324  * Check postmaster is started.
325  */
check_postmaster_started(BackendInfo * backend)326 static void check_postmaster_started(BackendInfo *backend)
327 {
328 	int i = 0;
329 	char port_str[16];
330 	PGconn *conn;
331 	char *dbname;
332 
333 	snprintf(port_str, sizeof(port_str),"%d", backend->backend_port);
334 
335 	/*
336 	 * First we try with "postgres" database.
337 	 */
338 	dbname = "postgres";
339 
340 	do {
341 		ConnStatusType r;
342 
343 		ereport(LOG,
344 			(errmsg("checking if postmaster is started"),
345 				errdetail("trying to connect to postmaster on hostname:%s database:%s user:%s (retry %d times)",
346 					   backend->backend_hostname, dbname, pool_config->recovery_user, i)));
347 
348 		conn = PQsetdbLogin(backend->backend_hostname,
349 							port_str,
350 							NULL,
351 							NULL,
352 							dbname,
353 							pool_config->recovery_user,
354 							pool_config->recovery_password);
355 
356 		r = PQstatus(conn);
357 		PQfinish(conn);
358 		if (r == CONNECTION_OK)
359 			return;
360 
361 		ereport(LOG,
362 			(errmsg("checking if postmaster is started"),
363 				errdetail("failed to connect to postmaster on hostname:%s database:%s user:%s",
364 					   backend->backend_hostname, dbname, pool_config->recovery_user)));
365 
366 		sleep(3);
367 	} while (i++ < 3);	/* XXX Hard coded retry (9 seconds) */
368 
369 	/*
370 	 * Retry with "template1" database.
371 	 */
372 	dbname = "template1";
373 	i = 0;
374 
375 	do {
376 		ConnStatusType r;
377 
378 		ereport(LOG,
379 			(errmsg("checking if postmaster is started"),
380 				errdetail("trying to connect to postmaster on hostname:%s database:%s user:%s (retry %d times)",
381 					   backend->backend_hostname, dbname, pool_config->recovery_user, i)));
382 
383 		conn = PQsetdbLogin(backend->backend_hostname,
384 							port_str,
385 							NULL,
386 							NULL,
387 							dbname,
388 							pool_config->recovery_user,
389 							pool_config->recovery_password);
390 
391 		r = PQstatus(conn);
392 		PQfinish(conn);
393 		if (r == CONNECTION_OK)
394 			return;
395 
396 		ereport(LOG,
397 			(errmsg("checking if postmaster is started"),
398 				 errdetail("failed to connect to postmaster on hostname:%s database:%s user:%s",
399 						   backend->backend_hostname, dbname, pool_config->recovery_user)));
400 
401 		if (WAIT_RETRY_COUNT != 0)
402 			sleep(3);
403 	} while (i++ < WAIT_RETRY_COUNT);
404 
405 	ereport(ERROR,
406 		(errmsg("recovery is checking if postmaster is started"),
407 			 errdetail("postmaster on hostname:\"%s\" database:\"%s\" user:\"%s\" failed to start in %d second",
408 					   backend->backend_hostname, dbname, pool_config->recovery_user, pool_config->recovery_timeout)));
409 }
410 
connect_backend_libpq(BackendInfo * backend)411 static PGconn *connect_backend_libpq(BackendInfo *backend)
412 {
413 	char port_str[16];
414 	PGconn *conn;
415 
416 	snprintf(port_str, sizeof(port_str),
417 			 "%d", backend->backend_port);
418 	conn = PQsetdbLogin(backend->backend_hostname,
419 						port_str,
420 						NULL,
421 						NULL,
422 						"template1",
423 						pool_config->recovery_user,
424 						pool_config->recovery_password);
425 
426 	if (PQstatus(conn) != CONNECTION_OK)
427 	{
428 		PQfinish(conn);
429 		return NULL;
430 	}
431 	return conn;
432 }
433 
434 /*
435  * Wait all connections are closed.
436  */
wait_connection_closed(void)437 int wait_connection_closed(void)
438 {
439 	int i = 0;
440 
441 	do {
442 
443 		if (Req_info->conn_counter == 0)
444 			return 0;
445 
446 		if (WAIT_RETRY_COUNT != 0)
447 			sleep(3);
448 
449 	} while (i++ < WAIT_RETRY_COUNT);
450 	ereport(LOG,
451 			(errmsg("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout)));
452 	return ensure_conn_counter_validity();
453 }
454 
ensure_conn_counter_validity(void)455 int ensure_conn_counter_validity(void)
456 {
457 	/*
458 	 * recovery_timeout was expired. Before returning with failure status,
459 	 * let's check if this is caused by the malformed conn_counter. If a child
460 	 * process abnormally exits (killed by SIGKILL or SEGFAULT, for example),
461 	 * then conn_counter is not decremented at process exit, thus it will
462 	 * never be returning to 0. This could be detected by checking if
463 	 * client_idle_limit_in_recovery is enabled and less value than
464 	 * recovery_timeout because all clients must be kicked out by the time
465 	 * when client_idle_limit_in_recovery is expired. If so, we should reset
466 	 * conn_counter to 0 also.
467 	 *
468 	 * See bug 431 for more info.
469 	 */
470 	if (pool_config->client_idle_limit_in_recovery == -1 ||
471 		(pool_config->client_idle_limit_in_recovery > 0 &&
472 		 pool_config->recovery_timeout >= pool_config->client_idle_limit_in_recovery))
473 	{
474 		ereport(LOG,
475 				(errmsg("wait_connection_closed: mulformed conn_counter (%d) detected. reset it to 0",
476 						Req_info->conn_counter)));
477 		Req_info->conn_counter = 0;
478 		return 0;
479 	}
480 	return 1;
481 }
482