1 /* -*-pgsql-c-*- */
2 /*
3  * $Header$
4  *
5  * pgpool: a language independent connection pool server for PostgreSQL
6  * written by Tatsuo Ishii
7  *
8  * Copyright (c) 2003-2018	PgPool Global Development Group
9  *
10  * Permission to use, copy, modify, and distribute this software and
11  * its documentation for any purpose and without fee is hereby
12  * granted, provided that the above copyright notice appear in all
13  * copies and that both that copyright notice and this permission
14  * notice appear in supporting documentation, and that the name of the
15  * author not be used in advertising or publicity pertaining to
16  * distribution of the software without specific, written prior
17  * permission. The author makes no representations about the
18  * suitability of this software for any purpose.  It is provided "as
19  * is" without express or implied warranty.
20  *
21  * recovery.c: online recovery process
22  *
23  */
24 
25 #include "config.h"
26 
27 #include <unistd.h>
28 #include <string.h>
29 #include "utils/elog.h"
30 
31 #include "pool.h"
32 #include "pool_config.h"
33 
34 #include "libpq-fe.h"
35 
36 #include "watchdog/wd_ipc_commands.h"
37 
/*
 * Number of polling iterations derived from recovery_timeout.  The wait
 * loops in this file that use it sleep 3 seconds per iteration.
 */
#define WAIT_RETRY_COUNT (pool_config->recovery_timeout / 3)

/* Stage selectors passed as the "stage" argument of exec_recovery() */
#define FIRST_STAGE 0
#define SECOND_STAGE 1

/* Helpers private to this file */
static void exec_checkpoint(PGconn *conn);
static void exec_recovery(PGconn *conn, BackendInfo * master_backend, BackendInfo * recovery_backend, char stage, int recovery_node);
static void exec_remote_start(PGconn *conn, BackendInfo * backend);
static PGconn *connect_backend_libpq(BackendInfo * backend);
static void check_postmaster_started(BackendInfo * backend);

/* Buffer for the SQL text built by exec_recovery() and exec_remote_start() */
static char recovery_command[1024];

/*
 * Flag polled by start_recovery() while it waits for failback to complete.
 * NOTE(review): presumably set from a signal handler defined elsewhere -
 * confirm.
 */
extern volatile sig_atomic_t pcp_worker_wakeup_request;
52 
53 /*
54  * Start online recovery.
55  * "recovery_node" is the node to be recovered.
56  * Master or primary node is chosen in this function.
57  */
58 void
start_recovery(int recovery_node)59 start_recovery(int recovery_node)
60 {
61 	int			node_id;
62 	BackendInfo *backend;
63 	BackendInfo *recovery_backend;
64 	PGconn	   *conn;
65 	int			failback_wait_count;
66 #define FAILBACK_WAIT_MAX_RETRY 5	/* 5 seconds should be enough for failback
67 									 * operation */
68 
69 	ereport(LOG,
70 			(errmsg("starting recovering node %d", recovery_node)));
71 
72 	if ((recovery_node < 0) || (recovery_node >= pool_config->backend_desc->num_backends))
73 		ereport(ERROR,
74 				(errmsg("node recovery failed, node id: %d is not valid", recovery_node)));
75 
76 	if (*(my_backend_status[(recovery_node)]) == CON_UNUSED)
77 		ereport(ERROR,
78 				(errmsg("node recovery failed, node id: %d is unused", recovery_node)));
79 
80 	if (VALID_BACKEND(recovery_node))
81 		ereport(ERROR,
82 				(errmsg("node recovery failed, node id: %d is alive", recovery_node)));
83 
84 	/* select master/primary node */
85 	node_id = MASTER_SLAVE ? PRIMARY_NODE_ID : REAL_MASTER_NODE_ID;
86 	backend = &pool_config->backend_desc->backend_info[node_id];
87 
88 	/* get node info to be recovered */
89 	recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];
90 
91 	conn = connect_backend_libpq(backend);
92 	if (conn == NULL)
93 		ereport(ERROR,
94 				(errmsg("node recovery failed, unable to connect to master node: %d ", node_id)));
95 
96 	PG_TRY();
97 	{
98 		/* 1st stage */
99 		if (REPLICATION)
100 		{
101 			exec_checkpoint(conn);
102 			ereport(LOG,
103 					(errmsg("node recovery, CHECKPOINT in the 1st stage done")));
104 		}
105 
106 		exec_recovery(conn, backend, recovery_backend, FIRST_STAGE, recovery_node);
107 
108 		ereport(LOG,
109 				(errmsg("node recovery, 1st stage is done")));
110 
111 		if (REPLICATION)
112 		{
113 			ereport(LOG,
114 					(errmsg("node recovery, starting 2nd stage")));
115 
116 			/* 2nd stage */
117 			*InRecovery = RECOVERY_ONLINE;
118 			if (pool_config->use_watchdog)
119 			{
120 				/* announce start recovery */
121 				if (COMMAND_OK != wd_start_recovery())
122 					ereport(ERROR,
123 							(errmsg("node recovery failed, failed to send start recovery packet")));
124 			}
125 
126 			if (wait_connection_closed() != 0)
127 				ereport(ERROR,
128 						(errmsg("node recovery failed, waiting connection closed in the other pgpools timeout")));
129 
130 			ereport(LOG,
131 					(errmsg("node recovery, all connections from clients have been closed")));
132 
133 			exec_checkpoint(conn);
134 
135 			ereport(LOG,
136 					(errmsg("node recovery"),
137 					 errdetail("CHECKPOINT in the 2nd stage done")));
138 
139 			exec_recovery(conn, backend, recovery_backend, SECOND_STAGE, recovery_node);
140 		}
141 
142 		exec_remote_start(conn, recovery_backend);
143 
144 		check_postmaster_started(recovery_backend);
145 
146 		ereport(LOG,
147 				(errmsg("node recovery, node: %d restarted", recovery_node)));
148 
149 		/*
150 		 * reset failover completion flag.  this is necessary since previous
151 		 * failover/failback will set the flag to 1.
152 		 */
153 		pcp_worker_wakeup_request = 0;
154 
155 		/* send failback request to pgpool parent */
156 		send_failback_request(recovery_node, false, REQ_DETAIL_CONFIRMED);
157 
158 		/* wait for failback */
159 		failback_wait_count = 0;
160 		while (!pcp_worker_wakeup_request)
161 		{
162 			struct timeval t = {1, 0};
163 
164 			/* polling SIGUSR2 signal every 1 sec */
165 			select(0, NULL, NULL, NULL, &t);
166 			failback_wait_count++;
167 			if (failback_wait_count >= FAILBACK_WAIT_MAX_RETRY)
168 			{
169 				ereport(LOG,
170 						(errmsg("node recovery"),
171 						 errdetail("waiting for wake up request is timeout(%d seconds)",
172 								   FAILBACK_WAIT_MAX_RETRY)));
173 
174 				break;
175 			}
176 		}
177 		pcp_worker_wakeup_request = 0;
178 	}
179 	PG_CATCH();
180 	{
181 		PQfinish(conn);
182 		PG_RE_THROW();
183 	}
184 	PG_END_TRY();
185 
186 	PQfinish(conn);
187 
188 	ereport(LOG,
189 			(errmsg("recovery done")));
190 }
191 
192 /*
193  * Notice all children finishing recovery.
194  */
195 void
finish_recovery(void)196 finish_recovery(void)
197 {
198 	/* announce end recovery */
199 	if (pool_config->use_watchdog && *InRecovery != RECOVERY_INIT)
200 	{
201 		wd_end_recovery();
202 	}
203 
204 	*InRecovery = RECOVERY_INIT;
205 	pool_signal_parent(SIGUSR2);
206 }
207 
208 /*
209  * Execute CHECKPOINT
210  */
211 static void
exec_checkpoint(PGconn * conn)212 exec_checkpoint(PGconn *conn)
213 {
214 	PGresult   *result;
215 
216 	ereport(DEBUG1,
217 			(errmsg("recovery execute checkpoint, start checkpoint")));
218 
219 	result = PQexec(conn, "CHECKPOINT");
220 	if (PQresultStatus(result) != PGRES_COMMAND_OK)
221 		ereport(ERROR,
222 				(errmsg("executing recovery, execute CHECKPOINT failed")));
223 	PQclear(result);
224 
225 	ereport(DEBUG1,
226 			(errmsg("recovery execute checkpoint, finish checkpoint")));
227 }
228 
229 /*
230  * Call pgpool_recovery() function.
231  */
232 static void
exec_recovery(PGconn * conn,BackendInfo * master_backend,BackendInfo * recovery_backend,char stage,int recovery_node)233 exec_recovery(PGconn *conn, BackendInfo * master_backend, BackendInfo * recovery_backend, char stage, int recovery_node)
234 {
235 	PGresult   *result;
236 	char	   *hostname;
237 	char	   *script;
238 
239 	if (strlen(recovery_backend->backend_hostname) == 0 || *(recovery_backend->backend_hostname) == '/')
240 		hostname = "localhost";
241 	else
242 		hostname = recovery_backend->backend_hostname;
243 
244 	script = (stage == FIRST_STAGE) ?
245 		pool_config->recovery_1st_stage_command : pool_config->recovery_2nd_stage_command;
246 
247 	if (script == NULL || strlen(script) == 0)
248 	{
249 		/* do not execute script */
250 		return;
251 	}
252 
253 	/*
254 	 * Execute recovery command
255 	 */
256 	snprintf(recovery_command,
257 			 sizeof(recovery_command),
258 			 "SELECT pgpool_recovery('%s', '%s', '%s', '%d', %d, '%d')",
259 			 script,
260 			 hostname,
261 			 recovery_backend->backend_data_directory,
262 			 master_backend->backend_port,
263 			 recovery_node,
264 			 recovery_backend->backend_port
265 		);
266 
267 	ereport(LOG,
268 			(errmsg("executing recovery"),
269 			 errdetail("starting recovery command: \"%s\"", recovery_command)));
270 
271 	ereport(LOG,
272 			(errmsg("executing recovery"),
273 			 errdetail("disabling statement_timeout")));
274 
275 	result = PQexec(conn, "SET statement_timeout To 0");
276 	if (PQresultStatus(result) != PGRES_COMMAND_OK)
277 		ereport(ERROR,
278 				(errmsg("executing recovery, SET STATEMENT_TIMEOUT failed at \"%s\"",
279 						(stage == FIRST_STAGE) ? "1st stage" : "2nd stage")));
280 
281 	PQclear(result);
282 
283 	ereport(DEBUG1,
284 			(errmsg("executing recovery, start recovery")));
285 
286 	result = PQexec(conn, recovery_command);
287 	if (PQresultStatus(result) != PGRES_TUPLES_OK)
288 		ereport(ERROR,
289 				(errmsg("executing recovery, execution of command failed at \"%s\"",
290 						(stage == FIRST_STAGE) ? "1st stage" : "2nd stage"),
291 				 errdetail("command:\"%s\"", script)));
292 
293 	PQclear(result);
294 
295 	ereport(DEBUG1,
296 			(errmsg("executing recovery, finish recovery")));
297 }
298 
299 /*
300  * Call pgpool_remote_start() function.
301  */
302 static void
exec_remote_start(PGconn * conn,BackendInfo * backend)303 exec_remote_start(PGconn *conn, BackendInfo * backend)
304 {
305 	PGresult   *result;
306 	char	   *hostname;
307 
308 	if (strlen(backend->backend_hostname) == 0 || *(backend->backend_hostname) == '/')
309 		hostname = "localhost";
310 	else
311 		hostname = backend->backend_hostname;
312 
313 	snprintf(recovery_command, sizeof(recovery_command),
314 			 "SELECT pgpool_remote_start('%s', '%s')",
315 			 hostname,
316 			 backend->backend_data_directory);
317 
318 	ereport(DEBUG1,
319 			(errmsg("executing remote start"),
320 			 errdetail("start pgpool_remote_start")));
321 
322 	result = PQexec(conn, recovery_command);
323 	if (PQresultStatus(result) != PGRES_TUPLES_OK)
324 		ereport(ERROR,
325 				(errmsg("executing remote start failed with error: \"%s\"", PQresultErrorMessage(result))));
326 
327 	PQclear(result);
328 
329 	ereport(DEBUG1,
330 			(errmsg("executing remote start"),
331 			 errdetail("finish pgpool_remote_start")));
332 }
333 
334 /*
335  * Check postmaster is started.
336  */
337 static void
check_postmaster_started(BackendInfo * backend)338 check_postmaster_started(BackendInfo * backend)
339 {
340 	int			i = 0;
341 	char		port_str[16];
342 	PGconn	   *conn;
343 	char	   *dbname;
344 	char	   *password = get_pgpool_config_user_password(pool_config->recovery_user,
345 														   pool_config->recovery_password);
346 
347 	snprintf(port_str, sizeof(port_str), "%d", backend->backend_port);
348 
349 	/*
350 	 * First we try with "postgres" database.
351 	 */
352 	dbname = "postgres";
353 
354 	do
355 	{
356 		ConnStatusType r;
357 
358 		ereport(LOG,
359 				(errmsg("checking if postmaster is started"),
360 				 errdetail("trying to connect to postmaster on hostname:%s database:%s user:%s (retry %d times)",
361 						   backend->backend_hostname, dbname, pool_config->recovery_user, i)));
362 
363 		conn = PQsetdbLogin(backend->backend_hostname,
364 							port_str,
365 							NULL,
366 							NULL,
367 							dbname,
368 							pool_config->recovery_user,
369 							password ? password : NULL);
370 
371 		r = PQstatus(conn);
372 		PQfinish(conn);
373 		if (r == CONNECTION_OK)
374 		{
375 			if (password)
376 				pfree(password);
377 			return;
378 		}
379 		ereport(LOG,
380 				(errmsg("checking if postmaster is started"),
381 				 errdetail("failed to connect to postmaster on hostname:%s database:%s user:%s",
382 						   backend->backend_hostname, dbname, pool_config->recovery_user)));
383 
384 		sleep(3);
385 	} while (i++ < 3);			/* XXX Hard coded retry (9 seconds) */
386 
387 	/*
388 	 * Retry with "template1" database.
389 	 */
390 	dbname = "template1";
391 	i = 0;
392 
393 	do
394 	{
395 		ConnStatusType r;
396 
397 		ereport(LOG,
398 				(errmsg("checking if postmaster is started"),
399 				 errdetail("trying to connect to postmaster on hostname:%s database:%s user:%s (retry %d times)",
400 						   backend->backend_hostname, dbname, pool_config->recovery_user, i)));
401 
402 		conn = PQsetdbLogin(backend->backend_hostname,
403 							port_str,
404 							NULL,
405 							NULL,
406 							dbname,
407 							pool_config->recovery_user,
408 							password ? password : NULL);
409 
410 		r = PQstatus(conn);
411 		PQfinish(conn);
412 		if (r == CONNECTION_OK)
413 		{
414 			if (password)
415 				pfree(password);
416 			return;
417 		}
418 
419 		ereport(LOG,
420 				(errmsg("checking if postmaster is started"),
421 				 errdetail("failed to connect to postmaster on hostname:%s database:%s user:%s",
422 						   backend->backend_hostname, dbname, pool_config->recovery_user)));
423 
424 		if (WAIT_RETRY_COUNT != 0)
425 			sleep(3);
426 	} while (i++ < WAIT_RETRY_COUNT);
427 
428 	if (password)
429 		pfree(password);
430 
431 	ereport(ERROR,
432 			(errmsg("recovery is checking if postmaster is started"),
433 			 errdetail("postmaster on hostname:\"%s\" database:\"%s\" user:\"%s\" failed to start in %d second",
434 					   backend->backend_hostname, dbname, pool_config->recovery_user, pool_config->recovery_timeout)));
435 }
436 
437 static PGconn *
connect_backend_libpq(BackendInfo * backend)438 connect_backend_libpq(BackendInfo * backend)
439 {
440 	char		port_str[16];
441 	PGconn	   *conn;
442 	char	   *password = get_pgpool_config_user_password(pool_config->recovery_user,
443 														   pool_config->recovery_password);
444 
445 	snprintf(port_str, sizeof(port_str),
446 			 "%d", backend->backend_port);
447 	conn = PQsetdbLogin(backend->backend_hostname,
448 						port_str,
449 						NULL,
450 						NULL,
451 						"template1",
452 						pool_config->recovery_user,
453 						password ? password : "");
454 
455 	if (password)
456 		pfree(password);
457 
458 	if (PQstatus(conn) != CONNECTION_OK)
459 	{
460 		PQfinish(conn);
461 		return NULL;
462 	}
463 	return conn;
464 }
465 
466 /*
467  * Wait all connections are closed.
468  */
469 int
wait_connection_closed(void)470 wait_connection_closed(void)
471 {
472 	int			i = 0;
473 
474 	do
475 	{
476 
477 		if (Req_info->conn_counter == 0)
478 			return 0;
479 
480 		if (WAIT_RETRY_COUNT != 0)
481 			sleep(3);
482 
483 	} while (i++ < WAIT_RETRY_COUNT);
484 	ereport(LOG,
485 			(errmsg("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout)));
486 	return ensure_conn_counter_validity();
487 }
488 
ensure_conn_counter_validity(void)489 int ensure_conn_counter_validity(void)
490 {
491 	/*
492 	 * recovery_timeout was expired. Before returning with failure status,
493 	 * let's check if this is caused by the malformed conn_counter. If a child
494 	 * process abnormally exits (killed by SIGKILL or SEGFAULT, for example),
495 	 * then conn_counter is not decremented at process exit, thus it will
496 	 * never be returning to 0. This could be detected by checking if
497 	 * client_idle_limit_in_recovery is enabled and less value than
498 	 * recovery_timeout because all clients must be kicked out by the time
499 	 * when client_idle_limit_in_recovery is expired. If so, we should reset
500 	 * conn_counter to 0 also.
501 	 *
502 	 * See bug 431 for more info.
503 	 */
504 	if (pool_config->client_idle_limit_in_recovery == -1 ||
505 		(pool_config->client_idle_limit_in_recovery > 0 &&
506 		 pool_config->recovery_timeout >= pool_config->client_idle_limit_in_recovery))
507 	{
508 		ereport(LOG,
509 				(errmsg("wait_connection_closed: mulformed conn_counter (%d) detected. reset it to 0",
510 						Req_info->conn_counter)));
511 		Req_info->conn_counter = 0;
512 		return 0;
513 	}
514 	return 1;
515 }
516