1#
2# Tests restarts of postgres due to crashes of a subprocess.
3#
4# Two longer-running psql subprocesses are used: One to kill a
5# backend, triggering a crash-restart cycle, one to detect when
6# postmaster noticed the backend died.  The second backend is
7# necessary because it's otherwise hard to determine if postmaster is
8# still accepting new sessions (because it hasn't noticed that the
9# backend died), or because it's already restarted.
10#
11use strict;
12use warnings;
13use PostgresNode;
14use TestLib;
15use Test::More;
16use Config;
17use Time::HiRes qw(usleep);
18
19plan tests => 18;
20
21
22# To avoid hanging while expecting some specific input from a psql
23# instance being driven by us, add a timeout high enough that it
24# should never trigger even on very slow machines, unless something
25# is really wrong.
26my $psql_timeout = IPC::Run::timer(60);
27
28my $node = get_new_node('master');
29$node->init(allows_streaming => 1);
30$node->start();
31
32# by default PostgresNode doesn't doesn't restart after a crash
33$node->safe_psql(
34	'postgres',
35	q[ALTER SYSTEM SET restart_after_crash = 1;
36				   ALTER SYSTEM SET log_connections = 1;
37				   SELECT pg_reload_conf();]);
38
39# Run psql, keeping session alive, so we have an alive backend to kill.
40my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
41my $killme = IPC::Run::start(
42	[
43		'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
44		$node->connstr('postgres')
45	],
46	'<',
47	\$killme_stdin,
48	'>',
49	\$killme_stdout,
50	'2>',
51	\$killme_stderr,
52	$psql_timeout);
53
54# Need a second psql to check if crash-restart happened.
55my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
56my $monitor = IPC::Run::start(
57	[
58		'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
59		$node->connstr('postgres')
60	],
61	'<',
62	\$monitor_stdin,
63	'>',
64	\$monitor_stdout,
65	'2>',
66	\$monitor_stderr,
67	$psql_timeout);
68
69#create table, insert row that should survive
70$killme_stdin .= q[
71CREATE TABLE alive(status text);
72INSERT INTO alive VALUES($$committed-before-sigquit$$);
73SELECT pg_backend_pid();
74];
75ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
76	'acquired pid for SIGQUIT');
77my $pid = $killme_stdout;
78chomp($pid);
79$killme_stdout = '';
80$killme_stderr = '';
81
82#insert a row that should *not* survive, due to in-progress xact
83$killme_stdin .= q[
84BEGIN;
85INSERT INTO alive VALUES($$in-progress-before-sigquit$$) RETURNING status;
86];
87ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigquit/m),
88	'inserted in-progress-before-sigquit');
89$killme_stdout = '';
90$killme_stderr = '';
91
92
93# Start longrunning query in second session, it's failure will signal
94# that crash-restart has occurred.  The initial wait for the trivial
95# select is to be sure that psql successfully connected to backend.
96$monitor_stdin .= q[
97SELECT $$psql-connected$$;
98SELECT pg_sleep(3600);
99];
100ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
101	'monitor connected');
102$monitor_stdout = '';
103$monitor_stderr = '';
104
105# kill once with QUIT - we expect psql to exit, while emitting error message first
106my $ret = TestLib::system_log('pg_ctl', 'kill', 'QUIT', $pid);
107
108# Exactly process should have been alive to be killed
109is($ret, 0, "killed process with SIGQUIT");
110
111# Check that psql sees the killed backend as having been terminated
112$killme_stdin .= q[
113SELECT 1;
114];
115ok( pump_until(
116		$killme,
117		\$killme_stderr,
118		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
119	),
120	"psql query died successfully after SIGQUIT");
121$killme_stderr = '';
122$killme_stdout = '';
123$killme->finish;
124
125# Wait till server restarts - we should get the WARNING here, but
126# sometimes the server is unable to send that, if interrupted while
127# sending.
128ok( pump_until(
129		$monitor,
130		\$monitor_stderr,
131		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
132	),
133	"psql monitor died successfully after SIGQUIT");
134$monitor->finish;
135
136# Wait till server restarts
137is($node->poll_query_until('postgres', undef, ''),
138	"1", "reconnected after SIGQUIT");
139
140
141# restart psql processes, now that the crash cycle finished
142($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
143$killme->run();
144($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
145$monitor->run();
146
147
148# Acquire pid of new backend
149$killme_stdin .= q[
150SELECT pg_backend_pid();
151];
152ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
153	"acquired pid for SIGKILL");
154$pid = $killme_stdout;
155chomp($pid);
156$killme_stdout = '';
157$killme_stderr = '';
158
159# Insert test rows
160$killme_stdin .= q[
161INSERT INTO alive VALUES($$committed-before-sigkill$$) RETURNING status;
162BEGIN;
163INSERT INTO alive VALUES($$in-progress-before-sigkill$$) RETURNING status;
164];
165ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
166	'inserted in-progress-before-sigkill');
167$killme_stdout = '';
168$killme_stderr = '';
169
170# Re-start longrunning query in second session, it's failure will
171# signal that crash-restart has occurred.  The initial wait for the
172# trivial select is to be sure that psql successfully connected to
173# backend.
174$monitor_stdin .= q[
175SELECT $$psql-connected$$;
176SELECT pg_sleep(3600);
177];
178ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
179	'monitor connected');
180$monitor_stdout = '';
181$monitor_stderr = '';
182
183
184# kill with SIGKILL this time - we expect the backend to exit, without
185# being able to emit an error error message
186$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
187is($ret, 0, "killed process with KILL");
188
189# Check that psql sees the server as being terminated. No WARNING,
190# because signal handlers aren't being run on SIGKILL.
191$killme_stdin .= q[
192SELECT 1;
193];
194ok( pump_until(
195		$killme, \$killme_stderr,
196		qr/server closed the connection unexpectedly|connection to server was lost/m),
197	"psql query died successfully after SIGKILL");
198$killme->finish;
199
200# Wait till server restarts - we should get the WARNING here, but
201# sometimes the server is unable to send that, if interrupted while
202# sending.
203ok( pump_until(
204		$monitor,
205		\$monitor_stderr,
206		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
207	),
208	"psql monitor died successfully after SIGKILL");
209$monitor->finish;
210
211# Wait till server restarts
212is($node->poll_query_until('postgres', undef, ''),
213	"1", "reconnected after SIGKILL");
214
215# Make sure the committed rows survived, in-progress ones not
216is( $node->safe_psql('postgres', 'SELECT * FROM alive'),
217	"committed-before-sigquit\ncommitted-before-sigkill",
218	'data survived');
219
220is( $node->safe_psql(
221		'postgres',
222		'INSERT INTO alive VALUES($$before-orderly-restart$$) RETURNING status'
223	),
224	'before-orderly-restart',
225	'can still write after crash restart');
226
227# Just to be sure, check that an orderly restart now still works
228$node->restart();
229
230is( $node->safe_psql('postgres', 'SELECT * FROM alive'),
231	"committed-before-sigquit\ncommitted-before-sigkill\nbefore-orderly-restart",
232	'data survived');
233
234is( $node->safe_psql(
235		'postgres',
236		'INSERT INTO alive VALUES($$after-orderly-restart$$) RETURNING status'
237	),
238	'after-orderly-restart',
239	'can still write after orderly restart');
240
241$node->stop();
242
243# Pump until string is matched, or timeout occurs
244sub pump_until
245{
246	my ($proc, $stream, $untl) = @_;
247	$proc->pump_nb();
248	while (1)
249	{
250		last if $$stream =~ /$untl/;
251		if ($psql_timeout->is_expired)
252		{
253			diag("aborting wait: program timed out");
254			diag("stream contents: >>", $$stream, "<<");
255			diag("pattern searched for: ", $untl);
256
257			return 0;
258		}
259		if (not $proc->pumpable())
260		{
261			diag("aborting wait: program died");
262			diag("stream contents: >>", $$stream, "<<");
263			diag("pattern searched for: ", $untl);
264
265			return 0;
266		}
267		$proc->pump();
268	}
269	return 1;
270
271}
272