1
2# Copyright (c) 2021, PostgreSQL Global Development Group
3
4#
5# Tests restarts of postgres due to crashes of a subprocess.
6#
7# Two longer-running psql subprocesses are used: One to kill a
8# backend, triggering a crash-restart cycle, one to detect when
9# postmaster noticed the backend died.  The second backend is
10# necessary because it's otherwise hard to determine if postmaster is
11# still accepting new sessions (because it hasn't noticed that the
12# backend died), or because it's already restarted.
13#
14use strict;
15use warnings;
16use PostgresNode;
17use TestLib;
18use Test::More;
19use Config;
20
21plan tests => 18;
22
23
24# To avoid hanging while expecting some specific input from a psql
25# instance being driven by us, add a timeout high enough that it
26# should never trigger even on very slow machines, unless something
27# is really wrong.
28my $psql_timeout = IPC::Run::timer(60);
29
30my $node = get_new_node('primary');
31$node->init(allows_streaming => 1);
32$node->start();
33
34# by default PostgresNode doesn't doesn't restart after a crash
35$node->safe_psql(
36	'postgres',
37	q[ALTER SYSTEM SET restart_after_crash = 1;
38				   ALTER SYSTEM SET log_connections = 1;
39				   SELECT pg_reload_conf();]);
40
41# Run psql, keeping session alive, so we have an alive backend to kill.
42my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
43my $killme = IPC::Run::start(
44	[
45		'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
46		$node->connstr('postgres')
47	],
48	'<',
49	\$killme_stdin,
50	'>',
51	\$killme_stdout,
52	'2>',
53	\$killme_stderr,
54	$psql_timeout);
55
56# Need a second psql to check if crash-restart happened.
57my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
58my $monitor = IPC::Run::start(
59	[
60		'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
61		$node->connstr('postgres')
62	],
63	'<',
64	\$monitor_stdin,
65	'>',
66	\$monitor_stdout,
67	'2>',
68	\$monitor_stderr,
69	$psql_timeout);
70
71#create table, insert row that should survive
72$killme_stdin .= q[
73CREATE TABLE alive(status text);
74INSERT INTO alive VALUES($$committed-before-sigquit$$);
75SELECT pg_backend_pid();
76];
77ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
78	'acquired pid for SIGQUIT');
79my $pid = $killme_stdout;
80chomp($pid);
81$killme_stdout = '';
82$killme_stderr = '';
83
84#insert a row that should *not* survive, due to in-progress xact
85$killme_stdin .= q[
86BEGIN;
87INSERT INTO alive VALUES($$in-progress-before-sigquit$$) RETURNING status;
88];
89ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigquit/m),
90	'inserted in-progress-before-sigquit');
91$killme_stdout = '';
92$killme_stderr = '';
93
94
95# Start longrunning query in second session; its failure will signal that
96# crash-restart has occurred.  The initial wait for the trivial select is to
97# be sure that psql successfully connected to backend.
98$monitor_stdin .= q[
99SELECT $$psql-connected$$;
100SELECT pg_sleep(3600);
101];
102ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
103	'monitor connected');
104$monitor_stdout = '';
105$monitor_stderr = '';
106
107# kill once with QUIT - we expect psql to exit, while emitting error message first
108my $ret = TestLib::system_log('pg_ctl', 'kill', 'QUIT', $pid);
109
110# Exactly process should have been alive to be killed
111is($ret, 0, "killed process with SIGQUIT");
112
113# Check that psql sees the killed backend as having been terminated
114$killme_stdin .= q[
115SELECT 1;
116];
117ok( pump_until(
118		$killme,
119		\$killme_stderr,
120		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
121	),
122	"psql query died successfully after SIGQUIT");
123$killme_stderr = '';
124$killme_stdout = '';
125$killme->finish;
126
127# Wait till server restarts - we should get the WARNING here, but
128# sometimes the server is unable to send that, if interrupted while
129# sending.
130ok( pump_until(
131		$monitor,
132		\$monitor_stderr,
133		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
134	),
135	"psql monitor died successfully after SIGQUIT");
136$monitor->finish;
137
138# Wait till server restarts
139is($node->poll_query_until('postgres', undef, ''),
140	"1", "reconnected after SIGQUIT");
141
142
143# restart psql processes, now that the crash cycle finished
144($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
145$killme->run();
146($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
147$monitor->run();
148
149
150# Acquire pid of new backend
151$killme_stdin .= q[
152SELECT pg_backend_pid();
153];
154ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m),
155	"acquired pid for SIGKILL");
156$pid = $killme_stdout;
157chomp($pid);
158$killme_stdout = '';
159$killme_stderr = '';
160
161# Insert test rows
162$killme_stdin .= q[
163INSERT INTO alive VALUES($$committed-before-sigkill$$) RETURNING status;
164BEGIN;
165INSERT INTO alive VALUES($$in-progress-before-sigkill$$) RETURNING status;
166];
167ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m),
168	'inserted in-progress-before-sigkill');
169$killme_stdout = '';
170$killme_stderr = '';
171
172# Re-start longrunning query in second session; its failure will signal that
173# crash-restart has occurred.  The initial wait for the trivial select is to
174# be sure that psql successfully connected to backend.
175$monitor_stdin .= q[
176SELECT $$psql-connected$$;
177SELECT pg_sleep(3600);
178];
179ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m),
180	'monitor connected');
181$monitor_stdout = '';
182$monitor_stderr = '';
183
184
185# kill with SIGKILL this time - we expect the backend to exit, without
186# being able to emit an error message
187$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid);
188is($ret, 0, "killed process with KILL");
189
190# Check that psql sees the server as being terminated. No WARNING,
191# because signal handlers aren't being run on SIGKILL.
192$killme_stdin .= q[
193SELECT 1;
194];
195ok( pump_until(
196		$killme,
197		\$killme_stderr,
198		qr/server closed the connection unexpectedly|connection to server was lost/m
199	),
200	"psql query died successfully after SIGKILL");
201$killme->finish;
202
203# Wait till server restarts - we should get the WARNING here, but
204# sometimes the server is unable to send that, if interrupted while
205# sending.
206ok( pump_until(
207		$monitor,
208		\$monitor_stderr,
209		qr/WARNING:  terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m
210	),
211	"psql monitor died successfully after SIGKILL");
212$monitor->finish;
213
214# Wait till server restarts
215is($node->poll_query_until('postgres', undef, ''),
216	"1", "reconnected after SIGKILL");
217
218# Make sure the committed rows survived, in-progress ones not
219is( $node->safe_psql('postgres', 'SELECT * FROM alive'),
220	"committed-before-sigquit\ncommitted-before-sigkill",
221	'data survived');
222
223is( $node->safe_psql(
224		'postgres',
225		'INSERT INTO alive VALUES($$before-orderly-restart$$) RETURNING status'
226	),
227	'before-orderly-restart',
228	'can still write after crash restart');
229
230# Just to be sure, check that an orderly restart now still works
231$node->restart();
232
233is( $node->safe_psql('postgres', 'SELECT * FROM alive'),
234	"committed-before-sigquit\ncommitted-before-sigkill\nbefore-orderly-restart",
235	'data survived');
236
237is( $node->safe_psql(
238		'postgres',
239		'INSERT INTO alive VALUES($$after-orderly-restart$$) RETURNING status'
240	),
241	'after-orderly-restart',
242	'can still write after orderly restart');
243
244$node->stop();
245
246# Pump until string is matched, or timeout occurs
247sub pump_until
248{
249	my ($proc, $stream, $untl) = @_;
250	$proc->pump_nb();
251	while (1)
252	{
253		last if $$stream =~ /$untl/;
254		if ($psql_timeout->is_expired)
255		{
256			diag("aborting wait: program timed out");
257			diag("stream contents: >>", $$stream, "<<");
258			diag("pattern searched for: ", $untl);
259
260			return 0;
261		}
262		if (not $proc->pumpable())
263		{
264			diag("aborting wait: program died");
265			diag("stream contents: >>", $$stream, "<<");
266			diag("pattern searched for: ", $untl);
267
268			return 0;
269		}
270		$proc->pump();
271	}
272	return 1;
273
274}
275