1 2# Copyright (c) 2021, PostgreSQL Global Development Group 3 4# 5# Tests restarts of postgres due to crashes of a subprocess. 6# 7# Two longer-running psql subprocesses are used: One to kill a 8# backend, triggering a crash-restart cycle, one to detect when 9# postmaster noticed the backend died. The second backend is 10# necessary because it's otherwise hard to determine if postmaster is 11# still accepting new sessions (because it hasn't noticed that the 12# backend died), or because it's already restarted. 13# 14use strict; 15use warnings; 16use PostgresNode; 17use TestLib; 18use Test::More; 19use Config; 20 21plan tests => 18; 22 23 24# To avoid hanging while expecting some specific input from a psql 25# instance being driven by us, add a timeout high enough that it 26# should never trigger even on very slow machines, unless something 27# is really wrong. 28my $psql_timeout = IPC::Run::timer(60); 29 30my $node = get_new_node('primary'); 31$node->init(allows_streaming => 1); 32$node->start(); 33 34# by default PostgresNode doesn't doesn't restart after a crash 35$node->safe_psql( 36 'postgres', 37 q[ALTER SYSTEM SET restart_after_crash = 1; 38 ALTER SYSTEM SET log_connections = 1; 39 SELECT pg_reload_conf();]); 40 41# Run psql, keeping session alive, so we have an alive backend to kill. 42my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); 43my $killme = IPC::Run::start( 44 [ 45 'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', 46 $node->connstr('postgres') 47 ], 48 '<', 49 \$killme_stdin, 50 '>', 51 \$killme_stdout, 52 '2>', 53 \$killme_stderr, 54 $psql_timeout); 55 56# Need a second psql to check if crash-restart happened. 57my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', ''); 58my $monitor = IPC::Run::start( 59 [ 60 'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', 61 $node->connstr('postgres') 62 ], 63 '<', 64 \$monitor_stdin, 65 '>', 66 \$monitor_stdout, 67 '2>', 68 \$monitor_stderr, 69 $psql_timeout); 70 71#create table, insert row that should survive 72$killme_stdin .= q[ 73CREATE TABLE alive(status text); 74INSERT INTO alive VALUES($$committed-before-sigquit$$); 75SELECT pg_backend_pid(); 76]; 77ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m), 78 'acquired pid for SIGQUIT'); 79my $pid = $killme_stdout; 80chomp($pid); 81$killme_stdout = ''; 82$killme_stderr = ''; 83 84#insert a row that should *not* survive, due to in-progress xact 85$killme_stdin .= q[ 86BEGIN; 87INSERT INTO alive VALUES($$in-progress-before-sigquit$$) RETURNING status; 88]; 89ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigquit/m), 90 'inserted in-progress-before-sigquit'); 91$killme_stdout = ''; 92$killme_stderr = ''; 93 94 95# Start longrunning query in second session; its failure will signal that 96# crash-restart has occurred. The initial wait for the trivial select is to 97# be sure that psql successfully connected to backend. 98$monitor_stdin .= q[ 99SELECT $$psql-connected$$; 100SELECT pg_sleep(3600); 101]; 102ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m), 103 'monitor connected'); 104$monitor_stdout = ''; 105$monitor_stderr = ''; 106 107# kill once with QUIT - we expect psql to exit, while emitting error message first 108my $ret = TestLib::system_log('pg_ctl', 'kill', 'QUIT', $pid); 109 110# Exactly process should have been alive to be killed 111is($ret, 0, "killed process with SIGQUIT"); 112 113# Check that psql sees the killed backend as having been terminated 114$killme_stdin .= q[ 115SELECT 1; 116]; 117ok( pump_until( 118 $killme, 119 \$killme_stderr, 120 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 121 ), 122 "psql query died successfully after SIGQUIT"); 123$killme_stderr = ''; 124$killme_stdout = ''; 125$killme->finish; 126 127# Wait till server restarts - we should get the WARNING here, but 128# sometimes the server is unable to send that, if interrupted while 129# sending. 130ok( pump_until( 131 $monitor, 132 \$monitor_stderr, 133 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 134 ), 135 "psql monitor died successfully after SIGQUIT"); 136$monitor->finish; 137 138# Wait till server restarts 139is($node->poll_query_until('postgres', undef, ''), 140 "1", "reconnected after SIGQUIT"); 141 142 143# restart psql processes, now that the crash cycle finished 144($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); 145$killme->run(); 146($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', ''); 147$monitor->run(); 148 149 150# Acquire pid of new backend 151$killme_stdin .= q[ 152SELECT pg_backend_pid(); 153]; 154ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m), 155 "acquired pid for SIGKILL"); 156$pid = $killme_stdout; 157chomp($pid); 158$killme_stdout = ''; 159$killme_stderr = ''; 160 161# Insert test rows 162$killme_stdin .= q[ 163INSERT INTO alive VALUES($$committed-before-sigkill$$) RETURNING status; 164BEGIN; 165INSERT INTO alive VALUES($$in-progress-before-sigkill$$) RETURNING status; 166]; 167ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m), 168 'inserted in-progress-before-sigkill'); 169$killme_stdout = ''; 170$killme_stderr = ''; 171 172# Re-start longrunning query in second session; its failure will signal that 173# crash-restart has occurred. The initial wait for the trivial select is to 174# be sure that psql successfully connected to backend. 175$monitor_stdin .= q[ 176SELECT $$psql-connected$$; 177SELECT pg_sleep(3600); 178]; 179ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m), 180 'monitor connected'); 181$monitor_stdout = ''; 182$monitor_stderr = ''; 183 184 185# kill with SIGKILL this time - we expect the backend to exit, without 186# being able to emit an error message 187$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid); 188is($ret, 0, "killed process with KILL"); 189 190# Check that psql sees the server as being terminated. No WARNING, 191# because signal handlers aren't being run on SIGKILL. 192$killme_stdin .= q[ 193SELECT 1; 194]; 195ok( pump_until( 196 $killme, 197 \$killme_stderr, 198 qr/server closed the connection unexpectedly|connection to server was lost/m 199 ), 200 "psql query died successfully after SIGKILL"); 201$killme->finish; 202 203# Wait till server restarts - we should get the WARNING here, but 204# sometimes the server is unable to send that, if interrupted while 205# sending. 206ok( pump_until( 207 $monitor, 208 \$monitor_stderr, 209 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 210 ), 211 "psql monitor died successfully after SIGKILL"); 212$monitor->finish; 213 214# Wait till server restarts 215is($node->poll_query_until('postgres', undef, ''), 216 "1", "reconnected after SIGKILL"); 217 218# Make sure the committed rows survived, in-progress ones not 219is( $node->safe_psql('postgres', 'SELECT * FROM alive'), 220 "committed-before-sigquit\ncommitted-before-sigkill", 221 'data survived'); 222 223is( $node->safe_psql( 224 'postgres', 225 'INSERT INTO alive VALUES($$before-orderly-restart$$) RETURNING status' 226 ), 227 'before-orderly-restart', 228 'can still write after crash restart'); 229 230# Just to be sure, check that an orderly restart now still works 231$node->restart(); 232 233is( $node->safe_psql('postgres', 'SELECT * FROM alive'), 234 "committed-before-sigquit\ncommitted-before-sigkill\nbefore-orderly-restart", 235 'data survived'); 236 237is( $node->safe_psql( 238 'postgres', 239 'INSERT INTO alive VALUES($$after-orderly-restart$$) RETURNING status' 240 ), 241 'after-orderly-restart', 242 'can still write after orderly restart'); 243 244$node->stop(); 245 246# Pump until string is matched, or timeout occurs 247sub pump_until 248{ 249 my ($proc, $stream, $untl) = @_; 250 $proc->pump_nb(); 251 while (1) 252 { 253 last if $$stream =~ /$untl/; 254 if ($psql_timeout->is_expired) 255 { 256 diag("aborting wait: program timed out"); 257 diag("stream contents: >>", $$stream, "<<"); 258 diag("pattern searched for: ", $untl); 259 260 return 0; 261 } 262 if (not $proc->pumpable()) 263 { 264 diag("aborting wait: program died"); 265 diag("stream contents: >>", $$stream, "<<"); 266 diag("pattern searched for: ", $untl); 267 268 return 0; 269 } 270 $proc->pump(); 271 } 272 return 1; 273 274} 275