1# 2# Tests restarts of postgres due to crashes of a subprocess. 3# 4# Two longer-running psql subprocesses are used: One to kill a 5# backend, triggering a crash-restart cycle, one to detect when 6# postmaster noticed the backend died. The second backend is 7# necessary because it's otherwise hard to determine if postmaster is 8# still accepting new sessions (because it hasn't noticed that the 9# backend died), or because it's already restarted. 10# 11use strict; 12use warnings; 13use PostgresNode; 14use TestLib; 15use Test::More; 16use Config; 17use Time::HiRes qw(usleep); 18 19plan tests => 18; 20 21 22# To avoid hanging while expecting some specific input from a psql 23# instance being driven by us, add a timeout high enough that it 24# should never trigger even on very slow machines, unless something 25# is really wrong. 26my $psql_timeout = IPC::Run::timer(60); 27 28my $node = get_new_node('master'); 29$node->init(allows_streaming => 1); 30$node->start(); 31 32# by default PostgresNode doesn't doesn't restart after a crash 33$node->safe_psql( 34 'postgres', 35 q[ALTER SYSTEM SET restart_after_crash = 1; 36 ALTER SYSTEM SET log_connections = 1; 37 SELECT pg_reload_conf();]); 38 39# Run psql, keeping session alive, so we have an alive backend to kill. 40my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); 41my $killme = IPC::Run::start( 42 [ 43 'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', 44 $node->connstr('postgres') 45 ], 46 '<', 47 \$killme_stdin, 48 '>', 49 \$killme_stdout, 50 '2>', 51 \$killme_stderr, 52 $psql_timeout); 53 54# Need a second psql to check if crash-restart happened. 55my ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', ''); 56my $monitor = IPC::Run::start( 57 [ 58 'psql', '-X', '-qAt', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', 59 $node->connstr('postgres') 60 ], 61 '<', 62 \$monitor_stdin, 63 '>', 64 \$monitor_stdout, 65 '2>', 66 \$monitor_stderr, 67 $psql_timeout); 68 69#create table, insert row that should survive 70$killme_stdin .= q[ 71CREATE TABLE alive(status text); 72INSERT INTO alive VALUES($$committed-before-sigquit$$); 73SELECT pg_backend_pid(); 74]; 75ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m), 76 'acquired pid for SIGQUIT'); 77my $pid = $killme_stdout; 78chomp($pid); 79$killme_stdout = ''; 80$killme_stderr = ''; 81 82#insert a row that should *not* survive, due to in-progress xact 83$killme_stdin .= q[ 84BEGIN; 85INSERT INTO alive VALUES($$in-progress-before-sigquit$$) RETURNING status; 86]; 87ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigquit/m), 88 'inserted in-progress-before-sigquit'); 89$killme_stdout = ''; 90$killme_stderr = ''; 91 92 93# Start longrunning query in second session, it's failure will signal 94# that crash-restart has occurred. The initial wait for the trivial 95# select is to be sure that psql successfully connected to backend. 96$monitor_stdin .= q[ 97SELECT $$psql-connected$$; 98SELECT pg_sleep(3600); 99]; 100ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m), 101 'monitor connected'); 102$monitor_stdout = ''; 103$monitor_stderr = ''; 104 105# kill once with QUIT - we expect psql to exit, while emitting error message first 106my $ret = TestLib::system_log('pg_ctl', 'kill', 'QUIT', $pid); 107 108# Exactly process should have been alive to be killed 109is($ret, 0, "killed process with SIGQUIT"); 110 111# Check that psql sees the killed backend as having been terminated 112$killme_stdin .= q[ 113SELECT 1; 114]; 115ok( pump_until( 116 $killme, 117 \$killme_stderr, 118 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 119 ), 120 "psql query died successfully after SIGQUIT"); 121$killme_stderr = ''; 122$killme_stdout = ''; 123$killme->finish; 124 125# Wait till server restarts - we should get the WARNING here, but 126# sometimes the server is unable to send that, if interrupted while 127# sending. 128ok( pump_until( 129 $monitor, 130 \$monitor_stderr, 131 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 132 ), 133 "psql monitor died successfully after SIGQUIT"); 134$monitor->finish; 135 136# Wait till server restarts 137is($node->poll_query_until('postgres', undef, ''), 138 "1", "reconnected after SIGQUIT"); 139 140 141# restart psql processes, now that the crash cycle finished 142($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', ''); 143$killme->run(); 144($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', ''); 145$monitor->run(); 146 147 148# Acquire pid of new backend 149$killme_stdin .= q[ 150SELECT pg_backend_pid(); 151]; 152ok(pump_until($killme, \$killme_stdout, qr/[[:digit:]]+[\r\n]$/m), 153 "acquired pid for SIGKILL"); 154$pid = $killme_stdout; 155chomp($pid); 156$killme_stdout = ''; 157$killme_stderr = ''; 158 159# Insert test rows 160$killme_stdin .= q[ 161INSERT INTO alive VALUES($$committed-before-sigkill$$) RETURNING status; 162BEGIN; 163INSERT INTO alive VALUES($$in-progress-before-sigkill$$) RETURNING status; 164]; 165ok(pump_until($killme, \$killme_stdout, qr/in-progress-before-sigkill/m), 166 'inserted in-progress-before-sigkill'); 167$killme_stdout = ''; 168$killme_stderr = ''; 169 170# Re-start longrunning query in second session, it's failure will 171# signal that crash-restart has occurred. The initial wait for the 172# trivial select is to be sure that psql successfully connected to 173# backend. 174$monitor_stdin .= q[ 175SELECT $$psql-connected$$; 176SELECT pg_sleep(3600); 177]; 178ok(pump_until($monitor, \$monitor_stdout, qr/psql-connected/m), 179 'monitor connected'); 180$monitor_stdout = ''; 181$monitor_stderr = ''; 182 183 184# kill with SIGKILL this time - we expect the backend to exit, without 185# being able to emit an error error message 186$ret = TestLib::system_log('pg_ctl', 'kill', 'KILL', $pid); 187is($ret, 0, "killed process with KILL"); 188 189# Check that psql sees the server as being terminated. No WARNING, 190# because signal handlers aren't being run on SIGKILL. 191$killme_stdin .= q[ 192SELECT 1; 193]; 194ok( pump_until( 195 $killme, \$killme_stderr, 196 qr/server closed the connection unexpectedly|connection to server was lost/m), 197 "psql query died successfully after SIGKILL"); 198$killme->finish; 199 200# Wait till server restarts - we should get the WARNING here, but 201# sometimes the server is unable to send that, if interrupted while 202# sending. 203ok( pump_until( 204 $monitor, 205 \$monitor_stderr, 206 qr/WARNING: terminating connection because of crash of another server process|server closed the connection unexpectedly|connection to server was lost/m 207 ), 208 "psql monitor died successfully after SIGKILL"); 209$monitor->finish; 210 211# Wait till server restarts 212is($node->poll_query_until('postgres', undef, ''), 213 "1", "reconnected after SIGKILL"); 214 215# Make sure the committed rows survived, in-progress ones not 216is( $node->safe_psql('postgres', 'SELECT * FROM alive'), 217 "committed-before-sigquit\ncommitted-before-sigkill", 218 'data survived'); 219 220is( $node->safe_psql( 221 'postgres', 222 'INSERT INTO alive VALUES($$before-orderly-restart$$) RETURNING status' 223 ), 224 'before-orderly-restart', 225 'can still write after crash restart'); 226 227# Just to be sure, check that an orderly restart now still works 228$node->restart(); 229 230is( $node->safe_psql('postgres', 'SELECT * FROM alive'), 231 "committed-before-sigquit\ncommitted-before-sigkill\nbefore-orderly-restart", 232 'data survived'); 233 234is( $node->safe_psql( 235 'postgres', 236 'INSERT INTO alive VALUES($$after-orderly-restart$$) RETURNING status' 237 ), 238 'after-orderly-restart', 239 'can still write after orderly restart'); 240 241$node->stop(); 242 243# Pump until string is matched, or timeout occurs 244sub pump_until 245{ 246 my ($proc, $stream, $untl) = @_; 247 $proc->pump_nb(); 248 while (1) 249 { 250 last if $$stream =~ /$untl/; 251 if ($psql_timeout->is_expired) 252 { 253 diag("aborting wait: program timed out"); 254 diag("stream contents: >>", $$stream, "<<"); 255 diag("pattern searched for: ", $untl); 256 257 return 0; 258 } 259 if (not $proc->pumpable()) 260 { 261 diag("aborting wait: program died"); 262 diag("stream contents: >>", $$stream, "<<"); 263 diag("pattern searched for: ", $untl); 264 265 return 0; 266 } 267 $proc->pump(); 268 } 269 return 1; 270 271} 272