1
2# Copyright (c) 2021, PostgreSQL Global Development Group
3
4use strict;
5use warnings;
6
7use PostgresNode;
8use TestLib;
9
10use Fcntl qw(:seek);
11use Test::More;
12
13# This regression test demonstrates that the pg_amcheck binary correctly
14# identifies specific kinds of corruption within pages.  To test this, we need
15# a mechanism to create corrupt pages with predictable, repeatable corruption.
16# The postgres backend cannot be expected to help us with this, as its design
17# is not consistent with the goal of intentionally corrupting pages.
18#
19# Instead, we create a table to corrupt, and with careful consideration of how
20# postgresql lays out heap pages, we seek to offsets within the page and
21# overwrite deliberately chosen bytes with specific values calculated to
22# corrupt the page in expected ways.  We then verify that pg_amcheck reports
23# the corruption, and that it runs without crashing.  Note that the backend
24# cannot simply be started to run queries against the corrupt table, as the
25# backend will crash, at least for some of the corruption types we generate.
26#
27# Autovacuum potentially touching the table in the background makes the exact
28# behavior of this test harder to reason about.  We turn it off to keep things
29# simpler.  We use a "belt and suspenders" approach, turning it off for the
30# system generally in postgresql.conf, and turning it off specifically for the
31# test table.
32#
33# This test depends on the table being written to the heap file exactly as we
34# expect it to be, so we take care to arrange the columns of the table, and
35# insert rows of the table, that give predictable sizes and locations within
36# the table page.
37#
38# The HeapTupleHeaderData has 23 bytes of fixed size fields before the variable
39# length t_bits[] array.  We have exactly 3 columns in the table, so natts = 3,
40# t_bits is 1 byte long, and t_hoff = MAXALIGN(23 + 1) = 24.
41#
42# We're not too fussy about which datatypes we use for the test, but we do care
43# about some specific properties.  We'd like to test both fixed size and
44# varlena types.  We'd like some varlena data inline and some toasted.  And
45# we'd like the layout of the table such that the datums land at predictable
46# offsets within the tuple.  We choose a structure without padding on all
47# supported architectures:
48#
49# 	a BIGINT
50#	b TEXT
51#	c TEXT
52#
# We always insert a 7-character ASCII string into field 'b', which with a
# 1-byte varlena header gives an 8-byte inline value.  We always insert a long
# text string into field 'c', long enough to force toast storage.
56#
57# We choose to read and write binary copies of our table's tuples, using perl's
58# pack() and unpack() functions.  Perl uses a packing code system in which:
59#
60#	l = "signed 32-bit Long",
61#	L = "Unsigned 32-bit Long",
62#	S = "Unsigned 16-bit Short",
63#	C = "Unsigned 8-bit Octet",
64#
65# Each tuple in our table has a layout as follows:
66#
67#    xx xx xx xx            t_xmin: xxxx		offset = 0		L
68#    xx xx xx xx            t_xmax: xxxx		offset = 4		L
69#    xx xx xx xx          t_field3: xxxx		offset = 8		L
70#    xx xx                   bi_hi: xx			offset = 12		S
71#    xx xx                   bi_lo: xx			offset = 14		S
72#    xx xx                ip_posid: xx			offset = 16		S
73#    xx xx             t_infomask2: xx			offset = 18		S
74#    xx xx              t_infomask: xx			offset = 20		S
75#    xx                     t_hoff: x			offset = 22		C
76#    xx                     t_bits: x			offset = 23		C
77#    xx xx xx xx xx xx xx xx   'a': xxxxxxxx	offset = 24		LL
78#    xx xx xx xx xx xx xx xx   'b': xxxxxxxx	offset = 32		CCCCCCCC
79#    xx xx xx xx xx xx xx xx   'c': xxxxxxxx	offset = 40		CCllLL
80#    xx xx xx xx xx xx xx xx      : xxxxxxxx	 ...continued
81#    xx xx                        : xx      	 ...continued
82#
83# We could choose to read and write columns 'b' and 'c' in other ways, but
84# it is convenient enough to do it this way.  We define packing code
85# constants here, where they can be compared easily against the layout.
86
# pack()/unpack() template describing one tuple of the test table, and the
# number of bytes that template covers.
use constant {
	HEAPTUPLE_PACK_CODE   => 'LLLSSSSSCCLLCCCCCCCCCCllLL',
	HEAPTUPLE_PACK_LENGTH => 58,    # Total size
};
89
# Read a tuple of our table from a heap page.
#
# Takes an open filehandle to the heap file, and the byte offset of the tuple
# within that file.
#
# Rather than returning the binary data from the file, unpacks the data into a
# perl hash with named fields.  These fields exactly match the ones understood
# by write_tuple(), below.  In addition, the key 'b' holds the seven body bytes
# of column 'b' joined into a string.  Returns a reference to this hash.
#
# Bails out of the test run if positioning the handle or reading fails.  A
# short read is not treated as an error; fields beyond the data actually read
# come back undefined.
#
sub read_tuple
{
	my ($fh, $offset) = @_;

	# Field names, in the order their values appear in HEAPTUPLE_PACK_CODE.
	my @fields = qw(
	  t_xmin t_xmax t_field3
	  bi_hi bi_lo ip_posid t_infomask2 t_infomask
	  t_hoff t_bits
	  a_1 a_2
	  b_header b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
	  c_va_header c_va_vartag c_va_rawsize c_va_extinfo
	  c_va_valueid c_va_toastrelid);

	# sysread() bypasses perl's buffered I/O layer, so the handle has to be
	# positioned with sysseek() rather than seek(); mixing buffered and
	# unbuffered calls on one handle is documented as unreliable.
	my $buffer;
	sysseek($fh, $offset, SEEK_SET)
	  or BAIL_OUT("sysseek failed: $!");
	defined(sysread($fh, $buffer, HEAPTUPLE_PACK_LENGTH))
	  or BAIL_OUT("sysread failed: $!");

	# Assign every unpacked value to its field name in one step.
	my %tup;
	@tup{@fields} = unpack(HEAPTUPLE_PACK_CODE, $buffer);

	# Stitch together the text for column 'b'
	$tup{b} = join('', map { chr($tup{"b_body$_"}) } (1 .. 7));
	return \%tup;
}
139
# Write a tuple of our table to a heap page.
#
# Takes an open filehandle to the heap file, the byte offset of the tuple
# within that file, and a reference to a hash with the tuple values, as
# returned by read_tuple().  Writes the tuple fields from the hash into the
# heap file.  Hash keys that are not tuple fields (such as 'b') are ignored.
#
# The purpose of this function is to write a tuple back to disk with some
# subset of fields modified.  The function does no validation of the field
# values.  Use cautiously.
#
# Bails out of the test run if positioning the handle fails, if the write
# fails, or if fewer than HEAPTUPLE_PACK_LENGTH bytes were written.
#
sub write_tuple
{
	my ($fh, $offset, $tup) = @_;

	# Field names, in the order their values appear in HEAPTUPLE_PACK_CODE.
	my @fields = qw(
	  t_xmin t_xmax t_field3
	  bi_hi bi_lo ip_posid t_infomask2 t_infomask
	  t_hoff t_bits
	  a_1 a_2
	  b_header b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
	  c_va_header c_va_vartag c_va_rawsize c_va_extinfo
	  c_va_valueid c_va_toastrelid);

	my $buffer = pack(HEAPTUPLE_PACK_CODE, @{$tup}{@fields});

	# syswrite() bypasses perl's buffered I/O layer, so the handle has to be
	# positioned with sysseek() rather than seek().
	sysseek($fh, $offset, SEEK_SET)
	  or BAIL_OUT("sysseek failed: $!");

	# syswrite() reports how many bytes it actually wrote; a partial write
	# would leave a half-updated tuple on the page, so treat it as fatal.
	my $written = syswrite($fh, $buffer, HEAPTUPLE_PACK_LENGTH);
	defined($written)
	  or BAIL_OUT("syswrite failed: $!");
	$written == HEAPTUPLE_PACK_LENGTH
	  or BAIL_OUT("syswrite wrote only $written of "
		  . HEAPTUPLE_PACK_LENGTH
		  . " bytes");
	return;
}
174
# Cluster and table setup.  The order of the statements below matters: the
# freezes are arranged so that 0 < datfrozenxid < test.relfrozenxid, which the
# corruption cases later in this file rely on.

# Set umask so test directories and files are created with default permissions
# (0077 masks out all group and other permission bits).
umask(0077);

# Set up the node.  Once we create and corrupt the table,
# autovacuum workers visiting the table could crash the backend.
# Disable autovacuum so that won't happen.
my $node = get_new_node('test');
$node->init;
$node->append_conf('postgresql.conf', 'autovacuum=off');

# Start the node and load the extensions.  We depend on both
# amcheck and pageinspect for this test.
$node->start;
my $port   = $node->port;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres', "CREATE EXTENSION amcheck");
$node->safe_psql('postgres', "CREATE EXTENSION pageinspect");

# Get a non-zero datfrozenxid
$node->safe_psql('postgres', qq(VACUUM FREEZE));

# Create the test table with precisely the schema that our corruption function
# expects.  Autovacuum is additionally disabled for this table, and column 'c'
# is given EXTERNAL storage.
$node->safe_psql(
	'postgres', qq(
		CREATE TABLE public.test (a BIGINT, b TEXT, c TEXT);
		ALTER TABLE public.test SET (autovacuum_enabled=false);
		ALTER TABLE public.test ALTER COLUMN c SET STORAGE EXTERNAL;
		CREATE INDEX test_idx ON public.test(a, b);
	));

# We want (0 < datfrozenxid < test.relfrozenxid).  To achieve this, we freeze
# an otherwise unused table, public.junk, prior to inserting data and freezing
# public.test
$node->safe_psql(
	'postgres', qq(
		CREATE TABLE public.junk AS SELECT 'junk'::TEXT AS junk_column;
		ALTER TABLE public.junk SET (autovacuum_enabled=false);
		VACUUM FREEZE public.junk
	));

# Path of the table's heap file on disk: the node's data directory plus the
# relative path reported by pg_relation_filepath().
my $rel = $node->safe_psql('postgres',
	qq(SELECT pg_relation_filepath('public.test')));
my $relpath = "$pgdata/$rel";

# Insert data and freeze public.test.  ROWCOUNT is the number of rows; the
# trailing "for" statement modifier runs the whole INSERT + VACUUM FREEZE
# script once per row.
use constant ROWCOUNT => 16;
$node->safe_psql(
	'postgres', qq(
	INSERT INTO public.test (a, b, c)
		VALUES (
			x'DEADF9F9DEADF9F9'::bigint,
			'abcdefg',
			repeat('w', 10000)
		);
	VACUUM FREEZE public.test
	)) for (1 .. ROWCOUNT);

my $relfrozenxid = $node->safe_psql('postgres',
	q(select relfrozenxid from pg_class where relname = 'test'));
my $datfrozenxid = $node->safe_psql('postgres',
	q(select datfrozenxid from pg_database where datname = 'postgres'));

# Sanity check that our 'test' table has a relfrozenxid newer than the
# datfrozenxid for the database, and that the datfrozenxid is greater than the
# first normal xid.  We rely on these invariants in some of our tests.
# If they do not hold, the whole test file is skipped rather than failed.
if ($datfrozenxid <= 3 || $datfrozenxid >= $relfrozenxid)
{
	$node->clean_node;
	plan skip_all =>
	  "Xid thresholds not as expected: got datfrozenxid = $datfrozenxid, relfrozenxid = $relfrozenxid";
	exit;
}
248
# Record the byte offset (lp_off) of every tuple in block 0 of the table,
# asking pageinspect for one line pointer per query.
my @lp_off = map {
	my $lp_index = $_;
	$node->safe_psql(
		'postgres', qq(
select lp_off from heap_page_items(get_raw_page('test', 'main', 0))
	offset $lp_index limit 1))
} (0 .. ROWCOUNT - 1);
260
# Verify that the on-disk layout of 'test' is what the rest of this file
# assumes.  The node is stopped while the heap file is read directly.  Should
# any tuple not hold the expected values, the test is skipped until somebody
# updates it to work on this platform.
$node->stop;
my $file;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

my $ENDIANNESS;
for my $tupidx (0 .. ROWCOUNT - 1)
{
	my $tup = read_tuple($file, $lp_off[$tupidx]);

	# Both halves of column 'a' and the text of column 'b' must be found at
	# the offsets we expect.
	my ($a_hi, $a_lo, $b_text) = @{$tup}{qw(a_1 a_2 b)};
	my $layout_ok =
	     $a_hi == 0xDEADF9F9
	  && $a_lo == 0xDEADF9F9
	  && $b_text eq 'abcdefg';
	unless ($layout_ok)
	{
		close($file);    # ignore errors on close; we're exiting anyway
		$node->clean_node;
		plan skip_all =>
		  sprintf(
			"Page layout differs from our expectations: expected (%x, %x, \"%s\"), got (%x, %x, \"%s\")",
			0xDEADF9F9, 0xDEADF9F9, "abcdefg", $a_hi, $a_lo, $b_text);
		exit;
	}

	# The 1-byte varlena header of column 'b' reveals the platform's
	# endianness.
	if ($tup->{b_header} == 0x11)
	{
		$ENDIANNESS = "little";
	}
	else
	{
		$ENDIANNESS = "big";
	}
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;
298
# Ok, Xids and page layout look ok.  We can run corruption tests.
# NOTE(review): 19 appears to be the two command_ok checks below plus, for the
# final command_checks_all call, one exit-status check and one check per
# expected pattern -- confirm against TestLib if the corruption cases change.
plan tests => 19;

# Check that pg_amcheck runs against the uncorrupted table without error.
$node->command_ok(
	[ 'pg_amcheck', '-p', $port, 'postgres' ],
	'pg_amcheck test table, prior to corruption');

# Check that pg_amcheck runs against the uncorrupted table and index without error.
# (The command line is the same as in the check above; only the test name
# differs.)
$node->command_ok([ 'pg_amcheck', '-p', $port, 'postgres' ],
	'pg_amcheck test table and index, prior to corruption');

# Stop the node again; the heap file is modified directly on disk next.
$node->stop;
312
# Infomask bit values, copied from the #defines in access/htup_details.h, for
# use while corrupting tuples.
use constant {
	HEAP_HASNULL        => 0x0001,
	HEAP_XMAX_LOCK_ONLY => 0x0080,
	HEAP_XMIN_COMMITTED => 0x0100,
	HEAP_XMIN_INVALID   => 0x0200,
	HEAP_XMAX_COMMITTED => 0x0400,
	HEAP_XMAX_INVALID   => 0x0800,
	HEAP_NATTS_MASK     => 0x07FF,
	HEAP_XMAX_IS_MULTI  => 0x1000,
	HEAP_KEYS_UPDATED   => 0x2000,
};
323
# Build the regular expression for the prefix that we expect in front of a
# corruption message, given how precisely the location is known.
#
# Arguments are the block number, offset number and attribute number.  The
# most specific defined argument decides the form: attribute, then offset,
# then block; with none defined only the table name is matched.
sub header
{
	my ($blkno, $offnum, $attnum) = @_;

	# Location text that follows the quoted table name, if any.
	my $where = '';
	if (defined $attnum)
	{
		$where = ", block $blkno, offset $offnum, attribute $attnum";
	}
	elsif (defined $offnum)
	{
		$where = ", block $blkno, offset $offnum";
	}
	elsif (defined $blkno)
	{
		$where = ", block $blkno";
	}

	return qr/heap table "postgres\.public\.test"$where:\s+/ms;
}
339
# Corrupt the tuples, one type of corruption per tuple.  Some types of
# corruption cause verify_heapam to skip to the next tuple without
# performing any remaining checks, so we can't exercise the system properly if
# we focus all our corruption on a single tuple.
#
# For every corruption applied, the message(s) we expect pg_amcheck to report
# are appended to @expected.  Tuples whose offnum has no branch below are
# written back unmodified.
#
my @expected;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++)
{
	my $offnum = $tupidx + 1;        # offnum is 1-based, not zero-based
	my $offset = $lp_off[$tupidx];
	my $tup = read_tuple($file, $offset);

	# Message prefix for tuple-level reports; the attribute-level cases
	# below replace it with a more specific one.
	my $header = header(0, $offnum, undef);
	if ($offnum == 1)
	{
		# Corruptly set xmin < relfrozenxid
		my $xmin = $relfrozenxid - 1;
		$tup->{t_xmin} = $xmin;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		# Expected corruption report
		push @expected,
		  qr/${header}xmin $xmin precedes relation freeze threshold 0:\d+/;
	}
	elsif ($offnum == 2)
	{
		# Corruptly set xmin < datfrozenxid
		my $xmin = 3;
		$tup->{t_xmin} = $xmin;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		push @expected,
		  qr/${header}xmin $xmin precedes oldest valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 3)
	{
		# Corruptly set xmin < datfrozenxid, further back, noting circularity
		# of xid comparison.  For a new cluster with epoch = 0, the corrupt
		# xmin will be interpreted as in the future
		$tup->{t_xmin} = 4026531839;
		$tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
		$tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

		push @expected,
		  qr/${header}xmin 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 4)
	{
		# Corruptly set xmax to a value which, for a new cluster with
		# epoch = 0, will be interpreted as in the future
		$tup->{t_xmax} = 4026531839;
		$tup->{t_infomask} &= ~HEAP_XMAX_INVALID;

		push @expected,
		  qr/${header}xmax 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
	}
	elsif ($offnum == 5)
	{
		# Corrupt the tuple t_hoff, but keep it aligned properly
		$tup->{t_hoff} += 128;

		push @expected,
		  qr/${header}data begins at offset 152 beyond the tuple length 58/,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 152 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 6)
	{
		# Corrupt the tuple t_hoff, wrong alignment
		$tup->{t_hoff} += 3;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 27 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 7)
	{
		# Corrupt the tuple t_hoff, underflow but correct alignment
		$tup->{t_hoff} -= 8;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 16 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 8)
	{
		# Corrupt the tuple t_hoff, underflow and wrong alignment
		$tup->{t_hoff} -= 3;

		push @expected,
		  qr/${header}tuple data should begin at byte 24, but actually begins at byte 21 \(3 attributes, no nulls\)/;
	}
	elsif ($offnum == 9)
	{
		# Corrupt the tuple to look like it has lots of attributes, not just 3
		$tup->{t_infomask2} |= HEAP_NATTS_MASK;

		push @expected,
		  qr/${header}number of attributes 2047 exceeds maximum expected for table 3/;
	}
	elsif ($offnum == 10)
	{
		# Corrupt the tuple to look like it has lots of attributes, some of
		# them null.  This falsely creates the impression that the t_bits
		# array is longer than just one byte, but t_hoff still says otherwise.
		$tup->{t_infomask}  |= HEAP_HASNULL;
		$tup->{t_infomask2} |= HEAP_NATTS_MASK;
		$tup->{t_bits} = 0xAA;

		push @expected,
		  qr/${header}tuple data should begin at byte 280, but actually begins at byte 24 \(2047 attributes, has nulls\)/;
	}
	elsif ($offnum == 11)
	{
		# Same as above, but this time t_hoff plays along
		$tup->{t_infomask}  |= HEAP_HASNULL;
		$tup->{t_infomask2} |= (HEAP_NATTS_MASK & 0x40);
		$tup->{t_bits} = 0xAA;
		$tup->{t_hoff} = 32;

		push @expected,
		  qr/${header}number of attributes 67 exceeds maximum expected for table 3/;
	}
	elsif ($offnum == 12)
	{
		# Overwrite column 'b' 1-byte varlena header and initial characters to
		# look like a long 4-byte varlena
		#
		# On little endian machines, bytes ending in two zero bits (xxxxxx00 bytes)
		# are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
		# high six bits to 111111 and the lower two bits to 00, then the next three
		# bytes with 0xFF using 0xFCFFFFFF.
		#
		# On big endian machines, bytes starting in two zero bits (00xxxxxx bytes)
		# are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
		# low six bits to 111111 and the high two bits to 00, then the next three
		# bytes with 0xFF using 0x3FFFFFFF.
		#
		$tup->{b_header} = $ENDIANNESS eq 'little' ? 0xFC : 0x3F;
		$tup->{b_body1}  = 0xFF;
		$tup->{b_body2}  = 0xFF;
		$tup->{b_body3}  = 0xFF;

		$header = header(0, $offnum, 1);
		push @expected,
		  qr/${header}attribute with length \d+ ends at offset \d+ beyond total tuple length \d+/;
	}
	elsif ($offnum == 13)
	{
		# Corrupt the bits in column 'c' toast pointer
		$tup->{c_va_valueid} = 0xFFFFFFFF;

		$header = header(0, $offnum, 2);
		push @expected, qr/${header}toast value \d+ not found in toast table/;
	}
	elsif ($offnum == 14)
	{
		# Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
		$tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
		$tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
		$tup->{t_xmax} = 4;

		push @expected,
		  qr/${header}multitransaction ID 4 equals or exceeds next valid multitransaction ID 1/;
	}
	elsif ($offnum == 15)    # Last corrupted offnum; must not exceed ROWCOUNT
	{
		# Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
		$tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
		$tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
		$tup->{t_xmax} = 4000000000;

		push @expected,
		  qr/${header}multitransaction ID 4000000000 precedes relation minimum multitransaction ID threshold 1/;
	}
	write_tuple($file, $offset, $tup);
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;
522
# With epoch=0, run pg_amcheck against the corrupted table.  It must exit
# with status 2, and its output is compared against every expected
# corruption message.
my @amcheck_cmd =
  ('pg_amcheck', '--no-dependent-indexes', '-p', $port, 'postgres');
$node->command_checks_all(
	[@amcheck_cmd], 2,
	[@expected],    [],
	'Expected corruption message output');

# Shut the node down and remove it.
$node->teardown_node;
$node->clean_node;
531