
# Copyright (c) 2021, PostgreSQL Global Development Group

use strict;
use warnings;

use PostgresNode;
use TestLib;

use Fcntl qw(:seek);
use Test::More;

# This regression test demonstrates that the pg_amcheck binary correctly
# identifies specific kinds of corruption within pages.  To test this, we need
# a mechanism to create corrupt pages with predictable, repeatable corruption.
# The postgres backend cannot be expected to help us with this, as its design
# is not consistent with the goal of intentionally corrupting pages.
#
# Instead, we create a table to corrupt, and with careful consideration of how
# postgresql lays out heap pages, we seek to offsets within the page and
# overwrite deliberately chosen bytes with specific values calculated to
# corrupt the page in expected ways.  We then verify that pg_amcheck reports
# the corruption, and that it runs without crashing.  Note that the backend
# cannot simply be started to run queries against the corrupt table, as the
# backend will crash, at least for some of the corruption types we generate.
#
# Autovacuum potentially touching the table in the background makes the exact
# behavior of this test harder to reason about.  We turn it off to keep things
# simpler.  We use a "belt and suspenders" approach, turning it off for the
# system generally in postgresql.conf, and turning it off specifically for the
# test table.
#
# This test depends on the table being written to the heap file exactly as we
# expect it to be, so we take care to arrange the columns of the table, and
# insert rows of the table, that give predictable sizes and locations within
# the table page.
#
# The HeapTupleHeaderData has 23 bytes of fixed size fields before the variable
# length t_bits[] array.  We have exactly 3 columns in the table, so natts = 3,
# t_bits is 1 byte long, and t_hoff = MAXALIGN(23 + 1) = 24.
#
# We're not too fussy about which datatypes we use for the test, but we do care
# about some specific properties.  We'd like to test both fixed size and
# varlena types.  We'd like some varlena data inline and some toasted.  And
# we'd like the layout of the table such that the datums land at predictable
# offsets within the tuple.  We choose a structure without padding on all
# supported architectures:
#
#     a BIGINT
#     b TEXT
#     c TEXT
#
# We always insert a 7-ascii character string into field 'b', which with a
# 1-byte varlena header gives an 8 byte inline value.  We always insert a long
# text string in field 'c', long enough to force toast storage.
#
# We choose to read and write binary copies of our table's tuples, using perl's
# pack() and unpack() functions.  Perl uses a packing code system in which:
#
#     l = "signed 32-bit Long",
#     L = "Unsigned 32-bit Long",
#     S = "Unsigned 16-bit Short",
#     C = "Unsigned 8-bit Octet",
#
# Each tuple in our table has a layout as follows:
#
#     xx xx xx xx              t_xmin: xxxx        offset = 0    L
#     xx xx xx xx              t_xmax: xxxx        offset = 4    L
#     xx xx xx xx              t_field3: xxxx      offset = 8    L
#     xx xx                    bi_hi: xx           offset = 12   S
#     xx xx                    bi_lo: xx           offset = 14   S
#     xx xx                    ip_posid: xx        offset = 16   S
#     xx xx                    t_infomask2: xx     offset = 18   S
#     xx xx                    t_infomask: xx      offset = 20   S
#     xx                       t_hoff: x           offset = 22   C
#     xx                       t_bits: x           offset = 23   C
#     xx xx xx xx xx xx xx xx  'a': xxxxxxxx       offset = 24   LL
#     xx xx xx xx xx xx xx xx  'b': xxxxxxxx       offset = 32   CCCCCCCC
#     xx xx xx xx xx xx xx xx  'c': xxxxxxxx       offset = 40   CCllLL
#     xx xx xx xx xx xx xx xx     : xxxxxxxx       ...continued
#     xx xx                       : xx             ...continued
#
# We could choose to read and write columns 'b' and 'c' in other ways, but
# it is convenient enough to do it this way.  We define packing code
# constants here, where they can be compared easily against the layout.

use constant HEAPTUPLE_PACK_CODE   => 'LLLSSSSSCCLLCCCCCCCCCCllLL';
use constant HEAPTUPLE_PACK_LENGTH => 58;    # Total size

# Read a tuple of our table from a heap page.
#
# Takes an open filehandle to the heap file, and the byte offset of the tuple
# within that file.
#
# Rather than returning the binary data from the file, unpacks the data into a
# perl hash with named fields.  These fields exactly match the ones understood
# by write_tuple(), below.  In addition, the key 'b' holds the seven body bytes
# of column 'b' joined into a string.  Returns a reference to this hash.
#
# Bails out of the whole test run if positioning or reading fails, or if fewer
# than HEAPTUPLE_PACK_LENGTH bytes could be read.
#
sub read_tuple
{
    my ($fh, $offset) = @_;

    # Field names, in the order in which HEAPTUPLE_PACK_CODE lays them out.
    my @fields = qw(
      t_xmin t_xmax t_field3
      bi_hi bi_lo ip_posid
      t_infomask2 t_infomask
      t_hoff t_bits
      a_1 a_2
      b_header
      b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
      c_va_header c_va_vartag
      c_va_rawsize c_va_extinfo
      c_va_valueid c_va_toastrelid);

    # sysread() bypasses perl's buffered I/O layer, so the handle must be
    # positioned with sysseek() rather than the buffered seek().  sysseek()
    # returns "0 but true" for position zero, so a plain truth test is safe.
    sysseek($fh, $offset, SEEK_SET)
      or BAIL_OUT("sysseek failed: $!");

    my $buffer;
    my $nread = sysread($fh, $buffer, HEAPTUPLE_PACK_LENGTH);
    defined($nread)
      or BAIL_OUT("sysread failed: $!");

    # A short read would leave trailing fields undefined; refuse to go on.
    $nread == HEAPTUPLE_PACK_LENGTH
      or BAIL_OUT("sysread returned $nread bytes, expected "
          . HEAPTUPLE_PACK_LENGTH);

    my %tup;
    @tup{@fields} = unpack(HEAPTUPLE_PACK_CODE, $buffer);

    # Stitch together the text for column 'b'
    $tup{b} = join('', map { chr($tup{"b_body$_"}) } (1 .. 7));
    return \%tup;
}

# Write a tuple of our table to a heap page.
#
# Takes an open filehandle to the heap file, the offset of the tuple, and a
# reference to a hash with the tuple values, as returned by read_tuple().
# Writes the tuple fields from the hash into the heap file.
#
# The purpose of this function is to write a tuple back to disk with some
# subset of fields modified.  The function does no validation of the field
# values.  Use cautiously.
#
# The fields are packed according to HEAPTUPLE_PACK_CODE and written with
# sysseek()/syswrite().  Bails out of the whole test run if positioning or
# writing fails, or if fewer than HEAPTUPLE_PACK_LENGTH bytes were written.
#
sub write_tuple
{
    my ($fh, $offset, $tup) = @_;

    # Field names, in the order in which HEAPTUPLE_PACK_CODE lays them out.
    my @fields = qw(
      t_xmin t_xmax t_field3
      bi_hi bi_lo ip_posid
      t_infomask2 t_infomask
      t_hoff t_bits
      a_1 a_2
      b_header
      b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
      c_va_header c_va_vartag
      c_va_rawsize c_va_extinfo
      c_va_valueid c_va_toastrelid);

    my $buffer = pack(HEAPTUPLE_PACK_CODE, @{$tup}{@fields});

    # syswrite() bypasses perl's buffered I/O layer, so the handle must be
    # positioned with sysseek() rather than the buffered seek().  sysseek()
    # returns "0 but true" for position zero, so a plain truth test is safe.
    sysseek($fh, $offset, SEEK_SET)
      or BAIL_OUT("sysseek failed: $!");

    my $nwritten = syswrite($fh, $buffer, HEAPTUPLE_PACK_LENGTH);
    defined($nwritten)
      or BAIL_OUT("syswrite failed: $!");

    # A short write would leave a half-updated tuple on disk.
    $nwritten == HEAPTUPLE_PACK_LENGTH
      or BAIL_OUT("syswrite wrote $nwritten bytes, expected "
          . HEAPTUPLE_PACK_LENGTH);
    return;
}

# Set umask so test directories and files are created with default permissions
umask(0077);

# Set up the node.  Once we create and corrupt the table,
# autovacuum workers visiting the table could crash the backend.
# Disable autovacuum so that won't happen.
my $node = get_new_node('test');
$node->init;
$node->append_conf('postgresql.conf', 'autovacuum=off');

# Start the node and load the extensions.  We depend on both
# amcheck and pageinspect for this test.
$node->start;
my $port   = $node->port;
my $pgdata = $node->data_dir;
$node->safe_psql('postgres', "CREATE EXTENSION amcheck");
$node->safe_psql('postgres', "CREATE EXTENSION pageinspect");

# Get a non-zero datfrozenxid
$node->safe_psql('postgres', qq(VACUUM FREEZE));

# Create the test table with precisely the schema that our corruption function
# expects.
$node->safe_psql(
    'postgres', qq(
        CREATE TABLE public.test (a BIGINT, b TEXT, c TEXT);
        ALTER TABLE public.test SET (autovacuum_enabled=false);
        ALTER TABLE public.test ALTER COLUMN c SET STORAGE EXTERNAL;
        CREATE INDEX test_idx ON public.test(a, b);
    ));

# We want (0 < datfrozenxid < test.relfrozenxid).  To achieve this, we freeze
# an otherwise unused table, public.junk, prior to inserting data and freezing
# public.test
$node->safe_psql(
    'postgres', qq(
        CREATE TABLE public.junk AS SELECT 'junk'::TEXT AS junk_column;
        ALTER TABLE public.junk SET (autovacuum_enabled=false);
        VACUUM FREEZE public.junk
    ));

# Path of the heap file backing public.test, which we later open directly.
my $rel = $node->safe_psql('postgres',
    qq(SELECT pg_relation_filepath('public.test')));
my $relpath = "$pgdata/$rel";

# Insert data and freeze public.test.  One row is inserted (and the table
# frozen) per iteration, ROWCOUNT times in total.
use constant ROWCOUNT => 16;
$node->safe_psql(
    'postgres', qq(
    INSERT INTO public.test (a, b, c)
        VALUES (
            x'DEADF9F9DEADF9F9'::bigint,
            'abcdefg',
            repeat('w', 10000)
        );
    VACUUM FREEZE public.test
    )) for (1 .. ROWCOUNT);

my $relfrozenxid = $node->safe_psql('postgres',
    q(select relfrozenxid from pg_class where relname = 'test'));
my $datfrozenxid = $node->safe_psql('postgres',
    q(select datfrozenxid from pg_database where datname = 'postgres'));

# Sanity check that our 'test' table has a relfrozenxid newer than the
# datfrozenxid for the database, and that the datfrozenxid is greater than the
# first normal xid.  We rely on these invariants in some of our tests.
if ($datfrozenxid <= 3 || $datfrozenxid >= $relfrozenxid)
{
    $node->clean_node;
    plan skip_all =>
      "Xid thresholds not as expected: got datfrozenxid = $datfrozenxid, relfrozenxid = $relfrozenxid";
    exit;
}

# Find where each of the tuples is located on the page.  $lp_off[$i] is the
# lp_off value reported by pageinspect for the i'th (zero-based) line pointer
# of block 0.
my @lp_off;
for my $tupidx (0 .. ROWCOUNT - 1)
{
    push(
        @lp_off,
        $node->safe_psql(
            'postgres', qq(
select lp_off from heap_page_items(get_raw_page('test', 'main', 0))
    offset $tupidx limit 1)));
}

# Sanity check that our 'test' table on disk layout matches expectations.  If
# this is not so, we will have to skip the test until somebody updates the test
# to work on this platform.
$node->stop;
my $file;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

my $ENDIANNESS;
for my $tupidx (0 .. ROWCOUNT - 1)
{
    my $offset = $lp_off[$tupidx];
    my $tup = read_tuple($file, $offset);

    # Sanity-check that the data appears on the page where we expect.
    my $a_1 = $tup->{a_1};
    my $a_2 = $tup->{a_2};
    my $b   = $tup->{b};
    if ($a_1 != 0xDEADF9F9 || $a_2 != 0xDEADF9F9 || $b ne 'abcdefg')
    {
        close($file);    # ignore errors on close; we're exiting anyway
        $node->clean_node;
        plan skip_all =>
          sprintf(
            "Page layout differs from our expectations: expected (%x, %x, \"%s\"), got (%x, %x, \"%s\")",
            0xDEADF9F9, 0xDEADF9F9, "abcdefg", $a_1, $a_2, $b);
        exit;
    }

    # Determine endianness of current platform from the 1-byte varlena header
    $ENDIANNESS = $tup->{b_header} == 0x11 ? "little" : "big";
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;

# Ok, Xids and page layout look ok.  We can run corruption tests.
plan tests => 19;

# Check that pg_amcheck runs against the uncorrupted table without error.
$node->command_ok(
    [ 'pg_amcheck', '-p', $port, 'postgres' ],
    'pg_amcheck test table, prior to corruption');

# Check that pg_amcheck runs against the uncorrupted table and index without error.
$node->command_ok(
    [ 'pg_amcheck', '-p', $port, 'postgres' ],
    'pg_amcheck test table and index, prior to corruption');

# The heap file is modified directly below, so shut the server down first.
$node->stop;

# Some #define constants from access/htup_details.h for use while corrupting.
use constant HEAP_HASNULL        => 0x0001;
use constant HEAP_XMAX_LOCK_ONLY => 0x0080;
use constant HEAP_XMIN_COMMITTED => 0x0100;
use constant HEAP_XMIN_INVALID   => 0x0200;
use constant HEAP_XMAX_COMMITTED => 0x0400;
use constant HEAP_XMAX_INVALID   => 0x0800;
use constant HEAP_NATTS_MASK     => 0x07FF;
use constant HEAP_XMAX_IS_MULTI  => 0x1000;
use constant HEAP_KEYS_UPDATED   => 0x2000;

# Build a regular expression matching the location prefix of a corruption
# report for table postgres.public.test.
#
# Arguments are ($blkno, $offnum, $attnum); trailing arguments may be undef.
# The most specific defined argument decides how much of the location
# (block, offset, attribute) appears in the pattern.  Returns a compiled
# regex (qr//) that ends in ":\s+" so callers can append the message text.
sub header
{
    my ($blkno, $offnum, $attnum) = @_;

    my $pattern;
    if (defined $attnum)
    {
        $pattern =
          qr/heap table "postgres\.public\.test", block $blkno, offset $offnum, attribute $attnum:\s+/ms;
    }
    elsif (defined $offnum)
    {
        $pattern =
          qr/heap table "postgres\.public\.test", block $blkno, offset $offnum:\s+/ms;
    }
    elsif (defined $blkno)
    {
        $pattern = qr/heap table "postgres\.public\.test", block $blkno:\s+/ms;
    }
    else
    {
        $pattern = qr/heap table "postgres\.public\.test":\s+/ms;
    }
    return $pattern;
}

# Corrupt the tuples, one type of corruption per tuple.  Some types of
# corruption cause verify_heapam to skip to the next tuple without
# performing any remaining checks, so we can't exercise the system properly if
# we focus all our corruption on a single tuple.
#
# @expected collects one regex per corruption report we expect pg_amcheck to
# print; the count of these patterns is baked into "plan tests" above.
my @expected;
open($file, '+<', $relpath)
  or BAIL_OUT("open failed: $!");
binmode $file;

for my $tupidx (0 .. ROWCOUNT - 1)
{
    my $offnum = $tupidx + 1;    # offnum is 1-based, not zero-based
    my $offset = $lp_off[$tupidx];
    my $tup = read_tuple($file, $offset);

    # $header is a compiled regex (qr//) matching the "block 0, offset N"
    # prefix; it is interpolated into each expected pattern below.
    my $header = header(0, $offnum, undef);
    if ($offnum == 1)
    {
        # Corruptly set xmin < relfrozenxid
        my $xmin = $relfrozenxid - 1;
        $tup->{t_xmin} = $xmin;
        $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
        $tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

        # Expected corruption report
        push @expected,
          qr/${header}xmin $xmin precedes relation freeze threshold 0:\d+/;
    }
    elsif ($offnum == 2)
    {
        # Corruptly set xmin < datfrozenxid
        my $xmin = 3;
        $tup->{t_xmin} = $xmin;
        $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
        $tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

        push @expected,
          qr/${header}xmin $xmin precedes oldest valid transaction ID 0:\d+/;
    }
    elsif ($offnum == 3)
    {
        # Corruptly set xmin < datfrozenxid, further back, noting circularity
        # of xid comparison.  For a new cluster with epoch = 0, the corrupt
        # xmin will be interpreted as in the future
        $tup->{t_xmin} = 4026531839;
        $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
        $tup->{t_infomask} &= ~HEAP_XMIN_INVALID;

        push @expected,
          qr/${header}xmin 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
    }
    elsif ($offnum == 4)
    {
        # Corruptly set xmax to a value at or beyond the next valid
        # transaction ID, and clear HEAP_XMAX_INVALID so it gets examined
        $tup->{t_xmax} = 4026531839;
        $tup->{t_infomask} &= ~HEAP_XMAX_INVALID;

        push @expected,
          qr/${header}xmax 4026531839 equals or exceeds next valid transaction ID 0:\d+/;
    }
    elsif ($offnum == 5)
    {
        # Corrupt the tuple t_hoff, but keep it aligned properly
        $tup->{t_hoff} += 128;

        push @expected,
          qr/${header}data begins at offset 152 beyond the tuple length 58/,
          qr/${header}tuple data should begin at byte 24, but actually begins at byte 152 \(3 attributes, no nulls\)/;
    }
    elsif ($offnum == 6)
    {
        # Corrupt the tuple t_hoff, wrong alignment
        $tup->{t_hoff} += 3;

        push @expected,
          qr/${header}tuple data should begin at byte 24, but actually begins at byte 27 \(3 attributes, no nulls\)/;
    }
    elsif ($offnum == 7)
    {
        # Corrupt the tuple t_hoff, underflow but correct alignment
        $tup->{t_hoff} -= 8;

        push @expected,
          qr/${header}tuple data should begin at byte 24, but actually begins at byte 16 \(3 attributes, no nulls\)/;
    }
    elsif ($offnum == 8)
    {
        # Corrupt the tuple t_hoff, underflow and wrong alignment
        $tup->{t_hoff} -= 3;

        push @expected,
          qr/${header}tuple data should begin at byte 24, but actually begins at byte 21 \(3 attributes, no nulls\)/;
    }
    elsif ($offnum == 9)
    {
        # Corrupt the tuple to look like it has lots of attributes, not just 3
        $tup->{t_infomask2} |= HEAP_NATTS_MASK;

        push @expected,
          qr/${header}number of attributes 2047 exceeds maximum expected for table 3/;
    }
    elsif ($offnum == 10)
    {
        # Corrupt the tuple to look like it has lots of attributes, some of
        # them null.  This falsely creates the impression that the t_bits
        # array is longer than just one byte, but t_hoff still says otherwise.
        $tup->{t_infomask} |= HEAP_HASNULL;
        $tup->{t_infomask2} |= HEAP_NATTS_MASK;
        $tup->{t_bits} = 0xAA;

        push @expected,
          qr/${header}tuple data should begin at byte 280, but actually begins at byte 24 \(2047 attributes, has nulls\)/;
    }
    elsif ($offnum == 11)
    {
        # Same as above, but this time t_hoff plays along
        $tup->{t_infomask} |= HEAP_HASNULL;
        $tup->{t_infomask2} |= (HEAP_NATTS_MASK & 0x40);
        $tup->{t_bits} = 0xAA;
        $tup->{t_hoff} = 32;

        push @expected,
          qr/${header}number of attributes 67 exceeds maximum expected for table 3/;
    }
    elsif ($offnum == 12)
    {
        # Overwrite column 'b' 1-byte varlena header and initial characters to
        # look like a long 4-byte varlena
        #
        # On little endian machines, bytes ending in two zero bits (xxxxxx00 bytes)
        # are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
        # high six bits to 111111 and the lower two bits to 00, then the next three
        # bytes with 0xFF using 0xFCFFFFFF.
        #
        # On big endian machines, bytes starting in two zero bits (00xxxxxx bytes)
        # are 4-byte length word, aligned, uncompressed data (up to 1G).  We set the
        # low six bits to 111111 and the high two bits to 00, then the next three
        # bytes with 0xFF using 0x3FFFFFFF.
        #
        $tup->{b_header} = $ENDIANNESS eq 'little' ? 0xFC : 0x3F;
        $tup->{b_body1} = 0xFF;
        $tup->{b_body2} = 0xFF;
        $tup->{b_body3} = 0xFF;

        $header = header(0, $offnum, 1);
        push @expected,
          qr/${header}attribute with length \d+ ends at offset \d+ beyond total tuple length \d+/;
    }
    elsif ($offnum == 13)
    {
        # Corrupt the bits in column 'c' toast pointer
        $tup->{c_va_valueid} = 0xFFFFFFFF;

        $header = header(0, $offnum, 2);
        push @expected, qr/${header}toast value \d+ not found in toast table/;
    }
    elsif ($offnum == 14)
    {
        # Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
        $tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
        $tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
        $tup->{t_xmax} = 4;

        push @expected,
          qr/${header}multitransaction ID 4 equals or exceeds next valid multitransaction ID 1/;
    }
    elsif ($offnum == 15)
    {
        # Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI
        $tup->{t_infomask} |= HEAP_XMAX_COMMITTED;
        $tup->{t_infomask} |= HEAP_XMAX_IS_MULTI;
        $tup->{t_xmax} = 4000000000;

        push @expected,
          qr/${header}multitransaction ID 4000000000 precedes relation minimum multitransaction ID threshold 1/;
    }

    # Tuples whose offnum matches no branch above are written back with the
    # same field values that were read.
    write_tuple($file, $offset, $tup);
}
close($file)
  or BAIL_OUT("close failed: $!");
$node->start;

# Run pg_amcheck against the corrupt table with epoch=0, comparing actual
# corruption messages against the expected messages
$node->command_checks_all(
    [ 'pg_amcheck', '--no-dependent-indexes', '-p', $port, 'postgres' ],
    2, [@expected], [], 'Expected corruption message output');

$node->teardown_node;
$node->clean_node;