1 /***********************************************************************************************************************************
2 PostgreSQL Page Checksum Algorithm
3 
4 For each supported release of PostgreSQL check the code in this file to see if it has changed.  The easiest way to do this is to
5 copy and paste in place and check git to see if there are any diffs.  Tabs should be copied as is to make this process easy even
6 though the pgBackRest project does not use tabs elsewhere.
7 
8 Since the checksum implementation and page format do not (yet) change between versions this code should be copied verbatim from
9 src/include/storage/checksum_impl.h for each new release.  Only the newest released version of the code should be used.
10 
11 Modifications need to be made after copying:
12 
13 1) Remove `#include "storage/bufpage.h"`.
14 2) Make pg_checksum_page() static.
15 ***********************************************************************************************************************************/
16 
17 /*-------------------------------------------------------------------------
18  *
19  * checksum_impl.h
20  *	  Checksum implementation for data pages.
21  *
22  * This file exists for the benefit of external programs that may wish to
23  * check Postgres page checksums.  They can #include this to get the code
24  * referenced by storage/checksum.h.  (Note: you may need to redefine
25  * Assert() as empty to compile this successfully externally.)
26  *
27  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  * src/include/storage/checksum_impl.h
31  *
32  *-------------------------------------------------------------------------
33  */
34 
35 /*
36  * The algorithm used to checksum pages is chosen for very fast calculation.
37  * Workloads where the database working set fits into OS file cache but not
38  * into shared buffers can read in pages at a very fast pace and the checksum
39  * algorithm itself can become the largest bottleneck.
40  *
41  * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
42  * for Fowler/Noll/Vo).  The primitive of a plain FNV-1a hash folds in data 1
43  * byte at a time according to the formula:
44  *
45  *	   hash = (hash ^ value) * FNV_PRIME
46  *
47  * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
48  *
49  * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
50  * high bits - high order bits in input data only affect high order bits in
51  * output data. To resolve this we xor in the value prior to multiplication
52  * shifted right by 17 bits. The number 17 was chosen because it doesn't
53  * have common denominator with set bit positions in FNV_PRIME and empirically
54  * provides the fastest mixing for high order bits of final iterations quickly
55  * avalanche into lower positions. For performance reasons we choose to combine
56  * 4 bytes at a time. The actual hash formula used as the basis is:
57  *
58  *	   hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
59  *
60  * The main bottleneck in this calculation is the multiplication latency. To
61  * hide the latency and to make use of SIMD parallelism multiple hash values
62  * are calculated in parallel. The page is treated as a 32 column two
63  * dimensional array of 32 bit values. Each column is aggregated separately
64  * into a partial checksum. Each partial checksum uses a different initial
65  * value (offset basis in FNV terminology). The initial values actually used
66  * were chosen randomly, as the values themselves don't matter as much as that
67  * they are different and don't match anything in real data. After initializing
68  * partial checksums each value in the column is aggregated according to the
69  * above formula. Finally two more iterations of the formula are performed with
70  * value 0 to mix the bits of the last value added.
71  *
72  * The partial checksums are then folded together using xor to form a single
73  * 32-bit checksum. The caller can safely reduce the value to 16 bits
74  * using modulo 2^16-1. That will cause a very slight bias towards lower
75  * values but this is not significant for the performance of the
76  * checksum.
77  *
78  * The algorithm choice was based on what instructions are available in SIMD
79  * instruction sets. This meant that a fast and good algorithm needed to use
80  * multiplication as the main mixing operator. The simplest multiplication
81  * based checksum primitive is the one used by FNV. The prime used is chosen
82  * for good dispersion of values. It has no known simple patterns that result
83  * in collisions. Test of 5-bit differentials of the primitive over 64bit keys
84  * reveals no differentials with 3 or more values out of 100000 random keys
85  * colliding. Avalanche test shows that only high order bits of the last word
86  * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
87  * overwriting page from random position to end with 0 bytes, and overwriting
88  * random segments of page with 0x00, 0xFF and random data all show optimal
89  * 2e-16 false positive rate within margin of error.
90  *
91  * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
92  * multiplication instruction. As of 2013 the corresponding instruction is
93  * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
94  * Vectorization requires a compiler to do the vectorization for us. For recent
95  * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
96  * to achieve vectorization.
97  *
98  * The optimal amount of parallelism to use depends on CPU specific instruction
99  * latency, SIMD instruction width, throughput and the amount of registers
100  * available to hold intermediate state. Generally, more parallelism is better
101  * up to the point that state doesn't fit in registers and extra load-store
102  * instructions are needed to swap values in/out. The number chosen is a fixed
103  * part of the algorithm because changing the parallelism changes the checksum
104  * result.
105  *
106  * The parallelism number 32 was chosen based on the fact that it is the
107  * largest state that fits into architecturally visible x86 SSE registers while
108  * leaving some free registers for intermediate values. For future processors
109  * with 256bit vector registers this will leave some performance on the table.
110  * When vectorization is not available it might be beneficial to restructure
111  * the computation to calculate a subset of the columns at a time and perform
112  * multiple passes to avoid register spilling. This optimization opportunity
113  * is not used. Current coding also assumes that the compiler has the ability
114  * to unroll the inner loop to avoid loop overhead and minimize register
115  * spilling. For less sophisticated compilers it might be beneficial to
116  * manually unroll the inner loop.
117  */
118 
119 /* number of checksums to calculate in parallel */
120 #define N_SUMS 32
121 /* prime multiplier of FNV-1a hash */
122 #define FNV_PRIME 16777619
123 
124 /* Use a union so that this code is valid under strict aliasing */
125 typedef union
126 {
127 	PageHeaderData phdr;
128 	uint32		data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS];
129 } PGChecksummablePage;
130 
131 /*
132  * Base offsets to initialize each of the parallel FNV hashes into a
133  * different initial state.
134  */
135 static const uint32 checksumBaseOffsets[N_SUMS] = {
136 	0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A,
137 	0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
138 	0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA,
139 	0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
140 	0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE,
141 	0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
142 	0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E,
143 	0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
144 };
145 
146 /*
147  * Calculate one round of the checksum.
148  */
149 #define CHECKSUM_COMP(checksum, value) \
150 do { \
151 	uint32 __tmp = (checksum) ^ (value); \
152 	(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
153 } while (0)
154 
155 /*
156  * Block checksum algorithm.  The page must be adequately aligned
157  * (at least on 4-byte boundary).
158  */
159 static uint32
pg_checksum_block(const PGChecksummablePage * page)160 pg_checksum_block(const PGChecksummablePage *page)
161 {
162 	uint32		sums[N_SUMS];
163 	uint32		result = 0;
164 	uint32		i,
165 				j;
166 
167 	/* ensure that the size is compatible with the algorithm */
168 	Assert(sizeof(PGChecksummablePage) == BLCKSZ);
169 
170 	/* initialize partial checksums to their corresponding offsets */
171 	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
172 
173 	/* main checksum calculation */
174 	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
175 		for (j = 0; j < N_SUMS; j++)
176 			CHECKSUM_COMP(sums[j], page->data[i][j]);
177 
178 	/* finally add in two rounds of zeroes for additional mixing */
179 	for (i = 0; i < 2; i++)
180 		for (j = 0; j < N_SUMS; j++)
181 			CHECKSUM_COMP(sums[j], 0);
182 
183 	/* xor fold partial checksums together */
184 	for (i = 0; i < N_SUMS; i++)
185 		result ^= sums[i];
186 
187 	return result;
188 }
189 
190 /*
191  * Compute the checksum for a Postgres page.
192  *
193  * The page must be adequately aligned (at least on a 4-byte boundary).
194  * Beware also that the checksum field of the page is transiently zeroed.
195  *
196  * The checksum includes the block number (to detect the case where a page is
197  * somehow moved to a different location), the page header (excluding the
198  * checksum itself), and the page data.
199  */
200 static uint16
pg_checksum_page(char * page,BlockNumber blkno)201 pg_checksum_page(char *page, BlockNumber blkno)
202 {
203 	PGChecksummablePage *cpage = (PGChecksummablePage *) page;
204 	uint16		save_checksum;
205 	uint32		checksum;
206 
207 	/* We only calculate the checksum for properly-initialized pages */
208 	Assert(!PageIsNew(&cpage->phdr));
209 
210 	/*
211 	 * Save pd_checksum and temporarily set it to zero, so that the checksum
212 	 * calculation isn't affected by the old checksum stored on the page.
213 	 * Restore it after, because actually updating the checksum is NOT part of
214 	 * the API of this function.
215 	 */
216 	save_checksum = cpage->phdr.pd_checksum;
217 	cpage->phdr.pd_checksum = 0;
218 	checksum = pg_checksum_block(cpage);
219 	cpage->phdr.pd_checksum = save_checksum;
220 
221 	/* Mix in the block number to detect transposed pages */
222 	checksum ^= blkno;
223 
224 	/*
225 	 * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
226 	 * one. That avoids checksums of zero, which seems like a good idea.
227 	 */
228 	return (uint16) ((checksum % 65535) + 1);
229 }
230