1 /*-------------------------------------------------------------------------
2  *
3  * nbtxlog.h
4  *	  header file for postgres btree xlog routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/include/access/nbtxlog.h
10  *
11  *-------------------------------------------------------------------------
12  */
13 #ifndef NBTXLOG_H
14 #define NBTXLOG_H
15 
16 #include "access/xlogreader.h"
17 #include "lib/stringinfo.h"
18 #include "storage/off.h"
19 
20 /*
21  * XLOG records for btree operations
22  *
23  * XLOG allows to store some information in high 4 bits of log
24  * record xl_info field
25  */
26 #define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
27 #define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
28 #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
29 #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
30 #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
31 #define XLOG_BTREE_INSERT_POST	0x50	/* add index tuple with posting split */
32 #define XLOG_BTREE_DEDUP		0x60	/* deduplicate tuples for a page */
33 #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
34 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
35 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
36 #define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
37 #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
38 #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
39 										 * vacuum */
40 #define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
41 										 * FSM */
42 #define XLOG_BTREE_META_CLEANUP	0xE0	/* update cleanup-related data in the
43 										 * metapage */
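
/*
 * Illustrative sketch only -- not part of the original header.  The helper
 * below (and the NBTXLOG_EXAMPLES guard macro) are hypothetical; they show
 * how a redo or describe routine typically recovers the record type above,
 * by masking off the low bits with XLR_INFO_MASK (available via the
 * xlogreader.h include).
 */
#ifdef NBTXLOG_EXAMPLES
static inline uint8
example_btree_record_type(XLogReaderState *record)
{
	/* the high 4 bits of xl_info carry the XLOG_BTREE_* constant */
	return XLogRecGetInfo(record) & ~XLR_INFO_MASK;
}
#endif							/* NBTXLOG_EXAMPLES */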
44 
45 /*
46  * All that we need to regenerate the meta-data page
47  */
48 typedef struct xl_btree_metadata
49 {
50 	uint32		version;
51 	BlockNumber root;
52 	uint32		level;
53 	BlockNumber fastroot;
54 	uint32		fastlevel;
55 	TransactionId oldest_btpo_xact;
56 	float8		last_cleanup_num_heap_tuples;
57 	bool		allequalimage;
58 } xl_btree_metadata;
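
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): the
 * metadata is registered as backup block data rather than main record data,
 * so recovery fetches it with XLogRecGetBlockData() and casts the result.
 * Which block id holds it depends on the record type; see the per-record
 * "Backup Blk" comments below.
 */
#ifdef NBTXLOG_EXAMPLES
static inline xl_btree_metadata *
example_read_metadata(XLogReaderState *record, uint8 block_id)
{
	Size		len;

	return (xl_btree_metadata *) XLogRecGetBlockData(record, block_id, &len);
}
#endif							/* NBTXLOG_EXAMPLES */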
59 
60 /*
61  * This is what we need to know about simple (without split) insert.
62  *
63  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
 * INSERT_POST.  Note that INSERT_META and INSERT_UPPER imply it's not a
65  * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
66  * page.
67  *
68  * Backup Blk 0: original page
69  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
70  * Backup Blk 2: xl_btree_metadata, if INSERT_META
71  *
72  * Note: The new tuple is actually the "original" new item in the posting
73  * list split insert case (i.e. the INSERT_POST case).  A split offset for
74  * the posting list is logged before the original new item.  Recovery needs
75  * both, since it must do an in-place update of the existing posting list
76  * that was split as an extra step.  Also, recovery generates a "final"
77  * newitem.  See _bt_swap_posting() for details on posting list splits.
78  */
79 typedef struct xl_btree_insert
80 {
81 	OffsetNumber offnum;
82 
83 	/* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
84 	/* NEW TUPLE ALWAYS FOLLOWS AT THE END */
85 } xl_btree_insert;
86 
87 #define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
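
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): the
 * posting split offset and the new tuple are registered with backup block 0,
 * so when that block needs redo they are read from its data area.  For
 * INSERT_POST a uint16 posting list split offset comes first; for the other
 * insert record types the tuple starts immediately.
 */
#ifdef NBTXLOG_EXAMPLES
static inline char *
example_insert_newitem(XLogReaderState *record, bool is_insert_post,
					   uint16 *postingoff, Size *newitemlen)
{
	Size		datalen;
	char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

	*postingoff = 0;
	if (is_insert_post)
	{
		memcpy(postingoff, datapos, sizeof(uint16));
		datapos += sizeof(uint16);
		datalen -= sizeof(uint16);
	}

	*newitemlen = datalen;
	return datapos;				/* start of the logged index tuple */
}
#endif							/* NBTXLOG_EXAMPLES */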
88 
89 /*
90  * On insert with split, we save all the items going into the right sibling
91  * so that we can restore it completely from the log record.  This way takes
 * less xlog space than the normal approach, because if we did it the usual way,
93  * XLogInsert would almost always think the right page is new and store its
94  * whole page image.  The left page, however, is handled in the normal
95  * incremental-update fashion.
96  *
97  * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
98  * There are two variants to indicate whether the inserted tuple went into the
99  * left or right split page (and thus, whether the new item is stored or not).
100  * We always log the left page high key because suffix truncation can generate
101  * a new leaf high key using user-defined code.  This is also necessary on
102  * internal pages, since the firstright item that the left page's high key was
103  * based on will have been truncated to zero attributes in the right page (the
104  * separator key is unavailable from the right page).
105  *
106  * Backup Blk 0: original page / new left page
107  *
108  * The left page's data portion contains the new item, if it's the _L variant.
109  * _R variant split records generally do not have a newitem (_R variant leaf
110  * page split records that must deal with a posting list split will include an
111  * explicit newitem, though it is never used on the right page -- it is
 * actually an orignewitem needed to update the existing posting list).  The
 * new high key of the left/original page appears last of all (and must
 * always be present).
115  *
116  * Page split records that need the REDO routine to deal with a posting list
117  * split directly will have an explicit newitem, which is actually an
118  * orignewitem (the newitem as it was before the posting list split, not
119  * after).  A posting list split always has a newitem that comes immediately
120  * after the posting list being split (which would have overlapped with
121  * orignewitem prior to split).  Usually REDO must deal with posting list
122  * splits with an _L variant page split record, and usually both the new
123  * posting list and the final newitem go on the left page (the existing
124  * posting list will be inserted instead of the old, and the final newitem
125  * will be inserted next to that).  However, _R variant split records will
126  * include an orignewitem when the split point for the page happens to have a
127  * lastleft tuple that is also the posting list being split (leaving newitem
128  * as the page split's firstright tuple).  The existence of this corner case
129  * does not change the basic fact about newitem/orignewitem for the REDO
130  * routine: it is always state used for the left page alone.  (This is why the
131  * record's postingoff field isn't a reliable indicator of whether or not a
132  * posting list split occurred during the page split; a non-zero value merely
133  * indicates that the REDO routine must reconstruct a new posting list tuple
134  * that is needed for the left page.)
135  *
136  * This posting list split handling is equivalent to the xl_btree_insert REDO
137  * routine's INSERT_POST handling.  While the details are more complicated
138  * here, the concept and goals are exactly the same.  See _bt_swap_posting()
139  * for details on posting list splits.
140  *
141  * Backup Blk 1: new right page
142  *
143  * The right page's data portion contains the right page's tuples in the form
144  * used by _bt_restore_page.  This includes the new item, if it's the _R
145  * variant.  The right page's tuples also include the right page's high key
146  * with either variant (moved from the left/original page during the split),
 * unless the split happened to be of the rightmost page on its level, in
 * which case there is no high key for the new right page.
149  *
150  * Backup Blk 2: next block (orig page's rightlink), if any
151  * Backup Blk 3: child's left sibling, if non-leaf split
152  */
153 typedef struct xl_btree_split
154 {
155 	uint32		level;			/* tree level of page being split */
156 	OffsetNumber firstrightoff; /* first origpage item on rightpage */
157 	OffsetNumber newitemoff;	/* new item's offset */
158 	uint16		postingoff;		/* offset inside orig posting tuple */
159 } xl_btree_split;
160 
161 #define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(uint16))
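
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): the new
 * right page's tuples described above live in backup block 1's data area, in
 * the form consumed by _bt_restore_page(); the left page's newitem/high key
 * payload lives in backup block 0's data area instead.
 */
#ifdef NBTXLOG_EXAMPLES
static inline char *
example_split_rightpage_tuples(XLogReaderState *record, Size *len)
{
	return XLogRecGetBlockData(record, 1, len);
}
#endif							/* NBTXLOG_EXAMPLES */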
162 
163 /*
164  * When page is deduplicated, consecutive groups of tuples with equal keys are
165  * merged together into posting list tuples.
166  *
167  * The WAL record represents a deduplication pass for a leaf page.  An array
168  * of BTDedupInterval structs follows.
169  */
170 typedef struct xl_btree_dedup
171 {
172 	uint16		nintervals;
173 
174 	/* DEDUPLICATION INTERVALS FOLLOW */
175 } xl_btree_dedup;
176 
177 #define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
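
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): the
 * nintervals BTDedupInterval entries (declared in access/nbtree.h, which is
 * not included here) are registered as backup block 0 data, so they are
 * omitted whenever a full-page image of the leaf page is logged and are only
 * read when the block actually needs redo.
 */
#ifdef NBTXLOG_EXAMPLES
static inline char *
example_dedup_intervals(XLogReaderState *record, Size *len)
{
	return XLogRecGetBlockData(record, 0, len);
}
#endif							/* NBTXLOG_EXAMPLES */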
178 
179 /*
180  * This is what we need to know about delete of individual leaf index tuples.
181  * The WAL record can represent deletion of any number of index tuples on a
182  * single index page when *not* executed by VACUUM.  Deletion of a subset of
183  * the TIDs within a posting list tuple is not supported.
184  *
185  * Backup Blk 0: index page
186  */
187 typedef struct xl_btree_delete
188 {
189 	TransactionId latestRemovedXid;
190 	uint32		ndeleted;
191 
192 	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
193 } xl_btree_delete;
194 
195 #define SizeOfBtreeDelete	(offsetof(xl_btree_delete, ndeleted) + sizeof(uint32))
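
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper, assuming
 * the PostgreSQL 13-era interfaces in storage/standby.h): recovery uses
 * latestRemovedXid to cancel standby queries that might still need the
 * deleted tuples, before touching the page.  The index's RelFileNode comes
 * from backup block 0's tag; the real redo routine performs this step only
 * when InHotStandby.
 */
#ifdef NBTXLOG_EXAMPLES
#include "storage/standby.h"

static inline void
example_delete_conflict(XLogReaderState *record, xl_btree_delete *xlrec)
{
	RelFileNode rnode;

	XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
	ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
}
#endif							/* NBTXLOG_EXAMPLES */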
196 
197 /*
198  * This is what we need to know about page reuse within btree.  This record
199  * only exists to generate a conflict point for Hot Standby.
200  *
201  * Note that we must include a RelFileNode in the record because we don't
202  * actually register the buffer with the record.
203  */
204 typedef struct xl_btree_reuse_page
205 {
206 	RelFileNode node;
207 	BlockNumber block;
208 	TransactionId latestRemovedXid;
209 } xl_btree_reuse_page;
210 
211 #define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))
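
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper, assuming
 * the PostgreSQL 13-era interfaces in storage/standby.h): since no buffer is
 * registered, the RelFileNode stored in the record itself supplies the
 * conflict information; the real redo routine performs this step only when
 * InHotStandby, and does nothing else with the record.
 */
#ifdef NBTXLOG_EXAMPLES
#include "storage/standby.h"

static inline void
example_reuse_page_conflict(xl_btree_reuse_page *xlrec)
{
	ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
}
#endif							/* NBTXLOG_EXAMPLES */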
212 
213 /*
214  * This is what we need to know about which TIDs to remove from an individual
215  * posting list tuple during vacuuming.  An array of these may appear at the
216  * end of xl_btree_vacuum records.
217  */
218 typedef struct xl_btree_update
219 {
220 	uint16		ndeletedtids;
221 
222 	/* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
223 } xl_btree_update;
224 
225 #define SizeOfBtreeUpdate	(offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
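
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): each
 * xl_btree_update entry is variable-length because its ndeletedtids uint16
 * posting list offsets follow it directly, so walking the metadata array in
 * an xl_btree_vacuum record means stepping by SizeOfBtreeUpdate plus the
 * current entry's offsets.
 */
#ifdef NBTXLOG_EXAMPLES
static inline xl_btree_update *
example_next_update(xl_btree_update *cur)
{
	return (xl_btree_update *) ((char *) cur + SizeOfBtreeUpdate +
								cur->ndeletedtids * sizeof(uint16));
}
#endif							/* NBTXLOG_EXAMPLES */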
226 
227 /*
228  * This is what we need to know about a VACUUM of a leaf page.  The WAL record
229  * can represent deletion of any number of index tuples on a single index page
230  * when executed by VACUUM.  It can also support "updates" of index tuples,
231  * which is how deletes of a subset of TIDs contained in an existing posting
232  * list tuple are implemented. (Updates are only used when there will be some
233  * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
234  * just be deleted).
235  *
236  * Updated posting list tuples are represented using xl_btree_update metadata.
237  * The REDO routine uses each xl_btree_update (plus its corresponding original
238  * index tuple from the target leaf page) to generate the final updated tuple.
239  */
240 typedef struct xl_btree_vacuum
241 {
242 	uint16		ndeleted;
243 	uint16		nupdated;
244 
245 	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
246 	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
247 	/* UPDATED TUPLES METADATA ARRAY FOLLOWS */
248 } xl_btree_vacuum;
249 
250 #define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
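
/*
 * Illustrative sketch only (hypothetical NBTXLOG_EXAMPLES helper): the three
 * trailing arrays are registered as backup block 0 data (so they can be
 * omitted when a full-page image of the leaf page is logged) and are laid
 * out back to back in the order of the comments above.  The final array is
 * itself variable-length; see the stepping helper above.
 */
#ifdef NBTXLOG_EXAMPLES
static inline void
example_vacuum_payload(XLogReaderState *record,
					   OffsetNumber **deletedoffsets,
					   OffsetNumber **updatedoffsets,
					   xl_btree_update **updates)
{
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
	char	   *ptr = XLogRecGetBlockData(record, 0, NULL);

	*deletedoffsets = (OffsetNumber *) ptr;
	*updatedoffsets = (OffsetNumber *)
		(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
	*updates = (xl_btree_update *)
		((char *) *updatedoffsets + xlrec->nupdated * sizeof(OffsetNumber));
}
#endif							/* NBTXLOG_EXAMPLES */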
251 
252 /*
253  * This is what we need to know about marking an empty subtree for deletion.
254  * The target identifies the tuple removed from the parent page (note that we
255  * remove this tuple's downlink and the *following* tuple's key).  Note that
256  * the leaf page is empty, so we don't need to store its content --- it is
257  * just reinitialized during recovery using the rest of the fields.
258  *
259  * Backup Blk 0: leaf block
260  * Backup Blk 1: top parent
261  */
262 typedef struct xl_btree_mark_page_halfdead
263 {
264 	OffsetNumber poffset;		/* deleted tuple id in parent page */
265 
266 	/* information needed to recreate the leaf page: */
267 	BlockNumber leafblk;		/* leaf block ultimately being deleted */
268 	BlockNumber leftblk;		/* leaf block's left sibling, if any */
269 	BlockNumber rightblk;		/* leaf block's right sibling */
270 	BlockNumber topparent;		/* topmost internal page in the subtree */
271 } xl_btree_mark_page_halfdead;
272 
273 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
274 
275 /*
276  * This is what we need to know about deletion of a btree page.  Note we do
277  * not store any content for the deleted page --- it is just rewritten as empty
278  * during recovery, apart from resetting the btpo.xact.
279  *
280  * Backup Blk 0: target block being deleted
281  * Backup Blk 1: target block's left sibling, if any
282  * Backup Blk 2: target block's right sibling
283  * Backup Blk 3: leaf block (if different from target)
284  * Backup Blk 4: metapage (if rightsib becomes new fast root)
285  */
286 typedef struct xl_btree_unlink_page
287 {
288 	BlockNumber leftsib;		/* target block's left sibling, if any */
289 	BlockNumber rightsib;		/* target block's right sibling */
290 
291 	/*
292 	 * Information needed to recreate the leaf page, when target is an
293 	 * internal page.
294 	 */
295 	BlockNumber leafleftsib;
296 	BlockNumber leafrightsib;
297 	BlockNumber topparent;		/* next child down in the subtree */
298 
299 	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
300 	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
301 } xl_btree_unlink_page;
302 
303 #define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
304 
305 /*
306  * New root log record.  There are zero tuples if this is to establish an
307  * empty root, or two if it is the result of splitting an old root.
308  *
309  * Note that although this implies rewriting the metadata page, we don't need
310  * an xl_btree_metadata record --- the rootblk and level are sufficient.
311  *
312  * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
313  * Backup Blk 1: left child (if splitting an old root)
314  * Backup Blk 2: metapage
315  */
316 typedef struct xl_btree_newroot
317 {
318 	BlockNumber rootblk;		/* location of new root (redundant with blk 0) */
319 	uint32		level;			/* its tree level */
320 } xl_btree_newroot;
321 
322 #define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))
323 
324 
325 /*
326  * prototypes for functions in nbtxlog.c
327  */
328 extern void btree_redo(XLogReaderState *record);
329 extern void btree_desc(StringInfo buf, XLogReaderState *record);
330 extern const char *btree_identify(uint8 info);
331 extern void btree_xlog_startup(void);
332 extern void btree_xlog_cleanup(void);
333 extern void btree_mask(char *pagedata, BlockNumber blkno);
334 
335 #endif							/* NBTXLOG_H */
336