/*-------------------------------------------------------------------------
 *
 * nbtxlog.h
 *	  header file for postgres btree xlog routines
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtxlog.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTXLOG_H
#define NBTXLOG_H

#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"

/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record xl_info field.
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_INSERT_POST	0x50	/* add index tuple with posting split */
#define XLOG_BTREE_DEDUP		0x60	/* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */
#define XLOG_BTREE_META_CLEANUP	0xE0	/* update cleanup-related data in the
										 * metapage */

/*
 * All that we need to regenerate the meta-data page
 */
typedef struct xl_btree_metadata
{
	uint32		version;
	BlockNumber root;
	uint32		level;
	BlockNumber fastroot;
	uint32		fastlevel;
	TransactionId oldest_btpo_xact;
	float8		last_cleanup_num_heap_tuples;
	bool		allequalimage;
} xl_btree_metadata;

/*
 * This is what we need to know about simple (without split) insert.
 *
 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
 * INSERT_POST.  Note that INSERT_META and INSERT_UPPER imply it's not a
 * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
 * page.
 *
 * Backup Blk 0: original page
 * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
 * Backup Blk 2: xl_btree_metadata, if INSERT_META
 *
 * Note: The new tuple is actually the "original" new item in the posting
 * list split insert case (i.e. the INSERT_POST case).  A split offset for
 * the posting list is logged before the original new item.  Recovery needs
 * both, since it must do an in-place update of the existing posting list
 * that was split as an extra step.  Also, recovery generates a "final"
 * newitem.  See _bt_swap_posting() for details on posting list splits.
 */
typedef struct xl_btree_insert
{
	OffsetNumber offnum;

	/* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
	/* NEW TUPLE ALWAYS FOLLOWS AT THE END */
} xl_btree_insert;

#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))

/*
 * On insert with split, we save all the items going into the right sibling
 * so that we can restore it completely from the log record.  This way takes
 * less xlog space than the normal approach, because if we did it standardly,
 * XLogInsert would almost always think the right page is new and store its
 * whole page image.  The left page, however, is handled in the normal
 * incremental-update fashion.
 *
 * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
 * There are two variants to indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether the new item is stored or not).
 * We always log the left page high key because suffix truncation can generate
 * a new leaf high key using user-defined code.  This is also necessary on
 * internal pages, since the firstright item that the left page's high key was
 * based on will have been truncated to zero attributes in the right page (the
 * separator key is unavailable from the right page).
 *
 * Backup Blk 0: original page / new left page
 *
 * The left page's data portion contains the new item, if it's the _L variant.
 * _R variant split records generally do not have a newitem (_R variant leaf
 * page split records that must deal with a posting list split will include an
 * explicit newitem, though it is never used on the right page -- it is
 * actually an orignewitem needed to update existing posting list).  The new
 * high key of the left/original page appears last of all (and must always be
 * present).
 *
 * Page split records that need the REDO routine to deal with a posting list
 * split directly will have an explicit newitem, which is actually an
 * orignewitem (the newitem as it was before the posting list split, not
 * after).  A posting list split always has a newitem that comes immediately
 * after the posting list being split (which would have overlapped with
 * orignewitem prior to split).  Usually REDO must deal with posting list
 * splits with an _L variant page split record, and usually both the new
 * posting list and the final newitem go on the left page (the existing
 * posting list will be inserted instead of the old, and the final newitem
 * will be inserted next to that).  However, _R variant split records will
 * include an orignewitem when the split point for the page happens to have a
 * lastleft tuple that is also the posting list being split (leaving newitem
 * as the page split's firstright tuple).  The existence of this corner case
 * does not change the basic fact about newitem/orignewitem for the REDO
 * routine: it is always state used for the left page alone.  (This is why the
 * record's postingoff field isn't a reliable indicator of whether or not a
 * posting list split occurred during the page split; a non-zero value merely
 * indicates that the REDO routine must reconstruct a new posting list tuple
 * that is needed for the left page.)
 *
 * This posting list split handling is equivalent to the xl_btree_insert REDO
 * routine's INSERT_POST handling.  While the details are more complicated
 * here, the concept and goals are exactly the same.  See _bt_swap_posting()
 * for details on posting list splits.
 *
 * Backup Blk 1: new right page
 *
 * The right page's data portion contains the right page's tuples in the form
 * used by _bt_restore_page.  This includes the new item, if it's the _R
 * variant.  The right page's tuples also include the right page's high key
 * with either variant (moved from the left/original page during the split),
 * unless the split happened to be of the rightmost page on its level, where
 * there is no high key for new right page.
 *
 * Backup Blk 2: next block (orig page's rightlink), if any
 * Backup Blk 3: child's left sibling, if non-leaf split
 */
typedef struct xl_btree_split
{
	uint32		level;			/* tree level of page being split */
	OffsetNumber firstrightoff; /* first origpage item on rightpage */
	OffsetNumber newitemoff;	/* new item's offset */
	uint16		postingoff;		/* offset inside orig posting tuple */
} xl_btree_split;

#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(uint16))

/*
 * When page is deduplicated, consecutive groups of tuples with equal keys are
 * merged together into posting list tuples.
 *
 * The WAL record represents a deduplication pass for a leaf page.  An array
 * of BTDedupInterval structs follows.
 */
typedef struct xl_btree_dedup
{
	uint16		nintervals;

	/* DEDUPLICATION INTERVALS FOLLOW */
} xl_btree_dedup;

#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))

/*
 * This is what we need to know about delete of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when *not* executed by VACUUM.  Deletion of a subset of
 * the TIDs within a posting list tuple is not supported.
 *
 * Backup Blk 0: index page
 */
typedef struct xl_btree_delete
{
	TransactionId latestRemovedXid;
	uint32		ndeleted;

	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_delete;

#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, ndeleted) + sizeof(uint32))

/*
 * This is what we need to know about page reuse within btree.  This record
 * only exists to generate a conflict point for Hot Standby.
 *
 * Note that we must include a RelFileNode in the record because we don't
 * actually register the buffer with the record.
 */
typedef struct xl_btree_reuse_page
{
	RelFileNode node;
	BlockNumber block;
	TransactionId latestRemovedXid;
} xl_btree_reuse_page;

#define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))

/*
 * This is what we need to know about which TIDs to remove from an individual
 * posting list tuple during vacuuming.  An array of these may appear at the
 * end of xl_btree_vacuum records.
 */
typedef struct xl_btree_update
{
	uint16		ndeletedtids;

	/* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
} xl_btree_update;

#define SizeOfBtreeUpdate	(offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))

/*
 * This is what we need to know about a VACUUM of a leaf page.  The WAL record
 * can represent deletion of any number of index tuples on a single index page
 * when executed by VACUUM.  It can also support "updates" of index tuples,
 * which is how deletes of a subset of TIDs contained in an existing posting
 * list tuple are implemented.  (Updates are only used when there will be some
 * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
 * just be deleted).
 *
 * Updated posting list tuples are represented using xl_btree_update metadata.
 * The REDO routine uses each xl_btree_update (plus its corresponding original
 * index tuple from the target leaf page) to generate the final updated tuple.
 */
typedef struct xl_btree_vacuum
{
	uint16		ndeleted;
	uint16		nupdated;

	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
	/* UPDATED TUPLES METADATA ARRAY FOLLOWS */
} xl_btree_vacuum;

#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))

/*
 * This is what we need to know about marking an empty subtree for deletion.
 * The target identifies the tuple removed from the parent page (note that we
 * remove this tuple's downlink and the *following* tuple's key).  Note that
 * the leaf page is empty, so we don't need to store its content --- it is
 * just reinitialized during recovery using the rest of the fields.
 *
 * Backup Blk 0: leaf block
 * Backup Blk 1: top parent
 */
typedef struct xl_btree_mark_page_halfdead
{
	OffsetNumber poffset;		/* deleted tuple id in parent page */

	/* information needed to recreate the leaf page: */
	BlockNumber leafblk;		/* leaf block ultimately being deleted */
	BlockNumber leftblk;		/* leaf block's left sibling, if any */
	BlockNumber rightblk;		/* leaf block's right sibling */
	BlockNumber topparent;		/* topmost internal page in the subtree */
} xl_btree_mark_page_halfdead;

#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))

/*
 * This is what we need to know about deletion of a btree page.  Note we do
 * not store any content for the deleted page --- it is just rewritten as empty
 * during recovery, apart from resetting the btpo.xact.
 *
 * Backup Blk 0: target block being deleted
 * Backup Blk 1: target block's left sibling, if any
 * Backup Blk 2: target block's right sibling
 * Backup Blk 3: leaf block (if different from target)
 * Backup Blk 4: metapage (if rightsib becomes new fast root)
 */
typedef struct xl_btree_unlink_page
{
	BlockNumber leftsib;		/* target block's left sibling, if any */
	BlockNumber rightsib;		/* target block's right sibling */

	/*
	 * Information needed to recreate the leaf page, when target is an
	 * internal page.
	 */
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber topparent;		/* next child down in the subtree */

	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;

#define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))

/*
 * New root log record.  There are zero tuples if this is to establish an
 * empty root, or two if it is the result of splitting an old root.
 *
 * Note that although this implies rewriting the metadata page, we don't need
 * an xl_btree_metadata record --- the rootblk and level are sufficient.
 *
 * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
 * Backup Blk 1: left child (if splitting an old root)
 * Backup Blk 2: metapage
 */
typedef struct xl_btree_newroot
{
	BlockNumber rootblk;		/* location of new root (redundant with blk 0) */
	uint32		level;			/* its tree level */
} xl_btree_newroot;

#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))


/*
 * prototypes for functions in nbtxlog.c
 */
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern void btree_mask(char *pagedata, BlockNumber blkno);

#endif							/* NBTXLOG_H */