1 /*------------------------------------------------------------------------- 2 * 3 * nbtxlog.h 4 * header file for postgres btree xlog routines 5 * 6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 7 * Portions Copyright (c) 1994, Regents of the University of California 8 * 9 * src/include/access/nbtxlog.h 10 * 11 *------------------------------------------------------------------------- 12 */ 13 #ifndef NBTXLOG_H 14 #define NBTXLOG_H 15 16 #include "access/xlogreader.h" 17 #include "lib/stringinfo.h" 18 #include "storage/off.h" 19 20 /* 21 * XLOG records for btree operations 22 * 23 * XLOG allows to store some information in high 4 bits of log 24 * record xl_info field 25 */ 26 #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ 27 #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ 28 #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ 29 #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ 30 #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ 31 #define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */ 32 #define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */ 33 #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ 34 #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ 35 #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ 36 #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ 37 #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ 38 #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during 39 * vacuum */ 40 #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from 41 * FSM */ 42 #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the 43 * metapage */ 44 45 /* 46 * All that we need to regenerate the meta-data page 47 */ 48 typedef struct xl_btree_metadata 49 { 50 BlockNumber root; 51 uint32 level; 52 BlockNumber fastroot; 53 uint32 fastlevel; 54 TransactionId oldest_btpo_xact; 55 float8 last_cleanup_num_heap_tuples; 56 } xl_btree_metadata; 57 58 /* 59 * This is what we need to know about simple (without split) insert. 60 * 61 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. 62 * Note that INSERT_META implies it's not a leaf page. 63 * 64 * Backup Blk 0: original page (data contains the inserted tuple) 65 * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META 66 * Backup Blk 2: xl_btree_metadata, if INSERT_META 67 */ 68 typedef struct xl_btree_insert 69 { 70 OffsetNumber offnum; 71 } xl_btree_insert; 72 73 #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) 74 75 /* 76 * On insert with split, we save all the items going into the right sibling 77 * so that we can restore it completely from the log record. This way takes 78 * less xlog space than the normal approach, because if we did it standardly, 79 * XLogInsert would almost always think the right page is new and store its 80 * whole page image. The left page, however, is handled in the normal 81 * incremental-update fashion. 82 * 83 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. 84 * The _L and _R variants indicate whether the inserted tuple went into the 85 * left or right split page (and thus, whether the new item is stored or not). 86 * The _HIGHKEY variants indicate that we've logged explicitly left page high 87 * key value, otherwise redo should use right page leftmost key as a left page 88 * high key. _HIGHKEY is specified for internal pages where right page 89 * leftmost key is suppressed, and for leaf pages of covering indexes where 90 * high key have non-key attributes truncated. 91 * 92 * Backup Blk 0: original page / new left page 93 * 94 * The left page's data portion contains the new item, if it's the _L variant. 95 * (In the _R variants, the new item is one of the right page's tuples.) 96 * If level > 0, an IndexTuple representing the HIKEY of the left page 97 * follows. We don't need this on leaf pages, because it's the same as the 98 * leftmost key in the new right page. 99 * 100 * Backup Blk 1: new right page 101 * 102 * The right page's data portion contains the right page's tuples in the 103 * form used by _bt_restore_page. 104 * 105 * Backup Blk 2: next block (orig page's rightlink), if any 106 * Backup Blk 3: child's left sibling, if non-leaf split 107 */ 108 typedef struct xl_btree_split 109 { 110 uint32 level; /* tree level of page being split */ 111 OffsetNumber firstright; /* first item moved to right page */ 112 OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */ 113 } xl_btree_split; 114 115 #define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) 116 117 /* 118 * This is what we need to know about delete of individual leaf index tuples. 119 * The WAL record can represent deletion of any number of index tuples on a 120 * single index page when *not* executed by VACUUM. 121 * 122 * Backup Blk 0: index page 123 */ 124 typedef struct xl_btree_delete 125 { 126 RelFileNode hnode; /* RelFileNode of the heap the index currently 127 * points at */ 128 int nitems; 129 130 /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ 131 } xl_btree_delete; 132 133 #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) 134 135 /* 136 * This is what we need to know about page reuse within btree. 137 */ 138 typedef struct xl_btree_reuse_page 139 { 140 RelFileNode node; 141 BlockNumber block; 142 TransactionId latestRemovedXid; 143 } xl_btree_reuse_page; 144 145 #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) 146 147 /* 148 * This is what we need to know about vacuum of individual leaf index tuples. 149 * The WAL record can represent deletion of any number of index tuples on a 150 * single index page when executed by VACUUM. 151 * 152 * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. 153 * For a non-MVCC index scans there is an additional correctness requirement 154 * for applying these changes during recovery, which is that we must do one 155 * of these two things for every block in the index: 156 * * lock the block for cleanup and apply any required changes 157 * * EnsureBlockUnpinned() 158 * The purpose of this is to ensure that no index scans started before we 159 * finish scanning the index are still running by the time we begin to remove 160 * heap tuples. 161 * 162 * Any changes to any one block are registered on just one WAL record. All 163 * blocks that we need to run EnsureBlockUnpinned() are listed as a block range 164 * starting from the last block vacuumed through until this one. Individual 165 * block numbers aren't given. 166 * 167 * Note that the *last* WAL record in any vacuum of an index is allowed to 168 * have a zero length array of offsets. Earlier records must have at least one. 169 */ 170 typedef struct xl_btree_vacuum 171 { 172 BlockNumber lastBlockVacuumed; 173 174 /* TARGET OFFSET NUMBERS FOLLOW */ 175 } xl_btree_vacuum; 176 177 #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) 178 179 /* 180 * This is what we need to know about marking an empty branch for deletion. 181 * The target identifies the tuple removed from the parent page (note that we 182 * remove this tuple's downlink and the *following* tuple's key). Note that 183 * the leaf page is empty, so we don't need to store its content --- it is 184 * just reinitialized during recovery using the rest of the fields. 185 * 186 * Backup Blk 0: leaf block 187 * Backup Blk 1: top parent 188 */ 189 typedef struct xl_btree_mark_page_halfdead 190 { 191 OffsetNumber poffset; /* deleted tuple id in parent page */ 192 193 /* information needed to recreate the leaf page: */ 194 BlockNumber leafblk; /* leaf block ultimately being deleted */ 195 BlockNumber leftblk; /* leaf block's left sibling, if any */ 196 BlockNumber rightblk; /* leaf block's right sibling */ 197 BlockNumber topparent; /* topmost internal page in the branch */ 198 } xl_btree_mark_page_halfdead; 199 200 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) 201 202 /* 203 * This is what we need to know about deletion of a btree page. Note we do 204 * not store any content for the deleted page --- it is just rewritten as empty 205 * during recovery, apart from resetting the btpo.xact. 206 * 207 * Backup Blk 0: target block being deleted 208 * Backup Blk 1: target block's left sibling, if any 209 * Backup Blk 2: target block's right sibling 210 * Backup Blk 3: leaf block (if different from target) 211 * Backup Blk 4: metapage (if rightsib becomes new fast root) 212 */ 213 typedef struct xl_btree_unlink_page 214 { 215 BlockNumber leftsib; /* target block's left sibling, if any */ 216 BlockNumber rightsib; /* target block's right sibling */ 217 218 /* 219 * Information needed to recreate the leaf page, when target is an 220 * internal page. 221 */ 222 BlockNumber leafleftsib; 223 BlockNumber leafrightsib; 224 BlockNumber topparent; /* next child down in the branch */ 225 226 TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ 227 /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ 228 } xl_btree_unlink_page; 229 230 #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) 231 232 /* 233 * New root log record. There are zero tuples if this is to establish an 234 * empty root, or two if it is the result of splitting an old root. 235 * 236 * Note that although this implies rewriting the metadata page, we don't need 237 * an xl_btree_metadata record --- the rootblk and level are sufficient. 238 * 239 * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) 240 * Backup Blk 1: left child (if splitting an old root) 241 * Backup Blk 2: metapage 242 */ 243 typedef struct xl_btree_newroot 244 { 245 BlockNumber rootblk; /* location of new root (redundant with blk 0) */ 246 uint32 level; /* its tree level */ 247 } xl_btree_newroot; 248 249 #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) 250 251 252 /* 253 * prototypes for functions in nbtxlog.c 254 */ 255 extern void btree_redo(XLogReaderState *record); 256 extern void btree_desc(StringInfo buf, XLogReaderState *record); 257 extern const char *btree_identify(uint8 info); 258 extern void btree_mask(char *pagedata, BlockNumber blkno); 259 260 #endif /* NBXLOG_H */ 261