1 /*- 2 * Copyright (c) 2014-2018 MongoDB, Inc. 3 * Copyright (c) 2008-2014 WiredTiger, Inc. 4 * All rights reserved. 5 * 6 * See the file LICENSE for redistribution information. 7 */ 8 9 /* 10 * Supported btree formats: the "current" version is the maximum supported 11 * major/minor versions. 12 */ 13 #define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */ 14 #define WT_BTREE_MINOR_VERSION_MIN 1 15 16 #define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */ 17 #define WT_BTREE_MINOR_VERSION_MAX 1 18 19 /* 20 * The maximum btree leaf and internal page size is 512MB (2^29). The limit 21 * is enforced in software, it could be larger, specifically, the underlying 22 * default block manager can support 4GB (2^32). Currently, the maximum page 23 * size must accommodate our dependence on the maximum page size fitting into 24 * a number of bits less than 32; see the row-store page key-lookup functions 25 * for the magic. 26 */ 27 #define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE) 28 29 /* 30 * The length of variable-length column-store values and row-store keys/values 31 * are stored in a 4B type, so the largest theoretical key/value item is 4GB. 32 * However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted" 33 * flag, and second, the size of an overflow object is constrained by what an 34 * underlying block manager can actually write. (For example, in the default 35 * block manager, writing an overflow item includes the underlying block's page 36 * header and block manager specific structure, aligned to an allocation-sized 37 * unit). The btree engine limits the size of a single object to (4GB - 1KB); 38 * that gives us additional bytes if we ever want to store a structure length 39 * plus the object size in 4B, or if we need additional flag values. Attempts 40 * to store large key/value items in the tree trigger an immediate check to the 41 * block manager, to make sure it can write the item. Storing 4GB objects in a 42 * btree borders on clinical insanity, anyway. 43 * 44 * Record numbers are stored in 64-bit unsigned integers, meaning the largest 45 * record number is "really, really big". 46 */ 47 #define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024)) 48 49 /* 50 * A location in a file is a variable-length cookie, but it has a maximum size 51 * so it's easy to create temporary space in which to store them. (Locations 52 * can't be much larger than this anyway, they must fit onto the minimum size 53 * page because a reference to an overflow page is itself a location.) 54 */ 55 #define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */ 56 57 /* Evict pages if we see this many consecutive deleted records. */ 58 #define WT_BTREE_DELETE_THRESHOLD 1000 59 60 /* 61 * Minimum size of the chunks (in percentage of the page size) a page gets split 62 * into during reconciliation. 63 */ 64 #define WT_BTREE_MIN_SPLIT_PCT 50 65 66 /* 67 * WT_BTREE -- 68 * A btree handle. 69 */ 70 struct __wt_btree { 71 WT_DATA_HANDLE *dhandle; 72 73 WT_CKPT *ckpt; /* Checkpoint information */ 74 75 enum { BTREE_COL_FIX=1, /* Fixed-length column store */ 76 BTREE_COL_VAR=2, /* Variable-length column store */ 77 BTREE_ROW=3 /* Row-store */ 78 } type; /* Type */ 79 80 const char *key_format; /* Key format */ 81 const char *value_format; /* Value format */ 82 uint8_t bitcnt; /* Fixed-length field size in bits */ 83 84 WT_COLLATOR *collator; /* Row-store comparator */ 85 int collator_owned; /* The collator needs to be freed */ 86 87 uint32_t id; /* File ID, for logging */ 88 89 uint32_t key_gap; /* Row-store prefix key gap */ 90 91 uint32_t allocsize; /* Allocation size */ 92 uint32_t maxintlpage; /* Internal page max size */ 93 uint32_t maxintlkey; /* Internal page max key size */ 94 uint32_t maxleafpage; /* Leaf page max size */ 95 uint32_t maxleafkey; /* Leaf page max key size */ 96 uint32_t maxleafvalue; /* Leaf page max value size */ 97 uint64_t maxmempage; /* In-memory page max size */ 98 uint32_t maxmempage_image; /* In-memory page image max size */ 99 uint64_t splitmempage; /* In-memory split trigger size */ 100 101 /* AUTOMATIC FLAG VALUE GENERATION START */ 102 #define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u 103 #define WT_ASSERT_COMMIT_TS_KEYS 0x02u 104 #define WT_ASSERT_COMMIT_TS_NEVER 0x04u 105 #define WT_ASSERT_READ_TS_ALWAYS 0x08u 106 #define WT_ASSERT_READ_TS_NEVER 0x10u 107 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 108 uint32_t assert_flags; /* Debugging assertion information */ 109 110 void *huffman_key; /* Key huffman encoding */ 111 void *huffman_value; /* Value huffman encoding */ 112 113 enum { CKSUM_ON=1, /* On */ 114 CKSUM_OFF=2, /* Off */ 115 CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */ 116 } checksum; /* Checksum configuration */ 117 118 /* 119 * Reconciliation... 120 */ 121 u_int dictionary; /* Dictionary slots */ 122 bool internal_key_truncate; /* Internal key truncate */ 123 bool prefix_compression; /* Prefix compression */ 124 u_int prefix_compression_min; /* Prefix compression min */ 125 126 #define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000 127 u_int split_deepen_min_child; /* Minimum entries to deepen tree */ 128 #define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100 129 u_int split_deepen_per_child; /* Entries per child when deepened */ 130 int split_pct; /* Split page percent */ 131 132 WT_COMPRESSOR *compressor; /* Page compressor */ 133 /* 134 * When doing compression, the pre-compression in-memory byte size is 135 * optionally adjusted based on previous compression results. 136 * It's an 8B value because it's updated without a lock. 137 */ 138 bool leafpage_compadjust; /* Run-time compression adjustment */ 139 uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */ 140 bool intlpage_compadjust; /* Run-time compression adjustment */ 141 uint64_t maxintlpage_precomp; /* Internal page pre-compression size */ 142 143 WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ 144 145 WT_RWLOCK ovfl_lock; /* Overflow lock */ 146 147 int maximum_depth; /* Maximum tree depth during search */ 148 u_int rec_multiblock_max; /* Maximum blocks written for a page */ 149 150 uint64_t last_recno; /* Column-store last record number */ 151 152 WT_REF root; /* Root page reference */ 153 bool modified; /* If the tree ever modified */ 154 uint8_t original; /* Newly created: bulk-load possible 155 (want a bool but needs atomic cas) */ 156 157 bool lookaside_entries; /* Has entries in the lookaside table */ 158 bool lsm_primary; /* Handle is/was the LSM primary */ 159 160 WT_BM *bm; /* Block manager reference */ 161 u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ 162 163 uint64_t write_gen; /* Write generation */ 164 uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ 165 WT_DECL_TIMESTAMP(rec_max_timestamp) 166 167 uint64_t checkpoint_gen; /* Checkpoint generation */ 168 WT_SESSION_IMPL *sync_session; /* Syncing session */ 169 volatile enum { 170 WT_BTREE_SYNC_OFF, WT_BTREE_SYNC_WAIT, WT_BTREE_SYNC_RUNNING 171 } syncing; /* Sync status */ 172 173 /* 174 * Helper macros: 175 * WT_BTREE_SYNCING indicates if a sync is active (either waiting to 176 * start or already running), so no new operations should start that 177 * would conflict with the sync. 178 * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync 179 * on its current tree. 180 * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an 181 * operation that would conflict with a sync. 182 */ 183 #define WT_BTREE_SYNCING(btree) \ 184 (btree->syncing != WT_BTREE_SYNC_OFF) 185 #define WT_SESSION_BTREE_SYNC(session) \ 186 (S2BT(session)->sync_session == session) 187 #define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \ 188 ((btree)->syncing != WT_BTREE_SYNC_RUNNING || \ 189 (btree)->sync_session == session) 190 191 uint64_t bytes_inmem; /* Cache bytes in memory. */ 192 uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ 193 uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ 194 195 /* 196 * The maximum bytes allowed to be used for the table on disk. This is 197 * currently only used for the lookaside table. 198 */ 199 uint64_t file_max; 200 201 /* 202 * We flush pages from the tree (in order to make checkpoint faster), 203 * without a high-level lock. To avoid multiple threads flushing at 204 * the same time, lock the tree. 205 */ 206 WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ 207 208 /* 209 * All of the following fields live at the end of the structure so it's 210 * easier to clear everything but the fields that persist. 211 */ 212 #define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref)) 213 214 /* 215 * Eviction information is maintained in the btree handle, but owned by 216 * eviction, not the btree code. 217 */ 218 WT_REF *evict_ref; /* Eviction thread's location */ 219 uint64_t evict_priority; /* Relative priority of cached pages */ 220 uint32_t evict_walk_progress;/* Eviction walk progress */ 221 uint32_t evict_walk_target; /* Eviction walk target */ 222 u_int evict_walk_period; /* Skip this many LRU walks */ 223 u_int evict_walk_saved; /* Saved walk skips for checkpoints */ 224 u_int evict_walk_skips; /* Number of walks skipped */ 225 int32_t evict_disabled; /* Eviction disabled count */ 226 bool evict_disabled_open;/* Eviction disabled on open */ 227 volatile uint32_t evict_busy; /* Count of threads in eviction */ 228 enum { /* Start position for eviction walk */ 229 WT_EVICT_WALK_NEXT, 230 WT_EVICT_WALK_PREV, 231 WT_EVICT_WALK_RAND_NEXT, 232 WT_EVICT_WALK_RAND_PREV 233 } evict_start_type; 234 235 /* 236 * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't 237 * automatically generate these flag values for that reason, there's 238 * no way to start at an offset. 239 */ 240 #define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ 241 #define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ 242 #define WT_BTREE_CLOSED 0x000400u /* Handle closed */ 243 #define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ 244 #define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ 245 #define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */ 246 #define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */ 247 #define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */ 248 #define WT_BTREE_READONLY 0x010000u /* Handle is readonly */ 249 #define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */ 250 #define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */ 251 #define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */ 252 #define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */ 253 #define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */ 254 uint32_t flags; 255 }; 256 257 /* Flags that make a btree handle special (not for normal use). */ 258 #define WT_BTREE_SPECIAL_FLAGS \ 259 (WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE | \ 260 WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY) 261 262 /* 263 * WT_SALVAGE_COOKIE -- 264 * Encapsulation of salvage information for reconciliation. 265 */ 266 struct __wt_salvage_cookie { 267 uint64_t missing; /* Initial items to create */ 268 uint64_t skip; /* Initial items to skip */ 269 uint64_t take; /* Items to take */ 270 271 bool done; /* Ignore the rest */ 272 }; 273