1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * Supported btree formats: the "current" version is the maximum supported
11  * major/minor versions.
12  */
13 #define	WT_BTREE_MAJOR_VERSION_MIN	1	/* Oldest version supported */
14 #define	WT_BTREE_MINOR_VERSION_MIN	1
15 
16 #define	WT_BTREE_MAJOR_VERSION_MAX	1	/* Newest version supported */
17 #define	WT_BTREE_MINOR_VERSION_MAX	1
18 
19 /*
20  * The maximum btree leaf and internal page size is 512MB (2^29).  The limit
21  * is enforced in software, it could be larger, specifically, the underlying
22  * default block manager can support 4GB (2^32).  Currently, the maximum page
23  * size must accommodate our dependence on the maximum page size fitting into
24  * a number of bits less than 32; see the row-store page key-lookup functions
25  * for the magic.
26  */
27 #define	WT_BTREE_PAGE_SIZE_MAX		(512 * WT_MEGABYTE)
28 
29 /*
30  * The length of variable-length column-store values and row-store keys/values
31  * are stored in a 4B type, so the largest theoretical key/value item is 4GB.
32  * However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted"
33  * flag, and second, the size of an overflow object is constrained by what an
34  * underlying block manager can actually write.  (For example, in the default
35  * block manager, writing an overflow item includes the underlying block's page
36  * header and block manager specific structure, aligned to an allocation-sized
37  * unit).  The btree engine limits the size of a single object to (4GB - 1KB);
38  * that gives us additional bytes if we ever want to store a structure length
39  * plus the object size in 4B, or if we need additional flag values.  Attempts
40  * to store large key/value items in the tree trigger an immediate check to the
41  * block manager, to make sure it can write the item.  Storing 4GB objects in a
42  * btree borders on clinical insanity, anyway.
43  *
44  * Record numbers are stored in 64-bit unsigned integers, meaning the largest
45  * record number is "really, really big".
46  */
47 #define	WT_BTREE_MAX_OBJECT_SIZE	((uint32_t)(UINT32_MAX - 1024))
48 
49 /*
50  * A location in a file is a variable-length cookie, but it has a maximum size
51  * so it's easy to create temporary space in which to store them.  (Locations
52  * can't be much larger than this anyway, they must fit onto the minimum size
53  * page because a reference to an overflow page is itself a location.)
54  */
55 #define	WT_BTREE_MAX_ADDR_COOKIE	255	/* Maximum address cookie */
56 
57 /* Evict pages if we see this many consecutive deleted records. */
58 #define	WT_BTREE_DELETE_THRESHOLD	1000
59 
60 /*
61  * Minimum size of the chunks (in percentage of the page size) a page gets split
62  * into during reconciliation.
63  */
64 #define	WT_BTREE_MIN_SPLIT_PCT		50
65 
66 /*
67  * WT_BTREE --
68  *	A btree handle.
69  */
70 struct __wt_btree {
71 	WT_DATA_HANDLE *dhandle;
72 
73 	WT_CKPT	  *ckpt;		/* Checkpoint information */
74 
75 	enum {	BTREE_COL_FIX=1,	/* Fixed-length column store */
76 		BTREE_COL_VAR=2,	/* Variable-length column store */
77 		BTREE_ROW=3		/* Row-store */
78 	} type;				/* Type */
79 
80 	const char *key_format;		/* Key format */
81 	const char *value_format;	/* Value format */
82 	uint8_t bitcnt;			/* Fixed-length field size in bits */
83 
84 	WT_COLLATOR *collator;		/* Row-store comparator */
85 	int collator_owned;		/* The collator needs to be freed */
86 
87 	uint32_t id;			/* File ID, for logging */
88 
89 	uint32_t key_gap;		/* Row-store prefix key gap */
90 
91 	uint32_t allocsize;		/* Allocation size */
92 	uint32_t maxintlpage;		/* Internal page max size */
93 	uint32_t maxintlkey;		/* Internal page max key size */
94 	uint32_t maxleafpage;		/* Leaf page max size */
95 	uint32_t maxleafkey;		/* Leaf page max key size */
96 	uint32_t maxleafvalue;		/* Leaf page max value size */
97 	uint64_t maxmempage;		/* In-memory page max size */
98 	uint32_t maxmempage_image;	/* In-memory page image max size */
99 	uint64_t splitmempage;		/* In-memory split trigger size */
100 
101 /* AUTOMATIC FLAG VALUE GENERATION START */
102 #define	WT_ASSERT_COMMIT_TS_ALWAYS	0x01u
103 #define	WT_ASSERT_COMMIT_TS_KEYS	0x02u
104 #define	WT_ASSERT_COMMIT_TS_NEVER	0x04u
105 #define	WT_ASSERT_READ_TS_ALWAYS	0x08u
106 #define	WT_ASSERT_READ_TS_NEVER		0x10u
107 /* AUTOMATIC FLAG VALUE GENERATION STOP */
108 	uint32_t assert_flags;		/* Debugging assertion information */
109 
110 	void *huffman_key;		/* Key huffman encoding */
111 	void *huffman_value;		/* Value huffman encoding */
112 
113 	enum {	CKSUM_ON=1,		/* On */
114 		CKSUM_OFF=2,		/* Off */
115 		CKSUM_UNCOMPRESSED=3	/* Uncompressed blocks only */
116 	} checksum;			/* Checksum configuration */
117 
118 	/*
119 	 * Reconciliation...
120 	 */
121 	u_int dictionary;		/* Dictionary slots */
122 	bool  internal_key_truncate;	/* Internal key truncate */
123 	bool  prefix_compression;	/* Prefix compression */
124 	u_int prefix_compression_min;	/* Prefix compression min */
125 
126 #define	WT_SPLIT_DEEPEN_MIN_CHILD_DEF	10000
127 	u_int split_deepen_min_child;	/* Minimum entries to deepen tree */
128 #define	WT_SPLIT_DEEPEN_PER_CHILD_DEF	100
129 	u_int split_deepen_per_child;	/* Entries per child when deepened */
130 	int   split_pct;		/* Split page percent */
131 
132 	WT_COMPRESSOR *compressor;	/* Page compressor */
133 	/*
134 	 * When doing compression, the pre-compression in-memory byte size is
135 	 * optionally adjusted based on previous compression results.
136 	 * It's an 8B value because it's updated without a lock.
137 	 */
138 	bool	 leafpage_compadjust;	/* Run-time compression adjustment */
139 	uint64_t maxleafpage_precomp;	/* Leaf page pre-compression size */
140 	bool	 intlpage_compadjust;	/* Run-time compression adjustment */
141 	uint64_t maxintlpage_precomp;	/* Internal page pre-compression size */
142 
143 	WT_KEYED_ENCRYPTOR *kencryptor;	/* Page encryptor */
144 
145 	WT_RWLOCK ovfl_lock;		/* Overflow lock */
146 
147 	int	maximum_depth;		/* Maximum tree depth during search */
148 	u_int	rec_multiblock_max;	/* Maximum blocks written for a page */
149 
150 	uint64_t last_recno;		/* Column-store last record number */
151 
152 	WT_REF	root;			/* Root page reference */
153 	bool	modified;		/* If the tree ever modified */
154 	uint8_t	original;		/* Newly created: bulk-load possible
155 					   (want a bool but needs atomic cas) */
156 
157 	bool lookaside_entries;		/* Has entries in the lookaside table */
158 	bool lsm_primary;		/* Handle is/was the LSM primary */
159 
160 	WT_BM	*bm;			/* Block manager reference */
161 	u_int	 block_header;		/* WT_PAGE_HEADER_BYTE_SIZE */
162 
163 	uint64_t write_gen;		/* Write generation */
164 	uint64_t rec_max_txn;		/* Maximum txn seen (clean trees) */
165 	WT_DECL_TIMESTAMP(rec_max_timestamp)
166 
167 	uint64_t checkpoint_gen;	/* Checkpoint generation */
168 	WT_SESSION_IMPL *sync_session;	/* Syncing session */
169 	volatile enum {
170 		WT_BTREE_SYNC_OFF, WT_BTREE_SYNC_WAIT, WT_BTREE_SYNC_RUNNING
171 	} syncing;			/* Sync status */
172 
173 	/*
174 	 * Helper macros:
175 	 * WT_BTREE_SYNCING indicates if a sync is active (either waiting to
176 	 * start or already running), so no new operations should start that
177 	 * would conflict with the sync.
178 	 * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync
179 	 * on its current tree.
180 	 * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an
181 	 * operation that would conflict with a sync.
182 	 */
183 #define	WT_BTREE_SYNCING(btree)						\
184 	(btree->syncing != WT_BTREE_SYNC_OFF)
185 #define	WT_SESSION_BTREE_SYNC(session)					\
186 	(S2BT(session)->sync_session == session)
187 #define	WT_SESSION_BTREE_SYNC_SAFE(session, btree)			\
188 	((btree)->syncing != WT_BTREE_SYNC_RUNNING ||			\
189 	    (btree)->sync_session == session)
190 
191 	uint64_t    bytes_inmem;        /* Cache bytes in memory. */
192 	uint64_t    bytes_dirty_intl;	/* Bytes in dirty internal pages. */
193 	uint64_t    bytes_dirty_leaf;	/* Bytes in dirty leaf pages. */
194 
195 	/*
196 	 * The maximum bytes allowed to be used for the table on disk.  This is
197 	 * currently only used for the lookaside table.
198 	 */
199 	uint64_t    file_max;
200 
201 	/*
202 	 * We flush pages from the tree (in order to make checkpoint faster),
203 	 * without a high-level lock.  To avoid multiple threads flushing at
204 	 * the same time, lock the tree.
205 	 */
206 	WT_SPINLOCK	flush_lock;	/* Lock to flush the tree's pages */
207 
208 	/*
209 	 * All of the following fields live at the end of the structure so it's
210 	 * easier to clear everything but the fields that persist.
211 	 */
212 #define	WT_BTREE_CLEAR_SIZE	(offsetof(WT_BTREE, evict_ref))
213 
214 	/*
215 	 * Eviction information is maintained in the btree handle, but owned by
216 	 * eviction, not the btree code.
217 	 */
218 	WT_REF	   *evict_ref;		/* Eviction thread's location */
219 	uint64_t    evict_priority;	/* Relative priority of cached pages */
220 	uint32_t    evict_walk_progress;/* Eviction walk progress */
221 	uint32_t    evict_walk_target;  /* Eviction walk target */
222 	u_int	    evict_walk_period;	/* Skip this many LRU walks */
223 	u_int	    evict_walk_saved;	/* Saved walk skips for checkpoints */
224 	u_int	    evict_walk_skips;	/* Number of walks skipped */
225 	int32_t	    evict_disabled;	/* Eviction disabled count */
226 	bool	    evict_disabled_open;/* Eviction disabled on open */
227 	volatile uint32_t evict_busy;	/* Count of threads in eviction */
228 	enum {				/* Start position for eviction walk */
229 		WT_EVICT_WALK_NEXT,
230 		WT_EVICT_WALK_PREV,
231 		WT_EVICT_WALK_RAND_NEXT,
232 		WT_EVICT_WALK_RAND_PREV
233 	} evict_start_type;
234 
235 	/*
236 	 * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't
237 	 * automatically generate these flag values for that reason, there's
238 	 * no way to start at an offset.
239 	 */
240 #define	WT_BTREE_ALTER		0x000100u	/* Handle is for alter */
241 #define	WT_BTREE_BULK		0x000200u	/* Bulk-load handle */
242 #define	WT_BTREE_CLOSED		0x000400u	/* Handle closed */
243 #define	WT_BTREE_IGNORE_CACHE	0x000800u	/* Cache-resident object */
244 #define	WT_BTREE_IN_MEMORY	0x001000u	/* Cache-resident object */
245 #define	WT_BTREE_LOOKASIDE	0x002000u	/* Look-aside table */
246 #define	WT_BTREE_NO_CHECKPOINT	0x004000u	/* Disable checkpoints */
247 #define	WT_BTREE_NO_LOGGING	0x008000u	/* Disable logging */
248 #define	WT_BTREE_READONLY	0x010000u	/* Handle is readonly */
249 #define	WT_BTREE_REBALANCE	0x020000u	/* Handle is for rebalance */
250 #define	WT_BTREE_SALVAGE	0x040000u	/* Handle is for salvage */
251 #define	WT_BTREE_SKIP_CKPT	0x080000u	/* Handle skipped checkpoint */
252 #define	WT_BTREE_UPGRADE	0x100000u	/* Handle is for upgrade */
253 #define	WT_BTREE_VERIFY		0x200000u	/* Handle is for verify */
254 	uint32_t flags;
255 };
256 
257 /* Flags that make a btree handle special (not for normal use). */
258 #define	WT_BTREE_SPECIAL_FLAGS	 					\
259 	(WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE |		\
260 	WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
261 
262 /*
263  * WT_SALVAGE_COOKIE --
264  *	Encapsulation of salvage information for reconciliation.
265  */
266 struct __wt_salvage_cookie {
267 	uint64_t missing;			/* Initial items to create */
268 	uint64_t skip;				/* Initial items to skip */
269 	uint64_t take;				/* Items to take */
270 
271 	bool	 done;				/* Ignore the rest */
272 };
273