1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * WiredTiger's block manager interface.
11  */
12 
13 /*
14  * The file's description is written into the first block of the file, which
15  * means we can use an offset of 0 as an invalid offset.
16  */
17 #define	WT_BLOCK_INVALID_OFFSET		0
18 
19 /*
20  * The block manager maintains three per-checkpoint extent lists:
21  *	alloc:	 the extents allocated in this checkpoint
22  *	avail:	 the extents available for allocation
23  *	discard: the extents freed in this checkpoint
24  *
25  * An extent list is based on two skiplists: first, a by-offset list linking
26  * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
27  * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
28  *
29  * Additionally, each WT_SIZE element on the by-size has a skiplist of its own,
30  * linking WT_EXT elements and sorted by file offset (low-to-high).  This list
31  * has an entry for extents of a particular size.
32  *
33  * The trickiness is each individual WT_EXT element appears on two skiplists.
34  * In order to minimize allocation calls, we allocate a single array of WT_EXT
35  * pointers at the end of the WT_EXT structure, for both skiplists, and store
36  * the depth of the skiplist in the WT_EXT structure.  The skiplist entries for
37  * the offset skiplist start at WT_EXT.next[0] and the entries for the size
38  * skiplist start at WT_EXT.next[WT_EXT.depth].
39  *
40  * One final complication: we only maintain the per-size skiplist for the avail
41  * list, the alloc and discard extent lists are not searched based on size.
42  */
43 
44 /*
45  * WT_EXTLIST --
46  *	An extent list.
47  */
48 struct __wt_extlist {
49 	char *name;				/* Name */
50 
51 	uint64_t bytes;				/* Byte count */
52 	uint32_t entries;			/* Entry count */
53 
54 	wt_off_t offset;			/* Written extent offset */
55 	uint32_t checksum;			/* Written extent checksum */
56 	uint32_t size;				/* Written extent size */
57 
58 	bool	 track_size;			/* Maintain per-size skiplist */
59 
60 	WT_EXT	*last;				/* Cached last element */
61 
62 	WT_EXT	*off[WT_SKIP_MAXDEPTH];		/* Size/offset skiplists */
63 	WT_SIZE *sz[WT_SKIP_MAXDEPTH];
64 };
65 
66 /*
67  * WT_EXT --
68  *	Encapsulation of an extent, either allocated or freed within the
69  * checkpoint.
70  */
71 struct __wt_ext {
72 	wt_off_t  off;				/* Extent's file offset */
73 	wt_off_t  size;				/* Extent's Size */
74 
75 	uint8_t	 depth;				/* Skip list depth */
76 
77 	/*
78 	 * Variable-length array, sized by the number of skiplist elements.
79 	 * The first depth array entries are the address skiplist elements,
80 	 * the second depth array entries are the size skiplist.
81 	 */
82 	WT_EXT	*next[0];			/* Offset, size skiplists */
83 };
84 
85 /*
86  * WT_SIZE --
87  *	Encapsulation of a block size skiplist entry.
88  */
89 struct __wt_size {
90 	wt_off_t size;				/* Size */
91 
92 	uint8_t	 depth;				/* Skip list depth */
93 
94 	WT_EXT	*off[WT_SKIP_MAXDEPTH];		/* Per-size offset skiplist */
95 
96 	/*
97 	 * We don't use a variable-length array for the size skiplist, we want
98 	 * to be able to use any cached WT_SIZE structure as the head of a list,
99 	 * and we don't know the related WT_EXT structure's depth.
100 	 */
101 	WT_SIZE *next[WT_SKIP_MAXDEPTH];	/* Size skiplist */
102 };
103 
104 /*
105  * WT_EXT_FOREACH --
106  *	Walk a block manager skiplist.
107  * WT_EXT_FOREACH_OFF --
108  *	Walk a block manager skiplist where the WT_EXT.next entries are offset
109  * by the depth.
110  */
111 #define	WT_EXT_FOREACH(skip, head)					\
112 	for ((skip) = (head)[0];					\
113 	    (skip) != NULL; (skip) = (skip)->next[0])
114 #define	WT_EXT_FOREACH_OFF(skip, head)					\
115 	for ((skip) = (head)[0];					\
116 	    (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
117 
118 /*
119  * Checkpoint cookie: carries a version number as I don't want to rev the schema
120  * file version should the default block manager checkpoint format change.
121  *
122  * Version #1 checkpoint cookie format:
123  *	[1] [root addr] [alloc addr] [avail addr] [discard addr]
124  *	    [file size] [checkpoint size] [write generation]
125  */
126 #define	WT_BM_CHECKPOINT_VERSION	1	/* Checkpoint format version */
127 #define	WT_BLOCK_EXTLIST_MAGIC		71002	/* Identify a list */
128 struct __wt_block_ckpt {
129 	uint8_t	 version;			/* Version */
130 
131 	wt_off_t root_offset;			/* The root */
132 	uint32_t root_checksum, root_size;
133 
134 	WT_EXTLIST alloc;			/* Extents allocated */
135 	WT_EXTLIST avail;			/* Extents available */
136 	WT_EXTLIST discard;			/* Extents discarded */
137 
138 	wt_off_t   file_size;			/* Checkpoint file size */
139 	uint64_t   ckpt_size;			/* Checkpoint byte count */
140 
141 	WT_EXTLIST ckpt_avail;			/* Checkpoint free'd extents */
142 
143 	/*
144 	 * Checkpoint archive: the block manager may potentially free a lot of
145 	 * memory from the allocation and discard extent lists when checkpoint
146 	 * completes.  Put it off until the checkpoint resolves, that lets the
147 	 * upper btree layer continue eviction sooner.
148 	 */
149 	WT_EXTLIST ckpt_alloc;			/* Checkpoint archive */
150 	WT_EXTLIST ckpt_discard;		/* Checkpoint archive */
151 };
152 
153 /*
154  * WT_BM --
155  *	Block manager handle, references a single checkpoint in a file.
156  */
157 struct __wt_bm {
158 						/* Methods */
159 	int (*addr_invalid)
160 	    (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
161 	int (*addr_string)
162 	    (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
163 	u_int (*block_header)(WT_BM *);
164 	int (*checkpoint)
165 	    (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, bool);
166 	int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
167 	    const uint8_t *, size_t, uint8_t *, size_t *, bool);
168 	int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *, bool);
169 	int (*checkpoint_start)(WT_BM *, WT_SESSION_IMPL *);
170 	int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
171 	int (*close)(WT_BM *, WT_SESSION_IMPL *);
172 	int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
173 	int (*compact_page_skip)
174 	    (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *);
175 	int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
176 	int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
177 	int (*corrupt)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
178 	int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
179 	bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
180 	int (*map_discard)(WT_BM *, WT_SESSION_IMPL *, void *, size_t);
181 	int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
182 	int (*read)
183 	    (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
184 	int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
185 	int (*salvage_next)
186 	    (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *);
187 	int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
188 	int (*salvage_valid)
189 	    (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
190 	int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
191 	int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
192 	int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
193 	int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
194 	int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
195 	int (*verify_start)
196 	    (WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]);
197 	int (*write) (WT_BM *,
198 	    WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool, bool);
199 	int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
200 
201 	WT_BLOCK *block;			/* Underlying file */
202 
203 	void	*map;				/* Mapped region */
204 	size_t	 maplen;
205 	void	*mapped_cookie;
206 
207 	/*
208 	 * There's only a single block manager handle that can be written, all
209 	 * others are checkpoints.
210 	 */
211 	bool is_live;				/* The live system */
212 };
213 
214 /*
215  * WT_BLOCK --
216  *	Block manager handle, references a single file.
217  */
218 struct __wt_block {
219 	const char *name;		/* Name */
220 	uint64_t name_hash;		/* Hash of name */
221 
222 	/* A list of block manager handles, sharing a file descriptor. */
223 	uint32_t ref;			/* References */
224 	TAILQ_ENTRY(__wt_block) q;	/* Linked list of handles */
225 	TAILQ_ENTRY(__wt_block) hashq;	/* Hashed list of handles */
226 
227 	WT_FH	*fh;			/* Backing file handle */
228 	wt_off_t size;			/* File size */
229 	wt_off_t extend_size;		/* File extended size */
230 	wt_off_t extend_len;		/* File extend chunk size */
231 
232 	/* Configuration information, set when the file is opened. */
233 	uint32_t allocfirst;		/* Allocation is first-fit */
234 	uint32_t allocsize;		/* Allocation size */
235 	size_t	 os_cache;		/* System buffer cache flush max */
236 	size_t	 os_cache_max;
237 	size_t	 os_cache_dirty;	/* System buffer cache write max */
238 	size_t	 os_cache_dirty_max;
239 
240 	u_int	 block_header;		/* Header length */
241 
242 	/*
243 	 * There is only a single checkpoint in a file that can be written.  The
244 	 * information could logically live in the WT_BM structure, but then we
245 	 * would be re-creating it every time we opened a new checkpoint and I'd
246 	 * rather not do that.  So, it's stored here, only accessed by one WT_BM
247 	 * handle.
248 	 */
249 	WT_SPINLOCK	live_lock;	/* Live checkpoint lock */
250 	WT_BLOCK_CKPT	live;		/* Live checkpoint */
251 #ifdef HAVE_DIAGNOSTIC
252 	bool		live_open;	/* Live system is open */
253 #endif
254 					/* Live checkpoint status */
255 	enum { WT_CKPT_NONE=0, WT_CKPT_INPROGRESS,
256 	    WT_CKPT_PANIC_ON_FAILURE, WT_CKPT_SALVAGE } ckpt_state;
257 
258 				/* Compaction support */
259 	int	 compact_pct_tenths;	/* Percent to compact */
260 	uint64_t compact_pages_reviewed;/* Pages reviewed */
261 	uint64_t compact_pages_skipped;	/* Pages skipped */
262 	uint64_t compact_pages_written;	/* Pages rewritten */
263 
264 				/* Salvage support */
265 	wt_off_t	slvg_off;	/* Salvage file offset */
266 
267 				/* Verification support */
268 	bool	   verify;		/* If performing verification */
269 	bool	   verify_layout;	/* Print out file layout information */
270 	bool	   verify_strict;	/* Fail hard on any error */
271 	wt_off_t   verify_size;		/* Checkpoint's file size */
272 	WT_EXTLIST verify_alloc;	/* Verification allocation list */
273 	uint64_t   frags;		/* Maximum frags in the file */
274 	uint8_t   *fragfile;		/* Per-file frag tracking list */
275 	uint8_t   *fragckpt;		/* Per-checkpoint frag tracking list */
276 };
277 
278 /*
279  * WT_BLOCK_DESC --
280  *	The file's description.
281  */
282 struct __wt_block_desc {
283 #define	WT_BLOCK_MAGIC		120897
284 	uint32_t magic;			/* 00-03: Magic number */
285 #define	WT_BLOCK_MAJOR_VERSION	1
286 	uint16_t majorv;		/* 04-05: Major version */
287 #define	WT_BLOCK_MINOR_VERSION	0
288 	uint16_t minorv;		/* 06-07: Minor version */
289 
290 	uint32_t checksum;		/* 08-11: Description block checksum */
291 
292 	uint32_t unused;		/* 12-15: Padding */
293 };
294 /*
295  * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
296  * ensure the compiler hasn't inserted padding (padding won't cause failure,
297  * we reserve the first allocation-size block of the file for this information,
298  * but it would be worth investigation, regardless).
299  */
300 #define	WT_BLOCK_DESC_SIZE		16
301 
302 /*
303  * __wt_block_desc_byteswap --
304  *	Handle big- and little-endian transformation of a description block.
305  */
306 static inline void
__wt_block_desc_byteswap(WT_BLOCK_DESC * desc)307 __wt_block_desc_byteswap(WT_BLOCK_DESC *desc)
308 {
309 #ifdef WORDS_BIGENDIAN
310 	desc->magic = __wt_bswap32(desc->magic);
311 	desc->majorv = __wt_bswap16(desc->majorv);
312 	desc->minorv = __wt_bswap16(desc->minorv);
313 	desc->checksum = __wt_bswap32(desc->checksum);
314 #else
315 	WT_UNUSED(desc);
316 #endif
317 }
318 
319 /*
320  * WT_BLOCK_HEADER --
321  *	Blocks have a common header, a WT_PAGE_HEADER structure followed by a
322  * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
323  */
324 struct __wt_block_header {
325 	/*
326 	 * We write the page size in the on-disk page header because it makes
327 	 * salvage easier.  (If we don't know the expected page length, we'd
328 	 * have to read increasingly larger chunks from the file until we find
329 	 * one that checksums, and that's going to be harsh given WiredTiger's
330 	 * potentially large page sizes.)
331 	 */
332 	uint32_t disk_size;		/* 00-03: on-disk page size */
333 
334 	/*
335 	 * Page checksums are stored in two places.  First, the page checksum
336 	 * is written within the internal page that references it as part of
337 	 * the address cookie.  This is done to improve the chances of detecting
338 	 * not only disk corruption but other bugs (for example, overwriting a
339 	 * page with another valid page image).  Second, a page's checksum is
340 	 * stored in the disk header.  This is for salvage, so salvage knows it
341 	 * has found a page that may be useful.
342 	 */
343 	uint32_t checksum;		/* 04-07: checksum */
344 
345 	/*
346 	 * No automatic generation: flag values cannot change, they're written
347 	 * to disk.
348 	 */
349 #define	WT_BLOCK_DATA_CKSUM	0x1u	/* Block data is part of the checksum */
350 	uint8_t flags;			/* 08: flags */
351 
352 	/*
353 	 * End the structure with 3 bytes of padding: it wastes space, but it
354 	 * leaves the structure 32-bit aligned and having a few bytes to play
355 	 * with in the future can't hurt.
356 	 */
357 	uint8_t unused[3];		/* 09-11: unused padding */
358 };
359 /*
360  * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
361  * the compiler inserts padding it will break the world.
362  */
363 #define	WT_BLOCK_HEADER_SIZE		12
364 
365 /*
366  * __wt_block_header_byteswap_copy --
367  *	Handle big- and little-endian transformation of a header block,
368  * copying from a source to a target.
369  */
370 static inline void
__wt_block_header_byteswap_copy(WT_BLOCK_HEADER * from,WT_BLOCK_HEADER * to)371 __wt_block_header_byteswap_copy(WT_BLOCK_HEADER *from, WT_BLOCK_HEADER *to)
372 {
373 	*to = *from;
374 #ifdef WORDS_BIGENDIAN
375 	to->disk_size = __wt_bswap32(from->disk_size);
376 	to->checksum = __wt_bswap32(from->checksum);
377 #endif
378 }
379 
380 /*
381  * __wt_block_header_byteswap --
382  *	Handle big- and little-endian transformation of a header block.
383  */
384 static inline void
__wt_block_header_byteswap(WT_BLOCK_HEADER * blk)385 __wt_block_header_byteswap(WT_BLOCK_HEADER *blk)
386 {
387 #ifdef WORDS_BIGENDIAN
388 	__wt_block_header_byteswap_copy(blk, blk);
389 #else
390 	WT_UNUSED(blk);
391 #endif
392 }
393 
394 /*
395  * WT_BLOCK_HEADER_BYTE
396  * WT_BLOCK_HEADER_BYTE_SIZE --
397  *	The first usable data byte on the block (past the combined headers).
398  */
399 #define	WT_BLOCK_HEADER_BYTE_SIZE					\
400 	(WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
401 #define	WT_BLOCK_HEADER_BYTE(dsk)					\
402 	((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
403 
404 /*
405  * We don't compress or encrypt the block's WT_PAGE_HEADER or WT_BLOCK_HEADER
406  * structures because we need both available with decompression or decryption.
407  * We use the WT_BLOCK_HEADER checksum and on-disk size during salvage to
408  * figure out where the blocks are, and we use the WT_PAGE_HEADER in-memory
409  * size during decompression and decryption to know how large a target buffer
410  * to allocate. We can only skip the header information when doing encryption,
411  * but we skip the first 64B when doing compression; a 64B boundary may offer
412  * better alignment for the underlying compression engine, and skipping 64B
413  * shouldn't make any difference in terms of compression efficiency.
414  */
415 #define	WT_BLOCK_COMPRESS_SKIP	64
416 #define	WT_BLOCK_ENCRYPT_SKIP	WT_BLOCK_HEADER_BYTE_SIZE
417 
418 /*
419  * __wt_block_header --
420  *	Return the size of the block-specific header.
421  */
422 static inline u_int
__wt_block_header(WT_BLOCK * block)423 __wt_block_header(WT_BLOCK *block)
424 {
425 	WT_UNUSED(block);
426 
427 	return ((u_int)WT_BLOCK_HEADER_SIZE);
428 }
429