1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 /*
10 * WiredTiger's block manager interface.
11 */
12
13 /*
14 * The file's description is written into the first block of the file, which
15 * means we can use an offset of 0 as an invalid offset.
16 */
17 #define WT_BLOCK_INVALID_OFFSET 0
18
19 /*
20 * The block manager maintains three per-checkpoint extent lists:
21 * alloc: the extents allocated in this checkpoint
22 * avail: the extents available for allocation
23 * discard: the extents freed in this checkpoint
24 *
25 * An extent list is based on two skiplists: first, a by-offset list linking
26 * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
27 * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
28 *
29 * Additionally, each WT_SIZE element on the by-size has a skiplist of its own,
30 * linking WT_EXT elements and sorted by file offset (low-to-high). This list
31 * has an entry for extents of a particular size.
32 *
33 * The trickiness is each individual WT_EXT element appears on two skiplists.
34 * In order to minimize allocation calls, we allocate a single array of WT_EXT
35 * pointers at the end of the WT_EXT structure, for both skiplists, and store
36 * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
37 * the offset skiplist start at WT_EXT.next[0] and the entries for the size
38 * skiplist start at WT_EXT.next[WT_EXT.depth].
39 *
40 * One final complication: we only maintain the per-size skiplist for the avail
41 * list, the alloc and discard extent lists are not searched based on size.
42 */
43
44 /*
45 * WT_EXTLIST --
46 * An extent list.
47 */
48 struct __wt_extlist {
49 char *name; /* Name */
50
51 uint64_t bytes; /* Byte count */
52 uint32_t entries; /* Entry count */
53
54 wt_off_t offset; /* Written extent offset */
55 uint32_t checksum; /* Written extent checksum */
56 uint32_t size; /* Written extent size */
57
58 bool track_size; /* Maintain per-size skiplist */
59
60 WT_EXT *last; /* Cached last element */
61
62 WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
63 WT_SIZE *sz[WT_SKIP_MAXDEPTH];
64 };
65
66 /*
67 * WT_EXT --
68 * Encapsulation of an extent, either allocated or freed within the
69 * checkpoint.
70 */
71 struct __wt_ext {
72 wt_off_t off; /* Extent's file offset */
73 wt_off_t size; /* Extent's Size */
74
75 uint8_t depth; /* Skip list depth */
76
77 /*
78 * Variable-length array, sized by the number of skiplist elements.
79 * The first depth array entries are the address skiplist elements,
80 * the second depth array entries are the size skiplist.
81 */
82 WT_EXT *next[0]; /* Offset, size skiplists */
83 };
84
85 /*
86 * WT_SIZE --
87 * Encapsulation of a block size skiplist entry.
88 */
89 struct __wt_size {
90 wt_off_t size; /* Size */
91
92 uint8_t depth; /* Skip list depth */
93
94 WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
95
96 /*
97 * We don't use a variable-length array for the size skiplist, we want
98 * to be able to use any cached WT_SIZE structure as the head of a list,
99 * and we don't know the related WT_EXT structure's depth.
100 */
101 WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
102 };
103
104 /*
105 * WT_EXT_FOREACH --
106 * Walk a block manager skiplist.
107 * WT_EXT_FOREACH_OFF --
108 * Walk a block manager skiplist where the WT_EXT.next entries are offset
109 * by the depth.
110 */
111 #define WT_EXT_FOREACH(skip, head) \
112 for ((skip) = (head)[0]; \
113 (skip) != NULL; (skip) = (skip)->next[0])
114 #define WT_EXT_FOREACH_OFF(skip, head) \
115 for ((skip) = (head)[0]; \
116 (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
117
118 /*
119 * Checkpoint cookie: carries a version number as I don't want to rev the schema
120 * file version should the default block manager checkpoint format change.
121 *
122 * Version #1 checkpoint cookie format:
123 * [1] [root addr] [alloc addr] [avail addr] [discard addr]
124 * [file size] [checkpoint size] [write generation]
125 */
126 #define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
127 #define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
128 struct __wt_block_ckpt {
129 uint8_t version; /* Version */
130
131 wt_off_t root_offset; /* The root */
132 uint32_t root_checksum, root_size;
133
134 WT_EXTLIST alloc; /* Extents allocated */
135 WT_EXTLIST avail; /* Extents available */
136 WT_EXTLIST discard; /* Extents discarded */
137
138 wt_off_t file_size; /* Checkpoint file size */
139 uint64_t ckpt_size; /* Checkpoint byte count */
140
141 WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
142
143 /*
144 * Checkpoint archive: the block manager may potentially free a lot of
145 * memory from the allocation and discard extent lists when checkpoint
146 * completes. Put it off until the checkpoint resolves, that lets the
147 * upper btree layer continue eviction sooner.
148 */
149 WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
150 WT_EXTLIST ckpt_discard; /* Checkpoint archive */
151 };
152
153 /*
154 * WT_BM --
155 * Block manager handle, references a single checkpoint in a file.
156 */
157 struct __wt_bm {
158 /* Methods */
159 int (*addr_invalid)
160 (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
161 int (*addr_string)
162 (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
163 u_int (*block_header)(WT_BM *);
164 int (*checkpoint)
165 (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, bool);
166 int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
167 const uint8_t *, size_t, uint8_t *, size_t *, bool);
168 int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *, bool);
169 int (*checkpoint_start)(WT_BM *, WT_SESSION_IMPL *);
170 int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
171 int (*close)(WT_BM *, WT_SESSION_IMPL *);
172 int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
173 int (*compact_page_skip)
174 (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *);
175 int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
176 int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
177 int (*corrupt)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
178 int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
179 bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
180 int (*map_discard)(WT_BM *, WT_SESSION_IMPL *, void *, size_t);
181 int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
182 int (*read)
183 (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
184 int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
185 int (*salvage_next)
186 (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *);
187 int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
188 int (*salvage_valid)
189 (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
190 int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
191 int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
192 int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
193 int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
194 int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
195 int (*verify_start)
196 (WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]);
197 int (*write) (WT_BM *,
198 WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool, bool);
199 int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
200
201 WT_BLOCK *block; /* Underlying file */
202
203 void *map; /* Mapped region */
204 size_t maplen;
205 void *mapped_cookie;
206
207 /*
208 * There's only a single block manager handle that can be written, all
209 * others are checkpoints.
210 */
211 bool is_live; /* The live system */
212 };
213
214 /*
215 * WT_BLOCK --
216 * Block manager handle, references a single file.
217 */
218 struct __wt_block {
219 const char *name; /* Name */
220 uint64_t name_hash; /* Hash of name */
221
222 /* A list of block manager handles, sharing a file descriptor. */
223 uint32_t ref; /* References */
224 TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
225 TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */
226
227 WT_FH *fh; /* Backing file handle */
228 wt_off_t size; /* File size */
229 wt_off_t extend_size; /* File extended size */
230 wt_off_t extend_len; /* File extend chunk size */
231
232 /* Configuration information, set when the file is opened. */
233 uint32_t allocfirst; /* Allocation is first-fit */
234 uint32_t allocsize; /* Allocation size */
235 size_t os_cache; /* System buffer cache flush max */
236 size_t os_cache_max;
237 size_t os_cache_dirty; /* System buffer cache write max */
238 size_t os_cache_dirty_max;
239
240 u_int block_header; /* Header length */
241
242 /*
243 * There is only a single checkpoint in a file that can be written. The
244 * information could logically live in the WT_BM structure, but then we
245 * would be re-creating it every time we opened a new checkpoint and I'd
246 * rather not do that. So, it's stored here, only accessed by one WT_BM
247 * handle.
248 */
249 WT_SPINLOCK live_lock; /* Live checkpoint lock */
250 WT_BLOCK_CKPT live; /* Live checkpoint */
251 #ifdef HAVE_DIAGNOSTIC
252 bool live_open; /* Live system is open */
253 #endif
254 /* Live checkpoint status */
255 enum { WT_CKPT_NONE=0, WT_CKPT_INPROGRESS,
256 WT_CKPT_PANIC_ON_FAILURE, WT_CKPT_SALVAGE } ckpt_state;
257
258 /* Compaction support */
259 int compact_pct_tenths; /* Percent to compact */
260 uint64_t compact_pages_reviewed;/* Pages reviewed */
261 uint64_t compact_pages_skipped; /* Pages skipped */
262 uint64_t compact_pages_written; /* Pages rewritten */
263
264 /* Salvage support */
265 wt_off_t slvg_off; /* Salvage file offset */
266
267 /* Verification support */
268 bool verify; /* If performing verification */
269 bool verify_layout; /* Print out file layout information */
270 bool verify_strict; /* Fail hard on any error */
271 wt_off_t verify_size; /* Checkpoint's file size */
272 WT_EXTLIST verify_alloc; /* Verification allocation list */
273 uint64_t frags; /* Maximum frags in the file */
274 uint8_t *fragfile; /* Per-file frag tracking list */
275 uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
276 };
277
278 /*
279 * WT_BLOCK_DESC --
280 * The file's description.
281 */
282 struct __wt_block_desc {
283 #define WT_BLOCK_MAGIC 120897
284 uint32_t magic; /* 00-03: Magic number */
285 #define WT_BLOCK_MAJOR_VERSION 1
286 uint16_t majorv; /* 04-05: Major version */
287 #define WT_BLOCK_MINOR_VERSION 0
288 uint16_t minorv; /* 06-07: Minor version */
289
290 uint32_t checksum; /* 08-11: Description block checksum */
291
292 uint32_t unused; /* 12-15: Padding */
293 };
294 /*
295 * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
296 * ensure the compiler hasn't inserted padding (padding won't cause failure,
297 * we reserve the first allocation-size block of the file for this information,
298 * but it would be worth investigation, regardless).
299 */
300 #define WT_BLOCK_DESC_SIZE 16
301
302 /*
303 * __wt_block_desc_byteswap --
304 * Handle big- and little-endian transformation of a description block.
305 */
306 static inline void
__wt_block_desc_byteswap(WT_BLOCK_DESC * desc)307 __wt_block_desc_byteswap(WT_BLOCK_DESC *desc)
308 {
309 #ifdef WORDS_BIGENDIAN
310 desc->magic = __wt_bswap32(desc->magic);
311 desc->majorv = __wt_bswap16(desc->majorv);
312 desc->minorv = __wt_bswap16(desc->minorv);
313 desc->checksum = __wt_bswap32(desc->checksum);
314 #else
315 WT_UNUSED(desc);
316 #endif
317 }
318
319 /*
320 * WT_BLOCK_HEADER --
321 * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
322 * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
323 */
324 struct __wt_block_header {
325 /*
326 * We write the page size in the on-disk page header because it makes
327 * salvage easier. (If we don't know the expected page length, we'd
328 * have to read increasingly larger chunks from the file until we find
329 * one that checksums, and that's going to be harsh given WiredTiger's
330 * potentially large page sizes.)
331 */
332 uint32_t disk_size; /* 00-03: on-disk page size */
333
334 /*
335 * Page checksums are stored in two places. First, the page checksum
336 * is written within the internal page that references it as part of
337 * the address cookie. This is done to improve the chances of detecting
338 * not only disk corruption but other bugs (for example, overwriting a
339 * page with another valid page image). Second, a page's checksum is
340 * stored in the disk header. This is for salvage, so salvage knows it
341 * has found a page that may be useful.
342 */
343 uint32_t checksum; /* 04-07: checksum */
344
345 /*
346 * No automatic generation: flag values cannot change, they're written
347 * to disk.
348 */
349 #define WT_BLOCK_DATA_CKSUM 0x1u /* Block data is part of the checksum */
350 uint8_t flags; /* 08: flags */
351
352 /*
353 * End the structure with 3 bytes of padding: it wastes space, but it
354 * leaves the structure 32-bit aligned and having a few bytes to play
355 * with in the future can't hurt.
356 */
357 uint8_t unused[3]; /* 09-11: unused padding */
358 };
359 /*
360 * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
361 * the compiler inserts padding it will break the world.
362 */
363 #define WT_BLOCK_HEADER_SIZE 12
364
365 /*
366 * __wt_block_header_byteswap_copy --
367 * Handle big- and little-endian transformation of a header block,
368 * copying from a source to a target.
369 */
370 static inline void
__wt_block_header_byteswap_copy(WT_BLOCK_HEADER * from,WT_BLOCK_HEADER * to)371 __wt_block_header_byteswap_copy(WT_BLOCK_HEADER *from, WT_BLOCK_HEADER *to)
372 {
373 *to = *from;
374 #ifdef WORDS_BIGENDIAN
375 to->disk_size = __wt_bswap32(from->disk_size);
376 to->checksum = __wt_bswap32(from->checksum);
377 #endif
378 }
379
380 /*
381 * __wt_block_header_byteswap --
382 * Handle big- and little-endian transformation of a header block.
383 */
384 static inline void
__wt_block_header_byteswap(WT_BLOCK_HEADER * blk)385 __wt_block_header_byteswap(WT_BLOCK_HEADER *blk)
386 {
387 #ifdef WORDS_BIGENDIAN
388 __wt_block_header_byteswap_copy(blk, blk);
389 #else
390 WT_UNUSED(blk);
391 #endif
392 }
393
394 /*
395 * WT_BLOCK_HEADER_BYTE
396 * WT_BLOCK_HEADER_BYTE_SIZE --
397 * The first usable data byte on the block (past the combined headers).
398 */
399 #define WT_BLOCK_HEADER_BYTE_SIZE \
400 (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
401 #define WT_BLOCK_HEADER_BYTE(dsk) \
402 ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
403
404 /*
405 * We don't compress or encrypt the block's WT_PAGE_HEADER or WT_BLOCK_HEADER
406 * structures because we need both available with decompression or decryption.
407 * We use the WT_BLOCK_HEADER checksum and on-disk size during salvage to
408 * figure out where the blocks are, and we use the WT_PAGE_HEADER in-memory
409 * size during decompression and decryption to know how large a target buffer
410 * to allocate. We can only skip the header information when doing encryption,
411 * but we skip the first 64B when doing compression; a 64B boundary may offer
412 * better alignment for the underlying compression engine, and skipping 64B
413 * shouldn't make any difference in terms of compression efficiency.
414 */
415 #define WT_BLOCK_COMPRESS_SKIP 64
416 #define WT_BLOCK_ENCRYPT_SKIP WT_BLOCK_HEADER_BYTE_SIZE
417
418 /*
419 * __wt_block_header --
420 * Return the size of the block-specific header.
421 */
422 static inline u_int
__wt_block_header(WT_BLOCK * block)423 __wt_block_header(WT_BLOCK *block)
424 {
425 WT_UNUSED(block);
426
427 return ((u_int)WT_BLOCK_HEADER_SIZE);
428 }
429