/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */
8
#define WT_RECNO_OOB 0 /* Illegal record number */

/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_READ_CACHE 0x0001u
#define WT_READ_DELETED_CHECK 0x0002u
#define WT_READ_DELETED_SKIP 0x0004u
#define WT_READ_IGNORE_CACHE_SIZE 0x0008u
#define WT_READ_LOOKASIDE 0x0010u
#define WT_READ_NOTFOUND_OK 0x0020u
#define WT_READ_NO_GEN 0x0040u
#define WT_READ_NO_SPLIT 0x0080u
#define WT_READ_NO_WAIT 0x0100u
#define WT_READ_PREV 0x0200u
#define WT_READ_RESTART_OK 0x0400u
#define WT_READ_SKIP_INTL 0x0800u
#define WT_READ_TRUNCATE 0x1000u
#define WT_READ_WONT_NEED 0x2000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */

/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_REC_CHECKPOINT 0x01u
#define WT_REC_EVICT 0x02u
#define WT_REC_IN_MEMORY 0x04u
#define WT_REC_LOOKASIDE 0x08u
#define WT_REC_SCRUB 0x10u
#define WT_REC_UPDATE_RESTORE 0x20u
#define WT_REC_VISIBILITY_ERR 0x40u
#define WT_REC_VISIBLE_ALL 0x80u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
38
/*
 * WT_PAGE_HEADER --
 *     Blocks have a common header, a WT_PAGE_HEADER structure followed by a
 * block-manager specific structure.
 */
struct __wt_page_header {
    /*
     * The record number of the first record of the page is stored on disk
     * so we can figure out where the column-store leaf page fits into the
     * key space during salvage.
     */
    uint64_t recno; /* 00-07: column-store starting recno */

    /*
     * We maintain page write-generations in the non-transactional case
     * as that's how salvage can determine the most recent page between
     * pages overlapping the same key range.
     */
    uint64_t write_gen; /* 08-15: write generation */

    /*
     * The page's in-memory size isn't rounded or aligned, it's the actual
     * number of bytes the disk-image consumes when instantiated in memory.
     */
    uint32_t mem_size; /* 16-19: in-memory page size */

    union {
        uint32_t entries; /* 20-23: number of cells on page */
        uint32_t datalen; /* 20-23: overflow data length */
    } u;

    uint8_t type; /* 24: page type */

    /*
     * No automatic generation: flag values cannot change, they're written
     * to disk.
     */
#define WT_PAGE_COMPRESSED 0x01u   /* Page is compressed on disk */
#define WT_PAGE_EMPTY_V_ALL 0x02u  /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04u /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08u    /* Page is encrypted on disk */
#define WT_PAGE_LAS_UPDATE 0x10u   /* Page updates in lookaside store */
    uint8_t flags; /* 25: flags */

    /*
     * End the structure with 2 bytes of padding: it wastes space, but it
     * leaves the structure 32-bit aligned and having a few bytes to play
     * with in the future can't hurt.
     */
    uint8_t unused[2]; /* 26-27: unused padding */
};
/*
 * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
 * the compiler inserts padding it will break the world.
 */
#define WT_PAGE_HEADER_SIZE 28
95
96 /*
97 * __wt_page_header_byteswap --
98 * Handle big- and little-endian transformation of a page header.
99 */
100 static inline void
__wt_page_header_byteswap(WT_PAGE_HEADER * dsk)101 __wt_page_header_byteswap(WT_PAGE_HEADER *dsk)
102 {
103 #ifdef WORDS_BIGENDIAN
104 dsk->recno = __wt_bswap64(dsk->recno);
105 dsk->write_gen = __wt_bswap64(dsk->write_gen);
106 dsk->mem_size = __wt_bswap32(dsk->mem_size);
107 dsk->u.entries = __wt_bswap32(dsk->u.entries);
108 #else
109 WT_UNUSED(dsk);
110 #endif
111 }
112
/*
 * The block-manager specific information immediately follows the WT_PAGE_HEADER
 * structure.
 */
#define WT_BLOCK_HEADER_REF(dsk) \
    ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))

/*
 * WT_PAGE_HEADER_BYTE --
 * WT_PAGE_HEADER_BYTE_SIZE --
 *     The first usable data byte on the block (past the combined headers).
 */
#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
    ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
#define WT_PAGE_HEADER_BYTE(btree, dsk) \
    ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
129
/*
 * WT_ADDR --
 *     An in-memory structure to hold a block's location.
 */
struct __wt_addr {
    uint8_t *addr; /* Block-manager's cookie */
    uint8_t size;  /* Block-manager's cookie length */

#define WT_ADDR_INT 1     /* Internal page */
#define WT_ADDR_LEAF 2    /* Leaf page */
#define WT_ADDR_LEAF_NO 3 /* Leaf page, no overflow */
    uint8_t type;

    /*
     * If an address is both as an address for the previous and the current
     * multi-block reconciliations, that is, a block we're writing matches
     * the block written the last time, it will appear in both the current
     * boundary points as well as the page modification's list of previous
     * blocks. The reuse flag is how we know that's happening so the block
     * is treated correctly (not free'd on error, for example).
     */
    uint8_t reuse;
};
153
154 /*
155 * Overflow tracking for reuse: When a page is reconciled, we write new K/V
156 * overflow items. If pages are reconciled multiple times, we need to know
157 * if we've already written a particular overflow record (so we don't write
158 * it again), as well as if we've modified an overflow record previously
159 * written (in which case we want to write a new record and discard blocks
160 * used by the previously written record). Track overflow records written
161 * for the page, storing the values in a skiplist with the record's value as
162 * the "key".
163 */
164 struct __wt_ovfl_reuse {
165 uint32_t value_offset; /* Overflow value offset */
166 uint32_t value_size; /* Overflow value size */
167 uint8_t addr_offset; /* Overflow addr offset */
168 uint8_t addr_size; /* Overflow addr size */
169
170 /*
171 * On each page reconciliation, we clear the entry's in-use flag, and
172 * reset it as the overflow record is re-used. After reconciliation
173 * completes, unused skiplist entries are discarded, along with their
174 * underlying blocks.
175 *
176 * On each page reconciliation, set the just-added flag for each new
177 * skiplist entry; if reconciliation fails for any reason, discard the
178 * newly added skiplist entries, along with their underlying blocks.
179 */
180 /* AUTOMATIC FLAG VALUE GENERATION START */
181 #define WT_OVFL_REUSE_INUSE 0x1u
182 #define WT_OVFL_REUSE_JUST_ADDED 0x2u
183 /* AUTOMATIC FLAG VALUE GENERATION STOP */
184 uint8_t flags;
185
186 /*
187 * The untyped address immediately follows the WT_OVFL_REUSE structure,
188 * the untyped value immediately follows the address.
189 */
190 #define WT_OVFL_REUSE_ADDR(p) \
191 ((void *)((uint8_t *)(p) + (p)->addr_offset))
192 #define WT_OVFL_REUSE_VALUE(p) \
193 ((void *)((uint8_t *)(p) + (p)->value_offset))
194
195 WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */
196 };
197
/*
 * Lookaside table support: when a page is being reconciled for eviction and has
 * updates that might be required by earlier readers in the system, the updates
 * are written into a lookaside table, and restored as necessary if the page is
 * read.
 *
 * The key is a unique marker for the page (a page ID plus a file ID, ordered
 * this way so that overall the lookaside table is append-mostly), a counter
 * (used to ensure the update records remain in the original order), and the
 * record's key (byte-string for row-store, record number for column-store).
 * The value is the WT_UPDATE structure's transaction ID, timestamp, update's
 * prepare state, update type and value.
 *
 * As the key for the lookaside table is different for row- and column-store, we
 * store both key types in a WT_ITEM, building/parsing them in the code, because
 * otherwise we'd need two lookaside files with different key formats. We could
 * make the lookaside table's key standard by moving the source key into the
 * lookaside table value, but that doesn't make the coding any simpler, and it
 * makes the lookaside table's value more likely to overflow the page size when
 * the row-store key is relatively large.
 */
#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
#define WT_LOOKASIDE_COMPRESSOR "snappy"
#else
#define WT_LOOKASIDE_COMPRESSOR "none"
#endif
#define WT_LAS_CONFIG \
    "key_format=" WT_UNCHECKED_STRING(QIQu) \
    ",value_format=" WT_UNCHECKED_STRING(QuBBu) \
    ",block_compressor=" WT_LOOKASIDE_COMPRESSOR \
    ",leaf_value_max=64MB" \
    ",prefix_compression=true"
230
231 /*
232 * WT_PAGE_LOOKASIDE --
233 * Related information for on-disk pages with lookaside entries.
234 */
235 struct __wt_page_lookaside {
236 uint64_t las_pageid; /* Page ID in lookaside */
237 uint64_t max_txn; /* Maximum transaction ID */
238 uint64_t unstable_txn; /* First transaction ID not on page */
239 WT_DECL_TIMESTAMP(max_timestamp)/* Maximum timestamp */
240 WT_DECL_TIMESTAMP(unstable_timestamp)/* First timestamp not on page */
241 bool eviction_to_lookaside; /* Revert to lookaside on eviction */
242 bool resolved; /* History has been read into cache */
243 bool skew_newest; /* Page image has newest versions */
244 };
245
246 /*
247 * WT_PAGE_MODIFY --
248 * When a page is modified, there's additional information to maintain.
249 */
250 struct __wt_page_modify {
251 /* The first unwritten transaction ID (approximate). */
252 uint64_t first_dirty_txn;
253
254 /* The transaction state last time eviction was attempted. */
255 uint64_t last_evict_pass_gen;
256 uint64_t last_eviction_id;
257 WT_DECL_TIMESTAMP(last_eviction_timestamp)
258
259 #ifdef HAVE_DIAGNOSTIC
260 /* Check that transaction time moves forward. */
261 uint64_t last_oldest_id;
262 #endif
263
264 /* Avoid checking for obsolete updates during checkpoints. */
265 uint64_t obsolete_check_txn;
266 WT_DECL_TIMESTAMP(obsolete_check_timestamp)
267
268 /* The largest transaction seen on the page by reconciliation. */
269 uint64_t rec_max_txn;
270 WT_DECL_TIMESTAMP(rec_max_timestamp)
271
272 /* Stable timestamp at last reconciliation. */
273 WT_DECL_TIMESTAMP(last_stable_timestamp)
274
275 /* The largest update transaction ID (approximate). */
276 uint64_t update_txn;
277
278 /* Dirty bytes added to the cache. */
279 size_t bytes_dirty;
280
281 /*
282 * When pages are reconciled, the result is one or more replacement
283 * blocks. A replacement block can be in one of two states: it was
284 * written to disk, and so we have a block address, or it contained
285 * unresolved modifications and we have a disk image for it with a
286 * list of those unresolved modifications. The former is the common
287 * case: we only build lists of unresolved modifications when we're
288 * evicting a page, and we only expect to see unresolved modifications
289 * on a page being evicted in the case of a hot page that's too large
290 * to keep in memory as it is. In other words, checkpoints will skip
291 * unresolved modifications, and will write the blocks rather than
292 * build lists of unresolved modifications.
293 *
294 * Ugly union/struct layout to conserve memory, we never have both
295 * a replace address and multiple replacement blocks.
296 */
297 union {
298 struct { /* Single, written replacement block */
299 WT_ADDR replace;
300
301 /*
302 * A disk image that may or may not have been written, used to
303 * re-instantiate the page in memory.
304 */
305 void *disk_image;
306
307 /* The page has lookaside entries. */
308 WT_PAGE_LOOKASIDE page_las;
309 } r;
310 #undef mod_replace
311 #define mod_replace u1.r.replace
312 #undef mod_disk_image
313 #define mod_disk_image u1.r.disk_image
314 #undef mod_page_las
315 #define mod_page_las u1.r.page_las
316
317 struct { /* Multiple replacement blocks */
318 struct __wt_multi {
319 /*
320 * Block's key: either a column-store record number or a
321 * row-store variable length byte string.
322 */
323 union {
324 uint64_t recno;
325 WT_IKEY *ikey;
326 } key;
327
328 /*
329 * A disk image that may or may not have been written, used to
330 * re-instantiate the page in memory.
331 */
332 void *disk_image;
333
334 /*
335 * List of unresolved updates. Updates are either a row-store
336 * insert or update list, or column-store insert list. When
337 * creating lookaside records, there is an additional value,
338 * the committed item's transaction information.
339 *
340 * If there are unresolved updates, the block wasn't written and
341 * there will always be a disk image.
342 */
343 struct __wt_save_upd {
344 WT_INSERT *ins; /* Insert list reference */
345 WT_ROW *ripcip; /* Original on-page reference */
346 WT_UPDATE *onpage_upd;
347 } *supd;
348 uint32_t supd_entries;
349
350 /*
351 * Disk image was written: address, size and checksum.
352 * On subsequent reconciliations of this page, we avoid writing
353 * the block if it's unchanged by comparing size and checksum;
354 * the reuse flag is set when the block is unchanged and we're
355 * reusing a previous address.
356 */
357 WT_ADDR addr;
358 uint32_t size;
359 uint32_t checksum;
360
361 WT_PAGE_LOOKASIDE page_las;
362 } *multi;
363 uint32_t multi_entries; /* Multiple blocks element count */
364 } m;
365 #undef mod_multi
366 #define mod_multi u1.m.multi
367 #undef mod_multi_entries
368 #define mod_multi_entries u1.m.multi_entries
369 } u1;
370
371 /*
372 * Internal pages need to be able to chain root-page splits and have a
373 * special transactional eviction requirement. Column-store leaf pages
374 * need update and append lists.
375 *
376 * Ugly union/struct layout to conserve memory, a page is either a leaf
377 * page or an internal page.
378 */
379 union {
380 struct {
381 /*
382 * When a root page splits, we create a new page and write it;
383 * the new page can also split and so on, and we continue this
384 * process until we write a single replacement root page. We
385 * use the root split field to track the list of created pages
386 * so they can be discarded when no longer needed.
387 */
388 WT_PAGE *root_split; /* Linked list of root split pages */
389 } intl;
390 #undef mod_root_split
391 #define mod_root_split u2.intl.root_split
392 struct {
393 /*
394 * Appended items to column-stores: there is only a single one
395 * of these active at a time per column-store tree.
396 */
397 WT_INSERT_HEAD **append;
398
399 /*
400 * Updated items in column-stores: variable-length RLE entries
401 * can expand to multiple entries which requires some kind of
402 * list we can expand on demand. Updated items in fixed-length
403 * files could be done based on an WT_UPDATE array as in
404 * row-stores, but there can be a very large number of bits on
405 * a single page, and the cost of the WT_UPDATE array would be
406 * huge.
407 */
408 WT_INSERT_HEAD **update;
409
410 /*
411 * Split-saved last column-store page record. If a column-store
412 * page is split, we save the first record number moved so that
413 * during reconciliation we know the page's last record and can
414 * write any implicitly created deleted records for the page.
415 */
416 uint64_t split_recno;
417 } column_leaf;
418 #undef mod_col_append
419 #define mod_col_append u2.column_leaf.append
420 #undef mod_col_update
421 #define mod_col_update u2.column_leaf.update
422 #undef mod_col_split_recno
423 #define mod_col_split_recno u2.column_leaf.split_recno
424 struct {
425 /* Inserted items for row-store. */
426 WT_INSERT_HEAD **insert;
427
428 /* Updated items for row-stores. */
429 WT_UPDATE **update;
430 } row_leaf;
431 #undef mod_row_insert
432 #define mod_row_insert u2.row_leaf.insert
433 #undef mod_row_update
434 #define mod_row_update u2.row_leaf.update
435 } u2;
436
437 /*
438 * Overflow record tracking for reconciliation. We assume overflow
439 * records are relatively rare, so we don't allocate the structures
440 * to track them until we actually see them in the data.
441 */
442 struct __wt_ovfl_track {
443 /*
444 * Overflow key/value address/byte-string pairs we potentially
445 * reuse each time we reconcile the page.
446 */
447 WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH];
448
449 /*
450 * Overflow key/value addresses to be discarded from the block
451 * manager after reconciliation completes successfully.
452 */
453 WT_CELL **discard;
454 size_t discard_entries;
455 size_t discard_allocated;
456
457 /* Cached overflow value cell/update address pairs. */
458 struct {
459 WT_CELL *cell;
460 uint8_t *data;
461 size_t size;
462 } *remove;
463 size_t remove_allocated;
464 uint32_t remove_next;
465 } *ovfl_track;
466
467 #define WT_PAGE_LOCK(s, p) \
468 __wt_spin_lock((s), &(p)->modify->page_lock)
469 #define WT_PAGE_TRYLOCK(s, p) \
470 __wt_spin_trylock((s), &(p)->modify->page_lock)
471 #define WT_PAGE_UNLOCK(s, p) \
472 __wt_spin_unlock((s), &(p)->modify->page_lock)
473 WT_SPINLOCK page_lock; /* Page's spinlock */
474
475 /*
476 * The page state is incremented when a page is modified.
477 *
478 * WT_PAGE_CLEAN --
479 * The page is clean.
480 * WT_PAGE_DIRTY_FIRST --
481 * The page is in this state after the first operation that marks a
482 * page dirty, or when reconciliation is checking to see if it has
483 * done enough work to be able to mark the page clean.
484 * WT_PAGE_DIRTY --
485 * Two or more updates have been added to the page.
486 */
487 #define WT_PAGE_CLEAN 0
488 #define WT_PAGE_DIRTY_FIRST 1
489 #define WT_PAGE_DIRTY 2
490 uint32_t page_state;
491
492 #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
493 #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
494 #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
495 uint8_t rec_result; /* Reconciliation state */
496
497 #define WT_PAGE_RS_LOOKASIDE 0x1
498 #define WT_PAGE_RS_RESTORED 0x2
499 uint8_t restore_state; /* Created by restoring updates */
500 };
501
502 /*
503 * WT_COL_RLE --
504 * Variable-length column-store pages have an array of page entries with RLE
505 * counts greater than 1 when reading the page, so it's not necessary to walk
506 * the page counting records to find a specific entry. We can do a binary search
507 * in this array, then an offset calculation to find the cell.
508 */
509 WT_PACKED_STRUCT_BEGIN(__wt_col_rle)
510 uint64_t recno; /* Record number of first repeat. */
511 uint64_t rle; /* Repeat count. */
512 uint32_t indx; /* Slot of entry in col_var. */
513 WT_PACKED_STRUCT_END
514
515 /*
516 * WT_PAGE --
517 * The WT_PAGE structure describes the in-memory page information.
518 */
519 struct __wt_page {
520 /* Per page-type information. */
521 union {
522 /*
523 * Internal pages (both column- and row-store).
524 *
525 * In-memory internal pages have an array of pointers to child
526 * structures, maintained in collated order.
527 *
528 * Multiple threads of control may be searching the in-memory
529 * internal page and a child page of the internal page may
530 * cause a split at any time. When a page splits, a new array
531 * is allocated and atomically swapped into place. Threads in
532 * the old array continue without interruption (the old array is
533 * still valid), but have to avoid racing. No barrier is needed
534 * because the array reference is updated atomically, but code
535 * reading the fields multiple times would be a very bad idea.
536 * Specifically, do not do this:
537 * WT_REF **refp = page->u.intl__index->index;
538 * uint32_t entries = page->u.intl__index->entries;
539 *
540 * The field is declared volatile (so the compiler knows not to
541 * read it multiple times), and we obscure the field name and
542 * use a copy macro in all references to the field (so the code
543 * doesn't read it multiple times).
544 */
545 struct {
546 WT_REF *parent_ref; /* Parent reference */
547 uint64_t split_gen; /* Generation of last split */
548
549 struct __wt_page_index {
550 uint32_t entries;
551 uint32_t deleted_entries;
552 WT_REF **index;
553 } * volatile __index; /* Collated children */
554 } intl;
555 #undef pg_intl_parent_ref
556 #define pg_intl_parent_ref u.intl.parent_ref
557 #undef pg_intl_split_gen
558 #define pg_intl_split_gen u.intl.split_gen
559
560 /*
561 * Macros to copy/set the index because the name is obscured to ensure
562 * the field isn't read multiple times.
563 *
564 * There are two versions of WT_INTL_INDEX_GET because the session split
565 * generation is usually set, but it's not always required: for example,
566 * if a page is locked for splitting, or being created or destroyed.
567 */
568 #define WT_INTL_INDEX_GET_SAFE(page) \
569 ((page)->u.intl.__index)
570 #define WT_INTL_INDEX_GET(session, page, pindex) do { \
571 WT_ASSERT(session, \
572 __wt_session_gen(session, WT_GEN_SPLIT) != 0); \
573 (pindex) = WT_INTL_INDEX_GET_SAFE(page); \
574 } while (0)
575 #define WT_INTL_INDEX_SET(page, v) do { \
576 WT_WRITE_BARRIER(); \
577 ((page)->u.intl.__index) = (v); \
578 } while (0)
579
580 /*
581 * Macro to walk the list of references in an internal page.
582 */
583 #define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \
584 WT_PAGE_INDEX *__pindex; \
585 WT_REF **__refp; \
586 uint32_t __entries; \
587 WT_INTL_INDEX_GET(session, page, __pindex); \
588 for (__refp = __pindex->index, \
589 __entries = __pindex->entries; __entries > 0; --__entries) {\
590 (ref) = *__refp++;
591 #define WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) do { \
592 WT_PAGE_INDEX *__pindex; \
593 WT_REF **__refp; \
594 uint32_t __entries; \
595 WT_INTL_INDEX_GET(session, page, __pindex); \
596 for (__refp = __pindex->index + __pindex->entries, \
597 __entries = __pindex->entries; __entries > 0; --__entries) {\
598 (ref) = *--__refp;
599 #define WT_INTL_FOREACH_END \
600 } \
601 } while (0)
602
603 /* Row-store leaf page. */
604 WT_ROW *row; /* Key/value pairs */
605 #undef pg_row
606 #define pg_row u.row
607
608 /* Fixed-length column-store leaf page. */
609 uint8_t *fix_bitf; /* Values */
610 #undef pg_fix_bitf
611 #define pg_fix_bitf u.fix_bitf
612
613 /* Variable-length column-store leaf page. */
614 struct {
615 WT_COL *col_var; /* Values */
616
617 /*
618 * Variable-length column-store pages have an array
619 * of page entries with RLE counts greater than 1 when
620 * reading the page, so it's not necessary to walk the
621 * page counting records to find a specific entry. We
622 * can do a binary search in this array, then an offset
623 * calculation to find the cell.
624 *
625 * It's a separate structure to keep the page structure
626 * as small as possible.
627 */
628 struct __wt_col_var_repeat {
629 uint32_t nrepeats; /* repeat slots */
630 WT_COL_RLE repeats[0]; /* lookup RLE array */
631 } *repeats;
632 #define WT_COL_VAR_REPEAT_SET(page) \
633 ((page)->u.col_var.repeats != NULL)
634 } col_var;
635 #undef pg_var
636 #define pg_var u.col_var.col_var
637 #undef pg_var_repeats
638 #define pg_var_repeats u.col_var.repeats->repeats
639 #undef pg_var_nrepeats
640 #define pg_var_nrepeats u.col_var.repeats->nrepeats
641 } u;
642
643 /*
644 * Page entries, type and flags are positioned at the end of the WT_PAGE
645 * union to reduce cache misses in the row-store search function.
646 *
647 * The entries field only applies to leaf pages, internal pages use the
648 * page-index entries instead.
649 */
650 uint32_t entries; /* Leaf page entries */
651
652 #define WT_PAGE_IS_INTERNAL(page) \
653 ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
654 #define WT_PAGE_INVALID 0 /* Invalid page */
655 #define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
656 #define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
657 #define WT_PAGE_COL_INT 3 /* Col-store internal page */
658 #define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
659 #define WT_PAGE_OVFL 5 /* Overflow page */
660 #define WT_PAGE_ROW_INT 6 /* Row-store internal page */
661 #define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
662 uint8_t type; /* Page type */
663
664 /* AUTOMATIC FLAG VALUE GENERATION START */
665 #define WT_PAGE_BUILD_KEYS 0x01u /* Keys have been built in memory */
666 #define WT_PAGE_DISK_ALLOC 0x02u /* Disk image in allocated memory */
667 #define WT_PAGE_DISK_MAPPED 0x04u /* Disk image in mapped memory */
668 #define WT_PAGE_EVICT_LRU 0x08u /* Page is on the LRU queue */
669 #define WT_PAGE_EVICT_NO_PROGRESS 0x10u /* Eviction doesn't count as progress */
670 #define WT_PAGE_OVERFLOW_KEYS 0x20u /* Page has overflow keys */
671 #define WT_PAGE_SPLIT_INSERT 0x40u /* A leaf page was split for append */
672 #define WT_PAGE_UPDATE_IGNORE 0x80u /* Ignore updates on page discard */
673 /* AUTOMATIC FLAG VALUE GENERATION STOP */
674 uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
675
676 uint8_t unused[2]; /* Unused padding */
677
678 /*
679 * The page's read generation acts as an LRU value for each page in the
680 * tree; it is used by the eviction server thread to select pages to be
681 * discarded from the in-memory tree.
682 *
683 * The read generation is a 64-bit value, if incremented frequently, a
684 * 32-bit value could overflow.
685 *
686 * The read generation is a piece of shared memory potentially read
687 * by many threads. We don't want to update page read generations for
688 * in-cache workloads and suffer the cache misses, so we don't simply
689 * increment the read generation value on every access. Instead, the
690 * read generation is incremented by the eviction server each time it
691 * becomes active. To avoid incrementing a page's read generation too
692 * frequently, it is set to a future point.
693 *
694 * Because low read generation values have special meaning, and there
695 * are places where we manipulate the value, use an initial value well
696 * outside of the special range.
697 */
698 #define WT_READGEN_NOTSET 0
699 #define WT_READGEN_OLDEST 1
700 #define WT_READGEN_WONT_NEED 2
701 #define WT_READGEN_EVICT_SOON(readgen) \
702 ((readgen) != WT_READGEN_NOTSET && (readgen) < WT_READGEN_START_VALUE)
703 #define WT_READGEN_START_VALUE 100
704 #define WT_READGEN_STEP 100
705 uint64_t read_gen;
706
707 size_t memory_footprint; /* Memory attached to the page */
708
709 /* Page's on-disk representation: NULL for pages created in memory. */
710 const WT_PAGE_HEADER *dsk;
711
712 /* If/when the page is modified, we need lots more information. */
713 WT_PAGE_MODIFY *modify;
714
715 /* This is the 64 byte boundary, try to keep hot fields above here. */
716
717 uint64_t cache_create_gen; /* Page create timestamp */
718 uint64_t evict_pass_gen; /* Eviction pass generation */
719 };
720
/*
 * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
 *     Return the offset/pointer of a pointer/offset in a page disk image.
 */
#define WT_PAGE_DISK_OFFSET(page, p) \
    WT_PTRDIFF32(p, (page)->dsk)
#define WT_PAGE_REF_OFFSET(page, o) \
    ((void *)((uint8_t *)((page)->dsk) + (o)))
729
/*
 * Prepare update states.
 *
 * Prepare update synchronization is based on the state field, which has the
 * following possible states:
 *
 * WT_PREPARE_INIT:
 *     The initial prepare state of either an update or a page_del structure,
 *     indicating a prepare phase has not started yet.
 *     This state has no impact on the visibility of the update's data.
 *
 * WT_PREPARE_INPROGRESS:
 *     Update is in prepared phase.
 *
 * WT_PREPARE_LOCKED:
 *     State is locked as state transition is in progress from INPROGRESS to
 *     RESOLVED. Any reader of the state needs to wait for state transition to
 *     complete.
 *
 * WT_PREPARE_RESOLVED:
 *     Represents the commit state of the prepared update.
 *
 * State Transition:
 *     From uncommitted -> prepare -> commit:
 *     INIT --> INPROGRESS --> LOCKED --> RESOLVED
 *     LOCKED will be a momentary phase during timestamp update.
 *
 *     From uncommitted -> prepare -> rollback:
 *     INIT --> INPROGRESS
 *     Prepare state will not be updated during rollback and will continue to
 *     have the state as INPROGRESS.
 */
#define WT_PREPARE_INIT 0 /* Must be 0, as structures
                             will be default initialized
                             with 0. */
#define WT_PREPARE_INPROGRESS 1
#define WT_PREPARE_LOCKED 2
#define WT_PREPARE_RESOLVED 3
768
/*
 * Page state.
 *
 * Synchronization is based on the WT_REF->state field, which has a number of
 * possible states:
 *
 * WT_REF_DISK:
 *     The initial setting before a page is brought into memory, and set as a
 *     result of page eviction; the page is on disk, and must be read into
 *     memory before use. WT_REF_DISK has a value of 0 (the default state
 *     after allocating cleared memory).
 *
 * WT_REF_DELETED:
 *     The page is on disk, but has been deleted from the tree; we can delete
 *     row-store leaf pages without reading them if they don't reference
 *     overflow items.
 *
 * WT_REF_LIMBO:
 *     The page image has been loaded into memory but there is additional
 *     history in the lookaside table that has not been applied.
 *
 * WT_REF_LOCKED:
 *     Locked for exclusive access. In eviction, this page or a parent has
 *     been selected for eviction; once hazard pointers are checked, the page
 *     will be evicted. When reading a page that was previously deleted, it
 *     is locked until the page is in memory with records marked deleted. The
 *     thread that set the page to WT_REF_LOCKED has exclusive access, no
 *     other thread may use the WT_REF until the state is changed.
 *
 * WT_REF_LOOKASIDE:
 *     The page is on disk (as per WT_REF_DISK) and has entries in the
 *     lookaside table that must be applied before the page can be read.
 *
 * WT_REF_MEM:
 *     Set by a reading thread once the page has been read from disk; the page
 *     is in the cache and the page reference is OK.
 *
 * WT_REF_READING:
 *     Set by a reading thread before reading an ordinary page from disk;
 *     other readers of the page wait until the read completes. Sync can
 *     safely skip over such pages: they are clean by definition.
 *
 * WT_REF_SPLIT:
 *     Set when the page is split; the WT_REF is dead and can no longer be
 *     used.
 *
 * The life cycle of a typical page goes like this: pages are read into memory
 * from disk and their state set to WT_REF_MEM. When the page is selected for
 * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting
 * threads reset the page's state when finished with the page: if eviction was
 * successful (a clean page was discarded, and a dirty page was written to disk
 * and then discarded), the page state is set to WT_REF_DISK; if eviction failed
 * because the page was busy, page state is reset to WT_REF_MEM.
 *
 * Readers check the state field and if it's WT_REF_MEM, they set a hazard
 * pointer to the page, flush memory and re-confirm the page state. If the
 * page state is unchanged, the reader has a valid reference and can proceed.
 *
 * When an evicting thread wants to discard a page from the tree, it sets the
 * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. If a
 * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page
 * to the readers. If the evicting thread does not find a hazard pointer,
 * the page is evicted.
 */
833
834 /*
835 * WT_PAGE_DELETED --
836 * Related information for truncated pages.
837 */
838 struct __wt_page_deleted {
839 volatile uint64_t txnid; /* Transaction ID */
840 WT_DECL_TIMESTAMP(timestamp)
841
842 /*
843 * The state is used for transaction prepare to manage visibility
844 * and inheriting prepare state to update_list.
845 */
846 volatile uint8_t prepare_state; /* Prepare state. */
847
848 uint32_t previous_state; /* Previous state */
849
850 WT_UPDATE **update_list; /* List of updates for abort */
851 };
852
853 /*
854 * WT_REF --
855 * A single in-memory page and the state information used to determine if
856 * it's OK to dereference the pointer to the page.
857 */
858 struct __wt_ref {
859 WT_PAGE *page; /* Page */
860
861 /*
862 * When the tree deepens as a result of a split, the home page value
863 * changes. Don't cache it, we need to see that change when looking
864 * up our slot in the page's index structure.
865 */
866 WT_PAGE * volatile home; /* Reference page */
867 volatile uint32_t pindex_hint; /* Reference page index hint */
868
869 #define WT_REF_DISK 0 /* Page is on disk */
870 #define WT_REF_DELETED 1 /* Page is on disk, but deleted */
871 #define WT_REF_LIMBO 2 /* Page is in cache without history */
872 #define WT_REF_LOCKED 3 /* Page locked for exclusive access */
873 #define WT_REF_LOOKASIDE 4 /* Page is on disk with lookaside */
874 #define WT_REF_MEM 5 /* Page is in cache and valid */
875 #define WT_REF_READING 6 /* Page being read */
876 #define WT_REF_SPLIT 7 /* Parent page split (WT_REF dead) */
877 volatile uint32_t state; /* Page state */
878
879 /*
880 * Address: on-page cell if read from backing block, off-page WT_ADDR
881 * if instantiated in-memory, or NULL if page created in-memory.
882 */
883 void *addr;
884
885 /*
886 * The child page's key. Do NOT change this union without reviewing
887 * __wt_ref_key.
888 */
889 union {
890 uint64_t recno; /* Column-store: starting recno */
891 void *ikey; /* Row-store: key */
892 } key;
893 #undef ref_recno
894 #define ref_recno key.recno
895 #undef ref_ikey
896 #define ref_ikey key.ikey
897
898 WT_PAGE_DELETED *page_del; /* Deleted page information */
899 WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */
900 };
901 /*
902 * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
903 * the compiler hasn't inserted padding which would break the world.
904 */
905 #define WT_REF_SIZE 56
906
907 /*
908 * WT_ROW --
909 * Each in-memory page row-store leaf page has an array of WT_ROW structures:
910 * this is created from on-page data when a page is read from the file. It's
911 * sorted by key, fixed in size, and starts with a reference to on-page data.
912 *
913 * Multiple threads of control may be searching the in-memory row-store pages,
914 * and the key may be instantiated at any time. Code must be able to handle
915 * both when the key has not been instantiated (the key field points into the
916 * page's disk image), and when the key has been instantiated (the key field
917 * points outside the page's disk image). We don't need barriers because the
918 * key is updated atomically, but code that reads the key field multiple times
919 * is a very, very bad idea. Specifically, do not do this:
920 *
921 * key = rip->key;
922 * if (key_is_on_page(key)) {
923 * cell = rip->key;
924 * }
925 *
926 * The field is declared volatile (so the compiler knows it shouldn't read it
927 * multiple times), and we obscure the field name and use a copy macro in all
928 * references to the field (so the code doesn't read it multiple times), all
929 * to make sure we don't introduce this bug (again).
930 */
struct __wt_row {	/* On-page key, on-page cell, or off-page WT_IKEY */
	void * volatile __key;	/* Obscured: use the WT_ROW_KEY macros. */
};
/* Read/write the key field exactly once (see the comment above). */
#define	WT_ROW_KEY_COPY(rip)	((rip)->__key)
#define	WT_ROW_KEY_SET(rip, v)	((rip)->__key) = (void *)(v)
936
937 /*
938 * WT_ROW_FOREACH --
939 * Walk the entries of an in-memory row-store leaf page.
940 */
941 #define WT_ROW_FOREACH(page, rip, i) \
942 for ((i) = (page)->entries, \
943 (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i))
944 #define WT_ROW_FOREACH_REVERSE(page, rip, i) \
945 for ((i) = (page)->entries, \
946 (rip) = (page)->pg_row + ((page)->entries - 1); \
947 (i) > 0; --(rip), --(i))
948
949 /*
950 * WT_ROW_SLOT --
951 * Return the 0-based array offset based on a WT_ROW reference.
952 */
953 #define WT_ROW_SLOT(page, rip) \
954 ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row))
955
956 /*
957 * WT_COL --
958 * Each in-memory variable-length column-store leaf page has an array of WT_COL
959 * structures: this is created from on-page data when a page is read from the
960 * file. It's fixed in size, and references data on the page.
961 */
962 struct __wt_col {
963 /*
964 * Variable-length column-store data references are page offsets, not
965 * pointers (we boldly re-invent short pointers). The trade-off is 4B
966 * per K/V pair on a 64-bit machine vs. a single cycle for the addition
967 * of a base pointer. The on-page data is a WT_CELL (same as row-store
968 * pages).
969 *
970 * If the value is 0, it's a single, deleted record.
971 *
972 * Obscure the field name, code shouldn't use WT_COL->__col_value, the
973 * public interface is WT_COL_PTR and WT_COL_PTR_SET.
974 */
975 uint32_t __col_value;
976 };
977
978 /*
979 * WT_COL_PTR, WT_COL_PTR_SET --
980 * Return/Set a pointer corresponding to the data offset. (If the item does
981 * not exist on the page, return a NULL.)
982 */
983 #define WT_COL_PTR(page, cip) \
984 ((cip)->__col_value == 0 ? \
985 NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value))
986 #define WT_COL_PTR_SET(cip, value) \
987 (cip)->__col_value = (value)
988
989 /*
990 * WT_COL_FOREACH --
991 * Walk the entries of variable-length column-store leaf page.
992 */
993 #define WT_COL_FOREACH(page, cip, i) \
994 for ((i) = (page)->entries, \
995 (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i))
996
997 /*
998 * WT_COL_SLOT --
999 * Return the 0-based array offset based on a WT_COL reference.
1000 */
1001 #define WT_COL_SLOT(page, cip) \
1002 ((uint32_t)(((WT_COL *)(cip)) - (page)->pg_var))
1003
1004 /*
1005 * WT_IKEY --
1006 * Instantiated key: row-store keys are usually prefix compressed and sometimes
1007 * Huffman encoded or overflow objects. Normally, a row-store page in-memory
1008 * key points to the on-page WT_CELL, but in some cases, we instantiate the key
1009 * in memory, in which case the row-store page in-memory key points to a WT_IKEY
1010 * structure.
1011 */
1012 struct __wt_ikey {
1013 uint32_t size; /* Key length */
1014
1015 /*
1016 * If we no longer point to the key's on-page WT_CELL, we can't find its
1017 * related value. Save the offset of the key cell in the page.
1018 *
1019 * Row-store cell references are page offsets, not pointers (we boldly
1020 * re-invent short pointers). The trade-off is 4B per K/V pair on a
1021 * 64-bit machine vs. a single cycle for the addition of a base pointer.
1022 */
1023 uint32_t cell_offset;
1024
1025 /* The key bytes immediately follow the WT_IKEY structure. */
1026 #define WT_IKEY_DATA(ikey) \
1027 ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
1028 };
1029
1030 /*
1031 * WT_UPDATE --
1032 * Entries on leaf pages can be updated, either modified or deleted. Updates
1033 * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
1034 * page's WT_UPDATE array. When the first element on a page is updated, the
1035 * WT_UPDATE array is allocated, with one slot for every existing element in
1036 * the page. A slot points to a WT_UPDATE structure; if more than one update
1037 * is done for an entry, WT_UPDATE structures are formed into a forward-linked
1038 * list.
1039 */
1040 struct __wt_update {
1041 volatile uint64_t txnid; /* transaction ID */
1042 #if WT_TIMESTAMP_SIZE == 8
1043 WT_DECL_TIMESTAMP(timestamp) /* aligned uint64_t timestamp */
1044 #endif
1045
1046 WT_UPDATE *next; /* forward-linked list */
1047
1048 uint32_t size; /* data length */
1049
1050 #define WT_UPDATE_INVALID 0 /* diagnostic check */
1051 #define WT_UPDATE_BIRTHMARK 1 /* transaction for on-page value */
1052 #define WT_UPDATE_MODIFY 2 /* partial-update modify value */
1053 #define WT_UPDATE_RESERVE 3 /* reserved */
1054 #define WT_UPDATE_STANDARD 4 /* complete value */
1055 #define WT_UPDATE_TOMBSTONE 5 /* deleted */
1056 uint8_t type; /* type (one byte to conserve memory) */
1057
1058 /* If the update includes a complete value. */
1059 #define WT_UPDATE_DATA_VALUE(upd) \
1060 ((upd)->type == WT_UPDATE_STANDARD || \
1061 (upd)->type == WT_UPDATE_TOMBSTONE)
1062
1063 #if WT_TIMESTAMP_SIZE != 8
1064 WT_DECL_TIMESTAMP(timestamp) /* unaligned uint8_t array timestamp */
1065 #endif
1066
1067 /*
1068 * The update state is used for transaction prepare to manage
1069 * visibility and transitioning update structure state safely.
1070 */
1071 volatile uint8_t prepare_state; /* Prepare state. */
1072
1073 /*
1074 * Zero or more bytes of value (the payload) immediately follows the
1075 * WT_UPDATE structure. We use a C99 flexible array member which has
1076 * the semantics we want.
1077 */
1078 uint8_t data[]; /* start of the data */
1079 };
1080
1081 /*
1082 * WT_UPDATE_SIZE is the expected structure size excluding the payload data --
1083 * we verify the build to ensure the compiler hasn't inserted padding.
1084 */
1085 #define WT_UPDATE_SIZE (22 + WT_TIMESTAMP_SIZE)
1086
1087 /*
1088 * The memory size of an update: include some padding because this is such a
1089 * common case that overhead of tiny allocations can swamp our cache overhead
1090 * calculation.
1091 */
1092 #define WT_UPDATE_MEMSIZE(upd) \
1093 WT_ALIGN(WT_UPDATE_SIZE + (upd)->size, 32)
1094
1095 /*
1096 * WT_MAX_MODIFY_UPDATE --
1097 * Limit update chains value to avoid penalizing reads and
1098 * permit truncation. Having a smaller value will penalize the cases
1099 * when history has to be maintained, resulting in multiplying cache
1100 * pressure.
1101 */
1102 #define WT_MAX_MODIFY_UPDATE 10
1103
1104 /*
1105 * WT_MODIFY_MEM_FACTOR --
1106 * Limit update chains to a fraction of the base document size.
1107 */
1108 #define WT_MODIFY_MEM_FRACTION 10
1109
1110 /*
1111 * WT_INSERT --
1112 *
1113 * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
1114 * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
1115 * every existing element in the page, plus one additional slot. A slot points
1116 * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
1117 * element that references it and before the subsequent WT_ROW element; the
1118 * skiplist structure has a randomly chosen depth of next pointers in each
1119 * inserted node.
1120 *
1121 * The additional slot is because it's possible to insert items smaller than any
1122 * existing key on the page: for that reason, the first slot of the insert array
1123 * holds keys smaller than any other key on the page.
1124 *
1125 * In column-store variable-length run-length encoded pages, a single indx
1126 * entry may reference a large number of records, because there's a single
1127 * on-page entry representing many identical records. (We don't expand those
1128 * entries when the page comes into memory, as that would require resources as
1129 * pages are moved to/from the cache, including read-only files.) Instead, a
1130 * single indx entry represents all of the identical records originally found
1131 * on the page.
1132 *
1133 * Modifying (or deleting) run-length encoded column-store records is hard
1134 * because the page's entry no longer references a set of identical items. We
1135 * handle this by "inserting" a new entry into the insert array, with its own
1136 * record number. (This is the only case where it's possible to insert into a
1137 * column-store: only appends are allowed, as insert requires re-numbering
1138 * subsequent records. Berkeley DB did support mutable records, but it won't
1139 * scale and it isn't useful enough to re-implement, IMNSHO.)
1140 */
1141 struct __wt_insert {
1142 WT_UPDATE *upd; /* value */
1143
1144 union {
1145 uint64_t recno; /* column-store record number */
1146 struct {
1147 uint32_t offset; /* row-store key data start */
1148 uint32_t size; /* row-store key data size */
1149 } key;
1150 } u;
1151
1152 #define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size)
1153 #define WT_INSERT_KEY(ins) \
1154 ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset))
1155 #define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno)
1156
1157 WT_INSERT *next[0]; /* forward-linked skip list */
1158 };
1159
1160 /*
1161 * Skiplist helper macros.
1162 */
1163 #define WT_SKIP_FIRST(ins_head) \
1164 (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0])
1165 #define WT_SKIP_LAST(ins_head) \
1166 (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0])
1167 #define WT_SKIP_NEXT(ins) ((ins)->next[0])
1168 #define WT_SKIP_FOREACH(ins, ins_head) \
1169 for ((ins) = WT_SKIP_FIRST(ins_head); \
1170 (ins) != NULL; \
1171 (ins) = WT_SKIP_NEXT(ins))
1172
1173 /*
1174 * Atomically allocate and swap a structure or array into place.
1175 */
1176 #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
1177 if (((v) = (dest)) == NULL) { \
1178 WT_ERR(__wt_calloc_def(s, count, &(v))); \
1179 if (__wt_atomic_cas_ptr(&(dest), NULL, v)) \
1180 __wt_cache_page_inmem_incr( \
1181 s, page, (count) * sizeof(*(v))); \
1182 else \
1183 __wt_free(s, v); \
1184 } \
1185 } while (0)
1186
1187 /*
1188 * WT_INSERT_HEAD --
1189 * The head of a skiplist of WT_INSERT items.
1190 */
1191 struct __wt_insert_head {
1192 WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
1193 WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
1194 };
1195
1196 /*
1197 * The row-store leaf page insert lists are arrays of pointers to structures,
1198 * and may not exist. The following macros return an array entry if the array
1199 * of pointers and the specific structure exist, else NULL.
1200 */
1201 #define WT_ROW_INSERT_SLOT(page, slot) \
1202 ((page)->modify == NULL || \
1203 (page)->modify->mod_row_insert == NULL ? \
1204 NULL : (page)->modify->mod_row_insert[slot])
1205 #define WT_ROW_INSERT(page, ip) \
1206 WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
1207 #define WT_ROW_UPDATE(page, ip) \
1208 ((page)->modify == NULL || \
1209 (page)->modify->mod_row_update == NULL ? \
1210 NULL : (page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)])
1211 /*
1212 * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
1213 * the "one per WT_ROW slot" insert array. That's because the insert array
1214 * requires an extra slot to hold keys that sort before any key found on the
1215 * original page.
1216 */
1217 #define WT_ROW_INSERT_SMALLEST(page) \
1218 ((page)->modify == NULL || \
1219 (page)->modify->mod_row_insert == NULL ? \
1220 NULL : (page)->modify->mod_row_insert[(page)->entries])
1221
1222 /*
1223 * The column-store leaf page update lists are arrays of pointers to structures,
1224 * and may not exist. The following macros return an array entry if the array
1225 * of pointers and the specific structure exist, else NULL.
1226 */
1227 #define WT_COL_UPDATE_SLOT(page, slot) \
1228 ((page)->modify == NULL || \
1229 (page)->modify->mod_col_update == NULL ? \
1230 NULL : (page)->modify->mod_col_update[slot])
1231 #define WT_COL_UPDATE(page, ip) \
1232 WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
1233
1234 /*
1235 * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
1236 * column-store updates for a page.
1237 */
1238 #define WT_COL_UPDATE_SINGLE(page) \
1239 WT_COL_UPDATE_SLOT(page, 0)
1240
1241 /*
1242 * WT_COL_APPEND is an WT_INSERT list, used for fixed- and variable-length
1243 * appends.
1244 */
1245 #define WT_COL_APPEND(page) \
1246 ((page)->modify == NULL || \
1247 (page)->modify->mod_col_append == NULL ? \
1248 NULL : (page)->modify->mod_col_append[0])
1249
/*
 * WT_FIX_FOREACH walks fixed-length bit-fields on a disk page: i is the
 * 0-based entry index, v is the bitcnt-wide value at that index.  (The
 * initializer passes a literal 0 index because i is 0 at that point.)
 */
#define	WT_FIX_FOREACH(btree, dsk, v, i)				\
	for ((i) = 0,							\
	    (v) = (i) < (dsk)->u.entries ?				\
	    __bit_getv(							\
	    WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0;	\
	    (i) < (dsk)->u.entries; ++(i),				\
	    (v) = __bit_getv(						\
	    WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
1259
1260 /*
1261 * Manage split generation numbers. Splits walk the list of sessions to check
1262 * when it is safe to free structures that have been replaced. We also check
1263 * that list periodically (e.g., when wrapping up a transaction) to free any
1264 * memory we can.
1265 *
1266 * Before a thread enters code that will examine page indexes (which are
1267 * swapped out by splits), it publishes a copy of the current split generation
1268 * into its session. Don't assume that threads never re-enter this code: if we
1269 * already have a split generation, leave it alone. If our caller is examining
1270 * an index, we don't want the oldest split generation to move forward and
1271 * potentially free it.
1272 */
1273 #define WT_ENTER_PAGE_INDEX(session) do { \
1274 uint64_t __prev_split_gen = \
1275 __wt_session_gen(session, WT_GEN_SPLIT); \
1276 if (__prev_split_gen == 0) \
1277 __wt_session_gen_enter(session, WT_GEN_SPLIT);
1278
1279 #define WT_LEAVE_PAGE_INDEX(session) \
1280 if (__prev_split_gen == 0) \
1281 __wt_session_gen_leave(session, WT_GEN_SPLIT); \
1282 } while (0)
1283
1284 #define WT_WITH_PAGE_INDEX(session, e) \
1285 WT_ENTER_PAGE_INDEX(session); \
1286 (e); \
1287 WT_LEAVE_PAGE_INDEX(session)
1288