/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#define	WT_RECNO_OOB	0		/* Illegal record number */

/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_READ_CACHE			0x0001u
#define	WT_READ_DELETED_CHECK		0x0002u
#define	WT_READ_DELETED_SKIP		0x0004u
#define	WT_READ_IGNORE_CACHE_SIZE	0x0008u
#define	WT_READ_LOOKASIDE		0x0010u
#define	WT_READ_NOTFOUND_OK		0x0020u
#define	WT_READ_NO_GEN			0x0040u
#define	WT_READ_NO_SPLIT		0x0080u
#define	WT_READ_NO_WAIT			0x0100u
#define	WT_READ_PREV			0x0200u
#define	WT_READ_RESTART_OK		0x0400u
#define	WT_READ_SKIP_INTL		0x0800u
#define	WT_READ_TRUNCATE		0x1000u
#define	WT_READ_WONT_NEED		0x2000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */

/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_REC_CHECKPOINT	0x01u
#define	WT_REC_EVICT		0x02u
#define	WT_REC_IN_MEMORY	0x04u
#define	WT_REC_LOOKASIDE	0x08u
#define	WT_REC_SCRUB		0x10u
#define	WT_REC_UPDATE_RESTORE	0x20u
#define	WT_REC_VISIBILITY_ERR	0x40u
#define	WT_REC_VISIBLE_ALL	0x80u
/* AUTOMATIC FLAG VALUE GENERATION STOP */

/*
 * WT_PAGE_HEADER --
 *	Blocks have a common header, a WT_PAGE_HEADER structure followed by a
 * block-manager specific structure.
 */
struct __wt_page_header {
	/*
	 * The record number of the first record of the page is stored on disk
	 * so we can figure out where the column-store leaf page fits into the
	 * key space during salvage.
	 */
	uint64_t recno;			/* 00-07: column-store starting recno */

	/*
	 * We maintain page write-generations in the non-transactional case
	 * as that's how salvage can determine the most recent page between
	 * pages overlapping the same key range.
	 */
	uint64_t write_gen;		/* 08-15: write generation */

	/*
	 * The page's in-memory size isn't rounded or aligned; it's the actual
	 * number of bytes the disk image consumes when instantiated in memory.
	 */
	uint32_t mem_size;		/* 16-19: in-memory page size */

	union {
		uint32_t entries;	/* 20-23: number of cells on page */
		uint32_t datalen;	/* 20-23: overflow data length */
	} u;

	uint8_t type;			/* 24: page type */

	/*
	 * No automatic generation: flag values cannot change, they're written
	 * to disk.
	 */
#define	WT_PAGE_COMPRESSED	0x01u	/* Page is compressed on disk */
#define	WT_PAGE_EMPTY_V_ALL	0x02u	/* Page has all zero-length values */
#define	WT_PAGE_EMPTY_V_NONE	0x04u	/* Page has no zero-length values */
#define	WT_PAGE_ENCRYPTED	0x08u	/* Page is encrypted on disk */
#define	WT_PAGE_LAS_UPDATE	0x10u	/* Page updates in lookaside store */
	uint8_t flags;			/* 25: flags */

	/*
	 * End the structure with 2 bytes of padding: it wastes space, but it
	 * leaves the structure 32-bit aligned and having a few bytes to play
	 * with in the future can't hurt.
	 */
	uint8_t unused[2];		/* 26-27: unused padding */
};
/*
 * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
 * the compiler inserts padding it will break the world.
 */
#define	WT_PAGE_HEADER_SIZE		28

/*
 * __wt_page_header_byteswap --
 *	Handle big- and little-endian transformation of a page header.
 */
static inline void
__wt_page_header_byteswap(WT_PAGE_HEADER *dsk)
{
#ifdef WORDS_BIGENDIAN
	dsk->recno = __wt_bswap64(dsk->recno);
	dsk->write_gen = __wt_bswap64(dsk->write_gen);
	dsk->mem_size = __wt_bswap32(dsk->mem_size);
	dsk->u.entries = __wt_bswap32(dsk->u.entries);
#else
	WT_UNUSED(dsk);
#endif
}

/*
 * The block-manager specific information immediately follows the
 * WT_PAGE_HEADER structure.
 */
#define	WT_BLOCK_HEADER_REF(dsk)					\
	((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))

/*
 * WT_PAGE_HEADER_BYTE --
 * WT_PAGE_HEADER_BYTE_SIZE --
 *	The first usable data byte on the block (past the combined headers).
 */
#define	WT_PAGE_HEADER_BYTE_SIZE(btree)					\
	((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
#define	WT_PAGE_HEADER_BYTE(btree, dsk)					\
	((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
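
/*
 * For example, a minimal sketch (illustrative only; assume a hypothetical
 * block manager whose header is 12 bytes, so btree->block_header == 12).
 * The block-manager header then starts 28 bytes into the block and the
 * first usable data byte is 40 bytes in:
 *
 *	WT_PAGE_HEADER *dsk;
 *	void *blk, *data;
 *
 *	blk = WT_BLOCK_HEADER_REF(dsk);
 *	data = WT_PAGE_HEADER_BYTE(btree, dsk);
 *
 * Here blk is (uint8_t *)dsk + 28 and data is (uint8_t *)dsk + 40.
 */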

/*
 * WT_ADDR --
 *	An in-memory structure to hold a block's location.
 */
struct __wt_addr {
	uint8_t *addr;			/* Block-manager's cookie */
	uint8_t  size;			/* Block-manager's cookie length */

#define	WT_ADDR_INT	1		/* Internal page */
#define	WT_ADDR_LEAF	2		/* Leaf page */
#define	WT_ADDR_LEAF_NO	3		/* Leaf page, no overflow */
	uint8_t  type;

	/*
	 * If an address is used as an address for both the previous and the
	 * current multi-block reconciliations, that is, if a block we're
	 * writing matches the block written the last time, it will appear in
	 * both the current boundary points and the page modification's list
	 * of previous blocks.  The reuse flag is how we know that's happening
	 * so the block is treated correctly (not freed on error, for example).
	 */
	uint8_t	 reuse;
};

/*
 * Overflow tracking for reuse: When a page is reconciled, we write new K/V
 * overflow items.  If pages are reconciled multiple times, we need to know
 * if we've already written a particular overflow record (so we don't write
 * it again), as well as if we've modified an overflow record previously
 * written (in which case we want to write a new record and discard blocks
 * used by the previously written record).  Track overflow records written
 * for the page, storing the values in a skiplist with the record's value as
 * the "key".
 */
struct __wt_ovfl_reuse {
	uint32_t value_offset;		/* Overflow value offset */
	uint32_t value_size;		/* Overflow value size */
	uint8_t  addr_offset;		/* Overflow addr offset */
	uint8_t  addr_size;		/* Overflow addr size */

	/*
	 * On each page reconciliation, we clear the entry's in-use flag, and
	 * reset it as the overflow record is re-used.  After reconciliation
	 * completes, unused skiplist entries are discarded, along with their
	 * underlying blocks.
	 *
	 * On each page reconciliation, set the just-added flag for each new
	 * skiplist entry; if reconciliation fails for any reason, discard the
	 * newly added skiplist entries, along with their underlying blocks.
	 */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_OVFL_REUSE_INUSE		0x1u
#define	WT_OVFL_REUSE_JUST_ADDED	0x2u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint8_t	 flags;

	/*
	 * The untyped address immediately follows the WT_OVFL_REUSE structure,
	 * the untyped value immediately follows the address.
	 */
#define	WT_OVFL_REUSE_ADDR(p)						\
	((void *)((uint8_t *)(p) + (p)->addr_offset))
#define	WT_OVFL_REUSE_VALUE(p)						\
	((void *)((uint8_t *)(p) + (p)->value_offset))

	WT_OVFL_REUSE *next[0];		/* Forward-linked skip list */
};
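
/*
 * For example, a minimal sketch (illustrative only) of reading the packed
 * address and value that trail a WT_OVFL_REUSE entry:
 *
 *	WT_OVFL_REUSE *reuse;
 *	const uint8_t *addr, *value;
 *
 *	addr = WT_OVFL_REUSE_ADDR(reuse);
 *	value = WT_OVFL_REUSE_VALUE(reuse);
 *
 * Because the entry stores offsets rather than pointers, the skiplist entry,
 * the address and the value all travel in a single allocation.
 */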

/*
 * Lookaside table support: when a page is being reconciled for eviction and has
 * updates that might be required by earlier readers in the system, the updates
 * are written into a lookaside table, and restored as necessary if the page is
 * read.
 *
 * The key is a unique marker for the page (a page ID plus a file ID, ordered
 * this way so that overall the lookaside table is append-mostly), a counter
 * (used to ensure the update records remain in the original order), and the
 * record's key (byte-string for row-store, record number for column-store).
 * The value is the WT_UPDATE structure's transaction ID, timestamp, update's
 * prepare state, update type and value.
 *
 * As the key for the lookaside table is different for row- and column-store, we
 * store both key types in a WT_ITEM, building/parsing them in the code, because
 * otherwise we'd need two lookaside files with different key formats. We could
 * make the lookaside table's key standard by moving the source key into the
 * lookaside table value, but that doesn't make the coding any simpler, and it
 * makes the lookaside table's value more likely to overflow the page size when
 * the row-store key is relatively large.
 */
#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
#define	WT_LOOKASIDE_COMPRESSOR	"snappy"
#else
#define	WT_LOOKASIDE_COMPRESSOR	"none"
#endif
#define	WT_LAS_CONFIG							\
    "key_format=" WT_UNCHECKED_STRING(QIQu)				\
    ",value_format=" WT_UNCHECKED_STRING(QuBBu)				\
    ",block_compressor=" WT_LOOKASIDE_COMPRESSOR			\
    ",leaf_value_max=64MB"						\
    ",prefix_compression=true"

/*
 * WT_PAGE_LOOKASIDE --
 *	Related information for on-disk pages with lookaside entries.
 */
struct __wt_page_lookaside {
	uint64_t las_pageid;		/* Page ID in lookaside */
	uint64_t max_txn;		/* Maximum transaction ID */
	uint64_t unstable_txn;		/* First transaction ID not on page */
	WT_DECL_TIMESTAMP(max_timestamp)/* Maximum timestamp */
	WT_DECL_TIMESTAMP(unstable_timestamp)/* First timestamp not on page */
	bool eviction_to_lookaside;	/* Revert to lookaside on eviction */
	bool resolved;			/* History has been read into cache */
	bool skew_newest;		/* Page image has newest versions */
};

/*
 * WT_PAGE_MODIFY --
 *	When a page is modified, there's additional information to maintain.
 */
struct __wt_page_modify {
	/* The first unwritten transaction ID (approximate). */
	uint64_t first_dirty_txn;

	/* The transaction state last time eviction was attempted. */
	uint64_t last_evict_pass_gen;
	uint64_t last_eviction_id;
	WT_DECL_TIMESTAMP(last_eviction_timestamp)

#ifdef HAVE_DIAGNOSTIC
	/* Check that transaction time moves forward. */
	uint64_t last_oldest_id;
#endif

	/* Avoid checking for obsolete updates during checkpoints. */
	uint64_t obsolete_check_txn;
	WT_DECL_TIMESTAMP(obsolete_check_timestamp)

	/* The largest transaction seen on the page by reconciliation. */
	uint64_t rec_max_txn;
	WT_DECL_TIMESTAMP(rec_max_timestamp)

	/* Stable timestamp at last reconciliation. */
	WT_DECL_TIMESTAMP(last_stable_timestamp)

	/* The largest update transaction ID (approximate). */
	uint64_t update_txn;

	/* Dirty bytes added to the cache. */
	size_t bytes_dirty;

	/*
	 * When pages are reconciled, the result is one or more replacement
	 * blocks.  A replacement block can be in one of two states: it was
	 * written to disk, and so we have a block address, or it contained
	 * unresolved modifications and we have a disk image for it with a
	 * list of those unresolved modifications.  The former is the common
	 * case: we only build lists of unresolved modifications when we're
	 * evicting a page, and we only expect to see unresolved modifications
	 * on a page being evicted in the case of a hot page that's too large
	 * to keep in memory as it is.  In other words, checkpoints will skip
	 * unresolved modifications, and will write the blocks rather than
	 * build lists of unresolved modifications.
	 *
	 * Ugly union/struct layout to conserve memory: we never have both
	 * a replace address and multiple replacement blocks.
	 */
	union {
	struct {			/* Single, written replacement block */
		WT_ADDR	 replace;

		/*
		 * A disk image that may or may not have been written, used to
		 * re-instantiate the page in memory.
		 */
		void	*disk_image;

		/* The page has lookaside entries. */
		WT_PAGE_LOOKASIDE page_las;
	} r;
#undef	mod_replace
#define	mod_replace	u1.r.replace
#undef	mod_disk_image
#define	mod_disk_image	u1.r.disk_image
#undef	mod_page_las
#define	mod_page_las	u1.r.page_las

	struct {			/* Multiple replacement blocks */
	struct __wt_multi {
		/*
		 * Block's key: either a column-store record number or a
		 * row-store variable length byte string.
		 */
		union {
			uint64_t recno;
			WT_IKEY *ikey;
		} key;

		/*
		 * A disk image that may or may not have been written, used to
		 * re-instantiate the page in memory.
		 */
		void	*disk_image;

		/*
		 * List of unresolved updates. Updates are either a row-store
		 * insert or update list, or column-store insert list. When
		 * creating lookaside records, there is an additional value,
		 * the committed item's transaction information.
		 *
		 * If there are unresolved updates, the block wasn't written and
		 * there will always be a disk image.
		 */
		struct __wt_save_upd {
			WT_INSERT *ins;		/* Insert list reference */
			WT_ROW	  *ripcip;	/* Original on-page reference */
			WT_UPDATE *onpage_upd;
		} *supd;
		uint32_t supd_entries;

		/*
		 * Disk image was written: address, size and checksum.
		 * On subsequent reconciliations of this page, we avoid writing
		 * the block if it's unchanged by comparing size and checksum;
		 * the reuse flag is set when the block is unchanged and we're
		 * reusing a previous address.
		 */
		WT_ADDR	 addr;
		uint32_t size;
		uint32_t checksum;

		WT_PAGE_LOOKASIDE page_las;
	} *multi;
	uint32_t multi_entries;		/* Multiple blocks element count */
	} m;
#undef	mod_multi
#define	mod_multi		u1.m.multi
#undef	mod_multi_entries
#define	mod_multi_entries	u1.m.multi_entries
	} u1;

	/*
	 * Internal pages need to be able to chain root-page splits and have a
	 * special transactional eviction requirement.  Column-store leaf pages
	 * need update and append lists.
	 *
	 * Ugly union/struct layout to conserve memory: a page is either a leaf
	 * page or an internal page.
	 */
	union {
	struct {
		/*
		 * When a root page splits, we create a new page and write it;
		 * the new page can also split and so on, and we continue this
		 * process until we write a single replacement root page.  We
		 * use the root split field to track the list of created pages
		 * so they can be discarded when no longer needed.
		 */
		WT_PAGE *root_split;	/* Linked list of root split pages */
	} intl;
#undef	mod_root_split
#define	mod_root_split		u2.intl.root_split
	struct {
		/*
		 * Appended items to column-stores: there is only a single one
		 * of these active at a time per column-store tree.
		 */
		WT_INSERT_HEAD **append;

		/*
		 * Updated items in column-stores: variable-length RLE entries
		 * can expand to multiple entries, which requires some kind of
		 * list we can expand on demand.  Updated items in fixed-length
		 * files could be done based on a WT_UPDATE array as in
		 * row-stores, but there can be a very large number of bits on
		 * a single page, and the cost of the WT_UPDATE array would be
		 * huge.
		 */
		WT_INSERT_HEAD **update;

		/*
		 * Split-saved last column-store page record. If a column-store
		 * page is split, we save the first record number moved so that
		 * during reconciliation we know the page's last record and can
		 * write any implicitly created deleted records for the page.
		 */
		uint64_t split_recno;
	} column_leaf;
#undef	mod_col_append
#define	mod_col_append		u2.column_leaf.append
#undef	mod_col_update
#define	mod_col_update		u2.column_leaf.update
#undef	mod_col_split_recno
#define	mod_col_split_recno	u2.column_leaf.split_recno
	struct {
		/* Inserted items for row-store. */
		WT_INSERT_HEAD	**insert;

		/* Updated items for row-stores. */
		WT_UPDATE	**update;
	} row_leaf;
#undef	mod_row_insert
#define	mod_row_insert		u2.row_leaf.insert
#undef	mod_row_update
#define	mod_row_update		u2.row_leaf.update
	} u2;

	/*
	 * Overflow record tracking for reconciliation.  We assume overflow
	 * records are relatively rare, so we don't allocate the structures
	 * to track them until we actually see them in the data.
	 */
	struct __wt_ovfl_track {
		/*
		 * Overflow key/value address/byte-string pairs we potentially
		 * reuse each time we reconcile the page.
		 */
		WT_OVFL_REUSE	*ovfl_reuse[WT_SKIP_MAXDEPTH];

		/*
		 * Overflow key/value addresses to be discarded from the block
		 * manager after reconciliation completes successfully.
		 */
		WT_CELL **discard;
		size_t	  discard_entries;
		size_t	  discard_allocated;

		/* Cached overflow value cell/update address pairs. */
		struct {
			WT_CELL	*cell;
			uint8_t	*data;
			size_t	 size;
		} *remove;
		size_t	 remove_allocated;
		uint32_t remove_next;
	} *ovfl_track;

#define	WT_PAGE_LOCK(s, p)						\
	__wt_spin_lock((s), &(p)->modify->page_lock)
#define	WT_PAGE_TRYLOCK(s, p)						\
	__wt_spin_trylock((s), &(p)->modify->page_lock)
#define	WT_PAGE_UNLOCK(s, p)						\
	__wt_spin_unlock((s), &(p)->modify->page_lock)
	WT_SPINLOCK page_lock;		/* Page's spinlock */
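
	/*
	 * For example, a minimal sketch (illustrative only) of serializing
	 * changes to a modified page's structures:
	 *
	 *	WT_PAGE_LOCK(session, page);
	 *	... modify the page's WT_PAGE_MODIFY state ...
	 *	WT_PAGE_UNLOCK(session, page);
	 */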

	/*
	 * The page state is incremented when a page is modified.
	 *
	 * WT_PAGE_CLEAN --
	 *	The page is clean.
	 * WT_PAGE_DIRTY_FIRST --
	 *	The page is in this state after the first operation that marks a
	 *	page dirty, or when reconciliation is checking to see if it has
	 *	done enough work to be able to mark the page clean.
	 * WT_PAGE_DIRTY --
	 *	Two or more updates have been added to the page.
	 */
#define	WT_PAGE_CLEAN		0
#define	WT_PAGE_DIRTY_FIRST	1
#define	WT_PAGE_DIRTY		2
	uint32_t page_state;

#define	WT_PM_REC_EMPTY		1	/* Reconciliation: no replacement */
#define	WT_PM_REC_MULTIBLOCK	2	/* Reconciliation: multiple blocks */
#define	WT_PM_REC_REPLACE	3	/* Reconciliation: single block */
	uint8_t rec_result;		/* Reconciliation state */

#define	WT_PAGE_RS_LOOKASIDE	0x1
#define	WT_PAGE_RS_RESTORED	0x2
	uint8_t restore_state;		/* Created by restoring updates */
};

/*
 * WT_COL_RLE --
 * Variable-length column-store pages have an array of page entries with RLE
 * counts greater than 1 when reading the page, so it's not necessary to walk
 * the page counting records to find a specific entry. We can do a binary search
 * in this array, then an offset calculation to find the cell.
 */
WT_PACKED_STRUCT_BEGIN(__wt_col_rle)
	uint64_t recno;			/* Record number of first repeat. */
	uint64_t rle;			/* Repeat count. */
	uint32_t indx;			/* Slot of entry in col_var. */
WT_PACKED_STRUCT_END

/*
 * WT_PAGE --
 * The WT_PAGE structure describes the in-memory page information.
 */
struct __wt_page {
	/* Per page-type information. */
	union {
		/*
		 * Internal pages (both column- and row-store).
		 *
		 * In-memory internal pages have an array of pointers to child
		 * structures, maintained in collated order.
		 *
		 * Multiple threads of control may be searching the in-memory
		 * internal page and a child page of the internal page may
		 * cause a split at any time.  When a page splits, a new array
		 * is allocated and atomically swapped into place.  Threads in
		 * the old array continue without interruption (the old array is
		 * still valid), but have to avoid racing.  No barrier is needed
		 * because the array reference is updated atomically, but code
		 * reading the fields multiple times would be a very bad idea.
		 * Specifically, do not do this:
		 *	WT_REF **refp = page->u.intl.__index->index;
		 *	uint32_t entries = page->u.intl.__index->entries;
		 *
		 * The field is declared volatile (so the compiler knows not to
		 * read it multiple times), and we obscure the field name and
		 * use a copy macro in all references to the field (so the code
		 * doesn't read it multiple times).
		 */
		struct {
			WT_REF	*parent_ref;	/* Parent reference */
			uint64_t split_gen;	/* Generation of last split */

			struct __wt_page_index {
				uint32_t entries;
				uint32_t deleted_entries;
				WT_REF	**index;
			} * volatile __index;	/* Collated children */
		} intl;
#undef	pg_intl_parent_ref
#define	pg_intl_parent_ref		u.intl.parent_ref
#undef	pg_intl_split_gen
#define	pg_intl_split_gen		u.intl.split_gen

	/*
	 * Macros to copy/set the index because the name is obscured to ensure
	 * the field isn't read multiple times.
	 *
	 * There are two versions of WT_INTL_INDEX_GET because the session split
	 * generation is usually set, but it's not always required: for example,
	 * if a page is locked for splitting, or being created or destroyed.
	 */
#define	WT_INTL_INDEX_GET_SAFE(page)					\
	((page)->u.intl.__index)
#define	WT_INTL_INDEX_GET(session, page, pindex) do {			\
	WT_ASSERT(session,						\
	    __wt_session_gen(session, WT_GEN_SPLIT) != 0);		\
	(pindex) = WT_INTL_INDEX_GET_SAFE(page);			\
} while (0)
#define	WT_INTL_INDEX_SET(page, v) do {					\
	WT_WRITE_BARRIER();						\
	((page)->u.intl.__index) = (v);					\
} while (0)

	/*
	 * Macro to walk the list of references in an internal page.
	 */
#define	WT_INTL_FOREACH_BEGIN(session, page, ref) do {			\
	WT_PAGE_INDEX *__pindex;					\
	WT_REF **__refp;						\
	uint32_t __entries;						\
	WT_INTL_INDEX_GET(session, page, __pindex);			\
	for (__refp = __pindex->index,					\
	    __entries = __pindex->entries; __entries > 0; --__entries) {\
		(ref) = *__refp++;
#define	WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) do {		\
	WT_PAGE_INDEX *__pindex;					\
	WT_REF **__refp;						\
	uint32_t __entries;						\
	WT_INTL_INDEX_GET(session, page, __pindex);			\
	for (__refp = __pindex->index + __pindex->entries,		\
	    __entries = __pindex->entries; __entries > 0; --__entries) {\
		(ref) = *--__refp;
#define	WT_INTL_FOREACH_END						\
	}								\
} while (0)
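
	/*
	 * For example, a minimal sketch (illustrative only) of walking an
	 * internal page's children; the session must already hold a split
	 * generation (see WT_ENTER_PAGE_INDEX, below):
	 *
	 *	WT_REF *ref;
	 *
	 *	WT_INTL_FOREACH_BEGIN(session, page, ref) {
	 *		if (ref->state == WT_REF_MEM)
	 *			... the child page is in memory ...
	 *	} WT_INTL_FOREACH_END;
	 */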

		/* Row-store leaf page. */
		WT_ROW *row;			/* Key/value pairs */
#undef	pg_row
#define	pg_row		u.row

		/* Fixed-length column-store leaf page. */
		uint8_t *fix_bitf;		/* Values */
#undef	pg_fix_bitf
#define	pg_fix_bitf	u.fix_bitf

		/* Variable-length column-store leaf page. */
		struct {
			WT_COL *col_var;	/* Values */

			/*
			 * Variable-length column-store pages have an array
			 * of page entries with RLE counts greater than 1 when
			 * reading the page, so it's not necessary to walk the
			 * page counting records to find a specific entry. We
			 * can do a binary search in this array, then an offset
			 * calculation to find the cell.
			 *
			 * It's a separate structure to keep the page structure
			 * as small as possible.
			 */
			struct __wt_col_var_repeat {
				uint32_t   nrepeats;	/* repeat slots */
				WT_COL_RLE repeats[0];	/* lookup RLE array */
			} *repeats;
#define	WT_COL_VAR_REPEAT_SET(page)					\
	((page)->u.col_var.repeats != NULL)
		} col_var;
#undef	pg_var
#define	pg_var		u.col_var.col_var
#undef	pg_var_repeats
#define	pg_var_repeats	u.col_var.repeats->repeats
#undef	pg_var_nrepeats
#define	pg_var_nrepeats	u.col_var.repeats->nrepeats
	} u;

	/*
	 * Page entries, type and flags are positioned at the end of the WT_PAGE
	 * union to reduce cache misses in the row-store search function.
	 *
	 * The entries field only applies to leaf pages; internal pages use the
	 * page-index entries instead.
	 */
	uint32_t entries;		/* Leaf page entries */

#define	WT_PAGE_IS_INTERNAL(page)					\
	((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
#define	WT_PAGE_INVALID		0	/* Invalid page */
#define	WT_PAGE_BLOCK_MANAGER	1	/* Block-manager page */
#define	WT_PAGE_COL_FIX		2	/* Col-store fixed-len leaf */
#define	WT_PAGE_COL_INT		3	/* Col-store internal page */
#define	WT_PAGE_COL_VAR		4	/* Col-store var-length leaf page */
#define	WT_PAGE_OVFL		5	/* Overflow page */
#define	WT_PAGE_ROW_INT		6	/* Row-store internal page */
#define	WT_PAGE_ROW_LEAF	7	/* Row-store leaf page */
	uint8_t type;			/* Page type */

/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_PAGE_BUILD_KEYS	0x01u	/* Keys have been built in memory */
#define	WT_PAGE_DISK_ALLOC	0x02u	/* Disk image in allocated memory */
#define	WT_PAGE_DISK_MAPPED	0x04u	/* Disk image in mapped memory */
#define	WT_PAGE_EVICT_LRU	0x08u	/* Page is on the LRU queue */
#define	WT_PAGE_EVICT_NO_PROGRESS 0x10u	/* Eviction doesn't count as progress */
#define	WT_PAGE_OVERFLOW_KEYS	0x20u	/* Page has overflow keys */
#define	WT_PAGE_SPLIT_INSERT	0x40u	/* A leaf page was split for append */
#define	WT_PAGE_UPDATE_IGNORE	0x80u	/* Ignore updates on page discard */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint8_t flags_atomic;		/* Atomic flags, use F_*_ATOMIC */

	uint8_t unused[2];		/* Unused padding */

	/*
	 * The page's read generation acts as an LRU value for each page in the
	 * tree; it is used by the eviction server thread to select pages to be
	 * discarded from the in-memory tree.
	 *
	 * The read generation is a 64-bit value; if incremented frequently, a
	 * 32-bit value could overflow.
	 *
	 * The read generation is a piece of shared memory potentially read
	 * by many threads.  We don't want to update page read generations for
	 * in-cache workloads and suffer the cache misses, so we don't simply
	 * increment the read generation value on every access.  Instead, the
	 * read generation is incremented by the eviction server each time it
	 * becomes active.  To avoid incrementing a page's read generation too
	 * frequently, it is set to a future point.
	 *
	 * Because low read generation values have special meaning, and there
	 * are places where we manipulate the value, use an initial value well
	 * outside of the special range.
	 */
#define	WT_READGEN_NOTSET	0
#define	WT_READGEN_OLDEST	1
#define	WT_READGEN_WONT_NEED	2
#define	WT_READGEN_EVICT_SOON(readgen)					\
	((readgen) != WT_READGEN_NOTSET && (readgen) < WT_READGEN_START_VALUE)
#define	WT_READGEN_START_VALUE	100
#define	WT_READGEN_STEP		100
	uint64_t read_gen;

	size_t memory_footprint;	/* Memory attached to the page */

	/* Page's on-disk representation: NULL for pages created in memory. */
	const WT_PAGE_HEADER *dsk;

	/* If/when the page is modified, we need lots more information. */
	WT_PAGE_MODIFY *modify;

	/* This is the 64 byte boundary, try to keep hot fields above here. */

	uint64_t cache_create_gen;	/* Page create timestamp */
	uint64_t evict_pass_gen;	/* Eviction pass generation */
};

/*
 * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
 *	Return the offset/pointer of a pointer/offset in a page disk image.
 */
#define	WT_PAGE_DISK_OFFSET(page, p)					\
	WT_PTRDIFF32(p, (page)->dsk)
#define	WT_PAGE_REF_OFFSET(page, o)					\
	((void *)((uint8_t *)((page)->dsk) + (o)))
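
/*
 * For example (illustrative only), the two macros invert each other: given
 * a pointer p into a page's disk image,
 *
 *	uint32_t offset;
 *
 *	offset = WT_PAGE_DISK_OFFSET(page, p);
 *	p = WT_PAGE_REF_OFFSET(page, offset);
 *
 * first stores p's byte offset from the start of the disk image, then
 * recovers the original pointer from that offset.
 */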

/*
 * Prepare update states.
 *
 * Prepare update synchronization is based on the state field, which has the
 * following possible states:
 *
 * WT_PREPARE_INIT:
 *	The initial prepare state of either an update or a page_del structure,
 *	indicating a prepare phase has not started yet.
 *	This state has no impact on the visibility of the update's data.
 *
 * WT_PREPARE_INPROGRESS:
 *	The update is in the prepared phase.
 *
 * WT_PREPARE_LOCKED:
 *	The state is locked while a transition from INPROGRESS to RESOLVED is
 *	in progress; any reader of the state must wait for the transition to
 *	complete.
 *
 * WT_PREPARE_RESOLVED:
 *	Represents the commit state of the prepared update.
 *
 * State Transition:
 * 	From uncommitted -> prepare -> commit:
 * 	INIT --> INPROGRESS --> LOCKED --> RESOLVED
 * 	LOCKED will be a momentary phase during timestamp update.
 *
 * 	From uncommitted -> prepare -> rollback:
 * 	INIT --> INPROGRESS
 * 	The prepare state is not updated during rollback; it remains
 * 	INPROGRESS.
 */
#define	WT_PREPARE_INIT			0	/* Must be 0, as structures
						   will be default initialized
						   with 0. */
#define	WT_PREPARE_INPROGRESS		1
#define	WT_PREPARE_LOCKED		2
#define	WT_PREPARE_RESOLVED		3

/*
 * Page state.
 *
 * Synchronization is based on the WT_REF->state field, which has a number of
 * possible states:
 *
 * WT_REF_DISK:
 *	The initial setting before a page is brought into memory, and set as a
 *	result of page eviction; the page is on disk, and must be read into
 *	memory before use.  WT_REF_DISK has a value of 0 (the default state
 *	after allocating cleared memory).
 *
 * WT_REF_DELETED:
 *	The page is on disk, but has been deleted from the tree; we can delete
 *	row-store leaf pages without reading them if they don't reference
 *	overflow items.
 *
 * WT_REF_LIMBO:
 *	The page image has been loaded into memory but there is additional
 *	history in the lookaside table that has not been applied.
 *
 * WT_REF_LOCKED:
 *	Locked for exclusive access.  In eviction, this page or a parent has
 *	been selected for eviction; once hazard pointers are checked, the page
 *	will be evicted.  When reading a page that was previously deleted, it
 *	is locked until the page is in memory with records marked deleted.  The
 *	thread that set the page to WT_REF_LOCKED has exclusive access; no
 *	other thread may use the WT_REF until the state is changed.
 *
 * WT_REF_LOOKASIDE:
 *	The page is on disk (as per WT_REF_DISK) and has entries in the
 *	lookaside table that must be applied before the page can be read.
 *
 * WT_REF_MEM:
 *	Set by a reading thread once the page has been read from disk; the page
 *	is in the cache and the page reference is OK.
 *
 * WT_REF_READING:
 *	Set by a reading thread before reading an ordinary page from disk;
 *	other readers of the page wait until the read completes.  Sync can
 *	safely skip over such pages: they are clean by definition.
 *
 * WT_REF_SPLIT:
 *	Set when the page is split; the WT_REF is dead and can no longer be
 *	used.
 *
 * The life cycle of a typical page goes like this: pages are read into memory
 * from disk and their state set to WT_REF_MEM.  When the page is selected for
 * eviction, the page state is set to WT_REF_LOCKED.  In all cases, evicting
 * threads reset the page's state when finished with the page: if eviction was
 * successful (a clean page was discarded, and a dirty page was written to disk
 * and then discarded), the page state is set to WT_REF_DISK; if eviction failed
 * because the page was busy, page state is reset to WT_REF_MEM.
 *
 * Readers check the state field and if it's WT_REF_MEM, they set a hazard
 * pointer to the page, flush memory and re-confirm the page state.  If the
 * page state is unchanged, the reader has a valid reference and can proceed.
 *
 * When an evicting thread wants to discard a page from the tree, it sets the
 * WT_REF_LOCKED state, flushes memory, then checks hazard pointers.  If a
 * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page
 * to the readers.  If the evicting thread does not find a hazard pointer,
 * the page is evicted.
 */

/*
 * WT_PAGE_DELETED --
 *	Related information for truncated pages.
 */
struct __wt_page_deleted {
	volatile uint64_t txnid;		/* Transaction ID */
	WT_DECL_TIMESTAMP(timestamp)

	/*
	 * The state is used by transaction prepare to manage visibility and
	 * to pass the prepare state on to the update_list.
	 */
	volatile uint8_t prepare_state;		/* Prepare state. */

	uint32_t previous_state;		/* Previous state */

	WT_UPDATE **update_list;		/* List of updates for abort */
};

/*
 * WT_REF --
 *	A single in-memory page and the state information used to determine if
 * it's OK to dereference the pointer to the page.
 */
struct __wt_ref {
	WT_PAGE *page;			/* Page */

	/*
	 * When the tree deepens as a result of a split, the home page value
	 * changes.  Don't cache it; we need to see that change when looking
	 * up our slot in the page's index structure.
	 */
	WT_PAGE * volatile home;	/* Reference page */
	volatile uint32_t pindex_hint;	/* Reference page index hint */

#define	WT_REF_DISK	 0		/* Page is on disk */
#define	WT_REF_DELETED	 1		/* Page is on disk, but deleted */
#define	WT_REF_LIMBO	 2		/* Page is in cache without history */
#define	WT_REF_LOCKED	 3		/* Page locked for exclusive access */
#define	WT_REF_LOOKASIDE 4		/* Page is on disk with lookaside */
#define	WT_REF_MEM	 5		/* Page is in cache and valid */
#define	WT_REF_READING	 6		/* Page being read */
#define	WT_REF_SPLIT	 7		/* Parent page split (WT_REF dead) */
	volatile uint32_t state;	/* Page state */

	/*
	 * Address: on-page cell if read from backing block, off-page WT_ADDR
	 * if instantiated in-memory, or NULL if page created in-memory.
	 */
	void	*addr;

	/*
	 * The child page's key.  Do NOT change this union without reviewing
	 * __wt_ref_key.
	 */
	union {
		uint64_t recno;		/* Column-store: starting recno */
		void	*ikey;		/* Row-store: key */
	} key;
#undef	ref_recno
#define	ref_recno	key.recno
#undef	ref_ikey
#define	ref_ikey	key.ikey

	WT_PAGE_DELETED	  *page_del;	/* Deleted page information */
	WT_PAGE_LOOKASIDE *page_las;	/* Lookaside information */
};
/*
 * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
 * the compiler hasn't inserted padding which would break the world.
 */
#define	WT_REF_SIZE	56

/*
 * WT_ROW --
 * Each in-memory row-store leaf page has an array of WT_ROW structures:
 * this is created from on-page data when a page is read from the file.  It's
 * sorted by key, fixed in size, and starts with a reference to on-page data.
 *
 * Multiple threads of control may be searching the in-memory row-store pages,
 * and the key may be instantiated at any time.  Code must be able to handle
 * both when the key has not been instantiated (the key field points into the
 * page's disk image), and when the key has been instantiated (the key field
 * points outside the page's disk image).  We don't need barriers because the
 * key is updated atomically, but code that reads the key field multiple times
 * is a very, very bad idea.  Specifically, do not do this:
 *
 *	key = rip->key;
 *	if (key_is_on_page(key)) {
 *		cell = rip->key;
 *	}
 *
 * The field is declared volatile (so the compiler knows it shouldn't read it
 * multiple times), and we obscure the field name and use a copy macro in all
 * references to the field (so the code doesn't read it multiple times), all
 * to make sure we don't introduce this bug (again).
 */
struct __wt_row {	/* On-page key, on-page cell, or off-page WT_IKEY */
	void * volatile __key;
};
#define	WT_ROW_KEY_COPY(rip)	((rip)->__key)
#define	WT_ROW_KEY_SET(rip, v)	((rip)->__key) = (void *)(v)
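
/*
 * For example, a minimal sketch (illustrative only) of the safe pattern:
 * copy the key once, then only examine the copy (key_is_on_page is a
 * stand-in for whatever on-page/off-page test the caller uses):
 *
 *	void *copy;
 *
 *	copy = WT_ROW_KEY_COPY(rip);
 *	if (key_is_on_page(copy))
 *		... copy references the page's disk image ...
 *	else
 *		... copy is an off-page WT_IKEY ...
 */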

/*
 * WT_ROW_FOREACH --
 *	Walk the entries of an in-memory row-store leaf page.
 */
#define	WT_ROW_FOREACH(page, rip, i)					\
	for ((i) = (page)->entries,					\
	    (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i))
#define	WT_ROW_FOREACH_REVERSE(page, rip, i)				\
	for ((i) = (page)->entries,					\
	    (rip) = (page)->pg_row + ((page)->entries - 1);		\
	    (i) > 0; --(rip), --(i))
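
/*
 * For example (illustrative only): visit every row on a leaf page in key
 * order.
 *
 *	WT_ROW *rip;
 *	uint32_t i;
 *
 *	WT_ROW_FOREACH(page, rip, i)
 *		... process WT_ROW_KEY_COPY(rip) ...
 */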

/*
 * WT_ROW_SLOT --
 *	Return the 0-based array offset based on a WT_ROW reference.
 */
#define	WT_ROW_SLOT(page, rip)						\
	((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row))

/*
 * WT_COL --
 * Each in-memory variable-length column-store leaf page has an array of WT_COL
 * structures: this is created from on-page data when a page is read from the
 * file.  It's fixed in size, and references data on the page.
 */
struct __wt_col {
	/*
	 * Variable-length column-store data references are page offsets, not
	 * pointers (we boldly re-invent short pointers).  The trade-off is 4B
	 * per K/V pair on a 64-bit machine vs. a single cycle for the addition
	 * of a base pointer.  The on-page data is a WT_CELL (same as row-store
	 * pages).
	 *
	 * If the value is 0, it's a single, deleted record.
	 *
	 * Obscure the field name: code shouldn't use WT_COL->__col_value; the
	 * public interface is WT_COL_PTR and WT_COL_PTR_SET.
	 */
	uint32_t __col_value;
};

/*
 * WT_COL_PTR, WT_COL_PTR_SET --
 *	Return/Set a pointer corresponding to the data offset. (If the item does
 * not exist on the page, return a NULL.)
 */
#define	WT_COL_PTR(page, cip)						\
	((cip)->__col_value == 0 ?					\
	    NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value))
#define	WT_COL_PTR_SET(cip, value)					\
	(cip)->__col_value = (value)
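
/*
 * For example (illustrative only): a zero offset means a single, deleted
 * record, so check for NULL before using the cell.
 *
 *	WT_CELL *cell;
 *
 *	if ((cell = WT_COL_PTR(page, cip)) == NULL)
 *		... single, deleted record ...
 */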

/*
 * WT_COL_FOREACH --
 *	Walk the entries of a variable-length column-store leaf page.
 */
#define	WT_COL_FOREACH(page, cip, i)					\
	for ((i) = (page)->entries,					\
	    (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i))

/*
 * WT_COL_SLOT --
 *	Return the 0-based array offset based on a WT_COL reference.
 */
#define	WT_COL_SLOT(page, cip)						\
	((uint32_t)(((WT_COL *)(cip)) - (page)->pg_var))

/*
 * WT_IKEY --
 * Instantiated key: row-store keys are usually prefix compressed and sometimes
 * Huffman encoded or overflow objects.  Normally, a row-store page in-memory
 * key points to the on-page WT_CELL, but in some cases, we instantiate the key
 * in memory, in which case the row-store page in-memory key points to a WT_IKEY
 * structure.
 */
struct __wt_ikey {
	uint32_t size;			/* Key length */

	/*
	 * If we no longer point to the key's on-page WT_CELL, we can't find its
	 * related value.  Save the offset of the key cell in the page.
	 *
	 * Row-store cell references are page offsets, not pointers (we boldly
	 * re-invent short pointers).  The trade-off is 4B per K/V pair on a
	 * 64-bit machine vs. a single cycle for the addition of a base pointer.
	 */
	uint32_t  cell_offset;

	/* The key bytes immediately follow the WT_IKEY structure. */
#define	WT_IKEY_DATA(ikey)						\
	((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
};
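
/*
 * For example (illustrative only): the key's bytes live directly after the
 * structure, so a single allocation holds both.
 *
 *	WT_IKEY *ikey;
 *	const void *key_data;
 *
 *	key_data = WT_IKEY_DATA(ikey);
 *	... the key is key_data, ikey->size bytes long ...
 */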

/*
 * WT_UPDATE --
 * Entries on leaf pages can be updated, either modified or deleted.  Updates
 * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
 * page's WT_UPDATE array.  When the first element on a page is updated, the
 * WT_UPDATE array is allocated, with one slot for every existing element in
 * the page.  A slot points to a WT_UPDATE structure; if more than one update
 * is done for an entry, WT_UPDATE structures are formed into a forward-linked
 * list.
 */
struct __wt_update {
	volatile uint64_t txnid;	/* transaction ID */
#if WT_TIMESTAMP_SIZE == 8
	WT_DECL_TIMESTAMP(timestamp)	/* aligned uint64_t timestamp */
#endif

	WT_UPDATE *next;		/* forward-linked list */

	uint32_t size;			/* data length */

#define	WT_UPDATE_INVALID	0	/* diagnostic check */
#define	WT_UPDATE_BIRTHMARK	1	/* transaction for on-page value */
#define	WT_UPDATE_MODIFY	2	/* partial-update modify value */
#define	WT_UPDATE_RESERVE	3	/* reserved */
#define	WT_UPDATE_STANDARD	4	/* complete value */
#define	WT_UPDATE_TOMBSTONE	5	/* deleted */
	uint8_t type;			/* type (one byte to conserve memory) */

	/* If the update includes a complete value. */
#define	WT_UPDATE_DATA_VALUE(upd)					\
	((upd)->type == WT_UPDATE_STANDARD ||				\
	(upd)->type == WT_UPDATE_TOMBSTONE)

#if WT_TIMESTAMP_SIZE != 8
	WT_DECL_TIMESTAMP(timestamp)	/* unaligned uint8_t array timestamp */
#endif

	/*
	 * The update state is used for transaction prepare to manage
	 * visibility and transitioning update structure state safely.
	 */
	volatile uint8_t prepare_state;	/* Prepare state. */

	/*
	 * Zero or more bytes of value (the payload) immediately follow the
	 * WT_UPDATE structure.  We use a C99 flexible array member which has
	 * the semantics we want.
	 */
	uint8_t data[];			/* start of the data */
};

/*
 * WT_UPDATE_SIZE is the expected structure size excluding the payload data --
 * we verify the build to ensure the compiler hasn't inserted padding.
 */
#define	WT_UPDATE_SIZE	(22 + WT_TIMESTAMP_SIZE)

/*
 * The memory size of an update: include some padding because this is such a
 * common case that the overhead of tiny allocations can swamp our cache
 * overhead calculation.
 */
#define	WT_UPDATE_MEMSIZE(upd)						\
	WT_ALIGN(WT_UPDATE_SIZE + (upd)->size, 32)
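
/*
 * For example (illustrative only, assuming WT_TIMESTAMP_SIZE is 8, so
 * WT_UPDATE_SIZE is 30): an update with a 5-byte payload is charged
 * WT_ALIGN(30 + 5, 32), that is, 64 bytes of cache.
 */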

/*
 * WT_MAX_MODIFY_UPDATE --
 *	Limit the length of update chains to avoid penalizing reads and to
 *	permit truncation.  A smaller value penalizes cases where history
 *	must be maintained, multiplying cache pressure.
 */
#define	WT_MAX_MODIFY_UPDATE	10

/*
 * WT_MODIFY_MEM_FRACTION --
 *	Limit update chains to a fraction of the base document size.
 */
#define	WT_MODIFY_MEM_FRACTION	10

/*
 * WT_INSERT --
 *
 * Row-store leaf pages support inserts of new K/V pairs.  When the first K/V
 * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
 * every existing element in the page, plus one additional slot.  A slot points
 * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
 * element that references it and before the subsequent WT_ROW element; the
 * skiplist structure has a randomly chosen depth of next pointers in each
 * inserted node.
 *
 * The additional slot is because it's possible to insert items smaller than any
 * existing key on the page: for that reason, the first slot of the insert array
 * holds keys smaller than any other key on the page.
 *
 * In column-store variable-length run-length encoded pages, a single indx
 * entry may reference a large number of records, because there's a single
 * on-page entry representing many identical records. (We don't expand those
 * entries when the page comes into memory, as that would require resources as
 * pages are moved to/from the cache, including read-only files.)  Instead, a
 * single indx entry represents all of the identical records originally found
 * on the page.
 *
 * Modifying (or deleting) run-length encoded column-store records is hard
 * because the page's entry no longer references a set of identical items.  We
 * handle this by "inserting" a new entry into the insert array, with its own
 * record number.  (This is the only case where it's possible to insert into a
 * column-store: only appends are allowed, as insert requires re-numbering
 * subsequent records.  Berkeley DB did support mutable records, but it won't
 * scale and it isn't useful enough to re-implement, IMNSHO.)
 */
struct __wt_insert {
	WT_UPDATE *upd;				/* value */

	union {
		uint64_t recno;			/* column-store record number */
		struct {
			uint32_t offset;	/* row-store key data start */
			uint32_t size;		/* row-store key data size */
		} key;
	} u;

#define	WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size)
#define	WT_INSERT_KEY(ins)						\
	((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset))
#define	WT_INSERT_RECNO(ins)	(((WT_INSERT *)(ins))->u.recno)

	WT_INSERT *next[0];			/* forward-linked skip list */
};

/*
 * Skiplist helper macros.
 */
#define	WT_SKIP_FIRST(ins_head)						\
	(((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0])
#define	WT_SKIP_LAST(ins_head)						\
	(((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0])
#define	WT_SKIP_NEXT(ins)  ((ins)->next[0])
#define	WT_SKIP_FOREACH(ins, ins_head)					\
	for ((ins) = WT_SKIP_FIRST(ins_head);				\
	    (ins) != NULL;						\
	    (ins) = WT_SKIP_NEXT(ins))
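
/*
 * For example (illustrative only): walk the lowest level of an insert
 * skiplist, visiting every inserted item in key order; the list may be
 * NULL if nothing has been inserted.
 *
 *	WT_INSERT *ins;
 *
 *	WT_SKIP_FOREACH(ins, ins_head)
 *		... process ins->upd ...
 */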

/*
 * Atomically allocate and swap a structure or array into place.
 */
#define	WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count)	do {		\
	if (((v) = (dest)) == NULL) {					\
		WT_ERR(__wt_calloc_def(s, count, &(v)));		\
		if (__wt_atomic_cas_ptr(&(dest), NULL, v))		\
			__wt_cache_page_inmem_incr(			\
			    s, page, (count) * sizeof(*(v)));		\
		else							\
			__wt_free(s, v);				\
	}								\
} while (0)
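
/*
 * For example, a minimal sketch (illustrative only) of lazily creating a
 * row-store leaf page's update array; if another thread wins the race, the
 * local allocation is freed and the caller uses the winner's array,
 * re-read from the destination:
 *
 *	WT_UPDATE **upd_array;
 *
 *	WT_PAGE_ALLOC_AND_SWAP(session, page,
 *	    page->modify->mod_row_update, upd_array, page->entries);
 */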

/*
 * WT_INSERT_HEAD --
 * 	The head of a skiplist of WT_INSERT items.
 */
struct __wt_insert_head {
	WT_INSERT *head[WT_SKIP_MAXDEPTH];	/* first item on skiplists */
	WT_INSERT *tail[WT_SKIP_MAXDEPTH];	/* last item on skiplists */
};

/*
 * The row-store leaf page insert lists are arrays of pointers to structures,
 * and may not exist.  The following macros return an array entry if the array
 * of pointers and the specific structure exist, else NULL.
 */
#define	WT_ROW_INSERT_SLOT(page, slot)					\
	((page)->modify == NULL ||					\
	    (page)->modify->mod_row_insert == NULL ?			\
	    NULL : (page)->modify->mod_row_insert[slot])
#define	WT_ROW_INSERT(page, ip)						\
	WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
#define	WT_ROW_UPDATE(page, ip)						\
	((page)->modify == NULL ||					\
	    (page)->modify->mod_row_update == NULL ?			\
	    NULL : (page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)])
/*
 * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
 * "one per WT_ROW slot" insert array.  That's because the insert array
 * requires an extra slot to hold keys that sort before any key found on the
 * original page.
 */
#define	WT_ROW_INSERT_SMALLEST(page)					\
	((page)->modify == NULL ||					\
	    (page)->modify->mod_row_insert == NULL ?			\
	    NULL : (page)->modify->mod_row_insert[(page)->entries])
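
/*
 * For example (illustrative only): find the insert list and update list
 * associated with a row; either may be NULL if nothing has been added.
 *
 *	WT_INSERT_HEAD *ins_head;
 *	WT_UPDATE *upd;
 *
 *	ins_head = WT_ROW_INSERT(page, rip);
 *	upd = WT_ROW_UPDATE(page, rip);
 */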

/*
 * The column-store leaf page update lists are arrays of pointers to structures,
 * and may not exist.  The following macros return an array entry if the array
 * of pointers and the specific structure exist, else NULL.
 */
#define	WT_COL_UPDATE_SLOT(page, slot)					\
	((page)->modify == NULL ||					\
	    (page)->modify->mod_col_update == NULL ?			\
	    NULL : (page)->modify->mod_col_update[slot])
#define	WT_COL_UPDATE(page, ip)						\
	WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))

/*
 * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
 * column-store updates for a page.
 */
#define	WT_COL_UPDATE_SINGLE(page)					\
	WT_COL_UPDATE_SLOT(page, 0)

/*
 * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length
 * appends.
 */
#define	WT_COL_APPEND(page)						\
	((page)->modify == NULL ||					\
	    (page)->modify->mod_col_append == NULL ?			\
	    NULL : (page)->modify->mod_col_append[0])

/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
#define	WT_FIX_FOREACH(btree, dsk, v, i)				\
	for ((i) = 0,							\
	    (v) = (i) < (dsk)->u.entries ?				\
	    __bit_getv(							\
	    WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0;	\
	    (i) < (dsk)->u.entries; ++(i),				\
	    (v) = __bit_getv(						\
	    WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
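
/*
 * For example (illustrative only): walk every bit-field value in a
 * fixed-length column-store disk image; each value is (btree)->bitcnt bits
 * wide.
 *
 *	uint8_t v;
 *	uint32_t i;
 *
 *	WT_FIX_FOREACH(btree, dsk, v, i)
 *		... process value v for the i-th record ...
 */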

/*
 * Manage split generation numbers.  Splits walk the list of sessions to check
 * when it is safe to free structures that have been replaced.  We also check
 * that list periodically (e.g., when wrapping up a transaction) to free any
 * memory we can.
 *
 * Before a thread enters code that will examine page indexes (which are
 * swapped out by splits), it publishes a copy of the current split generation
 * into its session.  Don't assume that threads never re-enter this code: if we
 * already have a split generation, leave it alone.  If our caller is examining
 * an index, we don't want the oldest split generation to move forward and
 * potentially free it.
 */
#define	WT_ENTER_PAGE_INDEX(session) do {				\
	uint64_t __prev_split_gen =					\
	    __wt_session_gen(session, WT_GEN_SPLIT);			\
	if (__prev_split_gen == 0)					\
		__wt_session_gen_enter(session, WT_GEN_SPLIT);

#define	WT_LEAVE_PAGE_INDEX(session)					\
	if (__prev_split_gen == 0)					\
		__wt_session_gen_leave(session, WT_GEN_SPLIT);		\
	} while (0)

#define	WT_WITH_PAGE_INDEX(session, e)					\
	WT_ENTER_PAGE_INDEX(session);					\
	(e);								\
	WT_LEAVE_PAGE_INDEX(session)
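
/*
 * For example, a minimal sketch (illustrative only; the called function is
 * hypothetical): hold a split generation across a page-index walk so the
 * index can't be freed out from under the caller.
 *
 *	WT_WITH_PAGE_INDEX(session,
 *	    ret = __hypothetical_page_walk(session, ref));
 */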