1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * WT_CELL --
11  *	Variable-length cell type.
12  *
13  * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT,
14  * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have
15  * cells after the page header.
16  *
17  * There are 4 basic cell types: keys and data (each of which has an overflow
18  * form), deleted cells and off-page references.  The cell is usually followed
19  * by additional data, varying by type: a key or data cell is followed by a set
20  * of bytes, an address cookie follows overflow or off-page cells.
21  *
22  * Deleted cells are place-holders for column-store files, where entries cannot
23  * be removed in order to preserve the record count.
24  *
25  * Here's the cell use by page type:
26  *
27  * WT_PAGE_ROW_INT (row-store internal page):
28  *	Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
29  * cell followed by a WT_CELL_ADDR_XXX cell).
30  *
31  * WT_PAGE_ROW_LEAF (row-store leaf page):
32  *	Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
33  *	normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
34  *
35  *	WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
36  *	byte count immediately following the cell.
37  *
38  * WT_PAGE_COL_INT (Column-store internal page):
39  *	Off-page references (a WT_CELL_ADDR_XXX cell).
40  *
41  * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
42  *	Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
43  * cells (a WT_CELL_DEL cell).
44  *
45  * Each cell starts with a descriptor byte:
46  *
47  * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
48  * carrying data less than 64B, where we can store the data length in the cell
49  * descriptor byte):
50  *	0x00	Not a short key/data cell
51  *	0x01	Short key cell
52  *	0x10	Short key cell, with a following prefix-compression byte
53  *	0x11	Short value cell
54  * In these cases, the other 6 bits of the descriptor byte are the data length.
55  *
56  * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
57  * (A run-length counter or a record number for variable-length column store.)
58  *
59  * Bit 4 is unused.
60  *
61  * Bits 5-8 are cell "types".
62  */
63 #define	WT_CELL_KEY_SHORT	0x01		/* Short key */
64 #define	WT_CELL_KEY_SHORT_PFX	0x02		/* Short key with prefix byte */
65 #define	WT_CELL_VALUE_SHORT	0x03		/* Short data */
66 #define	WT_CELL_SHORT_TYPE(v)	((v) & 0x03U)
67 
68 #define	WT_CELL_SHORT_MAX	63		/* Maximum short key/value */
69 #define	WT_CELL_SHORT_SHIFT	2		/* Shift for short key/value */
70 
71 #define	WT_CELL_64V		0x04		/* Associated value */
72 
73 /*
74  * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
75  * backward compatible way by adding bit 4 to the type mask and adding new types
76  * that incorporate it.
77  */
78 #define	WT_CELL_UNUSED_BIT4	0x08		/* Unused */
79 
80 /*
81  * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
82  * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
83  * page has no overflow items.  (The goal is to speed up truncation as we don't
84  * have to read pages without overflow items in order to delete them.  Note,
85  * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
86  * overflow items, the only guarantee is that if set, the page has no overflow
87  * items.)
88  *
89  * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
90  * value dictionaries: if the two values are the same, we only store them once
91  * and have the second and subsequent use reference the original.
92  */
93 #define	WT_CELL_ADDR_DEL	 (0)		/* Address: deleted */
94 #define	WT_CELL_ADDR_INT	 (1 << 4)	/* Address: internal  */
95 #define	WT_CELL_ADDR_LEAF	 (2 << 4)	/* Address: leaf */
96 #define	WT_CELL_ADDR_LEAF_NO	 (3 << 4)	/* Address: leaf no overflow */
97 #define	WT_CELL_DEL		 (4 << 4)	/* Deleted value */
98 #define	WT_CELL_KEY		 (5 << 4)	/* Key */
99 #define	WT_CELL_KEY_OVFL	 (6 << 4)	/* Overflow key */
100 #define	WT_CELL_KEY_OVFL_RM	(12 << 4)	/* Overflow key (removed) */
101 #define	WT_CELL_KEY_PFX		 (7 << 4)	/* Key with prefix byte */
102 #define	WT_CELL_VALUE		 (8 << 4)	/* Value */
103 #define	WT_CELL_VALUE_COPY	 (9 << 4)	/* Value copy */
104 #define	WT_CELL_VALUE_OVFL	(10 << 4)	/* Overflow value */
105 #define	WT_CELL_VALUE_OVFL_RM	(11 << 4)	/* Overflow value (removed) */
106 
107 #define	WT_CELL_TYPE_MASK	(0x0fU << 4)	/* Maximum 16 cell types */
108 #define	WT_CELL_TYPE(v)		((v) & WT_CELL_TYPE_MASK)
109 
110 /*
111  * When we aren't able to create a short key or value (and, in the case of a
112  * value, there's no associated RLE), the key or value is at least 64B, else
113  * we'd have been able to store it as a short cell.  Decrement/Increment the
114  * size before storing it, in the hopes that relatively small key/value sizes
115  * will pack into a single byte instead of two bytes.
116  */
117 #define	WT_CELL_SIZE_ADJUST	64
118 
119 /*
120  * WT_CELL --
121  *	Variable-length, on-page cell header.
122  */
123 struct __wt_cell {
124 	/*
125 	 * Maximum of 16 bytes:
126 	 * 1: cell descriptor byte
127 	 * 1: prefix compression count
128 	 * 9: associated 64-bit value	(uint64_t encoding, max 9 bytes)
129 	 * 5: data length		(uint32_t encoding, max 5 bytes)
130 	 *
131 	 * This calculation is pessimistic: the prefix compression count and
132 	 * 64V value overlap, the 64V value and data length are optional.
133 	 */
134 	uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
135 };
136 
137 /*
138  * WT_CELL_UNPACK --
139  *	Unpacked cell.
140  */
141 struct __wt_cell_unpack {
142 	WT_CELL *cell;			/* Cell's disk image address */
143 
144 	uint64_t v;			/* RLE count or recno */
145 
146 	/*
147 	 * !!!
148 	 * The size and __len fields are reasonably type size_t; don't change
149 	 * the type, performance drops significantly if they're type size_t.
150 	 */
151 	const void *data;		/* Data */
152 	uint32_t    size;		/* Data size */
153 
154 	uint32_t __len;			/* Cell + data length (usually) */
155 
156 	uint8_t prefix;			/* Cell prefix length */
157 
158 	uint8_t raw;			/* Raw cell type (include "shorts") */
159 	uint8_t type;			/* Cell type */
160 
161 	uint8_t ovfl;			/* boolean: cell is an overflow */
162 };
163 
164 /*
165  * WT_CELL_FOREACH --
166  *	Walk the cells on a page.
167  */
168 #define	WT_CELL_FOREACH(btree, dsk, cell, unpack, i)			\
169 	for ((cell) =							\
170 	    WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries;	\
171 	    (i) > 0;							\
172 	    (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i))
173 
174 /*
175  * __wt_cell_pack_addr --
176  *	Pack an address cell.
177  */
178 static inline size_t
__wt_cell_pack_addr(WT_CELL * cell,u_int cell_type,uint64_t recno,size_t size)179 __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
180 {
181 	uint8_t *p;
182 
183 	p = cell->__chunk + 1;
184 
185 	if (recno == WT_RECNO_OOB)
186 		cell->__chunk[0] = (uint8_t)cell_type;	/* Type */
187 	else {
188 		cell->__chunk[0] = (uint8_t)(cell_type | WT_CELL_64V);
189 		(void)__wt_vpack_uint(&p, 0, recno);	/* Record number */
190 	}
191 	(void)__wt_vpack_uint(&p, 0, (uint64_t)size);	/* Length */
192 	return (WT_PTRDIFF(p, cell));
193 }
194 
195 /*
196  * __wt_cell_pack_data --
197  *	Set a data item's WT_CELL contents.
198  */
199 static inline size_t
__wt_cell_pack_data(WT_CELL * cell,uint64_t rle,size_t size)200 __wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size)
201 {
202 	uint8_t byte, *p;
203 
204 	/*
205 	 * Short data cells without run-length encoding have 6 bits of data
206 	 * length in the descriptor byte.
207 	 */
208 	if (rle < 2 && size <= WT_CELL_SHORT_MAX) {
209 		byte = (uint8_t)size;			/* Type + length */
210 		cell->__chunk[0] = (uint8_t)
211 		    ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT);
212 		return (1);
213 	}
214 
215 	p = cell->__chunk + 1;
216 	if (rle < 2) {
217 		size -= WT_CELL_SIZE_ADJUST;
218 		cell->__chunk[0] = WT_CELL_VALUE;	/* Type */
219 	} else {
220 		cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;
221 		(void)__wt_vpack_uint(&p, 0, rle);	/* RLE */
222 	}
223 	(void)__wt_vpack_uint(&p, 0, (uint64_t)size);	/* Length */
224 	return (WT_PTRDIFF(p, cell));
225 }
226 
227 /*
228  * __wt_cell_pack_data_match --
229  *	Return if two items would have identical WT_CELLs (except for any RLE).
230  */
231 static inline int
__wt_cell_pack_data_match(WT_CELL * page_cell,WT_CELL * val_cell,const uint8_t * val_data,bool * matchp)232 __wt_cell_pack_data_match(
233     WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data,
234     bool *matchp)
235 {
236 	uint64_t av, bv;
237 	const uint8_t *a, *b;
238 	bool rle;
239 
240 	*matchp = 0;				/* Default to no-match */
241 
242 	/*
243 	 * This is a special-purpose function used by reconciliation to support
244 	 * dictionary lookups.  We're passed an on-page cell and a created cell
245 	 * plus a chunk of data we're about to write on the page, and we return
246 	 * if they would match on the page.  The column-store comparison ignores
247 	 * the RLE because the copied cell will have its own RLE.
248 	 */
249 	a = (uint8_t *)page_cell;
250 	b = (uint8_t *)val_cell;
251 
252 	if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
253 		av = a[0] >> WT_CELL_SHORT_SHIFT;
254 		++a;
255 	} else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
256 		rle = (a[0] & WT_CELL_64V) != 0;	/* Skip any RLE */
257 		++a;
258 		if (rle)
259 			WT_RET(__wt_vunpack_uint(&a, 0, &av));
260 		WT_RET(__wt_vunpack_uint(&a, 0, &av));	/* Length */
261 	} else
262 		return (0);
263 
264 	if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
265 		bv = b[0] >> WT_CELL_SHORT_SHIFT;
266 		++b;
267 	} else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
268 		rle = (b[0] & WT_CELL_64V) != 0;	/* Skip any RLE */
269 		++b;
270 		if (rle)
271 			WT_RET(__wt_vunpack_uint(&b, 0, &bv));
272 		WT_RET(__wt_vunpack_uint(&b, 0, &bv));	/* Length */
273 	} else
274 		return (0);
275 
276 	if (av == bv)
277 		*matchp = memcmp(a, val_data, av) == 0;
278 	return (0);
279 }
280 
281 /*
282  * __wt_cell_pack_copy --
283  *	Write a copy value cell.
284  */
285 static inline size_t
__wt_cell_pack_copy(WT_CELL * cell,uint64_t rle,uint64_t v)286 __wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v)
287 {
288 	uint8_t *p;
289 
290 	p = cell->__chunk + 1;
291 
292 	if (rle < 2)					/* Type */
293 		cell->__chunk[0] = WT_CELL_VALUE_COPY;
294 	else {						/* Type */
295 		cell->__chunk[0] = WT_CELL_VALUE_COPY | WT_CELL_64V;
296 		(void)__wt_vpack_uint(&p, 0, rle);	/* RLE */
297 	}
298 	(void)__wt_vpack_uint(&p, 0, v);		/* Copy offset */
299 	return (WT_PTRDIFF(p, cell));
300 }
301 
302 /*
303  * __wt_cell_pack_del --
304  *	Write a deleted value cell.
305  */
306 static inline size_t
__wt_cell_pack_del(WT_CELL * cell,uint64_t rle)307 __wt_cell_pack_del(WT_CELL *cell, uint64_t rle)
308 {
309 	uint8_t *p;
310 
311 	p = cell->__chunk + 1;
312 	if (rle < 2) {					/* Type */
313 		cell->__chunk[0] = WT_CELL_DEL;
314 		return (1);
315 	}
316 							/* Type */
317 	cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V;
318 	(void)__wt_vpack_uint(&p, 0, rle);		/* RLE */
319 	return (WT_PTRDIFF(p, cell));
320 }
321 
322 /*
323  * __wt_cell_pack_int_key --
324  *	Set a row-store internal page key's WT_CELL contents.
325  */
326 static inline size_t
__wt_cell_pack_int_key(WT_CELL * cell,size_t size)327 __wt_cell_pack_int_key(WT_CELL *cell, size_t size)
328 {
329 	uint8_t byte, *p;
330 
331 	/* Short keys have 6 bits of data length in the descriptor byte. */
332 	if (size <= WT_CELL_SHORT_MAX) {
333 		byte = (uint8_t)size;
334 		cell->__chunk[0] = (uint8_t)
335 		    ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);
336 		return (1);
337 	}
338 
339 	cell->__chunk[0] = WT_CELL_KEY;			/* Type */
340 	p = cell->__chunk + 1;
341 
342 	size -= WT_CELL_SIZE_ADJUST;
343 	(void)__wt_vpack_uint(&p, 0, (uint64_t)size);	/* Length */
344 
345 	return (WT_PTRDIFF(p, cell));
346 }
347 
348 /*
349  * __wt_cell_pack_leaf_key --
350  *	Set a row-store leaf page key's WT_CELL contents.
351  */
352 static inline size_t
__wt_cell_pack_leaf_key(WT_CELL * cell,uint8_t prefix,size_t size)353 __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
354 {
355 	uint8_t byte, *p;
356 
357 	/* Short keys have 6 bits of data length in the descriptor byte. */
358 	if (size <= WT_CELL_SHORT_MAX) {
359 		if (prefix == 0) {
360 			byte = (uint8_t)size;		/* Type + length */
361 			cell->__chunk[0] = (uint8_t)
362 			    ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);
363 			return (1);
364 		}
365 		byte = (uint8_t)size;		/* Type + length */
366 		cell->__chunk[0] = (uint8_t)
367 		    ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT_PFX);
368 		cell->__chunk[1] = prefix;	/* Prefix */
369 		return (2);
370 	}
371 
372 	if (prefix == 0) {
373 		cell->__chunk[0] = WT_CELL_KEY;		/* Type */
374 		p = cell->__chunk + 1;
375 	} else {
376 		cell->__chunk[0] = WT_CELL_KEY_PFX;	/* Type */
377 		cell->__chunk[1] = prefix;		/* Prefix */
378 		p = cell->__chunk + 2;
379 	}
380 
381 	size -= WT_CELL_SIZE_ADJUST;
382 	(void)__wt_vpack_uint(&p, 0, (uint64_t)size);	/* Length */
383 
384 	return (WT_PTRDIFF(p, cell));
385 }
386 
387 /*
388  * __wt_cell_pack_ovfl --
389  *	Pack an overflow cell.
390  */
391 static inline size_t
__wt_cell_pack_ovfl(WT_CELL * cell,uint8_t type,uint64_t rle,size_t size)392 __wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size)
393 {
394 	uint8_t *p;
395 
396 	p = cell->__chunk + 1;
397 	if (rle < 2)					/* Type */
398 		cell->__chunk[0] = type;
399 	else {
400 		cell->__chunk[0] = type | WT_CELL_64V;
401 		(void)__wt_vpack_uint(&p, 0, rle);	/* RLE */
402 	}
403 	(void)__wt_vpack_uint(&p, 0, (uint64_t)size);	/* Length */
404 	return (WT_PTRDIFF(p, cell));
405 }
406 
407 /*
408  * __wt_cell_rle --
409  *	Return the cell's RLE value.
410  */
411 static inline uint64_t
__wt_cell_rle(WT_CELL_UNPACK * unpack)412 __wt_cell_rle(WT_CELL_UNPACK *unpack)
413 {
414 	/*
415 	 * Any item with only 1 occurrence is stored with an RLE of 0, that is,
416 	 * without any RLE at all.  This code is a single place to handle that
417 	 * correction, for simplicity.
418 	 */
419 	return (unpack->v < 2 ? 1 : unpack->v);
420 }
421 
422 /*
423  * __wt_cell_total_len --
424  *	Return the cell's total length, including data.
425  */
426 static inline size_t
__wt_cell_total_len(WT_CELL_UNPACK * unpack)427 __wt_cell_total_len(WT_CELL_UNPACK *unpack)
428 {
429 	/*
430 	 * The length field is specially named because it's dangerous to use it:
431 	 * it represents the length of the current cell (normally used for the
432 	 * loop that walks through cells on the page), but occasionally we want
433 	 * to copy a cell directly from the page, and what we need is the cell's
434 	 * total length. The problem is dictionary-copy cells, because in that
435 	 * case, the __len field is the length of the current cell, not the cell
436 	 * for which we're returning data.  To use the __len field, you must be
437 	 * sure you're not looking at a copy cell.
438 	 */
439 	return (unpack->__len);
440 }
441 
442 /*
443  * __wt_cell_type --
444  *	Return the cell's type (collapsing special types).
445  */
446 static inline u_int
__wt_cell_type(WT_CELL * cell)447 __wt_cell_type(WT_CELL *cell)
448 {
449 	u_int type;
450 
451 	switch (WT_CELL_SHORT_TYPE(cell->__chunk[0])) {
452 	case WT_CELL_KEY_SHORT:
453 	case WT_CELL_KEY_SHORT_PFX:
454 		return (WT_CELL_KEY);
455 	case WT_CELL_VALUE_SHORT:
456 		return (WT_CELL_VALUE);
457 	}
458 
459 	switch (type = WT_CELL_TYPE(cell->__chunk[0])) {
460 	case WT_CELL_KEY_PFX:
461 		return (WT_CELL_KEY);
462 	case WT_CELL_KEY_OVFL_RM:
463 		return (WT_CELL_KEY_OVFL);
464 	case WT_CELL_VALUE_OVFL_RM:
465 		return (WT_CELL_VALUE_OVFL);
466 	}
467 	return (type);
468 }
469 
470 /*
471  * __wt_cell_type_raw --
472  *	Return the cell's type.
473  */
474 static inline u_int
__wt_cell_type_raw(WT_CELL * cell)475 __wt_cell_type_raw(WT_CELL *cell)
476 {
477 	return (WT_CELL_SHORT_TYPE(cell->__chunk[0]) == 0 ?
478 	    WT_CELL_TYPE(cell->__chunk[0]) :
479 	    WT_CELL_SHORT_TYPE(cell->__chunk[0]));
480 }
481 
482 /*
483  * __wt_cell_type_reset --
484  *	Reset the cell's type.
485  */
486 static inline void
__wt_cell_type_reset(WT_SESSION_IMPL * session,WT_CELL * cell,u_int old_type,u_int new_type)487 __wt_cell_type_reset(
488     WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type)
489 {
490 	/*
491 	 * For all current callers of this function, this should happen once
492 	 * and only once, assert we're setting what we think we're setting.
493 	 */
494 	WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell));
495 	WT_UNUSED(old_type);
496 
497 	cell->__chunk[0] =
498 	    (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type);
499 }
500 
501 /*
502  * __wt_cell_leaf_value_parse --
503  *	Return the cell if it's a row-store leaf page value, otherwise return
504  * NULL.
505  */
506 static inline WT_CELL *
__wt_cell_leaf_value_parse(WT_PAGE * page,WT_CELL * cell)507 __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
508 {
509 	/*
510 	 * This function exists so there's a place for this comment.
511 	 *
512 	 * Row-store leaf pages may have a single data cell between each key, or
513 	 * keys may be adjacent (when the data cell is empty).
514 	 *
515 	 * One special case: if the last key on a page is a key without a value,
516 	 * don't walk off the end of the page: the size of the underlying disk
517 	 * image is exact, which means the end of the last cell on the page plus
518 	 * the length of the cell should be the byte immediately after the page
519 	 * disk image.
520 	 *
521 	 * !!!
522 	 * This line of code is really a call to __wt_off_page, but we know the
523 	 * cell we're given will either be on the page or past the end of page,
524 	 * so it's a simpler check.  (I wouldn't bother, but the real problem is
525 	 * we can't call __wt_off_page directly, it's in btree.i which requires
526 	 * this file be included first.)
527 	 */
528 	if (cell >= (WT_CELL *)((uint8_t *)page->dsk + page->dsk->mem_size))
529 		return (NULL);
530 
531 	switch (__wt_cell_type_raw(cell)) {
532 	case WT_CELL_KEY:
533 	case WT_CELL_KEY_OVFL:
534 	case WT_CELL_KEY_OVFL_RM:
535 	case WT_CELL_KEY_PFX:
536 	case WT_CELL_KEY_SHORT:
537 	case WT_CELL_KEY_SHORT_PFX:
538 		return (NULL);
539 	default:
540 		return (cell);
541 	}
542 }
543 
544 /*
545  * __wt_cell_unpack_safe --
546  *	Unpack a WT_CELL into a structure during verification.
547  */
548 static inline int
__wt_cell_unpack_safe(WT_CELL * cell,WT_CELL_UNPACK * unpack,const void * start,const void * end)549 __wt_cell_unpack_safe(
550     WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *start, const void *end)
551 {
552 	struct {
553 		uint32_t len;
554 		uint64_t v;
555 	} copy;
556 	uint64_t v;
557 	const uint8_t *p;
558 
559 	copy.len = 0;
560 	copy.v = 0;			/* -Werror=maybe-uninitialized */
561 
562 	/*
563 	 * The verification code specifies start/end arguments, pointers to the
564 	 * start of the page and to 1 past the end-of-page. In which case, make
565 	 * sure all reads are inside the page image. If an error occurs, return
566 	 * an error code but don't output messages, our caller handles that.
567 	 */
568 #define	WT_CELL_LEN_CHK(t, len) do {					\
569 	if (start != NULL &&						\
570 	    ((uint8_t *)(t) < (uint8_t *)start ||			\
571 	    (((uint8_t *)(t)) + (len)) > (uint8_t *)end))		\
572 		return (WT_ERROR);	        			\
573 } while (0)
574 
575 restart:
576 	/*
577 	 * This path is performance critical for read-only trees, we're parsing
578 	 * on-page structures. For that reason we don't clear the unpacked cell
579 	 * structure (although that would be simpler), instead we make sure we
580 	 * initialize all structure elements either here or in the immediately
581 	 * following switch.
582 	 */
583 	WT_CELL_LEN_CHK(cell, 0);
584 	unpack->cell = cell;
585 	unpack->v = 0;
586 	unpack->raw = (uint8_t)__wt_cell_type_raw(cell);
587 	unpack->type = (uint8_t)__wt_cell_type(cell);
588 	unpack->ovfl = 0;
589 
590 	/*
591 	 * Handle cells with neither an RLE count or data length: short key/data
592 	 * cells have 6 bits of data length in the descriptor byte.
593 	 */
594 	switch (unpack->raw) {
595 	case WT_CELL_KEY_SHORT_PFX:
596 		WT_CELL_LEN_CHK(cell, 1);		/* skip prefix */
597 		unpack->prefix = cell->__chunk[1];
598 		unpack->data = cell->__chunk + 2;
599 		unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
600 		unpack->__len = 2 + unpack->size;
601 		goto done;
602 	case WT_CELL_KEY_SHORT:
603 	case WT_CELL_VALUE_SHORT:
604 		unpack->prefix = 0;
605 		unpack->data = cell->__chunk + 1;
606 		unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
607 		unpack->__len = 1 + unpack->size;
608 		goto done;
609 	}
610 
611 	unpack->prefix = 0;
612 	unpack->data = NULL;
613 	unpack->size = 0;
614 	unpack->__len = 0;
615 
616 	p = (uint8_t *)cell + 1;			/* skip cell */
617 
618 	/*
619 	 * Check for a prefix byte that optionally follows the cell descriptor
620 	 * byte on row-store leaf pages.
621 	 */
622 	if (unpack->raw == WT_CELL_KEY_PFX) {
623 		++p;					/* skip prefix */
624 		WT_CELL_LEN_CHK(p, 0);
625 		unpack->prefix = cell->__chunk[1];
626 	}
627 
628 	/*
629 	 * Check for an RLE count or record number that optionally follows the
630 	 * cell descriptor byte on column-store variable-length pages.
631 	 */
632 	if (cell->__chunk[0] & WT_CELL_64V)		/* skip value */
633 		WT_RET(__wt_vunpack_uint(
634 		    &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->v));
635 
636 	/*
637 	 * Handle special actions for a few different cell types and set the
638 	 * data length (deleted cells are fixed-size without length bytes,
639 	 * almost everything else has data length bytes).
640 	 */
641 	switch (unpack->raw) {
642 	case WT_CELL_VALUE_COPY:
643 		/*
644 		 * The cell is followed by an offset to a cell written earlier
645 		 * in the page.  Save/restore the length and RLE of this cell,
646 		 * we need the length to step through the set of cells on the
647 		 * page and this RLE is probably different from the RLE of the
648 		 * earlier cell.
649 		 */
650 		WT_RET(__wt_vunpack_uint(
651 		    &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
652 		copy.len = WT_PTRDIFF32(p, cell);
653 		copy.v = unpack->v;
654 		cell = (WT_CELL *)((uint8_t *)cell - v);
655 		goto restart;
656 
657 	case WT_CELL_KEY_OVFL:
658 	case WT_CELL_KEY_OVFL_RM:
659 	case WT_CELL_VALUE_OVFL:
660 	case WT_CELL_VALUE_OVFL_RM:
661 		/*
662 		 * Set overflow flag.
663 		 */
664 		unpack->ovfl = 1;
665 		/* FALLTHROUGH */
666 
667 	case WT_CELL_ADDR_DEL:
668 	case WT_CELL_ADDR_INT:
669 	case WT_CELL_ADDR_LEAF:
670 	case WT_CELL_ADDR_LEAF_NO:
671 	case WT_CELL_KEY:
672 	case WT_CELL_KEY_PFX:
673 	case WT_CELL_VALUE:
674 		/*
675 		 * The cell is followed by a 4B data length and a chunk of
676 		 * data.
677 		 */
678 		WT_RET(__wt_vunpack_uint(
679 		    &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
680 
681 		if (unpack->raw == WT_CELL_KEY ||
682 		    unpack->raw == WT_CELL_KEY_PFX ||
683 		    (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
684 			v += WT_CELL_SIZE_ADJUST;
685 
686 		unpack->data = p;
687 		unpack->size = (uint32_t)v;
688 		unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
689 		break;
690 
691 	case WT_CELL_DEL:
692 		unpack->__len = WT_PTRDIFF32(p, cell);
693 		break;
694 	default:
695 		return (WT_ERROR);		/* Unknown cell type. */
696 	}
697 
698 	/*
699 	 * Check the original cell against the full cell length (this is a
700 	 * diagnostic as well, we may be copying the cell from the page and
701 	 * we need the right length).
702 	 */
703 done:	WT_CELL_LEN_CHK(cell, unpack->__len);
704 	if (copy.len != 0) {
705 		unpack->raw = WT_CELL_VALUE_COPY;
706 		unpack->__len = copy.len;
707 		unpack->v = copy.v;
708 	}
709 
710 	return (0);
711 }
712 
713 /*
714  * __wt_cell_unpack --
715  *	Unpack a WT_CELL into a structure.
716  */
717 static inline void
__wt_cell_unpack(WT_CELL * cell,WT_CELL_UNPACK * unpack)718 __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
719 {
720 	/*
721 	 * Row-store doesn't store zero-length values on pages, but this allows
722 	 * us to pretend.
723 	 */
724 	if (cell == NULL) {
725 		unpack->cell = NULL;
726 		unpack->v = 0;
727 		unpack->data = "";
728 		unpack->size = 0;
729 		unpack->__len = 0;
730 		unpack->prefix = 0;
731 		unpack->raw = unpack->type = WT_CELL_VALUE;
732 		unpack->ovfl = 0;
733 		return;
734 	}
735 
736 	(void)__wt_cell_unpack_safe(cell, unpack, NULL, NULL);
737 }
738 
739 /*
740  * __cell_data_ref --
741  *	Set a buffer to reference the data from an unpacked cell.
742  */
743 static inline int
__cell_data_ref(WT_SESSION_IMPL * session,WT_PAGE * page,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)744 __cell_data_ref(WT_SESSION_IMPL *session,
745     WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
746 {
747 	WT_BTREE *btree;
748 	bool decoded;
749 	void *huffman;
750 
751 	btree = S2BT(session);
752 
753 	/* Reference the cell's data, optionally decode it. */
754 	switch (unpack->type) {
755 	case WT_CELL_KEY:
756 		store->data = unpack->data;
757 		store->size = unpack->size;
758 		if (page_type == WT_PAGE_ROW_INT)
759 			return (0);
760 
761 		huffman = btree->huffman_key;
762 		break;
763 	case WT_CELL_VALUE:
764 		store->data = unpack->data;
765 		store->size = unpack->size;
766 		huffman = btree->huffman_value;
767 		break;
768 	case WT_CELL_KEY_OVFL:
769 		WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
770 		if (page_type == WT_PAGE_ROW_INT || decoded)
771 			return (0);
772 
773 		huffman = btree->huffman_key;
774 		break;
775 	case WT_CELL_VALUE_OVFL:
776 		WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
777 		if (decoded)
778 			return (0);
779 		huffman = btree->huffman_value;
780 		break;
781 	WT_ILLEGAL_VALUE(session, unpack->type);
782 	}
783 
784 	return (huffman == NULL || store->size == 0 ? 0 :
785 	    __wt_huffman_decode(
786 	    session, huffman, store->data, store->size, store));
787 }
788 
789 /*
790  * __wt_dsk_cell_data_ref --
791  *	Set a buffer to reference the data from an unpacked cell.
792  *
793  * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells.  When an
794  * overflow item is deleted, its backing blocks are removed; if there are still
795  * running transactions that might need to see the overflow item, we cache a
796  * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM.  If we
797  * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
798  * page reference to look aside into the cache.  So, calling the "dsk" version
799  * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
800  * and calling the "page" version means it might be.
801  */
802 static inline int
__wt_dsk_cell_data_ref(WT_SESSION_IMPL * session,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)803 __wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
804     int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
805 {
806 	WT_ASSERT(session,
807 	    __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
808 	return (__cell_data_ref(session, NULL, page_type, unpack, store));
809 }
810 
811 /*
812  * __wt_page_cell_data_ref --
813  *	Set a buffer to reference the data from an unpacked cell.
814  */
815 static inline int
__wt_page_cell_data_ref(WT_SESSION_IMPL * session,WT_PAGE * page,WT_CELL_UNPACK * unpack,WT_ITEM * store)816 __wt_page_cell_data_ref(WT_SESSION_IMPL *session,
817     WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
818 {
819 	return (__cell_data_ref(session, page, page->type, unpack, store));
820 }
821 
822 /*
823  * __wt_cell_data_copy --
824  *	Copy the data from an unpacked cell into a buffer.
825  */
826 static inline int
__wt_cell_data_copy(WT_SESSION_IMPL * session,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)827 __wt_cell_data_copy(WT_SESSION_IMPL *session,
828     int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
829 {
830 	/*
831 	 * We have routines to both copy and reference a cell's information.  In
832 	 * most cases, all we need is a reference and we prefer that, especially
833 	 * when returning key/value items.  In a few we need a real copy: call
834 	 * the standard reference function and get a reference.  In some cases,
835 	 * a copy will be made (for example, when reading an overflow item from
836 	 * the underlying object.  If that happens, we're done, otherwise make
837 	 * a copy.
838 	 *
839 	 * We don't require two versions of this function, no callers need to
840 	 * handle WT_CELL_VALUE_OVFL_RM cells.
841 	 */
842 	WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
843 	if (!WT_DATA_IN_ITEM(store))
844 		WT_RET(__wt_buf_set(session, store, store->data, store->size));
845 	return (0);
846 }
847