1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 /*
10 * WT_CELL --
11 * Variable-length cell type.
12 *
13 * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT,
14 * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have
15 * cells after the page header.
16 *
17 * There are 4 basic cell types: keys and data (each of which has an overflow
18 * form), deleted cells and off-page references. The cell is usually followed
19 * by additional data, varying by type: a key or data cell is followed by a set
20 * of bytes, an address cookie follows overflow or off-page cells.
21 *
22 * Deleted cells are place-holders for column-store files, where entries cannot
23 * be removed in order to preserve the record count.
24 *
25 * Here's the cell use by page type:
26 *
27 * WT_PAGE_ROW_INT (row-store internal page):
28 * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
29 * cell followed by a WT_CELL_ADDR_XXX cell).
30 *
31 * WT_PAGE_ROW_LEAF (row-store leaf page):
32 * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
33 * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
34 *
35 * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
36 * byte count immediately following the cell.
37 *
38 * WT_PAGE_COL_INT (Column-store internal page):
39 * Off-page references (a WT_CELL_ADDR_XXX cell).
40 *
41 * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
42 * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
43 * cells (a WT_CELL_DEL cell).
44 *
45 * Each cell starts with a descriptor byte:
46 *
47 * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
48 * carrying data less than 64B, where we can store the data length in the cell
49 * descriptor byte):
50 * 0x00 Not a short key/data cell
51 * 0x01 Short key cell
52 * 0x10 Short key cell, with a following prefix-compression byte
53 * 0x11 Short value cell
54 * In these cases, the other 6 bits of the descriptor byte are the data length.
55 *
56 * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
57 * (A run-length counter or a record number for variable-length column store.)
58 *
59 * Bit 4 is unused.
60 *
61 * Bits 5-8 are cell "types".
62 */
63 #define WT_CELL_KEY_SHORT 0x01 /* Short key */
64 #define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
65 #define WT_CELL_VALUE_SHORT 0x03 /* Short data */
66 #define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
67
68 #define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
69 #define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
70
71 #define WT_CELL_64V 0x04 /* Associated value */
72
73 /*
74 * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
75 * backward compatible way by adding bit 4 to the type mask and adding new types
76 * that incorporate it.
77 */
78 #define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
79
80 /*
81 * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
82 * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
83 * page has no overflow items. (The goal is to speed up truncation as we don't
84 * have to read pages without overflow items in order to delete them. Note,
85 * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
86 * overflow items, the only guarantee is that if set, the page has no overflow
87 * items.)
88 *
89 * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
90 * value dictionaries: if the two values are the same, we only store them once
91 * and have the second and subsequent use reference the original.
92 */
93 #define WT_CELL_ADDR_DEL (0) /* Address: deleted */
94 #define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
95 #define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
96 #define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
97 #define WT_CELL_DEL (4 << 4) /* Deleted value */
98 #define WT_CELL_KEY (5 << 4) /* Key */
99 #define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
100 #define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
101 #define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
102 #define WT_CELL_VALUE (8 << 4) /* Value */
103 #define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
104 #define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
105 #define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
106
107 #define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
108 #define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
109
110 /*
111 * When we aren't able to create a short key or value (and, in the case of a
112 * value, there's no associated RLE), the key or value is at least 64B, else
113 * we'd have been able to store it as a short cell. Decrement/Increment the
114 * size before storing it, in the hopes that relatively small key/value sizes
115 * will pack into a single byte instead of two bytes.
116 */
117 #define WT_CELL_SIZE_ADJUST 64
118
119 /*
120 * WT_CELL --
121 * Variable-length, on-page cell header.
122 */
123 struct __wt_cell {
124 /*
125 * Maximum of 16 bytes:
126 * 1: cell descriptor byte
127 * 1: prefix compression count
128 * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
129 * 5: data length (uint32_t encoding, max 5 bytes)
130 *
131 * This calculation is pessimistic: the prefix compression count and
132 * 64V value overlap, the 64V value and data length are optional.
133 */
134 uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
135 };
136
137 /*
138 * WT_CELL_UNPACK --
139 * Unpacked cell.
140 */
141 struct __wt_cell_unpack {
142 WT_CELL *cell; /* Cell's disk image address */
143
144 uint64_t v; /* RLE count or recno */
145
146 /*
147 * !!!
148 * The size and __len fields are reasonably type size_t; don't change
149 * the type, performance drops significantly if they're type size_t.
150 */
151 const void *data; /* Data */
152 uint32_t size; /* Data size */
153
154 uint32_t __len; /* Cell + data length (usually) */
155
156 uint8_t prefix; /* Cell prefix length */
157
158 uint8_t raw; /* Raw cell type (include "shorts") */
159 uint8_t type; /* Cell type */
160
161 uint8_t ovfl; /* boolean: cell is an overflow */
162 };
163
164 /*
165 * WT_CELL_FOREACH --
166 * Walk the cells on a page.
167 */
168 #define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \
169 for ((cell) = \
170 WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \
171 (i) > 0; \
172 (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i))
173
174 /*
175 * __wt_cell_pack_addr --
176 * Pack an address cell.
177 */
178 static inline size_t
__wt_cell_pack_addr(WT_CELL * cell,u_int cell_type,uint64_t recno,size_t size)179 __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
180 {
181 uint8_t *p;
182
183 p = cell->__chunk + 1;
184
185 if (recno == WT_RECNO_OOB)
186 cell->__chunk[0] = (uint8_t)cell_type; /* Type */
187 else {
188 cell->__chunk[0] = (uint8_t)(cell_type | WT_CELL_64V);
189 (void)__wt_vpack_uint(&p, 0, recno); /* Record number */
190 }
191 (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
192 return (WT_PTRDIFF(p, cell));
193 }
194
195 /*
196 * __wt_cell_pack_data --
197 * Set a data item's WT_CELL contents.
198 */
199 static inline size_t
__wt_cell_pack_data(WT_CELL * cell,uint64_t rle,size_t size)200 __wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size)
201 {
202 uint8_t byte, *p;
203
204 /*
205 * Short data cells without run-length encoding have 6 bits of data
206 * length in the descriptor byte.
207 */
208 if (rle < 2 && size <= WT_CELL_SHORT_MAX) {
209 byte = (uint8_t)size; /* Type + length */
210 cell->__chunk[0] = (uint8_t)
211 ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT);
212 return (1);
213 }
214
215 p = cell->__chunk + 1;
216 if (rle < 2) {
217 size -= WT_CELL_SIZE_ADJUST;
218 cell->__chunk[0] = WT_CELL_VALUE; /* Type */
219 } else {
220 cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;
221 (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
222 }
223 (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
224 return (WT_PTRDIFF(p, cell));
225 }
226
227 /*
228 * __wt_cell_pack_data_match --
229 * Return if two items would have identical WT_CELLs (except for any RLE).
230 */
231 static inline int
__wt_cell_pack_data_match(WT_CELL * page_cell,WT_CELL * val_cell,const uint8_t * val_data,bool * matchp)232 __wt_cell_pack_data_match(
233 WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data,
234 bool *matchp)
235 {
236 uint64_t av, bv;
237 const uint8_t *a, *b;
238 bool rle;
239
240 *matchp = 0; /* Default to no-match */
241
242 /*
243 * This is a special-purpose function used by reconciliation to support
244 * dictionary lookups. We're passed an on-page cell and a created cell
245 * plus a chunk of data we're about to write on the page, and we return
246 * if they would match on the page. The column-store comparison ignores
247 * the RLE because the copied cell will have its own RLE.
248 */
249 a = (uint8_t *)page_cell;
250 b = (uint8_t *)val_cell;
251
252 if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
253 av = a[0] >> WT_CELL_SHORT_SHIFT;
254 ++a;
255 } else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
256 rle = (a[0] & WT_CELL_64V) != 0; /* Skip any RLE */
257 ++a;
258 if (rle)
259 WT_RET(__wt_vunpack_uint(&a, 0, &av));
260 WT_RET(__wt_vunpack_uint(&a, 0, &av)); /* Length */
261 } else
262 return (0);
263
264 if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
265 bv = b[0] >> WT_CELL_SHORT_SHIFT;
266 ++b;
267 } else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
268 rle = (b[0] & WT_CELL_64V) != 0; /* Skip any RLE */
269 ++b;
270 if (rle)
271 WT_RET(__wt_vunpack_uint(&b, 0, &bv));
272 WT_RET(__wt_vunpack_uint(&b, 0, &bv)); /* Length */
273 } else
274 return (0);
275
276 if (av == bv)
277 *matchp = memcmp(a, val_data, av) == 0;
278 return (0);
279 }
280
281 /*
282 * __wt_cell_pack_copy --
283 * Write a copy value cell.
284 */
285 static inline size_t
__wt_cell_pack_copy(WT_CELL * cell,uint64_t rle,uint64_t v)286 __wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v)
287 {
288 uint8_t *p;
289
290 p = cell->__chunk + 1;
291
292 if (rle < 2) /* Type */
293 cell->__chunk[0] = WT_CELL_VALUE_COPY;
294 else { /* Type */
295 cell->__chunk[0] = WT_CELL_VALUE_COPY | WT_CELL_64V;
296 (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
297 }
298 (void)__wt_vpack_uint(&p, 0, v); /* Copy offset */
299 return (WT_PTRDIFF(p, cell));
300 }
301
302 /*
303 * __wt_cell_pack_del --
304 * Write a deleted value cell.
305 */
306 static inline size_t
__wt_cell_pack_del(WT_CELL * cell,uint64_t rle)307 __wt_cell_pack_del(WT_CELL *cell, uint64_t rle)
308 {
309 uint8_t *p;
310
311 p = cell->__chunk + 1;
312 if (rle < 2) { /* Type */
313 cell->__chunk[0] = WT_CELL_DEL;
314 return (1);
315 }
316 /* Type */
317 cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V;
318 (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
319 return (WT_PTRDIFF(p, cell));
320 }
321
322 /*
323 * __wt_cell_pack_int_key --
324 * Set a row-store internal page key's WT_CELL contents.
325 */
326 static inline size_t
__wt_cell_pack_int_key(WT_CELL * cell,size_t size)327 __wt_cell_pack_int_key(WT_CELL *cell, size_t size)
328 {
329 uint8_t byte, *p;
330
331 /* Short keys have 6 bits of data length in the descriptor byte. */
332 if (size <= WT_CELL_SHORT_MAX) {
333 byte = (uint8_t)size;
334 cell->__chunk[0] = (uint8_t)
335 ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);
336 return (1);
337 }
338
339 cell->__chunk[0] = WT_CELL_KEY; /* Type */
340 p = cell->__chunk + 1;
341
342 size -= WT_CELL_SIZE_ADJUST;
343 (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
344
345 return (WT_PTRDIFF(p, cell));
346 }
347
348 /*
349 * __wt_cell_pack_leaf_key --
350 * Set a row-store leaf page key's WT_CELL contents.
351 */
352 static inline size_t
__wt_cell_pack_leaf_key(WT_CELL * cell,uint8_t prefix,size_t size)353 __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
354 {
355 uint8_t byte, *p;
356
357 /* Short keys have 6 bits of data length in the descriptor byte. */
358 if (size <= WT_CELL_SHORT_MAX) {
359 if (prefix == 0) {
360 byte = (uint8_t)size; /* Type + length */
361 cell->__chunk[0] = (uint8_t)
362 ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);
363 return (1);
364 }
365 byte = (uint8_t)size; /* Type + length */
366 cell->__chunk[0] = (uint8_t)
367 ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT_PFX);
368 cell->__chunk[1] = prefix; /* Prefix */
369 return (2);
370 }
371
372 if (prefix == 0) {
373 cell->__chunk[0] = WT_CELL_KEY; /* Type */
374 p = cell->__chunk + 1;
375 } else {
376 cell->__chunk[0] = WT_CELL_KEY_PFX; /* Type */
377 cell->__chunk[1] = prefix; /* Prefix */
378 p = cell->__chunk + 2;
379 }
380
381 size -= WT_CELL_SIZE_ADJUST;
382 (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
383
384 return (WT_PTRDIFF(p, cell));
385 }
386
387 /*
388 * __wt_cell_pack_ovfl --
389 * Pack an overflow cell.
390 */
391 static inline size_t
__wt_cell_pack_ovfl(WT_CELL * cell,uint8_t type,uint64_t rle,size_t size)392 __wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size)
393 {
394 uint8_t *p;
395
396 p = cell->__chunk + 1;
397 if (rle < 2) /* Type */
398 cell->__chunk[0] = type;
399 else {
400 cell->__chunk[0] = type | WT_CELL_64V;
401 (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
402 }
403 (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
404 return (WT_PTRDIFF(p, cell));
405 }
406
407 /*
408 * __wt_cell_rle --
409 * Return the cell's RLE value.
410 */
411 static inline uint64_t
__wt_cell_rle(WT_CELL_UNPACK * unpack)412 __wt_cell_rle(WT_CELL_UNPACK *unpack)
413 {
414 /*
415 * Any item with only 1 occurrence is stored with an RLE of 0, that is,
416 * without any RLE at all. This code is a single place to handle that
417 * correction, for simplicity.
418 */
419 return (unpack->v < 2 ? 1 : unpack->v);
420 }
421
422 /*
423 * __wt_cell_total_len --
424 * Return the cell's total length, including data.
425 */
426 static inline size_t
__wt_cell_total_len(WT_CELL_UNPACK * unpack)427 __wt_cell_total_len(WT_CELL_UNPACK *unpack)
428 {
429 /*
430 * The length field is specially named because it's dangerous to use it:
431 * it represents the length of the current cell (normally used for the
432 * loop that walks through cells on the page), but occasionally we want
433 * to copy a cell directly from the page, and what we need is the cell's
434 * total length. The problem is dictionary-copy cells, because in that
435 * case, the __len field is the length of the current cell, not the cell
436 * for which we're returning data. To use the __len field, you must be
437 * sure you're not looking at a copy cell.
438 */
439 return (unpack->__len);
440 }
441
442 /*
443 * __wt_cell_type --
444 * Return the cell's type (collapsing special types).
445 */
446 static inline u_int
__wt_cell_type(WT_CELL * cell)447 __wt_cell_type(WT_CELL *cell)
448 {
449 u_int type;
450
451 switch (WT_CELL_SHORT_TYPE(cell->__chunk[0])) {
452 case WT_CELL_KEY_SHORT:
453 case WT_CELL_KEY_SHORT_PFX:
454 return (WT_CELL_KEY);
455 case WT_CELL_VALUE_SHORT:
456 return (WT_CELL_VALUE);
457 }
458
459 switch (type = WT_CELL_TYPE(cell->__chunk[0])) {
460 case WT_CELL_KEY_PFX:
461 return (WT_CELL_KEY);
462 case WT_CELL_KEY_OVFL_RM:
463 return (WT_CELL_KEY_OVFL);
464 case WT_CELL_VALUE_OVFL_RM:
465 return (WT_CELL_VALUE_OVFL);
466 }
467 return (type);
468 }
469
470 /*
471 * __wt_cell_type_raw --
472 * Return the cell's type.
473 */
474 static inline u_int
__wt_cell_type_raw(WT_CELL * cell)475 __wt_cell_type_raw(WT_CELL *cell)
476 {
477 return (WT_CELL_SHORT_TYPE(cell->__chunk[0]) == 0 ?
478 WT_CELL_TYPE(cell->__chunk[0]) :
479 WT_CELL_SHORT_TYPE(cell->__chunk[0]));
480 }
481
482 /*
483 * __wt_cell_type_reset --
484 * Reset the cell's type.
485 */
486 static inline void
__wt_cell_type_reset(WT_SESSION_IMPL * session,WT_CELL * cell,u_int old_type,u_int new_type)487 __wt_cell_type_reset(
488 WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type)
489 {
490 /*
491 * For all current callers of this function, this should happen once
492 * and only once, assert we're setting what we think we're setting.
493 */
494 WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell));
495 WT_UNUSED(old_type);
496
497 cell->__chunk[0] =
498 (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type);
499 }
500
501 /*
502 * __wt_cell_leaf_value_parse --
503 * Return the cell if it's a row-store leaf page value, otherwise return
504 * NULL.
505 */
506 static inline WT_CELL *
__wt_cell_leaf_value_parse(WT_PAGE * page,WT_CELL * cell)507 __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
508 {
509 /*
510 * This function exists so there's a place for this comment.
511 *
512 * Row-store leaf pages may have a single data cell between each key, or
513 * keys may be adjacent (when the data cell is empty).
514 *
515 * One special case: if the last key on a page is a key without a value,
516 * don't walk off the end of the page: the size of the underlying disk
517 * image is exact, which means the end of the last cell on the page plus
518 * the length of the cell should be the byte immediately after the page
519 * disk image.
520 *
521 * !!!
522 * This line of code is really a call to __wt_off_page, but we know the
523 * cell we're given will either be on the page or past the end of page,
524 * so it's a simpler check. (I wouldn't bother, but the real problem is
525 * we can't call __wt_off_page directly, it's in btree.i which requires
526 * this file be included first.)
527 */
528 if (cell >= (WT_CELL *)((uint8_t *)page->dsk + page->dsk->mem_size))
529 return (NULL);
530
531 switch (__wt_cell_type_raw(cell)) {
532 case WT_CELL_KEY:
533 case WT_CELL_KEY_OVFL:
534 case WT_CELL_KEY_OVFL_RM:
535 case WT_CELL_KEY_PFX:
536 case WT_CELL_KEY_SHORT:
537 case WT_CELL_KEY_SHORT_PFX:
538 return (NULL);
539 default:
540 return (cell);
541 }
542 }
543
544 /*
545 * __wt_cell_unpack_safe --
546 * Unpack a WT_CELL into a structure during verification.
547 */
548 static inline int
__wt_cell_unpack_safe(WT_CELL * cell,WT_CELL_UNPACK * unpack,const void * start,const void * end)549 __wt_cell_unpack_safe(
550 WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *start, const void *end)
551 {
552 struct {
553 uint32_t len;
554 uint64_t v;
555 } copy;
556 uint64_t v;
557 const uint8_t *p;
558
559 copy.len = 0;
560 copy.v = 0; /* -Werror=maybe-uninitialized */
561
562 /*
563 * The verification code specifies start/end arguments, pointers to the
564 * start of the page and to 1 past the end-of-page. In which case, make
565 * sure all reads are inside the page image. If an error occurs, return
566 * an error code but don't output messages, our caller handles that.
567 */
568 #define WT_CELL_LEN_CHK(t, len) do { \
569 if (start != NULL && \
570 ((uint8_t *)(t) < (uint8_t *)start || \
571 (((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \
572 return (WT_ERROR); \
573 } while (0)
574
575 restart:
576 /*
577 * This path is performance critical for read-only trees, we're parsing
578 * on-page structures. For that reason we don't clear the unpacked cell
579 * structure (although that would be simpler), instead we make sure we
580 * initialize all structure elements either here or in the immediately
581 * following switch.
582 */
583 WT_CELL_LEN_CHK(cell, 0);
584 unpack->cell = cell;
585 unpack->v = 0;
586 unpack->raw = (uint8_t)__wt_cell_type_raw(cell);
587 unpack->type = (uint8_t)__wt_cell_type(cell);
588 unpack->ovfl = 0;
589
590 /*
591 * Handle cells with neither an RLE count or data length: short key/data
592 * cells have 6 bits of data length in the descriptor byte.
593 */
594 switch (unpack->raw) {
595 case WT_CELL_KEY_SHORT_PFX:
596 WT_CELL_LEN_CHK(cell, 1); /* skip prefix */
597 unpack->prefix = cell->__chunk[1];
598 unpack->data = cell->__chunk + 2;
599 unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
600 unpack->__len = 2 + unpack->size;
601 goto done;
602 case WT_CELL_KEY_SHORT:
603 case WT_CELL_VALUE_SHORT:
604 unpack->prefix = 0;
605 unpack->data = cell->__chunk + 1;
606 unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
607 unpack->__len = 1 + unpack->size;
608 goto done;
609 }
610
611 unpack->prefix = 0;
612 unpack->data = NULL;
613 unpack->size = 0;
614 unpack->__len = 0;
615
616 p = (uint8_t *)cell + 1; /* skip cell */
617
618 /*
619 * Check for a prefix byte that optionally follows the cell descriptor
620 * byte on row-store leaf pages.
621 */
622 if (unpack->raw == WT_CELL_KEY_PFX) {
623 ++p; /* skip prefix */
624 WT_CELL_LEN_CHK(p, 0);
625 unpack->prefix = cell->__chunk[1];
626 }
627
628 /*
629 * Check for an RLE count or record number that optionally follows the
630 * cell descriptor byte on column-store variable-length pages.
631 */
632 if (cell->__chunk[0] & WT_CELL_64V) /* skip value */
633 WT_RET(__wt_vunpack_uint(
634 &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->v));
635
636 /*
637 * Handle special actions for a few different cell types and set the
638 * data length (deleted cells are fixed-size without length bytes,
639 * almost everything else has data length bytes).
640 */
641 switch (unpack->raw) {
642 case WT_CELL_VALUE_COPY:
643 /*
644 * The cell is followed by an offset to a cell written earlier
645 * in the page. Save/restore the length and RLE of this cell,
646 * we need the length to step through the set of cells on the
647 * page and this RLE is probably different from the RLE of the
648 * earlier cell.
649 */
650 WT_RET(__wt_vunpack_uint(
651 &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
652 copy.len = WT_PTRDIFF32(p, cell);
653 copy.v = unpack->v;
654 cell = (WT_CELL *)((uint8_t *)cell - v);
655 goto restart;
656
657 case WT_CELL_KEY_OVFL:
658 case WT_CELL_KEY_OVFL_RM:
659 case WT_CELL_VALUE_OVFL:
660 case WT_CELL_VALUE_OVFL_RM:
661 /*
662 * Set overflow flag.
663 */
664 unpack->ovfl = 1;
665 /* FALLTHROUGH */
666
667 case WT_CELL_ADDR_DEL:
668 case WT_CELL_ADDR_INT:
669 case WT_CELL_ADDR_LEAF:
670 case WT_CELL_ADDR_LEAF_NO:
671 case WT_CELL_KEY:
672 case WT_CELL_KEY_PFX:
673 case WT_CELL_VALUE:
674 /*
675 * The cell is followed by a 4B data length and a chunk of
676 * data.
677 */
678 WT_RET(__wt_vunpack_uint(
679 &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
680
681 if (unpack->raw == WT_CELL_KEY ||
682 unpack->raw == WT_CELL_KEY_PFX ||
683 (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
684 v += WT_CELL_SIZE_ADJUST;
685
686 unpack->data = p;
687 unpack->size = (uint32_t)v;
688 unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
689 break;
690
691 case WT_CELL_DEL:
692 unpack->__len = WT_PTRDIFF32(p, cell);
693 break;
694 default:
695 return (WT_ERROR); /* Unknown cell type. */
696 }
697
698 /*
699 * Check the original cell against the full cell length (this is a
700 * diagnostic as well, we may be copying the cell from the page and
701 * we need the right length).
702 */
703 done: WT_CELL_LEN_CHK(cell, unpack->__len);
704 if (copy.len != 0) {
705 unpack->raw = WT_CELL_VALUE_COPY;
706 unpack->__len = copy.len;
707 unpack->v = copy.v;
708 }
709
710 return (0);
711 }
712
713 /*
714 * __wt_cell_unpack --
715 * Unpack a WT_CELL into a structure.
716 */
717 static inline void
__wt_cell_unpack(WT_CELL * cell,WT_CELL_UNPACK * unpack)718 __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
719 {
720 /*
721 * Row-store doesn't store zero-length values on pages, but this allows
722 * us to pretend.
723 */
724 if (cell == NULL) {
725 unpack->cell = NULL;
726 unpack->v = 0;
727 unpack->data = "";
728 unpack->size = 0;
729 unpack->__len = 0;
730 unpack->prefix = 0;
731 unpack->raw = unpack->type = WT_CELL_VALUE;
732 unpack->ovfl = 0;
733 return;
734 }
735
736 (void)__wt_cell_unpack_safe(cell, unpack, NULL, NULL);
737 }
738
739 /*
740 * __cell_data_ref --
741 * Set a buffer to reference the data from an unpacked cell.
742 */
743 static inline int
__cell_data_ref(WT_SESSION_IMPL * session,WT_PAGE * page,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)744 __cell_data_ref(WT_SESSION_IMPL *session,
745 WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
746 {
747 WT_BTREE *btree;
748 bool decoded;
749 void *huffman;
750
751 btree = S2BT(session);
752
753 /* Reference the cell's data, optionally decode it. */
754 switch (unpack->type) {
755 case WT_CELL_KEY:
756 store->data = unpack->data;
757 store->size = unpack->size;
758 if (page_type == WT_PAGE_ROW_INT)
759 return (0);
760
761 huffman = btree->huffman_key;
762 break;
763 case WT_CELL_VALUE:
764 store->data = unpack->data;
765 store->size = unpack->size;
766 huffman = btree->huffman_value;
767 break;
768 case WT_CELL_KEY_OVFL:
769 WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
770 if (page_type == WT_PAGE_ROW_INT || decoded)
771 return (0);
772
773 huffman = btree->huffman_key;
774 break;
775 case WT_CELL_VALUE_OVFL:
776 WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
777 if (decoded)
778 return (0);
779 huffman = btree->huffman_value;
780 break;
781 WT_ILLEGAL_VALUE(session, unpack->type);
782 }
783
784 return (huffman == NULL || store->size == 0 ? 0 :
785 __wt_huffman_decode(
786 session, huffman, store->data, store->size, store));
787 }
788
789 /*
790 * __wt_dsk_cell_data_ref --
791 * Set a buffer to reference the data from an unpacked cell.
792 *
793 * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an
794 * overflow item is deleted, its backing blocks are removed; if there are still
795 * running transactions that might need to see the overflow item, we cache a
796 * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we
797 * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
798 * page reference to look aside into the cache. So, calling the "dsk" version
799 * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
800 * and calling the "page" version means it might be.
801 */
802 static inline int
__wt_dsk_cell_data_ref(WT_SESSION_IMPL * session,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)803 __wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
804 int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
805 {
806 WT_ASSERT(session,
807 __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
808 return (__cell_data_ref(session, NULL, page_type, unpack, store));
809 }
810
811 /*
812 * __wt_page_cell_data_ref --
813 * Set a buffer to reference the data from an unpacked cell.
814 */
815 static inline int
__wt_page_cell_data_ref(WT_SESSION_IMPL * session,WT_PAGE * page,WT_CELL_UNPACK * unpack,WT_ITEM * store)816 __wt_page_cell_data_ref(WT_SESSION_IMPL *session,
817 WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
818 {
819 return (__cell_data_ref(session, page, page->type, unpack, store));
820 }
821
822 /*
823 * __wt_cell_data_copy --
824 * Copy the data from an unpacked cell into a buffer.
825 */
826 static inline int
__wt_cell_data_copy(WT_SESSION_IMPL * session,int page_type,WT_CELL_UNPACK * unpack,WT_ITEM * store)827 __wt_cell_data_copy(WT_SESSION_IMPL *session,
828 int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
829 {
830 /*
831 * We have routines to both copy and reference a cell's information. In
832 * most cases, all we need is a reference and we prefer that, especially
833 * when returning key/value items. In a few we need a real copy: call
834 * the standard reference function and get a reference. In some cases,
835 * a copy will be made (for example, when reading an overflow item from
836 * the underlying object. If that happens, we're done, otherwise make
837 * a copy.
838 *
839 * We don't require two versions of this function, no callers need to
840 * handle WT_CELL_VALUE_OVFL_RM cells.
841 */
842 WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
843 if (!WT_DATA_IN_ITEM(store))
844 WT_RET(__wt_buf_set(session, store, store->data, store->size));
845 return (0);
846 }
847