1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * Initialize a static WT_CURSOR structure.
11  */
12 #define	WT_CURSOR_STATIC_INIT(n,					\
13 	get_key,							\
14 	get_value,							\
15 	set_key,							\
16 	set_value,							\
17 	compare,							\
18 	equals,								\
19 	next,								\
20 	prev,								\
21 	reset,								\
22 	search,								\
23 	search_near,							\
24 	insert,								\
25 	modify,								\
26 	update,								\
27 	remove,								\
28 	reserve,							\
29 	reconfigure,							\
30 	cache,								\
31 	reopen,								\
32 	close)								\
33 	static const WT_CURSOR n = {					\
34 	NULL,				/* session */			\
35 	NULL,				/* uri */			\
36 	NULL,				/* key_format */		\
37 	NULL,				/* value_format */		\
38 	get_key,							\
39 	get_value,							\
40 	set_key,							\
41 	set_value,							\
42 	compare,							\
43 	equals,								\
44 	next,								\
45 	prev,								\
46 	reset,								\
47 	search,								\
48 	search_near,							\
49 	insert,								\
50 	modify,								\
51 	update,								\
52 	remove,								\
53 	reserve,							\
54 	close,								\
55 	reconfigure,							\
56 	cache,								\
57 	reopen,								\
58 	0,				/* uri_hash */			\
59 	{ NULL, NULL },			/* TAILQ_ENTRY q */		\
60 	0,				/* recno key */			\
61 	{ 0 },				/* recno raw buffer */		\
62 	NULL,				/* json_private */		\
63 	NULL,				/* lang_private */		\
64 	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM key */		\
65 	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM value */		\
66 	0,				/* int saved_err */		\
67 	NULL,				/* internal_uri */		\
68 	0				/* uint32_t flags */		\
69 }
70 
71 struct __wt_cursor_backup {
72 	WT_CURSOR iface;
73 
74 	size_t next;			/* Cursor position */
75 	WT_FSTREAM *bfs;		/* Backup file stream */
76 	uint32_t maxid;			/* Maximum log file ID seen */
77 
78 	char **list;			/* List of files to be copied. */
79 	size_t list_allocated;
80 	size_t list_next;
81 
82 /* AUTOMATIC FLAG VALUE GENERATION START */
83 #define	WT_CURBACKUP_LOCKER	0x1u	/* Hot-backup started */
84 /* AUTOMATIC FLAG VALUE GENERATION STOP */
85 	uint8_t	flags;
86 };
87 #define	WT_CURSOR_BACKUP_ID(cursor)	(((WT_CURSOR_BACKUP *)(cursor))->maxid)
88 
89 struct __wt_cursor_btree {
90 	WT_CURSOR iface;
91 
92 	/*
93 	 * The btree field is safe to use when the cursor is open.  When the
94 	 * cursor is cached, the btree may be closed, so it is only safe
95 	 * initially to look at the underlying data handle.
96 	 */
97 	WT_BTREE *btree;		/* Enclosing btree */
98 	WT_DATA_HANDLE *dhandle;	/* Data handle for the btree */
99 
100 	/*
101 	 * The following fields are set by the search functions as a precursor
102 	 * to page modification: we have a page, a WT_COL/WT_ROW slot on the
103 	 * page, an insert head, insert list and a skiplist stack (the stack of
104 	 * skiplist entries leading to the insert point).  The search functions
105 	 * also return the relationship of the search key to the found key.
106 	 */
107 	WT_REF	  *ref;			/* Current page */
108 	uint32_t   slot;		/* WT_COL/WT_ROW 0-based slot */
109 
110 	WT_INSERT_HEAD	*ins_head;	/* Insert chain head */
111 	WT_INSERT	*ins;		/* Current insert node */
112 					/* Search stack */
113 	WT_INSERT	**ins_stack[WT_SKIP_MAXDEPTH];
114 
115 					/* Next item(s) found during search */
116 	WT_INSERT	*next_stack[WT_SKIP_MAXDEPTH];
117 
118 	uint32_t page_deleted_count;	/* Deleted items on the page */
119 
120 	uint64_t recno;			/* Record number */
121 
122 	/*
123 	 * Next-random cursors can optionally be configured to step through a
124 	 * percentage of the total leaf pages to their next value. Note the
125 	 * configured value and the calculated number of leaf pages to skip.
126 	 */
127 	uint64_t next_random_leaf_skip;
128 	u_int	 next_random_sample_size;
129 
130 	/*
131 	 * The search function sets compare to:
132 	 *	< 1 if the found key is less than the specified key
133 	 *	  0 if the found key matches the specified key
134 	 *	> 1 if the found key is larger than the specified key
135 	 */
136 	int	compare;
137 
138 	/*
139 	 * A key returned from a binary search or cursor movement on a row-store
140 	 * page; if we find an exact match on a row-store leaf page in a search
141 	 * operation, keep a copy of key we built during the search to avoid
142 	 * doing the additional work of getting the key again for return to the
143 	 * application. Note, this only applies to exact matches when searching
144 	 * disk-image structures, so it's not, for example, a key from an insert
145 	 * list. Additionally, this structure is used to build keys when moving
146 	 * a cursor through a row-store leaf page.
147 	 */
148 	WT_ITEM *row_key, _row_key;
149 
150 	/*
151 	 * It's relatively expensive to calculate the last record on a variable-
152 	 * length column-store page because of the repeat values.  Calculate it
153 	 * once per page and cache it.  This value doesn't include the skiplist
154 	 * of appended entries on the last page.
155 	 */
156 	uint64_t last_standard_recno;
157 
158 	/*
159 	 * For row-store pages, we need a single item that tells us the part of
160 	 * the page we're walking (otherwise switching from next to prev and
161 	 * vice-versa is just too complicated), so we map the WT_ROW and
162 	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
163 	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
164 	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
165 	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
166 	 */
167 	uint32_t row_iteration_slot;	/* Row-store iteration slot */
168 
169 	/*
170 	 * Variable-length column-store values are run-length encoded and may
171 	 * be overflow values or Huffman encoded. To avoid repeatedly reading
172 	 * overflow values or decompressing encoded values, process it once and
173 	 * store the result in a temporary buffer.  The cip_saved field is used
174 	 * to determine if we've switched columns since our last cursor call.
175 	 */
176 	WT_COL *cip_saved;		/* Last iteration reference */
177 
178 	/*
179 	 * We don't instantiate prefix-compressed keys on pages where there's no
180 	 * Huffman encoding because we don't want to waste memory if only moving
181 	 * a cursor through the page, and it's faster to build keys while moving
182 	 * through the page than to roll-forward from a previously instantiated
183 	 * key (we don't instantiate all of the keys, just the ones at binary
184 	 * search points).  We can't use the application's WT_CURSOR key field
185 	 * as a copy of the last-returned key because it may have been altered
186 	 * by the API layer, for example, dump cursors.  Instead we store the
187 	 * last-returned key in a temporary buffer.  The rip_saved field is used
188 	 * to determine if the key in the temporary buffer has the prefix needed
189 	 * for building the current key.
190 	 */
191 	WT_ROW *rip_saved;		/* Last-returned key reference */
192 
193 	/*
194 	 * A temporary buffer for caching RLE values for column-store files (if
195 	 * RLE is non-zero, then we don't unpack the value every time we move
196 	 * to the next cursor position, we re-use the unpacked value we stored
197 	 * here the first time we hit the value).
198 	 *
199 	 * A temporary buffer for building on-page keys when searching row-store
200 	 * files.
201 	 */
202 	WT_ITEM *tmp, _tmp;
203 
204 	/*
205 	 * The update structure allocated by the row- and column-store modify
206 	 * functions, used to avoid a data copy in the WT_CURSOR.update call.
207 	 */
208 	WT_UPDATE *modify_update;
209 
210 	/*
211 	 * Fixed-length column-store items are a single byte, and it's simpler
212 	 * and cheaper to allocate the space for it now than keep checking to
213 	 * see if we need to grow the buffer.
214 	 */
215 	uint8_t v;			/* Fixed-length return value */
216 
217 	uint8_t	append_tree;		/* Cursor appended to the tree */
218 
219 #ifdef HAVE_DIAGNOSTIC
220 	/* Check that cursor next/prev never returns keys out-of-order. */
221 	WT_ITEM *lastkey, _lastkey;
222 	uint64_t lastrecno;
223 #endif
224 
225 /* AUTOMATIC FLAG VALUE GENERATION START */
226 #define	WT_CBT_ACTIVE		0x001u	/* Active in the tree */
227 #define	WT_CBT_ITERATE_APPEND	0x002u	/* Col-store: iterating append list */
228 #define	WT_CBT_ITERATE_NEXT	0x004u	/* Next iteration configuration */
229 #define	WT_CBT_ITERATE_PREV	0x008u	/* Prev iteration configuration */
230 #define	WT_CBT_NO_TXN   	0x010u	/* Non-txn cursor (e.g. a checkpoint) */
231 #define	WT_CBT_READ_ONCE	0x020u	/* Page in with WT_READ_WONT_NEED */
232 #define	WT_CBT_RETRY_NEXT	0x040u	/* Next, resulted in prepare conflict */
233 #define	WT_CBT_RETRY_PREV	0x080u	/* Prev, resulted in prepare conflict */
234 #define	WT_CBT_SEARCH_SMALLEST	0x100u	/* Row-store: small-key insert list */
235 #define	WT_CBT_VAR_ONPAGE_MATCH	0x200u	/* Var-store: on-page recno match */
236 /* AUTOMATIC FLAG VALUE GENERATION STOP */
237 
238 #define	WT_CBT_POSITION_MASK		/* Flags associated with position */ \
239 	(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
240 	WT_CBT_RETRY_NEXT | WT_CBT_RETRY_PREV | WT_CBT_SEARCH_SMALLEST |     \
241 	WT_CBT_VAR_ONPAGE_MATCH)
242 
243 	uint32_t flags;
244 };
245 
246 struct __wt_cursor_bulk {
247 	WT_CURSOR_BTREE cbt;
248 
249 	/*
250 	 * Variable-length column store compares values during bulk load as
251 	 * part of RLE compression, row-store compares keys during bulk load
252 	 * to avoid corruption.
253 	 */
254 	bool	first_insert;		/* First insert */
255 	WT_ITEM	last;			/* Last key/value inserted */
256 
257 	/*
258 	 * Additional column-store bulk load support.
259 	 */
260 	uint64_t recno;			/* Record number */
261 	uint64_t rle;			/* Variable-length RLE counter */
262 
263 	/*
264 	 * Additional fixed-length column store bitmap bulk load support:
265 	 * current entry in memory chunk count, and the maximum number of
266 	 * records per chunk.
267 	 */
268 	bool	 bitmap;		/* Bitmap bulk load */
269 	uint32_t entry;			/* Entry count */
270 	uint32_t nrecs;			/* Max records per chunk */
271 
272 	void	*reconcile;		/* Reconciliation support */
273 	WT_REF	*ref;			/* The leaf page */
274 	WT_PAGE *leaf;
275 };
276 
277 struct __wt_cursor_config {
278 	WT_CURSOR iface;
279 };
280 
281 struct __wt_cursor_data_source {
282 	WT_CURSOR iface;
283 
284 	WT_COLLATOR *collator;		/* Configured collator */
285 	int collator_owned;		/* Collator needs to be terminated */
286 
287 	WT_CURSOR *source;		/* Application-owned cursor */
288 };
289 
290 struct __wt_cursor_dump {
291 	WT_CURSOR iface;
292 
293 	WT_CURSOR *child;
294 };
295 
296 struct __wt_cursor_index {
297 	WT_CURSOR iface;
298 
299 	WT_TABLE *table;
300 	WT_INDEX *index;
301 	const char *key_plan, *value_plan;
302 
303 	WT_CURSOR *child;
304 	WT_CURSOR **cg_cursors;
305 	uint8_t	*cg_needvalue;
306 };
307 
308 /*
309  * A join iterator structure is used to generate candidate primary keys. It
310  * is the responsibility of the caller of the iterator to filter these
311  * primary key against the other conditions of the join before returning
312  * them the caller of WT_CURSOR::next.
313  *
314  * For a conjunction join (the default), entry_count will be 1, meaning that
315  * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY).  That
316  * is, it successively returns primary keys from a cursor for the first
317  * index that was joined.  When the values returned by that cursor are
318  * exhausted, the iterator has completed.  For a disjunction join,
319  * exhausting a cursor just means that the iterator advances to the next
320  * entry. If the next entry represents an index, a new cursor is opened and
321  * primary keys from that index are then successively returned.
322  *
323  * When positioned on an entry that represents a nested join, a new child
324  * iterator is created that will be bound to the nested WT_CURSOR_JOIN.
325  * That iterator is then used to generate candidate primary keys.  When its
326  * iteration is completed, that iterator is destroyed and the parent
327  * iterator advances to the next entry.  Thus, depending on how deeply joins
328  * are nested, a similarly deep stack of iterators is created.
329  */
330 struct __wt_cursor_join_iter {
331 	WT_SESSION_IMPL		*session;
332 	WT_CURSOR_JOIN		*cjoin;
333 	WT_CURSOR_JOIN_ENTRY	*entry;
334 	WT_CURSOR_JOIN_ITER	*child;
335 	WT_CURSOR		*cursor;	/* has null projection */
336 	WT_ITEM			*curkey;	/* primary key */
337 	WT_ITEM			 idxkey;
338 	u_int			 entry_pos;	/* the current entry */
339 	u_int			 entry_count;	/* entries to walk */
340 	u_int			 end_pos;	/* the current endpoint */
341 	u_int			 end_count;	/* endpoints to walk */
342 	u_int			 end_skip;	/* when testing for inclusion */
343 						/* can we skip current end? */
344 	bool			 positioned;
345 	bool			 is_equal;
346 };
347 
348 /*
349  * A join endpoint represents a positioned cursor that is 'captured' by a
350  * WT_SESSION::join call.
351  */
352 struct __wt_cursor_join_endpoint {
353 	WT_ITEM			 key;
354 	uint8_t			 recno_buf[10];	/* holds packed recno */
355 	WT_CURSOR		*cursor;
356 
357 /* AUTOMATIC FLAG VALUE GENERATION START */
358 #define	WT_CURJOIN_END_EQ		0x1u	/* include values == cursor */
359 #define	WT_CURJOIN_END_GT		0x2u	/* include values >  cursor */
360 #define	WT_CURJOIN_END_LT		0x4u	/* include values <  cursor */
361 #define	WT_CURJOIN_END_OWN_CURSOR	0x8u	/* must close cursor */
362 /* AUTOMATIC FLAG VALUE GENERATION STOP */
363 #define	WT_CURJOIN_END_GE	(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
364 #define	WT_CURJOIN_END_LE	(WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
365 	uint8_t			 flags;		/* range for this endpoint */
366 };
367 #define	WT_CURJOIN_END_RANGE(endp)					\
368 	((endp)->flags &						\
369 	    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT))
370 
371 /*
372  * Each join entry typically represents an index's participation in a join.
373  * For example, if 'k' is an index, then "t.k > 10 && t.k < 20" would be
374  * represented by a single entry, with two endpoints.  When the index and
375  * subjoin fields are NULL, the join is on the main table.  When subjoin is
376  * non-NULL, there is a nested join clause.
377  */
378 struct __wt_cursor_join_entry {
379 	WT_INDEX		*index;
380 	WT_CURSOR		*main;		/* raw main table cursor */
381 	WT_CURSOR_JOIN		*subjoin;	/* a nested join clause */
382 	WT_BLOOM		*bloom;		/* Bloom filter handle */
383 	char			*repack_format; /* target format for repack */
384 	uint32_t		 bloom_bit_count; /* bits per item in bloom */
385 	uint32_t		 bloom_hash_count; /* hash functions in bloom */
386 	uint64_t		 count;		/* approx number of matches */
387 
388 /* AUTOMATIC FLAG VALUE GENERATION START */
389 #define	WT_CURJOIN_ENTRY_BLOOM		 0x1u	/* use a bloom filter */
390 #define	WT_CURJOIN_ENTRY_DISJUNCTION	 0x2u	/* endpoints are or-ed */
391 #define	WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x4u	/* don't filter false pos */
392 #define	WT_CURJOIN_ENTRY_OWN_BLOOM	 0x8u	/* this entry owns the bloom */
393 /* AUTOMATIC FLAG VALUE GENERATION STOP */
394 	uint8_t			 flags;
395 
396 	WT_CURSOR_JOIN_ENDPOINT	*ends;		/* reference endpoints */
397 	size_t			 ends_allocated;
398 	u_int			 ends_next;
399 
400 	WT_JOIN_STATS		 stats;		/* Join statistics */
401 };
402 
403 struct __wt_cursor_join {
404 	WT_CURSOR iface;
405 
406 	WT_TABLE		*table;
407 	const char		*projection;
408 	WT_CURSOR		*main;		/* main table with projection */
409 	WT_CURSOR_JOIN		*parent;	/* parent of nested group */
410 	WT_CURSOR_JOIN_ITER	*iter;		/* chain of iterators */
411 	WT_CURSOR_JOIN_ENTRY	*entries;
412 	size_t			 entries_allocated;
413 	u_int			 entries_next;
414 	uint8_t			 recno_buf[10];	/* holds packed recno */
415 
416 /* AUTOMATIC FLAG VALUE GENERATION START */
417 #define	WT_CURJOIN_DISJUNCTION		0x1u	/* Entries are or-ed */
418 #define	WT_CURJOIN_ERROR		0x2u	/* Error in initialization */
419 #define	WT_CURJOIN_INITIALIZED		0x4u	/* Successful initialization */
420 /* AUTOMATIC FLAG VALUE GENERATION STOP */
421 	uint8_t			 flags;
422 };
423 
424 struct __wt_cursor_json {
425 	char	*key_buf;		/* JSON formatted string */
426 	char	*value_buf;		/* JSON formatted string */
427 	WT_CONFIG_ITEM key_names;	/* Names of key columns */
428 	WT_CONFIG_ITEM value_names;	/* Names of value columns */
429 };
430 
431 struct __wt_cursor_log {
432 	WT_CURSOR iface;
433 
434 	WT_LSN		*cur_lsn;	/* LSN of current record */
435 	WT_LSN		*next_lsn;	/* LSN of next record */
436 	WT_ITEM		*logrec;	/* Copy of record for cursor */
437 	WT_ITEM		*opkey, *opvalue;	/* Op key/value copy */
438 	const uint8_t	*stepp, *stepp_end;	/* Pointer within record */
439 	uint8_t		*packed_key;	/* Packed key for 'raw' interface */
440 	uint8_t		*packed_value;	/* Packed value for 'raw' interface */
441 	uint32_t	step_count;	/* Intra-record count */
442 	uint32_t	rectype;	/* Record type */
443 	uint64_t	txnid;		/* Record txnid */
444 
445 /* AUTOMATIC FLAG VALUE GENERATION START */
446 #define	WT_CURLOG_ARCHIVE_LOCK	0x1u	/* Archive lock held */
447 /* AUTOMATIC FLAG VALUE GENERATION STOP */
448 	uint8_t		flags;
449 };
450 
451 struct __wt_cursor_metadata {
452 	WT_CURSOR iface;
453 
454 	WT_CURSOR *file_cursor;		/* Queries of regular metadata */
455 	WT_CURSOR *create_cursor;	/* Extra cursor for create option */
456 
457 /* AUTOMATIC FLAG VALUE GENERATION START */
458 #define	WT_MDC_CREATEONLY	0x1u
459 #define	WT_MDC_ONMETADATA	0x2u
460 #define	WT_MDC_POSITIONED	0x4u
461 /* AUTOMATIC FLAG VALUE GENERATION STOP */
462 	uint8_t	flags;
463 };
464 
465 struct __wt_join_stats_group {
466 	const char *desc_prefix;	/* Prefix appears before description */
467 	WT_CURSOR_JOIN *join_cursor;
468 	ssize_t join_cursor_entry;	/* Position in entries */
469 	WT_JOIN_STATS join_stats;
470 };
471 
472 struct __wt_cursor_stat {
473 	WT_CURSOR iface;
474 
475 	bool	notinitialized;		/* Cursor not initialized */
476 	bool	notpositioned;		/* Cursor not positioned */
477 
478 	int64_t	     *stats;		/* Statistics */
479 	int	      stats_base;	/* Base statistics value */
480 	int	      stats_count;	/* Count of statistics values */
481 	int	     (*stats_desc)(WT_CURSOR_STAT *, int, const char **);
482 					/* Statistics descriptions */
483 	int	     (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool,
484 			 bool);		/* Advance to next set */
485 
486 	union {				/* Copies of the statistics */
487 		WT_DSRC_STATS dsrc_stats;
488 		WT_CONNECTION_STATS conn_stats;
489 		WT_JOIN_STATS_GROUP join_stats_group;
490 	} u;
491 
492 	const char **cfg;		/* Original cursor configuration */
493 	char	*desc_buf;		/* Saved description string */
494 
495 	int	 key;			/* Current stats key */
496 	uint64_t v;			/* Current stats value */
497 	WT_ITEM	 pv;			/* Current stats value (string) */
498 
499 	/* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */
500 	uint32_t flags;
501 };
502 
503 /*
504  * WT_CURSOR_STATS --
505  *	Return a reference to a statistic cursor's stats structures.
506  */
507 #define	WT_CURSOR_STATS(cursor)						\
508 	(((WT_CURSOR_STAT *)(cursor))->stats)
509 
510 struct __wt_cursor_table {
511 	WT_CURSOR iface;
512 
513 	WT_TABLE *table;
514 	const char *plan;
515 
516 	const char **cfg;		/* Saved configuration string */
517 
518 	WT_CURSOR **cg_cursors;
519 	WT_ITEM *cg_valcopy;		/*
520 					 * Copies of column group values, for
521 					 * overlapping set_value calls.
522 					 */
523 	WT_CURSOR **idx_cursors;
524 };
525 
526 #define	WT_CURSOR_PRIMARY(cursor)					\
527 	(((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0])
528 
529 #define	WT_CURSOR_RECNO(cursor)	WT_STREQ((cursor)->key_format, "r")
530 
531 #define	WT_CURSOR_RAW_OK						\
532 	(WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)
533