1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 /*
12 * When returning an error, we need to restore the cursor to a valid state, the
13 * upper-level cursor code is likely to retry. This structure and the associated
14 * functions are used save and restore the cursor state.
15 */
16 typedef struct {
17 WT_ITEM key;
18 WT_ITEM value;
19 uint64_t recno;
20 uint32_t flags;
21 } WT_CURFILE_STATE;
22
23 /*
24 * __cursor_state_save --
25 * Save the cursor's external state.
26 */
27 static inline void
__cursor_state_save(WT_CURSOR * cursor,WT_CURFILE_STATE * state)28 __cursor_state_save(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
29 {
30 WT_ITEM_SET(state->key, cursor->key);
31 WT_ITEM_SET(state->value, cursor->value);
32 state->recno = cursor->recno;
33 state->flags = cursor->flags;
34 }
35
36 /*
37 * __cursor_state_restore --
38 * Restore the cursor's external state.
39 */
40 static inline void
__cursor_state_restore(WT_CURSOR * cursor,WT_CURFILE_STATE * state)41 __cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
42 {
43 if (F_ISSET(state, WT_CURSTD_KEY_EXT))
44 WT_ITEM_SET(cursor->key, state->key);
45 if (F_ISSET(state, WT_CURSTD_VALUE_EXT))
46 WT_ITEM_SET(cursor->value, state->value);
47 cursor->recno = state->recno;
48 F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
49 F_SET(cursor, F_MASK(state, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT));
50
51 }
52
53 /*
54 * __cursor_page_pinned --
55 * Return if we have a page pinned.
56 */
57 static inline bool
__cursor_page_pinned(WT_CURSOR_BTREE * cbt)58 __cursor_page_pinned(WT_CURSOR_BTREE *cbt)
59 {
60 WT_CURSOR *cursor;
61 WT_SESSION_IMPL *session;
62 uint32_t current_state;
63
64 cursor = &cbt->iface;
65 session = (WT_SESSION_IMPL *)cursor->session;
66
67 /*
68 * Check the page active flag, asserting the page reference with any
69 * external key.
70 */
71 if (!F_ISSET(cbt, WT_CBT_ACTIVE)) {
72 WT_ASSERT(session,
73 cbt->ref == NULL && !F_ISSET(cursor, WT_CURSTD_KEY_INT));
74 return (false);
75 }
76
77 /*
78 * Check if the key references the page. When returning from search,
79 * the page is active and the key is internal. After the application
80 * sets a key, the key is external, and the page is useless.
81 */
82 if (!F_ISSET(cursor, WT_CURSTD_KEY_INT))
83 return (false);
84
85 /*
86 * Fail if the page is flagged for forced eviction (so we periodically
87 * release pages grown too large).
88 */
89 if (cbt->ref->page->read_gen == WT_READGEN_OLDEST)
90 return (false);
91
92 /*
93 * If we are doing an update, we need a page with history, release the
94 * page so we get it again with history if required. Eviction may be
95 * locking the page, wait until we see a "normal" state and then test
96 * against that state (eviction may have already locked the page again).
97 */
98 if (F_ISSET(&session->txn, WT_TXN_UPDATE)) {
99 while ((current_state = cbt->ref->state) == WT_REF_LOCKED)
100 __wt_yield();
101 return (current_state == WT_REF_MEM);
102 }
103
104 return (true);
105 }
106
107 /*
108 * __cursor_size_chk --
109 * Return if an inserted item is too large.
110 */
111 static inline int
__cursor_size_chk(WT_SESSION_IMPL * session,WT_ITEM * kv)112 __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
113 {
114 WT_BM *bm;
115 WT_BTREE *btree;
116 WT_DECL_RET;
117 size_t size;
118
119 btree = S2BT(session);
120 bm = btree->bm;
121
122 if (btree->type == BTREE_COL_FIX) {
123 /* Fixed-size column-stores take a single byte. */
124 if (kv->size != 1)
125 WT_RET_MSG(session, EINVAL,
126 "item size of %" WT_SIZET_FMT " does not match "
127 "fixed-length file requirement of 1 byte",
128 kv->size);
129 return (0);
130 }
131
132 /* Don't waste effort, 1GB is always cool. */
133 if (kv->size <= WT_GIGABYTE)
134 return (0);
135
136 /* Check what we are willing to store in the tree. */
137 if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
138 WT_RET_MSG(session, EINVAL,
139 "item size of %" WT_SIZET_FMT " exceeds the maximum "
140 "supported WiredTiger size of %" PRIu32,
141 kv->size, WT_BTREE_MAX_OBJECT_SIZE);
142
143 /* Check what the block manager can actually write. */
144 size = kv->size;
145 if ((ret = bm->write_size(bm, session, &size)) != 0)
146 WT_RET_MSG(session, ret,
147 "item size of %" WT_SIZET_FMT " refused by block manager",
148 kv->size);
149
150 return (0);
151 }
152
153 /*
154 * __cursor_disable_bulk --
155 * Disable bulk loads into a tree.
156 */
157 static inline void
__cursor_disable_bulk(WT_SESSION_IMPL * session,WT_BTREE * btree)158 __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree)
159 {
160 /*
161 * Once a tree (other than the LSM primary) is no longer empty, eviction
162 * should pay attention to it, and it's no longer possible to bulk-load
163 * into it.
164 */
165 if (!btree->original)
166 return;
167 if (btree->lsm_primary) {
168 btree->original = 0; /* Make the next test faster. */
169 return;
170 }
171
172 /*
173 * We use a compare-and-swap here to avoid races among the first inserts
174 * into a tree. Eviction is disabled when an empty tree is opened, and
175 * it must only be enabled once.
176 */
177 if (__wt_atomic_cas8(&btree->original, 1, 0)) {
178 btree->evict_disabled_open = false;
179 __wt_evict_file_exclusive_off(session);
180 }
181 }
182
183 /*
184 * __cursor_fix_implicit --
185 * Return if search went past the end of the tree.
186 */
187 static inline bool
__cursor_fix_implicit(WT_BTREE * btree,WT_CURSOR_BTREE * cbt)188 __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
189 {
190 /*
191 * When there's no exact match, column-store search returns the key
192 * nearest the searched-for key (continuing past keys smaller than the
193 * searched-for key to return the next-largest key). Therefore, if the
194 * returned comparison is -1, the searched-for key was larger than any
195 * row on the page's standard information or column-store insert list.
196 *
197 * If the returned comparison is NOT -1, there was a row equal to or
198 * larger than the searched-for key, and we implicitly create missing
199 * rows.
200 */
201 return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
202 }
203
204 /*
205 * __wt_cursor_valid --
206 * Return if the cursor references an valid key/value pair.
207 */
208 int
__wt_cursor_valid(WT_CURSOR_BTREE * cbt,WT_UPDATE ** updp,bool * valid)209 __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid)
210 {
211 WT_BTREE *btree;
212 WT_CELL *cell;
213 WT_COL *cip;
214 WT_PAGE *page;
215 WT_SESSION_IMPL *session;
216 WT_UPDATE *upd;
217
218 if (updp != NULL)
219 *updp = NULL;
220 *valid = false;
221 btree = cbt->btree;
222 page = cbt->ref->page;
223 session = (WT_SESSION_IMPL *)cbt->iface.session;
224
225 /*
226 * We may be pointing to an insert object, and we may have a page with
227 * existing entries. Insert objects always have associated update
228 * objects (the value). Any update object may be deleted, or invisible
229 * to us. In the case of an on-page entry, there is by definition a
230 * value that is visible to us, the original page cell.
231 *
232 * If we find a visible update structure, return our caller a reference
233 * to it because we don't want to repeatedly search for the update, it
234 * might suddenly become invisible (imagine a read-uncommitted session
235 * with another session's aborted insert), and we don't want to handle
236 * that potential error every time we look at the value.
237 *
238 * Unfortunately, the objects we might have and their relationships are
239 * different for the underlying page types.
240 *
241 * In the case of row-store, an insert object implies ignoring any page
242 * objects, no insert object can have the same key as an on-page object.
243 * For row-store:
244 * if there's an insert object:
245 * if there's a visible update:
246 * exact match
247 * else
248 * no exact match
249 * else
250 * use the on-page object (which may have an associated
251 * update object that may or may not be visible to us).
252 *
253 * Column-store is more complicated because an insert object can have
254 * the same key as an on-page object: updates to column-store rows
255 * are insert/object pairs, and an invisible update isn't the end as
256 * there may be an on-page object that is visible. This changes the
257 * logic to:
258 * if there's an insert object:
259 * if there's a visible update:
260 * exact match
261 * else if the on-page object's key matches the insert key
262 * use the on-page object
263 * else
264 * use the on-page object
265 *
266 * First, check for an insert object with a visible update (a visible
267 * update that's been deleted is not a valid key/value pair).
268 */
269 if (cbt->ins != NULL) {
270 WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
271 if (upd != NULL) {
272 if (upd->type == WT_UPDATE_TOMBSTONE)
273 return (0);
274 if (updp != NULL)
275 *updp = upd;
276 *valid = true;
277 return (0);
278 }
279 }
280
281 /*
282 * If we don't have an insert object, or in the case of column-store,
283 * there's an insert object but no update was visible to us and the key
284 * on the page is the same as the insert object's key, and the slot as
285 * set by the search function is valid, we can use the original page
286 * information.
287 */
288 switch (btree->type) {
289 case BTREE_COL_FIX:
290 /*
291 * If search returned an insert object, there may or may not be
292 * a matching on-page object, we have to check. Fixed-length
293 * column-store pages don't have slots, but map one-to-one to
294 * keys, check for retrieval past the end of the page.
295 */
296 if (cbt->recno >= cbt->ref->ref_recno + page->entries)
297 return (0);
298
299 /*
300 * An update would have appeared as an "insert" object; no
301 * further checks to do.
302 */
303 break;
304 case BTREE_COL_VAR:
305 /* The search function doesn't check for empty pages. */
306 if (page->entries == 0)
307 return (0);
308 WT_ASSERT(session, cbt->slot < page->entries);
309
310 /*
311 * Column-store updates are stored as "insert" objects. If
312 * search returned an insert object we can't return, the
313 * returned on-page object must be checked for a match.
314 */
315 if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
316 return (0);
317
318 /*
319 * Although updates would have appeared as an "insert" objects,
320 * variable-length column store deletes are written into the
321 * backing store; check the cell for a record already deleted
322 * when read.
323 */
324 cip = &page->pg_var[cbt->slot];
325 if ((cell = WT_COL_PTR(page, cip)) == NULL ||
326 __wt_cell_type(cell) == WT_CELL_DEL)
327 return (0);
328 break;
329 case BTREE_ROW:
330 /* The search function doesn't check for empty pages. */
331 if (page->entries == 0)
332 return (0);
333 WT_ASSERT(session, cbt->slot < page->entries);
334
335 /*
336 * See above: for row-store, no insert object can have the same
337 * key as an on-page object, we're done.
338 */
339 if (cbt->ins != NULL)
340 return (0);
341
342 /* Check for an update. */
343 if (page->modify != NULL &&
344 page->modify->mod_row_update != NULL) {
345 WT_RET(__wt_txn_read(session,
346 page->modify->mod_row_update[cbt->slot], &upd));
347 if (upd != NULL) {
348 if (upd->type == WT_UPDATE_TOMBSTONE)
349 return (0);
350 if (updp != NULL)
351 *updp = upd;
352 }
353 }
354 break;
355 }
356 *valid = true;
357 return (0);
358 }
359
360 /*
361 * __cursor_col_search --
362 * Column-store search from a cursor.
363 */
364 static inline int
__cursor_col_search(WT_CURSOR_BTREE * cbt,WT_REF * leaf,bool * leaf_foundp)365 __cursor_col_search(WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool *leaf_foundp)
366 {
367 WT_DECL_RET;
368 WT_SESSION_IMPL *session;
369
370 session = (WT_SESSION_IMPL *)cbt->iface.session;
371 WT_WITH_PAGE_INDEX(
372 session, ret = __wt_col_search(
373 cbt, cbt->iface.recno, leaf, false, leaf_foundp));
374 return (ret);
375 }
376
377 /*
378 * __cursor_row_search --
379 * Row-store search from a cursor.
380 */
381 static inline int
__cursor_row_search(WT_CURSOR_BTREE * cbt,bool insert,WT_REF * leaf,bool * leaf_foundp)382 __cursor_row_search(
383 WT_CURSOR_BTREE *cbt, bool insert, WT_REF *leaf, bool *leaf_foundp)
384 {
385 WT_DECL_RET;
386 WT_SESSION_IMPL *session;
387
388 session = (WT_SESSION_IMPL *)cbt->iface.session;
389 WT_WITH_PAGE_INDEX(
390 session, ret = __wt_row_search(
391 cbt, &cbt->iface.key, insert, leaf, false, leaf_foundp));
392 return (ret);
393 }
394
395 /*
396 * __cursor_col_modify_v --
397 * Column-store modify from a cursor, with a separate value.
398 */
399 static inline int
__cursor_col_modify_v(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)400 __cursor_col_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
401 {
402 return (__wt_col_modify(
403 cbt, cbt->iface.recno, value, NULL, modify_type, false));
404 }
405
406 /*
407 * __cursor_row_modify_v --
408 * Row-store modify from a cursor, with a separate value.
409 */
410 static inline int
__cursor_row_modify_v(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)411 __cursor_row_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
412 {
413 return (__wt_row_modify(
414 cbt, &cbt->iface.key, value, NULL, modify_type, false));
415 }
416
417 /*
418 * __cursor_col_modify --
419 * Column-store modify from a cursor.
420 */
421 static inline int
__cursor_col_modify(WT_CURSOR_BTREE * cbt,u_int modify_type)422 __cursor_col_modify(WT_CURSOR_BTREE *cbt, u_int modify_type)
423 {
424 return (__wt_col_modify(
425 cbt, cbt->iface.recno, &cbt->iface.value,
426 NULL, modify_type, false));
427 }
428
429 /*
430 * __cursor_row_modify --
431 * Row-store modify from a cursor.
432 */
433 static inline int
__cursor_row_modify(WT_CURSOR_BTREE * cbt,u_int modify_type)434 __cursor_row_modify(WT_CURSOR_BTREE *cbt, u_int modify_type)
435 {
436 return (__wt_row_modify(
437 cbt, &cbt->iface.key, &cbt->iface.value,
438 NULL, modify_type, false));
439 }
440
441 /*
442 * __cursor_restart --
443 * Common cursor restart handling.
444 */
445 static void
__cursor_restart(WT_SESSION_IMPL * session,uint64_t * yield_count,uint64_t * sleep_usecs)446 __cursor_restart(
447 WT_SESSION_IMPL *session, uint64_t *yield_count, uint64_t *sleep_usecs)
448 {
449 __wt_spin_backoff(yield_count, sleep_usecs);
450
451 WT_STAT_CONN_INCR(session, cursor_restart);
452 WT_STAT_DATA_INCR(session, cursor_restart);
453 }
454
455 /*
456 * __wt_btcur_reset --
457 * Invalidate the cursor position.
458 */
459 int
__wt_btcur_reset(WT_CURSOR_BTREE * cbt)460 __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
461 {
462 WT_CURSOR *cursor;
463 WT_SESSION_IMPL *session;
464
465 cursor = &cbt->iface;
466 session = (WT_SESSION_IMPL *)cbt->iface.session;
467
468 WT_STAT_CONN_INCR(session, cursor_reset);
469 WT_STAT_DATA_INCR(session, cursor_reset);
470
471 F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
472
473 return (__cursor_reset(cbt));
474 }
475
476 /*
477 * __wt_btcur_search --
478 * Search for a matching record in the tree.
479 */
480 int
__wt_btcur_search(WT_CURSOR_BTREE * cbt)481 __wt_btcur_search(WT_CURSOR_BTREE *cbt)
482 {
483 WT_BTREE *btree;
484 WT_CURFILE_STATE state;
485 WT_CURSOR *cursor;
486 WT_DECL_RET;
487 WT_SESSION_IMPL *session;
488 WT_UPDATE *upd;
489 bool leaf_found, valid;
490
491 btree = cbt->btree;
492 cursor = &cbt->iface;
493 session = (WT_SESSION_IMPL *)cursor->session;
494 upd = NULL; /* -Wuninitialized */
495
496 WT_STAT_CONN_INCR(session, cursor_search);
497 WT_STAT_DATA_INCR(session, cursor_search);
498
499 WT_RET(__wt_txn_search_check(session));
500 __cursor_state_save(cursor, &state);
501
502 /*
503 * The pinned page goes away if we search the tree, get a local copy of
504 * any pinned key and discard any pinned value, then re-save the cursor
505 * state. Done before searching pinned pages (unlike other cursor
506 * functions), because we don't anticipate applications searching for a
507 * key they currently have pinned.)
508 */
509 WT_ERR(__cursor_localkey(cursor));
510 __cursor_novalue(cursor);
511 __cursor_state_save(cursor, &state);
512
513 /*
514 * If we have a page pinned, search it; if we don't have a page pinned,
515 * or the search of the pinned page doesn't find an exact match, search
516 * from the root.
517 */
518 valid = false;
519 if (__cursor_page_pinned(cbt)) {
520 __wt_txn_cursor_op(session);
521
522 WT_ERR(btree->type == BTREE_ROW ?
523 __cursor_row_search(cbt, false, cbt->ref, &leaf_found) :
524 __cursor_col_search(cbt, cbt->ref, &leaf_found));
525
526 /* Return, if prepare conflict encountered. */
527 if (leaf_found && cbt->compare == 0)
528 WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
529 }
530 if (!valid) {
531 WT_ERR(__cursor_func_init(cbt, true));
532
533 WT_ERR(btree->type == BTREE_ROW ?
534 __cursor_row_search(cbt, false, NULL, NULL) :
535 __cursor_col_search(cbt, NULL, NULL));
536
537 /* Return, if prepare conflict encountered. */
538 if (cbt->compare == 0)
539 WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
540 }
541
542 if (valid)
543 ret = __cursor_kv_return(cbt, upd);
544 else if (__cursor_fix_implicit(btree, cbt)) {
545 /*
546 * Creating a record past the end of the tree in a fixed-length
547 * column-store implicitly fills the gap with empty records.
548 */
549 cbt->recno = cursor->recno;
550 cbt->v = 0;
551 cursor->value.data = &cbt->v;
552 cursor->value.size = 1;
553 F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
554 F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
555 } else
556 ret = WT_NOTFOUND;
557
558 #ifdef HAVE_DIAGNOSTIC
559 if (ret == 0)
560 WT_ERR(__wt_cursor_key_order_init(cbt));
561 #endif
562
563 err: if (ret != 0) {
564 WT_TRET(__cursor_reset(cbt));
565 __cursor_state_restore(cursor, &state);
566 }
567 return (ret);
568 }
569
570 /*
571 * __wt_btcur_search_near --
572 * Search for a record in the tree.
573 */
574 int
__wt_btcur_search_near(WT_CURSOR_BTREE * cbt,int * exactp)575 __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
576 {
577 WT_BTREE *btree;
578 WT_CURFILE_STATE state;
579 WT_CURSOR *cursor;
580 WT_DECL_RET;
581 WT_SESSION_IMPL *session;
582 WT_UPDATE *upd;
583 int exact;
584 bool leaf_found, valid;
585
586 btree = cbt->btree;
587 cursor = &cbt->iface;
588 session = (WT_SESSION_IMPL *)cursor->session;
589 upd = NULL; /* -Wuninitialized */
590 exact = 0;
591
592 WT_STAT_CONN_INCR(session, cursor_search_near);
593 WT_STAT_DATA_INCR(session, cursor_search_near);
594
595 WT_RET(__wt_txn_search_check(session));
596 __cursor_state_save(cursor, &state);
597
598 /*
599 * The pinned page goes away if we search the tree, get a local copy of
600 * any pinned key and discard any pinned value, then re-save the cursor
601 * state. Done before searching pinned pages (unlike other cursor
602 * functions), because we don't anticipate applications searching for a
603 * key they currently have pinned.)
604 */
605 WT_ERR(__cursor_localkey(cursor));
606 __cursor_novalue(cursor);
607 __cursor_state_save(cursor, &state);
608
609 /*
610 * If we have a row-store page pinned, search it; if we don't have a
611 * page pinned, or the search of the pinned page doesn't find an exact
612 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
613 * pages in the case of column-store, search-near isn't an interesting
614 * enough case for column-store to add the complexity needed to avoid
615 * the tree search.
616 */
617 valid = false;
618 if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
619 __wt_txn_cursor_op(session);
620 /*
621 * Set the "insert" flag for the btree row-store search; we may
622 * intend to position the cursor at the end of the tree, rather
623 * than match an existing record.
624 */
625 WT_ERR(__cursor_row_search(cbt, true, cbt->ref, &leaf_found));
626
627 /*
628 * Only use the pinned page search results if search returns an
629 * exact match or a slot other than the page's boundary slots,
630 * if that's not the case, a neighbor page might offer a better
631 * match. This test is simplistic as we're ignoring append
632 * lists (there may be no page slots or we might be
633 * legitimately positioned after the last page slot). Ignore
634 * those cases, it makes things too complicated.
635 */
636 if (leaf_found &&
637 (cbt->compare == 0 ||
638 (cbt->slot != 0 &&
639 cbt->slot != cbt->ref->page->entries - 1)))
640 WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
641 }
642 if (!valid) {
643 WT_ERR(__cursor_func_init(cbt, true));
644
645 /*
646 * Set the "insert" flag for the btree row-store search; we may
647 * intend to position the cursor at the end of the tree, rather
648 * than match an existing record.
649 */
650 WT_ERR(btree->type == BTREE_ROW ?
651 __cursor_row_search(cbt, true, NULL, NULL) :
652 __cursor_col_search(cbt, NULL, NULL));
653 WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
654 }
655
656 /*
657 * If we find a valid key, return it.
658 *
659 * Else, creating a record past the end of the tree in a fixed-length
660 * column-store implicitly fills the gap with empty records. In this
661 * case, we instantiate the empty record, it's an exact match.
662 *
663 * Else, move to the next key in the tree (bias for prefix searches).
664 * Cursor next skips invalid rows, so we don't have to test for them
665 * again.
666 *
667 * Else, redo the search and move to the previous key in the tree.
668 * Cursor previous skips invalid rows, so we don't have to test for
669 * them again.
670 *
671 * If that fails, quit, there's no record to return.
672 */
673 if (valid) {
674 exact = cbt->compare;
675 ret = __cursor_kv_return(cbt, upd);
676 } else if (__cursor_fix_implicit(btree, cbt)) {
677 cbt->recno = cursor->recno;
678 cbt->v = 0;
679 cursor->value.data = &cbt->v;
680 cursor->value.size = 1;
681 exact = 0;
682 F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
683 F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
684 } else {
685 /*
686 * We didn't find an exact match: try after the search key,
687 * then before. We have to loop here because at low isolation
688 * levels, new records could appear as we are stepping through
689 * the tree.
690 */
691 while ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) {
692 WT_ERR(ret);
693 if (btree->type == BTREE_ROW)
694 WT_ERR(__wt_compare(session, btree->collator,
695 &cursor->key, &state.key, &exact));
696 else
697 exact = cbt->recno < state.recno ? -1 :
698 cbt->recno == state.recno ? 0 : 1;
699 if (exact >= 0)
700 goto done;
701 }
702
703 /*
704 * We walked to the end of the tree without finding a match.
705 * Walk backwards instead.
706 */
707 while ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) {
708 WT_ERR(ret);
709 if (btree->type == BTREE_ROW)
710 WT_ERR(__wt_compare(session, btree->collator,
711 &cursor->key, &state.key, &exact));
712 else
713 exact = cbt->recno < state.recno ? -1 :
714 cbt->recno == state.recno ? 0 : 1;
715 if (exact <= 0)
716 goto done;
717 }
718 }
719
720 done:
721 err: if (ret == 0 && exactp != NULL)
722 *exactp = exact;
723
724 #ifdef HAVE_DIAGNOSTIC
725 if (ret == 0)
726 WT_TRET(__wt_cursor_key_order_init(cbt));
727 #endif
728
729 if (ret != 0) {
730 WT_TRET(__cursor_reset(cbt));
731 __cursor_state_restore(cursor, &state);
732 }
733 return (ret);
734 }
735
736 /*
737 * __wt_btcur_insert --
738 * Insert a record into the tree.
739 */
740 int
__wt_btcur_insert(WT_CURSOR_BTREE * cbt)741 __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
742 {
743 WT_BTREE *btree;
744 WT_CURFILE_STATE state;
745 WT_CURSOR *cursor;
746 WT_DECL_RET;
747 WT_SESSION_IMPL *session;
748 uint64_t yield_count, sleep_usecs;
749 bool append_key, valid;
750
751 btree = cbt->btree;
752 cursor = &cbt->iface;
753 session = (WT_SESSION_IMPL *)cursor->session;
754 yield_count = sleep_usecs = 0;
755
756 WT_STAT_CONN_INCR(session, cursor_insert);
757 WT_STAT_DATA_INCR(session, cursor_insert);
758 WT_STAT_DATA_INCRV(session,
759 cursor_insert_bytes, cursor->key.size + cursor->value.size);
760
761 if (btree->type == BTREE_ROW)
762 WT_RET(__cursor_size_chk(session, &cursor->key));
763 WT_RET(__cursor_size_chk(session, &cursor->value));
764
765 /* It's no longer possible to bulk-load into the tree. */
766 __cursor_disable_bulk(session, btree);
767
768 /*
769 * Insert a new record if WT_CURSTD_APPEND configured, (ignoring any
770 * application set record number). Although append can't be configured
771 * for a row-store, this code would break if it were, and that's owned
772 * by the upper cursor layer, be cautious.
773 */
774 append_key =
775 F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW;
776
777 /* Save the cursor state. */
778 __cursor_state_save(cursor, &state);
779
780 /*
781 * If inserting with overwrite configured, and positioned to an on-page
782 * key, the update doesn't require another search. Cursors configured
783 * for append aren't included, regardless of whether or not they meet
784 * all other criteria.
785 *
786 * Fixed-length column store can never use a positioned cursor to update
787 * because the cursor may not be positioned to the correct record in the
788 * case of implicit records in the append list.
789 */
790 if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt) &&
791 F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) {
792 WT_ERR(__wt_txn_autocommit_check(session));
793 /*
794 * The cursor position may not be exact (the cursor's comparison
795 * value not equal to zero). Correct to an exact match so we can
796 * update whatever we're pointing at.
797 */
798 cbt->compare = 0;
799 ret = btree->type == BTREE_ROW ?
800 __cursor_row_modify(cbt, WT_UPDATE_STANDARD) :
801 __cursor_col_modify(cbt, WT_UPDATE_STANDARD);
802 if (ret == 0)
803 goto done;
804
805 /*
806 * The pinned page goes away if we fail for any reason, get a
807 * local copy of any pinned key or value. (Restart could still
808 * use the pinned page, but that's an unlikely path.) Re-save
809 * the cursor state: we may retry but eventually fail.
810 */
811 WT_TRET(__cursor_localkey(cursor));
812 WT_TRET(__cursor_localvalue(cursor));
813 __cursor_state_save(cursor, &state);
814 goto err;
815 }
816
817 /*
818 * The pinned page goes away if we do a search, get a local copy of any
819 * pinned key or value. Re-save the cursor state: we may retry but
820 * eventually fail.
821 */
822 WT_ERR(__cursor_localkey(cursor));
823 WT_ERR(__cursor_localvalue(cursor));
824 __cursor_state_save(cursor, &state);
825
826 retry: WT_ERR(__cursor_func_init(cbt, true));
827
828 if (btree->type == BTREE_ROW) {
829 WT_ERR(__cursor_row_search(cbt, true, NULL, NULL));
830 /*
831 * If not overwriting, fail if the key exists, else insert the
832 * key/value pair.
833 */
834 if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
835 cbt->compare == 0) {
836 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
837 if (valid)
838 WT_ERR(WT_DUPLICATE_KEY);
839 }
840
841 ret = __cursor_row_modify(cbt, WT_UPDATE_STANDARD);
842 } else if (append_key) {
843 /*
844 * Optionally insert a new record (ignoring the application's
845 * record number). The real record number is allocated by the
846 * serialized append operation.
847 */
848 cbt->iface.recno = WT_RECNO_OOB;
849 cbt->compare = 1;
850 WT_ERR(__cursor_col_search(cbt, NULL, NULL));
851 WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD));
852 cursor->recno = cbt->recno;
853 } else {
854 WT_ERR(__cursor_col_search(cbt, NULL, NULL));
855
856 /*
857 * If not overwriting, fail if the key exists. Creating a
858 * record past the end of the tree in a fixed-length
859 * column-store implicitly fills the gap with empty records.
860 * Fail in that case, the record exists.
861 */
862 if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
863 if (cbt->compare == 0) {
864 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
865 if (valid)
866 WT_ERR(WT_DUPLICATE_KEY);
867 } else if (__cursor_fix_implicit(btree, cbt))
868 WT_ERR(WT_DUPLICATE_KEY);
869 }
870
871 WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD));
872 }
873
874 err: if (ret == WT_RESTART) {
875 __cursor_restart(session, &yield_count, &sleep_usecs);
876 goto retry;
877 }
878
879 /* Insert doesn't maintain a position across calls, clear resources. */
880 if (ret == 0) {
881 done: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
882 if (append_key)
883 F_SET(cursor, WT_CURSTD_KEY_EXT);
884 }
885 WT_TRET(__cursor_reset(cbt));
886 if (ret != 0)
887 __cursor_state_restore(cursor, &state);
888
889 return (ret);
890 }
891
892 /*
893 * __curfile_update_check --
894 * Check whether an update would conflict.
895 *
896 * This function expects the cursor to already be positioned. It should
897 * be called before deciding whether to skip an update operation based on
898 * existence of a visible update for a key -- even if there is no value
899 * visible to the transaction, an update could still conflict.
900 */
901 static int
__curfile_update_check(WT_CURSOR_BTREE * cbt)902 __curfile_update_check(WT_CURSOR_BTREE *cbt)
903 {
904 WT_BTREE *btree;
905 WT_SESSION_IMPL *session;
906
907 btree = cbt->btree;
908 session = (WT_SESSION_IMPL *)cbt->iface.session;
909
910 if (cbt->compare != 0)
911 return (0);
912 if (cbt->ins != NULL)
913 return (__wt_txn_update_check(session, cbt->ins->upd));
914
915 if (btree->type == BTREE_ROW &&
916 cbt->ref->page->modify != NULL &&
917 cbt->ref->page->modify->mod_row_update != NULL)
918 return (__wt_txn_update_check(session,
919 cbt->ref->page->modify->mod_row_update[cbt->slot]));
920 return (0);
921 }
922
923 /*
924 * __wt_btcur_insert_check --
925 * Check whether an update would conflict.
926 *
927 * This can replace WT_CURSOR::insert, so it only checks for conflicts without
928 * updating the tree. It is used to maintain snapshot isolation for transactions
929 * that span multiple chunks in an LSM tree.
930 */
931 int
__wt_btcur_insert_check(WT_CURSOR_BTREE * cbt)932 __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
933 {
934 WT_CURSOR *cursor;
935 WT_DECL_RET;
936 WT_SESSION_IMPL *session;
937 uint64_t yield_count, sleep_usecs;
938
939 cursor = &cbt->iface;
940 session = (WT_SESSION_IMPL *)cursor->session;
941 yield_count = sleep_usecs = 0;
942
943 WT_ASSERT(session, cbt->btree->type == BTREE_ROW);
944
945 /*
946 * The pinned page goes away if we do a search, get a local copy of any
947 * pinned key and discard any pinned value. Unlike most of the btree
948 * cursor routines, we don't have to save/restore the cursor key state,
949 * none of the work done here changes the cursor state.
950 */
951 WT_ERR(__cursor_localkey(cursor));
952 __cursor_novalue(cursor);
953
954 retry: WT_ERR(__cursor_func_init(cbt, true));
955 WT_ERR(__cursor_row_search(cbt, true, NULL, NULL));
956
957 /* Just check for conflicts. */
958 ret = __curfile_update_check(cbt);
959
960 err: if (ret == WT_RESTART) {
961 __cursor_restart(session, &yield_count, &sleep_usecs);
962 goto retry;
963 }
964
965 /* Insert doesn't maintain a position across calls, clear resources. */
966 if (ret == 0)
967 F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
968 WT_TRET(__cursor_reset(cbt));
969
970 return (ret);
971 }
972
973 /*
974 * __wt_btcur_remove --
975 * Remove a record from the tree.
976 */
977 int
__wt_btcur_remove(WT_CURSOR_BTREE * cbt)978 __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
979 {
980 enum { NO_POSITION, POSITIONED, SEARCH_POSITION } positioned;
981 WT_BTREE *btree;
982 WT_CURFILE_STATE state;
983 WT_CURSOR *cursor;
984 WT_DECL_RET;
985 WT_SESSION_IMPL *session;
986 uint64_t yield_count, sleep_usecs;
987 bool iterating, valid;
988
989 btree = cbt->btree;
990 cursor = &cbt->iface;
991 session = (WT_SESSION_IMPL *)cursor->session;
992 yield_count = sleep_usecs = 0;
993 iterating = F_ISSET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
994
995 WT_STAT_CONN_INCR(session, cursor_remove);
996 WT_STAT_DATA_INCR(session, cursor_remove);
997 WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
998
999 /*
1000 * WT_CURSOR.remove has a unique semantic, the cursor stays positioned
1001 * if it starts positioned, otherwise clear the cursor on completion.
1002 *
1003 * However, if we unpin the page (because the page is in WT_REF_LIMBO or
1004 * it was selected for forcible eviction), and every item on the page is
1005 * deleted, eviction can delete the page and our subsequent search will
1006 * re-instantiate an empty page for us, with no key/value pairs. Cursor
1007 * remove will search that page and return not-found, which is OK unless
1008 * cursor-overwrite is configured (which causes cursor remove to return
1009 * success even if there's no item to delete). In that case, we're
1010 * supposed to return a positioned cursor, but there's nothing to which
1011 * we can position, and we'll fail attempting to point the cursor at the
1012 * key on the page to satisfy the positioned requirement.
1013 *
1014 * Do the best we can: If we start with a positioned cursor, and we let
1015 * go of our pinned page, reset our state to use the search position,
1016 * that is, use a successful search to return to a "positioned" state.
1017 * If we start with a positioned cursor, let go of our pinned page, and
1018 * the search fails, leave the cursor's key set so the cursor appears
1019 * positioned to the application.
1020 */
1021 positioned =
1022 F_ISSET(cursor, WT_CURSTD_KEY_INT) ? POSITIONED : NO_POSITION;
1023
1024 /* Save the cursor state. */
1025 __cursor_state_save(cursor, &state);
1026
1027 /*
1028 * If remove positioned to an on-page key, the remove doesn't require
1029 * another search. We don't care about the "overwrite" configuration
1030 * because regardless of the overwrite setting, any existing record is
1031 * removed, and the record must exist with a positioned cursor.
1032 *
1033 * There's trickiness in the page-pinned check. By definition a remove
1034 * operation leaves a cursor positioned if it's initially positioned.
1035 * However, if every item on the page is deleted and we unpin the page,
1036 * eviction might delete the page and our search will re-instantiate an
1037 * empty page for us. Cursor remove returns not-found whether or not
1038 * that eviction/deletion happens and it's OK unless cursor-overwrite
1039 * is configured (which means we return success even if there's no item
1040 * to delete). In that case, we'll fail when we try to point the cursor
1041 * at the key on the page to satisfy the positioned requirement. It's
1042 * arguably safe to simply leave the key initialized in the cursor (as
1043 * that's all a positioned cursor implies), but it's probably safer to
1044 * avoid page eviction entirely in the positioned case.
1045 *
1046 * Fixed-length column store can never use a positioned cursor to update
1047 * because the cursor may not be positioned to the correct record in the
1048 * case of implicit records in the append list.
1049 */
1050 if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
1051 WT_ERR(__wt_txn_autocommit_check(session));
1052
1053 /*
1054 * The cursor position may not be exact (the cursor's comparison
1055 * value not equal to zero). Correct to an exact match so we can
1056 * remove whatever we're pointing at.
1057 */
1058 cbt->compare = 0;
1059 ret = btree->type == BTREE_ROW ?
1060 __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE) :
1061 __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE);
1062 if (ret == 0)
1063 goto done;
1064 goto err;
1065 }
1066
1067 /*
1068 * The pinned page goes away if we do a search, including as a result of
1069 * a restart. Get a local copy of any pinned key and re-save the cursor
1070 * state: we may retry but eventually fail.
1071 *
1072 * Note these steps must be repeatable, we'll continue to take this path
1073 * as long as we encounter WT_RESTART.
1074 */
1075 retry: if (positioned == POSITIONED)
1076 positioned = SEARCH_POSITION;
1077 WT_ERR(__cursor_localkey(cursor));
1078 __cursor_state_save(cursor, &state);
1079
1080 WT_ERR(__cursor_func_init(cbt, true));
1081
1082 if (btree->type == BTREE_ROW) {
1083 WT_ERR(__cursor_row_search(cbt, false, NULL, NULL));
1084
1085 /* Check whether an update would conflict. */
1086 WT_ERR(__curfile_update_check(cbt));
1087
1088 if (cbt->compare != 0)
1089 WT_ERR(WT_NOTFOUND);
1090 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1091 if (!valid)
1092 WT_ERR(WT_NOTFOUND);
1093
1094 ret = __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE);
1095 } else {
1096 WT_ERR(__cursor_col_search(cbt, NULL, NULL));
1097
1098 /*
1099 * If we find a matching record, check whether an update would
1100 * conflict. Do this before checking if the update is visible
1101 * in __wt_cursor_valid, or we can miss conflict.
1102 */
1103 WT_ERR(__curfile_update_check(cbt));
1104
1105 /* Remove the record if it exists. */
1106 valid = false;
1107 if (cbt->compare == 0)
1108 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1109 if (cbt->compare != 0 || !valid) {
1110 if (!__cursor_fix_implicit(btree, cbt))
1111 WT_ERR(WT_NOTFOUND);
1112 /*
1113 * Creating a record past the end of the tree in a
1114 * fixed-length column-store implicitly fills the
1115 * gap with empty records. Return success in that
1116 * case, the record was deleted successfully.
1117 *
1118 * Correct the btree cursor's location: the search
1119 * will have pointed us at the previous/next item,
1120 * and that's not correct.
1121 */
1122 cbt->recno = cursor->recno;
1123 } else
1124 ret = __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE);
1125 }
1126
1127 err: if (ret == WT_RESTART) {
1128 __cursor_restart(session, &yield_count, &sleep_usecs);
1129 goto retry;
1130 }
1131
1132 if (ret == 0) {
1133 done: switch (positioned) {
1134 case NO_POSITION:
1135 /*
1136 * Never positioned and we leave it that way, clear any
1137 * key and reset the cursor.
1138 */
1139 F_CLR(cursor, WT_CURSTD_KEY_SET);
1140 WT_TRET(__cursor_reset(cbt));
1141 break;
1142 case POSITIONED:
1143 /*
1144 * Positioned and we used the pinned page, leave the key
1145 * alone, whatever it is.
1146 */
1147 break;
1148 case SEARCH_POSITION:
1149 /*
1150 * Positioned and we did a search anyway, get a key to
1151 * return.
1152 */
1153 WT_TRET(__wt_key_return(cbt));
1154 break;
1155 }
1156 }
1157
1158 if (ret != 0) {
1159 WT_TRET(__cursor_reset(cbt));
1160 __cursor_state_restore(cursor, &state);
1161
1162 /*
1163 * If the record isn't found and the cursor is configured for
1164 * overwrite, that is what we want, try to return success.
1165 *
1166 * We set the return to 0 after testing for success, the clause
1167 * above dealing with the cursor position is only correct if we
1168 * were successful. If search failed after positioned is set to
1169 * SEARCH_POSITION, we cannot return a key. The only action to
1170 * take is to set the cursor to its original key, which we just
1171 * did.
1172 *
1173 * Finally, if an iterating or positioned cursor was forced to
1174 * give up its pinned page and then a search failed, we've
1175 * lost our cursor position. Since no subsequent iteration can
1176 * succeed, we cannot return success.
1177 */
1178 if (ret == WT_NOTFOUND &&
1179 F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
1180 !iterating && positioned == NO_POSITION)
1181 ret = 0;
1182 }
1183
1184 /*
1185 * Upper level cursor removes don't expect the cursor value to be set
1186 * after a successful remove (and check in diagnostic mode). Error
1187 * handling may have converted failure to a success, do a final check.
1188 */
1189 if (ret == 0)
1190 F_CLR(cursor, WT_CURSTD_VALUE_SET);
1191
1192 return (ret);
1193 }
1194
1195 /*
1196 * __btcur_update --
1197 * Update a record in the tree.
1198 */
1199 static int
__btcur_update(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)1200 __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
1201 {
1202 WT_BTREE *btree;
1203 WT_CURFILE_STATE state;
1204 WT_CURSOR *cursor;
1205 WT_DECL_RET;
1206 WT_SESSION_IMPL *session;
1207 uint64_t yield_count, sleep_usecs;
1208 bool leaf_found, valid;
1209
1210 btree = cbt->btree;
1211 cursor = &cbt->iface;
1212 session = (WT_SESSION_IMPL *)cursor->session;
1213 yield_count = sleep_usecs = 0;
1214
1215 /* It's no longer possible to bulk-load into the tree. */
1216 __cursor_disable_bulk(session, btree);
1217
1218 /* Save the cursor state. */
1219 __cursor_state_save(cursor, &state);
1220
1221 /*
1222 * If update positioned to an on-page key, the update doesn't require
1223 * another search. We don't care about the "overwrite" configuration
1224 * because regardless of the overwrite setting, any existing record is
1225 * updated, and the record must exist with a positioned cursor.
1226 *
1227 * Fixed-length column store can never use a positioned cursor to update
1228 * because the cursor may not be positioned to the correct record in the
1229 * case of implicit records in the append list.
1230 */
1231 if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
1232 WT_ERR(__wt_txn_autocommit_check(session));
1233
1234 /*
1235 * The cursor position may not be exact (the cursor's comparison
1236 * value not equal to zero). Correct to an exact match so we can
1237 * update whatever we're pointing at.
1238 */
1239 cbt->compare = 0;
1240 ret = btree->type == BTREE_ROW ?
1241 __cursor_row_modify_v(cbt, value, modify_type) :
1242 __cursor_col_modify_v(cbt, value, modify_type);
1243 if (ret == 0)
1244 goto done;
1245
1246 /*
1247 * The pinned page goes away if we fail for any reason, get a
1248 * a local copy of any pinned key or value. (Restart could still
1249 * use the pinned page, but that's an unlikely path.) Re-save
1250 * the cursor state: we may retry but eventually fail.
1251 */
1252 WT_TRET(__cursor_localkey(cursor));
1253 WT_TRET(__cursor_localvalue(cursor));
1254 __cursor_state_save(cursor, &state);
1255 goto err;
1256 }
1257
1258 /*
1259 * The pinned page goes away if we do a search, get a local copy of any
1260 * pinned key or value. Re-save the cursor state: we may retry but
1261 * eventually fail.
1262 */
1263 WT_ERR(__cursor_localkey(cursor));
1264 WT_ERR(__cursor_localvalue(cursor));
1265 __cursor_state_save(cursor, &state);
1266
1267 /*
1268 * If our caller configures for a local search and we have a page
1269 * pinned, do that search.
1270 */
1271 if (F_ISSET(cursor, WT_CURSTD_UPDATE_LOCAL)
1272 && __cursor_page_pinned(cbt)) {
1273 __wt_txn_cursor_op(session);
1274 WT_ERR(__wt_txn_autocommit_check(session));
1275 WT_ERR(btree->type == BTREE_ROW ?
1276 __cursor_row_search(cbt, true, cbt->ref, &leaf_found) :
1277 __cursor_col_search(cbt, cbt->ref, &leaf_found));
1278 /*
1279 * Only use the pinned page search results if search returns an
1280 * exact match or a slot other than the page's boundary slots,
1281 * if that's not the case, a neighbor page might offer a better
1282 * match. This test is simplistic as we're ignoring append
1283 * lists (there may be no page slots or we might be
1284 * legitimately positioned after the last page slot). Ignore
1285 * those cases, it makes things too complicated.
1286 */
1287 if (leaf_found && (cbt->compare == 0
1288 || (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1)))
1289 goto update_local;
1290 }
1291
1292 retry:
1293 WT_ERR(__cursor_func_init(cbt, true));
1294 WT_ERR(btree->type == BTREE_ROW ?
1295 __cursor_row_search(cbt, true, NULL, NULL) :
1296 __cursor_col_search(cbt, NULL, NULL));
1297 update_local:
1298 if (btree->type == BTREE_ROW) {
1299 /*
1300 * If not overwriting, check for conflicts and fail if the key
1301 * does not exist.
1302 */
1303 if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
1304 WT_ERR(__curfile_update_check(cbt));
1305 if (cbt->compare != 0)
1306 WT_ERR(WT_NOTFOUND);
1307 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1308 if (!valid)
1309 WT_ERR(WT_NOTFOUND);
1310 }
1311 ret = __cursor_row_modify_v(cbt, value, modify_type);
1312 } else {
1313 /*
1314 * If not overwriting, fail if the key doesn't exist. If we
1315 * find an update for the key, check for conflicts. Update the
1316 * record if it exists. Creating a record past the end of the
1317 * tree in a fixed-length column-store implicitly fills the gap
1318 * with empty records. Update the record in that case, the
1319 * record exists.
1320 */
1321 if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
1322 WT_ERR(__curfile_update_check(cbt));
1323 valid = false;
1324 if (cbt->compare == 0)
1325 WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1326 if ((cbt->compare != 0 || !valid) &&
1327 !__cursor_fix_implicit(btree, cbt))
1328 WT_ERR(WT_NOTFOUND);
1329 }
1330 ret = __cursor_col_modify_v(cbt, value, modify_type);
1331 }
1332
1333 err: if (ret == WT_RESTART) {
1334 __cursor_restart(session, &yield_count, &sleep_usecs);
1335 goto retry;
1336 }
1337
1338 /*
1339 * If successful, point the cursor at internal copies of the data. We
1340 * could shuffle memory in the cursor so the key/value pair are in local
1341 * buffer memory, but that's a data copy. We don't want to do another
1342 * search (and we might get a different update structure if we race).
1343 * To make this work, we add a field to the btree cursor to pass back a
1344 * pointer to the modify function's allocated update structure.
1345 */
1346 if (ret == 0) {
1347 done: switch (modify_type) {
1348 case WT_UPDATE_STANDARD:
1349 /*
1350 * WT_CURSOR.update returns a key and a value.
1351 */
1352 ret = __cursor_kv_return(cbt, cbt->modify_update);
1353 break;
1354 case WT_UPDATE_RESERVE:
1355 /*
1356 * WT_CURSOR.reserve doesn't return any value.
1357 */
1358 F_CLR(cursor, WT_CURSTD_VALUE_SET);
1359 /* FALLTHROUGH */
1360 case WT_UPDATE_MODIFY:
1361 /*
1362 * WT_CURSOR.modify has already created the return value
1363 * and our job is to leave it untouched.
1364 */
1365 ret = __wt_key_return(cbt);
1366 break;
1367 case WT_UPDATE_BIRTHMARK:
1368 case WT_UPDATE_TOMBSTONE:
1369 WT_ILLEGAL_VALUE(session, modify_type);
1370 }
1371 }
1372
1373 if (ret != 0) {
1374 WT_TRET(__cursor_reset(cbt));
1375 __cursor_state_restore(cursor, &state);
1376 }
1377
1378 return (ret);
1379 }
1380
1381 /*
1382 * __cursor_chain_exceeded --
1383 * Return if the update chain has exceeded the limit.
1384 */
1385 static bool
__cursor_chain_exceeded(WT_CURSOR_BTREE * cbt)1386 __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
1387 {
1388 WT_CURSOR *cursor;
1389 WT_PAGE *page;
1390 WT_SESSION_IMPL *session;
1391 WT_UPDATE *upd;
1392 size_t upd_size;
1393 int i;
1394
1395 cursor = &cbt->iface;
1396 page = cbt->ref->page;
1397 session = (WT_SESSION_IMPL *)cursor->session;
1398
1399 upd = NULL;
1400 if (cbt->ins != NULL)
1401 upd = cbt->ins->upd;
1402 else if (cbt->btree->type == BTREE_ROW &&
1403 page->modify != NULL && page->modify->mod_row_update != NULL)
1404 upd = page->modify->mod_row_update[cbt->slot];
1405
1406 /*
1407 * Step through the modify operations at the beginning of the chain.
1408 *
1409 * Deleted or standard updates are anticipated to be sufficient to base
1410 * the modify (although that's not guaranteed: they may not be visible
1411 * or might abort before we read them). Also, this is not a hard
1412 * limit, threads can race modifying updates.
1413 *
1414 * If the total size in bytes of the updates exceeds some factor of the
1415 * underlying value size (which we know because the cursor is
1416 * positioned), create a new full copy of the value. This limits the
1417 * cache pressure from creating full copies to that factor: with the
1418 * default factor of 1, the total size in memory of a set of modify
1419 * updates is limited to double the size of the modifies.
1420 *
1421 * Otherwise, limit the length of the update chain to a fixed size to
1422 * bound the cost of rebuilding the value during reads. When history
1423 * has to be maintained, creating extra copies of large documents
1424 * multiplies cache pressure because the old ones cannot be freed, so
1425 * allow the modify chain to grow.
1426 */
1427 for (i = 0, upd_size = 0;
1428 upd != NULL && upd->type == WT_UPDATE_MODIFY;
1429 ++i, upd = upd->next) {
1430 upd_size += WT_UPDATE_MEMSIZE(upd);
1431 if (i >= WT_MAX_MODIFY_UPDATE &&
1432 upd_size * WT_MODIFY_MEM_FRACTION >= cursor->value.size)
1433 return (true);
1434 }
1435 if (i >= WT_MAX_MODIFY_UPDATE && upd != NULL &&
1436 upd->type == WT_UPDATE_STANDARD &&
1437 __wt_txn_upd_visible_all(session, upd))
1438 return (true);
1439 return (false);
1440 }
1441
1442 /*
1443 * __wt_btcur_modify --
1444 * Modify a record in the tree.
1445 */
1446 int
__wt_btcur_modify(WT_CURSOR_BTREE * cbt,WT_MODIFY * entries,int nentries)1447 __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
1448 {
1449 WT_CURFILE_STATE state;
1450 WT_CURSOR *cursor;
1451 WT_DECL_ITEM(modify);
1452 WT_DECL_RET;
1453 WT_SESSION_IMPL *session;
1454 size_t orig, new;
1455 bool overwrite;
1456
1457 cursor = &cbt->iface;
1458 session = (WT_SESSION_IMPL *)cursor->session;
1459
1460 WT_STAT_CONN_INCR(session, cursor_modify);
1461 WT_STAT_DATA_INCR(session, cursor_modify);
1462
1463 /* Save the cursor state. */
1464 __cursor_state_save(cursor, &state);
1465
1466 /*
1467 * Get the current value and apply the modification to it, for a few
1468 * reasons: first, we set the updated value so the application can
1469 * retrieve the cursor's value; second, we use the updated value as
1470 * the update if the update chain is too long; third, there's a check
1471 * if the updated value is too large to store; fourth, to simplify the
1472 * count of bytes being added/removed; fifth, we can get into serious
1473 * trouble if we attempt to modify a value that doesn't exist. For the
1474 * fifth reason, verify we're not in a read-uncommitted transaction,
1475 * that implies a value that might disappear out from under us.
1476 *
1477 * Also, an application might read a value outside of a transaction and
1478 * then call modify. For that to work, the read must be part of the
1479 * transaction that performs the update for correctness, otherwise we
1480 * could race with another thread and end up modifying the wrong value.
1481 * A clever application could get this right (imagine threads that only
1482 * updated non-overlapping, fixed-length byte strings), but it's unsafe
1483 * because it will work most of the time and the failure is unlikely to
1484 * be detected. Require explicit transactions for modify operations.
1485 */
1486 if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
1487 WT_ERR_MSG(session, ENOTSUP,
1488 "not supported in read-uncommitted transactions");
1489 if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT))
1490 WT_ERR_MSG(session, ENOTSUP,
1491 "not supported in implicit transactions");
1492
1493 if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) ||
1494 !F_ISSET(cursor, WT_CURSTD_VALUE_INT))
1495 WT_ERR(__wt_btcur_search(cbt));
1496 orig = cursor->value.size;
1497 WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
1498 new = cursor->value.size;
1499 WT_ERR(__cursor_size_chk(session, &cursor->value));
1500 if (new > orig)
1501 WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig);
1502 else
1503 WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new);
1504
1505 /*
1506 * WT_CURSOR.modify is update-without-overwrite.
1507 *
1508 * Use the modify buffer as the update if the data package saves us some
1509 * memory and the update chain is under the limit, else use the complete
1510 * value.
1511 */
1512 overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
1513 F_CLR(cursor, WT_CURSTD_OVERWRITE);
1514 if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt))
1515 ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
1516 else if ((ret =
1517 __wt_modify_pack(session, &modify, entries, nentries)) == 0)
1518 ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY);
1519 if (overwrite)
1520 F_SET(cursor, WT_CURSTD_OVERWRITE);
1521
1522 /*
1523 * We have our own cursor state restoration because we've modified the
1524 * cursor before calling the underlying cursor update function and we
1525 * need to restore it to its original state. This means multiple calls
1526 * to reset the cursor, but that shouldn't be a problem.
1527 */
1528 if (ret != 0) {
1529 err: WT_TRET(__cursor_reset(cbt));
1530 __cursor_state_restore(cursor, &state);
1531 }
1532
1533 __wt_scr_free(session, &modify);
1534 return (ret);
1535 }
1536
1537 /*
1538 * __wt_btcur_reserve --
1539 * Reserve a record in the tree.
1540 */
1541 int
__wt_btcur_reserve(WT_CURSOR_BTREE * cbt)1542 __wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
1543 {
1544 WT_CURSOR *cursor;
1545 WT_DECL_RET;
1546 WT_SESSION_IMPL *session;
1547 bool overwrite;
1548
1549 cursor = &cbt->iface;
1550 session = (WT_SESSION_IMPL *)cursor->session;
1551
1552 WT_STAT_CONN_INCR(session, cursor_reserve);
1553 WT_STAT_DATA_INCR(session, cursor_reserve);
1554
1555 /* WT_CURSOR.reserve is update-without-overwrite and a special value. */
1556 overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
1557 F_CLR(cursor, WT_CURSTD_OVERWRITE);
1558 ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVE);
1559 if (overwrite)
1560 F_SET(cursor, WT_CURSTD_OVERWRITE);
1561 return (ret);
1562 }
1563
1564 /*
1565 * __wt_btcur_update --
1566 * Update a record in the tree.
1567 */
1568 int
__wt_btcur_update(WT_CURSOR_BTREE * cbt)1569 __wt_btcur_update(WT_CURSOR_BTREE *cbt)
1570 {
1571 WT_BTREE *btree;
1572 WT_CURSOR *cursor;
1573 WT_SESSION_IMPL *session;
1574
1575 btree = cbt->btree;
1576 cursor = &cbt->iface;
1577 session = (WT_SESSION_IMPL *)cursor->session;
1578
1579 WT_STAT_CONN_INCR(session, cursor_update);
1580 WT_STAT_DATA_INCR(session, cursor_update);
1581 WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);
1582
1583 if (btree->type == BTREE_ROW)
1584 WT_RET(__cursor_size_chk(session, &cursor->key));
1585 WT_RET(__cursor_size_chk(session, &cursor->value));
1586
1587 return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD));
1588 }
1589
1590 /*
1591 * __wt_btcur_compare --
1592 * Return a comparison between two cursors.
1593 */
1594 int
__wt_btcur_compare(WT_CURSOR_BTREE * a_arg,WT_CURSOR_BTREE * b_arg,int * cmpp)1595 __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
1596 {
1597 WT_CURSOR *a, *b;
1598 WT_SESSION_IMPL *session;
1599
1600 a = (WT_CURSOR *)a_arg;
1601 b = (WT_CURSOR *)b_arg;
1602 session = (WT_SESSION_IMPL *)a->session;
1603
1604 /* Confirm both cursors reference the same object. */
1605 if (a_arg->btree != b_arg->btree)
1606 WT_RET_MSG(
1607 session, EINVAL, "Cursors must reference the same object");
1608
1609 switch (a_arg->btree->type) {
1610 case BTREE_COL_FIX:
1611 case BTREE_COL_VAR:
1612 /*
1613 * Compare the interface's cursor record, not the underlying
1614 * cursor reference: the interface's cursor reference is the
1615 * one being returned to the application.
1616 */
1617 if (a->recno < b->recno)
1618 *cmpp = -1;
1619 else if (a->recno == b->recno)
1620 *cmpp = 0;
1621 else
1622 *cmpp = 1;
1623 break;
1624 case BTREE_ROW:
1625 WT_RET(__wt_compare(
1626 session, a_arg->btree->collator, &a->key, &b->key, cmpp));
1627 break;
1628 }
1629 return (0);
1630 }
1631
1632 /*
1633 * __cursor_equals --
1634 * Return if two cursors reference the same row.
1635 */
1636 static inline bool
__cursor_equals(WT_CURSOR_BTREE * a,WT_CURSOR_BTREE * b)1637 __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
1638 {
1639 switch (a->btree->type) {
1640 case BTREE_COL_FIX:
1641 case BTREE_COL_VAR:
1642 /*
1643 * Compare the interface's cursor record, not the underlying
1644 * cursor reference: the interface's cursor reference is the
1645 * one being returned to the application.
1646 */
1647 if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
1648 return (true);
1649 break;
1650 case BTREE_ROW:
1651 if (a->ref != b->ref)
1652 return (false);
1653 if (a->ins != NULL || b->ins != NULL) {
1654 if (a->ins == b->ins)
1655 return (true);
1656 break;
1657 }
1658 if (a->slot == b->slot)
1659 return (true);
1660 break;
1661 }
1662 return (false);
1663 }
1664
1665 /*
1666 * __wt_btcur_equals --
1667 * Return an equality comparison between two cursors.
1668 */
1669 int
__wt_btcur_equals(WT_CURSOR_BTREE * a_arg,WT_CURSOR_BTREE * b_arg,int * equalp)1670 __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp)
1671 {
1672 WT_CURSOR *a, *b;
1673 WT_SESSION_IMPL *session;
1674 int cmp;
1675
1676 a = (WT_CURSOR *)a_arg;
1677 b = (WT_CURSOR *)b_arg;
1678 cmp = 0;
1679 session = (WT_SESSION_IMPL *)a->session;
1680
1681 /* Confirm both cursors reference the same object. */
1682 if (a_arg->btree != b_arg->btree)
1683 WT_RET_MSG(
1684 session, EINVAL, "Cursors must reference the same object");
1685
1686 /*
1687 * The reason for an equals method is because we can avoid doing
1688 * a full key comparison in some cases. If both cursors point into the
1689 * tree, take the fast path, otherwise fall back to the slower compare
1690 * method; in both cases, return 1 if the cursors are equal, 0 if they
1691 * are not.
1692 */
1693 if (F_ISSET(a, WT_CURSTD_KEY_INT) && F_ISSET(b, WT_CURSTD_KEY_INT))
1694 *equalp = __cursor_equals(a_arg, b_arg);
1695 else {
1696 WT_RET(__wt_btcur_compare(a_arg, b_arg, &cmp));
1697 *equalp = (cmp == 0) ? 1 : 0;
1698 }
1699 return (0);
1700 }
1701
1702 /*
1703 * __cursor_truncate --
1704 * Discard a cursor range from row-store or variable-width column-store
1705 * tree.
1706 */
1707 static int
__cursor_truncate(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop,int (* rmfunc)(WT_CURSOR_BTREE *,u_int))1708 __cursor_truncate(
1709 WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
1710 int (*rmfunc)(WT_CURSOR_BTREE *, u_int))
1711 {
1712 WT_DECL_RET;
1713 WT_SESSION_IMPL *session;
1714 uint64_t yield_count, sleep_usecs;
1715
1716 session = (WT_SESSION_IMPL *)start->iface.session;
1717 yield_count = sleep_usecs = 0;
1718
1719 /*
1720 * First, call the cursor search method to re-position the cursor: we
1721 * may not have a cursor position (if the higher-level truncate code
1722 * switched the cursors to have an "external" cursor key, and because
1723 * we don't save a copy of the page's write generation information,
1724 * which we need to remove records).
1725 *
1726 * Once that's done, we can delete records without a full search, unless
1727 * we encounter a restart error because the page was modified by some
1728 * other thread of control; in that case, repeat the full search to
1729 * refresh the page's modification information.
1730 *
1731 * If this is a row-store, we delete leaf pages having no overflow items
1732 * without reading them; for that to work, we have to ensure we read the
1733 * page referenced by the ending cursor, since we may be deleting only a
1734 * partial page at the end of the truncation. Our caller already fully
1735 * instantiated the end cursor, so we know that page is pinned in memory
1736 * and we can proceed without concern.
1737 */
1738 retry:
1739 WT_ERR(__wt_btcur_search(start));
1740 WT_ASSERT(session,
1741 F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
1742
1743 for (;;) {
1744 WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE));
1745
1746 if (stop != NULL && __cursor_equals(start, stop))
1747 return (0);
1748
1749 WT_ERR(__wt_btcur_next(start, true));
1750
1751 start->compare = 0; /* Exact match */
1752 }
1753
1754 err: if (ret == WT_RESTART) {
1755 __cursor_restart(session, &yield_count, &sleep_usecs);
1756 goto retry;
1757 }
1758
1759 WT_RET_NOTFOUND_OK(ret);
1760 return (0);
1761 }
1762
1763 /*
1764 * __cursor_truncate_fix --
1765 * Discard a cursor range from fixed-width column-store tree.
1766 */
1767 static int
__cursor_truncate_fix(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop,int (* rmfunc)(WT_CURSOR_BTREE *,u_int))1768 __cursor_truncate_fix(
1769 WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
1770 int (*rmfunc)(WT_CURSOR_BTREE *, u_int))
1771 {
1772 WT_DECL_RET;
1773 WT_SESSION_IMPL *session;
1774 uint64_t yield_count, sleep_usecs;
1775 const uint8_t *value;
1776
1777 session = (WT_SESSION_IMPL *)start->iface.session;
1778 yield_count = sleep_usecs = 0;
1779
1780 /*
1781 * Handle fixed-length column-store objects separately: for row-store
1782 * and variable-length column-store objects we have "deleted" values
1783 * and so returned objects actually exist: fixed-length column-store
1784 * objects are filled-in if they don't exist, that is, if you create
1785 * record 37, records 1-36 magically appear. Those records can't be
1786 * deleted, which means we have to ignore already "deleted" records.
1787 *
1788 * First, call the cursor search method to re-position the cursor: we
1789 * may not have a cursor position (if the higher-level truncate code
1790 * switched the cursors to have an "external" cursor key, and because
1791 * we don't save a copy of the page's write generation information,
1792 * which we need to remove records).
1793 *
1794 * Once that's done, we can delete records without a full search, unless
1795 * we encounter a restart error because the page was modified by some
1796 * other thread of control; in that case, repeat the full search to
1797 * refresh the page's modification information.
1798 */
1799 retry: WT_ERR(__wt_btcur_search(start));
1800 WT_ASSERT(session,
1801 F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
1802
1803 for (;;) {
1804 value = (const uint8_t *)start->iface.value.data;
1805 if (*value != 0)
1806 WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE));
1807
1808 if (stop != NULL && __cursor_equals(start, stop))
1809 return (0);
1810
1811 WT_ERR(__wt_btcur_next(start, true));
1812
1813 start->compare = 0; /* Exact match */
1814 }
1815
1816 err: if (ret == WT_RESTART) {
1817 __cursor_restart(session, &yield_count, &sleep_usecs);
1818 goto retry;
1819 }
1820
1821 WT_RET_NOTFOUND_OK(ret);
1822 return (0);
1823 }
1824
1825 /*
1826 * __wt_btcur_range_truncate --
1827 * Discard a cursor range from the tree.
1828 */
1829 int
__wt_btcur_range_truncate(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop)1830 __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
1831 {
1832 WT_BTREE *btree;
1833 WT_DECL_RET;
1834 WT_SESSION_IMPL *session;
1835
1836 session = (WT_SESSION_IMPL *)start->iface.session;
1837 btree = start->btree;
1838 WT_STAT_DATA_INCR(session, cursor_truncate);
1839
1840 WT_RET(__wt_txn_autocommit_check(session));
1841 /*
1842 * For recovery, log the start and stop keys for a truncate operation,
1843 * not the individual records removed. On the other hand, for rollback
1844 * we need to keep track of all the in-memory operations.
1845 *
1846 * We deal with this here by logging the truncate range first, then (in
1847 * the logging code) disabling writing of the in-memory remove records
1848 * to disk.
1849 */
1850 if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
1851 WT_RET(__wt_txn_truncate_log(session, start, stop));
1852
1853 switch (btree->type) {
1854 case BTREE_COL_FIX:
1855 WT_ERR(__cursor_truncate_fix(start, stop, __cursor_col_modify));
1856 break;
1857 case BTREE_COL_VAR:
1858 WT_ERR(__cursor_truncate(start, stop, __cursor_col_modify));
1859 break;
1860 case BTREE_ROW:
1861 /*
1862 * The underlying cursor comparison routine requires cursors be
1863 * fully instantiated when truncating row-store objects because
1864 * it's comparing page and/or skiplist positions, not keys. (Key
1865 * comparison would work, it's only that a key comparison would
1866 * be relatively expensive, especially with custom collators.
1867 * Column-store objects have record number keys, so the key
1868 * comparison is cheap.) The session truncate code did cursor
1869 * searches when setting up the truncate so we're good to go: if
1870 * that ever changes, we'd need to do something here to ensure a
1871 * fully instantiated cursor.
1872 */
1873 WT_ERR(__cursor_truncate(start, stop, __cursor_row_modify));
1874 break;
1875 }
1876
1877 err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
1878 __wt_txn_truncate_end(session);
1879 return (ret);
1880 }
1881
1882 /*
1883 * __wt_btcur_init --
1884 * Initialize a cursor used for internal purposes.
1885 */
1886 void
__wt_btcur_init(WT_SESSION_IMPL * session,WT_CURSOR_BTREE * cbt)1887 __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
1888 {
1889 memset(cbt, 0, sizeof(WT_CURSOR_BTREE));
1890
1891 cbt->iface.session = &session->iface;
1892 cbt->btree = S2BT(session);
1893 }
1894
1895 /*
1896 * __wt_btcur_open --
1897 * Open a btree cursor.
1898 */
1899 void
__wt_btcur_open(WT_CURSOR_BTREE * cbt)1900 __wt_btcur_open(WT_CURSOR_BTREE *cbt)
1901 {
1902 cbt->row_key = &cbt->_row_key;
1903 cbt->tmp = &cbt->_tmp;
1904
1905 #ifdef HAVE_DIAGNOSTIC
1906 cbt->lastkey = &cbt->_lastkey;
1907 cbt->lastrecno = WT_RECNO_OOB;
1908 #endif
1909 }
1910
1911 /*
1912 * __wt_btcur_close --
1913 * Close a btree cursor.
1914 */
1915 int
__wt_btcur_close(WT_CURSOR_BTREE * cbt,bool lowlevel)1916 __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
1917 {
1918 WT_DECL_RET;
1919 WT_SESSION_IMPL *session;
1920
1921 session = (WT_SESSION_IMPL *)cbt->iface.session;
1922
1923 /*
1924 * The in-memory split and lookaside table code creates low-level btree
1925 * cursors to search/modify leaf pages. Those cursors don't hold hazard
1926 * pointers, nor are they counted in the session handle's cursor count.
1927 * Skip the usual cursor tear-down in that case.
1928 */
1929 if (!lowlevel)
1930 ret = __cursor_reset(cbt);
1931
1932 __wt_buf_free(session, &cbt->_row_key);
1933 __wt_buf_free(session, &cbt->_tmp);
1934 #ifdef HAVE_DIAGNOSTIC
1935 __wt_buf_free(session, &cbt->_lastkey);
1936 #endif
1937
1938 return (ret);
1939 }
1940