1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * When returning an error, we need to restore the cursor to a valid state, the
13  * upper-level cursor code is likely to retry. This structure and the associated
14  * functions are used save and restore the cursor state.
15  */
16 typedef struct {
17 	WT_ITEM key;
18 	WT_ITEM value;
19 	uint64_t recno;
20 	uint32_t flags;
21 } WT_CURFILE_STATE;
22 
23 /*
24  * __cursor_state_save --
25  *	Save the cursor's external state.
26  */
27 static inline void
__cursor_state_save(WT_CURSOR * cursor,WT_CURFILE_STATE * state)28 __cursor_state_save(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
29 {
30 	WT_ITEM_SET(state->key, cursor->key);
31 	WT_ITEM_SET(state->value, cursor->value);
32 	state->recno = cursor->recno;
33 	state->flags = cursor->flags;
34 }
35 
36 /*
37  * __cursor_state_restore --
38  *	Restore the cursor's external state.
39  */
40 static inline void
__cursor_state_restore(WT_CURSOR * cursor,WT_CURFILE_STATE * state)41 __cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
42 {
43 	if (F_ISSET(state, WT_CURSTD_KEY_EXT))
44 		WT_ITEM_SET(cursor->key, state->key);
45 	if (F_ISSET(state, WT_CURSTD_VALUE_EXT))
46 		WT_ITEM_SET(cursor->value, state->value);
47 	cursor->recno = state->recno;
48 	F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
49 	F_SET(cursor, F_MASK(state, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT));
50 
51 }
52 
53 /*
54  * __cursor_page_pinned --
55  *	Return if we have a page pinned.
56  */
57 static inline bool
__cursor_page_pinned(WT_CURSOR_BTREE * cbt)58 __cursor_page_pinned(WT_CURSOR_BTREE *cbt)
59 {
60 	WT_CURSOR *cursor;
61 	WT_SESSION_IMPL *session;
62 	uint32_t current_state;
63 
64 	cursor = &cbt->iface;
65 	session = (WT_SESSION_IMPL *)cursor->session;
66 
67 	/*
68 	 * Check the page active flag, asserting the page reference with any
69 	 * external key.
70 	 */
71 	if (!F_ISSET(cbt, WT_CBT_ACTIVE)) {
72 		WT_ASSERT(session,
73 		    cbt->ref == NULL && !F_ISSET(cursor, WT_CURSTD_KEY_INT));
74 		return (false);
75 	}
76 
77 	/*
78 	 * Check if the key references the page. When returning from search,
79 	 * the page is active and the key is internal. After the application
80 	 * sets a key, the key is external, and the page is useless.
81 	 */
82 	if (!F_ISSET(cursor, WT_CURSTD_KEY_INT))
83 		return (false);
84 
85 	/*
86 	 * Fail if the page is flagged for forced eviction (so we periodically
87 	 * release pages grown too large).
88 	 */
89 	if (cbt->ref->page->read_gen == WT_READGEN_OLDEST)
90 		return (false);
91 
92 	/*
93 	 * If we are doing an update, we need a page with history, release the
94 	 * page so we get it again with history if required. Eviction may be
95 	 * locking the page, wait until we see a "normal" state and then test
96 	 * against that state (eviction may have already locked the page again).
97 	 */
98 	if (F_ISSET(&session->txn, WT_TXN_UPDATE)) {
99 		while ((current_state = cbt->ref->state) == WT_REF_LOCKED)
100 			__wt_yield();
101 		return (current_state == WT_REF_MEM);
102 	}
103 
104 	return (true);
105 }
106 
107 /*
108  * __cursor_size_chk --
109  *	Return if an inserted item is too large.
110  */
111 static inline int
__cursor_size_chk(WT_SESSION_IMPL * session,WT_ITEM * kv)112 __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
113 {
114 	WT_BM *bm;
115 	WT_BTREE *btree;
116 	WT_DECL_RET;
117 	size_t size;
118 
119 	btree = S2BT(session);
120 	bm = btree->bm;
121 
122 	if (btree->type == BTREE_COL_FIX) {
123 		/* Fixed-size column-stores take a single byte. */
124 		if (kv->size != 1)
125 			WT_RET_MSG(session, EINVAL,
126 			    "item size of %" WT_SIZET_FMT " does not match "
127 			    "fixed-length file requirement of 1 byte",
128 			    kv->size);
129 		return (0);
130 	}
131 
132 	/* Don't waste effort, 1GB is always cool. */
133 	if (kv->size <= WT_GIGABYTE)
134 		return (0);
135 
136 	/* Check what we are willing to store in the tree. */
137 	if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
138 		WT_RET_MSG(session, EINVAL,
139 		    "item size of %" WT_SIZET_FMT " exceeds the maximum "
140 		    "supported WiredTiger size of %" PRIu32,
141 		    kv->size, WT_BTREE_MAX_OBJECT_SIZE);
142 
143 	/* Check what the block manager can actually write. */
144 	size = kv->size;
145 	if ((ret = bm->write_size(bm, session, &size)) != 0)
146 		WT_RET_MSG(session, ret,
147 		    "item size of %" WT_SIZET_FMT " refused by block manager",
148 		    kv->size);
149 
150 	return (0);
151 }
152 
153 /*
154  * __cursor_disable_bulk --
155  *	Disable bulk loads into a tree.
156  */
157 static inline void
__cursor_disable_bulk(WT_SESSION_IMPL * session,WT_BTREE * btree)158 __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree)
159 {
160 	/*
161 	 * Once a tree (other than the LSM primary) is no longer empty, eviction
162 	 * should pay attention to it, and it's no longer possible to bulk-load
163 	 * into it.
164 	 */
165 	if (!btree->original)
166 		return;
167 	if (btree->lsm_primary) {
168 		btree->original = 0;		/* Make the next test faster. */
169 		return;
170 	}
171 
172 	/*
173 	 * We use a compare-and-swap here to avoid races among the first inserts
174 	 * into a tree.  Eviction is disabled when an empty tree is opened, and
175 	 * it must only be enabled once.
176 	 */
177 	if (__wt_atomic_cas8(&btree->original, 1, 0)) {
178 		btree->evict_disabled_open = false;
179 		__wt_evict_file_exclusive_off(session);
180 	}
181 }
182 
183 /*
184  * __cursor_fix_implicit --
185  *	Return if search went past the end of the tree.
186  */
187 static inline bool
__cursor_fix_implicit(WT_BTREE * btree,WT_CURSOR_BTREE * cbt)188 __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
189 {
190 	/*
191 	 * When there's no exact match, column-store search returns the key
192 	 * nearest the searched-for key (continuing past keys smaller than the
193 	 * searched-for key to return the next-largest key). Therefore, if the
194 	 * returned comparison is -1, the searched-for key was larger than any
195 	 * row on the page's standard information or column-store insert list.
196 	 *
197 	 * If the returned comparison is NOT -1, there was a row equal to or
198 	 * larger than the searched-for key, and we implicitly create missing
199 	 * rows.
200 	 */
201 	return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
202 }
203 
204 /*
205  * __wt_cursor_valid --
206  *	Return if the cursor references an valid key/value pair.
207  */
208 int
__wt_cursor_valid(WT_CURSOR_BTREE * cbt,WT_UPDATE ** updp,bool * valid)209 __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid)
210 {
211 	WT_BTREE *btree;
212 	WT_CELL *cell;
213 	WT_COL *cip;
214 	WT_PAGE *page;
215 	WT_SESSION_IMPL *session;
216 	WT_UPDATE *upd;
217 
218 	if (updp != NULL)
219 		*updp = NULL;
220 	*valid = false;
221 	btree = cbt->btree;
222 	page = cbt->ref->page;
223 	session = (WT_SESSION_IMPL *)cbt->iface.session;
224 
225 	/*
226 	 * We may be pointing to an insert object, and we may have a page with
227 	 * existing entries.  Insert objects always have associated update
228 	 * objects (the value).  Any update object may be deleted, or invisible
229 	 * to us.  In the case of an on-page entry, there is by definition a
230 	 * value that is visible to us, the original page cell.
231 	 *
232 	 * If we find a visible update structure, return our caller a reference
233 	 * to it because we don't want to repeatedly search for the update, it
234 	 * might suddenly become invisible (imagine a read-uncommitted session
235 	 * with another session's aborted insert), and we don't want to handle
236 	 * that potential error every time we look at the value.
237 	 *
238 	 * Unfortunately, the objects we might have and their relationships are
239 	 * different for the underlying page types.
240 	 *
241 	 * In the case of row-store, an insert object implies ignoring any page
242 	 * objects, no insert object can have the same key as an on-page object.
243 	 * For row-store:
244 	 *	if there's an insert object:
245 	 *		if there's a visible update:
246 	 *			exact match
247 	 *		else
248 	 *			no exact match
249 	 *	else
250 	 *		use the on-page object (which may have an associated
251 	 *		update object that may or may not be visible to us).
252 	 *
253 	 * Column-store is more complicated because an insert object can have
254 	 * the same key as an on-page object: updates to column-store rows
255 	 * are insert/object pairs, and an invisible update isn't the end as
256 	 * there may be an on-page object that is visible.  This changes the
257 	 * logic to:
258 	 *	if there's an insert object:
259 	 *		if there's a visible update:
260 	 *			exact match
261 	 *		else if the on-page object's key matches the insert key
262 	 *			use the on-page object
263 	 *	else
264 	 *		use the on-page object
265 	 *
266 	 * First, check for an insert object with a visible update (a visible
267 	 * update that's been deleted is not a valid key/value pair).
268 	 */
269 	if (cbt->ins != NULL) {
270 		WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
271 		if (upd != NULL) {
272 			if (upd->type == WT_UPDATE_TOMBSTONE)
273 				return (0);
274 			if (updp != NULL)
275 				*updp = upd;
276 			*valid = true;
277 			return (0);
278 		}
279 	}
280 
281 	/*
282 	 * If we don't have an insert object, or in the case of column-store,
283 	 * there's an insert object but no update was visible to us and the key
284 	 * on the page is the same as the insert object's key, and the slot as
285 	 * set by the search function is valid, we can use the original page
286 	 * information.
287 	 */
288 	switch (btree->type) {
289 	case BTREE_COL_FIX:
290 		/*
291 		 * If search returned an insert object, there may or may not be
292 		 * a matching on-page object, we have to check.  Fixed-length
293 		 * column-store pages don't have slots, but map one-to-one to
294 		 * keys, check for retrieval past the end of the page.
295 		 */
296 		if (cbt->recno >= cbt->ref->ref_recno + page->entries)
297 			return (0);
298 
299 		/*
300 		 * An update would have appeared as an "insert" object; no
301 		 * further checks to do.
302 		 */
303 		break;
304 	case BTREE_COL_VAR:
305 		/* The search function doesn't check for empty pages. */
306 		if (page->entries == 0)
307 			return (0);
308 		WT_ASSERT(session, cbt->slot < page->entries);
309 
310 		/*
311 		 * Column-store updates are stored as "insert" objects. If
312 		 * search returned an insert object we can't return, the
313 		 * returned on-page object must be checked for a match.
314 		 */
315 		if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
316 			return (0);
317 
318 		/*
319 		 * Although updates would have appeared as an "insert" objects,
320 		 * variable-length column store deletes are written into the
321 		 * backing store; check the cell for a record already deleted
322 		 * when read.
323 		 */
324 		cip = &page->pg_var[cbt->slot];
325 		if ((cell = WT_COL_PTR(page, cip)) == NULL ||
326 		    __wt_cell_type(cell) == WT_CELL_DEL)
327 			return (0);
328 		break;
329 	case BTREE_ROW:
330 		/* The search function doesn't check for empty pages. */
331 		if (page->entries == 0)
332 			return (0);
333 		WT_ASSERT(session, cbt->slot < page->entries);
334 
335 		/*
336 		 * See above: for row-store, no insert object can have the same
337 		 * key as an on-page object, we're done.
338 		 */
339 		if (cbt->ins != NULL)
340 			return (0);
341 
342 		/* Check for an update. */
343 		if (page->modify != NULL &&
344 		    page->modify->mod_row_update != NULL) {
345 			WT_RET(__wt_txn_read(session,
346 			    page->modify->mod_row_update[cbt->slot], &upd));
347 			if (upd != NULL) {
348 				if (upd->type == WT_UPDATE_TOMBSTONE)
349 					return (0);
350 				if (updp != NULL)
351 					*updp = upd;
352 			}
353 		}
354 		break;
355 	}
356 	*valid = true;
357 	return (0);
358 }
359 
360 /*
361  * __cursor_col_search --
362  *	Column-store search from a cursor.
363  */
364 static inline int
__cursor_col_search(WT_CURSOR_BTREE * cbt,WT_REF * leaf,bool * leaf_foundp)365 __cursor_col_search(WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool *leaf_foundp)
366 {
367 	WT_DECL_RET;
368 	WT_SESSION_IMPL *session;
369 
370 	session = (WT_SESSION_IMPL *)cbt->iface.session;
371 	WT_WITH_PAGE_INDEX(
372 	    session, ret = __wt_col_search(
373 		cbt, cbt->iface.recno, leaf, false, leaf_foundp));
374 	return (ret);
375 }
376 
377 /*
378  * __cursor_row_search --
379  *	Row-store search from a cursor.
380  */
381 static inline int
__cursor_row_search(WT_CURSOR_BTREE * cbt,bool insert,WT_REF * leaf,bool * leaf_foundp)382 __cursor_row_search(
383     WT_CURSOR_BTREE *cbt, bool insert, WT_REF *leaf, bool *leaf_foundp)
384 {
385 	WT_DECL_RET;
386 	WT_SESSION_IMPL *session;
387 
388 	session = (WT_SESSION_IMPL *)cbt->iface.session;
389 	WT_WITH_PAGE_INDEX(
390 	    session, ret = __wt_row_search(
391 	    cbt, &cbt->iface.key, insert, leaf, false, leaf_foundp));
392 	return (ret);
393 }
394 
395 /*
396  * __cursor_col_modify_v --
397  *	Column-store modify from a cursor, with a separate value.
398  */
399 static inline int
__cursor_col_modify_v(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)400 __cursor_col_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
401 {
402 	return (__wt_col_modify(
403 	    cbt, cbt->iface.recno, value, NULL, modify_type, false));
404 }
405 
406 /*
407  * __cursor_row_modify_v --
408  *	Row-store modify from a cursor, with a separate value.
409  */
410 static inline int
__cursor_row_modify_v(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)411 __cursor_row_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
412 {
413 	return (__wt_row_modify(
414 	    cbt, &cbt->iface.key, value, NULL, modify_type, false));
415 }
416 
417 /*
418  * __cursor_col_modify --
419  *	Column-store modify from a cursor.
420  */
421 static inline int
__cursor_col_modify(WT_CURSOR_BTREE * cbt,u_int modify_type)422 __cursor_col_modify(WT_CURSOR_BTREE *cbt, u_int modify_type)
423 {
424 	return (__wt_col_modify(
425 	    cbt, cbt->iface.recno, &cbt->iface.value,
426 	    NULL, modify_type, false));
427 }
428 
429 /*
430  * __cursor_row_modify --
431  *	Row-store modify from a cursor.
432  */
433 static inline int
__cursor_row_modify(WT_CURSOR_BTREE * cbt,u_int modify_type)434 __cursor_row_modify(WT_CURSOR_BTREE *cbt, u_int modify_type)
435 {
436 	return (__wt_row_modify(
437 	    cbt, &cbt->iface.key, &cbt->iface.value,
438 	    NULL, modify_type, false));
439 }
440 
441 /*
442  * __cursor_restart --
443  *	Common cursor restart handling.
444  */
445 static void
__cursor_restart(WT_SESSION_IMPL * session,uint64_t * yield_count,uint64_t * sleep_usecs)446 __cursor_restart(
447     WT_SESSION_IMPL *session, uint64_t *yield_count, uint64_t *sleep_usecs)
448 {
449 	__wt_spin_backoff(yield_count, sleep_usecs);
450 
451 	WT_STAT_CONN_INCR(session, cursor_restart);
452 	WT_STAT_DATA_INCR(session, cursor_restart);
453 }
454 
455 /*
456  * __wt_btcur_reset --
457  *	Invalidate the cursor position.
458  */
459 int
__wt_btcur_reset(WT_CURSOR_BTREE * cbt)460 __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
461 {
462 	WT_CURSOR *cursor;
463 	WT_SESSION_IMPL *session;
464 
465 	cursor = &cbt->iface;
466 	session = (WT_SESSION_IMPL *)cbt->iface.session;
467 
468 	WT_STAT_CONN_INCR(session, cursor_reset);
469 	WT_STAT_DATA_INCR(session, cursor_reset);
470 
471 	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
472 
473 	return (__cursor_reset(cbt));
474 }
475 
476 /*
477  * __wt_btcur_search --
478  *	Search for a matching record in the tree.
479  */
480 int
__wt_btcur_search(WT_CURSOR_BTREE * cbt)481 __wt_btcur_search(WT_CURSOR_BTREE *cbt)
482 {
483 	WT_BTREE *btree;
484 	WT_CURFILE_STATE state;
485 	WT_CURSOR *cursor;
486 	WT_DECL_RET;
487 	WT_SESSION_IMPL *session;
488 	WT_UPDATE *upd;
489 	bool leaf_found, valid;
490 
491 	btree = cbt->btree;
492 	cursor = &cbt->iface;
493 	session = (WT_SESSION_IMPL *)cursor->session;
494 	upd = NULL;					/* -Wuninitialized */
495 
496 	WT_STAT_CONN_INCR(session, cursor_search);
497 	WT_STAT_DATA_INCR(session, cursor_search);
498 
499 	WT_RET(__wt_txn_search_check(session));
500 	__cursor_state_save(cursor, &state);
501 
502 	/*
503 	 * The pinned page goes away if we search the tree, get a local copy of
504 	 * any pinned key and discard any pinned value, then re-save the cursor
505 	 * state. Done before searching pinned pages (unlike other cursor
506 	 * functions), because we don't anticipate applications searching for a
507 	 * key they currently have pinned.)
508 	 */
509 	WT_ERR(__cursor_localkey(cursor));
510 	__cursor_novalue(cursor);
511 	__cursor_state_save(cursor, &state);
512 
513 	/*
514 	 * If we have a page pinned, search it; if we don't have a page pinned,
515 	 * or the search of the pinned page doesn't find an exact match, search
516 	 * from the root.
517 	 */
518 	valid = false;
519 	if (__cursor_page_pinned(cbt)) {
520 		__wt_txn_cursor_op(session);
521 
522 		WT_ERR(btree->type == BTREE_ROW ?
523 		    __cursor_row_search(cbt, false, cbt->ref, &leaf_found) :
524 		    __cursor_col_search(cbt, cbt->ref, &leaf_found));
525 
526 		/* Return, if prepare conflict encountered. */
527 		if (leaf_found && cbt->compare == 0)
528 			WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
529 	}
530 	if (!valid) {
531 		WT_ERR(__cursor_func_init(cbt, true));
532 
533 		WT_ERR(btree->type == BTREE_ROW ?
534 		    __cursor_row_search(cbt, false, NULL, NULL) :
535 		    __cursor_col_search(cbt, NULL, NULL));
536 
537 		/* Return, if prepare conflict encountered. */
538 		if (cbt->compare == 0)
539 			WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
540 	}
541 
542 	if (valid)
543 		ret = __cursor_kv_return(cbt, upd);
544 	else if (__cursor_fix_implicit(btree, cbt)) {
545 		/*
546 		 * Creating a record past the end of the tree in a fixed-length
547 		 * column-store implicitly fills the gap with empty records.
548 		 */
549 		cbt->recno = cursor->recno;
550 		cbt->v = 0;
551 		cursor->value.data = &cbt->v;
552 		cursor->value.size = 1;
553 		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
554 		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
555 	} else
556 		ret = WT_NOTFOUND;
557 
558 #ifdef HAVE_DIAGNOSTIC
559 	if (ret == 0)
560 		WT_ERR(__wt_cursor_key_order_init(cbt));
561 #endif
562 
563 err:	if (ret != 0) {
564 		WT_TRET(__cursor_reset(cbt));
565 		__cursor_state_restore(cursor, &state);
566 	}
567 	return (ret);
568 }
569 
570 /*
571  * __wt_btcur_search_near --
572  *	Search for a record in the tree.
573  */
574 int
__wt_btcur_search_near(WT_CURSOR_BTREE * cbt,int * exactp)575 __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
576 {
577 	WT_BTREE *btree;
578 	WT_CURFILE_STATE state;
579 	WT_CURSOR *cursor;
580 	WT_DECL_RET;
581 	WT_SESSION_IMPL *session;
582 	WT_UPDATE *upd;
583 	int exact;
584 	bool leaf_found, valid;
585 
586 	btree = cbt->btree;
587 	cursor = &cbt->iface;
588 	session = (WT_SESSION_IMPL *)cursor->session;
589 	upd = NULL;					/* -Wuninitialized */
590 	exact = 0;
591 
592 	WT_STAT_CONN_INCR(session, cursor_search_near);
593 	WT_STAT_DATA_INCR(session, cursor_search_near);
594 
595 	WT_RET(__wt_txn_search_check(session));
596 	__cursor_state_save(cursor, &state);
597 
598 	/*
599 	 * The pinned page goes away if we search the tree, get a local copy of
600 	 * any pinned key and discard any pinned value, then re-save the cursor
601 	 * state. Done before searching pinned pages (unlike other cursor
602 	 * functions), because we don't anticipate applications searching for a
603 	 * key they currently have pinned.)
604 	 */
605 	WT_ERR(__cursor_localkey(cursor));
606 	__cursor_novalue(cursor);
607 	__cursor_state_save(cursor, &state);
608 
609 	/*
610 	 * If we have a row-store page pinned, search it; if we don't have a
611 	 * page pinned, or the search of the pinned page doesn't find an exact
612 	 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
613 	 * pages in the case of column-store, search-near isn't an interesting
614 	 * enough case for column-store to add the complexity needed to avoid
615 	 * the tree search.
616 	 */
617 	valid = false;
618 	if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
619 		__wt_txn_cursor_op(session);
620 		/*
621 		 * Set the "insert" flag for the btree row-store search; we may
622 		 * intend to position the cursor at the end of the tree, rather
623 		 * than match an existing record.
624 		 */
625 		WT_ERR(__cursor_row_search(cbt, true, cbt->ref, &leaf_found));
626 
627 		/*
628 		 * Only use the pinned page search results if search returns an
629 		 * exact match or a slot other than the page's boundary slots,
630 		 * if that's not the case, a neighbor page might offer a better
631 		 * match. This test is simplistic as we're ignoring append
632 		 * lists (there may be no page slots or we might be
633 		 * legitimately positioned after the last page slot). Ignore
634 		 * those cases, it makes things too complicated.
635 		 */
636 		if (leaf_found &&
637 		    (cbt->compare == 0 ||
638 		    (cbt->slot != 0 &&
639 		    cbt->slot != cbt->ref->page->entries - 1)))
640 			WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
641 	}
642 	if (!valid) {
643 		WT_ERR(__cursor_func_init(cbt, true));
644 
645 		/*
646 		 * Set the "insert" flag for the btree row-store search; we may
647 		 * intend to position the cursor at the end of the tree, rather
648 		 * than match an existing record.
649 		 */
650 		WT_ERR(btree->type == BTREE_ROW ?
651 		    __cursor_row_search(cbt, true, NULL, NULL) :
652 		    __cursor_col_search(cbt, NULL, NULL));
653 		WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
654 	}
655 
656 	/*
657 	 * If we find a valid key, return it.
658 	 *
659 	 * Else, creating a record past the end of the tree in a fixed-length
660 	 * column-store implicitly fills the gap with empty records.  In this
661 	 * case, we instantiate the empty record, it's an exact match.
662 	 *
663 	 * Else, move to the next key in the tree (bias for prefix searches).
664 	 * Cursor next skips invalid rows, so we don't have to test for them
665 	 * again.
666 	 *
667 	 * Else, redo the search and move to the previous key in the tree.
668 	 * Cursor previous skips invalid rows, so we don't have to test for
669 	 * them again.
670 	 *
671 	 * If that fails, quit, there's no record to return.
672 	 */
673 	if (valid) {
674 		exact = cbt->compare;
675 		ret = __cursor_kv_return(cbt, upd);
676 	} else if (__cursor_fix_implicit(btree, cbt)) {
677 		cbt->recno = cursor->recno;
678 		cbt->v = 0;
679 		cursor->value.data = &cbt->v;
680 		cursor->value.size = 1;
681 		exact = 0;
682 		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
683 		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
684 	} else {
685 		/*
686 		 * We didn't find an exact match: try after the search key,
687 		 * then before.  We have to loop here because at low isolation
688 		 * levels, new records could appear as we are stepping through
689 		 * the tree.
690 		 */
691 		while ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) {
692 			WT_ERR(ret);
693 			if (btree->type == BTREE_ROW)
694 				WT_ERR(__wt_compare(session, btree->collator,
695 				    &cursor->key, &state.key, &exact));
696 			else
697 				exact = cbt->recno < state.recno ? -1 :
698 				    cbt->recno == state.recno ? 0 : 1;
699 			if (exact >= 0)
700 				goto done;
701 		}
702 
703 		/*
704 		 * We walked to the end of the tree without finding a match.
705 		 * Walk backwards instead.
706 		 */
707 		while ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) {
708 			WT_ERR(ret);
709 			if (btree->type == BTREE_ROW)
710 				WT_ERR(__wt_compare(session, btree->collator,
711 				    &cursor->key, &state.key, &exact));
712 			else
713 				exact = cbt->recno < state.recno ? -1 :
714 				    cbt->recno == state.recno ? 0 : 1;
715 			if (exact <= 0)
716 				goto done;
717 		}
718 	}
719 
720 done:
721 err:	if (ret == 0 && exactp != NULL)
722 		*exactp = exact;
723 
724 #ifdef HAVE_DIAGNOSTIC
725 	if (ret == 0)
726 		WT_TRET(__wt_cursor_key_order_init(cbt));
727 #endif
728 
729 	if (ret != 0) {
730 		WT_TRET(__cursor_reset(cbt));
731 		__cursor_state_restore(cursor, &state);
732 	}
733 	return (ret);
734 }
735 
736 /*
737  * __wt_btcur_insert --
738  *	Insert a record into the tree.
739  */
740 int
__wt_btcur_insert(WT_CURSOR_BTREE * cbt)741 __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
742 {
743 	WT_BTREE *btree;
744 	WT_CURFILE_STATE state;
745 	WT_CURSOR *cursor;
746 	WT_DECL_RET;
747 	WT_SESSION_IMPL *session;
748 	uint64_t yield_count, sleep_usecs;
749 	bool append_key, valid;
750 
751 	btree = cbt->btree;
752 	cursor = &cbt->iface;
753 	session = (WT_SESSION_IMPL *)cursor->session;
754 	yield_count = sleep_usecs = 0;
755 
756 	WT_STAT_CONN_INCR(session, cursor_insert);
757 	WT_STAT_DATA_INCR(session, cursor_insert);
758 	WT_STAT_DATA_INCRV(session,
759 	    cursor_insert_bytes, cursor->key.size + cursor->value.size);
760 
761 	if (btree->type == BTREE_ROW)
762 		WT_RET(__cursor_size_chk(session, &cursor->key));
763 	WT_RET(__cursor_size_chk(session, &cursor->value));
764 
765 	/* It's no longer possible to bulk-load into the tree. */
766 	__cursor_disable_bulk(session, btree);
767 
768 	/*
769 	 * Insert a new record if WT_CURSTD_APPEND configured, (ignoring any
770 	 * application set record number). Although append can't be configured
771 	 * for a row-store, this code would break if it were, and that's owned
772 	 * by the upper cursor layer, be cautious.
773 	 */
774 	append_key =
775 	    F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW;
776 
777 	/* Save the cursor state. */
778 	__cursor_state_save(cursor, &state);
779 
780 	/*
781 	 * If inserting with overwrite configured, and positioned to an on-page
782 	 * key, the update doesn't require another search. Cursors configured
783 	 * for append aren't included, regardless of whether or not they meet
784 	 * all other criteria.
785 	 *
786 	 * Fixed-length column store can never use a positioned cursor to update
787 	 * because the cursor may not be positioned to the correct record in the
788 	 * case of implicit records in the append list.
789 	 */
790 	if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt) &&
791 	    F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) {
792 		WT_ERR(__wt_txn_autocommit_check(session));
793 		/*
794 		 * The cursor position may not be exact (the cursor's comparison
795 		 * value not equal to zero). Correct to an exact match so we can
796 		 * update whatever we're pointing at.
797 		 */
798 		cbt->compare = 0;
799 		ret = btree->type == BTREE_ROW ?
800 		    __cursor_row_modify(cbt, WT_UPDATE_STANDARD) :
801 		    __cursor_col_modify(cbt, WT_UPDATE_STANDARD);
802 		if (ret == 0)
803 			goto done;
804 
805 		/*
806 		 * The pinned page goes away if we fail for any reason, get a
807 		 * local copy of any pinned key or value. (Restart could still
808 		 * use the pinned page, but that's an unlikely path.) Re-save
809 		 * the cursor state: we may retry but eventually fail.
810 		 */
811 		WT_TRET(__cursor_localkey(cursor));
812 		WT_TRET(__cursor_localvalue(cursor));
813 		__cursor_state_save(cursor, &state);
814 		goto err;
815 	}
816 
817 	/*
818 	 * The pinned page goes away if we do a search, get a local copy of any
819 	 * pinned key or value. Re-save the cursor state: we may retry but
820 	 * eventually fail.
821 	 */
822 	WT_ERR(__cursor_localkey(cursor));
823 	WT_ERR(__cursor_localvalue(cursor));
824 	__cursor_state_save(cursor, &state);
825 
826 retry:	WT_ERR(__cursor_func_init(cbt, true));
827 
828 	if (btree->type == BTREE_ROW) {
829 		WT_ERR(__cursor_row_search(cbt, true, NULL, NULL));
830 		/*
831 		 * If not overwriting, fail if the key exists, else insert the
832 		 * key/value pair.
833 		 */
834 		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
835 		    cbt->compare == 0) {
836 			WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
837 			if (valid)
838 				WT_ERR(WT_DUPLICATE_KEY);
839 		}
840 
841 		ret = __cursor_row_modify(cbt, WT_UPDATE_STANDARD);
842 	} else if (append_key) {
843 		/*
844 		 * Optionally insert a new record (ignoring the application's
845 		 * record number). The real record number is allocated by the
846 		 * serialized append operation.
847 		 */
848 		cbt->iface.recno = WT_RECNO_OOB;
849 		cbt->compare = 1;
850 		WT_ERR(__cursor_col_search(cbt, NULL, NULL));
851 		WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD));
852 		cursor->recno = cbt->recno;
853 	} else {
854 		WT_ERR(__cursor_col_search(cbt, NULL, NULL));
855 
856 		/*
857 		 * If not overwriting, fail if the key exists.  Creating a
858 		 * record past the end of the tree in a fixed-length
859 		 * column-store implicitly fills the gap with empty records.
860 		 * Fail in that case, the record exists.
861 		 */
862 		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
863 			if (cbt->compare == 0) {
864 				WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
865 				if (valid)
866 					WT_ERR(WT_DUPLICATE_KEY);
867 			} else if (__cursor_fix_implicit(btree, cbt))
868 				WT_ERR(WT_DUPLICATE_KEY);
869 		}
870 
871 		WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD));
872 	}
873 
874 err:	if (ret == WT_RESTART) {
875 		__cursor_restart(session, &yield_count, &sleep_usecs);
876 		goto retry;
877 	}
878 
879 	/* Insert doesn't maintain a position across calls, clear resources. */
880 	if (ret == 0) {
881 done:		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
882 		if (append_key)
883 			F_SET(cursor, WT_CURSTD_KEY_EXT);
884 	}
885 	WT_TRET(__cursor_reset(cbt));
886 	if (ret != 0)
887 		__cursor_state_restore(cursor, &state);
888 
889 	return (ret);
890 }
891 
892 /*
893  * __curfile_update_check --
894  *	Check whether an update would conflict.
895  *
896  *	This function expects the cursor to already be positioned.  It should
897  *	be called before deciding whether to skip an update operation based on
898  *	existence of a visible update for a key -- even if there is no value
899  *	visible to the transaction, an update could still conflict.
900  */
901 static int
__curfile_update_check(WT_CURSOR_BTREE * cbt)902 __curfile_update_check(WT_CURSOR_BTREE *cbt)
903 {
904 	WT_BTREE *btree;
905 	WT_SESSION_IMPL *session;
906 
907 	btree = cbt->btree;
908 	session = (WT_SESSION_IMPL *)cbt->iface.session;
909 
910 	if (cbt->compare != 0)
911 		return (0);
912 	if (cbt->ins != NULL)
913 		return (__wt_txn_update_check(session, cbt->ins->upd));
914 
915 	if (btree->type == BTREE_ROW &&
916 	    cbt->ref->page->modify != NULL &&
917 	    cbt->ref->page->modify->mod_row_update != NULL)
918 		return (__wt_txn_update_check(session,
919 		    cbt->ref->page->modify->mod_row_update[cbt->slot]));
920 	return (0);
921 }
922 
923 /*
924  * __wt_btcur_insert_check --
925  *	Check whether an update would conflict.
926  *
927  * This can replace WT_CURSOR::insert, so it only checks for conflicts without
928  * updating the tree. It is used to maintain snapshot isolation for transactions
929  * that span multiple chunks in an LSM tree.
930  */
931 int
__wt_btcur_insert_check(WT_CURSOR_BTREE * cbt)932 __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
933 {
934 	WT_CURSOR *cursor;
935 	WT_DECL_RET;
936 	WT_SESSION_IMPL *session;
937 	uint64_t yield_count, sleep_usecs;
938 
939 	cursor = &cbt->iface;
940 	session = (WT_SESSION_IMPL *)cursor->session;
941 	yield_count = sleep_usecs = 0;
942 
943 	WT_ASSERT(session, cbt->btree->type == BTREE_ROW);
944 
945 	/*
946 	 * The pinned page goes away if we do a search, get a local copy of any
947 	 * pinned key and discard any pinned value. Unlike most of the btree
948 	 * cursor routines, we don't have to save/restore the cursor key state,
949 	 * none of the work done here changes the cursor state.
950 	 */
951 	WT_ERR(__cursor_localkey(cursor));
952 	__cursor_novalue(cursor);
953 
954 retry:	WT_ERR(__cursor_func_init(cbt, true));
955 	WT_ERR(__cursor_row_search(cbt, true, NULL, NULL));
956 
957 	/* Just check for conflicts. */
958 	ret = __curfile_update_check(cbt);
959 
960 err:	if (ret == WT_RESTART) {
961 		__cursor_restart(session, &yield_count, &sleep_usecs);
962 		goto retry;
963 	}
964 
965 	/* Insert doesn't maintain a position across calls, clear resources. */
966 	if (ret == 0)
967 		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
968 	WT_TRET(__cursor_reset(cbt));
969 
970 	return (ret);
971 }
972 
973 /*
974  * __wt_btcur_remove --
975  *	Remove a record from the tree.
976  */
977 int
__wt_btcur_remove(WT_CURSOR_BTREE * cbt)978 __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
979 {
980 	enum { NO_POSITION, POSITIONED, SEARCH_POSITION } positioned;
981 	WT_BTREE *btree;
982 	WT_CURFILE_STATE state;
983 	WT_CURSOR *cursor;
984 	WT_DECL_RET;
985 	WT_SESSION_IMPL *session;
986 	uint64_t yield_count, sleep_usecs;
987 	bool iterating, valid;
988 
989 	btree = cbt->btree;
990 	cursor = &cbt->iface;
991 	session = (WT_SESSION_IMPL *)cursor->session;
992 	yield_count = sleep_usecs = 0;
993 	iterating = F_ISSET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
994 
995 	WT_STAT_CONN_INCR(session, cursor_remove);
996 	WT_STAT_DATA_INCR(session, cursor_remove);
997 	WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
998 
999 	/*
1000 	 * WT_CURSOR.remove has a unique semantic, the cursor stays positioned
1001 	 * if it starts positioned, otherwise clear the cursor on completion.
1002 	 *
1003 	 * However, if we unpin the page (because the page is in WT_REF_LIMBO or
1004 	 * it was selected for forcible eviction), and every item on the page is
1005 	 * deleted, eviction can delete the page and our subsequent search will
1006 	 * re-instantiate an empty page for us, with no key/value pairs. Cursor
1007 	 * remove will search that page and return not-found, which is OK unless
1008 	 * cursor-overwrite is configured (which causes cursor remove to return
1009 	 * success even if there's no item to delete). In that case, we're
1010 	 * supposed to return a positioned cursor, but there's nothing to which
1011 	 * we can position, and we'll fail attempting to point the cursor at the
1012 	 * key on the page to satisfy the positioned requirement.
1013 	 *
1014 	 * Do the best we can: If we start with a positioned cursor, and we let
1015 	 * go of our pinned page, reset our state to use the search position,
1016 	 * that is, use a successful search to return to a "positioned" state.
1017 	 * If we start with a positioned cursor, let go of our pinned page, and
1018 	 * the search fails, leave the cursor's key set so the cursor appears
1019 	 * positioned to the application.
1020 	 */
1021 	positioned =
1022 	    F_ISSET(cursor, WT_CURSTD_KEY_INT) ? POSITIONED : NO_POSITION;
1023 
1024 	/* Save the cursor state. */
1025 	__cursor_state_save(cursor, &state);
1026 
1027 	/*
1028 	 * If remove positioned to an on-page key, the remove doesn't require
1029 	 * another search. We don't care about the "overwrite" configuration
1030 	 * because regardless of the overwrite setting, any existing record is
1031 	 * removed, and the record must exist with a positioned cursor.
1032 	 *
1033 	 * There's trickiness in the page-pinned check. By definition a remove
1034 	 * operation leaves a cursor positioned if it's initially positioned.
1035 	 * However, if every item on the page is deleted and we unpin the page,
1036 	 * eviction might delete the page and our search will re-instantiate an
1037 	 * empty page for us. Cursor remove returns not-found whether or not
1038 	 * that eviction/deletion happens and it's OK unless cursor-overwrite
1039 	 * is configured (which means we return success even if there's no item
1040 	 * to delete). In that case, we'll fail when we try to point the cursor
1041 	 * at the key on the page to satisfy the positioned requirement. It's
1042 	 * arguably safe to simply leave the key initialized in the cursor (as
1043 	 * that's all a positioned cursor implies), but it's probably safer to
1044 	 * avoid page eviction entirely in the positioned case.
1045 	 *
1046 	 * Fixed-length column store can never use a positioned cursor to update
1047 	 * because the cursor may not be positioned to the correct record in the
1048 	 * case of implicit records in the append list.
1049 	 */
1050 	if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
1051 		WT_ERR(__wt_txn_autocommit_check(session));
1052 
1053 		/*
1054 		 * The cursor position may not be exact (the cursor's comparison
1055 		 * value not equal to zero). Correct to an exact match so we can
1056 		 * remove whatever we're pointing at.
1057 		 */
1058 		cbt->compare = 0;
1059 		ret = btree->type == BTREE_ROW ?
1060 		    __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE) :
1061 		    __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE);
1062 		if (ret == 0)
1063 			goto done;
1064 		goto err;
1065 	}
1066 
1067 	/*
1068 	 * The pinned page goes away if we do a search, including as a result of
1069 	 * a restart. Get a local copy of any pinned key and re-save the cursor
1070 	 * state: we may retry but eventually fail.
1071 	 *
1072 	 * Note these steps must be repeatable, we'll continue to take this path
1073 	 * as long as we encounter WT_RESTART.
1074 	 */
1075 retry:	if (positioned == POSITIONED)
1076 		positioned = SEARCH_POSITION;
1077 	WT_ERR(__cursor_localkey(cursor));
1078 	__cursor_state_save(cursor, &state);
1079 
1080 	WT_ERR(__cursor_func_init(cbt, true));
1081 
1082 	if (btree->type == BTREE_ROW) {
1083 		WT_ERR(__cursor_row_search(cbt, false, NULL, NULL));
1084 
1085 		/* Check whether an update would conflict. */
1086 		WT_ERR(__curfile_update_check(cbt));
1087 
1088 		if (cbt->compare != 0)
1089 			WT_ERR(WT_NOTFOUND);
1090 		WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1091 		if (!valid)
1092 			WT_ERR(WT_NOTFOUND);
1093 
1094 		ret = __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE);
1095 	} else {
1096 		WT_ERR(__cursor_col_search(cbt, NULL, NULL));
1097 
1098 		/*
1099 		 * If we find a matching record, check whether an update would
1100 		 * conflict.  Do this before checking if the update is visible
1101 		 * in __wt_cursor_valid, or we can miss conflict.
1102 		 */
1103 		WT_ERR(__curfile_update_check(cbt));
1104 
1105 		/* Remove the record if it exists. */
1106 		valid = false;
1107 		if (cbt->compare == 0)
1108 			WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1109 		if (cbt->compare != 0 || !valid) {
1110 			if (!__cursor_fix_implicit(btree, cbt))
1111 				WT_ERR(WT_NOTFOUND);
1112 			/*
1113 			 * Creating a record past the end of the tree in a
1114 			 * fixed-length column-store implicitly fills the
1115 			 * gap with empty records.  Return success in that
1116 			 * case, the record was deleted successfully.
1117 			 *
1118 			 * Correct the btree cursor's location: the search
1119 			 * will have pointed us at the previous/next item,
1120 			 * and that's not correct.
1121 			 */
1122 			cbt->recno = cursor->recno;
1123 		} else
1124 			ret = __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE);
1125 	}
1126 
1127 err:	if (ret == WT_RESTART) {
1128 		__cursor_restart(session, &yield_count, &sleep_usecs);
1129 		goto retry;
1130 	}
1131 
1132 	if (ret == 0) {
1133 done:		switch (positioned) {
1134 		case NO_POSITION:
1135 			/*
1136 			 * Never positioned and we leave it that way, clear any
1137 			 * key and reset the cursor.
1138 			 */
1139 			F_CLR(cursor, WT_CURSTD_KEY_SET);
1140 			WT_TRET(__cursor_reset(cbt));
1141 			break;
1142 		case POSITIONED:
1143 			/*
1144 			 * Positioned and we used the pinned page, leave the key
1145 			 * alone, whatever it is.
1146 			 */
1147 			break;
1148 		case SEARCH_POSITION:
1149 			/*
1150 			 * Positioned and we did a search anyway, get a key to
1151 			 * return.
1152 			 */
1153 			WT_TRET(__wt_key_return(cbt));
1154 			break;
1155 		}
1156 	}
1157 
1158 	if (ret != 0) {
1159 		WT_TRET(__cursor_reset(cbt));
1160 		__cursor_state_restore(cursor, &state);
1161 
1162 		/*
1163 		 * If the record isn't found and the cursor is configured for
1164 		 * overwrite, that is what we want, try to return success.
1165 		 *
1166 		 * We set the return to 0 after testing for success, the clause
1167 		 * above dealing with the cursor position is only correct if we
1168 		 * were successful. If search failed after positioned is set to
1169 		 * SEARCH_POSITION, we cannot return a key. The only action to
1170 		 * take is to set the cursor to its original key, which we just
1171 		 * did.
1172 		 *
1173 		 * Finally, if an iterating or positioned cursor was forced to
1174 		 * give up its pinned page and then a search failed, we've
1175 		 * lost our cursor position. Since no subsequent iteration can
1176 		 * succeed, we cannot return success.
1177 		 */
1178 		if (ret == WT_NOTFOUND &&
1179 		    F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
1180 		    !iterating && positioned == NO_POSITION)
1181 			ret = 0;
1182 	}
1183 
1184 	/*
1185 	 * Upper level cursor removes don't expect the cursor value to be set
1186 	 * after a successful remove (and check in diagnostic mode). Error
1187 	 * handling may have converted failure to a success, do a final check.
1188 	 */
1189 	if (ret == 0)
1190 		F_CLR(cursor, WT_CURSTD_VALUE_SET);
1191 
1192 	return (ret);
1193 }
1194 
1195 /*
1196  * __btcur_update --
1197  *	Update a record in the tree.
1198  */
1199 static int
__btcur_update(WT_CURSOR_BTREE * cbt,WT_ITEM * value,u_int modify_type)1200 __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
1201 {
1202 	WT_BTREE *btree;
1203 	WT_CURFILE_STATE state;
1204 	WT_CURSOR *cursor;
1205 	WT_DECL_RET;
1206 	WT_SESSION_IMPL *session;
1207 	uint64_t yield_count, sleep_usecs;
1208 	bool leaf_found, valid;
1209 
1210 	btree = cbt->btree;
1211 	cursor = &cbt->iface;
1212 	session = (WT_SESSION_IMPL *)cursor->session;
1213 	yield_count = sleep_usecs = 0;
1214 
1215 	/* It's no longer possible to bulk-load into the tree. */
1216 	__cursor_disable_bulk(session, btree);
1217 
1218 	/* Save the cursor state. */
1219 	__cursor_state_save(cursor, &state);
1220 
1221 	/*
1222 	 * If update positioned to an on-page key, the update doesn't require
1223 	 * another search. We don't care about the "overwrite" configuration
1224 	 * because regardless of the overwrite setting, any existing record is
1225 	 * updated, and the record must exist with a positioned cursor.
1226 	 *
1227 	 * Fixed-length column store can never use a positioned cursor to update
1228 	 * because the cursor may not be positioned to the correct record in the
1229 	 * case of implicit records in the append list.
1230 	 */
1231 	if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
1232 		WT_ERR(__wt_txn_autocommit_check(session));
1233 
1234 		/*
1235 		 * The cursor position may not be exact (the cursor's comparison
1236 		 * value not equal to zero). Correct to an exact match so we can
1237 		 * update whatever we're pointing at.
1238 		 */
1239 		cbt->compare = 0;
1240 		ret = btree->type == BTREE_ROW ?
1241 		    __cursor_row_modify_v(cbt, value, modify_type) :
1242 		    __cursor_col_modify_v(cbt, value, modify_type);
1243 		if (ret == 0)
1244 			goto done;
1245 
1246 		/*
1247 		 * The pinned page goes away if we fail for any reason, get a
1248 		 * a local copy of any pinned key or value. (Restart could still
1249 		 * use the pinned page, but that's an unlikely path.) Re-save
1250 		 * the cursor state: we may retry but eventually fail.
1251 		 */
1252 		WT_TRET(__cursor_localkey(cursor));
1253 		WT_TRET(__cursor_localvalue(cursor));
1254 		__cursor_state_save(cursor, &state);
1255 		goto err;
1256 	}
1257 
1258 	/*
1259 	 * The pinned page goes away if we do a search, get a local copy of any
1260 	 * pinned key or value. Re-save the cursor state: we may retry but
1261 	 * eventually fail.
1262 	 */
1263 	WT_ERR(__cursor_localkey(cursor));
1264 	WT_ERR(__cursor_localvalue(cursor));
1265 	__cursor_state_save(cursor, &state);
1266 
1267 	/*
1268 	 * If our caller configures for a local search and we have a page
1269 	 * pinned, do that search.
1270 	 */
1271 	if (F_ISSET(cursor, WT_CURSTD_UPDATE_LOCAL)
1272 	    && __cursor_page_pinned(cbt)) {
1273 		__wt_txn_cursor_op(session);
1274 		WT_ERR(__wt_txn_autocommit_check(session));
1275 		WT_ERR(btree->type == BTREE_ROW ?
1276 		    __cursor_row_search(cbt, true, cbt->ref, &leaf_found) :
1277 		    __cursor_col_search(cbt, cbt->ref, &leaf_found));
1278 		/*
1279 		 * Only use the pinned page search results if search returns an
1280 		 * exact match or a slot other than the page's boundary slots,
1281 		 * if that's not the case, a neighbor page might offer a better
1282 		 * match. This test is simplistic as we're ignoring append
1283 		 * lists (there may be no page slots or we might be
1284 		 * legitimately positioned after the last page slot). Ignore
1285 		 * those cases, it makes things too complicated.
1286 		 */
1287 	if (leaf_found && (cbt->compare == 0
1288 	    || (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1)))
1289 		goto update_local;
1290     }
1291 
1292 retry:
1293 	WT_ERR(__cursor_func_init(cbt, true));
1294 	WT_ERR(btree->type == BTREE_ROW ?
1295 	    __cursor_row_search(cbt, true, NULL, NULL) :
1296 	    __cursor_col_search(cbt, NULL, NULL));
1297 update_local:
1298 	if (btree->type == BTREE_ROW) {
1299 		/*
1300 		 * If not overwriting, check for conflicts and fail if the key
1301 		 * does not exist.
1302 		 */
1303 		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
1304 			WT_ERR(__curfile_update_check(cbt));
1305 			if (cbt->compare != 0)
1306 				WT_ERR(WT_NOTFOUND);
1307 			WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1308 			if (!valid)
1309 				WT_ERR(WT_NOTFOUND);
1310 		}
1311 		ret = __cursor_row_modify_v(cbt, value, modify_type);
1312 	} else {
1313 		/*
1314 		 * If not overwriting, fail if the key doesn't exist.  If we
1315 		 * find an update for the key, check for conflicts.  Update the
1316 		 * record if it exists.  Creating a record past the end of the
1317 		 * tree in a fixed-length column-store implicitly fills the gap
1318 		 * with empty records.  Update the record in that case, the
1319 		 * record exists.
1320 		 */
1321 		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
1322 			WT_ERR(__curfile_update_check(cbt));
1323 			valid = false;
1324 			if (cbt->compare == 0)
1325 				WT_ERR(__wt_cursor_valid(cbt, NULL, &valid));
1326 			if ((cbt->compare != 0 || !valid) &&
1327 			    !__cursor_fix_implicit(btree, cbt))
1328 				WT_ERR(WT_NOTFOUND);
1329 		}
1330 		ret = __cursor_col_modify_v(cbt, value, modify_type);
1331 	}
1332 
1333 err:	if (ret == WT_RESTART) {
1334 		__cursor_restart(session, &yield_count, &sleep_usecs);
1335 		goto retry;
1336 	}
1337 
1338 	/*
1339 	 * If successful, point the cursor at internal copies of the data.  We
1340 	 * could shuffle memory in the cursor so the key/value pair are in local
1341 	 * buffer memory, but that's a data copy.  We don't want to do another
1342 	 * search (and we might get a different update structure if we race).
1343 	 * To make this work, we add a field to the btree cursor to pass back a
1344 	 * pointer to the modify function's allocated update structure.
1345 	 */
1346 	if (ret == 0) {
1347 done:		switch (modify_type) {
1348 		case WT_UPDATE_STANDARD:
1349 			/*
1350 			 * WT_CURSOR.update returns a key and a value.
1351 			 */
1352 			ret = __cursor_kv_return(cbt, cbt->modify_update);
1353 			break;
1354 		case WT_UPDATE_RESERVE:
1355 			/*
1356 			 * WT_CURSOR.reserve doesn't return any value.
1357 			 */
1358 			F_CLR(cursor, WT_CURSTD_VALUE_SET);
1359 			/* FALLTHROUGH */
1360 		case WT_UPDATE_MODIFY:
1361 			/*
1362 			 * WT_CURSOR.modify has already created the return value
1363 			 * and our job is to leave it untouched.
1364 			 */
1365 			ret = __wt_key_return(cbt);
1366 			break;
1367 		case WT_UPDATE_BIRTHMARK:
1368 		case WT_UPDATE_TOMBSTONE:
1369 		WT_ILLEGAL_VALUE(session, modify_type);
1370 		}
1371 	}
1372 
1373 	if (ret != 0) {
1374 		WT_TRET(__cursor_reset(cbt));
1375 		__cursor_state_restore(cursor, &state);
1376 	}
1377 
1378 	return (ret);
1379 }
1380 
1381 /*
1382  * __cursor_chain_exceeded --
1383  *	Return if the update chain has exceeded the limit.
1384  */
1385 static bool
__cursor_chain_exceeded(WT_CURSOR_BTREE * cbt)1386 __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
1387 {
1388 	WT_CURSOR *cursor;
1389 	WT_PAGE *page;
1390 	WT_SESSION_IMPL *session;
1391 	WT_UPDATE *upd;
1392 	size_t upd_size;
1393 	int i;
1394 
1395 	cursor = &cbt->iface;
1396 	page = cbt->ref->page;
1397 	session = (WT_SESSION_IMPL *)cursor->session;
1398 
1399 	upd = NULL;
1400 	if (cbt->ins != NULL)
1401 		upd = cbt->ins->upd;
1402 	else if (cbt->btree->type == BTREE_ROW &&
1403 	    page->modify != NULL && page->modify->mod_row_update != NULL)
1404 		upd = page->modify->mod_row_update[cbt->slot];
1405 
1406 	/*
1407 	 * Step through the modify operations at the beginning of the chain.
1408 	 *
1409 	 * Deleted or standard updates are anticipated to be sufficient to base
1410 	 * the modify (although that's not guaranteed: they may not be visible
1411 	 * or might abort before we read them).  Also, this is not a hard
1412 	 * limit, threads can race modifying updates.
1413 	 *
1414 	 * If the total size in bytes of the updates exceeds some factor of the
1415 	 * underlying value size (which we know because the cursor is
1416 	 * positioned), create a new full copy of the value.  This limits the
1417 	 * cache pressure from creating full copies to that factor: with the
1418 	 * default factor of 1, the total size in memory of a set of modify
1419 	 * updates is limited to double the size of the modifies.
1420 	 *
1421 	 * Otherwise, limit the length of the update chain to a fixed size to
1422 	 * bound the cost of rebuilding the value during reads.  When history
1423 	 * has to be maintained, creating extra copies of large documents
1424 	 * multiplies cache pressure because the old ones cannot be freed, so
1425 	 * allow the modify chain to grow.
1426 	 */
1427 	for (i = 0, upd_size = 0;
1428 	    upd != NULL && upd->type == WT_UPDATE_MODIFY;
1429 	    ++i, upd = upd->next) {
1430 		upd_size += WT_UPDATE_MEMSIZE(upd);
1431 		if (i >= WT_MAX_MODIFY_UPDATE &&
1432 		    upd_size * WT_MODIFY_MEM_FRACTION >= cursor->value.size)
1433 			return (true);
1434 	}
1435 	if (i >= WT_MAX_MODIFY_UPDATE && upd != NULL &&
1436 	    upd->type == WT_UPDATE_STANDARD &&
1437 	    __wt_txn_upd_visible_all(session, upd))
1438 		return (true);
1439 	return (false);
1440 }
1441 
1442 /*
1443  * __wt_btcur_modify --
1444  *     Modify a record in the tree.
1445  */
1446 int
__wt_btcur_modify(WT_CURSOR_BTREE * cbt,WT_MODIFY * entries,int nentries)1447 __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
1448 {
1449 	WT_CURFILE_STATE state;
1450 	WT_CURSOR *cursor;
1451 	WT_DECL_ITEM(modify);
1452 	WT_DECL_RET;
1453 	WT_SESSION_IMPL *session;
1454 	size_t orig, new;
1455 	bool overwrite;
1456 
1457 	cursor = &cbt->iface;
1458 	session = (WT_SESSION_IMPL *)cursor->session;
1459 
1460 	WT_STAT_CONN_INCR(session, cursor_modify);
1461 	WT_STAT_DATA_INCR(session, cursor_modify);
1462 
1463 	/* Save the cursor state. */
1464 	__cursor_state_save(cursor, &state);
1465 
1466 	/*
1467 	 * Get the current value and apply the modification to it, for a few
1468 	 * reasons: first, we set the updated value so the application can
1469 	 * retrieve the cursor's value; second, we use the updated value as
1470 	 * the update if the update chain is too long; third, there's a check
1471 	 * if the updated value is too large to store; fourth, to simplify the
1472 	 * count of bytes being added/removed; fifth, we can get into serious
1473 	 * trouble if we attempt to modify a value that doesn't exist. For the
1474 	 * fifth reason, verify we're not in a read-uncommitted transaction,
1475 	 * that implies a value that might disappear out from under us.
1476 	 *
1477 	 * Also, an application might read a value outside of a transaction and
1478 	 * then call modify. For that to work, the read must be part of the
1479 	 * transaction that performs the update for correctness, otherwise we
1480 	 * could race with another thread and end up modifying the wrong value.
1481 	 * A clever application could get this right (imagine threads that only
1482 	 * updated non-overlapping, fixed-length byte strings), but it's unsafe
1483 	 * because it will work most of the time and the failure is unlikely to
1484 	 * be detected. Require explicit transactions for modify operations.
1485 	 */
1486 	if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
1487 		WT_ERR_MSG(session, ENOTSUP,
1488 		    "not supported in read-uncommitted transactions");
1489 	if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT))
1490 		WT_ERR_MSG(session, ENOTSUP,
1491 		    "not supported in implicit transactions");
1492 
1493 	if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) ||
1494 	    !F_ISSET(cursor, WT_CURSTD_VALUE_INT))
1495 		WT_ERR(__wt_btcur_search(cbt));
1496 	orig = cursor->value.size;
1497 	WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
1498 	new = cursor->value.size;
1499 	WT_ERR(__cursor_size_chk(session, &cursor->value));
1500 	if (new > orig)
1501 		WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig);
1502 	else
1503 		WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new);
1504 
1505 	/*
1506 	 * WT_CURSOR.modify is update-without-overwrite.
1507 	 *
1508 	 * Use the modify buffer as the update if the data package saves us some
1509 	 * memory and the update chain is under the limit, else use the complete
1510 	 * value.
1511 	 */
1512 	overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
1513 	F_CLR(cursor, WT_CURSTD_OVERWRITE);
1514 	if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt))
1515 		ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
1516 	else if ((ret =
1517 	    __wt_modify_pack(session, &modify, entries, nentries)) == 0)
1518 		ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY);
1519 	if (overwrite)
1520 	       F_SET(cursor, WT_CURSTD_OVERWRITE);
1521 
1522 	/*
1523 	 * We have our own cursor state restoration because we've modified the
1524 	 * cursor before calling the underlying cursor update function and we
1525 	 * need to restore it to its original state. This means multiple calls
1526 	 * to reset the cursor, but that shouldn't be a problem.
1527 	 */
1528 	if (ret != 0) {
1529 err:		WT_TRET(__cursor_reset(cbt));
1530 		__cursor_state_restore(cursor, &state);
1531 	}
1532 
1533 	__wt_scr_free(session, &modify);
1534 	return (ret);
1535 }
1536 
1537 /*
1538  * __wt_btcur_reserve --
1539  *     Reserve a record in the tree.
1540  */
1541 int
__wt_btcur_reserve(WT_CURSOR_BTREE * cbt)1542 __wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
1543 {
1544 	WT_CURSOR *cursor;
1545 	WT_DECL_RET;
1546 	WT_SESSION_IMPL *session;
1547 	bool overwrite;
1548 
1549 	cursor = &cbt->iface;
1550 	session = (WT_SESSION_IMPL *)cursor->session;
1551 
1552 	WT_STAT_CONN_INCR(session, cursor_reserve);
1553 	WT_STAT_DATA_INCR(session, cursor_reserve);
1554 
1555 	/* WT_CURSOR.reserve is update-without-overwrite and a special value. */
1556 	overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
1557 	F_CLR(cursor, WT_CURSTD_OVERWRITE);
1558 	ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVE);
1559 	if (overwrite)
1560 	       F_SET(cursor, WT_CURSTD_OVERWRITE);
1561 	return (ret);
1562 }
1563 
1564 /*
1565  * __wt_btcur_update --
1566  *     Update a record in the tree.
1567  */
1568 int
__wt_btcur_update(WT_CURSOR_BTREE * cbt)1569 __wt_btcur_update(WT_CURSOR_BTREE *cbt)
1570 {
1571 	WT_BTREE *btree;
1572 	WT_CURSOR *cursor;
1573 	WT_SESSION_IMPL *session;
1574 
1575 	btree = cbt->btree;
1576 	cursor = &cbt->iface;
1577 	session = (WT_SESSION_IMPL *)cursor->session;
1578 
1579 	WT_STAT_CONN_INCR(session, cursor_update);
1580 	WT_STAT_DATA_INCR(session, cursor_update);
1581 	WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);
1582 
1583 	if (btree->type == BTREE_ROW)
1584 		WT_RET(__cursor_size_chk(session, &cursor->key));
1585 	WT_RET(__cursor_size_chk(session, &cursor->value));
1586 
1587 	return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD));
1588 }
1589 
1590 /*
1591  * __wt_btcur_compare --
1592  *	Return a comparison between two cursors.
1593  */
1594 int
__wt_btcur_compare(WT_CURSOR_BTREE * a_arg,WT_CURSOR_BTREE * b_arg,int * cmpp)1595 __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
1596 {
1597 	WT_CURSOR *a, *b;
1598 	WT_SESSION_IMPL *session;
1599 
1600 	a = (WT_CURSOR *)a_arg;
1601 	b = (WT_CURSOR *)b_arg;
1602 	session = (WT_SESSION_IMPL *)a->session;
1603 
1604 	/* Confirm both cursors reference the same object. */
1605 	if (a_arg->btree != b_arg->btree)
1606 		WT_RET_MSG(
1607 		    session, EINVAL, "Cursors must reference the same object");
1608 
1609 	switch (a_arg->btree->type) {
1610 	case BTREE_COL_FIX:
1611 	case BTREE_COL_VAR:
1612 		/*
1613 		 * Compare the interface's cursor record, not the underlying
1614 		 * cursor reference: the interface's cursor reference is the
1615 		 * one being returned to the application.
1616 		 */
1617 		if (a->recno < b->recno)
1618 			*cmpp = -1;
1619 		else if (a->recno == b->recno)
1620 			*cmpp = 0;
1621 		else
1622 			*cmpp = 1;
1623 		break;
1624 	case BTREE_ROW:
1625 		WT_RET(__wt_compare(
1626 		    session, a_arg->btree->collator, &a->key, &b->key, cmpp));
1627 		break;
1628 	}
1629 	return (0);
1630 }
1631 
1632 /*
1633  * __cursor_equals --
1634  *	Return if two cursors reference the same row.
1635  */
1636 static inline bool
__cursor_equals(WT_CURSOR_BTREE * a,WT_CURSOR_BTREE * b)1637 __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
1638 {
1639 	switch (a->btree->type) {
1640 	case BTREE_COL_FIX:
1641 	case BTREE_COL_VAR:
1642 		/*
1643 		 * Compare the interface's cursor record, not the underlying
1644 		 * cursor reference: the interface's cursor reference is the
1645 		 * one being returned to the application.
1646 		 */
1647 		if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
1648 			return (true);
1649 		break;
1650 	case BTREE_ROW:
1651 		if (a->ref != b->ref)
1652 			return (false);
1653 		if (a->ins != NULL || b->ins != NULL) {
1654 			if (a->ins == b->ins)
1655 				return (true);
1656 			break;
1657 		}
1658 		if (a->slot == b->slot)
1659 			return (true);
1660 		break;
1661 	}
1662 	return (false);
1663 }
1664 
1665 /*
1666  * __wt_btcur_equals --
1667  *	Return an equality comparison between two cursors.
1668  */
1669 int
__wt_btcur_equals(WT_CURSOR_BTREE * a_arg,WT_CURSOR_BTREE * b_arg,int * equalp)1670 __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp)
1671 {
1672 	WT_CURSOR *a, *b;
1673 	WT_SESSION_IMPL *session;
1674 	int cmp;
1675 
1676 	a = (WT_CURSOR *)a_arg;
1677 	b = (WT_CURSOR *)b_arg;
1678 	cmp = 0;
1679 	session = (WT_SESSION_IMPL *)a->session;
1680 
1681 	/* Confirm both cursors reference the same object. */
1682 	if (a_arg->btree != b_arg->btree)
1683 		WT_RET_MSG(
1684 		    session, EINVAL, "Cursors must reference the same object");
1685 
1686 	/*
1687 	 * The reason for an equals method is because we can avoid doing
1688 	 * a full key comparison in some cases. If both cursors point into the
1689 	 * tree, take the fast path, otherwise fall back to the slower compare
1690 	 * method; in both cases, return 1 if the cursors are equal, 0 if they
1691 	 * are not.
1692 	 */
1693 	if (F_ISSET(a, WT_CURSTD_KEY_INT) && F_ISSET(b, WT_CURSTD_KEY_INT))
1694 		*equalp = __cursor_equals(a_arg, b_arg);
1695 	else {
1696 		WT_RET(__wt_btcur_compare(a_arg, b_arg, &cmp));
1697 		*equalp = (cmp == 0) ? 1 : 0;
1698 	}
1699 	return (0);
1700 }
1701 
1702 /*
1703  * __cursor_truncate --
1704  *	Discard a cursor range from row-store or variable-width column-store
1705  * tree.
1706  */
1707 static int
__cursor_truncate(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop,int (* rmfunc)(WT_CURSOR_BTREE *,u_int))1708 __cursor_truncate(
1709     WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
1710     int (*rmfunc)(WT_CURSOR_BTREE *, u_int))
1711 {
1712 	WT_DECL_RET;
1713 	WT_SESSION_IMPL *session;
1714 	uint64_t yield_count, sleep_usecs;
1715 
1716 	session = (WT_SESSION_IMPL *)start->iface.session;
1717 	yield_count = sleep_usecs = 0;
1718 
1719 	/*
1720 	 * First, call the cursor search method to re-position the cursor: we
1721 	 * may not have a cursor position (if the higher-level truncate code
1722 	 * switched the cursors to have an "external" cursor key, and because
1723 	 * we don't save a copy of the page's write generation information,
1724 	 * which we need to remove records).
1725 	 *
1726 	 * Once that's done, we can delete records without a full search, unless
1727 	 * we encounter a restart error because the page was modified by some
1728 	 * other thread of control; in that case, repeat the full search to
1729 	 * refresh the page's modification information.
1730 	 *
1731 	 * If this is a row-store, we delete leaf pages having no overflow items
1732 	 * without reading them; for that to work, we have to ensure we read the
1733 	 * page referenced by the ending cursor, since we may be deleting only a
1734 	 * partial page at the end of the truncation.  Our caller already fully
1735 	 * instantiated the end cursor, so we know that page is pinned in memory
1736 	 * and we can proceed without concern.
1737 	 */
1738 retry:
1739 	WT_ERR(__wt_btcur_search(start));
1740 	WT_ASSERT(session,
1741 	    F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
1742 
1743 	for (;;) {
1744 		WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE));
1745 
1746 		if (stop != NULL && __cursor_equals(start, stop))
1747 			return (0);
1748 
1749 		WT_ERR(__wt_btcur_next(start, true));
1750 
1751 		start->compare = 0;		/* Exact match */
1752 	}
1753 
1754 err:	if (ret == WT_RESTART) {
1755 		__cursor_restart(session, &yield_count, &sleep_usecs);
1756 		goto retry;
1757 	}
1758 
1759 	WT_RET_NOTFOUND_OK(ret);
1760 	return (0);
1761 }
1762 
1763 /*
1764  * __cursor_truncate_fix --
1765  *	Discard a cursor range from fixed-width column-store tree.
1766  */
1767 static int
__cursor_truncate_fix(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop,int (* rmfunc)(WT_CURSOR_BTREE *,u_int))1768 __cursor_truncate_fix(
1769     WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
1770     int (*rmfunc)(WT_CURSOR_BTREE *, u_int))
1771 {
1772 	WT_DECL_RET;
1773 	WT_SESSION_IMPL *session;
1774 	uint64_t yield_count, sleep_usecs;
1775 	const uint8_t *value;
1776 
1777 	session = (WT_SESSION_IMPL *)start->iface.session;
1778 	yield_count = sleep_usecs = 0;
1779 
1780 	/*
1781 	 * Handle fixed-length column-store objects separately: for row-store
1782 	 * and variable-length column-store objects we have "deleted" values
1783 	 * and so returned objects actually exist: fixed-length column-store
1784 	 * objects are filled-in if they don't exist, that is, if you create
1785 	 * record 37, records 1-36 magically appear.  Those records can't be
1786 	 * deleted, which means we have to ignore already "deleted" records.
1787 	 *
1788 	 * First, call the cursor search method to re-position the cursor: we
1789 	 * may not have a cursor position (if the higher-level truncate code
1790 	 * switched the cursors to have an "external" cursor key, and because
1791 	 * we don't save a copy of the page's write generation information,
1792 	 * which we need to remove records).
1793 	 *
1794 	 * Once that's done, we can delete records without a full search, unless
1795 	 * we encounter a restart error because the page was modified by some
1796 	 * other thread of control; in that case, repeat the full search to
1797 	 * refresh the page's modification information.
1798 	 */
1799 retry:	WT_ERR(__wt_btcur_search(start));
1800 	WT_ASSERT(session,
1801 	    F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
1802 
1803 	for (;;) {
1804 		value = (const uint8_t *)start->iface.value.data;
1805 		if (*value != 0)
1806 			WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE));
1807 
1808 		if (stop != NULL && __cursor_equals(start, stop))
1809 			return (0);
1810 
1811 		WT_ERR(__wt_btcur_next(start, true));
1812 
1813 		start->compare = 0;	/* Exact match */
1814 	}
1815 
1816 err:	if (ret == WT_RESTART) {
1817 		__cursor_restart(session, &yield_count, &sleep_usecs);
1818 		goto retry;
1819 	}
1820 
1821 	WT_RET_NOTFOUND_OK(ret);
1822 	return (0);
1823 }
1824 
1825 /*
1826  * __wt_btcur_range_truncate --
1827  *	Discard a cursor range from the tree.
1828  */
1829 int
__wt_btcur_range_truncate(WT_CURSOR_BTREE * start,WT_CURSOR_BTREE * stop)1830 __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
1831 {
1832 	WT_BTREE *btree;
1833 	WT_DECL_RET;
1834 	WT_SESSION_IMPL *session;
1835 
1836 	session = (WT_SESSION_IMPL *)start->iface.session;
1837 	btree = start->btree;
1838 	WT_STAT_DATA_INCR(session, cursor_truncate);
1839 
1840 	WT_RET(__wt_txn_autocommit_check(session));
1841 	/*
1842 	 * For recovery, log the start and stop keys for a truncate operation,
1843 	 * not the individual records removed.  On the other hand, for rollback
1844 	 * we need to keep track of all the in-memory operations.
1845 	 *
1846 	 * We deal with this here by logging the truncate range first, then (in
1847 	 * the logging code) disabling writing of the in-memory remove records
1848 	 * to disk.
1849 	 */
1850 	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
1851 		WT_RET(__wt_txn_truncate_log(session, start, stop));
1852 
1853 	switch (btree->type) {
1854 	case BTREE_COL_FIX:
1855 		WT_ERR(__cursor_truncate_fix(start, stop, __cursor_col_modify));
1856 		break;
1857 	case BTREE_COL_VAR:
1858 		WT_ERR(__cursor_truncate(start, stop, __cursor_col_modify));
1859 		break;
1860 	case BTREE_ROW:
1861 		/*
1862 		 * The underlying cursor comparison routine requires cursors be
1863 		 * fully instantiated when truncating row-store objects because
1864 		 * it's comparing page and/or skiplist positions, not keys. (Key
1865 		 * comparison would work, it's only that a key comparison would
1866 		 * be relatively expensive, especially with custom collators.
1867 		 * Column-store objects have record number keys, so the key
1868 		 * comparison is cheap.)  The session truncate code did cursor
1869 		 * searches when setting up the truncate so we're good to go: if
1870 		 * that ever changes, we'd need to do something here to ensure a
1871 		 * fully instantiated cursor.
1872 		 */
1873 		WT_ERR(__cursor_truncate(start, stop, __cursor_row_modify));
1874 		break;
1875 	}
1876 
1877 err:	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
1878 		__wt_txn_truncate_end(session);
1879 	return (ret);
1880 }
1881 
1882 /*
1883  * __wt_btcur_init --
1884  *	Initialize a cursor used for internal purposes.
1885  */
1886 void
__wt_btcur_init(WT_SESSION_IMPL * session,WT_CURSOR_BTREE * cbt)1887 __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
1888 {
1889 	memset(cbt, 0, sizeof(WT_CURSOR_BTREE));
1890 
1891 	cbt->iface.session = &session->iface;
1892 	cbt->btree = S2BT(session);
1893 }
1894 
1895 /*
1896  * __wt_btcur_open --
1897  *	Open a btree cursor.
1898  */
1899 void
__wt_btcur_open(WT_CURSOR_BTREE * cbt)1900 __wt_btcur_open(WT_CURSOR_BTREE *cbt)
1901 {
1902 	cbt->row_key = &cbt->_row_key;
1903 	cbt->tmp = &cbt->_tmp;
1904 
1905 #ifdef HAVE_DIAGNOSTIC
1906 	cbt->lastkey = &cbt->_lastkey;
1907 	cbt->lastrecno = WT_RECNO_OOB;
1908 #endif
1909 }
1910 
1911 /*
1912  * __wt_btcur_close --
1913  *	Close a btree cursor.
1914  */
1915 int
__wt_btcur_close(WT_CURSOR_BTREE * cbt,bool lowlevel)1916 __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
1917 {
1918 	WT_DECL_RET;
1919 	WT_SESSION_IMPL *session;
1920 
1921 	session = (WT_SESSION_IMPL *)cbt->iface.session;
1922 
1923 	/*
1924 	 * The in-memory split and lookaside table code creates low-level btree
1925 	 * cursors to search/modify leaf pages. Those cursors don't hold hazard
1926 	 * pointers, nor are they counted in the session handle's cursor count.
1927 	 * Skip the usual cursor tear-down in that case.
1928 	 */
1929 	if (!lowlevel)
1930 		ret = __cursor_reset(cbt);
1931 
1932 	__wt_buf_free(session, &cbt->_row_key);
1933 	__wt_buf_free(session, &cbt->_tmp);
1934 #ifdef HAVE_DIAGNOSTIC
1935 	__wt_buf_free(session, &cbt->_lastkey);
1936 #endif
1937 
1938 	return (ret);
1939 }
1940