1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * Walking backwards through skip lists.
13  *
14  * The skip list stack is an array of pointers set up by a search.  It points
15  * to the position a node should go in the skip list.  In other words, the skip
16  * list search stack always points *after* the search item (that is, into the
17  * search item's next array).
18  *
19  * Helper macros to go from a stack pointer at level i, pointing into a next
20  * array, back to the insert node containing that next array.
21  */
22 #undef	PREV_ITEM
23 #define	PREV_ITEM(ins_head, insp, i)					\
24 	(((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL :	\
25 	    (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))
26 
27 #undef	PREV_INS
28 #define	PREV_INS(cbt, i)						\
29 	PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))
30 
31 /*
32  * __cursor_skip_prev --
33  *	Move back one position in a skip list stack (aka "finger").
34  */
/*
 * __cursor_skip_prev --
 *	Move back one position in a skip list stack (aka "finger").
 *
 *	Repositions the cursor's search stack (cbt->ins_stack/next_stack) one
 *	node earlier in the skip list and updates cbt->ins to the previous
 *	node (NULL at the head of the list).  Because inserts can race with
 *	this walk, the function restarts from scratch whenever the list is
 *	observed to have changed underneath it.  Returns 0 or an error code.
 */
static inline int
__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
{
	WT_INSERT *current, *ins;
	WT_ITEM key;
	WT_SESSION_IMPL *session;
	int i;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

restart:
	/*
	 * If the search stack does not point at the current item, fill it in
	 * with a search.
	 */
	while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
		/* Row stores search by key, column stores by record number. */
		if (cbt->btree->type == BTREE_ROW) {
			key.data = WT_INSERT_KEY(current);
			key.size = WT_INSERT_KEY_SIZE(current);
			WT_RET(__wt_search_insert(
			    session, cbt, cbt->ins_head, &key));
		} else
			cbt->ins = __col_insert_search(cbt->ins_head,
			    cbt->ins_stack, cbt->next_stack,
			    WT_INSERT_RECNO(current));
	}

	/*
	 * Find the first node up the search stack that does not move.
	 *
	 * The depth of the current item must be at least this level, since we
	 * see it in that many levels of the stack.
	 *
	 * !!! Watch these loops carefully: they all rely on the value of i,
	 * and the exit conditions to end up with the right values are
	 * non-trivial.
	 */
	ins = NULL;			/* -Wconditional-uninitialized */
	for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
		if ((ins = PREV_INS(cbt, i + 1)) != current)
			break;

	/*
	 * Find a starting point for the new search.  That is either at the
	 * non-moving node if we found a valid node, or the beginning of the
	 * next list down that is not the current node.
	 *
	 * Since it is the beginning of a list, and we know the current node is
	 * has a skip depth at least this high, any node we find must sort
	 * before the current node.
	 */
	if (ins == NULL || ins == current)
		for (; i >= 0; i--) {
			cbt->ins_stack[i] = NULL;
			cbt->next_stack[i] = NULL;
			ins = cbt->ins_head->head[i];
			if (ins != NULL && ins != current)
				break;
		}

	/* Walk any remaining levels until just before the current node. */
	while (i >= 0) {
		/*
		 * If we get to the end of a list without finding the current
		 * item, we must have raced with an insert.  Restart the search.
		 */
		if (ins == NULL) {
			cbt->ins_stack[0] = NULL;
			cbt->next_stack[0] = NULL;
			goto restart;
		}
		if (ins->next[i] != current)		/* Stay at this level */
			ins = ins->next[i];
		else {					/* Drop down a level */
			cbt->ins_stack[i] = &ins->next[i];
			cbt->next_stack[i] = ins->next[i];
			--i;
		}
	}

	/* If we found a previous node, the next one must be current. */
	if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
		goto restart;

	cbt->ins = PREV_INS(cbt, 0);
	return (0);
}
122 
123 /*
124  * __cursor_fix_append_prev --
125  *	Return the previous fixed-length entry on the append list.
126  */
/*
 * __cursor_fix_append_prev --
 *	Return the previous fixed-length entry on the append list.
 *
 *	newpage is true when positioning on this append list for the first
 *	time, false when continuing a previous iteration.  Returns 0 with the
 *	cursor's value set, WT_NOTFOUND when the walk moves off the front of
 *	the list, or an error code.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	if (newpage) {
		/* Start from the last entry on the append list. */
		if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
			return (WT_NOTFOUND);
	} else {
		/* Move to the previous record in the append list, if any. */
		if (cbt->ins != NULL &&
		    cbt->recno <= WT_INSERT_RECNO(cbt->ins))
			WT_RET(__cursor_skip_prev(cbt));

		/*
		 * Handle the special case of leading implicit records, that is,
		 * there aren't any records in the page not on the append list,
		 * and the append list's first record isn't the first record on
		 * the page. (Although implemented as a test of the page values,
		 * this is really a test for a tree where the first inserted
		 * record wasn't record 1, any other page with only an append
		 * list will have a first page record number matching the first
		 * record in the append list.)
		 *
		 * The "right" place to handle this is probably in our caller.
		 * The high-level cursor-previous routine would:
		 *    -- call this routine to walk the append list
		 *    -- call the routine to walk the standard page items
		 *    -- call the tree walk routine looking for a previous page
		 * Each of them returns WT_NOTFOUND, at which point our caller
		 * checks the cursor record number, and if it's larger than 1,
		 * returns the implicit records.  Instead, I'm trying to detect
		 * the case here, mostly because I don't want to put that code
		 * into our caller.  Anyway, if this code breaks for any reason,
		 * that's the way I'd go.
		 *
		 * If we're not pointing to a WT_INSERT entry (we didn't find a
		 * WT_INSERT record preceding our record name-space), check if
		 * we've reached the beginning of this page, a possibility if a
		 * page had a large number of items appended, and then split.
		 * If not, check if there are any records on the page. If there
		 * aren't, then we're in the magic zone, keep going until we get
		 * to a record number matching the first record on the page.
		 */
		if (cbt->ins == NULL &&
		    (cbt->recno == cbt->ref->ref_recno ||
		    __col_fix_last_recno(cbt->ref) != 0))
			return (WT_NOTFOUND);
	}

	/*
	 * This code looks different from the cursor-next code. The append list
	 * may be preceded by other rows. If we're iterating through the tree,
	 * starting at the last record in the tree, by definition we're starting
	 * a new iteration and we set the record number to the last record found
	 * on the page. Otherwise, decrement the record.
	 */
	if (newpage)
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
	else
		__cursor_set_recno(cbt, cbt->recno - 1);

	/*
	 * Fixed-width column store appends are inherently non-transactional.
	 * Even a non-visible update by a concurrent or aborted transaction
	 * changes the effective end of the data.  The effect is subtle because
	 * of the blurring between deleted and empty values, but ideally we
	 * would skip all uncommitted changes at the end of the data.  This
	 * doesn't apply to variable-width column stores because the implicitly
	 * created records written by reconciliation are deleted and so can be
	 * never seen by a read.
	 */
	if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins)) {
		/* Implicit record: return a zero byte. */
		cbt->v = 0;
		cbt->iface.value.data = &cbt->v;
	} else {
		upd = NULL;
		WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
		if (upd == NULL) {
			/* No visible update: also a zero byte. */
			cbt->v = 0;
			cbt->iface.value.data = &cbt->v;
		} else
			cbt->iface.value.data = upd->data;
	}
	/* Fixed-length column-store values are always a single byte. */
	cbt->iface.value.size = 1;
	return (0);
}
217 
218 /*
219  * __cursor_fix_prev --
220  *	Move to the previous, fixed-length column-store item.
221  */
222 static inline int
__cursor_fix_prev(WT_CURSOR_BTREE * cbt,bool newpage)223 __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
224 {
225 	WT_BTREE *btree;
226 	WT_PAGE *page;
227 	WT_SESSION_IMPL *session;
228 	WT_UPDATE *upd;
229 
230 	session = (WT_SESSION_IMPL *)cbt->iface.session;
231 	page = cbt->ref->page;
232 	btree = S2BT(session);
233 
234 	/* Initialize for each new page. */
235 	if (newpage) {
236 		cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
237 		if (cbt->last_standard_recno == 0)
238 			return (WT_NOTFOUND);
239 		__cursor_set_recno(cbt, cbt->last_standard_recno);
240 		goto new_page;
241 	}
242 
243 	/* Move to the previous entry and return the item. */
244 	if (cbt->recno == cbt->ref->ref_recno)
245 		return (WT_NOTFOUND);
246 	__cursor_set_recno(cbt, cbt->recno - 1);
247 
248 new_page:
249 	/* Check any insert list for a matching record. */
250 	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
251 	cbt->ins = __col_insert_search(
252 	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
253 	if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
254 		cbt->ins = NULL;
255 	upd = NULL;
256 	if (cbt->ins != NULL)
257 		WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
258 	if (upd == NULL) {
259 		cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
260 		cbt->iface.value.data = &cbt->v;
261 	} else
262 		cbt->iface.value.data = upd->data;
263 	cbt->iface.value.size = 1;
264 	return (0);
265 }
266 
267 /*
268  * __cursor_var_append_prev --
269  *	Return the previous variable-length entry on the append list.
270  */
271 static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE * cbt,bool newpage)272 __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
273 {
274 	WT_SESSION_IMPL *session;
275 	WT_UPDATE *upd;
276 
277 	session = (WT_SESSION_IMPL *)cbt->iface.session;
278 
279 	if (newpage) {
280 		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
281 		goto new_page;
282 	}
283 
284 	for (;;) {
285 		WT_RET(__cursor_skip_prev(cbt));
286 new_page:	if (cbt->ins == NULL)
287 			return (WT_NOTFOUND);
288 
289 		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
290 		WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
291 		if (upd == NULL)
292 			continue;
293 		if (upd->type == WT_UPDATE_TOMBSTONE) {
294 			if (upd->txnid != WT_TXN_NONE &&
295 			    __wt_txn_upd_visible_all(session, upd))
296 				++cbt->page_deleted_count;
297 			continue;
298 		}
299 		return (__wt_value_return(cbt, upd));
300 	}
301 	/* NOTREACHED */
302 }
303 
304 /*
305  * __cursor_var_prev --
306  *	Move to the previous, variable-length column-store item.
307  */
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 *
 *	newpage is true when positioning on the page for the first time,
 *	false when continuing an iteration.  Returns 0 with the cursor's
 *	key/value set, WT_NOTFOUND when walking off the front of the page,
 *	or an error code.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(cbt->ref);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		cbt->cip_saved = NULL;
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < cbt->ref->ref_recno)
			return (WT_NOTFOUND);

		/*
		 * Find the matching WT_COL slot.  __col_var_search also
		 * reports (via rle_start) the first record number of the
		 * repeat group containing cbt->recno.
		 */
		if ((cip =
		    __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = NULL;
		if (cbt->ins != NULL)
			WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd));
		if (upd != NULL) {
			if (upd->type == WT_UPDATE_TOMBSTONE) {
				/*
				 * Count globally visible deletions so the
				 * caller can consider the page for eviction.
				 */
				if (upd->txnid != WT_TXN_NONE &&
				    __wt_txn_upd_visible_all(session, upd))
					++cbt->page_deleted_count;
				continue;
			}
			return (__wt_value_return(cbt, upd));
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if (__wt_cell_rle(&unpack) == 1)
					continue;
				/*
				 * There can be huge gaps in the variable-length
				 * column-store name space appearing as deleted
				 * records. If more than one deleted record, do
				 * the work of finding the next record to return
				 * instead of looping through the records.
				 *
				 * First, find the largest record in the update
				 * list that's smaller than the current record.
				 */
				ins = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Second, for records with RLEs greater than 1,
				 * the above call to __col_var_search located
				 * this record in the page's list of repeating
				 * records, and returned the starting record.
				 * The starting record - 1 is the record to
				 * which we could skip, if there was no larger
				 * record in the update list.
				 */
				cbt->recno = rle_start - 1;
				if (ins != NULL &&
				    WT_INSERT_RECNO(ins) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop decrement. */
				++cbt->recno;
				continue;
			}
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));

			cbt->cip_saved = cip;
		}
		cbt->iface.value.data = cbt->tmp->data;
		cbt->iface.value.size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
420 
421 /*
422  * __cursor_row_prev --
423  *	Move to the previous row-store item.
424  */
/*
 * __cursor_row_prev --
 *	Move to the previous row-store item.
 *
 *	newpage is true when positioning on the page for the first time,
 *	false when continuing an iteration.  Returns 0 with the cursor's
 *	key/value set, WT_NOTFOUND when walking off the front of the page,
 *	or an error code.
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * Initialize for each new page.
	 */
	if (newpage) {
		/*
		 * If we haven't instantiated keys on this page, do so, else it
		 * is a very, very slow traversal.
		 */
		if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
			WT_RET(__wt_row_leaf_keys(session, page));

		if (page->entries == 0)
			cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		else
			cbt->ins_head =
			    WT_ROW_INSERT_SLOT(page, page->entries - 1);
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		cbt->row_iteration_slot = page->entries * 2 + 1;
		cbt->rip_saved = NULL;
		goto new_insert;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list.  Maintain the reference
		 * to the current insert element in case we switch to a cursor
		 * next movement.
		 */
		if (cbt->ins != NULL)
			WT_RET(__cursor_skip_prev(cbt));

new_insert:	if ((ins = cbt->ins) != NULL) {
			WT_RET(__wt_txn_read(session, ins->upd, &upd));
			if (upd == NULL)
				continue;
			if (upd->type == WT_UPDATE_TOMBSTONE) {
				/*
				 * Count globally visible deletions so the
				 * caller can consider the page for eviction.
				 */
				if (upd->txnid != WT_TXN_NONE &&
				    __wt_txn_upd_visible_all(session, upd))
					++cbt->page_deleted_count;
				continue;
			}
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			return (__wt_value_return(cbt, upd));
		}

		/* Check for the beginning of the page. */
		if (cbt->row_iteration_slot == 1)
			return (WT_NOTFOUND);
		--cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = cbt->row_iteration_slot == 1 ?
			    WT_ROW_INSERT_SMALLEST(page) :
			    WT_ROW_INSERT_SLOT(
				page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_LAST(cbt->ins_head);
			goto new_insert;
		}
		/* Moving to a WT_ROW slot: clear any insert-list position. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row[cbt->slot];
		WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd));
		if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) {
			if (upd->txnid != WT_TXN_NONE &&
			    __wt_txn_upd_visible_all(session, upd))
				++cbt->page_deleted_count;
			continue;
		}
		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
527 
528 /*
529  * __wt_btcur_prev --
530  *	Move to the previous record in the tree.
531  */
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 *
 *	If truncating is set, the tree walk is done with WT_READ_TRUNCATE.
 *	Returns 0 with the cursor's key/value set, WT_NOTFOUND at the start
 *	of the tree, or an error code; on WT_PREPARE_CONFLICT the cursor
 *	position is preserved so the caller can retry.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint32_t flags;
	bool newpage, valid;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_CONN_INCR(session, cursor_prev);
	WT_STAT_DATA_INCR(session, cursor_prev);

	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

	/*
	 * In case of retrying a prev operation due to a prepare conflict,
	 * cursor would have been already positioned at an update structure
	 * which resulted in conflict. So, now when retrying we should examine
	 * the same update again instead of starting from the next one in the
	 * update chain.
	 */
	F_CLR(cbt, WT_CBT_RETRY_NEXT);
	if (F_ISSET(cbt, WT_CBT_RETRY_PREV)) {
		WT_RET(__wt_cursor_valid(cbt, &upd, &valid));
		F_CLR(cbt, WT_CBT_RETRY_PREV);
		if (valid) {
			/*
			 * If the update, which returned prepared conflict is
			 * visible, return the value.
			 */
			return (__cursor_kv_return(cbt, upd));
		}
	}

	WT_RET(__cursor_func_init(cbt, false));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	flags =						/* tree walk flags */
	    WT_READ_NO_SPLIT | WT_READ_PREV | WT_READ_SKIP_INTL;
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);
	for (newpage = false;; newpage = true) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;

		/*
		 * Column-store pages may have appended entries. Handle it
		 * separately from the usual cursor code, it's in a simple
		 * format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session, page->type);
			}
			if (ret == 0)
				break;
			/* Append list exhausted; fall back to the page. */
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = true;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session, page->type);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(session, cbt->ref);
		cbt->page_deleted_count = 0;

		if (F_ISSET(cbt, WT_CBT_READ_ONCE))
			LF_SET(WT_READ_WONT_NEED);
		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}
#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
#endif
err:	switch (ret) {
	case 0:
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
		break;
	case WT_PREPARE_CONFLICT:
		/*
		 * If prepare conflict occurs, cursor should not be reset,
		 * as current cursor position will be reused in case of a
		 * retry from user.
		 */
		F_SET(cbt, WT_CBT_RETRY_PREV);
		break;
	default:
		WT_TRET(__cursor_reset(cbt));
	}
	return (ret);
}
675