1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * __check_leaf_key_range --
13  *	Check the search key is in the leaf page's key range.
14  */
15 static inline int
__check_leaf_key_range(WT_SESSION_IMPL * session,uint64_t recno,WT_REF * leaf,WT_CURSOR_BTREE * cbt)16 __check_leaf_key_range(WT_SESSION_IMPL *session,
17     uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
18 {
19 	WT_PAGE_INDEX *pindex;
20 	uint32_t indx;
21 
22 	/*
23 	 * There are reasons we can't do the fast checks, and we continue with
24 	 * the leaf page search in those cases, only skipping the complete leaf
25 	 * page search if we know it's not going to work.
26 	 */
27 	cbt->compare = 0;
28 
29 	/*
30 	 * Check if the search key is smaller than the parent's starting key for
31 	 * this page.
32 	 */
33 	if (recno < leaf->ref_recno) {
34 		cbt->compare = 1;		/* page keys > search key */
35 		return (0);
36 	}
37 
38 	/*
39 	 * Check if the search key is greater than or equal to the starting key
40 	 * for the parent's next page.
41 	 *
42 	 * !!!
43 	 * Check that "indx + 1" is a valid page-index entry first, because it
44 	 * also checks that "indx" is a valid page-index entry, and we have to
45 	 * do that latter check before looking at the indx slot of the array
46 	 * for a match to leaf (in other words, our page hint might be wrong).
47 	 */
48 	WT_INTL_INDEX_GET(session, leaf->home, pindex);
49 	indx = leaf->pindex_hint;
50 	if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
51 		if (recno >= pindex->index[indx + 1]->ref_recno) {
52 			cbt->compare = -1;	/* page keys < search key */
53 			return (0);
54 		}
55 
56 	return (0);
57 }
58 
59 /*
60  * __wt_col_search --
61  *	Search a column-store tree for a specific record-based key.
62  */
63 int
__wt_col_search(WT_CURSOR_BTREE * cbt,uint64_t search_recno,WT_REF * leaf,bool leaf_safe,bool * leaf_foundp)64 __wt_col_search(
65     WT_CURSOR_BTREE *cbt, uint64_t search_recno,
66     WT_REF *leaf, bool leaf_safe, bool *leaf_foundp)
67 {
68 	WT_BTREE *btree;
69 	WT_COL *cip;
70 	WT_DECL_RET;
71 	WT_INSERT *ins;
72 	WT_INSERT_HEAD *ins_head;
73 	WT_PAGE *page;
74 	WT_PAGE_INDEX *pindex, *parent_pindex;
75 	WT_REF *current, *descent;
76 	WT_SESSION_IMPL *session;
77 	uint64_t recno;
78 	uint32_t base, indx, limit, read_flags;
79 	int depth;
80 
81 	session = (WT_SESSION_IMPL *)cbt->iface.session;
82 	btree = S2BT(session);
83 	current = NULL;
84 
85 	__cursor_pos_clear(cbt);
86 
87 	/*
88 	 * When appending a new record, the search record number will be an
89 	 * out-of-band value, search for the largest key in the table instead.
90 	 */
91 	if ((recno = search_recno) == WT_RECNO_OOB)
92 		recno = UINT64_MAX;
93 
94 	/*
95 	 * We may be searching only a single leaf page, not the full tree. In
96 	 * the normal case where we are searching a tree, check the page's
97 	 * parent keys before doing the full search, it's faster when the
98 	 * cursor is being re-positioned.  Skip this if the page is being
99 	 * re-instantiated in memory. when the cursor is being re-positioned.
100 	 * Skip that check if we know the page is the right one
101 	 * (for example, when re-instantiating a page in memory, in that
102 	 * case we know the target must be on the current page).
103 	 */
104 	if (leaf != NULL) {
105 		WT_ASSERT(session, search_recno != WT_RECNO_OOB);
106 
107 		if (!leaf_safe) {
108 			WT_RET(__check_leaf_key_range(
109 			    session, recno, leaf, cbt));
110 			*leaf_foundp = cbt->compare == 0;
111 			if (!*leaf_foundp)
112 				return (0);
113 		}
114 
115 		current = leaf;
116 		goto leaf_only;
117 	}
118 
119 	if (0) {
120 restart:	/*
121 		 * Discard the currently held page and restart the search from
122 		 * the root.
123 		 */
124 		WT_RET(__wt_page_release(session, current, 0));
125 	}
126 
127 	/* Search the internal pages of the tree. */
128 	current = &btree->root;
129 	for (depth = 2, pindex = NULL;; ++depth) {
130 		parent_pindex = pindex;
131 		page = current->page;
132 		if (page->type != WT_PAGE_COL_INT)
133 			break;
134 
135 		WT_INTL_INDEX_GET(session, page, pindex);
136 		base = pindex->entries;
137 		descent = pindex->index[base - 1];
138 
139 		/* Fast path appends. */
140 		if (recno >= descent->ref_recno) {
141 			/*
142 			 * If on the last slot (the key is larger than any key
143 			 * on the page), check for an internal page split race.
144 			 */
145 			if (__wt_split_descent_race(
146 			    session, current, parent_pindex))
147 				goto restart;
148 
149 			goto descend;
150 		}
151 
152 		/* Binary search of internal pages. */
153 		for (base = 0,
154 		    limit = pindex->entries - 1; limit != 0; limit >>= 1) {
155 			indx = base + (limit >> 1);
156 			descent = pindex->index[indx];
157 
158 			if (recno == descent->ref_recno)
159 				break;
160 			if (recno < descent->ref_recno)
161 				continue;
162 			base = indx + 1;
163 			--limit;
164 		}
165 descend:	/*
166 		 * Reference the slot used for next step down the tree.
167 		 *
168 		 * Base is the smallest index greater than recno and may be the
169 		 * (last + 1) index.  The slot for descent is the one before
170 		 * base.
171 		 */
172 		if (recno != descent->ref_recno) {
173 			/*
174 			 * We don't have to correct for base == 0 because the
175 			 * only way for base to be 0 is if recno is the page's
176 			 * starting recno.
177 			 */
178 			WT_ASSERT(session, base > 0);
179 			descent = pindex->index[base - 1];
180 		}
181 
182 		/* Encourage races. */
183 		WT_DIAGNOSTIC_YIELD;
184 
185 		/*
186 		 * Swap the current page for the child page. If the page splits
187 		 * while we're retrieving it, restart the search at the root.
188 		 * We cannot restart in the "current" page; for example, if a
189 		 * thread is appending to the tree, the page it's waiting for
190 		 * did an insert-split into the parent, then the parent split
191 		 * into its parent, the name space we are searching for may have
192 		 * moved above the current page in the tree.
193 		 *
194 		 * On other error, simply return, the swap call ensures we're
195 		 * holding nothing on failure.
196 		 */
197 		read_flags = WT_READ_RESTART_OK;
198 		if (F_ISSET(cbt, WT_CBT_READ_ONCE))
199 			FLD_SET(read_flags, WT_READ_WONT_NEED);
200 		if ((ret = __wt_page_swap(session,
201 		    current, descent, read_flags)) == 0) {
202 			current = descent;
203 			continue;
204 		}
205 		if (ret == WT_RESTART)
206 			goto restart;
207 		return (ret);
208 	}
209 
210 	/* Track how deep the tree gets. */
211 	if (depth > btree->maximum_depth)
212 		btree->maximum_depth = depth;
213 
214 leaf_only:
215 	page = current->page;
216 	cbt->ref = current;
217 
218 	/*
219 	 * Don't bother searching if the caller is appending a new record where
220 	 * we'll allocate the record number; we're not going to find a match by
221 	 * definition, and we figure out the record number and position when we
222 	 * do the work.
223 	 */
224 	if (search_recno == WT_RECNO_OOB) {
225 		cbt->compare = -1;
226 		return (0);
227 	}
228 
229 	/*
230 	 * Search the leaf page.
231 	 *
232 	 * Search after a page is pinned does a search of the pinned page before
233 	 * doing a full tree search, in which case we might be searching for a
234 	 * record logically before the page. Return failure, and there's nothing
235 	 * else to do, the record isn't going to be on this page.
236 	 *
237 	 * We don't check inside the search path for a record greater than the
238 	 * maximum record in the tree; in that case, we get here with a record
239 	 * that's impossibly large for the page. We do have additional setup to
240 	 * do in that case, the record may be appended to the page.
241 	 */
242 	if (page->type == WT_PAGE_COL_FIX) {
243 		if (recno < current->ref_recno) {
244 			cbt->recno = current->ref_recno;
245 			cbt->compare = 1;
246 			return (0);
247 		}
248 		if (recno >= current->ref_recno + page->entries) {
249 			cbt->recno = current->ref_recno + page->entries;
250 			goto past_end;
251 		} else {
252 			cbt->recno = recno;
253 			cbt->compare = 0;
254 			ins_head = WT_COL_UPDATE_SINGLE(page);
255 		}
256 	} else {
257 		if (recno < current->ref_recno) {
258 			cbt->recno = current->ref_recno;
259 			cbt->slot = 0;
260 			cbt->compare = 1;
261 			return (0);
262 		}
263 		if ((cip = __col_var_search(current, recno, NULL)) == NULL) {
264 			cbt->recno = __col_var_last_recno(current);
265 			cbt->slot = page->entries == 0 ? 0 : page->entries - 1;
266 			goto past_end;
267 		} else {
268 			cbt->recno = recno;
269 			cbt->slot = WT_COL_SLOT(page, cip);
270 			cbt->compare = 0;
271 			ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
272 			F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH);
273 		}
274 	}
275 
276 	/*
277 	 * We have a match on the page, check for an update.  Check the page's
278 	 * update list (fixed-length), or slot's update list (variable-length)
279 	 * for a better match.  The only better match we can find is an exact
280 	 * match, otherwise the existing match on the page is the one we want.
281 	 * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
282 	 * until we know we have a useful entry.
283 	 */
284 	if ((ins = __col_insert_search(
285 	    ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
286 		if (recno == WT_INSERT_RECNO(ins)) {
287 			cbt->ins_head = ins_head;
288 			cbt->ins = ins;
289 		}
290 	return (0);
291 
292 past_end:
293 	/*
294 	 * A record past the end of the page's standard information.  Check the
295 	 * append list; by definition, any record on the append list is closer
296 	 * than the last record on the page, so it's a better choice for return.
297 	 * This is a rarely used path: we normally find exact matches, because
298 	 * column-store files are dense, but in this case the caller searched
299 	 * past the end of the table.
300 	 */
301 	cbt->ins_head = WT_COL_APPEND(page);
302 	if ((cbt->ins = __col_insert_search(
303 	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
304 		cbt->compare = -1;
305 	else {
306 		cbt->recno = WT_INSERT_RECNO(cbt->ins);
307 		if (recno == cbt->recno)
308 			cbt->compare = 0;
309 		else if (recno < cbt->recno)
310 			cbt->compare = 1;
311 		else
312 			cbt->compare = -1;
313 	}
314 	return (0);
315 }
316