1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * __cursor_set_recno --
11  *	The cursor value in the interface has to track the value in the
12  * underlying cursor, update them in parallel.
13  */
14 static inline void
__cursor_set_recno(WT_CURSOR_BTREE * cbt,uint64_t v)15 __cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
16 {
17 	cbt->iface.recno = cbt->recno = v;
18 }
19 
20 /*
21  * __cursor_novalue --
22  *	Release any cached value before an operation that could update the
23  * transaction context and free data a value is pointing to.
24  */
25 static inline void
__cursor_novalue(WT_CURSOR * cursor)26 __cursor_novalue(WT_CURSOR *cursor)
27 {
28 	F_CLR(cursor, WT_CURSTD_VALUE_INT);
29 }
30 
31 /*
32  * __cursor_checkkey --
33  *	Check if a key is set without making a copy.
34  */
35 static inline int
__cursor_checkkey(WT_CURSOR * cursor)36 __cursor_checkkey(WT_CURSOR *cursor)
37 {
38 	return (F_ISSET(cursor, WT_CURSTD_KEY_SET) ?
39 	    0 : __wt_cursor_kv_not_set(cursor, true));
40 }
41 
42 /*
43  * __cursor_checkvalue --
44  *	Check if a value is set without making a copy.
45  */
46 static inline int
__cursor_checkvalue(WT_CURSOR * cursor)47 __cursor_checkvalue(WT_CURSOR *cursor)
48 {
49 	return (F_ISSET(cursor, WT_CURSTD_VALUE_SET) ?
50 	    0 : __wt_cursor_kv_not_set(cursor, false));
51 }
52 
53 /*
54  * __cursor_localkey --
55  *	If the key points into the tree, get a local copy.
56  */
57 static inline int
__cursor_localkey(WT_CURSOR * cursor)58 __cursor_localkey(WT_CURSOR *cursor)
59 {
60 	if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
61 		if (!WT_DATA_IN_ITEM(&cursor->key))
62 			WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session,
63 			    &cursor->key, cursor->key.data, cursor->key.size));
64 		F_CLR(cursor, WT_CURSTD_KEY_INT);
65 		F_SET(cursor, WT_CURSTD_KEY_EXT);
66 	}
67 	return (0);
68 }
69 
70 /*
71  * __cursor_localvalue --
72  *	If the value points into the tree, get a local copy.
73  */
74 static inline int
__cursor_localvalue(WT_CURSOR * cursor)75 __cursor_localvalue(WT_CURSOR *cursor)
76 {
77 	if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {
78 		if (!WT_DATA_IN_ITEM(&cursor->value))
79 			WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session,
80 			    &cursor->value,
81 			    cursor->value.data, cursor->value.size));
82 		F_CLR(cursor, WT_CURSTD_VALUE_INT);
83 		F_SET(cursor, WT_CURSTD_VALUE_EXT);
84 	}
85 	return (0);
86 }
87 
88 /*
89  * __cursor_needkey --
90  *
91  * Check if we have a key set. There's an additional semantic here: if we're
92  * pointing into the tree, get a local copy of whatever we're referencing in
93  * the tree, there's an obvious race with the cursor moving and the reference.
94  */
95 static inline int
__cursor_needkey(WT_CURSOR * cursor)96 __cursor_needkey(WT_CURSOR *cursor)
97 {
98 	WT_RET(__cursor_localkey(cursor));
99 	return (__cursor_checkkey(cursor));
100 }
101 
102 /*
103  * __cursor_needvalue --
104  *
105  * Check if we have a value set. There's an additional semantic here: if we're
106  * pointing into the tree, get a local copy of whatever we're referencing in
107  * the tree, there's an obvious race with the cursor moving and the reference.
108  */
109 static inline int
__cursor_needvalue(WT_CURSOR * cursor)110 __cursor_needvalue(WT_CURSOR *cursor)
111 {
112 	WT_RET(__cursor_localvalue(cursor));
113 	return (__cursor_checkvalue(cursor));
114 }
115 
116 /*
117  * __cursor_pos_clear --
118  *	Reset the cursor's location.
119  */
120 static inline void
__cursor_pos_clear(WT_CURSOR_BTREE * cbt)121 __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
122 {
123 	/*
124 	 * Most of the cursor's location information that needs to be set on
125 	 * successful return is always set by a successful return, for example,
126 	 * we don't initialize the compare return value because it's always
127 	 * set by the row-store search.  The other stuff gets cleared here,
128 	 * and it's a minimal set of things we need to clear. It would be a
129 	 * lot simpler to clear everything, but we call this function a lot.
130 	 */
131 	cbt->recno = WT_RECNO_OOB;
132 
133 	cbt->ins = NULL;
134 	cbt->ins_head = NULL;
135 	cbt->ins_stack[0] = NULL;
136 
137 	F_CLR(cbt, WT_CBT_POSITION_MASK);
138 }
139 
140 /*
141  * __cursor_enter --
142  *	Activate a cursor.
143  */
144 static inline int
__cursor_enter(WT_SESSION_IMPL * session)145 __cursor_enter(WT_SESSION_IMPL *session)
146 {
147 	/*
148 	 * If there are no other cursors positioned in the session, check
149 	 * whether the cache is full.
150 	 */
151 	if (session->ncursors == 0)
152 		WT_RET(__wt_cache_eviction_check(session, false, false, NULL));
153 	++session->ncursors;
154 	return (0);
155 }
156 
157 /*
158  * __cursor_leave --
159  *	Deactivate a cursor.
160  */
161 static inline void
__cursor_leave(WT_SESSION_IMPL * session)162 __cursor_leave(WT_SESSION_IMPL *session)
163 {
164 	/*
165 	 * Decrement the count of active cursors in the session.  When that
166 	 * goes to zero, there are no active cursors, and we can release any
167 	 * snapshot we're holding for read committed isolation.
168 	 */
169 	WT_ASSERT(session, session->ncursors > 0);
170 	if (--session->ncursors == 0)
171 		__wt_txn_read_last(session);
172 }
173 
174 /*
175  * __cursor_reset --
176  *	Reset the cursor, it no longer holds any position.
177  */
178 static inline int
__cursor_reset(WT_CURSOR_BTREE * cbt)179 __cursor_reset(WT_CURSOR_BTREE *cbt)
180 {
181 	WT_DECL_RET;
182 	WT_SESSION_IMPL *session;
183 
184 	session = (WT_SESSION_IMPL *)cbt->iface.session;
185 
186 	__cursor_pos_clear(cbt);
187 
188 	/* If the cursor was active, deactivate it. */
189 	if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
190 		if (!F_ISSET(cbt, WT_CBT_NO_TXN))
191 			__cursor_leave(session);
192 		F_CLR(cbt, WT_CBT_ACTIVE);
193 	}
194 
195 	/* If we're not holding a cursor reference, we're done. */
196 	if (cbt->ref == NULL)
197 		return (0);
198 
199 	/*
200 	 * If we were scanning and saw a lot of deleted records on this page,
201 	 * try to evict the page when we release it.
202 	 */
203 	if (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD)
204 		__wt_page_evict_soon(session, cbt->ref);
205 	cbt->page_deleted_count = 0;
206 
207 	/*
208 	 * Release any page references we're holding. This can trigger eviction
209 	 * (e.g., forced eviction of big pages), so it's important to do after
210 	 * releasing our snapshot above.
211 	 *
212 	 * Clear the reference regardless, so we don't try the release twice.
213 	 */
214 	ret = __wt_page_release(session, cbt->ref, 0);
215 	cbt->ref = NULL;
216 
217 	return (ret);
218 }
219 
220 /*
221  * __wt_curindex_get_valuev --
222  *	Internal implementation of WT_CURSOR->get_value for index cursors
223  */
224 static inline int
__wt_curindex_get_valuev(WT_CURSOR * cursor,va_list ap)225 __wt_curindex_get_valuev(WT_CURSOR *cursor, va_list ap)
226 {
227 	WT_CURSOR_INDEX *cindex;
228 	WT_ITEM *item;
229 	WT_SESSION_IMPL *session;
230 
231 	cindex = (WT_CURSOR_INDEX *)cursor;
232 	session = (WT_SESSION_IMPL *)cursor->session;
233 	WT_RET(__cursor_checkvalue(cursor));
234 
235 	if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
236 		WT_RET(__wt_schema_project_merge(session,
237 		    cindex->cg_cursors, cindex->value_plan,
238 		    cursor->value_format, &cursor->value));
239 		item = va_arg(ap, WT_ITEM *);
240 		item->data = cursor->value.data;
241 		item->size = cursor->value.size;
242 	} else
243 		WT_RET(__wt_schema_project_out(session,
244 		    cindex->cg_cursors, cindex->value_plan, ap));
245 	return (0);
246 }
247 
248 /*
249  * __wt_curtable_get_valuev --
250  *	Internal implementation of WT_CURSOR->get_value for table cursors.
251  */
252 static inline int
__wt_curtable_get_valuev(WT_CURSOR * cursor,va_list ap)253 __wt_curtable_get_valuev(WT_CURSOR *cursor, va_list ap)
254 {
255 	WT_CURSOR *primary;
256 	WT_CURSOR_TABLE *ctable;
257 	WT_ITEM *item;
258 	WT_SESSION_IMPL *session;
259 
260 	ctable = (WT_CURSOR_TABLE *)cursor;
261 	session = (WT_SESSION_IMPL *)cursor->session;
262 	primary = *ctable->cg_cursors;
263 	WT_RET(__cursor_checkvalue(primary));
264 
265 	if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
266 		WT_RET(__wt_schema_project_merge(session,
267 		    ctable->cg_cursors, ctable->plan,
268 		    cursor->value_format, &cursor->value));
269 		item = va_arg(ap, WT_ITEM *);
270 		item->data = cursor->value.data;
271 		item->size = cursor->value.size;
272 	} else
273 		WT_RET(__wt_schema_project_out(session,
274 		    ctable->cg_cursors, ctable->plan, ap));
275 	return (0);
276 }
277 
278 /*
279  * __wt_cursor_dhandle_incr_use --
280  *	Increment the in-use counter in the cursor's data source.
281  */
282 static inline void
__wt_cursor_dhandle_incr_use(WT_SESSION_IMPL * session)283 __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
284 {
285 	WT_DATA_HANDLE *dhandle;
286 
287 	dhandle = session->dhandle;
288 
289 	/* If we open a handle with a time of death set, clear it. */
290 	if (__wt_atomic_addi32(&dhandle->session_inuse, 1) == 1 &&
291 	    dhandle->timeofdeath != 0)
292 		dhandle->timeofdeath = 0;
293 }
294 
295 /*
296  * __wt_cursor_dhandle_decr_use --
297  *	Decrement the in-use counter in the cursor's data source.
298  */
299 static inline void
__wt_cursor_dhandle_decr_use(WT_SESSION_IMPL * session)300 __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session)
301 {
302 	WT_DATA_HANDLE *dhandle;
303 
304 	dhandle = session->dhandle;
305 
306 	/* If we close a handle with a time of death set, clear it. */
307 	WT_ASSERT(session, dhandle->session_inuse > 0);
308 	if (__wt_atomic_subi32(&dhandle->session_inuse, 1) == 0 &&
309 	    dhandle->timeofdeath != 0)
310 		dhandle->timeofdeath = 0;
311 }
312 
313 /*
314  * __cursor_kv_return --
315  *      Return a page referenced key/value pair to the application.
316  */
317 static inline int
__cursor_kv_return(WT_CURSOR_BTREE * cbt,WT_UPDATE * upd)318 __cursor_kv_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
319 {
320 	WT_RET(__wt_key_return(cbt));
321 	WT_RET(__wt_value_return(cbt, upd));
322 
323 	return (0);
324 }
325 
326 /*
327  * __cursor_func_init --
328  *	Cursor call setup.
329  */
330 static inline int
__cursor_func_init(WT_CURSOR_BTREE * cbt,bool reenter)331 __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
332 {
333 	WT_SESSION_IMPL *session;
334 
335 	session = (WT_SESSION_IMPL *)cbt->iface.session;
336 
337 	if (reenter) {
338 #ifdef HAVE_DIAGNOSTIC
339 		__wt_cursor_key_order_reset(cbt);
340 #endif
341 		WT_RET(__cursor_reset(cbt));
342 	}
343 
344 	/*
345 	 * Any old insert position is now invalid.  We rely on this being
346 	 * cleared to detect if a new skiplist is installed after a search.
347 	 */
348 	cbt->ins_stack[0] = NULL;
349 
350 	/* If the transaction is idle, check that the cache isn't full. */
351 	WT_RET(__wt_txn_idle_cache_check(session));
352 
353 	/* Activate the file cursor. */
354 	if (!F_ISSET(cbt, WT_CBT_ACTIVE)) {
355 		if (!F_ISSET(cbt, WT_CBT_NO_TXN))
356 			WT_RET(__cursor_enter(session));
357 		F_SET(cbt, WT_CBT_ACTIVE);
358 	}
359 
360 	/*
361 	 * If this is an ordinary transactional cursor, make sure we are set up
362 	 * to read.
363 	 */
364 	if (!F_ISSET(cbt, WT_CBT_NO_TXN))
365 		__wt_txn_cursor_op(session);
366 	return (0);
367 }
368 
369 /*
370  * __cursor_row_slot_return --
371  *	Return a row-store leaf page slot's K/V pair.
372  */
373 static inline int
__cursor_row_slot_return(WT_CURSOR_BTREE * cbt,WT_ROW * rip,WT_UPDATE * upd)374 __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
375 {
376 	WT_BTREE *btree;
377 	WT_CELL *cell;
378 	WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
379 	WT_ITEM *kb, *vb;
380 	WT_PAGE *page;
381 	WT_SESSION_IMPL *session;
382 	void *copy;
383 
384 	session = (WT_SESSION_IMPL *)cbt->iface.session;
385 	btree = S2BT(session);
386 	page = cbt->ref->page;
387 
388 	kpack = NULL;
389 	vpack = &_vpack;
390 
391 	kb = &cbt->iface.key;
392 	vb = &cbt->iface.value;
393 
394 	/*
395 	 * The row-store key can change underfoot; explicitly take a copy.
396 	 */
397 	copy = WT_ROW_KEY_COPY(rip);
398 
399 	/*
400 	 * Get a key: we could just call __wt_row_leaf_key, but as a cursor
401 	 * is running through the tree, we may have additional information
402 	 * here (we may have the fully-built key that's immediately before
403 	 * the prefix-compressed key we want, so it's a faster construction).
404 	 *
405 	 * First, check for an immediately available key.
406 	 */
407 	if (__wt_row_leaf_key_info(
408 	    page, copy, NULL, &cell, &kb->data, &kb->size))
409 		goto value;
410 
411 	/* Huffman encoded keys are a slow path in all cases. */
412 	if (btree->huffman_key != NULL)
413 		goto slow;
414 
415 	/*
416 	 * Unpack the cell and deal with overflow and prefix-compressed keys.
417 	 * Inline building simple prefix-compressed keys from a previous key,
418 	 * otherwise build from scratch.
419 	 */
420 	kpack = &_kpack;
421 	__wt_cell_unpack(cell, kpack);
422 	if (kpack->type == WT_CELL_KEY &&
423 	    cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
424 		WT_ASSERT(session, cbt->row_key->size >= kpack->prefix);
425 
426 		/*
427 		 * Grow the buffer as necessary as well as ensure data has been
428 		 * copied into local buffer space, then append the suffix to the
429 		 * prefix already in the buffer.
430 		 *
431 		 * Don't grow the buffer unnecessarily or copy data we don't
432 		 * need, truncate the item's data length to the prefix bytes.
433 		 */
434 		cbt->row_key->size = kpack->prefix;
435 		WT_RET(__wt_buf_grow(
436 		    session, cbt->row_key, cbt->row_key->size + kpack->size));
437 		memcpy((uint8_t *)cbt->row_key->data + cbt->row_key->size,
438 		    kpack->data, kpack->size);
439 		cbt->row_key->size += kpack->size;
440 	} else {
441 		/*
442 		 * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we
443 		 * already did __wt_row_leaf_key's fast-path checks inline.
444 		 */
445 slow:		WT_RET(__wt_row_leaf_key_work(
446 		    session, page, rip, cbt->row_key, false));
447 	}
448 	kb->data = cbt->row_key->data;
449 	kb->size = cbt->row_key->size;
450 	cbt->rip_saved = rip;
451 
452 value:
453 	/*
454 	 * If the item was ever modified, use the WT_UPDATE data.  Note the
455 	 * caller passes us the update: it has already resolved which one
456 	 * (if any) is visible.
457 	 */
458 	if (upd != NULL)
459 		return (__wt_value_return(cbt, upd));
460 
461 	/* Else, simple values have their location encoded in the WT_ROW. */
462 	if (__wt_row_leaf_value(page, rip, vb))
463 		return (0);
464 
465 	/* Else, take the value from the original page cell. */
466 	__wt_row_leaf_value_cell(page, rip, kpack, vpack);
467 	return (__wt_page_cell_data_ref(session, cbt->ref->page, vpack, vb));
468 }
469