1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * __key_return --
13  *	Change the cursor to reference an internal return key.
14  */
15 static inline int
__key_return(WT_CURSOR_BTREE * cbt)16 __key_return(WT_CURSOR_BTREE *cbt)
17 {
18 	WT_CURSOR *cursor;
19 	WT_ITEM *tmp;
20 	WT_PAGE *page;
21 	WT_ROW *rip;
22 	WT_SESSION_IMPL *session;
23 
24 	page = cbt->ref->page;
25 	cursor = &cbt->iface;
26 	session = (WT_SESSION_IMPL *)cbt->iface.session;
27 
28 	if (page->type == WT_PAGE_ROW_LEAF) {
29 		rip = &page->pg_row[cbt->slot];
30 
31 		/*
32 		 * If the cursor references a WT_INSERT item, take its key.
33 		 * Else, if we have an exact match, we copied the key in the
34 		 * search function, take it from there.
35 		 * If we don't have an exact match, take the key from the
36 		 * original page.
37 		 */
38 		if (cbt->ins != NULL) {
39 			cursor->key.data = WT_INSERT_KEY(cbt->ins);
40 			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
41 			return (0);
42 		}
43 
44 		if (cbt->compare == 0) {
45 			/*
46 			 * If not in an insert list and there's an exact match,
47 			 * the row-store search function built the key we want
48 			 * to return in the cursor's temporary buffer. Swap the
49 			 * cursor's search-key and temporary buffers so we can
50 			 * return it (it's unsafe to return the temporary buffer
51 			 * itself because our caller might do another search in
52 			 * this table using the key we return, and we'd corrupt
53 			 * the search key during any subsequent search that used
54 			 * the temporary buffer).
55 			 */
56 			tmp = cbt->row_key;
57 			cbt->row_key = cbt->tmp;
58 			cbt->tmp = tmp;
59 
60 			cursor->key.data = cbt->row_key->data;
61 			cursor->key.size = cbt->row_key->size;
62 			return (0);
63 		}
64 		return (__wt_row_leaf_key(
65 		    session, page, rip, &cursor->key, false));
66 	}
67 
68 	/*
69 	 * WT_PAGE_COL_FIX, WT_PAGE_COL_VAR:
70 	 *	The interface cursor's record has usually been set, but that
71 	 * isn't universally true, specifically, cursor.search_near may call
72 	 * here without first setting the interface cursor.
73 	 */
74 	cursor->recno = cbt->recno;
75 	return (0);
76 }
77 
78 /*
79  * __value_return --
80  *	Change the cursor to reference an internal original-page return value.
81  */
82 static inline int
__value_return(WT_CURSOR_BTREE * cbt)83 __value_return(WT_CURSOR_BTREE *cbt)
84 {
85 	WT_BTREE *btree;
86 	WT_CELL *cell;
87 	WT_CELL_UNPACK unpack;
88 	WT_CURSOR *cursor;
89 	WT_PAGE *page;
90 	WT_ROW *rip;
91 	WT_SESSION_IMPL *session;
92 	uint8_t v;
93 
94 	session = (WT_SESSION_IMPL *)cbt->iface.session;
95 	btree = S2BT(session);
96 
97 	page = cbt->ref->page;
98 	cursor = &cbt->iface;
99 
100 	if (page->type == WT_PAGE_ROW_LEAF) {
101 		rip = &page->pg_row[cbt->slot];
102 
103 		/* Simple values have their location encoded in the WT_ROW. */
104 		if (__wt_row_leaf_value(page, rip, &cursor->value))
105 			return (0);
106 
107 		/* Take the value from the original page cell. */
108 		__wt_row_leaf_value_cell(page, rip, NULL, &unpack);
109 		return (__wt_page_cell_data_ref(
110 		    session, page, &unpack, &cursor->value));
111 
112 	}
113 
114 	if (page->type == WT_PAGE_COL_VAR) {
115 		/* Take the value from the original page cell. */
116 		cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]);
117 		__wt_cell_unpack(cell, &unpack);
118 		return (__wt_page_cell_data_ref(
119 		    session, page, &unpack, &cursor->value));
120 	}
121 
122 	/* WT_PAGE_COL_FIX: Take the value from the original page. */
123 	v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt);
124 	return (__wt_buf_set(session, &cursor->value, &v, 1));
125 }
126 
127 /*
128  * When threads race modifying a record, we can end up with more than the usual
129  * maximum number of modifications in an update list.  We'd prefer not to
130  * allocate memory in a return path, so add a few additional slots to the array
131  * we use to build up a list of modify records to apply.
132  */
133 #define	WT_MODIFY_ARRAY_SIZE	(WT_MAX_MODIFY_UPDATE + 10)
134 
135 /*
136  * __wt_value_return_upd --
137  *	Change the cursor to reference an internal update structure return
138  *	value.
139  */
140 int
__wt_value_return_upd(WT_CURSOR_BTREE * cbt,WT_UPDATE * upd,bool ignore_visibility)141 __wt_value_return_upd(
142     WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility)
143 {
144 	WT_CURSOR *cursor;
145 	WT_DECL_RET;
146 	WT_SESSION_IMPL *session;
147 	WT_UPDATE **listp, *list[WT_MODIFY_ARRAY_SIZE];
148 	size_t allocated_bytes;
149 	u_int i;
150 	bool skipped_birthmark;
151 
152 	cursor = &cbt->iface;
153 	allocated_bytes = 0;
154 	session = (WT_SESSION_IMPL *)cbt->iface.session;
155 
156 	/*
157 	 * We're passed a "standard" or "modified"  update that's visible to us.
158 	 * Our caller should have already checked for deleted items (we're too
159 	 * far down the call stack to return not-found).
160 	 *
161 	 * Fast path if it's a standard item, assert our caller's behavior.
162 	 */
163 	if (upd->type == WT_UPDATE_STANDARD) {
164 		cursor->value.data = upd->data;
165 		cursor->value.size = upd->size;
166 		return (0);
167 	}
168 	WT_ASSERT(session, upd->type == WT_UPDATE_MODIFY);
169 
170 	/*
171 	 * Find a complete update that's visible to us, tracking modifications
172 	 * that are visible to us.
173 	 */
174 	for (i = 0, listp = list, skipped_birthmark = false;
175 	    upd != NULL;
176 	    upd = upd->next) {
177 		if (upd->txnid == WT_TXN_ABORTED)
178 			continue;
179 
180 		if (!ignore_visibility && !__wt_txn_upd_visible(session, upd)) {
181 			if (upd->type == WT_UPDATE_BIRTHMARK)
182 				skipped_birthmark = true;
183 			continue;
184 		}
185 
186 		if (upd->type == WT_UPDATE_BIRTHMARK) {
187 			upd = NULL;
188 			break;
189 		}
190 
191 		if (WT_UPDATE_DATA_VALUE(upd))
192 			break;
193 
194 		if (upd->type == WT_UPDATE_MODIFY) {
195 			/*
196 			 * Update lists are expected to be short, but it's not
197 			 * guaranteed. There's sufficient room on the stack to
198 			 * avoid memory allocation in normal cases, but we have
199 			 * to handle the edge cases too.
200 			 */
201 			if (i >= WT_MODIFY_ARRAY_SIZE) {
202 				if (i == WT_MODIFY_ARRAY_SIZE)
203 					listp = NULL;
204 				WT_ERR(__wt_realloc_def(
205 				    session, &allocated_bytes, i + 1, &listp));
206 				if (i == WT_MODIFY_ARRAY_SIZE)
207 					memcpy(listp, list, sizeof(list));
208 			}
209 			listp[i++] = upd;
210 
211 			/*
212 			 * Once a modify is found, all previously committed
213 			 * modifications should be applied regardless of
214 			 * visibility.
215 			 */
216 			ignore_visibility = true;
217 		}
218 	}
219 
220 	/*
221 	 * If there's no visible update and we skipped a birthmark, the base
222 	 * item is an empty item (in other words, birthmarks we can't read act
223 	 * as tombstones).
224 	 * If there's no visible update and we didn't skip a birthmark, the base
225 	 * item is the on-page item, which must be globally visible.
226 	 * If there's a visible update and it's a tombstone, the base item is an
227 	 * empty item.
228 	 * If there's a visible update and it's not a tombstone, the base item
229 	 * is the on-page item.
230 	 */
231 	if (upd == NULL) {
232 		if (skipped_birthmark)
233 			WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
234 		else {
235 			/*
236 			 * Callers of this function set the cursor slot to an
237 			 * impossible value to check we don't try and return
238 			 * on-page values when the update list should have been
239 			 * sufficient (which happens, for example, if an update
240 			 * list was truncated, deleting some standard update
241 			 * required by a previous modify update). Assert the
242 			 * case.
243 			 */
244 			WT_ASSERT(session, cbt->slot != UINT32_MAX);
245 
246 			WT_ERR(__value_return(cbt));
247 		}
248 	} else if (upd->type == WT_UPDATE_TOMBSTONE)
249 		WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
250 	else
251 		WT_ERR(__wt_buf_set(session,
252 		    &cursor->value, upd->data, upd->size));
253 
254 	/*
255 	 * Once we have a base item, roll forward through any visible modify
256 	 * updates.
257 	 */
258 	while (i > 0)
259 		WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data));
260 
261 err:	if (allocated_bytes != 0)
262 		__wt_free(session, listp);
263 	return (ret);
264 }
265 
266 /*
267  * __wt_key_return --
268  *	Change the cursor to reference an internal return key.
269  */
270 int
__wt_key_return(WT_CURSOR_BTREE * cbt)271 __wt_key_return(WT_CURSOR_BTREE *cbt)
272 {
273 	WT_CURSOR *cursor;
274 
275 	cursor = &cbt->iface;
276 
277 	/*
278 	 * We may already have an internal key and the cursor may not be set up
279 	 * to get another copy, so we have to leave it alone. Consider a cursor
280 	 * search followed by an update: the update doesn't repeat the search,
281 	 * it simply updates the currently referenced key's value. We will end
282 	 * up here with the correct internal key, but we can't "return" the key
283 	 * again even if we wanted to do the additional work, the cursor isn't
284 	 * set up for that because we didn't just complete a search.
285 	 */
286 	F_CLR(cursor, WT_CURSTD_KEY_EXT);
287 	if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
288 		WT_RET(__key_return(cbt));
289 		F_SET(cursor, WT_CURSTD_KEY_INT);
290 	}
291 	return (0);
292 }
293 
294 /*
295  * __wt_value_return --
296  *	Change the cursor to reference an internal return value.
297  */
298 int
__wt_value_return(WT_CURSOR_BTREE * cbt,WT_UPDATE * upd)299 __wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
300 {
301 	WT_CURSOR *cursor;
302 
303 	cursor = &cbt->iface;
304 
305 	F_CLR(cursor, WT_CURSTD_VALUE_EXT);
306 	if (upd == NULL)
307 		WT_RET(__value_return(cbt));
308 	else
309 		WT_RET(__wt_value_return_upd(cbt, upd, false));
310 	F_SET(cursor, WT_CURSTD_VALUE_INT);
311 	return (0);
312 }
313