1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 /*
12 * __key_return --
13 * Change the cursor to reference an internal return key.
14 */
15 static inline int
__key_return(WT_CURSOR_BTREE * cbt)16 __key_return(WT_CURSOR_BTREE *cbt)
17 {
18 WT_CURSOR *cursor;
19 WT_ITEM *tmp;
20 WT_PAGE *page;
21 WT_ROW *rip;
22 WT_SESSION_IMPL *session;
23
24 page = cbt->ref->page;
25 cursor = &cbt->iface;
26 session = (WT_SESSION_IMPL *)cbt->iface.session;
27
28 if (page->type == WT_PAGE_ROW_LEAF) {
29 rip = &page->pg_row[cbt->slot];
30
31 /*
32 * If the cursor references a WT_INSERT item, take its key.
33 * Else, if we have an exact match, we copied the key in the
34 * search function, take it from there.
35 * If we don't have an exact match, take the key from the
36 * original page.
37 */
38 if (cbt->ins != NULL) {
39 cursor->key.data = WT_INSERT_KEY(cbt->ins);
40 cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
41 return (0);
42 }
43
44 if (cbt->compare == 0) {
45 /*
46 * If not in an insert list and there's an exact match,
47 * the row-store search function built the key we want
48 * to return in the cursor's temporary buffer. Swap the
49 * cursor's search-key and temporary buffers so we can
50 * return it (it's unsafe to return the temporary buffer
51 * itself because our caller might do another search in
52 * this table using the key we return, and we'd corrupt
53 * the search key during any subsequent search that used
54 * the temporary buffer).
55 */
56 tmp = cbt->row_key;
57 cbt->row_key = cbt->tmp;
58 cbt->tmp = tmp;
59
60 cursor->key.data = cbt->row_key->data;
61 cursor->key.size = cbt->row_key->size;
62 return (0);
63 }
64 return (__wt_row_leaf_key(
65 session, page, rip, &cursor->key, false));
66 }
67
68 /*
69 * WT_PAGE_COL_FIX, WT_PAGE_COL_VAR:
70 * The interface cursor's record has usually been set, but that
71 * isn't universally true, specifically, cursor.search_near may call
72 * here without first setting the interface cursor.
73 */
74 cursor->recno = cbt->recno;
75 return (0);
76 }
77
78 /*
79 * __value_return --
80 * Change the cursor to reference an internal original-page return value.
81 */
82 static inline int
__value_return(WT_CURSOR_BTREE * cbt)83 __value_return(WT_CURSOR_BTREE *cbt)
84 {
85 WT_BTREE *btree;
86 WT_CELL *cell;
87 WT_CELL_UNPACK unpack;
88 WT_CURSOR *cursor;
89 WT_PAGE *page;
90 WT_ROW *rip;
91 WT_SESSION_IMPL *session;
92 uint8_t v;
93
94 session = (WT_SESSION_IMPL *)cbt->iface.session;
95 btree = S2BT(session);
96
97 page = cbt->ref->page;
98 cursor = &cbt->iface;
99
100 if (page->type == WT_PAGE_ROW_LEAF) {
101 rip = &page->pg_row[cbt->slot];
102
103 /* Simple values have their location encoded in the WT_ROW. */
104 if (__wt_row_leaf_value(page, rip, &cursor->value))
105 return (0);
106
107 /* Take the value from the original page cell. */
108 __wt_row_leaf_value_cell(page, rip, NULL, &unpack);
109 return (__wt_page_cell_data_ref(
110 session, page, &unpack, &cursor->value));
111
112 }
113
114 if (page->type == WT_PAGE_COL_VAR) {
115 /* Take the value from the original page cell. */
116 cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]);
117 __wt_cell_unpack(cell, &unpack);
118 return (__wt_page_cell_data_ref(
119 session, page, &unpack, &cursor->value));
120 }
121
122 /* WT_PAGE_COL_FIX: Take the value from the original page. */
123 v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt);
124 return (__wt_buf_set(session, &cursor->value, &v, 1));
125 }
126
127 /*
128 * When threads race modifying a record, we can end up with more than the usual
129 * maximum number of modifications in an update list. We'd prefer not to
130 * allocate memory in a return path, so add a few additional slots to the array
131 * we use to build up a list of modify records to apply.
132 */
133 #define WT_MODIFY_ARRAY_SIZE (WT_MAX_MODIFY_UPDATE + 10)
134
135 /*
136 * __wt_value_return_upd --
137 * Change the cursor to reference an internal update structure return
138 * value.
139 */
140 int
__wt_value_return_upd(WT_CURSOR_BTREE * cbt,WT_UPDATE * upd,bool ignore_visibility)141 __wt_value_return_upd(
142 WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility)
143 {
144 WT_CURSOR *cursor;
145 WT_DECL_RET;
146 WT_SESSION_IMPL *session;
147 WT_UPDATE **listp, *list[WT_MODIFY_ARRAY_SIZE];
148 size_t allocated_bytes;
149 u_int i;
150 bool skipped_birthmark;
151
152 cursor = &cbt->iface;
153 allocated_bytes = 0;
154 session = (WT_SESSION_IMPL *)cbt->iface.session;
155
156 /*
157 * We're passed a "standard" or "modified" update that's visible to us.
158 * Our caller should have already checked for deleted items (we're too
159 * far down the call stack to return not-found).
160 *
161 * Fast path if it's a standard item, assert our caller's behavior.
162 */
163 if (upd->type == WT_UPDATE_STANDARD) {
164 cursor->value.data = upd->data;
165 cursor->value.size = upd->size;
166 return (0);
167 }
168 WT_ASSERT(session, upd->type == WT_UPDATE_MODIFY);
169
170 /*
171 * Find a complete update that's visible to us, tracking modifications
172 * that are visible to us.
173 */
174 for (i = 0, listp = list, skipped_birthmark = false;
175 upd != NULL;
176 upd = upd->next) {
177 if (upd->txnid == WT_TXN_ABORTED)
178 continue;
179
180 if (!ignore_visibility && !__wt_txn_upd_visible(session, upd)) {
181 if (upd->type == WT_UPDATE_BIRTHMARK)
182 skipped_birthmark = true;
183 continue;
184 }
185
186 if (upd->type == WT_UPDATE_BIRTHMARK) {
187 upd = NULL;
188 break;
189 }
190
191 if (WT_UPDATE_DATA_VALUE(upd))
192 break;
193
194 if (upd->type == WT_UPDATE_MODIFY) {
195 /*
196 * Update lists are expected to be short, but it's not
197 * guaranteed. There's sufficient room on the stack to
198 * avoid memory allocation in normal cases, but we have
199 * to handle the edge cases too.
200 */
201 if (i >= WT_MODIFY_ARRAY_SIZE) {
202 if (i == WT_MODIFY_ARRAY_SIZE)
203 listp = NULL;
204 WT_ERR(__wt_realloc_def(
205 session, &allocated_bytes, i + 1, &listp));
206 if (i == WT_MODIFY_ARRAY_SIZE)
207 memcpy(listp, list, sizeof(list));
208 }
209 listp[i++] = upd;
210
211 /*
212 * Once a modify is found, all previously committed
213 * modifications should be applied regardless of
214 * visibility.
215 */
216 ignore_visibility = true;
217 }
218 }
219
220 /*
221 * If there's no visible update and we skipped a birthmark, the base
222 * item is an empty item (in other words, birthmarks we can't read act
223 * as tombstones).
224 * If there's no visible update and we didn't skip a birthmark, the base
225 * item is the on-page item, which must be globally visible.
226 * If there's a visible update and it's a tombstone, the base item is an
227 * empty item.
228 * If there's a visible update and it's not a tombstone, the base item
229 * is the on-page item.
230 */
231 if (upd == NULL) {
232 if (skipped_birthmark)
233 WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
234 else {
235 /*
236 * Callers of this function set the cursor slot to an
237 * impossible value to check we don't try and return
238 * on-page values when the update list should have been
239 * sufficient (which happens, for example, if an update
240 * list was truncated, deleting some standard update
241 * required by a previous modify update). Assert the
242 * case.
243 */
244 WT_ASSERT(session, cbt->slot != UINT32_MAX);
245
246 WT_ERR(__value_return(cbt));
247 }
248 } else if (upd->type == WT_UPDATE_TOMBSTONE)
249 WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
250 else
251 WT_ERR(__wt_buf_set(session,
252 &cursor->value, upd->data, upd->size));
253
254 /*
255 * Once we have a base item, roll forward through any visible modify
256 * updates.
257 */
258 while (i > 0)
259 WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data));
260
261 err: if (allocated_bytes != 0)
262 __wt_free(session, listp);
263 return (ret);
264 }
265
266 /*
267 * __wt_key_return --
268 * Change the cursor to reference an internal return key.
269 */
270 int
__wt_key_return(WT_CURSOR_BTREE * cbt)271 __wt_key_return(WT_CURSOR_BTREE *cbt)
272 {
273 WT_CURSOR *cursor;
274
275 cursor = &cbt->iface;
276
277 /*
278 * We may already have an internal key and the cursor may not be set up
279 * to get another copy, so we have to leave it alone. Consider a cursor
280 * search followed by an update: the update doesn't repeat the search,
281 * it simply updates the currently referenced key's value. We will end
282 * up here with the correct internal key, but we can't "return" the key
283 * again even if we wanted to do the additional work, the cursor isn't
284 * set up for that because we didn't just complete a search.
285 */
286 F_CLR(cursor, WT_CURSTD_KEY_EXT);
287 if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
288 WT_RET(__key_return(cbt));
289 F_SET(cursor, WT_CURSTD_KEY_INT);
290 }
291 return (0);
292 }
293
294 /*
295 * __wt_value_return --
296 * Change the cursor to reference an internal return value.
297 */
298 int
__wt_value_return(WT_CURSOR_BTREE * cbt,WT_UPDATE * upd)299 __wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
300 {
301 WT_CURSOR *cursor;
302
303 cursor = &cbt->iface;
304
305 F_CLR(cursor, WT_CURSTD_VALUE_EXT);
306 if (upd == NULL)
307 WT_RET(__value_return(cbt));
308 else
309 WT_RET(__wt_value_return_upd(cbt, upd, false));
310 F_SET(cursor, WT_CURSTD_VALUE_INT);
311 return (0);
312 }
313