1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 static int __col_insert_alloc(
12     WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);
13 
14 /*
15  * __wt_col_modify --
16  *	Column-store delete, insert, and update.
17  */
18 int
__wt_col_modify(WT_CURSOR_BTREE * cbt,uint64_t recno,const WT_ITEM * value,WT_UPDATE * upd_arg,u_int modify_type,bool exclusive)19 __wt_col_modify(WT_CURSOR_BTREE *cbt,
20     uint64_t recno, const WT_ITEM *value,
21     WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
22 {
23 	static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
24 	WT_BTREE *btree;
25 	WT_DECL_RET;
26 	WT_INSERT *ins;
27 	WT_INSERT_HEAD *ins_head, **ins_headp;
28 	WT_PAGE *page;
29 	WT_PAGE_MODIFY *mod;
30 	WT_SESSION_IMPL *session;
31 	WT_UPDATE *old_upd, *upd;
32 	size_t ins_size, upd_size;
33 	u_int i, skipdepth;
34 	bool append, logged;
35 
36 	btree = cbt->btree;
37 	ins = NULL;
38 	page = cbt->ref->page;
39 	session = (WT_SESSION_IMPL *)cbt->iface.session;
40 	upd = upd_arg;
41 	append = logged = false;
42 
43 	if (upd_arg == NULL) {
44 		if (modify_type == WT_UPDATE_RESERVE ||
45 		    modify_type == WT_UPDATE_TOMBSTONE) {
46 			/*
47 			 * Fixed-size column-store doesn't have on-page deleted
48 			 * values, it's a nul byte.
49 			 */
50 			if (modify_type == WT_UPDATE_TOMBSTONE &&
51 			    btree->type == BTREE_COL_FIX) {
52 				modify_type = WT_UPDATE_STANDARD;
53 				value = &col_fix_remove;
54 			}
55 		}
56 
57 		/*
58 		 * There's a chance the application specified a record past the
59 		 * last record on the page. If that's the case and we're
60 		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
61 		 * append list, not the update list. Also, an out-of-band recno
62 		 * implies an append operation, we're allocating a new row.
63 		 * Ignore any information obtained from the search.
64 		 */
65 		WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
66 		if (cbt->compare != 0 &&
67 		    (recno == WT_RECNO_OOB ||
68 		    recno > (btree->type == BTREE_COL_VAR ?
69 		    __col_var_last_recno(cbt->ref) :
70 		    __col_fix_last_recno(cbt->ref)))) {
71 			append = true;
72 			cbt->ins = NULL;
73 			cbt->ins_head = NULL;
74 		}
75 	}
76 
77 	/* We're going to modify the page, we should have loaded history. */
78 	WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO);
79 
80 	/* If we don't yet have a modify structure, we'll need one. */
81 	WT_RET(__wt_page_modify_init(session, page));
82 	mod = page->modify;
83 
84 	/*
85 	 * If modifying a record not previously modified, but which is in the
86 	 * same update slot as a previously modified record, cursor.ins will
87 	 * not be set because there's no list of update records for this recno,
88 	 * but cursor.ins_head will be set to point to the correct update slot.
89 	 * Acquire the necessary insert information, then create a new update
90 	 * entry and link it into the existing list. We get here if a page has
91 	 * a single cell representing multiple records (the records have the
92 	 * same value), and then a record in the cell is updated or removed,
93 	 * creating the update list for the cell, and then a cursor iterates
94 	 * into that same cell to update/remove a different record. We find the
95 	 * correct slot in the update array, but we don't find an update list
96 	 * (because it doesn't exist), and don't have the information we need
97 	 * to do the insert. Normally, we wouldn't care (we could fail and do
98 	 * a search for the record which would configure everything for the
99 	 * insert), but range truncation does this pattern for every record in
100 	 * the cell, and the performance is terrible. For that reason, catch it
101 	 * here.
102 	 */
103 	if (cbt->ins == NULL && cbt->ins_head != NULL) {
104 		cbt->ins = __col_insert_search(
105 		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
106 		if (cbt->ins != NULL) {
107 			if (WT_INSERT_RECNO(cbt->ins) == recno)
108 				cbt->compare = 0;
109 			else {
110 				/*
111 				 * The test below is for cursor.compare set to 0
112 				 * and cursor.ins set: cursor.compare wasn't set
113 				 * by the search we just did, and has an unknown
114 				 * value. Clear cursor.ins to avoid the test.
115 				 */
116 				cbt->ins = NULL;
117 			}
118 		}
119 	}
120 
121 	/*
122 	 * Delete, insert or update a column-store entry.
123 	 *
124 	 * If modifying a previously modified record, cursor.ins will be set to
125 	 * point to the correct update list. Create a new update entry and link
126 	 * it into the existing list.
127 	 *
128 	 * Else, allocate an insert array as necessary, build an insert/update
129 	 * structure pair, and link it into place.
130 	 */
131 	if (cbt->compare == 0 && cbt->ins != NULL) {
132 		/*
133 		 * If we are restoring updates that couldn't be evicted, the
134 		 * key must not exist on the new page.
135 		 */
136 		WT_ASSERT(session, upd_arg == NULL);
137 
138 		/* Make sure the update can proceed. */
139 		WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));
140 
141 		/* Allocate a WT_UPDATE structure and transaction ID. */
142 		WT_ERR(__wt_update_alloc(session,
143 		    value, &upd, &upd_size, modify_type));
144 		WT_ERR(__wt_txn_modify(session, cbt, upd));
145 		logged = true;
146 
147 		/* Avoid a data copy in WT_CURSOR.update. */
148 		cbt->modify_update = upd;
149 
150 		/*
151 		 * Point the new WT_UPDATE item to the next element in the list.
152 		 * If we get it right, the serialization function lock acts as
153 		 * our memory barrier to flush this write.
154 		 */
155 		upd->next = old_upd;
156 
157 		/* Serialize the update. */
158 		WT_ERR(__wt_update_serial(
159 		    session, page, &cbt->ins->upd, &upd, upd_size, false));
160 	} else {
161 		/* Allocate the append/update list reference as necessary. */
162 		if (append) {
163 			WT_PAGE_ALLOC_AND_SWAP(session,
164 			    page, mod->mod_col_append, ins_headp, 1);
165 			ins_headp = &mod->mod_col_append[0];
166 		} else if (page->type == WT_PAGE_COL_FIX) {
167 			WT_PAGE_ALLOC_AND_SWAP(session,
168 			    page, mod->mod_col_update, ins_headp, 1);
169 			ins_headp = &mod->mod_col_update[0];
170 		} else {
171 			WT_PAGE_ALLOC_AND_SWAP(session, page,
172 			    mod->mod_col_update, ins_headp, page->entries);
173 			ins_headp = &mod->mod_col_update[cbt->slot];
174 		}
175 
176 		/* Allocate the WT_INSERT_HEAD structure as necessary. */
177 		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
178 		ins_head = *ins_headp;
179 
180 		/* Choose a skiplist depth for this insert. */
181 		skipdepth = __wt_skip_choose_depth(session);
182 
183 		/*
184 		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
185 		 * update the cursor to reference it (the WT_INSERT_HEAD might
186 		 * be allocated, the WT_INSERT was allocated).
187 		 */
188 		WT_ERR(__col_insert_alloc(
189 		    session, recno, skipdepth, &ins, &ins_size));
190 		cbt->ins_head = ins_head;
191 		cbt->ins = ins;
192 
193 		/*
194 		 * Check for insert split and checkpoint races in column-store:
195 		 * it's easy (as opposed to in row-store) and a difficult bug to
196 		 * otherwise diagnose.
197 		 */
198 		WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
199 		    (recno != WT_RECNO_OOB &&
200 		    mod->mod_col_split_recno > recno));
201 
202 		if (upd_arg == NULL) {
203 			WT_ERR(__wt_update_alloc(session,
204 			    value, &upd, &upd_size, modify_type));
205 			WT_ERR(__wt_txn_modify(session, cbt, upd));
206 			logged = true;
207 
208 			/* Avoid a data copy in WT_CURSOR.update. */
209 			cbt->modify_update = upd;
210 		} else
211 			upd_size = __wt_update_list_memsize(upd);
212 		ins->upd = upd;
213 		ins_size += upd_size;
214 
215 		/*
216 		 * If there was no insert list during the search, or there was
217 		 * no search because the record number has not been allocated
218 		 * yet, the cursor's information cannot be correct, search
219 		 * couldn't have initialized it.
220 		 *
221 		 * Otherwise, point the new WT_INSERT item's skiplist to the
222 		 * next elements in the insert list (which we will check are
223 		 * still valid inside the serialization function).
224 		 *
225 		 * The serial mutex acts as our memory barrier to flush these
226 		 * writes before inserting them into the list.
227 		 */
228 		if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
229 			for (i = 0; i < skipdepth; i++) {
230 				cbt->ins_stack[i] = &ins_head->head[i];
231 				ins->next[i] = cbt->next_stack[i] = NULL;
232 			}
233 		else
234 			for (i = 0; i < skipdepth; i++)
235 				ins->next[i] = cbt->next_stack[i];
236 
237 		/* Append or insert the WT_INSERT structure. */
238 		if (append)
239 			WT_ERR(__wt_col_append_serial(
240 			    session, page, cbt->ins_head, cbt->ins_stack,
241 			    &ins, ins_size, &cbt->recno, skipdepth, exclusive));
242 		else
243 			WT_ERR(__wt_insert_serial(
244 			    session, page, cbt->ins_head, cbt->ins_stack,
245 			    &ins, ins_size, skipdepth, exclusive));
246 	}
247 
248 	/* If the update was successful, add it to the in-memory log. */
249 	if (logged && modify_type != WT_UPDATE_RESERVE)
250 		WT_ERR(__wt_txn_log_op(session, cbt));
251 
252 	if (0) {
253 err:		/*
254 		 * Remove the update from the current transaction, so we don't
255 		 * try to modify it on rollback.
256 		 */
257 		if (logged)
258 			__wt_txn_unmodify(session);
259 		__wt_free(session, ins);
260 		if (upd_arg == NULL)
261 			__wt_free(session, upd);
262 	}
263 
264 	return (ret);
265 }
266 
267 /*
268  * __col_insert_alloc --
269  *	Column-store insert: allocate a WT_INSERT structure and fill it in.
270  */
271 static int
__col_insert_alloc(WT_SESSION_IMPL * session,uint64_t recno,u_int skipdepth,WT_INSERT ** insp,size_t * ins_sizep)272 __col_insert_alloc(WT_SESSION_IMPL *session,
273     uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
274 {
275 	WT_INSERT *ins;
276 	size_t ins_size;
277 
278 	/*
279 	 * Allocate the WT_INSERT structure and skiplist pointers, then copy
280 	 * the record number into place.
281 	 */
282 	ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
283 	WT_RET(__wt_calloc(session, 1, ins_size, &ins));
284 
285 	WT_INSERT_RECNO(ins) = recno;
286 
287 	*insp = ins;
288 	*ins_sizep = ins_size;
289 	return (0);
290 }
291