1 /*****************************************************************************
2 
3 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file row/row0undo.cc
22 Row undo
23 
24 Created 1/8/1997 Heikki Tuuri
25 *******************************************************/
26 
27 #include "row0undo.h"
28 #include "fsp0fsp.h"
29 #include "mach0data.h"
30 #include "trx0rseg.h"
31 #include "trx0trx.h"
32 #include "trx0roll.h"
33 #include "trx0undo.h"
34 #include "trx0purge.h"
35 #include "trx0rec.h"
36 #include "que0que.h"
37 #include "row0row.h"
38 #include "row0uins.h"
39 #include "row0umod.h"
40 #include "row0upd.h"
41 #include "row0mysql.h"
42 #include "srv0srv.h"
43 #include "srv0start.h"
44 
45 /* How to undo row operations?
46 (1) For an insert, we have stored a prefix of the clustered index record
47 in the undo log. Using it, we look for the clustered record, and using
48 that we look for the records in the secondary indexes. The insert operation
49 may have been left incomplete, if the database crashed, for example.
50 We may have look at the trx id and roll ptr to make sure the record in the
51 clustered index is really the one for which the undo log record was
52 written. We can use the framework we get from the original insert op.
53 (2) Delete marking: We can use the framework we get from the original
54 delete mark op. We only have to check the trx id.
55 (3) Update: This may be the most complicated. We have to use the framework
56 we get from the original update op.
57 
58 What if the same trx repeatedly deletes and inserts an identical row.
59 Then the row id changes and also roll ptr. What if the row id was not
60 part of the ordering fields in the clustered index? Maybe we have to write
61 it to undo log. Well, maybe not, because if we order the row id and trx id
62 in descending order, then the only undeleted copy is the first in the
63 index. Our searches in row operations always position the cursor before
64 the first record in the result set. But, if there is no key defined for
65 a table, then it would be desirable that row id is in ascending order.
66 So, lets store row id in descending order only if it is not an ordering
67 field in the clustered index.
68 
69 NOTE: Deletes and inserts may lead to situation where there are identical
70 records in a secondary index. Is that a problem in the B-tree? Yes.
71 Also updates can lead to this, unless trx id and roll ptr are included in
72 ord fields.
73 (1) Fix in clustered indexes: include row id, trx id, and roll ptr
74 in node pointers of B-tree.
75 (2) Fix in secondary indexes: include all fields in node pointers, and
76 if an entry is inserted, check if it is equal to the right neighbor,
77 in which case update the right neighbor: the neighbor must be delete
78 marked, set it unmarked and write the trx id of the current transaction.
79 
80 What if the same trx repeatedly updates the same row, updating a secondary
81 index field or not? Updating a clustered index ordering field?
82 
83 (1) If it does not update the secondary index and not the clustered index
84 ord field. Then the secondary index record stays unchanged, but the
85 trx id in the secondary index record may be smaller than in the clustered
86 index record. This is no problem?
87 (2) If it updates secondary index ord field but not clustered: then in
88 secondary index there are delete marked records, which differ in an
89 ord field. No problem.
90 (3) Updates clustered ord field but not secondary, and secondary index
91 is unique. Then the record in secondary index is just updated at the
92 clustered ord field.
93 (4)
94 
95 Problem with duplicate records:
96 Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
97 bigger trx id has inserted and delete marked a similar row, our trx inserts
98 again a similar row, and a trx with an even bigger id delete marks it. Then
99 the position of the row should change in the index if the trx id affects
100 the alphabetical ordering.
101 
102 Fix 2: If an insert encounters a similar row marked deleted, we turn the
103 insert into an 'update' of the row marked deleted. Then we must write undo
104 info on the update. A problem: what if a purge operation tries to remove
105 the delete marked row?
106 
107 We can think of the database row versions as a linked list which starts
108 from the record in the clustered index, and is linked by roll ptrs
109 through undo logs. The secondary index records are references which tell
110 what kinds of records can be found in this linked list for a record
111 in the clustered index.
112 
113 How to do the purge? A record can be removed from the clustered index
114 if its linked list becomes empty, i.e., the row has been marked deleted
115 and its roll ptr points to the record in the undo log we are going through,
116 doing the purge. Similarly, during a rollback, a record can be removed
117 if the stored roll ptr in the undo log points to a trx already (being) purged,
118 or if the roll ptr is NULL, i.e., it was a fresh insert. */
119 
120 /********************************************************************//**
121 Creates a row undo node to a query graph.
122 @return own: undo node */
123 undo_node_t*
row_undo_node_create(trx_t * trx,que_thr_t * parent,mem_heap_t * heap)124 row_undo_node_create(
125 /*=================*/
126 	trx_t*		trx,	/*!< in/out: transaction */
127 	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
128 	mem_heap_t*	heap)	/*!< in: memory heap where created */
129 {
130 	undo_node_t*	undo;
131 
132 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
133 	      || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
134 	      || trx_state_eq(trx, TRX_STATE_PREPARED));
135 	ut_ad(parent);
136 
137 	undo = static_cast<undo_node_t*>(
138 		mem_heap_alloc(heap, sizeof(undo_node_t)));
139 
140 	undo->common.type = QUE_NODE_UNDO;
141 	undo->common.parent = parent;
142 
143 	undo->state = UNDO_NODE_FETCH_NEXT;
144 	undo->trx = trx;
145 
146 	btr_pcur_init(&(undo->pcur));
147 
148 	undo->heap = mem_heap_create(256);
149 
150 	return(undo);
151 }
152 
153 /***********************************************************//**
154 Looks for the clustered index record when node has the row reference.
155 The pcur in node is used in the search. If found, stores the row to node,
156 and stores the position of pcur, and detaches it. The pcur must be closed
157 by the caller in any case.
158 @return true if found; NOTE the node->pcur must be closed by the
159 caller, regardless of the return value */
160 bool
row_undo_search_clust_to_pcur(undo_node_t * node)161 row_undo_search_clust_to_pcur(
162 /*==========================*/
163 	undo_node_t*	node)	/*!< in/out: row undo node */
164 {
165 	dict_index_t*	clust_index;
166 	bool		found;
167 	mtr_t		mtr;
168 	row_ext_t**	ext;
169 	const rec_t*	rec;
170 	mem_heap_t*	heap		= NULL;
171 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
172 	rec_offs*	offsets		= offsets_;
173 	rec_offs_init(offsets_);
174 
175 	ut_ad(!node->table->skip_alter_undo);
176 
177 	mtr_start(&mtr);
178 
179 	clust_index = dict_table_get_first_index(node->table);
180 
181 	found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
182 				      node->table, node->ref, &mtr);
183 
184 	if (!found) {
185 		goto func_exit;
186 	}
187 
188 	rec = btr_pcur_get_rec(&node->pcur);
189 
190 	offsets = rec_get_offsets(rec, clust_index, offsets,
191 				  clust_index->n_core_fields,
192 				  ULINT_UNDEFINED, &heap);
193 
194 	found = row_get_rec_roll_ptr(rec, clust_index, offsets)
195 		== node->roll_ptr;
196 
197 	if (found) {
198 		ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
199 		      == node->trx->id || node->table->is_temporary());
200 
201 		if (dict_table_has_atomic_blobs(node->table)) {
202 			/* There is no prefix of externally stored
203 			columns in the clustered index record. Build a
204 			cache of column prefixes. */
205 			ext = &node->ext;
206 		} else {
207 			/* REDUNDANT and COMPACT formats store a local
208 			768-byte prefix of each externally stored
209 			column. No cache is needed. */
210 			ext = NULL;
211 			node->ext = NULL;
212 		}
213 
214 		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
215 				      offsets, NULL,
216 				      NULL, NULL, ext, node->heap);
217 
218 		/* We will need to parse out virtual column info from undo
219 		log, first mark them DATA_MISSING. So we will know if the
220 		value gets updated */
221 		if (node->table->n_v_cols
222 		    && node->state != UNDO_NODE_INSERT
223 		    && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
224 			for (ulint i = 0;
225 			     i < dict_table_get_n_v_cols(node->table); i++) {
226 				dfield_get_type(dtuple_get_nth_v_field(
227 					node->row, i))->mtype = DATA_MISSING;
228 			}
229 		}
230 
231 		if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
232 			ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG
233 			      || node->row->info_bits == 0);
234 			node->undo_row = dtuple_copy(node->row, node->heap);
235 			row_upd_replace(node->undo_row, &node->undo_ext,
236 					clust_index, node->update, node->heap);
237 		} else {
238 			ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG)
239 			      == (node->rec_type == TRX_UNDO_INSERT_METADATA));
240 			node->undo_row = NULL;
241 			node->undo_ext = NULL;
242 		}
243 
244 		btr_pcur_store_position(&node->pcur, &mtr);
245 	}
246 
247 	if (heap) {
248 		mem_heap_free(heap);
249 	}
250 
251 func_exit:
252 	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
253 	return(found);
254 }
255 
256 /***********************************************************//**
257 Fetches an undo log record and does the undo for the recorded operation.
258 If none left, or a partial rollback completed, returns control to the
259 parent node, which is always a query thread node.
260 @return DB_SUCCESS if operation successfully completed, else error code */
261 static MY_ATTRIBUTE((warn_unused_result))
262 dberr_t
row_undo(undo_node_t * node,que_thr_t * thr)263 row_undo(
264 /*=====*/
265 	undo_node_t*	node,	/*!< in: row undo node */
266 	que_thr_t*	thr)	/*!< in: query thread */
267 {
268 	trx_t*		trx = node->trx;
269 	ut_ad(trx->in_rollback);
270 
271 	if (node->state == UNDO_NODE_FETCH_NEXT) {
272 
273 		node->undo_rec = trx_roll_pop_top_rec_of_trx(
274 			trx, &node->roll_ptr, node->heap);
275 
276 		if (!node->undo_rec) {
277 			/* Rollback completed for this query thread */
278 			thr->run_node = que_node_get_parent(node);
279 			return(DB_SUCCESS);
280 		}
281 
282 		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
283 		node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr)
284 			? UNDO_NODE_INSERT : UNDO_NODE_MODIFY;
285 	}
286 
287 	/* Prevent DROP TABLE etc. while we are rolling back this row.
288 	If we are doing a TABLE CREATE or some other dictionary operation,
289 	then we already have dict_operation_lock locked in x-mode. Do not
290 	try to lock again, because that would cause a hang. */
291 
292 	const bool locked_data_dict = (trx->dict_operation_lock_mode == 0);
293 
294 	if (locked_data_dict) {
295 
296 		row_mysql_freeze_data_dictionary(trx);
297 	}
298 
299 	dberr_t err;
300 
301 	if (node->state == UNDO_NODE_INSERT) {
302 
303 		err = row_undo_ins(node, thr);
304 
305 		node->state = UNDO_NODE_FETCH_NEXT;
306 	} else {
307 		ut_ad(node->state == UNDO_NODE_MODIFY);
308 		err = row_undo_mod(node, thr);
309 	}
310 
311 	if (locked_data_dict) {
312 
313 		row_mysql_unfreeze_data_dictionary(trx);
314 	}
315 
316 	/* Do some cleanup */
317 	btr_pcur_close(&(node->pcur));
318 
319 	mem_heap_empty(node->heap);
320 
321 	thr->run_node = node;
322 
323 	return(err);
324 }
325 
326 /***********************************************************//**
327 Undoes a row operation in a table. This is a high-level function used
328 in SQL execution graphs.
329 @return query thread to run next or NULL */
330 que_thr_t*
row_undo_step(que_thr_t * thr)331 row_undo_step(
332 /*==========*/
333 	que_thr_t*	thr)	/*!< in: query thread */
334 {
335 	dberr_t		err;
336 	undo_node_t*	node;
337 	trx_t*		trx;
338 
339 	ut_ad(thr);
340 
341 	srv_inc_activity_count();
342 
343 	trx = thr_get_trx(thr);
344 
345 	node = static_cast<undo_node_t*>(thr->run_node);
346 
347 	ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
348 
349 	if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
350 			  && !srv_undo_sources
351 			  && srv_shutdown_state != SRV_SHUTDOWN_NONE)
352 	    && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
353 		/* Shutdown has been initiated. */
354 		trx->error_state = DB_INTERRUPTED;
355 		return NULL;
356 	}
357 
358 	if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
359 		trx_roll_report_progress();
360 	}
361 
362 	err = row_undo(node, thr);
363 
364 #ifdef ENABLED_DEBUG_SYNC
365 	if (trx->mysql_thd) {
366 		DEBUG_SYNC_C("trx_after_rollback_row");
367 	}
368 #endif /* ENABLED_DEBUG_SYNC */
369 
370 	trx->error_state = err;
371 
372 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
373 		ib::fatal() << "Error (" << err << ") in rollback.";
374 	}
375 
376 	return(thr);
377 }
378