1 /*****************************************************************************
2
3 Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2021, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /**************************************************//**
21 @file row/row0undo.cc
22 Row undo
23
24 Created 1/8/1997 Heikki Tuuri
25 *******************************************************/
26
27 #include "row0undo.h"
28 #include "fsp0fsp.h"
29 #include "mach0data.h"
30 #include "trx0rseg.h"
31 #include "trx0trx.h"
32 #include "trx0roll.h"
33 #include "trx0undo.h"
34 #include "trx0purge.h"
35 #include "trx0rec.h"
36 #include "que0que.h"
37 #include "row0row.h"
38 #include "row0uins.h"
39 #include "row0umod.h"
40 #include "row0upd.h"
41 #include "row0mysql.h"
42 #include "srv0srv.h"
43 #include "srv0start.h"
44
45 /* How to undo row operations?
46 (1) For an insert, we have stored a prefix of the clustered index record
47 in the undo log. Using it, we look for the clustered record, and using
48 that we look for the records in the secondary indexes. The insert operation
49 may have been left incomplete, if the database crashed, for example.
50 We may have look at the trx id and roll ptr to make sure the record in the
51 clustered index is really the one for which the undo log record was
52 written. We can use the framework we get from the original insert op.
53 (2) Delete marking: We can use the framework we get from the original
54 delete mark op. We only have to check the trx id.
55 (3) Update: This may be the most complicated. We have to use the framework
56 we get from the original update op.
57
58 What if the same trx repeatedly deletes and inserts an identical row.
59 Then the row id changes and also roll ptr. What if the row id was not
60 part of the ordering fields in the clustered index? Maybe we have to write
61 it to undo log. Well, maybe not, because if we order the row id and trx id
62 in descending order, then the only undeleted copy is the first in the
63 index. Our searches in row operations always position the cursor before
64 the first record in the result set. But, if there is no key defined for
65 a table, then it would be desirable that row id is in ascending order.
66 So, lets store row id in descending order only if it is not an ordering
67 field in the clustered index.
68
69 NOTE: Deletes and inserts may lead to situation where there are identical
70 records in a secondary index. Is that a problem in the B-tree? Yes.
71 Also updates can lead to this, unless trx id and roll ptr are included in
72 ord fields.
73 (1) Fix in clustered indexes: include row id, trx id, and roll ptr
74 in node pointers of B-tree.
75 (2) Fix in secondary indexes: include all fields in node pointers, and
76 if an entry is inserted, check if it is equal to the right neighbor,
77 in which case update the right neighbor: the neighbor must be delete
78 marked, set it unmarked and write the trx id of the current transaction.
79
80 What if the same trx repeatedly updates the same row, updating a secondary
81 index field or not? Updating a clustered index ordering field?
82
83 (1) If it does not update the secondary index and not the clustered index
84 ord field. Then the secondary index record stays unchanged, but the
85 trx id in the secondary index record may be smaller than in the clustered
86 index record. This is no problem?
87 (2) If it updates secondary index ord field but not clustered: then in
88 secondary index there are delete marked records, which differ in an
89 ord field. No problem.
90 (3) Updates clustered ord field but not secondary, and secondary index
91 is unique. Then the record in secondary index is just updated at the
92 clustered ord field.
93 (4)
94
95 Problem with duplicate records:
96 Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
97 bigger trx id has inserted and delete marked a similar row, our trx inserts
98 again a similar row, and a trx with an even bigger id delete marks it. Then
99 the position of the row should change in the index if the trx id affects
100 the alphabetical ordering.
101
102 Fix 2: If an insert encounters a similar row marked deleted, we turn the
103 insert into an 'update' of the row marked deleted. Then we must write undo
104 info on the update. A problem: what if a purge operation tries to remove
105 the delete marked row?
106
107 We can think of the database row versions as a linked list which starts
108 from the record in the clustered index, and is linked by roll ptrs
109 through undo logs. The secondary index records are references which tell
110 what kinds of records can be found in this linked list for a record
111 in the clustered index.
112
113 How to do the purge? A record can be removed from the clustered index
114 if its linked list becomes empty, i.e., the row has been marked deleted
115 and its roll ptr points to the record in the undo log we are going through,
116 doing the purge. Similarly, during a rollback, a record can be removed
117 if the stored roll ptr in the undo log points to a trx already (being) purged,
118 or if the roll ptr is NULL, i.e., it was a fresh insert. */
119
120 /********************************************************************//**
121 Creates a row undo node to a query graph.
122 @return own: undo node */
123 undo_node_t*
row_undo_node_create(trx_t * trx,que_thr_t * parent,mem_heap_t * heap)124 row_undo_node_create(
125 /*=================*/
126 trx_t* trx, /*!< in/out: transaction */
127 que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
128 mem_heap_t* heap) /*!< in: memory heap where created */
129 {
130 undo_node_t* undo;
131
132 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
133 || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
134 || trx_state_eq(trx, TRX_STATE_PREPARED));
135 ut_ad(parent);
136
137 undo = static_cast<undo_node_t*>(
138 mem_heap_alloc(heap, sizeof(undo_node_t)));
139
140 undo->common.type = QUE_NODE_UNDO;
141 undo->common.parent = parent;
142
143 undo->state = UNDO_NODE_FETCH_NEXT;
144 undo->trx = trx;
145
146 btr_pcur_init(&(undo->pcur));
147
148 undo->heap = mem_heap_create(256);
149
150 return(undo);
151 }
152
153 /***********************************************************//**
154 Looks for the clustered index record when node has the row reference.
155 The pcur in node is used in the search. If found, stores the row to node,
156 and stores the position of pcur, and detaches it. The pcur must be closed
157 by the caller in any case.
158 @return true if found; NOTE the node->pcur must be closed by the
159 caller, regardless of the return value */
160 bool
row_undo_search_clust_to_pcur(undo_node_t * node)161 row_undo_search_clust_to_pcur(
162 /*==========================*/
163 undo_node_t* node) /*!< in/out: row undo node */
164 {
165 dict_index_t* clust_index;
166 bool found;
167 mtr_t mtr;
168 row_ext_t** ext;
169 const rec_t* rec;
170 mem_heap_t* heap = NULL;
171 rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
172 rec_offs* offsets = offsets_;
173 rec_offs_init(offsets_);
174
175 ut_ad(!node->table->skip_alter_undo);
176
177 mtr_start(&mtr);
178
179 clust_index = dict_table_get_first_index(node->table);
180
181 found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
182 node->table, node->ref, &mtr);
183
184 if (!found) {
185 goto func_exit;
186 }
187
188 rec = btr_pcur_get_rec(&node->pcur);
189
190 offsets = rec_get_offsets(rec, clust_index, offsets,
191 clust_index->n_core_fields,
192 ULINT_UNDEFINED, &heap);
193
194 found = row_get_rec_roll_ptr(rec, clust_index, offsets)
195 == node->roll_ptr;
196
197 if (found) {
198 ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
199 == node->trx->id || node->table->is_temporary());
200
201 if (dict_table_has_atomic_blobs(node->table)) {
202 /* There is no prefix of externally stored
203 columns in the clustered index record. Build a
204 cache of column prefixes. */
205 ext = &node->ext;
206 } else {
207 /* REDUNDANT and COMPACT formats store a local
208 768-byte prefix of each externally stored
209 column. No cache is needed. */
210 ext = NULL;
211 node->ext = NULL;
212 }
213
214 node->row = row_build(ROW_COPY_DATA, clust_index, rec,
215 offsets, NULL,
216 NULL, NULL, ext, node->heap);
217
218 /* We will need to parse out virtual column info from undo
219 log, first mark them DATA_MISSING. So we will know if the
220 value gets updated */
221 if (node->table->n_v_cols
222 && node->state != UNDO_NODE_INSERT
223 && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
224 for (ulint i = 0;
225 i < dict_table_get_n_v_cols(node->table); i++) {
226 dfield_get_type(dtuple_get_nth_v_field(
227 node->row, i))->mtype = DATA_MISSING;
228 }
229 }
230
231 if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
232 ut_ad(node->row->info_bits == REC_INFO_MIN_REC_FLAG
233 || node->row->info_bits == 0);
234 node->undo_row = dtuple_copy(node->row, node->heap);
235 row_upd_replace(node->undo_row, &node->undo_ext,
236 clust_index, node->update, node->heap);
237 } else {
238 ut_ad((node->row->info_bits == REC_INFO_MIN_REC_FLAG)
239 == (node->rec_type == TRX_UNDO_INSERT_METADATA));
240 node->undo_row = NULL;
241 node->undo_ext = NULL;
242 }
243
244 btr_pcur_store_position(&node->pcur, &mtr);
245 }
246
247 if (heap) {
248 mem_heap_free(heap);
249 }
250
251 func_exit:
252 btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
253 return(found);
254 }
255
256 /***********************************************************//**
257 Fetches an undo log record and does the undo for the recorded operation.
258 If none left, or a partial rollback completed, returns control to the
259 parent node, which is always a query thread node.
260 @return DB_SUCCESS if operation successfully completed, else error code */
261 static MY_ATTRIBUTE((warn_unused_result))
262 dberr_t
row_undo(undo_node_t * node,que_thr_t * thr)263 row_undo(
264 /*=====*/
265 undo_node_t* node, /*!< in: row undo node */
266 que_thr_t* thr) /*!< in: query thread */
267 {
268 trx_t* trx = node->trx;
269 ut_ad(trx->in_rollback);
270
271 if (node->state == UNDO_NODE_FETCH_NEXT) {
272
273 node->undo_rec = trx_roll_pop_top_rec_of_trx(
274 trx, &node->roll_ptr, node->heap);
275
276 if (!node->undo_rec) {
277 /* Rollback completed for this query thread */
278 thr->run_node = que_node_get_parent(node);
279 return(DB_SUCCESS);
280 }
281
282 node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
283 node->state = trx_undo_roll_ptr_is_insert(node->roll_ptr)
284 ? UNDO_NODE_INSERT : UNDO_NODE_MODIFY;
285 }
286
287 /* Prevent DROP TABLE etc. while we are rolling back this row.
288 If we are doing a TABLE CREATE or some other dictionary operation,
289 then we already have dict_operation_lock locked in x-mode. Do not
290 try to lock again, because that would cause a hang. */
291
292 const bool locked_data_dict = (trx->dict_operation_lock_mode == 0);
293
294 if (locked_data_dict) {
295
296 row_mysql_freeze_data_dictionary(trx);
297 }
298
299 dberr_t err;
300
301 if (node->state == UNDO_NODE_INSERT) {
302
303 err = row_undo_ins(node, thr);
304
305 node->state = UNDO_NODE_FETCH_NEXT;
306 } else {
307 ut_ad(node->state == UNDO_NODE_MODIFY);
308 err = row_undo_mod(node, thr);
309 }
310
311 if (locked_data_dict) {
312
313 row_mysql_unfreeze_data_dictionary(trx);
314 }
315
316 /* Do some cleanup */
317 btr_pcur_close(&(node->pcur));
318
319 mem_heap_empty(node->heap);
320
321 thr->run_node = node;
322
323 return(err);
324 }
325
326 /***********************************************************//**
327 Undoes a row operation in a table. This is a high-level function used
328 in SQL execution graphs.
329 @return query thread to run next or NULL */
330 que_thr_t*
row_undo_step(que_thr_t * thr)331 row_undo_step(
332 /*==========*/
333 que_thr_t* thr) /*!< in: query thread */
334 {
335 dberr_t err;
336 undo_node_t* node;
337 trx_t* trx;
338
339 ut_ad(thr);
340
341 srv_inc_activity_count();
342
343 trx = thr_get_trx(thr);
344
345 node = static_cast<undo_node_t*>(thr->run_node);
346
347 ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
348
349 if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
350 && !srv_undo_sources
351 && srv_shutdown_state != SRV_SHUTDOWN_NONE)
352 && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
353 /* Shutdown has been initiated. */
354 trx->error_state = DB_INTERRUPTED;
355 return NULL;
356 }
357
358 if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
359 trx_roll_report_progress();
360 }
361
362 err = row_undo(node, thr);
363
364 #ifdef ENABLED_DEBUG_SYNC
365 if (trx->mysql_thd) {
366 DEBUG_SYNC_C("trx_after_rollback_row");
367 }
368 #endif /* ENABLED_DEBUG_SYNC */
369
370 trx->error_state = err;
371
372 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
373 ib::fatal() << "Error (" << err << ") in rollback.";
374 }
375
376 return(thr);
377 }
378