1 /*****************************************************************************
2
3 Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2015, 2021, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /**************************************************//**
21 @file trx/trx0trx.cc
22 The transaction
23
24 Created 3/26/1996 Heikki Tuuri
25 *******************************************************/
26
27 #include "trx0trx.h"
28
29 #ifdef WITH_WSREP
30 #include <mysql/service_wsrep.h>
31 #endif
32
33 #include <mysql/service_thd_error_context.h>
34
35 #include "btr0sea.h"
36 #include "lock0lock.h"
37 #include "log0log.h"
38 #include "que0que.h"
39 #include "srv0mon.h"
40 #include "srv0srv.h"
41 #include "srv0start.h"
42 #include "trx0purge.h"
43 #include "trx0rec.h"
44 #include "trx0roll.h"
45 #include "trx0rseg.h"
46 #include "trx0undo.h"
47 #include "trx0xa.h"
48 #include "ut0pool.h"
49 #include "ut0vec.h"
50
51 #include <set>
52 #include <new>
53
/** The bit pattern corresponding to TRX_ID_MAX: eight bytes with all
bits set */
const byte trx_id_max_bytes[8] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/** The bit pattern corresponding to max timestamp: the four bytes
0x7fffffff followed by the three bytes 0x0f423f (decimal 999999).
NOTE(review): presumably whole seconds followed by microseconds; confirm
against the consumer of this array. */
const byte timestamp_max_bytes[7] = {
	0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
};


/** Size in bytes of the trx_t::detailed_error buffer, including the
terminating NUL byte */
static const ulint MAX_DETAILED_ERROR_LEN = 256;

/** Set of table_id */
typedef std::set<
	table_id_t,
	std::less<table_id_t>,
	ut_allocator<table_id_t> >	table_id_set;
72
73 /*************************************************************//**
74 Set detailed error message for the transaction. */
75 void
trx_set_detailed_error(trx_t * trx,const char * msg)76 trx_set_detailed_error(
77 /*===================*/
78 trx_t* trx, /*!< in: transaction struct */
79 const char* msg) /*!< in: detailed error message */
80 {
81 strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
82 trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
83 }
84
85 /*************************************************************//**
86 Set detailed error message for the transaction from a file. Note that the
87 file is rewinded before reading from it. */
88 void
trx_set_detailed_error_from_file(trx_t * trx,FILE * file)89 trx_set_detailed_error_from_file(
90 /*=============================*/
91 trx_t* trx, /*!< in: transaction struct */
92 FILE* file) /*!< in: file to read message from */
93 {
94 os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
95 }
96
97 /********************************************************************//**
98 Initialize transaction object.
99 @param trx trx to initialize */
100 static
101 void
trx_init(trx_t * trx)102 trx_init(
103 /*=====*/
104 trx_t* trx)
105 {
106 trx->state = TRX_STATE_NOT_STARTED;
107
108 trx->is_recovered = false;
109
110 trx->op_info = "";
111
112 trx->active_commit_ordered = false;
113
114 trx->isolation_level = TRX_ISO_REPEATABLE_READ;
115
116 trx->check_foreigns = true;
117
118 trx->check_unique_secondary = true;
119
120 trx->lock.n_rec_locks = 0;
121
122 trx->dict_operation = TRX_DICT_OP_NONE;
123
124 trx->table_id = 0;
125
126 trx->error_state = DB_SUCCESS;
127
128 trx->error_key_num = ULINT_UNDEFINED;
129
130 trx->undo_no = 0;
131
132 trx->rsegs.m_redo.rseg = NULL;
133
134 trx->rsegs.m_noredo.rseg = NULL;
135
136 trx->read_only = false;
137
138 trx->auto_commit = false;
139
140 trx->will_lock = false;
141
142 trx->ddl = false;
143
144 trx->internal = false;
145
146 ut_d(trx->start_file = 0);
147
148 ut_d(trx->start_line = 0);
149
150 trx->magic_n = TRX_MAGIC_N;
151
152 trx->lock.que_state = TRX_QUE_RUNNING;
153
154 trx->last_sql_stat_start.least_undo_no = 0;
155
156 ut_ad(!trx->read_view.is_open());
157
158 trx->lock.rec_cached = 0;
159
160 trx->lock.table_cached = 0;
161 #ifdef WITH_WSREP
162 ut_ad(!trx->wsrep);
163 ut_ad(!trx->wsrep_UK_scan);
164 #endif /* WITH_WSREP */
165 }
166
/** For managing the life-cycle of the trx_t instance that we get
from the pool. init() and destroy() are static hooks; TrxFactory is the
factory argument of the Pool<trx_t, ...> typedef in this file. */
struct TrxFactory {

	/** Initializes a transaction object. It must be explicitly started
	with trx_start_if_not_started() before using it. The default isolation
	level is TRX_ISO_REPEATABLE_READ.
	Besides resetting the plain fields via trx_init(), this allocates the
	xid, the detailed_error buffer and the lock heap, initializes the
	lock and savepoint lists, and creates the transaction mutex.
	@param trx	Transaction instance to initialise */
	static void init(trx_t* trx)
	{
		/* Explicitly call the constructor of the already
		allocated object. trx_t objects are allocated by
		ut_zalloc_nokey() in Pool::Pool() which would not call
		the constructors of the trx_t members. */
		new(&trx->mod_tables) trx_mod_tables_t();

		new(&trx->lock.table_locks) lock_list();

		new(&trx->read_view) ReadView();

		trx->rw_trx_hash_pins = 0;
		trx_init(trx);

		trx->dict_operation_lock_mode = 0;

		/* Freed with UT_DELETE() in destroy(). */
		trx->xid = UT_NEW_NOKEY(xid_t());

		/* Zero-filled buffer of MAX_DETAILED_ERROR_LEN bytes;
		freed with ut_free() in destroy(). */
		trx->detailed_error = reinterpret_cast<char*>(
			ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));

		/* Freed with mem_heap_free() in destroy(). */
		trx->lock.lock_heap = mem_heap_create_typed(
			1024, MEM_HEAP_FOR_LOCK_HEAP);

		lock_trx_lock_list_init(&trx->lock.trx_locks);

		UT_LIST_INIT(trx->lock.evicted_tables,
			     &dict_table_t::table_LRU);

		UT_LIST_INIT(
			trx->trx_savepoints,
			&trx_named_savept_t::trx_savepoints);

		mutex_create(LATCH_ID_TRX, &trx->mutex);
	}

	/** Release resources held by the transaction object.
	This undoes init(): the lock heap, xid, detailed_error buffer and
	mutex are freed, and the destructors of the members that were
	constructed with placement new are invoked explicitly.
	@param trx the transaction for which to release resources */
	static void destroy(trx_t* trx)
	{
#ifdef __SANITIZE_ADDRESS__
		/* Unpoison the memory for AddressSanitizer */
		MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
#elif !__has_feature(memory_sanitizer)
		/* In Valgrind, we cannot cancel MEM_NOACCESS() without
		changing the state of the V bits (which indicate
		which bits are initialized).
		We will declare the contents as initialized.
		We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
		MEM_MAKE_DEFINED(trx, sizeof *trx);
#endif

		/* The object must be idle: not attached to a session,
		not waiting for a lock, not holding the dictionary
		operation lock. */
		ut_a(trx->magic_n == TRX_MAGIC_N);
		ut_ad(!trx->mysql_thd);

		ut_a(trx->lock.wait_lock == NULL);
		ut_a(trx->lock.wait_thr == NULL);
		ut_a(trx->dict_operation_lock_mode == 0);

		if (trx->lock.lock_heap != NULL) {
			mem_heap_free(trx->lock.lock_heap);
			trx->lock.lock_heap = NULL;
		}

		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
		ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);

		UT_DELETE(trx->xid);
		ut_free(trx->detailed_error);

		mutex_free(&trx->mutex);

		/* Explicit destructor calls matching the placement new
		expressions in init(). */
		trx->mod_tables.~trx_mod_tables_t();

		ut_ad(!trx->read_view.is_open());

		trx->lock.table_locks.~lock_list();

		trx->read_view.~ReadView();
	}
};
257
258 /** The lock strategy for TrxPool */
259 struct TrxPoolLock {
TrxPoolLockTrxPoolLock260 TrxPoolLock() { }
261
262 /** Create the mutex */
createTrxPoolLock263 void create()
264 {
265 mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
266 }
267
268 /** Acquire the mutex */
enterTrxPoolLock269 void enter() { mutex_enter(&m_mutex); }
270
271 /** Release the mutex */
exitTrxPoolLock272 void exit() { mutex_exit(&m_mutex); }
273
274 /** Free the mutex */
destroyTrxPoolLock275 void destroy() { mutex_free(&m_mutex); }
276
277 /** Mutex to use */
278 ib_mutex_t m_mutex;
279 };
280
281 /** The lock strategy for the TrxPoolManager */
282 struct TrxPoolManagerLock {
TrxPoolManagerLockTrxPoolManagerLock283 TrxPoolManagerLock() { }
284
285 /** Create the mutex */
createTrxPoolManagerLock286 void create()
287 {
288 mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
289 }
290
291 /** Acquire the mutex */
enterTrxPoolManagerLock292 void enter() { mutex_enter(&m_mutex); }
293
294 /** Release the mutex */
exitTrxPoolManagerLock295 void exit() { mutex_exit(&m_mutex); }
296
297 /** Free the mutex */
destroyTrxPoolManagerLock298 void destroy() { mutex_free(&m_mutex); }
299
300 /** Mutex to use */
301 ib_mutex_t m_mutex;
302 };
303
/** Use explicit mutexes for the trx_t pool and its manager. */
typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;

/** The trx_t pool manager; allocated by trx_pool_init() and released by
trx_pool_close() */
static trx_pools_t* trx_pools;

/** Size of one trx_t pool in bytes (4 MiB). */
static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
313
314 /** Create the trx_t pool */
315 void
trx_pool_init()316 trx_pool_init()
317 {
318 trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
319
320 ut_a(trx_pools != 0);
321 }
322
323 /** Destroy the trx_t pool */
324 void
trx_pool_close()325 trx_pool_close()
326 {
327 UT_DELETE(trx_pools);
328
329 trx_pools = 0;
330 }
331
332 /** @return an allocated transaction */
trx_create()333 trx_t *trx_create()
334 {
335 trx_t* trx = trx_pools->get();
336
337 #ifdef __SANITIZE_ADDRESS__
338 /* Unpoison the memory for AddressSanitizer.
339 It may have been poisoned in trx_t::free().*/
340 MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
341 #elif !__has_feature(memory_sanitizer)
342 /* In Valgrind, we cannot cancel MEM_NOACCESS() without
343 changing the state of the V bits (which indicate
344 which bits are initialized).
345 We will declare the contents as initialized.
346 We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
347 MEM_MAKE_DEFINED(trx, sizeof *trx);
348 #endif
349
350 trx->assert_freed();
351
352 mem_heap_t* heap;
353 ib_alloc_t* alloc;
354
355 /* We just got trx from pool, it should be non locking */
356 ut_ad(!trx->will_lock);
357 ut_ad(!trx->rw_trx_hash_pins);
358
359 DBUG_LOG("trx", "Create: " << trx);
360
361 heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
362
363 alloc = ib_heap_allocator_create(heap);
364
365 trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
366
367 ut_ad(trx->mod_tables.empty());
368 ut_ad(trx->lock.n_rec_locks == 0);
369 ut_ad(trx->lock.table_cached == 0);
370 ut_ad(trx->lock.rec_cached == 0);
371 ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
372
373 #ifdef WITH_WSREP
374 ut_ad(!trx->wsrep_UK_scan);
375 #endif /* WITH_WSREP */
376
377 trx_sys.register_trx(trx);
378
379 return(trx);
380 }
381
/** Return this transaction object to trx_pools.
The object is deregistered from trx_sys, its rw_trx_hash pins are
released, the autoinc_locks vector is freed and mod_tables is cleared.
After that, the individual fields are marked with MEM_NOACCESS() so that
memory checkers can flag any access made while the object sits in the
pool; trx_create() and TrxFactory::destroy() cancel that marking. */
void trx_t::free()
{
  /* Every byte must be initialized here; TrxFactory::destroy() and
  trx_create() rely on this when they call MEM_MAKE_DEFINED(). */
  MEM_CHECK_DEFINED(this, sizeof *this);

  ut_ad(!n_mysql_tables_in_use);
  ut_ad(!mysql_log_file_name);
  ut_ad(!mysql_n_tables_locked);
  ut_ad(!internal);
  ut_ad(!will_lock);
  ut_ad(error_state == DB_SUCCESS);
  ut_ad(magic_n == TRX_MAGIC_N);
  ut_ad(!read_only);
  ut_ad(!lock.wait_lock);

  dict_operation= TRX_DICT_OP_NONE;
  trx_sys.deregister_trx(this);
  assert_freed();
  trx_sys.rw_trx_hash.put_pins(this);

  mysql_thd= nullptr;

  // FIXME: We need to avoid this heap free/alloc for each commit.
  if (autoinc_locks)
  {
    ut_ad(ib_vector_is_empty(autoinc_locks));
    /* We allocated a dedicated heap for the vector. */
    ib_vector_free(autoinc_locks);
    autoinc_locks= NULL;
  }

  mod_tables.clear();

  /* Mark the fields one by one as inaccessible. This is done field by
  field rather than for the whole object because the mutex is
  deliberately left accessible. */
  MEM_NOACCESS(&n_ref, sizeof n_ref);
  /* do not poison mutex */
  MEM_NOACCESS(&id, sizeof id);
  MEM_NOACCESS(&state, sizeof state);
  MEM_NOACCESS(&is_recovered, sizeof is_recovered);
#ifdef WITH_WSREP
  MEM_NOACCESS(&wsrep, sizeof wsrep);
#endif
  read_view.mem_noaccess();
  MEM_NOACCESS(&lock, sizeof lock);
  MEM_NOACCESS(&op_info, sizeof op_info);
  MEM_NOACCESS(&isolation_level, sizeof isolation_level);
  MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
  MEM_NOACCESS(&is_registered, sizeof is_registered);
  MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
  MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
  MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
  MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
  MEM_NOACCESS(&duplicates, sizeof duplicates);
  MEM_NOACCESS(&dict_operation, sizeof dict_operation);
  MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
  MEM_NOACCESS(&start_time, sizeof start_time);
  MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
  MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
  MEM_NOACCESS(&table_id, sizeof table_id);
  MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
  MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
  MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
  MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
  MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
  MEM_NOACCESS(&error_state, sizeof error_state);
  MEM_NOACCESS(&error_info, sizeof error_info);
  MEM_NOACCESS(&error_key_num, sizeof error_key_num);
  MEM_NOACCESS(&graph, sizeof graph);
  MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
  MEM_NOACCESS(&undo_no, sizeof undo_no);
  MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
  MEM_NOACCESS(&rsegs, sizeof rsegs);
  MEM_NOACCESS(&roll_limit, sizeof roll_limit);
  MEM_NOACCESS(&in_rollback, sizeof in_rollback);
  MEM_NOACCESS(&pages_undone, sizeof pages_undone);
  MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
  MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
  MEM_NOACCESS(&read_only, sizeof read_only);
  MEM_NOACCESS(&auto_commit, sizeof auto_commit);
  MEM_NOACCESS(&will_lock, sizeof will_lock);
  MEM_NOACCESS(&fts_trx, sizeof fts_trx);
  MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
  MEM_NOACCESS(&flush_tables, sizeof flush_tables);
  MEM_NOACCESS(&ddl, sizeof ddl);
  MEM_NOACCESS(&internal, sizeof internal);
#ifdef UNIV_DEBUG
  MEM_NOACCESS(&start_line, sizeof start_line);
  MEM_NOACCESS(&start_file, sizeof start_file);
#endif /* UNIV_DEBUG */
  MEM_NOACCESS(&xid, sizeof xid);
  MEM_NOACCESS(&mod_tables, sizeof mod_tables);
  MEM_NOACCESS(&detailed_error, sizeof detailed_error);
#ifdef WITH_WSREP
  ut_ad(!wsrep_UK_scan);
  MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
#endif /* WITH_WSREP */
  MEM_NOACCESS(&magic_n, sizeof magic_n);
  /* Hand the object back to the pool it came from. */
  trx_pools->mem_free(this);
}
480
/** Transition to committed state, to release implicit locks.
The transaction must currently be active or prepared (including
prepared-recovered); the state change itself is made while holding the
transaction mutex. */
inline void trx_t::commit_state()
{
  ut_ad(state == TRX_STATE_PREPARED
	|| state == TRX_STATE_PREPARED_RECOVERED
	|| state == TRX_STATE_ACTIVE);
  /* This makes the transaction committed in memory and makes its
  changes to data visible to other transactions. NOTE that there is a
  small discrepancy from the strict formal visibility rules here: a
  user of the database can see modifications made by another
  transaction T even before the necessary redo log segment has been
  flushed to the disk. If the database happens to crash before the
  flush, the user has seen modifications from T which will never be a
  committed transaction. However, any transaction T2 which sees the
  modifications of the committing transaction T, and which also itself
  makes modifications to the database, will get an lsn larger than the
  committing transaction T. In the case where the log flush fails, and
  T never gets committed, also T2 will never get committed. */
  trx_mutex_enter(this);
  state= TRX_STATE_COMMITTED_IN_MEMORY;
  trx_mutex_exit(this);
  /* A transaction without an id is expected to be unreferenced. */
  ut_ad(id || !is_referenced());
}
504
505 /** Release any explicit locks of a committing transaction. */
release_locks()506 inline void trx_t::release_locks()
507 {
508 DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
509 DBUG_ASSERT(!is_referenced());
510
511 if (UT_LIST_GET_LEN(lock.trx_locks))
512 {
513 lock_release(this);
514 lock.n_rec_locks = 0;
515 ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
516 ut_ad(ib_vector_is_empty(autoinc_locks));
517 mem_heap_empty(lock.lock_heap);
518 }
519
520 lock.table_locks.clear();
521 }
522
/** At shutdown, frees a transaction object.
The transaction must be a recovered one that is either prepared, or
still active under one of the conditions listed in the assertion below.
It is moved to the committed-in-memory state only so that its locks and
undo log objects can be released; after that the object is reset and
returned to the pool.
@param[in,out]	trx	recovered transaction to free */
void
trx_free_at_shutdown(trx_t *trx)
{
	ut_ad(trx->is_recovered);
	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
	     || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
	     || (trx_state_eq(trx, TRX_STATE_ACTIVE)
		 && (!srv_was_started
		     || srv_operation == SRV_OPERATION_RESTORE
		     || srv_operation == SRV_OPERATION_RESTORE_EXPORT
		     || srv_read_only_mode
		     || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
		     || (!srv_is_being_started
			 && !srv_undo_sources && srv_fast_shutdown))));
	ut_a(trx->magic_n == TRX_MAGIC_N);

	/* release_locks() asserts TRX_STATE_COMMITTED_IN_MEMORY, so the
	state must be changed first. */
	trx->commit_state();
	trx->release_locks();
	trx_undo_free_at_shutdown(trx);

	ut_a(!trx->read_only);

	DBUG_LOG("trx", "Free prepared: " << trx);
	/* Reset state and id before handing the object to
	trx_t::free(). */
	trx->state = TRX_STATE_NOT_STARTED;
	ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
	trx->id = 0;
	trx->free();
}
552
553
554 /**
555 Disconnect a prepared transaction from MySQL
556 @param[in,out] trx transaction
557 */
trx_disconnect_prepared(trx_t * trx)558 void trx_disconnect_prepared(trx_t *trx)
559 {
560 ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
561 ut_ad(trx->mysql_thd);
562 ut_ad(!trx->mysql_log_file_name);
563 trx->read_view.close();
564 trx->is_recovered= true;
565 trx->mysql_thd= NULL;
566 /* todo/fixme: suggest to do it at innodb prepare */
567 trx->will_lock= false;
568 trx_sys.rw_trx_hash.put_pins(trx);
569 }
570
/****************************************************************//**
Resurrect the table locks for a resurrected transaction.
The undo log is walked backwards from its top record to collect the ids
of all tables that the transaction modified. After the mini-transaction
has been committed, each of those tables that can be opened and is
readable gets an IX lock via lock_table_ix_resurrect(). For a PREPARED
transaction the table is also recorded in trx->mod_tables. Tables that
are not readable are removed from dict_sys instead. */
static
void
trx_resurrect_table_locks(
/*======================*/
	trx_t*			trx,	/*!< in/out: transaction */
	const trx_undo_t*	undo)	/*!< in: undo log */
{
	mtr_t			mtr;
	/* Ids of the tables referenced by the undo records; a set, so
	each table is processed once. */
	table_id_set		tables;

	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
	      trx_state_eq(trx, TRX_STATE_PREPARED));
	ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);

	/* Nothing to resurrect for an empty undo log. */
	if (undo->empty()) {
		return;
	}

	mtr_start(&mtr);

	/* trx_rseg_mem_create() may have acquired an X-latch on this
	page, so we cannot acquire an S-latch. */
	buf_block_t* block = trx_undo_page_get(
		page_id_t(trx->rsegs.m_redo.rseg->space->id,
			  undo->top_page_no), &mtr);
	buf_block_t* undo_block = block;
	trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;

	do {
		ulint		type;
		undo_no_t	undo_no;
		table_id_t	table_id;
		ulint		cmpl_info;
		bool		updated_extern;

		/* NOTE(review): this presumably fires when
		trx_undo_get_prev_rec() moved "block" to another undo
		page (i.e. it takes the block by reference; confirm).
		The X-fix on the page that was left is released. */
		if (undo_block != block) {
			mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
			undo_block = block;
		}

		/* Only table_id is used below; the other outputs are
		required by the parser's signature. */
		trx_undo_rec_get_pars(
			undo_rec, &type, &cmpl_info,
			&updated_extern, &undo_no, &table_id);
		tables.insert(table_id);

		undo_rec = trx_undo_get_prev_rec(
			block, page_offset(undo_rec), undo->hdr_page_no,
			undo->hdr_offset, false, &mtr);
	} while (undo_rec);

	mtr_commit(&mtr);

	for (table_id_set::const_iterator i = tables.begin();
	     i != tables.end(); i++) {
		/* Tables that cannot be opened are skipped silently. */
		if (dict_table_t* table = dict_table_open_on_id(
			    *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
			if (!table->is_readable()) {
				/* Drop the unreadable table from the
				dictionary cache while holding
				dict_sys.mutex. */
				mutex_enter(&dict_sys.mutex);
				dict_table_close(table, TRUE, FALSE);
				dict_sys.remove(table);
				mutex_exit(&dict_sys.mutex);
				continue;
			}

			if (trx->state == TRX_STATE_PREPARED) {
				trx->mod_tables.insert(
					trx_mod_tables_t::value_type(table,
								     0));
			}
			lock_table_ix_resurrect(table, trx);

			DBUG_LOG("ib_trx",
				 "resurrect " << ib::hex(trx->id)
				 << " IX lock on " << table->name);

			dict_table_close(table, FALSE, FALSE);
		}
	}
}
652
653
654 /**
655 Resurrect the transactions that were doing inserts/updates the time of the
656 crash, they need to be undone.
657 */
658
trx_resurrect(trx_undo_t * undo,trx_rseg_t * rseg,time_t start_time,ulonglong start_time_micro,uint64_t * rows_to_undo)659 static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
660 time_t start_time, ulonglong start_time_micro,
661 uint64_t *rows_to_undo)
662 {
663 trx_state_t state;
664 /*
665 This is single-threaded startup code, we do not need the
666 protection of trx->mutex here.
667 */
668 switch (undo->state)
669 {
670 case TRX_UNDO_ACTIVE:
671 state= TRX_STATE_ACTIVE;
672 break;
673 case TRX_UNDO_PREPARED:
674 /*
675 Prepared transactions are left in the prepared state
676 waiting for a commit or abort decision from MySQL
677 */
678 ib::info() << "Transaction " << undo->trx_id
679 << " was in the XA prepared state.";
680
681 state= TRX_STATE_PREPARED;
682 break;
683 default:
684 return;
685 }
686
687 trx_t *trx= trx_create();
688 trx->state= state;
689 ut_d(trx->start_file= __FILE__);
690 ut_d(trx->start_line= __LINE__);
691
692 trx->rsegs.m_redo.undo= undo;
693 trx->undo_no= undo->top_undo_no + 1;
694 trx->rsegs.m_redo.rseg= rseg;
695 /*
696 For transactions with active data will not have rseg size = 1
697 or will not qualify for purge limit criteria. So it is safe to increment
698 this trx_ref_count w/o mutex protection.
699 */
700 ++trx->rsegs.m_redo.rseg->trx_ref_count;
701 *trx->xid= undo->xid;
702 trx->id= undo->trx_id;
703 trx->is_recovered= true;
704 trx->start_time= start_time;
705 trx->start_time_micro= start_time_micro;
706
707 if (undo->dict_operation)
708 {
709 trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
710 if (!trx->table_id)
711 trx->table_id= undo->table_id;
712 }
713
714 trx_sys.rw_trx_hash.insert(trx);
715 trx_sys.rw_trx_hash.put_pins(trx);
716 trx_resurrect_table_locks(trx, undo);
717 if (trx_state_eq(trx, TRX_STATE_ACTIVE))
718 *rows_to_undo+= trx->undo_no;
719 }
720
721
/** Initialize (resurrect) transactions at startup.
Scans the undo logs of every available rollback segment. A transaction
object is created for each undo log whose transaction is not yet known
to trx_sys; an already known transaction is updated with this undo log.
A summary of the work to be undone is logged.
@return DB_SUCCESS, or the error reported by trx_rseg_array_init() */
dberr_t trx_lists_init_at_db_start()
{
	ut_a(srv_is_being_started);
	ut_ad(!srv_was_started);

	if (srv_operation == SRV_OPERATION_RESTORE) {
		/* mariabackup --prepare only deals with
		the redo log and the data files, not with
		transactions or the data dictionary. */
		return trx_rseg_array_init();
	}

	/* At this recovery level the undo logs are not scanned at all. */
	if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
		return DB_SUCCESS;
	}

	purge_sys.create();
	if (dberr_t err = trx_rseg_array_init()) {
		ib::info() << "Retry with innodb_force_recovery=5";
		return err;
	}

	/* Look from the rollback segments if there exist undo logs for
	transactions. */
	/* All resurrected transactions share one start time; the
	assertion in the loop below relies on this. */
	const time_t	start_time	= time(NULL);
	const ulonglong	start_time_micro= microsecond_interval_timer();
	uint64_t	rows_to_undo	= 0;

	for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
		trx_undo_t*	undo;
		trx_rseg_t*	rseg = trx_sys.rseg_array[i];

		/* Some rollback segment may be unavailable,
		especially if the server was previously run with a
		non-default value of innodb_undo_logs. */
		if (rseg == NULL) {
			continue;
		}
		/* Resurrect other transactions. */
		for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
		     undo != NULL;
		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
			trx_t *trx = trx_sys.find(0, undo->trx_id, false);
			if (!trx) {
				trx_resurrect(undo, rseg, start_time,
					      start_time_micro, &rows_to_undo);
			} else {
				/* The transaction was already
				resurrected from another undo log in
				this scan. */
				ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
				      trx_state_eq(trx, TRX_STATE_PREPARED));
				ut_ad(trx->start_time == start_time);
				ut_ad(trx->is_recovered);
				ut_ad(trx->rsegs.m_redo.rseg == rseg);
				ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count);

				trx->rsegs.m_redo.undo = undo;
				if (undo->top_undo_no >= trx->undo_no) {
					/* Replace the contribution that
					was counted for this transaction
					with the larger value. */
					if (trx_state_eq(trx,
							 TRX_STATE_ACTIVE)) {
						rows_to_undo -= trx->undo_no;
						rows_to_undo +=
							undo->top_undo_no + 1;
					}

					trx->undo_no = undo->top_undo_no + 1;
				}
				trx_resurrect_table_locks(trx, undo);
			}
		}
	}

	/* Report only if at least one transaction was resurrected. */
	if (const auto size = trx_sys.rw_trx_hash.size()) {
		ib::info() << size
			<< " transaction(s) which must be rolled back or"
			" cleaned up in total " << rows_to_undo
			<< " row operations to undo";
		ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
	}

	purge_sys.clone_oldest_view();
	return DB_SUCCESS;
}
804
/** Assign a persistent rollback segment in a round-robin fashion,
evenly distributed between 0 and innodb_undo_logs-1
The chosen segment's trx_ref_count is incremented under the segment
mutex before it is returned.
@return persistent rollback segment
@retval NULL if innodb_read_only */
static trx_rseg_t* trx_assign_rseg_low()
{
	if (high_level_read_only) {
		ut_ad(!srv_available_undo_logs);
		return(NULL);
	}

	ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);

	/* The first slot is always assigned to the system tablespace. */
	ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);

	/* Choose a rollback segment evenly distributed between 0 and
	innodb_undo_logs-1 in a round-robin fashion, skipping those
	undo tablespaces that are scheduled for truncation. */
	/* The counter is shared by all callers; each call takes the
	next starting slot. */
	static Atomic_counter<unsigned>	rseg_slot;
	unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
	/* Debug builds: when trx_rseg_n_slots_debug is set, always use
	slot 0. */
	ut_d(if (trx_rseg_n_slots_debug) slot = 0);
	trx_rseg_t*	rseg;

#ifdef UNIV_DEBUG
	ulint	start_scan_slot = slot;
	bool	look_for_rollover = false;
#endif /* UNIV_DEBUG */

	bool	allocated = false;

	/* Outer loop: retry until a segment was marked allocated.
	Inner loop: pick a candidate segment. */
	do {
		for (;;) {
			rseg = trx_sys.rseg_array[slot];

#ifdef UNIV_DEBUG
			/* Ensure that we are not revisiting the same
			slot that we have already inspected. */
			if (look_for_rollover) {
				ut_ad(start_scan_slot != slot);
			}
			look_for_rollover = true;
#endif /* UNIV_DEBUG */

			/* Advance to the next slot. In debug builds
			the ut_d() line makes this assignment
			conditional on trx_rseg_n_slots_debug being
			unset; otherwise it is unconditional. From here
			on "slot" denotes the slot AFTER rseg. */
			ut_d(if (!trx_rseg_n_slots_debug))
			slot = (slot + 1) % TRX_SYS_N_RSEGS;

			if (rseg == NULL) {
				continue;
			}

			ut_ad(rseg->is_persistent());

			if (rseg->space != fil_system.sys_space) {
				if (rseg->skip_allocation
				    || !srv_undo_tablespaces) {
					continue;
				}
			} else if (trx_rseg_t* next
				   = trx_sys.rseg_array[slot]) {
				if (next->space != fil_system.sys_space
				    && srv_undo_tablespaces > 0) {
					/** If dedicated
					innodb_undo_tablespaces have
					been configured, try to use them
					instead of the system tablespace. */
					continue;
				}
			}

			break;
		}

		/* By now we have only selected the rseg but not marked it
		allocated. By marking it allocated we are ensuring that it will
		never be selected for UNDO truncate purge. */
		/* skip_allocation is checked again under the mutex
		because the check in the loop above was made without
		holding it. */
		mutex_enter(&rseg->mutex);
		if (!rseg->skip_allocation) {
			rseg->trx_ref_count++;
			allocated = true;
		}
		mutex_exit(&rseg->mutex);
	} while (!allocated);

	ut_ad(rseg->trx_ref_count > 0);
	ut_ad(rseg->is_persistent());
	return(rseg);
}
893
894 /** Assign a rollback segment for modifying temporary tables.
895 @return the assigned rollback segment */
assign_temp_rseg()896 trx_rseg_t *trx_t::assign_temp_rseg()
897 {
898 ut_ad(!rsegs.m_noredo.rseg);
899 ut_ad(!is_autocommit_non_locking());
900 compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
901
902 /* Choose a temporary rollback segment between 0 and 127
903 in a round-robin fashion. */
904 static Atomic_counter<unsigned> rseg_slot;
905 trx_rseg_t* rseg = trx_sys.temp_rsegs[
906 rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
907 ut_ad(!rseg->is_persistent());
908 rsegs.m_noredo.rseg = rseg;
909
910 if (id == 0) {
911 trx_sys.register_rw(this);
912 }
913
914 ut_ad(!rseg->is_persistent());
915 return(rseg);
916 }
917
918 /****************************************************************//**
919 Starts a transaction. */
920 static
921 void
trx_start_low(trx_t * trx,bool read_write)922 trx_start_low(
923 /*==========*/
924 trx_t* trx, /*!< in: transaction */
925 bool read_write) /*!< in: true if read-write transaction */
926 {
927 ut_ad(!trx->in_rollback);
928 ut_ad(!trx->is_recovered);
929 ut_ad(trx->start_line != 0);
930 ut_ad(trx->start_file != 0);
931 ut_ad(trx->roll_limit == 0);
932 ut_ad(trx->error_state == DB_SUCCESS);
933 ut_ad(trx->rsegs.m_redo.rseg == NULL);
934 ut_ad(trx->rsegs.m_noredo.rseg == NULL);
935 ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
936 ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
937
938 /* Check whether it is an AUTOCOMMIT SELECT */
939 trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
940
941 trx->read_only = srv_read_only_mode
942 || (!trx->ddl && !trx->internal
943 && thd_trx_is_read_only(trx->mysql_thd));
944
945 if (!trx->auto_commit) {
946 trx->will_lock = true;
947 } else if (!trx->will_lock) {
948 trx->read_only = true;
949 }
950
951 #ifdef WITH_WSREP
952 trx->xid->null();
953 #endif /* WITH_WSREP */
954
955 ut_a(ib_vector_is_empty(trx->autoinc_locks));
956 ut_a(trx->lock.table_locks.empty());
957
958 /* No other thread can access this trx object through rw_trx_hash,
959 still it can be found through trx_sys.trx_list. Sometimes it's
960 possible to indirectly protect trx_t::state by freezing
961 trx_sys.trx_list.
962
963 For now we update it without mutex protection, because original code
964 did it this way. It has to be reviewed and fixed properly. */
965 trx->state = TRX_STATE_ACTIVE;
966
967 /* By default all transactions are in the read-only list unless they
968 are non-locking auto-commit read only transactions or background
969 (internal) transactions. Note: Transactions marked explicitly as
970 read only can write to temporary tables, we put those on the RO
971 list too. */
972
973 if (!trx->read_only
974 && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
975
976 /* Temporary rseg is assigned only if the transaction
977 updates a temporary table */
978 trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
979 ut_ad(trx->rsegs.m_redo.rseg != 0
980 || srv_read_only_mode
981 || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
982
983 trx_sys.register_rw(trx);
984 } else {
985 if (!trx->is_autocommit_non_locking()) {
986
987 /* If this is a read-only transaction that is writing
988 to a temporary table then it needs a transaction id
989 to write to the temporary table. */
990
991 if (read_write) {
992 ut_ad(!srv_read_only_mode);
993 trx_sys.register_rw(trx);
994 }
995 } else {
996 ut_ad(!read_write);
997 }
998 }
999
1000 trx->start_time = time(NULL);
1001 trx->start_time_micro = trx->mysql_thd
1002 ? thd_query_start_micro(trx->mysql_thd)
1003 : microsecond_interval_timer();
1004
1005 ut_a(trx->error_state == DB_SUCCESS);
1006
1007 MONITOR_INC(MONITOR_TRX_ACTIVE);
1008 }
1009
1010 /** Set the serialisation number for a persistent committed transaction.
1011 @param[in,out] trx committed transaction with persistent changes */
1012 static
1013 void
trx_serialise(trx_t * trx)1014 trx_serialise(trx_t* trx)
1015 {
1016 trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
1017 ut_ad(rseg);
1018 ut_ad(mutex_own(&rseg->mutex));
1019
1020 if (rseg->last_page_no == FIL_NULL) {
1021 mutex_enter(&purge_sys.pq_mutex);
1022 }
1023
1024 trx_sys.assign_new_trx_no(trx);
1025
1026 /* If the rollback segment is not empty then the
1027 new trx_t::no can't be less than any trx_t::no
1028 already in the rollback segment. User threads only
1029 produce events when a rollback segment is empty. */
1030 if (rseg->last_page_no == FIL_NULL) {
1031 purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
1032 *rseg));
1033 mutex_exit(&purge_sys.pq_mutex);
1034 }
1035 }
1036
1037 /****************************************************************//**
1038 Assign the transaction its history serialisation number and write the
1039 update UNDO log record to the assigned rollback segment. */
1040 static
1041 void
trx_write_serialisation_history(trx_t * trx,mtr_t * mtr)1042 trx_write_serialisation_history(
1043 /*============================*/
1044 trx_t* trx, /*!< in/out: transaction */
1045 mtr_t* mtr) /*!< in/out: mini-transaction */
1046 {
1047 /* Change the undo log segment states from TRX_UNDO_ACTIVE to some
1048 other state: these modifications to the file data structure define
1049 the transaction as committed in the file based domain, at the
1050 serialization point of the log sequence number lsn obtained below. */
1051
1052 /* We have to hold the rseg mutex because update log headers have
1053 to be put to the history list in the (serialisation) order of the
1054 UNDO trx number. This is required for the purge in-memory data
1055 structures too. */
1056
1057 if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
1058 /* Undo log for temporary tables is discarded at transaction
1059 commit. There is no purge for temporary tables, and also no
1060 MVCC, because they are private to a session. */
1061
1062 mtr_t temp_mtr;
1063 temp_mtr.start();
1064 temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
1065
1066 mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
1067 trx_undo_set_state_at_finish(undo, &temp_mtr);
1068 mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
1069 temp_mtr.commit();
1070 }
1071
1072 trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
1073 if (!rseg) {
1074 ut_ad(!trx->rsegs.m_redo.undo);
1075 return;
1076 }
1077
1078 trx_undo_t*& undo = trx->rsegs.m_redo.undo;
1079
1080 if (!undo) {
1081 return;
1082 }
1083
1084 ut_ad(!trx->read_only);
1085 ut_ad(!undo || undo->rseg == rseg);
1086 mutex_enter(&rseg->mutex);
1087
1088 /* Assign the transaction serialisation number and add any
1089 undo log to the purge queue. */
1090 trx_serialise(trx);
1091 if (undo) {
1092 UT_LIST_REMOVE(rseg->undo_list, undo);
1093 trx_purge_add_undo_to_history(trx, undo, mtr);
1094 }
1095
1096 mutex_exit(&rseg->mutex);
1097
1098 MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
1099 }
1100
1101 /********************************************************************
1102 Finalize a transaction containing updates for a FTS table. */
1103 static
1104 void
trx_finalize_for_fts_table(fts_trx_table_t * ftt)1105 trx_finalize_for_fts_table(
1106 /*=======================*/
1107 fts_trx_table_t* ftt) /* in: FTS trx table */
1108 {
1109 fts_t* fts = ftt->table->fts;
1110 fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
1111
1112 ut_a(fts->add_wq);
1113
1114 mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
1115
1116 ib_wqueue_add(fts->add_wq, doc_ids, heap);
1117
1118 /* fts_trx_table_t no longer owns the list. */
1119 ftt->added_doc_ids = NULL;
1120 }
1121
1122 /******************************************************************//**
1123 Finalize a transaction containing updates to FTS tables. */
1124 static
1125 void
trx_finalize_for_fts(trx_t * trx,bool is_commit)1126 trx_finalize_for_fts(
1127 /*=================*/
1128 trx_t* trx, /*!< in/out: transaction */
1129 bool is_commit) /*!< in: true if the transaction was
1130 committed, false if it was rolled back. */
1131 {
1132 if (is_commit) {
1133 const ib_rbt_node_t* node;
1134 ib_rbt_t* tables;
1135 fts_savepoint_t* savepoint;
1136
1137 savepoint = static_cast<fts_savepoint_t*>(
1138 ib_vector_last(trx->fts_trx->savepoints));
1139
1140 tables = savepoint->tables;
1141
1142 for (node = rbt_first(tables);
1143 node;
1144 node = rbt_next(tables, node)) {
1145 fts_trx_table_t** ftt;
1146
1147 ftt = rbt_value(fts_trx_table_t*, node);
1148
1149 if ((*ftt)->added_doc_ids) {
1150 trx_finalize_for_fts_table(*ftt);
1151 }
1152 }
1153 }
1154
1155 fts_trx_free(trx->fts_trx);
1156 trx->fts_trx = NULL;
1157 }
1158
1159 /**********************************************************************//**
1160 If required, flushes the log to disk based on the value of
1161 innodb_flush_log_at_trx_commit. */
1162 static
1163 void
trx_flush_log_if_needed_low(lsn_t lsn)1164 trx_flush_log_if_needed_low(
1165 /*========================*/
1166 lsn_t lsn) /*!< in: lsn up to which logs are to be
1167 flushed. */
1168 {
1169 bool flush = srv_file_flush_method != SRV_NOSYNC;
1170
1171 switch (srv_flush_log_at_trx_commit) {
1172 case 2:
1173 /* Write the log but do not flush it to disk */
1174 flush = false;
1175 /* fall through */
1176 case 1:
1177 case 3:
1178 /* Write the log and optionally flush it to disk */
1179 log_write_up_to(lsn, flush);
1180 srv_inc_activity_count();
1181 return;
1182 case 0:
1183 /* Do nothing */
1184 return;
1185 }
1186
1187 ut_error;
1188 }
1189
1190 /**********************************************************************//**
1191 If required, flushes the log to disk based on the value of
1192 innodb_flush_log_at_trx_commit. */
1193 static
1194 void
trx_flush_log_if_needed(lsn_t lsn,trx_t * trx)1195 trx_flush_log_if_needed(
1196 /*====================*/
1197 lsn_t lsn, /*!< in: lsn up to which logs are to be
1198 flushed. */
1199 trx_t* trx) /*!< in/out: transaction */
1200 {
1201 trx->op_info = "flushing log";
1202 trx_flush_log_if_needed_low(lsn);
1203 trx->op_info = "";
1204 }
1205
1206 /**********************************************************************//**
1207 For each table that has been modified by the given transaction: update
1208 its dict_table_t::update_time with the current timestamp. Clear the list
1209 of the modified tables at the end. */
1210 static
1211 void
trx_update_mod_tables_timestamp(trx_t * trx)1212 trx_update_mod_tables_timestamp(
1213 /*============================*/
1214 trx_t* trx) /*!< in: transaction */
1215 {
1216 /* consider using trx->start_time if calling time() is too
1217 expensive here */
1218 const time_t now = time(NULL);
1219
1220 trx_mod_tables_t::const_iterator end = trx->mod_tables.end();
1221
1222 for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
1223 it != end;
1224 ++it) {
1225
1226 /* This could be executed by multiple threads concurrently
1227 on the same table object. This is fine because time_t is
1228 word size or less. And _purely_ _theoretically_, even if
1229 time_t write is not atomic, likely the value of 'now' is
1230 the same in all threads and even if it is not, getting a
1231 "garbage" in table->update_time is justified because
1232 protecting it with a latch here would be too performance
1233 intrusive. */
1234 dict_table_t* table = it->first;
1235 table->update_time = now;
1236 }
1237
1238 trx->mod_tables.clear();
1239 }
1240
1241 /** Evict a table definition due to the rollback of ALTER TABLE.
1242 @param[in] table_id table identifier */
evict_table(table_id_t table_id)1243 void trx_t::evict_table(table_id_t table_id)
1244 {
1245 ut_ad(in_rollback);
1246
1247 dict_table_t* table = dict_table_open_on_id(
1248 table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
1249 if (!table) {
1250 return;
1251 }
1252
1253 if (!table->release()) {
1254 /* This must be a DDL operation that is being rolled
1255 back in an active connection. */
1256 ut_a(table->get_ref_count() == 1);
1257 ut_ad(!is_recovered);
1258 ut_ad(mysql_thd);
1259 return;
1260 }
1261
1262 /* This table should only be locked by this transaction, if at all. */
1263 ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
1264 const bool locked = UT_LIST_GET_LEN(table->locks);
1265 ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
1266 dict_sys.remove(table, true, locked);
1267 if (locked) {
1268 UT_LIST_ADD_FIRST(lock.evicted_tables, table);
1269 }
1270 }
1271
/** Mark a transaction committed in the main memory data structures.

In order, this function:
- clears must_flush_log_later and closes the read view;
- for an autocommit non-locking transaction, only resets the state;
  otherwise performs commit_state(), deregisters the transaction from
  trx_sys if it has an id, releases its locks and frees the tables that
  were collected in lock.evicted_tables;
- if mtr is given, cleans up the temporary undo log and, based on
  mtr->commit_lsn(), either flushes the redo log or notes in
  must_flush_log_later that the flush was deferred;
- releases the reference on the redo rollback segment, frees savepoints,
  finalizes FTS, and finally resets the object with trx_init().
@param mtr  mini-transaction that was used for the commit, or nullptr */
inline void trx_t::commit_in_memory(const mtr_t *mtr)
{
  must_flush_log_later= false;
  read_view.close();

  if (is_autocommit_non_locking())
  {
    ut_ad(id == 0);
    ut_ad(read_only);
    ut_ad(!will_lock);
    ut_a(!is_recovered);
    ut_ad(!rsegs.m_redo.rseg);
    ut_ad(mysql_thd);
    ut_ad(state == TRX_STATE_ACTIVE);

    /* Note: We are asserting without holding the lock mutex. But
    that is OK because this transaction is not waiting and cannot
    be rolled back and no new locks can (or should) be added
    because it is flagged as a non-locking read-only transaction. */
    ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);

    /* This state change is not protected by any mutex, therefore
    there is an inherent race here around state transition during
    printouts. We ignore this race for the sake of efficiency.
    However, the freezing of trx_sys.trx_list will protect the trx_t
    instance and it cannot be removed from the trx_list and freed
    without first unfreezing trx_list. */
    state= TRX_STATE_NOT_STARTED;

    MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);

    DBUG_LOG("trx", "Autocommit in memory: " << this);
  }
  else
  {
#ifdef UNIV_DEBUG
    /* Debug check: a transaction without any lock objects must not
    have any non-null entry in lock.table_locks either. */
    if (!UT_LIST_GET_LEN(lock.trx_locks))
      for (auto l : lock.table_locks)
        ut_ad(!l);
#endif /* UNIV_DEBUG */
    commit_state();

    if (id)
    {
      trx_sys.deregister_rw(this);

      /* Wait for any implicit-to-explicit lock conversions to cease,
      so that there will be no race condition in lock_release(). */
      while (UNIV_UNLIKELY(is_referenced()))
        ut_delay(srv_spin_wait_delay);
    }
    else
      ut_ad(read_only || !rsegs.m_redo.rseg);

    /* Only a transaction that is not read-only and has a redo rollback
    segment is counted as read-write; only for it are the modified
    tables' timestamps updated. */
    if (read_only || !rsegs.m_redo.rseg)
    {
      MONITOR_INC(MONITOR_TRX_RO_COMMIT);
    }
    else
    {
      trx_update_mod_tables_timestamp(this);
      MONITOR_INC(MONITOR_TRX_RW_COMMIT);
      is_recovered= false;
    }

    release_locks();
    id= 0;
    DEBUG_SYNC_C("after_trx_committed_in_memory");

    /* Free the table definitions that evict_table() kept in
    lock.evicted_tables; this happens only after release_locks(). */
    while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
    {
      UT_LIST_REMOVE(lock.evicted_tables, table);
      dict_mem_table_free(table);
    }
  }

  ut_ad(!rsegs.m_redo.undo);
  ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);

  if (mtr)
  {
    /* Discard the undo log for temporary tables, if there is one. */
    if (trx_undo_t *&undo= rsegs.m_noredo.undo)
    {
      ut_ad(undo->rseg == rsegs.m_noredo.rseg);
      trx_undo_commit_cleanup(undo);
      undo= nullptr;
    }

    /* NOTE that we could possibly make a group commit more efficient
    here: call os_thread_yield here to allow also other trxs to come
    to commit! */

    /*-------------------------------------*/

    /* Depending on the my.cnf options, we may now write the log
    buffer to the log files, making the transaction durable if the OS
    does not crash. We may also flush the log files to disk, making
    the transaction durable also at an OS crash or a power outage.

    The idea in InnoDB's group commit is that a group of transactions
    gather behind a trx doing a physical disk write to log files, and
    when that physical write has been completed, one of those
    transactions does a write which commits the whole group. Note that
    this group commit will only bring benefit if there are > 2 users
    in the database. Then at least 2 users can gather behind one doing
    the physical log write to disk.

    If we are calling trx_t::commit() under prepare_commit_mutex, we
    will delay possible log write and flush to a separate function
    trx_commit_complete_for_mysql(), which is only called when the
    thread has released the mutex. This is to make the group commit
    algorithm to work. Otherwise, the prepare_commit mutex would
    serialize all commits and prevent a group of transactions from
    gathering. */

    commit_lsn= mtr->commit_lsn();
    if (!commit_lsn)
      /* Nothing to be done. */;
    else if (flush_log_later)
      /* Do nothing yet */
      must_flush_log_later= true;
    else if (srv_flush_log_at_trx_commit)
      trx_flush_log_if_needed(commit_lsn, this);
  }

  ut_ad(!rsegs.m_noredo.undo);

  /* Only after trx_undo_commit_cleanup() it is safe to release
  our rseg reference. */
  if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
  {
    mutex_enter(&rseg->mutex);
    ut_ad(rseg->trx_ref_count > 0);
    --rseg->trx_ref_count;
    mutex_exit(&rseg->mutex);
  }

  /* Free all savepoints, starting from the first. */
  trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);

  trx_roll_savepoints_free(this, savep);

  /* A nonzero undo_no is passed on as "committed" to the FTS
  finalization. */
  if (fts_trx)
    trx_finalize_for_fts(this, undo_no != 0);

#ifdef WITH_WSREP
  /* Serialization history has been written and the transaction is
  committed in memory, which makes this commit ordered. Release commit
  order critical section. */
  if (wsrep)
  {
    wsrep= false;
    wsrep_commit_ordered(mysql_thd);
  }
  lock.was_chosen_as_wsrep_victim= false;
#endif /* WITH_WSREP */
  /* The final reset of the transaction object is done while holding
  the transaction mutex. */
  trx_mutex_enter(this);
  dict_operation= TRX_DICT_OP_NONE;

  DBUG_LOG("trx", "Commit in memory: " << this);
  state= TRX_STATE_NOT_STARTED;

  assert_freed();
  trx_init(this);
  trx_mutex_exit(this);

  ut_a(error_state == DB_SUCCESS);
  if (!srv_read_only_mode)
    srv_wake_purge_thread_if_not_active();
}
1443
/** Commit the transaction in a mini-transaction.
If there is FTS state and undo_no is nonzero, fts_commit() is run first.
With a mini-transaction, the serialisation history is written and the
mini-transaction is committed; in all cases the transaction is then
committed in memory via commit_in_memory().
@param mtr  mini-transaction (if there are any persistent modifications);
            debug builds assert that it is null exactly when the
            transaction was aborted by a deadlock during rollback or
            has not logged anything */
void trx_t::commit_low(mtr_t *mtr)
{
  ut_ad(!mtr || mtr->is_active());
  ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
  ut_ad(!mtr == (aborted || !has_logged()));
  ut_ad(!mtr || !aborted);

  /* undo_no is non-zero if we're doing the final commit. */
  if (fts_trx && undo_no)
  {
    ut_a(!is_autocommit_non_locking());
    /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
    dying. This is a possible scenario if there is a crash between
    insert to DELETED table committing and transaction committing. The
    fix would be able to return error from this function */
    if (dberr_t error= fts_commit(this))
      ut_a(error == DB_DUPLICATE_KEY);
  }

#ifndef DBUG_OFF
  /* Evaluated before the commit, because commit_in_memory() resets the
  transaction object; used for the sync point further below. */
  const bool debug_sync= mysql_thd && has_logged_persistent();
#endif

  if (mtr)
  {
    trx_write_serialisation_history(this, mtr);

    /* The following call commits the mini-transaction, making the
    whole transaction committed in the file-based world, at this log
    sequence number. The transaction becomes 'durable' when we write
    the log to disk, but in the logical sense the commit in the
    file-based data structures (undo logs etc.) happens here.

    NOTE that transaction numbers, which are assigned only to
    transactions with an update undo log, do not necessarily come in
    exactly the same order as commit lsn's, if the transactions have
    different rollback segments. To get exactly the same order we
    should hold the kernel mutex up to this point, adding to the
    contention of the kernel mutex. However, if a transaction T2 is
    able to see modifications made by a transaction T1, T2 will always
    get a bigger transaction number and a bigger commit lsn than T1. */

    mtr->commit();
  }
#ifndef DBUG_OFF
  if (debug_sync)
    DEBUG_SYNC_C("before_trx_state_committed_in_memory");
#endif

  commit_in_memory(mtr);
}
1497
1498
commit()1499 void trx_t::commit()
1500 {
1501 mtr_t *mtr= nullptr;
1502 mtr_t local_mtr;
1503
1504 if (has_logged())
1505 {
1506 mtr= &local_mtr;
1507 local_mtr.start();
1508 }
1509 commit_low(mtr);
1510 }
1511
1512 /****************************************************************//**
1513 Prepares a transaction for commit/rollback. */
1514 void
trx_commit_or_rollback_prepare(trx_t * trx)1515 trx_commit_or_rollback_prepare(
1516 /*===========================*/
1517 trx_t* trx) /*!< in/out: transaction */
1518 {
1519 /* We are reading trx->state without holding trx->mutex
1520 here, because the commit or rollback should be invoked for a
1521 running (or recovered prepared) transaction that is associated
1522 with the current thread. */
1523
1524 switch (trx->state) {
1525 case TRX_STATE_NOT_STARTED:
1526 trx_start_low(trx, true);
1527 /* fall through */
1528
1529 case TRX_STATE_ACTIVE:
1530 case TRX_STATE_PREPARED:
1531 case TRX_STATE_PREPARED_RECOVERED:
1532 /* If the trx is in a lock wait state, moves the waiting
1533 query thread to the suspended state */
1534
1535 if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
1536
1537 ut_a(trx->lock.wait_thr != NULL);
1538 trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
1539 trx->lock.wait_thr = NULL;
1540
1541 trx->lock.que_state = TRX_QUE_RUNNING;
1542 }
1543
1544 ut_ad(trx->lock.n_active_thrs == 1);
1545 return;
1546
1547 case TRX_STATE_COMMITTED_IN_MEMORY:
1548 break;
1549 }
1550
1551 ut_error;
1552 }
1553
1554 /*********************************************************************//**
1555 Creates a commit command node struct.
1556 @return own: commit node struct */
1557 commit_node_t*
trx_commit_node_create(mem_heap_t * heap)1558 trx_commit_node_create(
1559 /*===================*/
1560 mem_heap_t* heap) /*!< in: mem heap where created */
1561 {
1562 commit_node_t* node;
1563
1564 node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
1565 node->common.type = QUE_NODE_COMMIT;
1566 node->state = COMMIT_NODE_SEND;
1567
1568 return(node);
1569 }
1570
1571 /***********************************************************//**
1572 Performs an execution step for a commit type node in a query graph.
1573 @return query thread to run next, or NULL */
1574 que_thr_t*
trx_commit_step(que_thr_t * thr)1575 trx_commit_step(
1576 /*============*/
1577 que_thr_t* thr) /*!< in: query thread */
1578 {
1579 commit_node_t* node;
1580
1581 node = static_cast<commit_node_t*>(thr->run_node);
1582
1583 ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1584
1585 if (thr->prev_node == que_node_get_parent(node)) {
1586 node->state = COMMIT_NODE_SEND;
1587 }
1588
1589 if (node->state == COMMIT_NODE_SEND) {
1590 trx_t* trx;
1591
1592 node->state = COMMIT_NODE_WAIT;
1593
1594 trx = thr_get_trx(thr);
1595
1596 ut_a(trx->lock.wait_thr == NULL);
1597 ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
1598
1599 trx_commit_or_rollback_prepare(trx);
1600
1601 trx->lock.que_state = TRX_QUE_COMMITTING;
1602 trx->commit();
1603 ut_ad(trx->lock.wait_thr == NULL);
1604 trx->lock.que_state = TRX_QUE_RUNNING;
1605
1606 thr = NULL;
1607 } else {
1608 ut_ad(node->state == COMMIT_NODE_WAIT);
1609
1610 node->state = COMMIT_NODE_SEND;
1611
1612 thr->run_node = que_node_get_parent(node);
1613 }
1614
1615 return(thr);
1616 }
1617
1618 /**********************************************************************//**
1619 Does the transaction commit for MySQL.
1620 @return DB_SUCCESS or error number */
1621 dberr_t
trx_commit_for_mysql(trx_t * trx)1622 trx_commit_for_mysql(
1623 /*=================*/
1624 trx_t* trx) /*!< in/out: transaction */
1625 {
1626 /* Because we do not do the commit by sending an Innobase
1627 sig to the transaction, we must here make sure that trx has been
1628 started. */
1629
1630 switch (trx->state) {
1631 case TRX_STATE_NOT_STARTED:
1632 ut_d(trx->start_file = __FILE__);
1633 ut_d(trx->start_line = __LINE__);
1634
1635 trx_start_low(trx, true);
1636 /* fall through */
1637 case TRX_STATE_ACTIVE:
1638 case TRX_STATE_PREPARED:
1639 case TRX_STATE_PREPARED_RECOVERED:
1640 trx->op_info = "committing";
1641 trx->commit();
1642 MONITOR_DEC(MONITOR_TRX_ACTIVE);
1643 trx->op_info = "";
1644 return(DB_SUCCESS);
1645 case TRX_STATE_COMMITTED_IN_MEMORY:
1646 break;
1647 }
1648 ut_error;
1649 return(DB_CORRUPTION);
1650 }
1651
1652 /**********************************************************************//**
1653 If required, flushes the log to disk if we called trx_commit_for_mysql()
1654 with trx->flush_log_later == TRUE. */
1655 void
trx_commit_complete_for_mysql(trx_t * trx)1656 trx_commit_complete_for_mysql(
1657 /*==========================*/
1658 trx_t* trx) /*!< in/out: transaction */
1659 {
1660 if (trx->id != 0
1661 || !trx->must_flush_log_later
1662 || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
1663
1664 return;
1665 }
1666
1667 trx_flush_log_if_needed(trx->commit_lsn, trx);
1668
1669 trx->must_flush_log_later = false;
1670 }
1671
1672 /**********************************************************************//**
1673 Marks the latest SQL statement ended. */
1674 void
trx_mark_sql_stat_end(trx_t * trx)1675 trx_mark_sql_stat_end(
1676 /*==================*/
1677 trx_t* trx) /*!< in: trx handle */
1678 {
1679 ut_a(trx);
1680
1681 switch (trx->state) {
1682 case TRX_STATE_PREPARED:
1683 case TRX_STATE_PREPARED_RECOVERED:
1684 case TRX_STATE_COMMITTED_IN_MEMORY:
1685 break;
1686 case TRX_STATE_NOT_STARTED:
1687 trx->undo_no = 0;
1688 /* fall through */
1689 case TRX_STATE_ACTIVE:
1690 trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1691
1692 if (trx->fts_trx != NULL) {
1693 fts_savepoint_laststmt_refresh(trx);
1694 }
1695
1696 return;
1697 }
1698
1699 ut_error;
1700 }
1701
1702 /**********************************************************************//**
1703 Prints info about a transaction. */
1704 void
trx_print_low(FILE * f,const trx_t * trx,ulint max_query_len,ulint n_rec_locks,ulint n_trx_locks,ulint heap_size)1705 trx_print_low(
1706 /*==========*/
1707 FILE* f,
1708 /*!< in: output stream */
1709 const trx_t* trx,
1710 /*!< in: transaction */
1711 ulint max_query_len,
1712 /*!< in: max query length to print,
1713 or 0 to use the default max length */
1714 ulint n_rec_locks,
1715 /*!< in: lock_number_of_rows_locked(&trx->lock) */
1716 ulint n_trx_locks,
1717 /*!< in: length of trx->lock.trx_locks */
1718 ulint heap_size)
1719 /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
1720 {
1721 ibool newline;
1722
1723 fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
1724
1725 switch (trx->state) {
1726 case TRX_STATE_NOT_STARTED:
1727 fputs(", not started", f);
1728 goto state_ok;
1729 case TRX_STATE_ACTIVE:
1730 fprintf(f, ", ACTIVE %lu sec",
1731 (ulong) difftime(time(NULL), trx->start_time));
1732 goto state_ok;
1733 case TRX_STATE_PREPARED:
1734 case TRX_STATE_PREPARED_RECOVERED:
1735 fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1736 (ulong) difftime(time(NULL), trx->start_time));
1737 goto state_ok;
1738 case TRX_STATE_COMMITTED_IN_MEMORY:
1739 fputs(", COMMITTED IN MEMORY", f);
1740 goto state_ok;
1741 }
1742 fprintf(f, ", state %lu", (ulong) trx->state);
1743 ut_ad(0);
1744 state_ok:
1745 const char* op_info = trx->op_info;
1746
1747 if (*op_info) {
1748 putc(' ', f);
1749 fputs(op_info, f);
1750 }
1751
1752 if (trx->is_recovered) {
1753 fputs(" recovered trx", f);
1754 }
1755
1756 putc('\n', f);
1757
1758 if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
1759 fprintf(f, "mysql tables in use %lu, locked %lu\n",
1760 (ulong) trx->n_mysql_tables_in_use,
1761 (ulong) trx->mysql_n_tables_locked);
1762 }
1763
1764 newline = TRUE;
1765
1766 /* trx->lock.que_state of an ACTIVE transaction may change
1767 while we are not holding trx->mutex. We perform a dirty read
1768 for performance reasons. */
1769
1770 switch (trx->lock.que_state) {
1771 case TRX_QUE_RUNNING:
1772 newline = FALSE; break;
1773 case TRX_QUE_LOCK_WAIT:
1774 fputs("LOCK WAIT ", f); break;
1775 case TRX_QUE_ROLLING_BACK:
1776 fputs("ROLLING BACK ", f); break;
1777 case TRX_QUE_COMMITTING:
1778 fputs("COMMITTING ", f); break;
1779 default:
1780 fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
1781 }
1782
1783 if (n_trx_locks > 0 || heap_size > 400) {
1784 newline = TRUE;
1785
1786 fprintf(f, "%lu lock struct(s), heap size %lu,"
1787 " %lu row lock(s)",
1788 (ulong) n_trx_locks,
1789 (ulong) heap_size,
1790 (ulong) n_rec_locks);
1791 }
1792
1793 if (trx->undo_no != 0) {
1794 newline = TRUE;
1795 fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
1796 }
1797
1798 if (newline) {
1799 putc('\n', f);
1800 }
1801
1802 if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
1803 innobase_mysql_print_thd(
1804 f, trx->mysql_thd, static_cast<uint>(max_query_len));
1805 }
1806 }
1807
1808 /**********************************************************************//**
1809 Prints info about a transaction.
1810 The caller must hold lock_sys.mutex.
1811 When possible, use trx_print() instead. */
1812 void
trx_print_latched(FILE * f,const trx_t * trx,ulint max_query_len)1813 trx_print_latched(
1814 /*==============*/
1815 FILE* f, /*!< in: output stream */
1816 const trx_t* trx, /*!< in: transaction */
1817 ulint max_query_len) /*!< in: max query length to print,
1818 or 0 to use the default max length */
1819 {
1820 ut_ad(lock_mutex_own());
1821
1822 trx_print_low(f, trx, max_query_len,
1823 lock_number_of_rows_locked(&trx->lock),
1824 UT_LIST_GET_LEN(trx->lock.trx_locks),
1825 mem_heap_get_size(trx->lock.lock_heap));
1826 }
1827
1828 /**********************************************************************//**
1829 Prints info about a transaction.
1830 Acquires and releases lock_sys.mutex. */
1831 void
trx_print(FILE * f,const trx_t * trx,ulint max_query_len)1832 trx_print(
1833 /*======*/
1834 FILE* f, /*!< in: output stream */
1835 const trx_t* trx, /*!< in: transaction */
1836 ulint max_query_len) /*!< in: max query length to print,
1837 or 0 to use the default max length */
1838 {
1839 ulint n_rec_locks;
1840 ulint n_trx_locks;
1841 ulint heap_size;
1842
1843 lock_mutex_enter();
1844 n_rec_locks = lock_number_of_rows_locked(&trx->lock);
1845 n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
1846 heap_size = mem_heap_get_size(trx->lock.lock_heap);
1847 lock_mutex_exit();
1848
1849 trx_print_low(f, trx, max_query_len,
1850 n_rec_locks, n_trx_locks, heap_size);
1851 }
1852
1853 /*******************************************************************//**
1854 Compares the "weight" (or size) of two transactions. Transactions that
1855 have edited non-transactional tables are considered heavier than ones
1856 that have not.
1857 @return TRUE if weight(a) >= weight(b) */
1858 bool
trx_weight_ge(const trx_t * a,const trx_t * b)1859 trx_weight_ge(
1860 /*==========*/
1861 const trx_t* a, /*!< in: transaction to be compared */
1862 const trx_t* b) /*!< in: transaction to be compared */
1863 {
1864 ibool a_notrans_edit;
1865 ibool b_notrans_edit;
1866
1867 /* If mysql_thd is NULL for a transaction we assume that it has
1868 not edited non-transactional tables. */
1869
1870 a_notrans_edit = a->mysql_thd != NULL
1871 && thd_has_edited_nontrans_tables(a->mysql_thd);
1872
1873 b_notrans_edit = b->mysql_thd != NULL
1874 && thd_has_edited_nontrans_tables(b->mysql_thd);
1875
1876 if (a_notrans_edit != b_notrans_edit) {
1877
1878 return(a_notrans_edit);
1879 }
1880
1881 /* Either both had edited non-transactional tables or both had
1882 not, we fall back to comparing the number of altered/locked
1883 rows. */
1884
1885 return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
1886 }
1887
1888 /** Prepare a transaction.
1889 @return log sequence number that makes the XA PREPARE durable
1890 @retval 0 if no changes needed to be made durable */
trx_prepare_low(trx_t * trx)1891 static lsn_t trx_prepare_low(trx_t *trx)
1892 {
1893 ut_ad(!trx->is_recovered);
1894
1895 mtr_t mtr;
1896
1897 if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
1898 ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
1899
1900 mtr.start();
1901 mtr.set_log_mode(MTR_LOG_NO_REDO);
1902
1903 mutex_enter(&undo->rseg->mutex);
1904 trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
1905 mutex_exit(&undo->rseg->mutex);
1906
1907 mtr.commit();
1908 }
1909
1910 trx_undo_t* undo = trx->rsegs.m_redo.undo;
1911
1912 if (!undo) {
1913 /* There were no changes to persistent tables. */
1914 return(0);
1915 }
1916
1917 trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
1918 ut_ad(undo->rseg == rseg);
1919
1920 mtr.start();
1921
1922 /* Change the undo log segment states from TRX_UNDO_ACTIVE to
1923 TRX_UNDO_PREPARED: these modifications to the file data
1924 structure define the transaction as prepared in the file-based
1925 world, at the serialization point of lsn. */
1926
1927 mutex_enter(&rseg->mutex);
1928 trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
1929 mutex_exit(&rseg->mutex);
1930
1931 /* Make the XA PREPARE durable. */
1932 mtr.commit();
1933 ut_ad(mtr.commit_lsn() > 0);
1934 return(mtr.commit_lsn());
1935 }
1936
1937 /****************************************************************//**
1938 Prepares a transaction. */
1939 static
1940 void
trx_prepare(trx_t * trx)1941 trx_prepare(
1942 /*========*/
1943 trx_t* trx) /*!< in/out: transaction */
1944 {
1945 /* Only fresh user transactions can be prepared.
1946 Recovered transactions cannot. */
1947 ut_a(!trx->is_recovered);
1948
1949 lsn_t lsn = trx_prepare_low(trx);
1950
1951 DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
1952
1953 ut_a(trx->state == TRX_STATE_ACTIVE);
1954 trx_mutex_enter(trx);
1955 trx->state = TRX_STATE_PREPARED;
1956 trx_mutex_exit(trx);
1957
1958 if (lsn) {
1959 /* Depending on the my.cnf options, we may now write the log
1960 buffer to the log files, making the prepared state of the
1961 transaction durable if the OS does not crash. We may also
1962 flush the log files to disk, making the prepared state of the
1963 transaction durable also at an OS crash or a power outage.
1964
1965 The idea in InnoDB's group prepare is that a group of
1966 transactions gather behind a trx doing a physical disk write
1967 to log files, and when that physical write has been completed,
1968 one of those transactions does a write which prepares the whole
1969 group. Note that this group prepare will only bring benefit if
1970 there are > 2 users in the database. Then at least 2 users can
1971 gather behind one doing the physical log write to disk.
1972
1973 We must not be holding any mutexes or latches here. */
1974
1975 trx_flush_log_if_needed(lsn, trx);
1976
1977 if (!UT_LIST_GET_LEN(trx->lock.trx_locks)
1978 || trx->isolation_level == TRX_ISO_SERIALIZABLE) {
1979 /* Do not release any locks at the
1980 SERIALIZABLE isolation level. */
1981 } else if (!trx->mysql_thd
1982 || thd_sql_command(trx->mysql_thd)
1983 != SQLCOM_XA_PREPARE) {
1984 /* Do not release locks for XA COMMIT ONE PHASE
1985 or for internal distributed transactions
1986 (XID::get_my_xid() would be nonzero). */
1987 } else {
1988 lock_release_on_prepare(trx);
1989 }
1990 }
1991 }
1992
1993 /** XA PREPARE a transaction.
1994 @param[in,out] trx transaction to prepare */
trx_prepare_for_mysql(trx_t * trx)1995 void trx_prepare_for_mysql(trx_t* trx)
1996 {
1997 trx_start_if_not_started_xa(trx, false);
1998
1999 trx->op_info = "preparing";
2000
2001 trx_prepare(trx);
2002
2003 trx->op_info = "";
2004 }
2005
2006
/** Argument of trx_recover_for_mysql_callback(); initialized positionally
as { xid_list, len, 0 }. */
struct trx_recover_for_mysql_callback_arg
{
  /** output array that receives the XIDs of prepared transactions */
  XID *xid_list;
  /** number of slots in xid_list */
  uint len;
  /** number of prepared transactions encountered so far; this may
  exceed len, because the callback keeps counting after the array is full */
  uint count;
};
2013
2014
trx_recover_for_mysql_callback(rw_trx_hash_element_t * element,trx_recover_for_mysql_callback_arg * arg)2015 static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
2016 trx_recover_for_mysql_callback_arg *arg)
2017 {
2018 DBUG_ASSERT(arg->len > 0);
2019 mutex_enter(&element->mutex);
2020 if (trx_t *trx= element->trx)
2021 {
2022 /*
2023 The state of a read-write transaction can only change from ACTIVE to
2024 PREPARED while we are holding the element->mutex. But since it is
2025 executed at startup no state change should occur.
2026 */
2027 if (trx_state_eq(trx, TRX_STATE_PREPARED))
2028 {
2029 ut_ad(trx->is_recovered);
2030 ut_ad(trx->id);
2031 if (arg->count == 0)
2032 ib::info() << "Starting recovery for XA transactions...";
2033 XID& xid= arg->xid_list[arg->count];
2034 if (arg->count++ < arg->len)
2035 {
2036 trx->state= TRX_STATE_PREPARED_RECOVERED;
2037 ib::info() << "Transaction " << trx->id
2038 << " in prepared state after recovery";
2039 ib::info() << "Transaction contains changes to " << trx->undo_no
2040 << " rows";
2041 xid= *trx->xid;
2042 }
2043 }
2044 }
2045 mutex_exit(&element->mutex);
2046 /* Do not terminate upon reaching arg->len; count all transactions */
2047 return false;
2048 }
2049
2050
trx_recover_reset_callback(rw_trx_hash_element_t * element,void *)2051 static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
2052 void*)
2053 {
2054 mutex_enter(&element->mutex);
2055 if (trx_t *trx= element->trx)
2056 {
2057 if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
2058 trx->state= TRX_STATE_PREPARED;
2059 }
2060 mutex_exit(&element->mutex);
2061 return false;
2062 }
2063
2064
2065 /**
2066 Find prepared transaction objects for recovery.
2067
2068 @param[out] xid_list prepared transactions
2069 @param[in] len number of slots in xid_list
2070
2071 @return number of prepared transactions stored in xid_list
2072 */
2073
trx_recover_for_mysql(XID * xid_list,uint len)2074 int trx_recover_for_mysql(XID *xid_list, uint len)
2075 {
2076 trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
2077
2078 ut_ad(xid_list);
2079 ut_ad(len);
2080
2081 /* Fill xid_list with PREPARED transactions. */
2082 trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
2083 if (arg.count)
2084 {
2085 ib::info() << arg.count
2086 << " transactions in prepared state after recovery";
2087 /* After returning the full list, reset the state, because
2088 init_server_components() wants to recover the collection of
2089 transactions twice, by first calling tc_log->open() and then
2090 ha_recover() directly. */
2091 if (arg.count <= len)
2092 trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
2093 }
2094 return int(std::min(arg.count, len));
2095 }
2096
2097
/** Context handed to trx_get_trx_by_xid_callback() */
struct trx_get_trx_by_xid_callback_arg
{
  /** the XID that is being looked up */
  const XID *xid;
  /** the matching transaction; assigned by the callback on a match */
  trx_t *trx;
};
2103
2104
trx_get_trx_by_xid_callback(rw_trx_hash_element_t * element,trx_get_trx_by_xid_callback_arg * arg)2105 static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
2106 trx_get_trx_by_xid_callback_arg *arg)
2107 {
2108 my_bool found= 0;
2109 mutex_enter(&element->mutex);
2110 if (trx_t *trx= element->trx)
2111 {
2112 trx_mutex_enter(trx);
2113 if (trx->is_recovered &&
2114 (trx_state_eq(trx, TRX_STATE_PREPARED) ||
2115 trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
2116 arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
2117 {
2118 #ifdef WITH_WSREP
2119 /* The commit of a prepared recovered Galera
2120 transaction needs a valid trx->xid for
2121 invoking trx_sys_update_wsrep_checkpoint(). */
2122 if (!wsrep_is_wsrep_xid(trx->xid))
2123 #endif /* WITH_WSREP */
2124 /* Invalidate the XID, so that subsequent calls will not find it. */
2125 trx->xid->null();
2126 arg->trx= trx;
2127 found= 1;
2128 }
2129 trx_mutex_exit(trx);
2130 }
2131 mutex_exit(&element->mutex);
2132 return found;
2133 }
2134
2135 /** Look up an X/Open distributed transaction in XA PREPARE state.
2136 @param[in] xid X/Open XA transaction identifier
2137 @return transaction on match (the trx_t::xid will be invalidated);
2138 note that the trx may have been committed before the caller acquires
2139 trx_t::mutex
2140 @retval NULL if no match */
trx_get_trx_by_xid(const XID * xid)2141 trx_t* trx_get_trx_by_xid(const XID* xid)
2142 {
2143 trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
2144
2145 if (xid)
2146 trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
2147 return arg.trx;
2148 }
2149
2150
2151 /*************************************************************//**
2152 Starts the transaction if it is not yet started. */
2153 void
trx_start_if_not_started_xa_low(trx_t * trx,bool read_write)2154 trx_start_if_not_started_xa_low(
2155 /*============================*/
2156 trx_t* trx, /*!< in/out: transaction */
2157 bool read_write) /*!< in: true if read write transaction */
2158 {
2159 switch (trx->state) {
2160 case TRX_STATE_NOT_STARTED:
2161 trx_start_low(trx, read_write);
2162 return;
2163
2164 case TRX_STATE_ACTIVE:
2165 if (trx->id == 0 && read_write) {
2166 /* If the transaction is tagged as read-only then
2167 it can only write to temp tables and for such
2168 transactions we don't want to move them to the
2169 trx_sys_t::rw_trx_hash. */
2170 if (!trx->read_only) {
2171 trx_set_rw_mode(trx);
2172 }
2173 }
2174 return;
2175 case TRX_STATE_PREPARED:
2176 case TRX_STATE_PREPARED_RECOVERED:
2177 case TRX_STATE_COMMITTED_IN_MEMORY:
2178 break;
2179 }
2180
2181 ut_error;
2182 }
2183
2184 /*************************************************************//**
2185 Starts the transaction if it is not yet started. */
2186 void
trx_start_if_not_started_low(trx_t * trx,bool read_write)2187 trx_start_if_not_started_low(
2188 /*==========================*/
2189 trx_t* trx, /*!< in: transaction */
2190 bool read_write) /*!< in: true if read write transaction */
2191 {
2192 switch (trx->state) {
2193 case TRX_STATE_NOT_STARTED:
2194 trx_start_low(trx, read_write);
2195 return;
2196
2197 case TRX_STATE_ACTIVE:
2198 if (read_write && trx->id == 0 && !trx->read_only) {
2199 trx_set_rw_mode(trx);
2200 }
2201 return;
2202
2203 case TRX_STATE_PREPARED:
2204 case TRX_STATE_PREPARED_RECOVERED:
2205 case TRX_STATE_COMMITTED_IN_MEMORY:
2206 break;
2207 }
2208
2209 ut_error;
2210 }
2211
2212 /*************************************************************//**
2213 Starts a transaction for internal processing. */
2214 void
trx_start_internal_low(trx_t * trx)2215 trx_start_internal_low(
2216 /*===================*/
2217 trx_t* trx) /*!< in/out: transaction */
2218 {
2219 /* Ensure it is not flagged as an auto-commit-non-locking
2220 transaction. */
2221
2222 trx->will_lock = true;
2223
2224 trx->internal = true;
2225
2226 trx_start_low(trx, true);
2227 }
2228
2229 /** Starts a read-only transaction for internal processing.
2230 @param[in,out] trx transaction to be started */
2231 void
trx_start_internal_read_only_low(trx_t * trx)2232 trx_start_internal_read_only_low(
2233 trx_t* trx)
2234 {
2235 /* Ensure it is not flagged as an auto-commit-non-locking
2236 transaction. */
2237
2238 trx->will_lock = true;
2239
2240 trx->internal = true;
2241
2242 trx_start_low(trx, false);
2243 }
2244
2245 /*************************************************************//**
2246 Starts the transaction for a DDL operation. */
2247 void
trx_start_for_ddl_low(trx_t * trx,trx_dict_op_t op)2248 trx_start_for_ddl_low(
2249 /*==================*/
2250 trx_t* trx, /*!< in/out: transaction */
2251 trx_dict_op_t op) /*!< in: dictionary operation type */
2252 {
2253 switch (trx->state) {
2254 case TRX_STATE_NOT_STARTED:
2255 /* Flag this transaction as a dictionary operation, so that
2256 the data dictionary will be locked in crash recovery. */
2257
2258 trx_set_dict_operation(trx, op);
2259 trx->ddl= true;
2260 trx_start_internal_low(trx);
2261 return;
2262
2263 case TRX_STATE_ACTIVE:
2264 case TRX_STATE_PREPARED:
2265 case TRX_STATE_PREPARED_RECOVERED:
2266 case TRX_STATE_COMMITTED_IN_MEMORY:
2267 break;
2268 }
2269
2270 ut_error;
2271 }
2272
2273 /*************************************************************//**
2274 Set the transaction as a read-write transaction if it is not already
2275 tagged as such. Read-only transactions that are writing to temporary
2276 tables are assigned an ID and a rollback segment but are not added
2277 to the trx read-write list because their updates should not be visible
2278 to other transactions and therefore their changes can be ignored by
2279 by MVCC. */
2280 void
trx_set_rw_mode(trx_t * trx)2281 trx_set_rw_mode(
2282 /*============*/
2283 trx_t* trx) /*!< in/out: transaction that is RW */
2284 {
2285 ut_ad(trx->rsegs.m_redo.rseg == 0);
2286 ut_ad(!trx->is_autocommit_non_locking());
2287 ut_ad(!trx->read_only);
2288 ut_ad(trx->id == 0);
2289
2290 if (high_level_read_only) {
2291 return;
2292 }
2293
2294 trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
2295 ut_ad(trx->rsegs.m_redo.rseg != 0);
2296
2297 trx_sys.register_rw(trx);
2298
2299 /* So that we can see our own changes. */
2300 if (trx->read_view.is_open()) {
2301 trx->read_view.set_creator_trx_id(trx->id);
2302 }
2303 }
2304
has_stats_table_lock() const2305 bool trx_t::has_stats_table_lock() const
2306 {
2307 for (lock_list::const_iterator it= lock.table_locks.begin(),
2308 end= lock.table_locks.end(); it != end; ++it)
2309 {
2310 const lock_t *lock= *it;
2311 if (lock && lock->un_member.tab_lock.table->is_stats_table())
2312 return true;
2313 }
2314
2315 return false;
2316 }
2317